Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

tuntap: choose the txq based on rxq

This patch implements a simple multiqueue flow steering policy - tx follows rx
for tun/tap. The idea is simple: it just chooses the txq based on which rxq the
flow comes from. Flows are identified through the rxhash of an skb, and the
hash-to-queue mapping is recorded in an hlist with an ageing timer to retire the
mapping. The mapping is created when tun receives a packet from userspace, and
is queried in .ndo_select_queue().

I ran concurrent TCP_CRR tests and didn't see any mapping manipulation helpers in
perf top, so the overhead can be neglected.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Jason Wang and committed by
David S. Miller
96442e42 cde8b15f

+224 -3
+224 -3
drivers/net/tun.c
··· 115 115 */ 116 116 #define MAX_TAP_QUEUES 1024 117 117 118 + #define TUN_FLOW_EXPIRE (3 * HZ) 119 + 118 120 /* A tun_file connects an open character device to a tuntap netdevice. It 119 121 * also contains all socket related strctures (except sock_fprog and tap_filter) 120 122 * to serve as one transmit queue for tuntap device. The sock_fprog and ··· 139 137 unsigned int flags; 140 138 u16 queue_index; 141 139 }; 140 + 141 + struct tun_flow_entry { 142 + struct hlist_node hash_link; 143 + struct rcu_head rcu; 144 + struct tun_struct *tun; 145 + 146 + u32 rxhash; 147 + int queue_index; 148 + unsigned long updated; 149 + }; 150 + 151 + #define TUN_NUM_FLOW_ENTRIES 1024 142 152 143 153 /* Since the socket were moved to tun_file, to preserve the behavior of persist 144 154 * device, socket fileter, sndbuf and vnet header size were restore when the ··· 177 163 #ifdef TUN_DEBUG 178 164 int debug; 179 165 #endif 166 + spinlock_t lock; 167 + struct kmem_cache *flow_cache; 168 + struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; 169 + struct timer_list flow_gc_timer; 170 + unsigned long ageing_time; 180 171 }; 172 + 173 + static inline u32 tun_hashfn(u32 rxhash) 174 + { 175 + return rxhash & 0x3ff; 176 + } 177 + 178 + static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) 179 + { 180 + struct tun_flow_entry *e; 181 + struct hlist_node *n; 182 + 183 + hlist_for_each_entry_rcu(e, n, head, hash_link) { 184 + if (e->rxhash == rxhash) 185 + return e; 186 + } 187 + return NULL; 188 + } 189 + 190 + static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, 191 + struct hlist_head *head, 192 + u32 rxhash, u16 queue_index) 193 + { 194 + struct tun_flow_entry *e = kmem_cache_alloc(tun->flow_cache, 195 + GFP_ATOMIC); 196 + if (e) { 197 + tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n", 198 + rxhash, queue_index); 199 + e->updated = jiffies; 200 + e->rxhash = rxhash; 201 + e->queue_index = queue_index; 202 + e->tun = tun; 203 + 
hlist_add_head_rcu(&e->hash_link, head); 204 + } 205 + return e; 206 + } 207 + 208 + static void tun_flow_free(struct rcu_head *head) 209 + { 210 + struct tun_flow_entry *e 211 + = container_of(head, struct tun_flow_entry, rcu); 212 + kmem_cache_free(e->tun->flow_cache, e); 213 + } 214 + 215 + static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) 216 + { 217 + tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", 218 + e->rxhash, e->queue_index); 219 + hlist_del_rcu(&e->hash_link); 220 + call_rcu(&e->rcu, tun_flow_free); 221 + } 222 + 223 + static void tun_flow_flush(struct tun_struct *tun) 224 + { 225 + int i; 226 + 227 + spin_lock_bh(&tun->lock); 228 + for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { 229 + struct tun_flow_entry *e; 230 + struct hlist_node *h, *n; 231 + 232 + hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link) 233 + tun_flow_delete(tun, e); 234 + } 235 + spin_unlock_bh(&tun->lock); 236 + } 237 + 238 + static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) 239 + { 240 + int i; 241 + 242 + spin_lock_bh(&tun->lock); 243 + for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { 244 + struct tun_flow_entry *e; 245 + struct hlist_node *h, *n; 246 + 247 + hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link) { 248 + if (e->queue_index == queue_index) 249 + tun_flow_delete(tun, e); 250 + } 251 + } 252 + spin_unlock_bh(&tun->lock); 253 + } 254 + 255 + static void tun_flow_cleanup(unsigned long data) 256 + { 257 + struct tun_struct *tun = (struct tun_struct *)data; 258 + unsigned long delay = tun->ageing_time; 259 + unsigned long next_timer = jiffies + delay; 260 + unsigned long count = 0; 261 + int i; 262 + 263 + tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n"); 264 + 265 + spin_lock_bh(&tun->lock); 266 + for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { 267 + struct tun_flow_entry *e; 268 + struct hlist_node *h, *n; 269 + 270 + hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link) { 271 + unsigned 
long this_timer; 272 + count++; 273 + this_timer = e->updated + delay; 274 + if (time_before_eq(this_timer, jiffies)) 275 + tun_flow_delete(tun, e); 276 + else if (time_before(this_timer, next_timer)) 277 + next_timer = this_timer; 278 + } 279 + } 280 + 281 + if (count) 282 + mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); 283 + spin_unlock_bh(&tun->lock); 284 + } 285 + 286 + static void tun_flow_update(struct tun_struct *tun, struct sk_buff *skb, 287 + u16 queue_index) 288 + { 289 + struct hlist_head *head; 290 + struct tun_flow_entry *e; 291 + unsigned long delay = tun->ageing_time; 292 + u32 rxhash = skb_get_rxhash(skb); 293 + 294 + if (!rxhash) 295 + return; 296 + else 297 + head = &tun->flows[tun_hashfn(rxhash)]; 298 + 299 + rcu_read_lock(); 300 + 301 + if (tun->numqueues == 1) 302 + goto unlock; 303 + 304 + e = tun_flow_find(head, rxhash); 305 + if (likely(e)) { 306 + /* TODO: keep queueing to old queue until it's empty? */ 307 + e->queue_index = queue_index; 308 + e->updated = jiffies; 309 + } else { 310 + spin_lock_bh(&tun->lock); 311 + if (!tun_flow_find(head, rxhash)) 312 + tun_flow_create(tun, head, rxhash, queue_index); 313 + 314 + if (!timer_pending(&tun->flow_gc_timer)) 315 + mod_timer(&tun->flow_gc_timer, 316 + round_jiffies_up(jiffies + delay)); 317 + spin_unlock_bh(&tun->lock); 318 + } 319 + 320 + unlock: 321 + rcu_read_unlock(); 322 + } 181 323 182 324 /* We try to identify a flow through its rxhash first. The reason that 183 325 * we do not check rxq no. 
is becuase some cards(e.g 82599), chooses ··· 345 175 static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) 346 176 { 347 177 struct tun_struct *tun = netdev_priv(dev); 178 + struct tun_flow_entry *e; 348 179 u32 txq = 0; 349 180 u32 numqueues = 0; 350 181 ··· 354 183 355 184 txq = skb_get_rxhash(skb); 356 185 if (txq) { 357 - /* use multiply and shift instead of expensive divide */ 358 - txq = ((u64)txq * numqueues) >> 32; 186 + e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); 187 + if (e) 188 + txq = e->queue_index; 189 + else 190 + /* use multiply and shift instead of expensive divide */ 191 + txq = ((u64)txq * numqueues) >> 32; 359 192 } else if (likely(skb_rx_queue_recorded(skb))) { 360 193 txq = skb_get_rx_queue(skb); 361 194 while (unlikely(txq >= numqueues)) ··· 409 234 sock_put(&tfile->sk); 410 235 411 236 synchronize_net(); 237 + tun_flow_delete_by_queue(tun, tun->numqueues + 1); 412 238 /* Drop read queue */ 413 239 skb_queue_purge(&tfile->sk.sk_receive_queue); 414 240 tun_set_real_num_queues(tun); ··· 807 631 #endif 808 632 }; 809 633 634 + static int tun_flow_init(struct tun_struct *tun) 635 + { 636 + int i; 637 + 638 + tun->flow_cache = kmem_cache_create("tun_flow_cache", 639 + sizeof(struct tun_flow_entry), 0, 0, 640 + NULL); 641 + if (!tun->flow_cache) 642 + return -ENOMEM; 643 + 644 + for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) 645 + INIT_HLIST_HEAD(&tun->flows[i]); 646 + 647 + tun->ageing_time = TUN_FLOW_EXPIRE; 648 + setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun); 649 + mod_timer(&tun->flow_gc_timer, 650 + round_jiffies_up(jiffies + tun->ageing_time)); 651 + 652 + return 0; 653 + } 654 + 655 + static void tun_flow_uninit(struct tun_struct *tun) 656 + { 657 + del_timer_sync(&tun->flow_gc_timer); 658 + tun_flow_flush(tun); 659 + 660 + /* Wait for completion of call_rcu()'s */ 661 + rcu_barrier(); 662 + kmem_cache_destroy(tun->flow_cache); 663 + } 664 + 810 665 /* Initialize net device. 
*/ 811 666 static void tun_net_init(struct net_device *dev) 812 667 { ··· 1180 973 tun->dev->stats.rx_packets++; 1181 974 tun->dev->stats.rx_bytes += len; 1182 975 976 + tun_flow_update(tun, skb, tfile->queue_index); 1183 977 return total_len; 1184 978 } 1185 979 ··· 1358 1150 return ret; 1359 1151 } 1360 1152 1153 + static void tun_free_netdev(struct net_device *dev) 1154 + { 1155 + struct tun_struct *tun = netdev_priv(dev); 1156 + 1157 + tun_flow_uninit(tun); 1158 + free_netdev(dev); 1159 + } 1160 + 1361 1161 static void tun_setup(struct net_device *dev) 1362 1162 { 1363 1163 struct tun_struct *tun = netdev_priv(dev); ··· 1374 1158 tun->group = INVALID_GID; 1375 1159 1376 1160 dev->ethtool_ops = &tun_ethtool_ops; 1377 - dev->destructor = free_netdev; 1161 + dev->destructor = tun_free_netdev; 1378 1162 } 1379 1163 1380 1164 /* Trivial set of netlink ops to allow deleting tun or tap ··· 1597 1381 tun->filter_attached = false; 1598 1382 tun->sndbuf = tfile->socket.sk->sk_sndbuf; 1599 1383 1384 + spin_lock_init(&tun->lock); 1385 + 1600 1386 security_tun_dev_post_create(&tfile->sk); 1601 1387 1602 1388 tun_net_init(dev); 1389 + 1390 + if (tun_flow_init(tun)) 1391 + goto err_free_dev; 1603 1392 1604 1393 dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | 1605 1394 TUN_USER_FEATURES;