Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'vhost_net-batching'

Jason Wang says:

====================
vhost_net tx batching

This series tries to implement tx batching support for vhost. This was
done by using MSG_MORE as a hint for the underlying socket. The backend
(e.g. tap) can then batch the packets temporarily in a list and
submit them all once the number of batched packets exceeds a limit.

Tests show an obvious improvement on guest pktgen over
mlx4(noqueue) on host:

Mpps -+%
rx-frames = 0 0.91 +0%
rx-frames = 4 1.00 +9.8%
rx-frames = 8 1.00 +9.8%
rx-frames = 16 1.01 +10.9%
rx-frames = 32 1.07 +17.5%
rx-frames = 48 1.07 +17.5%
rx-frames = 64 1.08 +18.6%
rx-frames = 64 (no MSG_MORE) 0.91 +0%

Changes from V4:
- stick to NAPI_POLL_WEIGHT for rx-frames if the user specifies a value
greater than it.
Changes from V3:
- use ethtool instead of module parameter to control the maximum
number of batched packets
- avoid overhead when MSG_MORE were not set and no packet queued
Changes from V2:
- remove useless queue limitation check (and we don't drop any packet now)
Changes from V1:
- drop NAPI handler since we don't use NAPI now
- fix the issues that may exceed the max pending of zerocopy
- more improvement on available buffer detection
- move the limitation of batched packets from vhost to tuntap
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+97 -12
+70 -6
drivers/net/tun.c
··· 218 218 struct list_head disabled; 219 219 void *security; 220 220 u32 flow_count; 221 + u32 rx_batched; 221 222 struct tun_pcpu_stats __percpu *pcpu_stats; 222 223 }; 223 224 ··· 523 522 while ((skb = skb_array_consume(&tfile->tx_array)) != NULL) 524 523 kfree_skb(skb); 525 524 525 + skb_queue_purge(&tfile->sk.sk_write_queue); 526 526 skb_queue_purge(&tfile->sk.sk_error_queue); 527 527 } 528 528 ··· 1141 1139 return skb; 1142 1140 } 1143 1141 1142 + static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile, 1143 + struct sk_buff *skb, int more) 1144 + { 1145 + struct sk_buff_head *queue = &tfile->sk.sk_write_queue; 1146 + struct sk_buff_head process_queue; 1147 + u32 rx_batched = tun->rx_batched; 1148 + bool rcv = false; 1149 + 1150 + if (!rx_batched || (!more && skb_queue_empty(queue))) { 1151 + local_bh_disable(); 1152 + netif_receive_skb(skb); 1153 + local_bh_enable(); 1154 + return; 1155 + } 1156 + 1157 + spin_lock(&queue->lock); 1158 + if (!more || skb_queue_len(queue) == rx_batched) { 1159 + __skb_queue_head_init(&process_queue); 1160 + skb_queue_splice_tail_init(queue, &process_queue); 1161 + rcv = true; 1162 + } else { 1163 + __skb_queue_tail(queue, skb); 1164 + } 1165 + spin_unlock(&queue->lock); 1166 + 1167 + if (rcv) { 1168 + struct sk_buff *nskb; 1169 + 1170 + local_bh_disable(); 1171 + while ((nskb = __skb_dequeue(&process_queue))) 1172 + netif_receive_skb(nskb); 1173 + netif_receive_skb(skb); 1174 + local_bh_enable(); 1175 + } 1176 + } 1177 + 1144 1178 /* Get packet from user space buffer */ 1145 1179 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, 1146 1180 void *msg_control, struct iov_iter *from, 1147 - int noblock) 1181 + int noblock, bool more) 1148 1182 { 1149 1183 struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; 1150 1184 struct sk_buff *skb; ··· 1321 1283 1322 1284 rxhash = skb_get_hash(skb); 1323 1285 #ifndef CONFIG_4KSTACKS 1324 - local_bh_disable(); 1325 - netif_receive_skb(skb); 1326 - 
local_bh_enable(); 1286 + tun_rx_batched(tun, tfile, skb, more); 1327 1287 #else 1328 1288 netif_rx_ni(skb); 1329 1289 #endif ··· 1347 1311 if (!tun) 1348 1312 return -EBADFD; 1349 1313 1350 - result = tun_get_user(tun, tfile, NULL, from, file->f_flags & O_NONBLOCK); 1314 + result = tun_get_user(tun, tfile, NULL, from, 1315 + file->f_flags & O_NONBLOCK, false); 1351 1316 1352 1317 tun_put(tun); 1353 1318 return result; ··· 1606 1569 return -EBADFD; 1607 1570 1608 1571 ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter, 1609 - m->msg_flags & MSG_DONTWAIT); 1572 + m->msg_flags & MSG_DONTWAIT, 1573 + m->msg_flags & MSG_MORE); 1610 1574 tun_put(tun); 1611 1575 return ret; 1612 1576 } ··· 1808 1770 tun->align = NET_SKB_PAD; 1809 1771 tun->filter_attached = false; 1810 1772 tun->sndbuf = tfile->socket.sk->sk_sndbuf; 1773 + tun->rx_batched = 0; 1811 1774 1812 1775 tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); 1813 1776 if (!tun->pcpu_stats) { ··· 2477 2438 #endif 2478 2439 } 2479 2440 2441 + static int tun_get_coalesce(struct net_device *dev, 2442 + struct ethtool_coalesce *ec) 2443 + { 2444 + struct tun_struct *tun = netdev_priv(dev); 2445 + 2446 + ec->rx_max_coalesced_frames = tun->rx_batched; 2447 + 2448 + return 0; 2449 + } 2450 + 2451 + static int tun_set_coalesce(struct net_device *dev, 2452 + struct ethtool_coalesce *ec) 2453 + { 2454 + struct tun_struct *tun = netdev_priv(dev); 2455 + 2456 + if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT) 2457 + tun->rx_batched = NAPI_POLL_WEIGHT; 2458 + else 2459 + tun->rx_batched = ec->rx_max_coalesced_frames; 2460 + 2461 + return 0; 2462 + } 2463 + 2480 2464 static const struct ethtool_ops tun_ethtool_ops = { 2481 2465 .get_settings = tun_get_settings, 2482 2466 .get_drvinfo = tun_get_drvinfo, ··· 2507 2445 .set_msglevel = tun_set_msglevel, 2508 2446 .get_link = ethtool_op_get_link, 2509 2447 .get_ts_info = ethtool_op_get_ts_info, 2448 + .get_coalesce = tun_get_coalesce, 2449 + .set_coalesce = 
tun_set_coalesce, 2510 2450 }; 2511 2451 2512 2452 static int tun_queue_resize(struct tun_struct *tun)
+20 -3
drivers/vhost/net.c
··· 351 351 return r; 352 352 } 353 353 354 + static bool vhost_exceeds_maxpend(struct vhost_net *net) 355 + { 356 + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; 357 + struct vhost_virtqueue *vq = &nvq->vq; 358 + 359 + return (nvq->upend_idx + vq->num - VHOST_MAX_PEND) % UIO_MAXIOV 360 + == nvq->done_idx; 361 + } 362 + 354 363 /* Expects to be always run from workqueue - which acts as 355 364 * read-size critical section for our kind of RCU. */ 356 365 static void handle_tx(struct vhost_net *net) ··· 403 394 /* If more outstanding DMAs, queue the work. 404 395 * Handle upend_idx wrap around 405 396 */ 406 - if (unlikely((nvq->upend_idx + vq->num - VHOST_MAX_PEND) 407 - % UIO_MAXIOV == nvq->done_idx)) 397 + if (unlikely(vhost_exceeds_maxpend(net))) 408 398 break; 409 399 410 400 head = vhost_net_tx_get_vq_desc(net, vq, vq->iov, ··· 462 454 msg.msg_control = NULL; 463 455 ubufs = NULL; 464 456 } 457 + 458 + total_len += len; 459 + if (total_len < VHOST_NET_WEIGHT && 460 + !vhost_vq_avail_empty(&net->dev, vq) && 461 + likely(!vhost_exceeds_maxpend(net))) { 462 + msg.msg_flags |= MSG_MORE; 463 + } else { 464 + msg.msg_flags &= ~MSG_MORE; 465 + } 466 + 465 467 /* TODO: Check specific error and bomb out unless ENOBUFS? */ 466 468 err = sock->ops->sendmsg(sock, &msg, len); 467 469 if (unlikely(err < 0)) { ··· 490 472 vhost_add_used_and_signal(&net->dev, vq, head, 0); 491 473 else 492 474 vhost_zerocopy_signal_used(net, vq); 493 - total_len += len; 494 475 vhost_net_tx_packet(net); 495 476 if (unlikely(total_len >= VHOST_NET_WEIGHT)) { 496 477 vhost_poll_queue(&vq->poll);
+7 -3
drivers/vhost/vhost.c
··· 2241 2241 __virtio16 avail_idx; 2242 2242 int r; 2243 2243 2244 - r = vhost_get_user(vq, avail_idx, &vq->avail->idx); 2245 - if (r) 2244 + if (vq->avail_idx != vq->last_avail_idx) 2246 2245 return false; 2247 2246 2248 - return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx; 2247 + r = vhost_get_user(vq, avail_idx, &vq->avail->idx); 2248 + if (unlikely(r)) 2249 + return false; 2250 + vq->avail_idx = vhost16_to_cpu(vq, avail_idx); 2251 + 2252 + return vq->avail_idx == vq->last_avail_idx; 2249 2253 } 2250 2254 EXPORT_SYMBOL_GPL(vhost_vq_avail_empty); 2251 2255