Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'remove-page-frag-implementation-in-vhost_net'

Yunsheng Lin says:

====================
remove page frag implementation in vhost_net

Currently there are three implementations for page frag:

1. mm/page_alloc.c: net stack seems to be using it in the
rx part with 'struct page_frag_cache' and the main API
being page_frag_alloc_align().
2. net/core/sock.c: net stack seems to be using it in the
tx part with 'struct page_frag' and the main API being
skb_page_frag_refill().
3. drivers/vhost/net.c: vhost seems to be using it to build
xdp frame, and its implementation seems to be a mix of
the above two.

This patchset tries to unify the page frag implementation a
little bit by unifying gfp bit for order 3 page allocation
and replacing page frag implementation in vhost.c with the
one in page_alloc.c.

After this patchset, we are not only able to unify the page
frag implementation a little, but also able to have about
0.5% performance boost when testing with the vhost_net_test
introduced in the last patch.

Before this patchset:
Performance counter stats for './vhost_net_test' (10 runs):

305325.78 msec task-clock # 1.738 CPUs utilized ( +- 0.12% )
1048668 context-switches # 3.435 K/sec ( +- 0.00% )
11 cpu-migrations # 0.036 /sec ( +- 17.64% )
33 page-faults # 0.108 /sec ( +- 0.49% )
244651819491 cycles # 0.801 GHz ( +- 0.43% ) (64)
64714638024 stalled-cycles-frontend # 26.45% frontend cycles idle ( +- 2.19% ) (67)
30774313491 stalled-cycles-backend # 12.58% backend cycles idle ( +- 7.68% ) (70)
201749748680 instructions # 0.82 insn per cycle
# 0.32 stalled cycles per insn ( +- 0.41% ) (66.76%)
65494787909 branches # 214.508 M/sec ( +- 0.35% ) (64)
4284111313 branch-misses # 6.54% of all branches ( +- 0.45% ) (66)

175.699 +- 0.189 seconds time elapsed ( +- 0.11% )

After this patchset:
Performance counter stats for './vhost_net_test' (10 runs):

303974.38 msec task-clock # 1.739 CPUs utilized ( +- 0.14% )
1048807 context-switches # 3.450 K/sec ( +- 0.00% )
14 cpu-migrations # 0.046 /sec ( +- 12.86% )
33 page-faults # 0.109 /sec ( +- 0.46% )
251289376347 cycles # 0.827 GHz ( +- 0.32% ) (60)
67885175415 stalled-cycles-frontend # 27.01% frontend cycles idle ( +- 0.48% ) (63)
27809282600 stalled-cycles-backend # 11.07% backend cycles idle ( +- 0.36% ) (71)
195543234672 instructions # 0.78 insn per cycle
# 0.35 stalled cycles per insn ( +- 0.29% ) (69.04%)
62423183552 branches # 205.357 M/sec ( +- 0.48% ) (67)
4135666632 branch-misses # 6.63% of all branches ( +- 0.63% ) (67)

174.764 +- 0.214 seconds time elapsed ( +- 0.12% )

Changelog:
V6: Add timeout for poll() and simplify some logic as suggested
by Jason.

V5: Address the comment from jason in vhost_net_test.c and the
comment about leaving out the gfp change for page frag in
sock.c as suggested by Paolo.

V4: Resend based on latest net-next branch.

V3:
1. Add __page_frag_alloc_align() which is passed with the align mask
the original function expected as suggested by Alexander.
2. Drop patch 3 in v2 suggested by Alexander.
3. Reorder patch 4 & 5 in v2 suggested by Alexander.

Note that placing this gfp flags handling for order 3 page in an inline
function is not considered, as we may be able to unify the page_frag
and page_frag_cache handling.

V2: Change 'xor'd' to 'masked off', add vhost tx testing for
vhost_net_test.

V1: Fix some typo, drop RFC tag and rebase on latest net-next.
====================

Link: https://lore.kernel.org/r/20240228093013.8263-1-linyunsheng@huawei.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+609 -113
+2 -9
drivers/net/ethernet/google/gve/gve_main.c
··· 1276 1276 1277 1277 static void gve_drain_page_cache(struct gve_priv *priv) 1278 1278 { 1279 - struct page_frag_cache *nc; 1280 1279 int i; 1281 1280 1282 - for (i = 0; i < priv->rx_cfg.num_queues; i++) { 1283 - nc = &priv->rx[i].page_cache; 1284 - if (nc->va) { 1285 - __page_frag_cache_drain(virt_to_page(nc->va), 1286 - nc->pagecnt_bias); 1287 - nc->va = NULL; 1288 - } 1289 - } 1281 + for (i = 0; i < priv->rx_cfg.num_queues; i++) 1282 + page_frag_cache_drain(&priv->rx[i].page_cache); 1290 1283 } 1291 1284 1292 1285 static void gve_qpls_get_curr_alloc_cfg(struct gve_priv *priv,
+2 -15
drivers/net/ethernet/mediatek/mtk_wed_wo.c
··· 286 286 static void 287 287 mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q) 288 288 { 289 - struct page *page; 290 289 int i; 291 290 292 291 for (i = 0; i < q->n_desc; i++) { ··· 300 301 entry->buf = NULL; 301 302 } 302 303 303 - if (!q->cache.va) 304 - return; 305 - 306 - page = virt_to_page(q->cache.va); 307 - __page_frag_cache_drain(page, q->cache.pagecnt_bias); 308 - memset(&q->cache, 0, sizeof(q->cache)); 304 + page_frag_cache_drain(&q->cache); 309 305 } 310 306 311 307 static void 312 308 mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q) 313 309 { 314 - struct page *page; 315 - 316 310 for (;;) { 317 311 void *buf = mtk_wed_wo_dequeue(wo, q, NULL, true); 318 312 ··· 315 323 skb_free_frag(buf); 316 324 } 317 325 318 - if (!q->cache.va) 319 - return; 320 - 321 - page = virt_to_page(q->cache.va); 322 - __page_frag_cache_drain(page, q->cache.pagecnt_bias); 323 - memset(&q->cache, 0, sizeof(q->cache)); 326 + page_frag_cache_drain(&q->cache); 324 327 } 325 328 326 329 static void
+1 -6
drivers/nvme/host/tcp.c
··· 1344 1344 1345 1345 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) 1346 1346 { 1347 - struct page *page; 1348 1347 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1349 1348 struct nvme_tcp_queue *queue = &ctrl->queues[qid]; 1350 1349 unsigned int noreclaim_flag; ··· 1354 1355 if (queue->hdr_digest || queue->data_digest) 1355 1356 nvme_tcp_free_crypto(queue); 1356 1357 1357 - if (queue->pf_cache.va) { 1358 - page = virt_to_head_page(queue->pf_cache.va); 1359 - __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); 1360 - queue->pf_cache.va = NULL; 1361 - } 1358 + page_frag_cache_drain(&queue->pf_cache); 1362 1359 1363 1360 noreclaim_flag = memalloc_noreclaim_save(); 1364 1361 /* ->sock will be released by fput() */
+1 -3
drivers/nvme/target/tcp.c
··· 1591 1591 1592 1592 static void nvmet_tcp_release_queue_work(struct work_struct *w) 1593 1593 { 1594 - struct page *page; 1595 1594 struct nvmet_tcp_queue *queue = 1596 1595 container_of(w, struct nvmet_tcp_queue, release_work); 1597 1596 ··· 1614 1615 if (queue->hdr_digest || queue->data_digest) 1615 1616 nvmet_tcp_free_crypto(queue); 1616 1617 ida_free(&nvmet_tcp_queue_ida, queue->idx); 1617 - page = virt_to_head_page(queue->pf_cache.va); 1618 - __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); 1618 + page_frag_cache_drain(&queue->pf_cache); 1619 1619 kfree(queue); 1620 1620 } 1621 1621
+27 -64
drivers/vhost/net.c
··· 141 141 unsigned tx_zcopy_err; 142 142 /* Flush in progress. Protected by tx vq lock. */ 143 143 bool tx_flush; 144 - /* Private page frag */ 145 - struct page_frag page_frag; 146 - /* Refcount bias of page frag */ 147 - int refcnt_bias; 144 + /* Private page frag cache */ 145 + struct page_frag_cache pf_cache; 148 146 }; 149 147 150 148 static unsigned vhost_net_zcopy_mask __read_mostly; ··· 653 655 !vhost_vq_avail_empty(vq->dev, vq); 654 656 } 655 657 656 - static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz, 657 - struct page_frag *pfrag, gfp_t gfp) 658 - { 659 - if (pfrag->page) { 660 - if (pfrag->offset + sz <= pfrag->size) 661 - return true; 662 - __page_frag_cache_drain(pfrag->page, net->refcnt_bias); 663 - } 664 - 665 - pfrag->offset = 0; 666 - net->refcnt_bias = 0; 667 - if (SKB_FRAG_PAGE_ORDER) { 668 - /* Avoid direct reclaim but allow kswapd to wake */ 669 - pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 670 - __GFP_COMP | __GFP_NOWARN | 671 - __GFP_NORETRY, 672 - SKB_FRAG_PAGE_ORDER); 673 - if (likely(pfrag->page)) { 674 - pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 675 - goto done; 676 - } 677 - } 678 - pfrag->page = alloc_page(gfp); 679 - if (likely(pfrag->page)) { 680 - pfrag->size = PAGE_SIZE; 681 - goto done; 682 - } 683 - return false; 684 - 685 - done: 686 - net->refcnt_bias = USHRT_MAX; 687 - page_ref_add(pfrag->page, USHRT_MAX - 1); 688 - return true; 689 - } 690 - 691 658 #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) 692 659 693 660 static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq, ··· 662 699 struct vhost_net *net = container_of(vq->dev, struct vhost_net, 663 700 dev); 664 701 struct socket *sock = vhost_vq_get_backend(vq); 665 - struct page_frag *alloc_frag = &net->page_frag; 666 702 struct virtio_net_hdr *gso; 667 703 struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp]; 668 704 struct tun_xdp_hdr *hdr; ··· 672 710 int sock_hlen = nvq->sock_hlen; 673 711 void *buf; 674 712 int 
copied; 713 + int ret; 675 714 676 715 if (unlikely(len < nvq->sock_hlen)) 677 716 return -EFAULT; ··· 682 719 return -ENOSPC; 683 720 684 721 buflen += SKB_DATA_ALIGN(len + pad); 685 - alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); 686 - if (unlikely(!vhost_net_page_frag_refill(net, buflen, 687 - alloc_frag, GFP_KERNEL))) 722 + buf = page_frag_alloc_align(&net->pf_cache, buflen, GFP_KERNEL, 723 + SMP_CACHE_BYTES); 724 + if (unlikely(!buf)) 688 725 return -ENOMEM; 689 726 690 - buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; 691 - copied = copy_page_from_iter(alloc_frag->page, 692 - alloc_frag->offset + 693 - offsetof(struct tun_xdp_hdr, gso), 694 - sock_hlen, from); 695 - if (copied != sock_hlen) 696 - return -EFAULT; 727 + copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso), 728 + sock_hlen, from); 729 + if (copied != sock_hlen) { 730 + ret = -EFAULT; 731 + goto err; 732 + } 697 733 698 734 hdr = buf; 699 735 gso = &hdr->gso; ··· 705 743 vhost16_to_cpu(vq, gso->csum_start) + 706 744 vhost16_to_cpu(vq, gso->csum_offset) + 2); 707 745 708 - if (vhost16_to_cpu(vq, gso->hdr_len) > len) 709 - return -EINVAL; 746 + if (vhost16_to_cpu(vq, gso->hdr_len) > len) { 747 + ret = -EINVAL; 748 + goto err; 749 + } 710 750 } 711 751 712 752 len -= sock_hlen; 713 - copied = copy_page_from_iter(alloc_frag->page, 714 - alloc_frag->offset + pad, 715 - len, from); 716 - if (copied != len) 717 - return -EFAULT; 753 + copied = copy_from_iter(buf + pad, len, from); 754 + if (copied != len) { 755 + ret = -EFAULT; 756 + goto err; 757 + } 718 758 719 759 xdp_init_buff(xdp, buflen, NULL); 720 760 xdp_prepare_buff(xdp, buf, pad, len, true); 721 761 hdr->buflen = buflen; 722 762 723 - --net->refcnt_bias; 724 - alloc_frag->offset += buflen; 725 - 726 763 ++nvq->batched_xdp; 727 764 728 765 return 0; 766 + 767 + err: 768 + page_frag_free(buf); 769 + return ret; 729 770 } 730 771 731 772 static void handle_tx_copy(struct vhost_net *net, struct 
socket *sock) ··· 1318 1353 vqs[VHOST_NET_VQ_RX]); 1319 1354 1320 1355 f->private_data = n; 1321 - n->page_frag.page = NULL; 1322 - n->refcnt_bias = 0; 1356 + n->pf_cache.va = NULL; 1323 1357 1324 1358 return 0; 1325 1359 } ··· 1386 1422 kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue); 1387 1423 kfree(n->vqs[VHOST_NET_VQ_TX].xdp); 1388 1424 kfree(n->dev.vqs); 1389 - if (n->page_frag.page) 1390 - __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias); 1425 + page_frag_cache_drain(&n->pf_cache); 1391 1426 kvfree(n); 1392 1427 return 0; 1393 1428 }
+12 -4
include/linux/gfp.h
··· 311 311 extern void free_pages(unsigned long addr, unsigned int order); 312 312 313 313 struct page_frag_cache; 314 + void page_frag_cache_drain(struct page_frag_cache *nc); 314 315 extern void __page_frag_cache_drain(struct page *page, unsigned int count); 315 - extern void *page_frag_alloc_align(struct page_frag_cache *nc, 316 - unsigned int fragsz, gfp_t gfp_mask, 317 - unsigned int align_mask); 316 + void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, 317 + gfp_t gfp_mask, unsigned int align_mask); 318 + 319 + static inline void *page_frag_alloc_align(struct page_frag_cache *nc, 320 + unsigned int fragsz, gfp_t gfp_mask, 321 + unsigned int align) 322 + { 323 + WARN_ON_ONCE(!is_power_of_2(align)); 324 + return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align); 325 + } 318 326 319 327 static inline void *page_frag_alloc(struct page_frag_cache *nc, 320 328 unsigned int fragsz, gfp_t gfp_mask) 321 329 { 322 - return page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u); 330 + return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u); 323 331 } 324 332 325 333 extern void page_frag_free(void *addr);
+16 -6
mm/page_alloc.c
··· 4685 4685 gfp_t gfp = gfp_mask; 4686 4686 4687 4687 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4688 - gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | 4689 - __GFP_NOMEMALLOC; 4688 + gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | 4689 + __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC; 4690 4690 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 4691 4691 PAGE_FRAG_CACHE_MAX_ORDER); 4692 4692 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; ··· 4699 4699 return page; 4700 4700 } 4701 4701 4702 + void page_frag_cache_drain(struct page_frag_cache *nc) 4703 + { 4704 + if (!nc->va) 4705 + return; 4706 + 4707 + __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias); 4708 + nc->va = NULL; 4709 + } 4710 + EXPORT_SYMBOL(page_frag_cache_drain); 4711 + 4702 4712 void __page_frag_cache_drain(struct page *page, unsigned int count) 4703 4713 { 4704 4714 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); ··· 4718 4708 } 4719 4709 EXPORT_SYMBOL(__page_frag_cache_drain); 4720 4710 4721 - void *page_frag_alloc_align(struct page_frag_cache *nc, 4722 - unsigned int fragsz, gfp_t gfp_mask, 4723 - unsigned int align_mask) 4711 + void *__page_frag_alloc_align(struct page_frag_cache *nc, 4712 + unsigned int fragsz, gfp_t gfp_mask, 4713 + unsigned int align_mask) 4724 4714 { 4725 4715 unsigned int size = PAGE_SIZE; 4726 4716 struct page *page; ··· 4789 4779 4790 4780 return nc->va + offset; 4791 4781 } 4792 - EXPORT_SYMBOL(page_frag_alloc_align); 4782 + EXPORT_SYMBOL(__page_frag_alloc_align); 4793 4783 4794 4784 /* 4795 4785 * Frees a page fragment allocated out of either a compound or order 0 page.
+6 -3
net/core/skbuff.c
··· 315 315 316 316 fragsz = SKB_DATA_ALIGN(fragsz); 317 317 318 - return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); 318 + return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, 319 + align_mask); 319 320 } 320 321 EXPORT_SYMBOL(__napi_alloc_frag_align); 321 322 ··· 328 327 if (in_hardirq() || irqs_disabled()) { 329 328 struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); 330 329 331 - data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); 330 + data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, 331 + align_mask); 332 332 } else { 333 333 struct napi_alloc_cache *nc; 334 334 335 335 local_bh_disable(); 336 336 nc = this_cpu_ptr(&napi_alloc_cache); 337 - data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); 337 + data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, 338 + align_mask); 338 339 local_bh_enable(); 339 340 } 340 341 return data;
+1
tools/virtio/.gitignore
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 *.d 3 3 virtio_test 4 + vhost_net_test 4 5 vringh_test 5 6 virtio-trace/trace-agent
+5 -3
tools/virtio/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 2 all: test mod 3 - test: virtio_test vringh_test 3 + test: virtio_test vringh_test vhost_net_test 4 4 virtio_test: virtio_ring.o virtio_test.o 5 5 vringh_test: vringh_test.o vringh.o virtio_ring.o 6 + vhost_net_test: virtio_ring.o vhost_net_test.o 6 7 7 8 try-run = $(shell set -e; \ 8 9 if ($(1)) >/dev/null 2>&1; \ ··· 50 49 51 50 .PHONY: all test mod clean vhost oot oot-clean oot-build 52 51 clean: 53 - ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \ 54 - vhost_test/Module.symvers vhost_test/modules.order *.d 52 + ${RM} *.o vringh_test virtio_test vhost_net_test vhost_test/*.o \ 53 + vhost_test/.*.cmd vhost_test/Module.symvers \ 54 + vhost_test/modules.order *.d 55 55 -include *.d
+4
tools/virtio/linux/virtio_config.h
··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef LINUX_VIRTIO_CONFIG_H 3 + #define LINUX_VIRTIO_CONFIG_H 2 4 #include <linux/virtio_byteorder.h> 3 5 #include <linux/virtio.h> 4 6 #include <uapi/linux/virtio_config.h> ··· 97 95 { 98 96 return __cpu_to_virtio64(virtio_is_little_endian(vdev), val); 99 97 } 98 + 99 + #endif
+532
tools/virtio/vhost_net_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <getopt.h> 4 + #include <limits.h> 5 + #include <string.h> 6 + #include <poll.h> 7 + #include <sys/eventfd.h> 8 + #include <stdlib.h> 9 + #include <assert.h> 10 + #include <unistd.h> 11 + #include <sys/ioctl.h> 12 + #include <sys/stat.h> 13 + #include <sys/types.h> 14 + #include <fcntl.h> 15 + #include <stdbool.h> 16 + #include <linux/vhost.h> 17 + #include <linux/if.h> 18 + #include <linux/if_tun.h> 19 + #include <linux/in.h> 20 + #include <linux/if_packet.h> 21 + #include <linux/virtio_net.h> 22 + #include <netinet/ether.h> 23 + 24 + #define HDR_LEN sizeof(struct virtio_net_hdr_mrg_rxbuf) 25 + #define TEST_BUF_LEN 256 26 + #define TEST_PTYPE ETH_P_LOOPBACK 27 + #define DESC_NUM 256 28 + 29 + /* Used by implementation of kmalloc() in tools/virtio/linux/kernel.h */ 30 + void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end; 31 + 32 + struct vq_info { 33 + int kick; 34 + int call; 35 + int idx; 36 + long started; 37 + long completed; 38 + struct pollfd fds; 39 + void *ring; 40 + /* copy used for control */ 41 + struct vring vring; 42 + struct virtqueue *vq; 43 + }; 44 + 45 + struct vdev_info { 46 + struct virtio_device vdev; 47 + int control; 48 + struct vq_info vqs[2]; 49 + int nvqs; 50 + void *buf; 51 + size_t buf_size; 52 + char *test_buf; 53 + char *res_buf; 54 + struct vhost_memory *mem; 55 + int sock; 56 + int ifindex; 57 + unsigned char mac[ETHER_ADDR_LEN]; 58 + }; 59 + 60 + static int tun_alloc(struct vdev_info *dev, char *tun_name) 61 + { 62 + struct ifreq ifr; 63 + int len = HDR_LEN; 64 + int fd, e; 65 + 66 + fd = open("/dev/net/tun", O_RDWR); 67 + if (fd < 0) { 68 + perror("Cannot open /dev/net/tun"); 69 + return fd; 70 + } 71 + 72 + memset(&ifr, 0, sizeof(ifr)); 73 + 74 + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; 75 + strncpy(ifr.ifr_name, tun_name, IFNAMSIZ); 76 + 77 + e = ioctl(fd, TUNSETIFF, &ifr); 78 + if (e < 0) { 79 + perror("ioctl[TUNSETIFF]"); 80 
+ close(fd); 81 + return e; 82 + } 83 + 84 + e = ioctl(fd, TUNSETVNETHDRSZ, &len); 85 + if (e < 0) { 86 + perror("ioctl[TUNSETVNETHDRSZ]"); 87 + close(fd); 88 + return e; 89 + } 90 + 91 + e = ioctl(fd, SIOCGIFHWADDR, &ifr); 92 + if (e < 0) { 93 + perror("ioctl[SIOCGIFHWADDR]"); 94 + close(fd); 95 + return e; 96 + } 97 + 98 + memcpy(dev->mac, &ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN); 99 + return fd; 100 + } 101 + 102 + static void vdev_create_socket(struct vdev_info *dev, char *tun_name) 103 + { 104 + struct ifreq ifr; 105 + 106 + dev->sock = socket(AF_PACKET, SOCK_RAW, htons(TEST_PTYPE)); 107 + assert(dev->sock != -1); 108 + 109 + strncpy(ifr.ifr_name, tun_name, IFNAMSIZ); 110 + assert(ioctl(dev->sock, SIOCGIFINDEX, &ifr) >= 0); 111 + 112 + dev->ifindex = ifr.ifr_ifindex; 113 + 114 + /* Set the flags that bring the device up */ 115 + assert(ioctl(dev->sock, SIOCGIFFLAGS, &ifr) >= 0); 116 + ifr.ifr_flags |= (IFF_UP | IFF_RUNNING); 117 + assert(ioctl(dev->sock, SIOCSIFFLAGS, &ifr) >= 0); 118 + } 119 + 120 + static void vdev_send_packet(struct vdev_info *dev) 121 + { 122 + char *sendbuf = dev->test_buf + HDR_LEN; 123 + struct sockaddr_ll saddrll = {0}; 124 + int sockfd = dev->sock; 125 + int ret; 126 + 127 + saddrll.sll_family = PF_PACKET; 128 + saddrll.sll_ifindex = dev->ifindex; 129 + saddrll.sll_halen = ETH_ALEN; 130 + saddrll.sll_protocol = htons(TEST_PTYPE); 131 + 132 + ret = sendto(sockfd, sendbuf, TEST_BUF_LEN, 0, 133 + (struct sockaddr *)&saddrll, 134 + sizeof(struct sockaddr_ll)); 135 + assert(ret >= 0); 136 + } 137 + 138 + static bool vq_notify(struct virtqueue *vq) 139 + { 140 + struct vq_info *info = vq->priv; 141 + unsigned long long v = 1; 142 + int r; 143 + 144 + r = write(info->kick, &v, sizeof(v)); 145 + assert(r == sizeof(v)); 146 + 147 + return true; 148 + } 149 + 150 + static void vhost_vq_setup(struct vdev_info *dev, struct vq_info *info) 151 + { 152 + struct vhost_vring_addr addr = { 153 + .index = info->idx, 154 + .desc_user_addr = 
(uint64_t)(unsigned long)info->vring.desc, 155 + .avail_user_addr = (uint64_t)(unsigned long)info->vring.avail, 156 + .used_user_addr = (uint64_t)(unsigned long)info->vring.used, 157 + }; 158 + struct vhost_vring_state state = { .index = info->idx }; 159 + struct vhost_vring_file file = { .index = info->idx }; 160 + int r; 161 + 162 + state.num = info->vring.num; 163 + r = ioctl(dev->control, VHOST_SET_VRING_NUM, &state); 164 + assert(r >= 0); 165 + 166 + state.num = 0; 167 + r = ioctl(dev->control, VHOST_SET_VRING_BASE, &state); 168 + assert(r >= 0); 169 + 170 + r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr); 171 + assert(r >= 0); 172 + 173 + file.fd = info->kick; 174 + r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file); 175 + assert(r >= 0); 176 + } 177 + 178 + static void vq_reset(struct vq_info *info, int num, struct virtio_device *vdev) 179 + { 180 + if (info->vq) 181 + vring_del_virtqueue(info->vq); 182 + 183 + memset(info->ring, 0, vring_size(num, 4096)); 184 + vring_init(&info->vring, num, info->ring, 4096); 185 + info->vq = vring_new_virtqueue(info->idx, num, 4096, vdev, true, false, 186 + info->ring, vq_notify, NULL, "test"); 187 + assert(info->vq); 188 + info->vq->priv = info; 189 + } 190 + 191 + static void vq_info_add(struct vdev_info *dev, int idx, int num, int fd) 192 + { 193 + struct vhost_vring_file backend = { .index = idx, .fd = fd }; 194 + struct vq_info *info = &dev->vqs[idx]; 195 + int r; 196 + 197 + info->idx = idx; 198 + info->kick = eventfd(0, EFD_NONBLOCK); 199 + r = posix_memalign(&info->ring, 4096, vring_size(num, 4096)); 200 + assert(r >= 0); 201 + vq_reset(info, num, &dev->vdev); 202 + vhost_vq_setup(dev, info); 203 + 204 + r = ioctl(dev->control, VHOST_NET_SET_BACKEND, &backend); 205 + assert(!r); 206 + } 207 + 208 + static void vdev_info_init(struct vdev_info *dev, unsigned long long features) 209 + { 210 + struct ether_header *eh; 211 + int i, r; 212 + 213 + dev->vdev.features = features; 214 + 
INIT_LIST_HEAD(&dev->vdev.vqs); 215 + spin_lock_init(&dev->vdev.vqs_list_lock); 216 + 217 + dev->buf_size = (HDR_LEN + TEST_BUF_LEN) * 2; 218 + dev->buf = malloc(dev->buf_size); 219 + assert(dev->buf); 220 + dev->test_buf = dev->buf; 221 + dev->res_buf = dev->test_buf + HDR_LEN + TEST_BUF_LEN; 222 + 223 + memset(dev->test_buf, 0, HDR_LEN + TEST_BUF_LEN); 224 + eh = (struct ether_header *)(dev->test_buf + HDR_LEN); 225 + eh->ether_type = htons(TEST_PTYPE); 226 + memcpy(eh->ether_dhost, dev->mac, ETHER_ADDR_LEN); 227 + memcpy(eh->ether_shost, dev->mac, ETHER_ADDR_LEN); 228 + 229 + for (i = sizeof(*eh); i < TEST_BUF_LEN; i++) 230 + dev->test_buf[i + HDR_LEN] = (char)i; 231 + 232 + dev->control = open("/dev/vhost-net", O_RDWR); 233 + assert(dev->control >= 0); 234 + 235 + r = ioctl(dev->control, VHOST_SET_OWNER, NULL); 236 + assert(r >= 0); 237 + 238 + dev->mem = malloc(offsetof(struct vhost_memory, regions) + 239 + sizeof(dev->mem->regions[0])); 240 + assert(dev->mem); 241 + memset(dev->mem, 0, offsetof(struct vhost_memory, regions) + 242 + sizeof(dev->mem->regions[0])); 243 + dev->mem->nregions = 1; 244 + dev->mem->regions[0].guest_phys_addr = (long)dev->buf; 245 + dev->mem->regions[0].userspace_addr = (long)dev->buf; 246 + dev->mem->regions[0].memory_size = dev->buf_size; 247 + 248 + r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem); 249 + assert(r >= 0); 250 + 251 + r = ioctl(dev->control, VHOST_SET_FEATURES, &features); 252 + assert(r >= 0); 253 + 254 + dev->nvqs = 2; 255 + } 256 + 257 + static void wait_for_interrupt(struct vq_info *vq) 258 + { 259 + unsigned long long val; 260 + 261 + poll(&vq->fds, 1, 100); 262 + 263 + if (vq->fds.revents & POLLIN) 264 + read(vq->fds.fd, &val, sizeof(val)); 265 + } 266 + 267 + static void verify_res_buf(char *res_buf) 268 + { 269 + int i; 270 + 271 + for (i = ETHER_HDR_LEN; i < TEST_BUF_LEN; i++) 272 + assert(res_buf[i] == (char)i); 273 + } 274 + 275 + static void run_tx_test(struct vdev_info *dev, struct vq_info *vq, 276 
+ bool delayed, int bufs) 277 + { 278 + long long spurious = 0; 279 + struct scatterlist sl; 280 + unsigned int len; 281 + int r; 282 + 283 + for (;;) { 284 + long started_before = vq->started; 285 + long completed_before = vq->completed; 286 + 287 + virtqueue_disable_cb(vq->vq); 288 + do { 289 + while (vq->started < bufs && 290 + (vq->started - vq->completed) < 1) { 291 + sg_init_one(&sl, dev->test_buf, HDR_LEN + TEST_BUF_LEN); 292 + r = virtqueue_add_outbuf(vq->vq, &sl, 1, 293 + dev->test_buf + vq->started, 294 + GFP_ATOMIC); 295 + if (unlikely(r != 0)) 296 + break; 297 + 298 + ++vq->started; 299 + 300 + if (unlikely(!virtqueue_kick(vq->vq))) { 301 + r = -1; 302 + break; 303 + } 304 + } 305 + 306 + if (vq->started >= bufs) 307 + r = -1; 308 + 309 + /* Flush out completed bufs if any */ 310 + while (virtqueue_get_buf(vq->vq, &len)) { 311 + int n; 312 + 313 + n = recvfrom(dev->sock, dev->res_buf, TEST_BUF_LEN, 0, NULL, NULL); 314 + assert(n == TEST_BUF_LEN); 315 + verify_res_buf(dev->res_buf); 316 + 317 + ++vq->completed; 318 + r = 0; 319 + } 320 + } while (r == 0); 321 + 322 + if (vq->completed == completed_before && vq->started == started_before) 323 + ++spurious; 324 + 325 + assert(vq->completed <= bufs); 326 + assert(vq->started <= bufs); 327 + if (vq->completed == bufs) 328 + break; 329 + 330 + if (delayed) { 331 + if (virtqueue_enable_cb_delayed(vq->vq)) 332 + wait_for_interrupt(vq); 333 + } else { 334 + if (virtqueue_enable_cb(vq->vq)) 335 + wait_for_interrupt(vq); 336 + } 337 + } 338 + printf("TX spurious wakeups: 0x%llx started=0x%lx completed=0x%lx\n", 339 + spurious, vq->started, vq->completed); 340 + } 341 + 342 + static void run_rx_test(struct vdev_info *dev, struct vq_info *vq, 343 + bool delayed, int bufs) 344 + { 345 + long long spurious = 0; 346 + struct scatterlist sl; 347 + unsigned int len; 348 + int r; 349 + 350 + for (;;) { 351 + long started_before = vq->started; 352 + long completed_before = vq->completed; 353 + 354 + do { 355 + while 
(vq->started < bufs && 356 + (vq->started - vq->completed) < 1) { 357 + sg_init_one(&sl, dev->res_buf, HDR_LEN + TEST_BUF_LEN); 358 + 359 + r = virtqueue_add_inbuf(vq->vq, &sl, 1, 360 + dev->res_buf + vq->started, 361 + GFP_ATOMIC); 362 + if (unlikely(r != 0)) 363 + break; 364 + 365 + ++vq->started; 366 + 367 + vdev_send_packet(dev); 368 + 369 + if (unlikely(!virtqueue_kick(vq->vq))) { 370 + r = -1; 371 + break; 372 + } 373 + } 374 + 375 + if (vq->started >= bufs) 376 + r = -1; 377 + 378 + /* Flush out completed bufs if any */ 379 + while (virtqueue_get_buf(vq->vq, &len)) { 380 + struct ether_header *eh; 381 + 382 + eh = (struct ether_header *)(dev->res_buf + HDR_LEN); 383 + 384 + /* tun netdev is up and running, only handle the 385 + * TEST_PTYPE packet. 386 + */ 387 + if (eh->ether_type == htons(TEST_PTYPE)) { 388 + assert(len == TEST_BUF_LEN + HDR_LEN); 389 + verify_res_buf(dev->res_buf + HDR_LEN); 390 + } 391 + 392 + ++vq->completed; 393 + r = 0; 394 + } 395 + } while (r == 0); 396 + 397 + if (vq->completed == completed_before && vq->started == started_before) 398 + ++spurious; 399 + 400 + assert(vq->completed <= bufs); 401 + assert(vq->started <= bufs); 402 + if (vq->completed == bufs) 403 + break; 404 + } 405 + 406 + printf("RX spurious wakeups: 0x%llx started=0x%lx completed=0x%lx\n", 407 + spurious, vq->started, vq->completed); 408 + } 409 + 410 + static const char optstring[] = "h"; 411 + static const struct option longopts[] = { 412 + { 413 + .name = "help", 414 + .val = 'h', 415 + }, 416 + { 417 + .name = "event-idx", 418 + .val = 'E', 419 + }, 420 + { 421 + .name = "no-event-idx", 422 + .val = 'e', 423 + }, 424 + { 425 + .name = "indirect", 426 + .val = 'I', 427 + }, 428 + { 429 + .name = "no-indirect", 430 + .val = 'i', 431 + }, 432 + { 433 + .name = "virtio-1", 434 + .val = '1', 435 + }, 436 + { 437 + .name = "no-virtio-1", 438 + .val = '0', 439 + }, 440 + { 441 + .name = "delayed-interrupt", 442 + .val = 'D', 443 + }, 444 + { 445 + .name = 
"no-delayed-interrupt", 446 + .val = 'd', 447 + }, 448 + { 449 + .name = "buf-num", 450 + .val = 'n', 451 + .has_arg = required_argument, 452 + }, 453 + { 454 + .name = "batch", 455 + .val = 'b', 456 + .has_arg = required_argument, 457 + }, 458 + { 459 + } 460 + }; 461 + 462 + static void help(int status) 463 + { 464 + fprintf(stderr, "Usage: vhost_net_test [--help]" 465 + " [--no-indirect]" 466 + " [--no-event-idx]" 467 + " [--no-virtio-1]" 468 + " [--delayed-interrupt]" 469 + " [--buf-num]" 470 + "\n"); 471 + 472 + exit(status); 473 + } 474 + 475 + int main(int argc, char **argv) 476 + { 477 + unsigned long long features = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | 478 + (1ULL << VIRTIO_RING_F_EVENT_IDX) | (1ULL << VIRTIO_F_VERSION_1); 479 + char tun_name[IFNAMSIZ]; 480 + long nbufs = 0x100000; 481 + struct vdev_info dev; 482 + bool delayed = false; 483 + int o, fd; 484 + 485 + for (;;) { 486 + o = getopt_long(argc, argv, optstring, longopts, NULL); 487 + switch (o) { 488 + case -1: 489 + goto done; 490 + case '?': 491 + help(2); 492 + case 'e': 493 + features &= ~(1ULL << VIRTIO_RING_F_EVENT_IDX); 494 + break; 495 + case 'h': 496 + help(0); 497 + case 'i': 498 + features &= ~(1ULL << VIRTIO_RING_F_INDIRECT_DESC); 499 + break; 500 + case '0': 501 + features &= ~(1ULL << VIRTIO_F_VERSION_1); 502 + break; 503 + case 'D': 504 + delayed = true; 505 + break; 506 + case 'n': 507 + nbufs = strtol(optarg, NULL, 10); 508 + assert(nbufs > 0); 509 + break; 510 + default: 511 + assert(0); 512 + break; 513 + } 514 + } 515 + 516 + done: 517 + memset(&dev, 0, sizeof(dev)); 518 + snprintf(tun_name, IFNAMSIZ, "tun_%d", getpid()); 519 + 520 + fd = tun_alloc(&dev, tun_name); 521 + assert(fd >= 0); 522 + 523 + vdev_info_init(&dev, features); 524 + vq_info_add(&dev, 0, DESC_NUM, fd); 525 + vq_info_add(&dev, 1, DESC_NUM, fd); 526 + vdev_create_socket(&dev, tun_name); 527 + 528 + run_rx_test(&dev, &dev.vqs[0], delayed, nbufs); 529 + run_tx_test(&dev, &dev.vqs[1], delayed, nbufs); 530 + 
531 + return 0; 532 + }