Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-5.20/io_uring-zerocopy-send-2022-07-29' of git://git.kernel.dk/linux-block

Pull io_uring zerocopy support from Jens Axboe:
"This adds support for efficient zerocopy sends through
io_uring. Both ipv4 and ipv6 are supported, as well as both TCP and
UDP.

The core network changes to support this are in a stable branch from
Jakub that both io_uring and net-next have pulled in, and the io_uring
changes are layered on top of that.

All of the work has been done by Pavel"

* tag 'for-5.20/io_uring-zerocopy-send-2022-07-29' of git://git.kernel.dk/linux-block: (34 commits)
io_uring: notification completion optimisation
io_uring: export req alloc from core
io_uring/net: use unsigned for flags
io_uring/net: make page accounting more consistent
io_uring/net: checks errors of zc mem accounting
io_uring/net: improve io_get_notif_slot types
selftests/io_uring: test zerocopy send
io_uring: enable managed frags with register buffers
io_uring: add zc notification flush requests
io_uring: rename IORING_OP_FILES_UPDATE
io_uring: flush notifiers after sendzc
io_uring: sendzc with fixed buffers
io_uring: allow to pass addr into sendzc
io_uring: account locked pages for non-fixed zc
io_uring: wire send zc request type
io_uring: add notification slot registration
io_uring: add rsrc referencing for notifiers
io_uring: complete notifiers in tw
io_uring: cache struct io_notif
io_uring: add zc notification infrastructure
...

+1604 -158
+30
include/linux/io_uring_types.h
··· 4 4 #include <linux/blkdev.h> 5 5 #include <linux/task_work.h> 6 6 #include <linux/bitmap.h> 7 + #include <linux/llist.h> 7 8 #include <uapi/linux/io_uring.h> 8 9 9 10 struct io_wq_work_node { ··· 34 33 unsigned int alloc_hint; 35 34 }; 36 35 36 + struct io_notif; 37 + struct io_notif_slot; 38 + 37 39 struct io_hash_bucket { 38 40 spinlock_t lock; 39 41 struct hlist_head list; ··· 45 41 struct io_hash_table { 46 42 struct io_hash_bucket *hbs; 47 43 unsigned hash_bits; 44 + }; 45 + 46 + /* 47 + * Arbitrary limit, can be raised if need be 48 + */ 49 + #define IO_RINGFD_REG_MAX 16 50 + 51 + struct io_uring_task { 52 + /* submission side */ 53 + int cached_refs; 54 + const struct io_ring_ctx *last; 55 + struct io_wq *io_wq; 56 + struct file *registered_rings[IO_RINGFD_REG_MAX]; 57 + 58 + struct xarray xa; 59 + struct wait_queue_head wait; 60 + atomic_t in_idle; 61 + atomic_t inflight_tracked; 62 + struct percpu_counter inflight; 63 + 64 + struct { /* task_work */ 65 + struct llist_head task_list; 66 + struct callback_head task_work; 67 + } ____cacheline_aligned_in_smp; 48 68 }; 49 69 50 70 struct io_uring { ··· 240 212 unsigned nr_user_files; 241 213 unsigned nr_user_bufs; 242 214 struct io_mapped_ubuf **user_bufs; 215 + struct io_notif_slot *notif_slots; 216 + unsigned nr_notif_slots; 243 217 244 218 struct io_submit_state submit_state; 245 219
+48 -18
include/linux/skbuff.h
··· 686 686 * charged to the kernel memory. 687 687 */ 688 688 SKBFL_PURE_ZEROCOPY = BIT(2), 689 + 690 + SKBFL_DONT_ORPHAN = BIT(3), 691 + 692 + /* page references are managed by the ubuf_info, so it's safe to 693 + * use frags only up until ubuf_info is released 694 + */ 695 + SKBFL_MANAGED_FRAG_REFS = BIT(4), 689 696 }; 690 697 691 698 #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) 692 - #define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY) 699 + #define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \ 700 + SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS) 693 701 694 702 /* 695 703 * The callback notifies userspace to release buffers when skb DMA is done in ··· 1781 1773 void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, 1782 1774 bool success); 1783 1775 1784 - int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, 1785 - struct iov_iter *from, size_t length); 1776 + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, 1777 + struct sk_buff *skb, struct iov_iter *from, 1778 + size_t length); 1786 1779 1787 1780 static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, 1788 1781 struct msghdr *msg, int len) 1789 1782 { 1790 - return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); 1783 + return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); 1791 1784 } 1792 1785 1793 1786 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, ··· 1813 1804 static inline bool skb_zcopy_pure(const struct sk_buff *skb) 1814 1805 { 1815 1806 return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; 1807 + } 1808 + 1809 + static inline bool skb_zcopy_managed(const struct sk_buff *skb) 1810 + { 1811 + return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS; 1816 1812 } 1817 1813 1818 1814 static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, ··· 1892 1878 1893 1879 skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; 1894 1880 } 1881 + } 
1882 + 1883 + void __skb_zcopy_downgrade_managed(struct sk_buff *skb); 1884 + 1885 + static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb) 1886 + { 1887 + if (unlikely(skb_zcopy_managed(skb))) 1888 + __skb_zcopy_downgrade_managed(skb); 1895 1889 } 1896 1890 1897 1891 static inline void skb_mark_not_on_list(struct sk_buff *skb) ··· 2550 2528 return skb_headlen(skb) + __skb_pagelen(skb); 2551 2529 } 2552 2530 2531 + static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo, 2532 + int i, struct page *page, 2533 + int off, int size) 2534 + { 2535 + skb_frag_t *frag = &shinfo->frags[i]; 2536 + 2537 + /* 2538 + * Propagate page pfmemalloc to the skb if we can. The problem is 2539 + * that not all callers have unique ownership of the page but rely 2540 + * on page_is_pfmemalloc doing the right thing(tm). 2541 + */ 2542 + frag->bv_page = page; 2543 + frag->bv_offset = off; 2544 + skb_frag_size_set(frag, size); 2545 + } 2546 + 2553 2547 /** 2554 2548 * __skb_fill_page_desc - initialise a paged fragment in an skb 2555 2549 * @skb: buffer containing fragment to be initialised ··· 2582 2544 static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, 2583 2545 struct page *page, int off, int size) 2584 2546 { 2585 - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2586 - 2587 - /* 2588 - * Propagate page pfmemalloc to the skb if we can. The problem is 2589 - * that not all callers have unique ownership of the page but rely 2590 - * on page_is_pfmemalloc doing the right thing(tm). 
2591 - */ 2592 - frag->bv_page = page; 2593 - frag->bv_offset = off; 2594 - skb_frag_size_set(frag, size); 2595 - 2547 + __skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size); 2596 2548 page = compound_head(page); 2597 2549 if (page_is_pfmemalloc(page)) 2598 2550 skb->pfmemalloc = true; ··· 3210 3182 { 3211 3183 if (likely(!skb_zcopy(skb))) 3212 3184 return 0; 3213 - if (!skb_zcopy_is_nouarg(skb) && 3214 - skb_uarg(skb)->callback == msg_zerocopy_callback) 3185 + if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN) 3215 3186 return 0; 3216 3187 return skb_copy_ubufs(skb, gfp_mask); 3217 3188 } ··· 3523 3496 */ 3524 3497 static inline void skb_frag_unref(struct sk_buff *skb, int f) 3525 3498 { 3526 - __skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle); 3499 + struct skb_shared_info *shinfo = skb_shinfo(skb); 3500 + 3501 + if (!skb_zcopy_managed(skb)) 3502 + __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle); 3527 3503 } 3528 3504 3529 3505 /**
+5
include/linux/socket.h
··· 14 14 struct pid; 15 15 struct cred; 16 16 struct socket; 17 + struct sock; 18 + struct sk_buff; 17 19 18 20 #define __sockaddr_check_size(size) \ 19 21 BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage))) ··· 71 69 unsigned int msg_flags; /* flags on received message */ 72 70 __kernel_size_t msg_controllen; /* ancillary data buffer length */ 73 71 struct kiocb *msg_iocb; /* ptr to iocb for async requests */ 72 + struct ubuf_info *msg_ubuf; 73 + int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb, 74 + struct iov_iter *from, size_t length); 74 75 }; 75 76 76 77 struct user_msghdr {
+43 -2
include/uapi/linux/io_uring.h
··· 66 66 union { 67 67 __s32 splice_fd_in; 68 68 __u32 file_index; 69 + struct { 70 + __u16 notification_idx; 71 + __u16 addr_len; 72 + }; 69 73 }; 70 74 union { 71 75 struct { ··· 174 170 IORING_OP_FALLOCATE, 175 171 IORING_OP_OPENAT, 176 172 IORING_OP_CLOSE, 177 - IORING_OP_FILES_UPDATE, 173 + IORING_OP_RSRC_UPDATE, 174 + IORING_OP_FILES_UPDATE = IORING_OP_RSRC_UPDATE, 178 175 IORING_OP_STATX, 179 176 IORING_OP_READ, 180 177 IORING_OP_WRITE, ··· 202 197 IORING_OP_GETXATTR, 203 198 IORING_OP_SOCKET, 204 199 IORING_OP_URING_CMD, 200 + IORING_OP_SENDZC_NOTIF, 205 201 206 202 /* this goes last, obviously */ 207 203 IORING_OP_LAST, ··· 224 218 #define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) 225 219 #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) 226 220 #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) 221 + 227 222 /* 228 223 * sqe->splice_flags 229 224 * extends splice(2) flags ··· 274 267 * IORING_RECV_MULTISHOT Multishot recv. Sets IORING_CQE_F_MORE if 275 268 * the handler will continue to report 276 269 * CQEs on behalf of the same SQE. 270 + * 271 + * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in 272 + * the buf_index field. 273 + * 274 + * IORING_RECVSEND_NOTIF_FLUSH Flush a notification after a successful 275 + * successful. Only for zerocopy sends. 
277 276 */ 278 277 #define IORING_RECVSEND_POLL_FIRST (1U << 0) 279 - #define IORING_RECV_MULTISHOT (1U << 1) 278 + #define IORING_RECV_MULTISHOT (1U << 1) 279 + #define IORING_RECVSEND_FIXED_BUF (1U << 2) 280 + #define IORING_RECVSEND_NOTIF_FLUSH (1U << 3) 280 281 281 282 /* 282 283 * accept flags stored in sqe->ioprio 283 284 */ 284 285 #define IORING_ACCEPT_MULTISHOT (1U << 0) 286 + 287 + 288 + /* 289 + * IORING_OP_RSRC_UPDATE flags 290 + */ 291 + enum { 292 + IORING_RSRC_UPDATE_FILES, 293 + IORING_RSRC_UPDATE_NOTIF, 294 + }; 285 295 286 296 /* 287 297 * IORING_OP_MSG_RING command types, stored in sqe->addr ··· 481 457 /* register a range of fixed file slots for automatic slot allocation */ 482 458 IORING_REGISTER_FILE_ALLOC_RANGE = 25, 483 459 460 + /* zerocopy notification API */ 461 + IORING_REGISTER_NOTIFIERS = 26, 462 + IORING_UNREGISTER_NOTIFIERS = 27, 463 + 484 464 /* this goes last */ 485 465 IORING_REGISTER_LAST 486 466 }; ··· 529 501 __aligned_u64 tags; 530 502 __u32 nr; 531 503 __u32 resv2; 504 + }; 505 + 506 + struct io_uring_notification_slot { 507 + __u64 tag; 508 + __u64 resv[3]; 509 + }; 510 + 511 + struct io_uring_notification_register { 512 + __u32 nr_slots; 513 + __u32 resv; 514 + __u64 resv2; 515 + __u64 data; 516 + __u64 resv3; 532 517 }; 533 518 534 519 /* Skip updating fd indexes set to this value in the fd table */
+1 -1
io_uring/Makefile
··· 7 7 openclose.o uring_cmd.o epoll.o \ 8 8 statx.o net.o msg_ring.o timeout.o \ 9 9 sqpoll.o fdinfo.o tctx.o poll.o \ 10 - cancel.o kbuf.o rsrc.o rw.o opdef.o 10 + cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o 11 11 obj-$(CONFIG_IO_WQ) += io-wq.o
+17 -44
io_uring/io_uring.c
··· 90 90 #include "rsrc.h" 91 91 #include "cancel.h" 92 92 #include "net.h" 93 + #include "notif.h" 93 94 94 95 #include "timeout.h" 95 96 #include "poll.h" ··· 609 608 return ret; 610 609 } 611 610 612 - static void __io_put_task(struct task_struct *task, int nr) 611 + void __io_put_task(struct task_struct *task, int nr) 613 612 { 614 613 struct io_uring_task *tctx = task->io_uring; 615 614 ··· 619 618 put_task_struct_many(task, nr); 620 619 } 621 620 622 - /* must to be called somewhat shortly after putting a request */ 623 - static inline void io_put_task(struct task_struct *task, int nr) 624 - { 625 - if (likely(task == current)) 626 - task->io_uring->cached_refs += nr; 627 - else 628 - __io_put_task(task, nr); 629 - } 630 - 631 - static void io_task_refs_refill(struct io_uring_task *tctx) 621 + void io_task_refs_refill(struct io_uring_task *tctx) 632 622 { 633 623 unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; 634 624 635 625 percpu_counter_add(&tctx->inflight, refill); 636 626 refcount_add(refill, &current->usage); 637 627 tctx->cached_refs += refill; 638 - } 639 - 640 - static inline void io_get_task_refs(int nr) 641 - { 642 - struct io_uring_task *tctx = current->io_uring; 643 - 644 - tctx->cached_refs -= nr; 645 - if (unlikely(tctx->cached_refs < 0)) 646 - io_task_refs_refill(tctx); 647 628 } 648 629 649 630 static __cold void io_uring_drop_tctx_refs(struct task_struct *task) ··· 724 741 return &rings->cqes[off]; 725 742 } 726 743 727 - static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, 728 - u64 user_data, s32 res, u32 cflags, 729 - bool allow_overflow) 744 + bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, 745 + bool allow_overflow) 730 746 { 731 747 struct io_uring_cqe *cqe; 732 748 ··· 850 868 spin_unlock(&ctx->completion_lock); 851 869 } 852 870 853 - static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) 854 - { 855 - return !ctx->submit_state.free_list.next; 856 - } 857 - 858 871 /* 859 
872 * A request might get retired back into the request caches even before opcode 860 873 * handlers and io_issue_sqe() are done with it, e.g. inline completion path. 861 874 * Because of that, io_alloc_req() should be called only under ->uring_lock 862 875 * and with extra caution to not get a request that is still worked on. 863 876 */ 864 - static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) 877 + __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) 865 878 __must_hold(&ctx->uring_lock) 866 879 { 867 880 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; ··· 895 918 io_req_add_to_cache(req, ctx); 896 919 } 897 920 return true; 898 - } 899 - 900 - static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) 901 - { 902 - if (unlikely(io_req_cache_empty(ctx))) 903 - return __io_alloc_req_refill(ctx); 904 - return true; 905 - } 906 - 907 - static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) 908 - { 909 - struct io_wq_work_node *node; 910 - 911 - node = wq_stack_extract(&ctx->submit_state.free_list); 912 - return container_of(node, struct io_kiocb, comp_list); 913 921 } 914 922 915 923 static inline void io_dismantle_req(struct io_kiocb *req) ··· 2462 2500 } 2463 2501 #endif 2464 2502 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); 2503 + WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots); 2465 2504 2466 2505 io_mem_free(ctx->rings); 2467 2506 io_mem_free(ctx->sq_sqes); ··· 2639 2676 io_unregister_personality(ctx, index); 2640 2677 if (ctx->rings) 2641 2678 io_poll_remove_all(ctx, NULL, true); 2679 + io_notif_unregister(ctx); 2642 2680 mutex_unlock(&ctx->uring_lock); 2643 2681 2644 2682 /* failed during ring init, it couldn't have issued any requests */ ··· 3837 3873 if (!arg || nr_args) 3838 3874 break; 3839 3875 ret = io_register_file_alloc_range(ctx, arg); 3876 + break; 3877 + case IORING_REGISTER_NOTIFIERS: 3878 + ret = io_notif_register(ctx, arg, nr_args); 3879 + break; 3880 + case IORING_UNREGISTER_NOTIFIERS: 3881 + ret = 
-EINVAL; 3882 + if (arg || nr_args) 3883 + break; 3884 + ret = io_notif_unregister(ctx); 3840 3885 break; 3841 3886 default: 3842 3887 ret = -EINVAL;
+43
io_uring/io_uring.h
··· 33 33 void __io_req_complete_post(struct io_kiocb *req); 34 34 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, 35 35 bool allow_overflow); 36 + bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, 37 + bool allow_overflow); 36 38 void __io_commit_cqring_flush(struct io_ring_ctx *ctx); 37 39 38 40 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); ··· 73 71 74 72 void io_free_req(struct io_kiocb *req); 75 73 void io_queue_next(struct io_kiocb *req); 74 + void __io_put_task(struct task_struct *task, int nr); 75 + void io_task_refs_refill(struct io_uring_task *tctx); 76 + bool __io_alloc_req_refill(struct io_ring_ctx *ctx); 76 77 77 78 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, 78 79 bool cancel_all); ··· 261 256 { 262 257 if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) 263 258 __io_commit_cqring_flush(ctx); 259 + } 260 + 261 + /* must to be called somewhat shortly after putting a request */ 262 + static inline void io_put_task(struct task_struct *task, int nr) 263 + { 264 + if (likely(task == current)) 265 + task->io_uring->cached_refs += nr; 266 + else 267 + __io_put_task(task, nr); 268 + } 269 + 270 + static inline void io_get_task_refs(int nr) 271 + { 272 + struct io_uring_task *tctx = current->io_uring; 273 + 274 + tctx->cached_refs -= nr; 275 + if (unlikely(tctx->cached_refs < 0)) 276 + io_task_refs_refill(tctx); 277 + } 278 + 279 + static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) 280 + { 281 + return !ctx->submit_state.free_list.next; 282 + } 283 + 284 + static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) 285 + { 286 + if (unlikely(io_req_cache_empty(ctx))) 287 + return __io_alloc_req_refill(ctx); 288 + return true; 289 + } 290 + 291 + static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) 292 + { 293 + struct io_wq_work_node *node; 294 + 295 + node = 
wq_stack_extract(&ctx->submit_state.free_list); 296 + return container_of(node, struct io_kiocb, comp_list); 264 297 } 265 298 266 299 #endif
+191 -2
io_uring/net.c
··· 14 14 #include "kbuf.h" 15 15 #include "alloc_cache.h" 16 16 #include "net.h" 17 + #include "notif.h" 18 + #include "rsrc.h" 17 19 18 20 #if defined(CONFIG_NET) 19 21 struct io_shutdown { ··· 55 53 struct user_msghdr __user *umsg; 56 54 void __user *buf; 57 55 }; 58 - int msg_flags; 56 + unsigned msg_flags; 57 + unsigned flags; 59 58 size_t len; 60 59 size_t done_io; 61 - unsigned int flags; 60 + }; 61 + 62 + struct io_sendzc { 63 + struct file *file; 64 + void __user *buf; 65 + size_t len; 66 + u16 slot_idx; 67 + unsigned msg_flags; 68 + unsigned flags; 69 + unsigned addr_len; 70 + void __user *addr; 62 71 }; 63 72 64 73 #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) ··· 307 294 msg.msg_control = NULL; 308 295 msg.msg_controllen = 0; 309 296 msg.msg_namelen = 0; 297 + msg.msg_ubuf = NULL; 310 298 311 299 flags = sr->msg_flags; 312 300 if (issue_flags & IO_URING_F_NONBLOCK) ··· 797 783 msg.msg_flags = 0; 798 784 msg.msg_controllen = 0; 799 785 msg.msg_iocb = NULL; 786 + msg.msg_ubuf = NULL; 800 787 801 788 flags = sr->msg_flags; 802 789 if (force_nonblock) ··· 845 830 goto retry_multishot; 846 831 847 832 return ret; 833 + } 834 + 835 + int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 836 + { 837 + struct io_sendzc *zc = io_kiocb_to_cmd(req); 838 + struct io_ring_ctx *ctx = req->ctx; 839 + 840 + if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)) 841 + return -EINVAL; 842 + 843 + zc->flags = READ_ONCE(sqe->ioprio); 844 + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | 845 + IORING_RECVSEND_FIXED_BUF | IORING_RECVSEND_NOTIF_FLUSH)) 846 + return -EINVAL; 847 + if (zc->flags & IORING_RECVSEND_FIXED_BUF) { 848 + unsigned idx = READ_ONCE(sqe->buf_index); 849 + 850 + if (unlikely(idx >= ctx->nr_user_bufs)) 851 + return -EFAULT; 852 + idx = array_index_nospec(idx, ctx->nr_user_bufs); 853 + req->imu = READ_ONCE(ctx->user_bufs[idx]); 854 + io_req_set_rsrc_node(req, ctx, 0); 855 + } 856 + 857 + zc->buf = 
u64_to_user_ptr(READ_ONCE(sqe->addr)); 858 + zc->len = READ_ONCE(sqe->len); 859 + zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 860 + zc->slot_idx = READ_ONCE(sqe->notification_idx); 861 + if (zc->msg_flags & MSG_DONTWAIT) 862 + req->flags |= REQ_F_NOWAIT; 863 + 864 + zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 865 + zc->addr_len = READ_ONCE(sqe->addr_len); 866 + 867 + #ifdef CONFIG_COMPAT 868 + if (req->ctx->compat) 869 + zc->msg_flags |= MSG_CMSG_COMPAT; 870 + #endif 871 + return 0; 872 + } 873 + 874 + static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, 875 + struct iov_iter *from, size_t length) 876 + { 877 + struct skb_shared_info *shinfo = skb_shinfo(skb); 878 + int frag = shinfo->nr_frags; 879 + int ret = 0; 880 + struct bvec_iter bi; 881 + ssize_t copied = 0; 882 + unsigned long truesize = 0; 883 + 884 + if (!shinfo->nr_frags) 885 + shinfo->flags |= SKBFL_MANAGED_FRAG_REFS; 886 + 887 + if (!skb_zcopy_managed(skb) || !iov_iter_is_bvec(from)) { 888 + skb_zcopy_downgrade_managed(skb); 889 + return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); 890 + } 891 + 892 + bi.bi_size = min(from->count, length); 893 + bi.bi_bvec_done = from->iov_offset; 894 + bi.bi_idx = 0; 895 + 896 + while (bi.bi_size && frag < MAX_SKB_FRAGS) { 897 + struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi); 898 + 899 + copied += v.bv_len; 900 + truesize += PAGE_ALIGN(v.bv_len + v.bv_offset); 901 + __skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page, 902 + v.bv_offset, v.bv_len); 903 + bvec_iter_advance_single(from->bvec, &bi, v.bv_len); 904 + } 905 + if (bi.bi_size) 906 + ret = -EMSGSIZE; 907 + 908 + shinfo->nr_frags = frag; 909 + from->bvec += bi.bi_idx; 910 + from->nr_segs -= bi.bi_idx; 911 + from->count = bi.bi_size; 912 + from->iov_offset = bi.bi_bvec_done; 913 + 914 + skb->data_len += copied; 915 + skb->len += copied; 916 + skb->truesize += truesize; 917 + 918 + if (sk && sk->sk_type == SOCK_STREAM) { 919 + sk_wmem_queued_add(sk, truesize); 920 + 
if (!skb_zcopy_pure(skb)) 921 + sk_mem_charge(sk, truesize); 922 + } else { 923 + refcount_add(truesize, &skb->sk->sk_wmem_alloc); 924 + } 925 + return ret; 926 + } 927 + 928 + int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) 929 + { 930 + struct sockaddr_storage address; 931 + struct io_ring_ctx *ctx = req->ctx; 932 + struct io_sendzc *zc = io_kiocb_to_cmd(req); 933 + struct io_notif_slot *notif_slot; 934 + struct io_kiocb *notif; 935 + struct msghdr msg; 936 + struct iovec iov; 937 + struct socket *sock; 938 + unsigned msg_flags; 939 + int ret, min_ret = 0; 940 + 941 + if (!(req->flags & REQ_F_POLLED) && 942 + (zc->flags & IORING_RECVSEND_POLL_FIRST)) 943 + return -EAGAIN; 944 + 945 + if (issue_flags & IO_URING_F_UNLOCKED) 946 + return -EAGAIN; 947 + sock = sock_from_file(req->file); 948 + if (unlikely(!sock)) 949 + return -ENOTSOCK; 950 + 951 + notif_slot = io_get_notif_slot(ctx, zc->slot_idx); 952 + if (!notif_slot) 953 + return -EINVAL; 954 + notif = io_get_notif(ctx, notif_slot); 955 + if (!notif) 956 + return -ENOMEM; 957 + 958 + msg.msg_name = NULL; 959 + msg.msg_control = NULL; 960 + msg.msg_controllen = 0; 961 + msg.msg_namelen = 0; 962 + 963 + if (zc->flags & IORING_RECVSEND_FIXED_BUF) { 964 + ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu, 965 + (u64)(uintptr_t)zc->buf, zc->len); 966 + if (unlikely(ret)) 967 + return ret; 968 + } else { 969 + ret = import_single_range(WRITE, zc->buf, zc->len, &iov, 970 + &msg.msg_iter); 971 + if (unlikely(ret)) 972 + return ret; 973 + ret = io_notif_account_mem(notif, zc->len); 974 + if (unlikely(ret)) 975 + return ret; 976 + } 977 + 978 + if (zc->addr) { 979 + ret = move_addr_to_kernel(zc->addr, zc->addr_len, &address); 980 + if (unlikely(ret < 0)) 981 + return ret; 982 + msg.msg_name = (struct sockaddr *)&address; 983 + msg.msg_namelen = zc->addr_len; 984 + } 985 + 986 + msg_flags = zc->msg_flags | MSG_ZEROCOPY; 987 + if (issue_flags & IO_URING_F_NONBLOCK) 988 + msg_flags |= MSG_DONTWAIT; 989 + if 
(msg_flags & MSG_WAITALL) 990 + min_ret = iov_iter_count(&msg.msg_iter); 991 + 992 + msg.msg_flags = msg_flags; 993 + msg.msg_ubuf = &io_notif_to_data(notif)->uarg; 994 + msg.sg_from_iter = io_sg_from_iter; 995 + ret = sock_sendmsg(sock, &msg); 996 + 997 + if (unlikely(ret < min_ret)) { 998 + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 999 + return -EAGAIN; 1000 + return ret == -ERESTARTSYS ? -EINTR : ret; 1001 + } 1002 + 1003 + if (zc->flags & IORING_RECVSEND_NOTIF_FLUSH) 1004 + io_notif_slot_flush_submit(notif_slot, 0); 1005 + io_req_set_res(req, ret, 0); 1006 + return IOU_OK; 848 1007 } 849 1008 850 1009 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+3
io_uring/net.h
··· 52 52 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 53 53 int io_connect(struct io_kiocb *req, unsigned int issue_flags); 54 54 55 + int io_sendzc(struct io_kiocb *req, unsigned int issue_flags); 56 + int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 57 + 55 58 void io_netmsg_cache_free(struct io_cache_entry *entry); 56 59 #else 57 60 static inline void io_netmsg_cache_free(struct io_cache_entry *entry)
+159
io_uring/notif.c
··· 1 + #include <linux/kernel.h> 2 + #include <linux/errno.h> 3 + #include <linux/file.h> 4 + #include <linux/slab.h> 5 + #include <linux/net.h> 6 + #include <linux/io_uring.h> 7 + 8 + #include "io_uring.h" 9 + #include "notif.h" 10 + #include "rsrc.h" 11 + 12 + static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked) 13 + { 14 + struct io_notif_data *nd = io_notif_to_data(notif); 15 + struct io_ring_ctx *ctx = notif->ctx; 16 + 17 + if (nd->account_pages && ctx->user) { 18 + __io_unaccount_mem(ctx->user, nd->account_pages); 19 + nd->account_pages = 0; 20 + } 21 + io_req_task_complete(notif, locked); 22 + } 23 + 24 + static inline void io_notif_complete(struct io_kiocb *notif) 25 + __must_hold(&notif->ctx->uring_lock) 26 + { 27 + bool locked = true; 28 + 29 + __io_notif_complete_tw(notif, &locked); 30 + } 31 + 32 + static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, 33 + struct ubuf_info *uarg, 34 + bool success) 35 + { 36 + struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); 37 + struct io_kiocb *notif = cmd_to_io_kiocb(nd); 38 + 39 + if (refcount_dec_and_test(&uarg->refcnt)) { 40 + notif->io_task_work.func = __io_notif_complete_tw; 41 + io_req_task_work_add(notif); 42 + } 43 + } 44 + 45 + struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, 46 + struct io_notif_slot *slot) 47 + __must_hold(&ctx->uring_lock) 48 + { 49 + struct io_kiocb *notif; 50 + struct io_notif_data *nd; 51 + 52 + if (unlikely(!io_alloc_req_refill(ctx))) 53 + return NULL; 54 + notif = io_alloc_req(ctx); 55 + notif->opcode = IORING_OP_NOP; 56 + notif->flags = 0; 57 + notif->file = NULL; 58 + notif->task = current; 59 + io_get_task_refs(1); 60 + notif->rsrc_node = NULL; 61 + io_req_set_rsrc_node(notif, ctx, 0); 62 + notif->cqe.user_data = slot->tag; 63 + notif->cqe.flags = slot->seq++; 64 + notif->cqe.res = 0; 65 + 66 + nd = io_notif_to_data(notif); 67 + nd->account_pages = 0; 68 + nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; 69 + 
nd->uarg.callback = io_uring_tx_zerocopy_callback; 70 + /* master ref owned by io_notif_slot, will be dropped on flush */ 71 + refcount_set(&nd->uarg.refcnt, 1); 72 + return notif; 73 + } 74 + 75 + void io_notif_slot_flush(struct io_notif_slot *slot) 76 + __must_hold(&ctx->uring_lock) 77 + { 78 + struct io_kiocb *notif = slot->notif; 79 + struct io_notif_data *nd = io_notif_to_data(notif); 80 + 81 + slot->notif = NULL; 82 + 83 + /* drop slot's master ref */ 84 + if (refcount_dec_and_test(&nd->uarg.refcnt)) 85 + io_notif_complete(notif); 86 + } 87 + 88 + __cold int io_notif_unregister(struct io_ring_ctx *ctx) 89 + __must_hold(&ctx->uring_lock) 90 + { 91 + int i; 92 + 93 + if (!ctx->notif_slots) 94 + return -ENXIO; 95 + 96 + for (i = 0; i < ctx->nr_notif_slots; i++) { 97 + struct io_notif_slot *slot = &ctx->notif_slots[i]; 98 + struct io_kiocb *notif = slot->notif; 99 + struct io_notif_data *nd; 100 + 101 + if (!notif) 102 + continue; 103 + nd = io_kiocb_to_cmd(notif); 104 + slot->notif = NULL; 105 + if (!refcount_dec_and_test(&nd->uarg.refcnt)) 106 + continue; 107 + notif->io_task_work.func = __io_notif_complete_tw; 108 + io_req_task_work_add(notif); 109 + } 110 + 111 + kvfree(ctx->notif_slots); 112 + ctx->notif_slots = NULL; 113 + ctx->nr_notif_slots = 0; 114 + return 0; 115 + } 116 + 117 + __cold int io_notif_register(struct io_ring_ctx *ctx, 118 + void __user *arg, unsigned int size) 119 + __must_hold(&ctx->uring_lock) 120 + { 121 + struct io_uring_notification_slot __user *slots; 122 + struct io_uring_notification_slot slot; 123 + struct io_uring_notification_register reg; 124 + unsigned i; 125 + 126 + BUILD_BUG_ON(sizeof(struct io_notif_data) > 64); 127 + 128 + if (ctx->nr_notif_slots) 129 + return -EBUSY; 130 + if (size != sizeof(reg)) 131 + return -EINVAL; 132 + if (copy_from_user(&reg, arg, sizeof(reg))) 133 + return -EFAULT; 134 + if (!reg.nr_slots || reg.nr_slots > IORING_MAX_NOTIF_SLOTS) 135 + return -EINVAL; 136 + if (reg.resv || reg.resv2 || reg.resv3) 
137 + return -EINVAL; 138 + 139 + slots = u64_to_user_ptr(reg.data); 140 + ctx->notif_slots = kvcalloc(reg.nr_slots, sizeof(ctx->notif_slots[0]), 141 + GFP_KERNEL_ACCOUNT); 142 + if (!ctx->notif_slots) 143 + return -ENOMEM; 144 + 145 + for (i = 0; i < reg.nr_slots; i++, ctx->nr_notif_slots++) { 146 + struct io_notif_slot *notif_slot = &ctx->notif_slots[i]; 147 + 148 + if (copy_from_user(&slot, &slots[i], sizeof(slot))) { 149 + io_notif_unregister(ctx); 150 + return -EFAULT; 151 + } 152 + if (slot.resv[0] | slot.resv[1] | slot.resv[2]) { 153 + io_notif_unregister(ctx); 154 + return -EINVAL; 155 + } 156 + notif_slot->tag = slot.tag; 157 + } 158 + return 0; 159 + }
+90
io_uring/notif.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <linux/net.h> 4 + #include <linux/uio.h> 5 + #include <net/sock.h> 6 + #include <linux/nospec.h> 7 + 8 + #include "rsrc.h" 9 + 10 + #define IO_NOTIF_SPLICE_BATCH 32 11 + #define IORING_MAX_NOTIF_SLOTS (1U << 10) 12 + 13 + struct io_notif_data { 14 + struct file *file; 15 + struct ubuf_info uarg; 16 + unsigned long account_pages; 17 + }; 18 + 19 + struct io_notif_slot { 20 + /* 21 + * Current/active notifier. A slot holds only one active notifier at a 22 + * time and keeps one reference to it. Flush releases the reference and 23 + * lazily replaces it with a new notifier. 24 + */ 25 + struct io_kiocb *notif; 26 + 27 + /* 28 + * Default ->user_data for this slot notifiers CQEs 29 + */ 30 + u64 tag; 31 + /* 32 + * Notifiers of a slot live in generations, we create a new notifier 33 + * only after flushing the previous one. Track the sequential number 34 + * for all notifiers and copy it into notifiers's cqe->cflags 35 + */ 36 + u32 seq; 37 + }; 38 + 39 + int io_notif_register(struct io_ring_ctx *ctx, 40 + void __user *arg, unsigned int size); 41 + int io_notif_unregister(struct io_ring_ctx *ctx); 42 + 43 + void io_notif_slot_flush(struct io_notif_slot *slot); 44 + struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx, 45 + struct io_notif_slot *slot); 46 + 47 + static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) 48 + { 49 + return io_kiocb_to_cmd(notif); 50 + } 51 + 52 + static inline struct io_kiocb *io_get_notif(struct io_ring_ctx *ctx, 53 + struct io_notif_slot *slot) 54 + { 55 + if (!slot->notif) 56 + slot->notif = io_alloc_notif(ctx, slot); 57 + return slot->notif; 58 + } 59 + 60 + static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx, 61 + unsigned idx) 62 + __must_hold(&ctx->uring_lock) 63 + { 64 + if (idx >= ctx->nr_notif_slots) 65 + return NULL; 66 + idx = array_index_nospec(idx, ctx->nr_notif_slots); 67 + return &ctx->notif_slots[idx]; 68 + } 69 + 
70 + static inline void io_notif_slot_flush_submit(struct io_notif_slot *slot, 71 + unsigned int issue_flags) 72 + { 73 + io_notif_slot_flush(slot); 74 + } 75 + 76 + static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) 77 + { 78 + struct io_ring_ctx *ctx = notif->ctx; 79 + struct io_notif_data *nd = io_notif_to_data(notif); 80 + unsigned nr_pages = (len >> PAGE_SHIFT) + 2; 81 + int ret; 82 + 83 + if (ctx->user) { 84 + ret = __io_account_mem(ctx->user, nr_pages); 85 + if (ret) 86 + return ret; 87 + nd->account_pages += nr_pages; 88 + } 89 + return 0; 90 + }
+20 -4
io_uring/opdef.c
··· 246 246 .prep = io_close_prep, 247 247 .issue = io_close, 248 248 }, 249 - [IORING_OP_FILES_UPDATE] = { 249 + [IORING_OP_RSRC_UPDATE] = { 250 250 .audit_skip = 1, 251 251 .iopoll = 1, 252 - .name = "FILES_UPDATE", 253 - .prep = io_files_update_prep, 254 - .issue = io_files_update, 252 + .name = "RSRC_UPDATE", 253 + .prep = io_rsrc_update_prep, 254 + .issue = io_rsrc_update, 255 + .ioprio = 1, 255 256 }, 256 257 [IORING_OP_STATX] = { 257 258 .audit_skip = 1, ··· 470 469 .prep = io_uring_cmd_prep, 471 470 .issue = io_uring_cmd, 472 471 .prep_async = io_uring_cmd_prep_async, 472 + }, 473 + [IORING_OP_SENDZC_NOTIF] = { 474 + .name = "SENDZC_NOTIF", 475 + .needs_file = 1, 476 + .unbound_nonreg_file = 1, 477 + .pollout = 1, 478 + .audit_skip = 1, 479 + .ioprio = 1, 480 + #if defined(CONFIG_NET) 481 + .prep = io_sendzc_prep, 482 + .issue = io_sendzc, 483 + #else 484 + .prep = io_eopnotsupp_prep, 485 + #endif 486 + 473 487 }, 474 488 }; 475 489
+57 -10
io_uring/rsrc.c
··· 15 15 #include "io_uring.h" 16 16 #include "openclose.h" 17 17 #include "rsrc.h" 18 + #include "notif.h" 18 19 19 20 struct io_rsrc_update { 20 21 struct file *file; 21 22 u64 arg; 22 23 u32 nr_args; 23 24 u32 offset; 25 + int type; 24 26 }; 25 27 26 28 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, ··· 44 42 } 45 43 } 46 44 47 - static inline void __io_unaccount_mem(struct user_struct *user, 48 - unsigned long nr_pages) 49 - { 50 - atomic_long_sub(nr_pages, &user->locked_vm); 51 - } 52 - 53 - static inline int __io_account_mem(struct user_struct *user, 54 - unsigned long nr_pages) 45 + int __io_account_mem(struct user_struct *user, unsigned long nr_pages) 55 46 { 56 47 unsigned long page_limit, cur_pages, new_pages; 48 + 49 + if (!nr_pages) 50 + return 0; 57 51 58 52 /* Don't allow more pages than we can safely lock */ 59 53 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; ··· 655 657 return -EINVAL; 656 658 } 657 659 658 - int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 660 + int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 659 661 { 660 662 struct io_rsrc_update *up = io_kiocb_to_cmd(req); 661 663 ··· 669 671 if (!up->nr_args) 670 672 return -EINVAL; 671 673 up->arg = READ_ONCE(sqe->addr); 674 + up->type = READ_ONCE(sqe->ioprio); 672 675 return 0; 673 676 } 674 677 ··· 712 713 return ret; 713 714 } 714 715 715 - int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 716 + static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 716 717 { 717 718 struct io_rsrc_update *up = io_kiocb_to_cmd(req); 718 719 struct io_ring_ctx *ctx = req->ctx; ··· 739 740 req_set_fail(req); 740 741 io_req_set_res(req, ret, 0); 741 742 return IOU_OK; 743 + } 744 + 745 + static int io_notif_update(struct io_kiocb *req, unsigned int issue_flags) 746 + { 747 + struct io_rsrc_update *up = io_kiocb_to_cmd(req); 748 + struct io_ring_ctx *ctx = req->ctx; 749 + 
unsigned len = up->nr_args; 750 + unsigned idx_end, idx = up->offset; 751 + int ret = 0; 752 + 753 + io_ring_submit_lock(ctx, issue_flags); 754 + if (unlikely(check_add_overflow(idx, len, &idx_end))) { 755 + ret = -EOVERFLOW; 756 + goto out; 757 + } 758 + if (unlikely(idx_end > ctx->nr_notif_slots)) { 759 + ret = -EINVAL; 760 + goto out; 761 + } 762 + 763 + for (; idx < idx_end; idx++) { 764 + struct io_notif_slot *slot = &ctx->notif_slots[idx]; 765 + 766 + if (!slot->notif) 767 + continue; 768 + if (up->arg) 769 + slot->tag = up->arg; 770 + io_notif_slot_flush_submit(slot, issue_flags); 771 + } 772 + out: 773 + io_ring_submit_unlock(ctx, issue_flags); 774 + if (ret < 0) 775 + req_set_fail(req); 776 + io_req_set_res(req, ret, 0); 777 + return IOU_OK; 778 + } 779 + 780 + int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags) 781 + { 782 + struct io_rsrc_update *up = io_kiocb_to_cmd(req); 783 + 784 + switch (up->type) { 785 + case IORING_RSRC_UPDATE_FILES: 786 + return io_files_update(req, issue_flags); 787 + case IORING_RSRC_UPDATE_NOTIF: 788 + return io_notif_update(req, issue_flags); 789 + } 790 + return -EINVAL; 742 791 } 743 792 744 793 int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
+20 -5
io_uring/rsrc.h
··· 135 135 } 136 136 } 137 137 138 + static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx) 139 + { 140 + ctx->rsrc_cached_refs--; 141 + if (unlikely(ctx->rsrc_cached_refs < 0)) 142 + io_rsrc_refs_refill(ctx); 143 + } 144 + 138 145 static inline void io_req_set_rsrc_node(struct io_kiocb *req, 139 146 struct io_ring_ctx *ctx, 140 147 unsigned int issue_flags) ··· 151 144 152 145 if (!(issue_flags & IO_URING_F_UNLOCKED)) { 153 146 lockdep_assert_held(&ctx->uring_lock); 154 - ctx->rsrc_cached_refs--; 155 - if (unlikely(ctx->rsrc_cached_refs < 0)) 156 - io_rsrc_refs_refill(ctx); 147 + 148 + io_charge_rsrc_node(ctx); 157 149 } else { 158 150 percpu_ref_get(&req->rsrc_node->refs); 159 151 } ··· 167 161 return &data->tags[table_idx][off]; 168 162 } 169 163 170 - int io_files_update(struct io_kiocb *req, unsigned int issue_flags); 171 - int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 164 + int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags); 165 + int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 166 + 167 + int __io_account_mem(struct user_struct *user, unsigned long nr_pages); 168 + 169 + static inline void __io_unaccount_mem(struct user_struct *user, 170 + unsigned long nr_pages) 171 + { 172 + atomic_long_sub(nr_pages, &user->locked_vm); 173 + } 174 + 172 175 #endif
-26
io_uring/tctx.h
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 - #include <linux/llist.h> 4 - 5 - /* 6 - * Arbitrary limit, can be raised if need be 7 - */ 8 - #define IO_RINGFD_REG_MAX 16 9 - 10 - struct io_uring_task { 11 - /* submission side */ 12 - int cached_refs; 13 - const struct io_ring_ctx *last; 14 - struct io_wq *io_wq; 15 - struct file *registered_rings[IO_RINGFD_REG_MAX]; 16 - 17 - struct xarray xa; 18 - struct wait_queue_head wait; 19 - atomic_t in_idle; 20 - atomic_t inflight_tracked; 21 - struct percpu_counter inflight; 22 - 23 - struct { /* task_work */ 24 - struct llist_head task_list; 25 - struct callback_head task_work; 26 - } ____cacheline_aligned_in_smp; 27 - }; 28 - 29 3 struct io_tctx_node { 30 4 struct list_head ctx_node; 31 5 struct task_struct *task;
+1
net/compat.c
··· 75 75 return -EMSGSIZE; 76 76 77 77 kmsg->msg_iocb = NULL; 78 + kmsg->msg_ubuf = NULL; 78 79 return 0; 79 80 } 80 81
+10 -4
net/core/datagram.c
··· 613 613 } 614 614 EXPORT_SYMBOL(skb_copy_datagram_from_iter); 615 615 616 - int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, 617 - struct iov_iter *from, size_t length) 616 + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, 617 + struct sk_buff *skb, struct iov_iter *from, 618 + size_t length) 618 619 { 619 - int frag = skb_shinfo(skb)->nr_frags; 620 + int frag; 621 + 622 + if (msg && msg->msg_ubuf && msg->sg_from_iter) 623 + return msg->sg_from_iter(sk, skb, from, length); 624 + 625 + frag = skb_shinfo(skb)->nr_frags; 620 626 621 627 while (length && iov_iter_count(from)) { 622 628 struct page *pages[MAX_SKB_FRAGS]; ··· 708 702 if (skb_copy_datagram_from_iter(skb, 0, from, copy)) 709 703 return -EFAULT; 710 704 711 - return __zerocopy_sg_from_iter(NULL, skb, from, ~0U); 705 + return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); 712 706 } 713 707 EXPORT_SYMBOL(zerocopy_sg_from_iter); 714 708
+33 -4
net/core/skbuff.c
··· 666 666 &shinfo->dataref)) 667 667 goto exit; 668 668 669 - skb_zcopy_clear(skb, true); 669 + if (skb_zcopy(skb)) { 670 + bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS; 671 + 672 + skb_zcopy_clear(skb, true); 673 + if (skip_unref) 674 + goto free_head; 675 + } 670 676 671 677 for (i = 0; i < shinfo->nr_frags; i++) 672 678 __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); 673 679 680 + free_head: 674 681 if (shinfo->frag_list) 675 682 kfree_skb_list(shinfo->frag_list); 676 683 ··· 902 895 */ 903 896 void skb_tx_error(struct sk_buff *skb) 904 897 { 905 - skb_zcopy_clear(skb, true); 898 + if (skb) { 899 + skb_zcopy_downgrade_managed(skb); 900 + skb_zcopy_clear(skb, true); 901 + } 906 902 } 907 903 EXPORT_SYMBOL(skb_tx_error); 908 904 ··· 1203 1193 uarg->len = 1; 1204 1194 uarg->bytelen = size; 1205 1195 uarg->zerocopy = 1; 1206 - uarg->flags = SKBFL_ZEROCOPY_FRAG; 1196 + uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; 1207 1197 refcount_set(&uarg->refcnt, 1); 1208 1198 sock_hold(sk); 1209 1199 ··· 1221 1211 if (uarg) { 1222 1212 const u32 byte_limit = 1 << 19; /* limit to a few TSO */ 1223 1213 u32 bytelen, next; 1214 + 1215 + /* there might be non MSG_ZEROCOPY users */ 1216 + if (uarg->callback != msg_zerocopy_callback) 1217 + return NULL; 1224 1218 1225 1219 /* realloc only when socket is locked (TCP, UDP cork), 1226 1220 * so uarg->len and sk_zckey access is serialized ··· 1368 1354 if (orig_uarg && uarg != orig_uarg) 1369 1355 return -EEXIST; 1370 1356 1371 - err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len); 1357 + err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); 1372 1358 if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { 1373 1359 struct sock *save_sk = skb->sk; 1374 1360 ··· 1384 1370 return skb->len - orig_len; 1385 1371 } 1386 1372 EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); 1373 + 1374 + void __skb_zcopy_downgrade_managed(struct sk_buff *skb) 1375 + { 1376 + int i; 1377 + 1378 + 
skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; 1379 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 1380 + skb_frag_ref(skb, i); 1381 + } 1382 + EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); 1387 1383 1388 1384 static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, 1389 1385 gfp_t gfp_mask) ··· 1711 1687 BUG_ON(nhead < 0); 1712 1688 1713 1689 BUG_ON(skb_shared(skb)); 1690 + 1691 + skb_zcopy_downgrade_managed(skb); 1714 1692 1715 1693 size = SKB_DATA_ALIGN(size); 1716 1694 ··· 3510 3484 int pos = skb_headlen(skb); 3511 3485 const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; 3512 3486 3487 + skb_zcopy_downgrade_managed(skb); 3488 + 3513 3489 skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; 3514 3490 skb_zerocopy_clone(skb1, skb, 0); 3515 3491 if (len < pos) /* Split line is inside header. */ ··· 3865 3837 if (skb_can_coalesce(skb, i, page, offset)) { 3866 3838 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); 3867 3839 } else if (i < MAX_SKB_FRAGS) { 3840 + skb_zcopy_downgrade_managed(skb); 3868 3841 get_page(page); 3869 3842 skb_fill_page_desc(skb, i, page, offset, size); 3870 3843 } else {
+36 -14
net/ipv4/ip_output.c
··· 969 969 struct inet_sock *inet = inet_sk(sk); 970 970 struct ubuf_info *uarg = NULL; 971 971 struct sk_buff *skb; 972 - 973 972 struct ip_options *opt = cork->opt; 974 973 int hh_len; 975 974 int exthdrlen; ··· 976 977 int copy; 977 978 int err; 978 979 int offset = 0; 980 + bool zc = false; 979 981 unsigned int maxfraglen, fragheaderlen, maxnonfragsize; 980 982 int csummode = CHECKSUM_NONE; 981 983 struct rtable *rt = (struct rtable *)cork->dst; ··· 1017 1017 (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) 1018 1018 csummode = CHECKSUM_PARTIAL; 1019 1019 1020 - if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { 1021 - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); 1022 - if (!uarg) 1023 - return -ENOBUFS; 1024 - extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ 1025 - if (rt->dst.dev->features & NETIF_F_SG && 1026 - csummode == CHECKSUM_PARTIAL) { 1027 - paged = true; 1028 - } else { 1029 - uarg->zerocopy = 0; 1030 - skb_zcopy_set(skb, uarg, &extra_uref); 1020 + if ((flags & MSG_ZEROCOPY) && length) { 1021 + struct msghdr *msg = from; 1022 + 1023 + if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { 1024 + if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) 1025 + return -EINVAL; 1026 + 1027 + /* Leave uarg NULL if can't zerocopy, callers should 1028 + * be able to handle it. 
1029 + */ 1030 + if ((rt->dst.dev->features & NETIF_F_SG) && 1031 + csummode == CHECKSUM_PARTIAL) { 1032 + paged = true; 1033 + zc = true; 1034 + uarg = msg->msg_ubuf; 1035 + } 1036 + } else if (sock_flag(sk, SOCK_ZEROCOPY)) { 1037 + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); 1038 + if (!uarg) 1039 + return -ENOBUFS; 1040 + extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ 1041 + if (rt->dst.dev->features & NETIF_F_SG && 1042 + csummode == CHECKSUM_PARTIAL) { 1043 + paged = true; 1044 + zc = true; 1045 + } else { 1046 + uarg->zerocopy = 0; 1047 + skb_zcopy_set(skb, uarg, &extra_uref); 1048 + } 1031 1049 } 1032 1050 } 1033 1051 ··· 1109 1091 (fraglen + alloc_extra < SKB_MAX_ALLOC || 1110 1092 !(rt->dst.dev->features & NETIF_F_SG))) 1111 1093 alloclen = fraglen; 1112 - else { 1094 + else if (!zc) { 1113 1095 alloclen = min_t(int, fraglen, MAX_HEADER); 1114 1096 pagedlen = fraglen - alloclen; 1097 + } else { 1098 + alloclen = fragheaderlen + transhdrlen; 1099 + pagedlen = datalen - transhdrlen; 1115 1100 } 1116 1101 1117 1102 alloclen += alloc_extra; ··· 1209 1188 err = -EFAULT; 1210 1189 goto error; 1211 1190 } 1212 - } else if (!uarg || !uarg->zerocopy) { 1191 + } else if (!zc) { 1213 1192 int i = skb_shinfo(skb)->nr_frags; 1214 1193 1215 1194 err = -ENOMEM; 1216 1195 if (!sk_page_frag_refill(sk, pfrag)) 1217 1196 goto error; 1218 1197 1198 + skb_zcopy_downgrade_managed(skb); 1219 1199 if (!skb_can_coalesce(skb, i, pfrag->page, 1220 1200 pfrag->offset)) { 1221 1201 err = -EMSGSIZE;
+22 -11
net/ipv4/tcp.c
··· 1203 1203 1204 1204 flags = msg->msg_flags; 1205 1205 1206 - if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) { 1206 + if ((flags & MSG_ZEROCOPY) && size) { 1207 1207 skb = tcp_write_queue_tail(sk); 1208 - uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); 1209 - if (!uarg) { 1210 - err = -ENOBUFS; 1211 - goto out_err; 1212 - } 1213 1208 1214 - zc = sk->sk_route_caps & NETIF_F_SG; 1215 - if (!zc) 1216 - uarg->zerocopy = 0; 1209 + if (msg->msg_ubuf) { 1210 + uarg = msg->msg_ubuf; 1211 + net_zcopy_get(uarg); 1212 + zc = sk->sk_route_caps & NETIF_F_SG; 1213 + } else if (sock_flag(sk, SOCK_ZEROCOPY)) { 1214 + uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); 1215 + if (!uarg) { 1216 + err = -ENOBUFS; 1217 + goto out_err; 1218 + } 1219 + zc = sk->sk_route_caps & NETIF_F_SG; 1220 + if (!zc) 1221 + uarg->zerocopy = 0; 1222 + } 1217 1223 } 1218 1224 1219 1225 if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) && ··· 1342 1336 1343 1337 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1344 1338 1345 - if (tcp_downgrade_zcopy_pure(sk, skb) || 1346 - !sk_wmem_schedule(sk, copy)) 1339 + if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) { 1340 + if (tcp_downgrade_zcopy_pure(sk, skb)) 1341 + goto wait_for_space; 1342 + skb_zcopy_downgrade_managed(skb); 1343 + } 1344 + 1345 + if (!sk_wmem_schedule(sk, copy)) 1347 1346 goto wait_for_space; 1348 1347 1349 1348 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+36 -13
net/ipv6/ip6_output.c
··· 1464 1464 int copy; 1465 1465 int err; 1466 1466 int offset = 0; 1467 + bool zc = false; 1467 1468 u32 tskey = 0; 1468 1469 struct rt6_info *rt = (struct rt6_info *)cork->dst; 1469 1470 struct ipv6_txoptions *opt = v6_cork->opt; ··· 1542 1541 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1543 1542 csummode = CHECKSUM_PARTIAL; 1544 1543 1545 - if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { 1546 - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); 1547 - if (!uarg) 1548 - return -ENOBUFS; 1549 - extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ 1550 - if (rt->dst.dev->features & NETIF_F_SG && 1551 - csummode == CHECKSUM_PARTIAL) { 1552 - paged = true; 1553 - } else { 1554 - uarg->zerocopy = 0; 1555 - skb_zcopy_set(skb, uarg, &extra_uref); 1544 + if ((flags & MSG_ZEROCOPY) && length) { 1545 + struct msghdr *msg = from; 1546 + 1547 + if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { 1548 + if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) 1549 + return -EINVAL; 1550 + 1551 + /* Leave uarg NULL if can't zerocopy, callers should 1552 + * be able to handle it. 
1553 + */ 1554 + if ((rt->dst.dev->features & NETIF_F_SG) && 1555 + csummode == CHECKSUM_PARTIAL) { 1556 + paged = true; 1557 + zc = true; 1558 + uarg = msg->msg_ubuf; 1559 + } 1560 + } else if (sock_flag(sk, SOCK_ZEROCOPY)) { 1561 + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); 1562 + if (!uarg) 1563 + return -ENOBUFS; 1564 + extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ 1565 + if (rt->dst.dev->features & NETIF_F_SG && 1566 + csummode == CHECKSUM_PARTIAL) { 1567 + paged = true; 1568 + zc = true; 1569 + } else { 1570 + uarg->zerocopy = 0; 1571 + skb_zcopy_set(skb, uarg, &extra_uref); 1572 + } 1556 1573 } 1557 1574 } 1558 1575 ··· 1649 1630 (fraglen + alloc_extra < SKB_MAX_ALLOC || 1650 1631 !(rt->dst.dev->features & NETIF_F_SG))) 1651 1632 alloclen = fraglen; 1652 - else { 1633 + else if (!zc) { 1653 1634 alloclen = min_t(int, fraglen, MAX_HEADER); 1654 1635 pagedlen = fraglen - alloclen; 1636 + } else { 1637 + alloclen = fragheaderlen + transhdrlen; 1638 + pagedlen = datalen - transhdrlen; 1655 1639 } 1656 1640 alloclen += alloc_extra; 1657 1641 ··· 1764 1742 err = -EFAULT; 1765 1743 goto error; 1766 1744 } 1767 - } else if (!uarg || !uarg->zerocopy) { 1745 + } else if (!zc) { 1768 1746 int i = skb_shinfo(skb)->nr_frags; 1769 1747 1770 1748 err = -ENOMEM; 1771 1749 if (!sk_page_frag_refill(sk, pfrag)) 1772 1750 goto error; 1773 1751 1752 + skb_zcopy_downgrade_managed(skb); 1774 1753 if (!skb_can_coalesce(skb, i, pfrag->page, 1775 1754 pfrag->offset)) { 1776 1755 err = -EMSGSIZE;
+2
net/socket.c
··· 2106 2106 msg.msg_control = NULL; 2107 2107 msg.msg_controllen = 0; 2108 2108 msg.msg_namelen = 0; 2109 + msg.msg_ubuf = NULL; 2109 2110 if (addr) { 2110 2111 err = move_addr_to_kernel(addr, addr_len, &address); 2111 2112 if (err < 0) ··· 2401 2400 return -EMSGSIZE; 2402 2401 2403 2402 kmsg->msg_iocb = NULL; 2403 + kmsg->msg_ubuf = NULL; 2404 2404 return 0; 2405 2405 } 2406 2406
+1
tools/testing/selftests/net/Makefile
··· 59 59 TEST_GEN_FILES += cmsg_sender 60 60 TEST_GEN_FILES += stress_reuseport_listen 61 61 TEST_PROGS += test_vxlan_vnifiltering.sh 62 + TEST_GEN_FILES += io_uring_zerocopy_tx 62 63 63 64 TEST_FILES := settings 64 65
+605
tools/testing/selftests/net/io_uring_zerocopy_tx.c
··· 1 + /* SPDX-License-Identifier: MIT */ 2 + /* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */ 3 + #include <assert.h> 4 + #include <errno.h> 5 + #include <error.h> 6 + #include <fcntl.h> 7 + #include <limits.h> 8 + #include <stdbool.h> 9 + #include <stdint.h> 10 + #include <stdio.h> 11 + #include <stdlib.h> 12 + #include <string.h> 13 + #include <unistd.h> 14 + 15 + #include <arpa/inet.h> 16 + #include <linux/errqueue.h> 17 + #include <linux/if_packet.h> 18 + #include <linux/io_uring.h> 19 + #include <linux/ipv6.h> 20 + #include <linux/socket.h> 21 + #include <linux/sockios.h> 22 + #include <net/ethernet.h> 23 + #include <net/if.h> 24 + #include <netinet/in.h> 25 + #include <netinet/ip.h> 26 + #include <netinet/ip6.h> 27 + #include <netinet/tcp.h> 28 + #include <netinet/udp.h> 29 + #include <sys/ioctl.h> 30 + #include <sys/mman.h> 31 + #include <sys/resource.h> 32 + #include <sys/socket.h> 33 + #include <sys/stat.h> 34 + #include <sys/time.h> 35 + #include <sys/types.h> 36 + #include <sys/un.h> 37 + #include <sys/wait.h> 38 + 39 + #define NOTIF_TAG 0xfffffffULL 40 + #define NONZC_TAG 0 41 + #define ZC_TAG 1 42 + 43 + enum { 44 + MODE_NONZC = 0, 45 + MODE_ZC = 1, 46 + MODE_ZC_FIXED = 2, 47 + MODE_MIXED = 3, 48 + }; 49 + 50 + static bool cfg_flush = false; 51 + static bool cfg_cork = false; 52 + static int cfg_mode = MODE_ZC_FIXED; 53 + static int cfg_nr_reqs = 8; 54 + static int cfg_family = PF_UNSPEC; 55 + static int cfg_payload_len; 56 + static int cfg_port = 8000; 57 + static int cfg_runtime_ms = 4200; 58 + 59 + static socklen_t cfg_alen; 60 + static struct sockaddr_storage cfg_dst_addr; 61 + 62 + static char payload[IP_MAXPACKET] __attribute__((aligned(4096))); 63 + 64 + struct io_sq_ring { 65 + unsigned *head; 66 + unsigned *tail; 67 + unsigned *ring_mask; 68 + unsigned *ring_entries; 69 + unsigned *flags; 70 + unsigned *array; 71 + }; 72 + 73 + struct io_cq_ring { 74 + unsigned *head; 75 + unsigned *tail; 76 + unsigned *ring_mask; 77 + 
unsigned *ring_entries; 78 + struct io_uring_cqe *cqes; 79 + }; 80 + 81 + struct io_uring_sq { 82 + unsigned *khead; 83 + unsigned *ktail; 84 + unsigned *kring_mask; 85 + unsigned *kring_entries; 86 + unsigned *kflags; 87 + unsigned *kdropped; 88 + unsigned *array; 89 + struct io_uring_sqe *sqes; 90 + 91 + unsigned sqe_head; 92 + unsigned sqe_tail; 93 + 94 + size_t ring_sz; 95 + }; 96 + 97 + struct io_uring_cq { 98 + unsigned *khead; 99 + unsigned *ktail; 100 + unsigned *kring_mask; 101 + unsigned *kring_entries; 102 + unsigned *koverflow; 103 + struct io_uring_cqe *cqes; 104 + 105 + size_t ring_sz; 106 + }; 107 + 108 + struct io_uring { 109 + struct io_uring_sq sq; 110 + struct io_uring_cq cq; 111 + int ring_fd; 112 + }; 113 + 114 + #ifdef __alpha__ 115 + # ifndef __NR_io_uring_setup 116 + # define __NR_io_uring_setup 535 117 + # endif 118 + # ifndef __NR_io_uring_enter 119 + # define __NR_io_uring_enter 536 120 + # endif 121 + # ifndef __NR_io_uring_register 122 + # define __NR_io_uring_register 537 123 + # endif 124 + #else /* !__alpha__ */ 125 + # ifndef __NR_io_uring_setup 126 + # define __NR_io_uring_setup 425 127 + # endif 128 + # ifndef __NR_io_uring_enter 129 + # define __NR_io_uring_enter 426 130 + # endif 131 + # ifndef __NR_io_uring_register 132 + # define __NR_io_uring_register 427 133 + # endif 134 + #endif 135 + 136 + #if defined(__x86_64) || defined(__i386__) 137 + #define read_barrier() __asm__ __volatile__("":::"memory") 138 + #define write_barrier() __asm__ __volatile__("":::"memory") 139 + #else 140 + 141 + #define read_barrier() __sync_synchronize() 142 + #define write_barrier() __sync_synchronize() 143 + #endif 144 + 145 + static int io_uring_setup(unsigned int entries, struct io_uring_params *p) 146 + { 147 + return syscall(__NR_io_uring_setup, entries, p); 148 + } 149 + 150 + static int io_uring_enter(int fd, unsigned int to_submit, 151 + unsigned int min_complete, 152 + unsigned int flags, sigset_t *sig) 153 + { 154 + return 
syscall(__NR_io_uring_enter, fd, to_submit, min_complete, 155 + flags, sig, _NSIG / 8); 156 + } 157 + 158 + static int io_uring_register_buffers(struct io_uring *ring, 159 + const struct iovec *iovecs, 160 + unsigned nr_iovecs) 161 + { 162 + int ret; 163 + 164 + ret = syscall(__NR_io_uring_register, ring->ring_fd, 165 + IORING_REGISTER_BUFFERS, iovecs, nr_iovecs); 166 + return (ret < 0) ? -errno : ret; 167 + } 168 + 169 + static int io_uring_register_notifications(struct io_uring *ring, 170 + unsigned nr, 171 + struct io_uring_notification_slot *slots) 172 + { 173 + int ret; 174 + struct io_uring_notification_register r = { 175 + .nr_slots = nr, 176 + .data = (unsigned long)slots, 177 + }; 178 + 179 + ret = syscall(__NR_io_uring_register, ring->ring_fd, 180 + IORING_REGISTER_NOTIFIERS, &r, sizeof(r)); 181 + return (ret < 0) ? -errno : ret; 182 + } 183 + 184 + static int io_uring_mmap(int fd, struct io_uring_params *p, 185 + struct io_uring_sq *sq, struct io_uring_cq *cq) 186 + { 187 + size_t size; 188 + void *ptr; 189 + int ret; 190 + 191 + sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned); 192 + ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE, 193 + MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); 194 + if (ptr == MAP_FAILED) 195 + return -errno; 196 + sq->khead = ptr + p->sq_off.head; 197 + sq->ktail = ptr + p->sq_off.tail; 198 + sq->kring_mask = ptr + p->sq_off.ring_mask; 199 + sq->kring_entries = ptr + p->sq_off.ring_entries; 200 + sq->kflags = ptr + p->sq_off.flags; 201 + sq->kdropped = ptr + p->sq_off.dropped; 202 + sq->array = ptr + p->sq_off.array; 203 + 204 + size = p->sq_entries * sizeof(struct io_uring_sqe); 205 + sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, 206 + MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); 207 + if (sq->sqes == MAP_FAILED) { 208 + ret = -errno; 209 + err: 210 + munmap(sq->khead, sq->ring_sz); 211 + return ret; 212 + } 213 + 214 + cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe); 215 
+ ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE, 216 + MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING); 217 + if (ptr == MAP_FAILED) { 218 + ret = -errno; 219 + munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe)); 220 + goto err; 221 + } 222 + cq->khead = ptr + p->cq_off.head; 223 + cq->ktail = ptr + p->cq_off.tail; 224 + cq->kring_mask = ptr + p->cq_off.ring_mask; 225 + cq->kring_entries = ptr + p->cq_off.ring_entries; 226 + cq->koverflow = ptr + p->cq_off.overflow; 227 + cq->cqes = ptr + p->cq_off.cqes; 228 + return 0; 229 + } 230 + 231 + static int io_uring_queue_init(unsigned entries, struct io_uring *ring, 232 + unsigned flags) 233 + { 234 + struct io_uring_params p; 235 + int fd, ret; 236 + 237 + memset(ring, 0, sizeof(*ring)); 238 + memset(&p, 0, sizeof(p)); 239 + p.flags = flags; 240 + 241 + fd = io_uring_setup(entries, &p); 242 + if (fd < 0) 243 + return fd; 244 + ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq); 245 + if (!ret) 246 + ring->ring_fd = fd; 247 + else 248 + close(fd); 249 + return ret; 250 + } 251 + 252 + static int io_uring_submit(struct io_uring *ring) 253 + { 254 + struct io_uring_sq *sq = &ring->sq; 255 + const unsigned mask = *sq->kring_mask; 256 + unsigned ktail, submitted, to_submit; 257 + int ret; 258 + 259 + read_barrier(); 260 + if (*sq->khead != *sq->ktail) { 261 + submitted = *sq->kring_entries; 262 + goto submit; 263 + } 264 + if (sq->sqe_head == sq->sqe_tail) 265 + return 0; 266 + 267 + ktail = *sq->ktail; 268 + to_submit = sq->sqe_tail - sq->sqe_head; 269 + for (submitted = 0; submitted < to_submit; submitted++) { 270 + read_barrier(); 271 + sq->array[ktail++ & mask] = sq->sqe_head++ & mask; 272 + } 273 + if (!submitted) 274 + return 0; 275 + 276 + if (*sq->ktail != ktail) { 277 + write_barrier(); 278 + *sq->ktail = ktail; 279 + write_barrier(); 280 + } 281 + submit: 282 + ret = io_uring_enter(ring->ring_fd, submitted, 0, 283 + IORING_ENTER_GETEVENTS, NULL); 284 + return ret < 0 ? 
-errno : ret; 285 + } 286 + 287 + static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd, 288 + const void *buf, size_t len, int flags) 289 + { 290 + memset(sqe, 0, sizeof(*sqe)); 291 + sqe->opcode = (__u8) IORING_OP_SEND; 292 + sqe->fd = sockfd; 293 + sqe->addr = (unsigned long) buf; 294 + sqe->len = len; 295 + sqe->msg_flags = (__u32) flags; 296 + } 297 + 298 + static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd, 299 + const void *buf, size_t len, int flags, 300 + unsigned slot_idx, unsigned zc_flags) 301 + { 302 + io_uring_prep_send(sqe, sockfd, buf, len, flags); 303 + sqe->opcode = (__u8) IORING_OP_SENDZC_NOTIF; 304 + sqe->notification_idx = slot_idx; 305 + sqe->ioprio = zc_flags; 306 + } 307 + 308 + static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring) 309 + { 310 + struct io_uring_sq *sq = &ring->sq; 311 + 312 + if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries) 313 + return NULL; 314 + return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask]; 315 + } 316 + 317 + static int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) 318 + { 319 + struct io_uring_cq *cq = &ring->cq; 320 + const unsigned mask = *cq->kring_mask; 321 + unsigned head = *cq->khead; 322 + int ret; 323 + 324 + *cqe_ptr = NULL; 325 + do { 326 + read_barrier(); 327 + if (head != *cq->ktail) { 328 + *cqe_ptr = &cq->cqes[head & mask]; 329 + break; 330 + } 331 + ret = io_uring_enter(ring->ring_fd, 0, 1, 332 + IORING_ENTER_GETEVENTS, NULL); 333 + if (ret < 0) 334 + return -errno; 335 + } while (1); 336 + 337 + return 0; 338 + } 339 + 340 + static inline void io_uring_cqe_seen(struct io_uring *ring) 341 + { 342 + *(&ring->cq)->khead += 1; 343 + write_barrier(); 344 + } 345 + 346 + static unsigned long gettimeofday_ms(void) 347 + { 348 + struct timeval tv; 349 + 350 + gettimeofday(&tv, NULL); 351 + return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); 352 + } 353 + 354 + static void do_setsockopt(int fd, int level, int 
optname, int val) 355 + { 356 + if (setsockopt(fd, level, optname, &val, sizeof(val))) 357 + error(1, errno, "setsockopt %d.%d: %d", level, optname, val); 358 + } 359 + 360 + static int do_setup_tx(int domain, int type, int protocol) 361 + { 362 + int fd; 363 + 364 + fd = socket(domain, type, protocol); 365 + if (fd == -1) 366 + error(1, errno, "socket t"); 367 + 368 + do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21); 369 + 370 + if (connect(fd, (void *) &cfg_dst_addr, cfg_alen)) 371 + error(1, errno, "connect"); 372 + return fd; 373 + } 374 + 375 + static void do_tx(int domain, int type, int protocol) 376 + { 377 + struct io_uring_notification_slot b[1] = {{.tag = NOTIF_TAG}}; 378 + struct io_uring_sqe *sqe; 379 + struct io_uring_cqe *cqe; 380 + unsigned long packets = 0, bytes = 0; 381 + struct io_uring ring; 382 + struct iovec iov; 383 + uint64_t tstop; 384 + int i, fd, ret; 385 + int compl_cqes = 0; 386 + 387 + fd = do_setup_tx(domain, type, protocol); 388 + 389 + ret = io_uring_queue_init(512, &ring, 0); 390 + if (ret) 391 + error(1, ret, "io_uring: queue init"); 392 + 393 + ret = io_uring_register_notifications(&ring, 1, b); 394 + if (ret) 395 + error(1, ret, "io_uring: tx ctx registration"); 396 + 397 + iov.iov_base = payload; 398 + iov.iov_len = cfg_payload_len; 399 + 400 + ret = io_uring_register_buffers(&ring, &iov, 1); 401 + if (ret) 402 + error(1, ret, "io_uring: buffer registration"); 403 + 404 + tstop = gettimeofday_ms() + cfg_runtime_ms; 405 + do { 406 + if (cfg_cork) 407 + do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1); 408 + 409 + for (i = 0; i < cfg_nr_reqs; i++) { 410 + unsigned zc_flags = 0; 411 + unsigned buf_idx = 0; 412 + unsigned slot_idx = 0; 413 + unsigned mode = cfg_mode; 414 + unsigned msg_flags = 0; 415 + 416 + if (cfg_mode == MODE_MIXED) 417 + mode = rand() % 3; 418 + 419 + sqe = io_uring_get_sqe(&ring); 420 + 421 + if (mode == MODE_NONZC) { 422 + io_uring_prep_send(sqe, fd, payload, 423 + cfg_payload_len, msg_flags); 424 + 
sqe->user_data = NONZC_TAG; 425 + } else { 426 + if (cfg_flush) { 427 + zc_flags |= IORING_RECVSEND_NOTIF_FLUSH; 428 + compl_cqes++; 429 + } 430 + io_uring_prep_sendzc(sqe, fd, payload, 431 + cfg_payload_len, 432 + msg_flags, slot_idx, zc_flags); 433 + if (mode == MODE_ZC_FIXED) { 434 + sqe->ioprio |= IORING_RECVSEND_FIXED_BUF; 435 + sqe->buf_index = buf_idx; 436 + } 437 + sqe->user_data = ZC_TAG; 438 + } 439 + } 440 + 441 + ret = io_uring_submit(&ring); 442 + if (ret != cfg_nr_reqs) 443 + error(1, ret, "submit"); 444 + 445 + for (i = 0; i < cfg_nr_reqs; i++) { 446 + ret = io_uring_wait_cqe(&ring, &cqe); 447 + if (ret) 448 + error(1, ret, "wait cqe"); 449 + 450 + if (cqe->user_data == NOTIF_TAG) { 451 + compl_cqes--; 452 + i--; 453 + } else if (cqe->user_data != NONZC_TAG && 454 + cqe->user_data != ZC_TAG) { 455 + error(1, cqe->res, "invalid user_data"); 456 + } else if (cqe->res <= 0 && cqe->res != -EAGAIN) { 457 + error(1, cqe->res, "send failed"); 458 + } else { 459 + if (cqe->res > 0) { 460 + packets++; 461 + bytes += cqe->res; 462 + } 463 + /* failed requests don't flush */ 464 + if (cfg_flush && 465 + cqe->res <= 0 && 466 + cqe->user_data == ZC_TAG) 467 + compl_cqes--; 468 + } 469 + io_uring_cqe_seen(&ring); 470 + } 471 + if (cfg_cork) 472 + do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0); 473 + } while (gettimeofday_ms() < tstop); 474 + 475 + if (close(fd)) 476 + error(1, errno, "close"); 477 + 478 + fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n", 479 + packets, bytes >> 20, 480 + packets / (cfg_runtime_ms / 1000), 481 + (bytes >> 20) / (cfg_runtime_ms / 1000)); 482 + 483 + while (compl_cqes) { 484 + ret = io_uring_wait_cqe(&ring, &cqe); 485 + if (ret) 486 + error(1, ret, "wait cqe"); 487 + io_uring_cqe_seen(&ring); 488 + compl_cqes--; 489 + } 490 + } 491 + 492 + static void do_test(int domain, int type, int protocol) 493 + { 494 + int i; 495 + 496 + for (i = 0; i < IP_MAXPACKET; i++) 497 + payload[i] = 'a' + (i % 26); 498 + do_tx(domain, type, 
protocol); 499 + } 500 + 501 + static void usage(const char *filepath) 502 + { 503 + error(1, 0, "Usage: %s [-f] [-n<N>] [-z0] [-s<payload size>] " 504 + "(-4|-6) [-t<time s>] -D<dst_ip> udp", filepath); 505 + } 506 + 507 + static void parse_opts(int argc, char **argv) 508 + { 509 + const int max_payload_len = sizeof(payload) - 510 + sizeof(struct ipv6hdr) - 511 + sizeof(struct tcphdr) - 512 + 40 /* max tcp options */; 513 + struct sockaddr_in6 *addr6 = (void *) &cfg_dst_addr; 514 + struct sockaddr_in *addr4 = (void *) &cfg_dst_addr; 515 + char *daddr = NULL; 516 + int c; 517 + 518 + if (argc <= 1) 519 + usage(argv[0]); 520 + cfg_payload_len = max_payload_len; 521 + 522 + while ((c = getopt(argc, argv, "46D:p:s:t:n:fc:m:")) != -1) { 523 + switch (c) { 524 + case '4': 525 + if (cfg_family != PF_UNSPEC) 526 + error(1, 0, "Pass one of -4 or -6"); 527 + cfg_family = PF_INET; 528 + cfg_alen = sizeof(struct sockaddr_in); 529 + break; 530 + case '6': 531 + if (cfg_family != PF_UNSPEC) 532 + error(1, 0, "Pass one of -4 or -6"); 533 + cfg_family = PF_INET6; 534 + cfg_alen = sizeof(struct sockaddr_in6); 535 + break; 536 + case 'D': 537 + daddr = optarg; 538 + break; 539 + case 'p': 540 + cfg_port = strtoul(optarg, NULL, 0); 541 + break; 542 + case 's': 543 + cfg_payload_len = strtoul(optarg, NULL, 0); 544 + break; 545 + case 't': 546 + cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000; 547 + break; 548 + case 'n': 549 + cfg_nr_reqs = strtoul(optarg, NULL, 0); 550 + break; 551 + case 'f': 552 + cfg_flush = 1; 553 + break; 554 + case 'c': 555 + cfg_cork = strtol(optarg, NULL, 0); 556 + break; 557 + case 'm': 558 + cfg_mode = strtol(optarg, NULL, 0); 559 + break; 560 + } 561 + } 562 + 563 + switch (cfg_family) { 564 + case PF_INET: 565 + memset(addr4, 0, sizeof(*addr4)); 566 + addr4->sin_family = AF_INET; 567 + addr4->sin_port = htons(cfg_port); 568 + if (daddr && 569 + inet_pton(AF_INET, daddr, &(addr4->sin_addr)) != 1) 570 + error(1, 0, "ipv4 parse error: %s", daddr); 
571 + break; 572 + case PF_INET6: 573 + memset(addr6, 0, sizeof(*addr6)); 574 + addr6->sin6_family = AF_INET6; 575 + addr6->sin6_port = htons(cfg_port); 576 + if (daddr && 577 + inet_pton(AF_INET6, daddr, &(addr6->sin6_addr)) != 1) 578 + error(1, 0, "ipv6 parse error: %s", daddr); 579 + break; 580 + default: 581 + error(1, 0, "illegal domain"); 582 + } 583 + 584 + if (cfg_payload_len > max_payload_len) 585 + error(1, 0, "-s: payload exceeds max (%d)", max_payload_len); 586 + if (cfg_mode == MODE_NONZC && cfg_flush) 587 + error(1, 0, "-f: only zerocopy modes support notifications"); 588 + if (optind != argc - 1) 589 + usage(argv[0]); 590 + } 591 + 592 + int main(int argc, char **argv) 593 + { 594 + const char *cfg_test = argv[argc - 1]; 595 + 596 + parse_opts(argc, argv); 597 + 598 + if (!strcmp(cfg_test, "tcp")) 599 + do_test(cfg_family, SOCK_STREAM, 0); 600 + else if (!strcmp(cfg_test, "udp")) 601 + do_test(cfg_family, SOCK_DGRAM, 0); 602 + else 603 + error(1, 0, "unknown cfg_test %s", cfg_test); 604 + return 0; 605 + }
+131
tools/testing/selftests/net/io_uring_zerocopy_tx.sh
#!/bin/bash
#
# Send data between two processes across namespaces
# Run twice: once without and once with zerocopy

set -e

readonly DEV="veth0"
readonly DEV_MTU=65535
readonly BIN_TX="./io_uring_zerocopy_tx"
readonly BIN_RX="./msg_zerocopy"

readonly RAND="$(mktemp -u XXXXXX)"
readonly NSPREFIX="ns-${RAND}"
readonly NS1="${NSPREFIX}1"
readonly NS2="${NSPREFIX}2"

readonly SADDR4='192.168.1.1'
readonly DADDR4='192.168.1.2'
readonly SADDR6='fd::1'
readonly DADDR6='fd::2'

readonly path_sysctl_mem="net.core.optmem_max"

# No arguments: automated test
if [[ "$#" -eq "0" ]]; then
	IPs=( "4" "6" )
	protocols=( "tcp" "udp" )

	# Re-invoke ourselves for every IP version / protocol / zc mode,
	# plain, with flush (-f), and with cork + flush (-c -f).
	for IP in "${IPs[@]}"; do
		for proto in "${protocols[@]}"; do
			for mode in $(seq 1 3); do
				$0 "$IP" "$proto" -m "$mode" -t 1 -n 32
				$0 "$IP" "$proto" -m "$mode" -t 1 -n 32 -f
				$0 "$IP" "$proto" -m "$mode" -t 1 -n 32 -c -f
			done
		done
	done

	echo "OK. All tests passed"
	exit 0
fi

# Argument parsing
if [[ "$#" -lt "2" ]]; then
	echo "Usage: $0 [4|6] [tcp|udp|raw|raw_hdrincl|packet|packet_dgram] <args>"
	exit 1
fi

readonly IP="$1"
shift
readonly TXMODE="$1"
shift
readonly EXTRA_ARGS="$@"

# Argument parsing: configure addresses
if [[ "${IP}" == "4" ]]; then
	readonly SADDR="${SADDR4}"
	readonly DADDR="${DADDR4}"
elif [[ "${IP}" == "6" ]]; then
	readonly SADDR="${SADDR6}"
	readonly DADDR="${DADDR6}"
else
	echo "Invalid IP version ${IP}"
	exit 1
fi

# Argument parsing: select receive mode
#
# This differs from send mode for
# - packet: use raw recv, because packet receives skb clones
# - raw_hdrinc: use raw recv, because hdrincl is a tx-only option
case "${TXMODE}" in
'packet' | 'packet_dgram' | 'raw_hdrincl')
	RXMODE='raw'
	;;
*)
	RXMODE="${TXMODE}"
	;;
esac

# Start of state changes: install cleanup handler
save_sysctl_mem="$(sysctl -n ${path_sysctl_mem})"

cleanup() {
	ip netns del "${NS2}"
	ip netns del "${NS1}"
	sysctl -w -q "${path_sysctl_mem}=${save_sysctl_mem}"
}

trap cleanup EXIT

# Configure system settings
sysctl -w -q "${path_sysctl_mem}=1000000"

# Create virtual ethernet pair between network namespaces
ip netns add "${NS1}"
ip netns add "${NS2}"

ip link add "${DEV}" mtu "${DEV_MTU}" netns "${NS1}" type veth \
  peer name "${DEV}" mtu "${DEV_MTU}" netns "${NS2}"

# Bring the devices up
ip -netns "${NS1}" link set "${DEV}" up
ip -netns "${NS2}" link set "${DEV}" up

# Set fixed MAC addresses on the devices
ip -netns "${NS1}" link set dev "${DEV}" address 02:02:02:02:02:02
ip -netns "${NS2}" link set dev "${DEV}" address 06:06:06:06:06:06

# Add fixed IP addresses to the devices
ip -netns "${NS1}" addr add 192.168.1.1/24 dev "${DEV}"
ip -netns "${NS2}" addr add 192.168.1.2/24 dev "${DEV}"
ip -netns "${NS1}" addr add fd::1/64 dev "${DEV}" nodad
ip -netns "${NS2}" addr add fd::2/64 dev "${DEV}" nodad

# Optionally disable sg or csum offload to test edge cases
# ip netns exec "${NS1}" ethtool -K "${DEV}" sg off

# Run one rx/tx pair: receiver in NS2 (background), sender in NS1.
do_test() {
	# 'local readonly ARGS=...' does NOT mark ARGS read-only (it
	# declares a spurious local named 'readonly'); use 'local -r'.
	local -r ARGS="$1"

	echo "ipv${IP} ${TXMODE} ${ARGS}"
	ip netns exec "${NS2}" "${BIN_RX}" "-${IP}" -t 2 -C 2 -S "${SADDR}" -D "${DADDR}" -r "${RXMODE}" &
	sleep 0.2
	ip netns exec "${NS1}" "${BIN_TX}" "-${IP}" -t 1 -D "${DADDR}" ${ARGS} "${TXMODE}"
	wait
}

do_test "${EXTRA_ARGS}"
echo ok