Merge tag 'for-6.8/io_uring-2024-01-08' of git://git.kernel.dk/linux

+1

MAINTAINERS

··· 11142 11142 S: Maintained 11143 11143 T: git git://git.kernel.dk/linux-block 11144 11144 T: git git://git.kernel.dk/liburing 11145 + F: include/linux/io_uring/ 11145 11146 F: include/linux/io_uring.h 11146 11147 F: include/linux/io_uring_types.h 11147 11148 F: include/trace/events/io_uring.h

+1 -1

drivers/block/ublk_drv.c

··· 36 36 #include <linux/sched/mm.h> 37 37 #include <linux/uaccess.h> 38 38 #include <linux/cdev.h> 39 - #include <linux/io_uring.h> 39 + #include <linux/io_uring/cmd.h> 40 40 #include <linux/blk-mq.h> 41 41 #include <linux/delay.h> 42 42 #include <linux/mm.h>

+1 -1

drivers/nvme/host/ioctl.c

··· 5 5 */ 6 6 #include <linux/ptrace.h> /* for force_successful_syscall_return */ 7 7 #include <linux/nvme_ioctl.h> 8 - #include <linux/io_uring.h> 8 + #include <linux/io_uring/cmd.h> 9 9 #include "nvme.h" 10 10 11 11 enum {

+4 -86

include/linux/io_uring.h

··· 6 6 #include <linux/xarray.h> 7 7 #include <uapi/linux/io_uring.h> 8 8 9 - enum io_uring_cmd_flags { 10 - IO_URING_F_COMPLETE_DEFER = 1, 11 - IO_URING_F_UNLOCKED = 2, 12 - /* the request is executed from poll, it should not be freed */ 13 - IO_URING_F_MULTISHOT = 4, 14 - /* executed by io-wq */ 15 - IO_URING_F_IOWQ = 8, 16 - /* int's last bit, sign checks are usually faster than a bit test */ 17 - IO_URING_F_NONBLOCK = INT_MIN, 18 - 19 - /* ctx state flags, for URING_CMD */ 20 - IO_URING_F_SQE128 = (1 << 8), 21 - IO_URING_F_CQE32 = (1 << 9), 22 - IO_URING_F_IOPOLL = (1 << 10), 23 - 24 - /* set when uring wants to cancel a previously issued command */ 25 - IO_URING_F_CANCEL = (1 << 11), 26 - IO_URING_F_COMPAT = (1 << 12), 27 - }; 28 - 29 - /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ 30 - #define IORING_URING_CMD_CANCELABLE (1U << 30) 31 - 32 - struct io_uring_cmd { 33 - struct file *file; 34 - const struct io_uring_sqe *sqe; 35 - /* callback to defer completions to task context */ 36 - void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); 37 - u32 cmd_op; 38 - u32 flags; 39 - u8 pdu[32]; /* available inline for free use */ 40 - }; 41 - 42 - static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) 43 - { 44 - return sqe->cmd; 45 - } 46 - 47 9 #if defined(CONFIG_IO_URING) 48 - int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 49 - struct iov_iter *iter, void *ioucmd); 50 - void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, 51 - unsigned issue_flags); 52 - struct sock *io_uring_get_socket(struct file *file); 53 10 void __io_uring_cancel(bool cancel_all); 54 11 void __io_uring_free(struct task_struct *tsk); 55 12 void io_uring_unreg_ringfd(void); 56 13 const char *io_uring_get_opcode(u8 opcode); 57 - void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 58 - void (*task_work_cb)(struct io_uring_cmd *, unsigned), 59 - unsigned flags); 60 - /* users should follow semantics 
of IOU_F_TWQ_LAZY_WAKE */ 61 - void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, 62 - void (*task_work_cb)(struct io_uring_cmd *, unsigned)); 63 - 64 - static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, 65 - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) 66 - { 67 - __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); 68 - } 14 + int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); 15 + bool io_is_uring_fops(struct file *file); 69 16 70 17 static inline void io_uring_files_cancel(void) 71 18 { ··· 31 84 if (tsk->io_uring) 32 85 __io_uring_free(tsk); 33 86 } 34 - int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); 35 - void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, 36 - unsigned int issue_flags); 37 - struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd); 38 87 #else 39 - static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 40 - struct iov_iter *iter, void *ioucmd) 41 - { 42 - return -EOPNOTSUPP; 43 - } 44 - static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, 45 - ssize_t ret2, unsigned issue_flags) 46 - { 47 - } 48 - static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, 49 - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) 50 - { 51 - } 52 - static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, 53 - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) 54 - { 55 - } 56 - static inline struct sock *io_uring_get_socket(struct file *file) 57 - { 58 - return NULL; 59 - } 60 88 static inline void io_uring_task_cancel(void) 61 89 { 62 90 } ··· 50 128 { 51 129 return -EOPNOTSUPP; 52 130 } 53 - static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, 54 - unsigned int issue_flags) 131 + static inline bool io_is_uring_fops(struct file *file) 55 132 { 56 - } 57 - static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd 
*cmd) 58 - { 59 - return NULL; 133 + return false; 60 134 } 61 135 #endif 62 136

+77

include/linux/io_uring/cmd.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + #ifndef _LINUX_IO_URING_CMD_H 3 + #define _LINUX_IO_URING_CMD_H 4 + 5 + #include <uapi/linux/io_uring.h> 6 + #include <linux/io_uring_types.h> 7 + 8 + /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ 9 + #define IORING_URING_CMD_CANCELABLE (1U << 30) 10 + 11 + struct io_uring_cmd { 12 + struct file *file; 13 + const struct io_uring_sqe *sqe; 14 + /* callback to defer completions to task context */ 15 + void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); 16 + u32 cmd_op; 17 + u32 flags; 18 + u8 pdu[32]; /* available inline for free use */ 19 + }; 20 + 21 + static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) 22 + { 23 + return sqe->cmd; 24 + } 25 + 26 + #if defined(CONFIG_IO_URING) 27 + int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 28 + struct iov_iter *iter, void *ioucmd); 29 + void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, 30 + unsigned issue_flags); 31 + void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 32 + void (*task_work_cb)(struct io_uring_cmd *, unsigned), 33 + unsigned flags); 34 + 35 + void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, 36 + unsigned int issue_flags); 37 + 38 + #else 39 + static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 40 + struct iov_iter *iter, void *ioucmd) 41 + { 42 + return -EOPNOTSUPP; 43 + } 44 + static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, 45 + ssize_t ret2, unsigned issue_flags) 46 + { 47 + } 48 + static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 49 + void (*task_work_cb)(struct io_uring_cmd *, unsigned), 50 + unsigned flags) 51 + { 52 + } 53 + static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, 54 + unsigned int issue_flags) 55 + { 56 + } 57 + #endif 58 + 59 + /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ 60 + static inline void 
io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, 61 + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) 62 + { 63 + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); 64 + } 65 + 66 + static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, 67 + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) 68 + { 69 + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); 70 + } 71 + 72 + static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) 73 + { 74 + return cmd_to_io_kiocb(cmd)->task; 75 + } 76 + 77 + #endif /* _LINUX_IO_URING_CMD_H */

+31 -3

include/linux/io_uring_types.h

··· 7 7 #include <linux/llist.h> 8 8 #include <uapi/linux/io_uring.h> 9 9 10 + enum { 11 + /* 12 + * A hint to not wake right away but delay until there are enough of 13 + * tw's queued to match the number of CQEs the task is waiting for. 14 + * 15 + * Must not be used with requests generating more than one CQE. 16 + * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. 17 + */ 18 + IOU_F_TWQ_LAZY_WAKE = 1, 19 + }; 20 + 21 + enum io_uring_cmd_flags { 22 + IO_URING_F_COMPLETE_DEFER = 1, 23 + IO_URING_F_UNLOCKED = 2, 24 + /* the request is executed from poll, it should not be freed */ 25 + IO_URING_F_MULTISHOT = 4, 26 + /* executed by io-wq */ 27 + IO_URING_F_IOWQ = 8, 28 + /* int's last bit, sign checks are usually faster than a bit test */ 29 + IO_URING_F_NONBLOCK = INT_MIN, 30 + 31 + /* ctx state flags, for URING_CMD */ 32 + IO_URING_F_SQE128 = (1 << 8), 33 + IO_URING_F_CQE32 = (1 << 9), 34 + IO_URING_F_IOPOLL = (1 << 10), 35 + 36 + /* set when uring wants to cancel a previously issued command */ 37 + IO_URING_F_CANCEL = (1 << 11), 38 + IO_URING_F_COMPAT = (1 << 12), 39 + }; 40 + 10 41 struct io_wq_work_node { 11 42 struct io_wq_work_node *next; 12 43 }; ··· 389 358 struct wait_queue_head rsrc_quiesce_wq; 390 359 unsigned rsrc_quiesce; 391 360 392 - #if defined(CONFIG_UNIX) 393 - struct socket *ring_sock; 394 - #endif 395 361 /* hashed buffered write serialization */ 396 362 struct io_wq_hash *hash_map; 397 363

+19

include/uapi/linux/io_uring.h

··· 71 71 __u32 uring_cmd_flags; 72 72 __u32 waitid_flags; 73 73 __u32 futex_flags; 74 + __u32 install_fd_flags; 74 75 }; 75 76 __u64 user_data; /* data to be passed back at completion time */ 76 77 /* pack this to avoid bogus arm OABI complaints */ ··· 254 253 IORING_OP_FUTEX_WAIT, 255 254 IORING_OP_FUTEX_WAKE, 256 255 IORING_OP_FUTEX_WAITV, 256 + IORING_OP_FIXED_FD_INSTALL, 257 257 258 258 /* this goes last, obviously */ 259 259 IORING_OP_LAST, ··· 387 385 #define IORING_MSG_RING_CQE_SKIP (1U << 0) 388 386 /* Pass through the flags from sqe->file_index to cqe->flags */ 389 387 #define IORING_MSG_RING_FLAGS_PASS (1U << 1) 388 + 389 + /* 390 + * IORING_OP_FIXED_FD_INSTALL flags (sqe->install_fd_flags) 391 + * 392 + * IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC 393 + */ 394 + #define IORING_FIXED_FD_NO_CLOEXEC (1U << 0) 390 395 391 396 /* 392 397 * IO completion data structure (Completion Queue Entry) ··· 567 558 /* register a range of fixed file slots for automatic slot allocation */ 568 559 IORING_REGISTER_FILE_ALLOC_RANGE = 25, 569 560 561 + /* return status information for a buffer group */ 562 + IORING_REGISTER_PBUF_STATUS = 26, 563 + 570 564 /* this goes last */ 571 565 IORING_REGISTER_LAST, 572 566 ··· 694 682 __u16 bgid; 695 683 __u16 flags; 696 684 __u64 resv[3]; 685 + }; 686 + 687 + /* argument for IORING_REGISTER_PBUF_STATUS */ 688 + struct io_uring_buf_status { 689 + __u32 buf_group; /* input */ 690 + __u32 head; /* output */ 691 + __u32 resv[8]; 697 692 }; 698 693 699 694 /*

+1 -1

io_uring/Makefile

··· 8 8 statx.o net.o msg_ring.o timeout.o \ 9 9 sqpoll.o fdinfo.o tctx.o poll.o \ 10 10 cancel.o kbuf.o rsrc.o rw.o opdef.o \ 11 - notif.o waitid.o 11 + notif.o waitid.o register.o 12 12 obj-$(CONFIG_IO_WQ) += io-wq.o 13 13 obj-$(CONFIG_FUTEX) += futex.o

+4 -7

io_uring/filetable.c

··· 87 87 io_file_bitmap_clear(&ctx->file_table, slot_index); 88 88 } 89 89 90 - ret = io_scm_file_account(ctx, file); 91 - if (!ret) { 92 - *io_get_tag_slot(ctx->file_data, slot_index) = 0; 93 - io_fixed_file_set(file_slot, file); 94 - io_file_bitmap_set(&ctx->file_table, slot_index); 95 - } 96 - return ret; 90 + *io_get_tag_slot(ctx->file_data, slot_index) = 0; 91 + io_fixed_file_set(file_slot, file); 92 + io_file_bitmap_set(&ctx->file_table, slot_index); 93 + return 0; 97 94 } 98 95 99 96 int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,

+30 -633

io_uring/io_uring.c

··· 60 60 #include <linux/net.h> 61 61 #include <net/sock.h> 62 62 #include <net/af_unix.h> 63 - #include <net/scm.h> 64 63 #include <linux/anon_inodes.h> 65 64 #include <linux/sched/mm.h> 66 65 #include <linux/uaccess.h> ··· 69 70 #include <linux/fadvise.h> 70 71 #include <linux/task_work.h> 71 72 #include <linux/io_uring.h> 73 + #include <linux/io_uring/cmd.h> 72 74 #include <linux/audit.h> 73 75 #include <linux/security.h> 74 76 #include <asm/shmparam.h> ··· 85 85 #include "opdef.h" 86 86 #include "refs.h" 87 87 #include "tctx.h" 88 + #include "register.h" 88 89 #include "sqpoll.h" 89 90 #include "fdinfo.h" 90 91 #include "kbuf.h" ··· 103 102 104 103 #define IORING_MAX_ENTRIES 32768 105 104 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) 106 - 107 - #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ 108 - IORING_REGISTER_LAST + IORING_OP_LAST) 109 105 110 106 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ 111 107 IOSQE_IO_HARDLINK | IOSQE_ASYNC) ··· 125 127 enum { 126 128 IO_CHECK_CQ_OVERFLOW_BIT, 127 129 IO_CHECK_CQ_DROPPED_BIT, 128 - }; 129 - 130 - enum { 131 - IO_EVENTFD_OP_SIGNAL_BIT, 132 - IO_EVENTFD_OP_FREE_BIT, 133 130 }; 134 131 135 132 struct io_defer_entry { ··· 169 176 {}, 170 177 }; 171 178 #endif 172 - 173 - struct sock *io_uring_get_socket(struct file *file) 174 - { 175 - #if defined(CONFIG_UNIX) 176 - if (io_is_uring_fops(file)) { 177 - struct io_ring_ctx *ctx = file->private_data; 178 - 179 - return ctx->ring_sock->sk; 180 - } 181 - #endif 182 - return NULL; 183 - } 184 - EXPORT_SYMBOL(io_uring_get_socket); 185 179 186 180 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) 187 181 { ··· 534 554 } 535 555 } 536 556 537 - 538 - static void io_eventfd_ops(struct rcu_head *rcu) 557 + void io_eventfd_ops(struct rcu_head *rcu) 539 558 { 540 559 struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); 541 560 int ops = atomic_xchg(&ev_fd->ops, 0); ··· 1877 1898 io_req_complete_defer(req); 
1878 1899 else 1879 1900 io_req_complete_post(req, issue_flags); 1880 - } else if (ret != IOU_ISSUE_SKIP_COMPLETE) 1881 - return ret; 1882 1901 1883 - /* If the op doesn't have a file, we're not polling for it */ 1884 - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) 1885 - io_iopoll_req_issued(req, issue_flags); 1902 + return 0; 1903 + } 1886 1904 1887 - return 0; 1905 + if (ret == IOU_ISSUE_SKIP_COMPLETE) { 1906 + ret = 0; 1907 + io_arm_ltimeout(req); 1908 + 1909 + /* If the op doesn't have a file, we're not polling for it */ 1910 + if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) 1911 + io_iopoll_req_issued(req, issue_flags); 1912 + } 1913 + return ret; 1888 1914 } 1889 1915 1890 1916 int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) ··· 2060 2076 * We async punt it if the file wasn't marked NOWAIT, or if the file 2061 2077 * doesn't support non-blocking read/write attempts 2062 2078 */ 2063 - if (likely(!ret)) 2064 - io_arm_ltimeout(req); 2065 - else 2079 + if (unlikely(ret)) 2066 2080 io_queue_async(req, ret); 2067 2081 } 2068 2082 ··· 2615 2633 __set_current_state(TASK_RUNNING); 2616 2634 atomic_set(&ctx->cq_wait_nr, 0); 2617 2635 2618 - if (ret < 0) 2619 - break; 2620 2636 /* 2621 2637 * Run task_work after scheduling and before io_should_wake(). 2622 2638 * If we got woken because of task_work being processed, run it ··· 2623 2643 io_run_task_work(); 2624 2644 if (!llist_empty(&ctx->work_llist)) 2625 2645 io_run_local_work(ctx); 2646 + 2647 + /* 2648 + * Non-local task_work will be run on exit to userspace, but 2649 + * if we're using DEFER_TASKRUN, then we could have waited 2650 + * with a timeout for a number of requests. If the timeout 2651 + * hits, we could have some requests ready to process. Ensure 2652 + * this break is _after_ we have run task_work, to avoid 2653 + * deferring running potentially pending requests until the 2654 + * next time we wait for events. 
2655 + */ 2656 + if (ret < 0) 2657 + break; 2626 2658 2627 2659 check_cq = READ_ONCE(ctx->check_cq); 2628 2660 if (unlikely(check_cq)) { ··· 2823 2831 return off; 2824 2832 } 2825 2833 2826 - static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, 2827 - unsigned int eventfd_async) 2828 - { 2829 - struct io_ev_fd *ev_fd; 2830 - __s32 __user *fds = arg; 2831 - int fd; 2832 - 2833 - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, 2834 - lockdep_is_held(&ctx->uring_lock)); 2835 - if (ev_fd) 2836 - return -EBUSY; 2837 - 2838 - if (copy_from_user(&fd, fds, sizeof(*fds))) 2839 - return -EFAULT; 2840 - 2841 - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); 2842 - if (!ev_fd) 2843 - return -ENOMEM; 2844 - 2845 - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); 2846 - if (IS_ERR(ev_fd->cq_ev_fd)) { 2847 - int ret = PTR_ERR(ev_fd->cq_ev_fd); 2848 - kfree(ev_fd); 2849 - return ret; 2850 - } 2851 - 2852 - spin_lock(&ctx->completion_lock); 2853 - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; 2854 - spin_unlock(&ctx->completion_lock); 2855 - 2856 - ev_fd->eventfd_async = eventfd_async; 2857 - ctx->has_evfd = true; 2858 - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); 2859 - atomic_set(&ev_fd->refs, 1); 2860 - atomic_set(&ev_fd->ops, 0); 2861 - return 0; 2862 - } 2863 - 2864 - static int io_eventfd_unregister(struct io_ring_ctx *ctx) 2865 - { 2866 - struct io_ev_fd *ev_fd; 2867 - 2868 - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, 2869 - lockdep_is_held(&ctx->uring_lock)); 2870 - if (ev_fd) { 2871 - ctx->has_evfd = false; 2872 - rcu_assign_pointer(ctx->io_ev_fd, NULL); 2873 - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) 2874 - call_rcu(&ev_fd->rcu, io_eventfd_ops); 2875 - return 0; 2876 - } 2877 - 2878 - return -ENXIO; 2879 - } 2880 - 2881 2834 static void io_req_caches_free(struct io_ring_ctx *ctx) 2882 2835 { 2883 2836 struct io_kiocb *req; ··· 2875 2938 io_rsrc_node_destroy(ctx, ctx->rsrc_node); 2876 2939 2877 2940 
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); 2878 - 2879 - #if defined(CONFIG_UNIX) 2880 - if (ctx->ring_sock) { 2881 - ctx->ring_sock->file = NULL; /* so that iput() is called */ 2882 - sock_release(ctx->ring_sock); 2883 - } 2884 - #endif 2885 2941 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); 2886 2942 2887 2943 io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); ··· 2914 2984 percpu_ref_put(&ctx->refs); 2915 2985 } 2916 2986 2917 - static __cold void io_activate_pollwq(struct io_ring_ctx *ctx) 2987 + __cold void io_activate_pollwq(struct io_ring_ctx *ctx) 2918 2988 { 2919 2989 spin_lock(&ctx->completion_lock); 2920 2990 /* already activated or in progress */ ··· 2971 3041 mask |= EPOLLIN | EPOLLRDNORM; 2972 3042 2973 3043 return mask; 2974 - } 2975 - 2976 - static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) 2977 - { 2978 - const struct cred *creds; 2979 - 2980 - creds = xa_erase(&ctx->personalities, id); 2981 - if (creds) { 2982 - put_cred(creds); 2983 - return 0; 2984 - } 2985 - 2986 - return -EINVAL; 2987 3044 } 2988 3045 2989 3046 struct io_tctx_exit { ··· 3783 3866 /* 3784 3867 * Allocate an anonymous fd, this is what constitutes the application 3785 3868 * visible backing of an io_uring instance. The application mmaps this 3786 - * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, 3787 - * we have to tie this fd to a socket for file garbage collection purposes. 3869 + * fd to gain access to the SQ/CQ ring details. 
3788 3870 */ 3789 3871 static struct file *io_uring_get_file(struct io_ring_ctx *ctx) 3790 3872 { 3791 - struct file *file; 3792 - #if defined(CONFIG_UNIX) 3793 - int ret; 3794 - 3795 - ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, 3796 - &ctx->ring_sock); 3797 - if (ret) 3798 - return ERR_PTR(ret); 3799 - #endif 3800 - 3801 - file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, 3873 + return anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, 3802 3874 O_RDWR | O_CLOEXEC, NULL); 3803 - #if defined(CONFIG_UNIX) 3804 - if (IS_ERR(file)) { 3805 - sock_release(ctx->ring_sock); 3806 - ctx->ring_sock = NULL; 3807 - } else { 3808 - ctx->ring_sock->file = file; 3809 - } 3810 - #endif 3811 - return file; 3812 3875 } 3813 3876 3814 3877 static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ··· 4053 4156 return -EPERM; 4054 4157 4055 4158 return io_uring_setup(entries, params); 4056 - } 4057 - 4058 - static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, 4059 - unsigned nr_args) 4060 - { 4061 - struct io_uring_probe *p; 4062 - size_t size; 4063 - int i, ret; 4064 - 4065 - size = struct_size(p, ops, nr_args); 4066 - if (size == SIZE_MAX) 4067 - return -EOVERFLOW; 4068 - p = kzalloc(size, GFP_KERNEL); 4069 - if (!p) 4070 - return -ENOMEM; 4071 - 4072 - ret = -EFAULT; 4073 - if (copy_from_user(p, arg, size)) 4074 - goto out; 4075 - ret = -EINVAL; 4076 - if (memchr_inv(p, 0, size)) 4077 - goto out; 4078 - 4079 - p->last_op = IORING_OP_LAST - 1; 4080 - if (nr_args > IORING_OP_LAST) 4081 - nr_args = IORING_OP_LAST; 4082 - 4083 - for (i = 0; i < nr_args; i++) { 4084 - p->ops[i].op = i; 4085 - if (!io_issue_defs[i].not_supported) 4086 - p->ops[i].flags = IO_URING_OP_SUPPORTED; 4087 - } 4088 - p->ops_len = i; 4089 - 4090 - ret = 0; 4091 - if (copy_to_user(arg, p, size)) 4092 - ret = -EFAULT; 4093 - out: 4094 - kfree(p); 4095 - return ret; 4096 - } 4097 - 4098 - static int io_register_personality(struct 
io_ring_ctx *ctx) 4099 - { 4100 - const struct cred *creds; 4101 - u32 id; 4102 - int ret; 4103 - 4104 - creds = get_current_cred(); 4105 - 4106 - ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, 4107 - XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); 4108 - if (ret < 0) { 4109 - put_cred(creds); 4110 - return ret; 4111 - } 4112 - return id; 4113 - } 4114 - 4115 - static __cold int io_register_restrictions(struct io_ring_ctx *ctx, 4116 - void __user *arg, unsigned int nr_args) 4117 - { 4118 - struct io_uring_restriction *res; 4119 - size_t size; 4120 - int i, ret; 4121 - 4122 - /* Restrictions allowed only if rings started disabled */ 4123 - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 4124 - return -EBADFD; 4125 - 4126 - /* We allow only a single restrictions registration */ 4127 - if (ctx->restrictions.registered) 4128 - return -EBUSY; 4129 - 4130 - if (!arg || nr_args > IORING_MAX_RESTRICTIONS) 4131 - return -EINVAL; 4132 - 4133 - size = array_size(nr_args, sizeof(*res)); 4134 - if (size == SIZE_MAX) 4135 - return -EOVERFLOW; 4136 - 4137 - res = memdup_user(arg, size); 4138 - if (IS_ERR(res)) 4139 - return PTR_ERR(res); 4140 - 4141 - ret = 0; 4142 - 4143 - for (i = 0; i < nr_args; i++) { 4144 - switch (res[i].opcode) { 4145 - case IORING_RESTRICTION_REGISTER_OP: 4146 - if (res[i].register_op >= IORING_REGISTER_LAST) { 4147 - ret = -EINVAL; 4148 - goto out; 4149 - } 4150 - 4151 - __set_bit(res[i].register_op, 4152 - ctx->restrictions.register_op); 4153 - break; 4154 - case IORING_RESTRICTION_SQE_OP: 4155 - if (res[i].sqe_op >= IORING_OP_LAST) { 4156 - ret = -EINVAL; 4157 - goto out; 4158 - } 4159 - 4160 - __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); 4161 - break; 4162 - case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: 4163 - ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; 4164 - break; 4165 - case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: 4166 - ctx->restrictions.sqe_flags_required = res[i].sqe_flags; 4167 - break; 4168 - default: 4169 - 
ret = -EINVAL; 4170 - goto out; 4171 - } 4172 - } 4173 - 4174 - out: 4175 - /* Reset all restrictions if an error happened */ 4176 - if (ret != 0) 4177 - memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); 4178 - else 4179 - ctx->restrictions.registered = true; 4180 - 4181 - kfree(res); 4182 - return ret; 4183 - } 4184 - 4185 - static int io_register_enable_rings(struct io_ring_ctx *ctx) 4186 - { 4187 - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 4188 - return -EBADFD; 4189 - 4190 - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { 4191 - WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); 4192 - /* 4193 - * Lazy activation attempts would fail if it was polled before 4194 - * submitter_task is set. 4195 - */ 4196 - if (wq_has_sleeper(&ctx->poll_wq)) 4197 - io_activate_pollwq(ctx); 4198 - } 4199 - 4200 - if (ctx->restrictions.registered) 4201 - ctx->restricted = 1; 4202 - 4203 - ctx->flags &= ~IORING_SETUP_R_DISABLED; 4204 - if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) 4205 - wake_up(&ctx->sq_data->wait); 4206 - return 0; 4207 - } 4208 - 4209 - static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx, 4210 - cpumask_var_t new_mask) 4211 - { 4212 - int ret; 4213 - 4214 - if (!(ctx->flags & IORING_SETUP_SQPOLL)) { 4215 - ret = io_wq_cpu_affinity(current->io_uring, new_mask); 4216 - } else { 4217 - mutex_unlock(&ctx->uring_lock); 4218 - ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); 4219 - mutex_lock(&ctx->uring_lock); 4220 - } 4221 - 4222 - return ret; 4223 - } 4224 - 4225 - static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, 4226 - void __user *arg, unsigned len) 4227 - { 4228 - cpumask_var_t new_mask; 4229 - int ret; 4230 - 4231 - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4232 - return -ENOMEM; 4233 - 4234 - cpumask_clear(new_mask); 4235 - if (len > cpumask_size()) 4236 - len = cpumask_size(); 4237 - 4238 - if (in_compat_syscall()) { 4239 - ret = compat_get_bitmap(cpumask_bits(new_mask), 4240 - 
(const compat_ulong_t __user *)arg, 4241 - len * 8 /* CHAR_BIT */); 4242 - } else { 4243 - ret = copy_from_user(new_mask, arg, len); 4244 - } 4245 - 4246 - if (ret) { 4247 - free_cpumask_var(new_mask); 4248 - return -EFAULT; 4249 - } 4250 - 4251 - ret = __io_register_iowq_aff(ctx, new_mask); 4252 - free_cpumask_var(new_mask); 4253 - return ret; 4254 - } 4255 - 4256 - static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) 4257 - { 4258 - return __io_register_iowq_aff(ctx, NULL); 4259 - } 4260 - 4261 - static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, 4262 - void __user *arg) 4263 - __must_hold(&ctx->uring_lock) 4264 - { 4265 - struct io_tctx_node *node; 4266 - struct io_uring_task *tctx = NULL; 4267 - struct io_sq_data *sqd = NULL; 4268 - __u32 new_count[2]; 4269 - int i, ret; 4270 - 4271 - if (copy_from_user(new_count, arg, sizeof(new_count))) 4272 - return -EFAULT; 4273 - for (i = 0; i < ARRAY_SIZE(new_count); i++) 4274 - if (new_count[i] > INT_MAX) 4275 - return -EINVAL; 4276 - 4277 - if (ctx->flags & IORING_SETUP_SQPOLL) { 4278 - sqd = ctx->sq_data; 4279 - if (sqd) { 4280 - /* 4281 - * Observe the correct sqd->lock -> ctx->uring_lock 4282 - * ordering. Fine to drop uring_lock here, we hold 4283 - * a ref to the ctx. 
4284 - */ 4285 - refcount_inc(&sqd->refs); 4286 - mutex_unlock(&ctx->uring_lock); 4287 - mutex_lock(&sqd->lock); 4288 - mutex_lock(&ctx->uring_lock); 4289 - if (sqd->thread) 4290 - tctx = sqd->thread->io_uring; 4291 - } 4292 - } else { 4293 - tctx = current->io_uring; 4294 - } 4295 - 4296 - BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); 4297 - 4298 - for (i = 0; i < ARRAY_SIZE(new_count); i++) 4299 - if (new_count[i]) 4300 - ctx->iowq_limits[i] = new_count[i]; 4301 - ctx->iowq_limits_set = true; 4302 - 4303 - if (tctx && tctx->io_wq) { 4304 - ret = io_wq_max_workers(tctx->io_wq, new_count); 4305 - if (ret) 4306 - goto err; 4307 - } else { 4308 - memset(new_count, 0, sizeof(new_count)); 4309 - } 4310 - 4311 - if (sqd) { 4312 - mutex_unlock(&sqd->lock); 4313 - io_put_sq_data(sqd); 4314 - } 4315 - 4316 - if (copy_to_user(arg, new_count, sizeof(new_count))) 4317 - return -EFAULT; 4318 - 4319 - /* that's it for SQPOLL, only the SQPOLL task creates requests */ 4320 - if (sqd) 4321 - return 0; 4322 - 4323 - /* now propagate the restriction to all registered users */ 4324 - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 4325 - struct io_uring_task *tctx = node->task->io_uring; 4326 - 4327 - if (WARN_ON_ONCE(!tctx->io_wq)) 4328 - continue; 4329 - 4330 - for (i = 0; i < ARRAY_SIZE(new_count); i++) 4331 - new_count[i] = ctx->iowq_limits[i]; 4332 - /* ignore errors, it always returns zero anyway */ 4333 - (void)io_wq_max_workers(tctx->io_wq, new_count); 4334 - } 4335 - return 0; 4336 - err: 4337 - if (sqd) { 4338 - mutex_unlock(&sqd->lock); 4339 - io_put_sq_data(sqd); 4340 - } 4341 - return ret; 4342 - } 4343 - 4344 - static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, 4345 - void __user *arg, unsigned nr_args) 4346 - __releases(ctx->uring_lock) 4347 - __acquires(ctx->uring_lock) 4348 - { 4349 - int ret; 4350 - 4351 - /* 4352 - * We don't quiesce the refs for register anymore and so it can't be 4353 - * dying as we're holding a file 
ref here. 4354 - */ 4355 - if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) 4356 - return -ENXIO; 4357 - 4358 - if (ctx->submitter_task && ctx->submitter_task != current) 4359 - return -EEXIST; 4360 - 4361 - if (ctx->restricted) { 4362 - opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); 4363 - if (!test_bit(opcode, ctx->restrictions.register_op)) 4364 - return -EACCES; 4365 - } 4366 - 4367 - switch (opcode) { 4368 - case IORING_REGISTER_BUFFERS: 4369 - ret = -EFAULT; 4370 - if (!arg) 4371 - break; 4372 - ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); 4373 - break; 4374 - case IORING_UNREGISTER_BUFFERS: 4375 - ret = -EINVAL; 4376 - if (arg || nr_args) 4377 - break; 4378 - ret = io_sqe_buffers_unregister(ctx); 4379 - break; 4380 - case IORING_REGISTER_FILES: 4381 - ret = -EFAULT; 4382 - if (!arg) 4383 - break; 4384 - ret = io_sqe_files_register(ctx, arg, nr_args, NULL); 4385 - break; 4386 - case IORING_UNREGISTER_FILES: 4387 - ret = -EINVAL; 4388 - if (arg || nr_args) 4389 - break; 4390 - ret = io_sqe_files_unregister(ctx); 4391 - break; 4392 - case IORING_REGISTER_FILES_UPDATE: 4393 - ret = io_register_files_update(ctx, arg, nr_args); 4394 - break; 4395 - case IORING_REGISTER_EVENTFD: 4396 - ret = -EINVAL; 4397 - if (nr_args != 1) 4398 - break; 4399 - ret = io_eventfd_register(ctx, arg, 0); 4400 - break; 4401 - case IORING_REGISTER_EVENTFD_ASYNC: 4402 - ret = -EINVAL; 4403 - if (nr_args != 1) 4404 - break; 4405 - ret = io_eventfd_register(ctx, arg, 1); 4406 - break; 4407 - case IORING_UNREGISTER_EVENTFD: 4408 - ret = -EINVAL; 4409 - if (arg || nr_args) 4410 - break; 4411 - ret = io_eventfd_unregister(ctx); 4412 - break; 4413 - case IORING_REGISTER_PROBE: 4414 - ret = -EINVAL; 4415 - if (!arg || nr_args > 256) 4416 - break; 4417 - ret = io_probe(ctx, arg, nr_args); 4418 - break; 4419 - case IORING_REGISTER_PERSONALITY: 4420 - ret = -EINVAL; 4421 - if (arg || nr_args) 4422 - break; 4423 - ret = io_register_personality(ctx); 4424 - break; 4425 - 
case IORING_UNREGISTER_PERSONALITY: 4426 - ret = -EINVAL; 4427 - if (arg) 4428 - break; 4429 - ret = io_unregister_personality(ctx, nr_args); 4430 - break; 4431 - case IORING_REGISTER_ENABLE_RINGS: 4432 - ret = -EINVAL; 4433 - if (arg || nr_args) 4434 - break; 4435 - ret = io_register_enable_rings(ctx); 4436 - break; 4437 - case IORING_REGISTER_RESTRICTIONS: 4438 - ret = io_register_restrictions(ctx, arg, nr_args); 4439 - break; 4440 - case IORING_REGISTER_FILES2: 4441 - ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); 4442 - break; 4443 - case IORING_REGISTER_FILES_UPDATE2: 4444 - ret = io_register_rsrc_update(ctx, arg, nr_args, 4445 - IORING_RSRC_FILE); 4446 - break; 4447 - case IORING_REGISTER_BUFFERS2: 4448 - ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); 4449 - break; 4450 - case IORING_REGISTER_BUFFERS_UPDATE: 4451 - ret = io_register_rsrc_update(ctx, arg, nr_args, 4452 - IORING_RSRC_BUFFER); 4453 - break; 4454 - case IORING_REGISTER_IOWQ_AFF: 4455 - ret = -EINVAL; 4456 - if (!arg || !nr_args) 4457 - break; 4458 - ret = io_register_iowq_aff(ctx, arg, nr_args); 4459 - break; 4460 - case IORING_UNREGISTER_IOWQ_AFF: 4461 - ret = -EINVAL; 4462 - if (arg || nr_args) 4463 - break; 4464 - ret = io_unregister_iowq_aff(ctx); 4465 - break; 4466 - case IORING_REGISTER_IOWQ_MAX_WORKERS: 4467 - ret = -EINVAL; 4468 - if (!arg || nr_args != 2) 4469 - break; 4470 - ret = io_register_iowq_max_workers(ctx, arg); 4471 - break; 4472 - case IORING_REGISTER_RING_FDS: 4473 - ret = io_ringfd_register(ctx, arg, nr_args); 4474 - break; 4475 - case IORING_UNREGISTER_RING_FDS: 4476 - ret = io_ringfd_unregister(ctx, arg, nr_args); 4477 - break; 4478 - case IORING_REGISTER_PBUF_RING: 4479 - ret = -EINVAL; 4480 - if (!arg || nr_args != 1) 4481 - break; 4482 - ret = io_register_pbuf_ring(ctx, arg); 4483 - break; 4484 - case IORING_UNREGISTER_PBUF_RING: 4485 - ret = -EINVAL; 4486 - if (!arg || nr_args != 1) 4487 - break; 4488 - ret = io_unregister_pbuf_ring(ctx, 
arg); 4489 - break; 4490 - case IORING_REGISTER_SYNC_CANCEL: 4491 - ret = -EINVAL; 4492 - if (!arg || nr_args != 1) 4493 - break; 4494 - ret = io_sync_cancel(ctx, arg); 4495 - break; 4496 - case IORING_REGISTER_FILE_ALLOC_RANGE: 4497 - ret = -EINVAL; 4498 - if (!arg || nr_args) 4499 - break; 4500 - ret = io_register_file_alloc_range(ctx, arg); 4501 - break; 4502 - default: 4503 - ret = -EINVAL; 4504 - break; 4505 - } 4506 - 4507 - return ret; 4508 - } 4509 - 4510 - SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, 4511 - void __user *, arg, unsigned int, nr_args) 4512 - { 4513 - struct io_ring_ctx *ctx; 4514 - long ret = -EBADF; 4515 - struct file *file; 4516 - bool use_registered_ring; 4517 - 4518 - use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); 4519 - opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; 4520 - 4521 - if (opcode >= IORING_REGISTER_LAST) 4522 - return -EINVAL; 4523 - 4524 - if (use_registered_ring) { 4525 - /* 4526 - * Ring fd has been registered via IORING_REGISTER_RING_FDS, we 4527 - * need only dereference our task private array to find it. 
4528 - */ 4529 - struct io_uring_task *tctx = current->io_uring; 4530 - 4531 - if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) 4532 - return -EINVAL; 4533 - fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); 4534 - file = tctx->registered_rings[fd]; 4535 - if (unlikely(!file)) 4536 - return -EBADF; 4537 - } else { 4538 - file = fget(fd); 4539 - if (unlikely(!file)) 4540 - return -EBADF; 4541 - ret = -EOPNOTSUPP; 4542 - if (!io_is_uring_fops(file)) 4543 - goto out_fput; 4544 - } 4545 - 4546 - ctx = file->private_data; 4547 - 4548 - mutex_lock(&ctx->uring_lock); 4549 - ret = __io_uring_register(ctx, opcode, arg, nr_args); 4550 - mutex_unlock(&ctx->uring_lock); 4551 - trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); 4552 - out_fput: 4553 - if (!use_registered_ring) 4554 - fput(file); 4555 - return ret; 4556 4159 } 4557 4160 4558 4161 static int __init io_uring_init(void)

+8 -11

io_uring/io_uring.h

··· 15 15 #include <trace/events/io_uring.h> 16 16 #endif 17 17 18 - enum { 19 - /* 20 - * A hint to not wake right away but delay until there are enough of 21 - * tw's queued to match the number of CQEs the task is waiting for. 22 - * 23 - * Must not be used wirh requests generating more than one CQE. 24 - * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. 25 - */ 26 - IOU_F_TWQ_LAZY_WAKE = 1, 27 - }; 28 18 29 19 enum { 30 20 IOU_OK = 0, ··· 44 54 unsigned issue_flags); 45 55 46 56 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); 47 - bool io_is_uring_fops(struct file *file); 48 57 bool io_alloc_async_data(struct io_kiocb *req); 49 58 void io_req_task_queue(struct io_kiocb *req); 50 59 void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); ··· 77 88 78 89 void *io_mem_alloc(size_t size); 79 90 void io_mem_free(void *ptr); 91 + 92 + enum { 93 + IO_EVENTFD_OP_SIGNAL_BIT, 94 + IO_EVENTFD_OP_FREE_BIT, 95 + }; 96 + 97 + void io_eventfd_ops(struct rcu_head *rcu); 98 + void io_activate_pollwq(struct io_ring_ctx *ctx); 80 99 81 100 #if defined(CONFIG_PROVE_LOCKING) 82 101 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)

+26

io_uring/kbuf.c

··· 750 750 return 0; 751 751 } 752 752 753 + int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) 754 + { 755 + struct io_uring_buf_status buf_status; 756 + struct io_buffer_list *bl; 757 + int i; 758 + 759 + if (copy_from_user(&buf_status, arg, sizeof(buf_status))) 760 + return -EFAULT; 761 + 762 + for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++) 763 + if (buf_status.resv[i]) 764 + return -EINVAL; 765 + 766 + bl = io_buffer_get_list(ctx, buf_status.buf_group); 767 + if (!bl) 768 + return -ENOENT; 769 + if (!bl->is_mapped) 770 + return -EINVAL; 771 + 772 + buf_status.head = bl->head; 773 + if (copy_to_user(arg, &buf_status, sizeof(buf_status))) 774 + return -EFAULT; 775 + 776 + return 0; 777 + } 778 + 753 779 void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) 754 780 { 755 781 struct io_buffer_list *bl;

+1

io_uring/kbuf.h

··· 53 53 54 54 int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); 55 55 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); 56 + int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); 56 57 57 58 void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); 58 59

+9

io_uring/opdef.c

··· 469 469 .prep = io_eopnotsupp_prep, 470 470 #endif 471 471 }, 472 + [IORING_OP_FIXED_FD_INSTALL] = { 473 + .needs_file = 1, 474 + .audit_skip = 1, 475 + .prep = io_install_fixed_fd_prep, 476 + .issue = io_install_fixed_fd, 477 + }, 472 478 }; 473 479 474 480 const struct io_cold_def io_cold_defs[] = { ··· 709 703 }, 710 704 [IORING_OP_FUTEX_WAITV] = { 711 705 .name = "FUTEX_WAITV", 706 + }, 707 + [IORING_OP_FIXED_FD_INSTALL] = { 708 + .name = "FIXED_FD_INSTALL", 712 709 }, 713 710 }; 714 711

+44

io_uring/openclose.c

··· 31 31 u32 file_slot; 32 32 }; 33 33 34 + struct io_fixed_install { 35 + struct file *file; 36 + unsigned int o_flags; 37 + }; 38 + 34 39 static bool io_openat_force_async(struct io_open *open) 35 40 { 36 41 /* ··· 254 249 /* No ->flush() or already async, safely close from here */ 255 250 ret = filp_close(file, current->files); 256 251 err: 252 + if (ret < 0) 253 + req_set_fail(req); 254 + io_req_set_res(req, ret, 0); 255 + return IOU_OK; 256 + } 257 + 258 + int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 259 + { 260 + struct io_fixed_install *ifi; 261 + unsigned int flags; 262 + 263 + if (sqe->off || sqe->addr || sqe->len || sqe->buf_index || 264 + sqe->splice_fd_in || sqe->addr3) 265 + return -EINVAL; 266 + 267 + /* must be a fixed file */ 268 + if (!(req->flags & REQ_F_FIXED_FILE)) 269 + return -EBADF; 270 + 271 + flags = READ_ONCE(sqe->install_fd_flags); 272 + if (flags & ~IORING_FIXED_FD_NO_CLOEXEC) 273 + return -EINVAL; 274 + 275 + /* default to O_CLOEXEC, disable if IORING_FIXED_FD_NO_CLOEXEC is set */ 276 + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); 277 + ifi->o_flags = O_CLOEXEC; 278 + if (flags & IORING_FIXED_FD_NO_CLOEXEC) 279 + ifi->o_flags = 0; 280 + 281 + return 0; 282 + } 283 + 284 + int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) 285 + { 286 + struct io_fixed_install *ifi; 287 + int ret; 288 + 289 + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); 290 + ret = receive_fd(req->file, NULL, ifi->o_flags); 257 291 if (ret < 0) 258 292 req_set_fail(req); 259 293 io_req_set_res(req, ret, 0);

+3

io_uring/openclose.h

··· 12 12 13 13 int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 14 14 int io_close(struct io_kiocb *req, unsigned int issue_flags); 15 + 16 + int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 17 + int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags);

+605

io_uring/register.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Code related to the io_uring_register() syscall 4 + * 5 + * Copyright (C) 2023 Jens Axboe 6 + */ 7 + #include <linux/kernel.h> 8 + #include <linux/errno.h> 9 + #include <linux/syscalls.h> 10 + #include <linux/refcount.h> 11 + #include <linux/bits.h> 12 + #include <linux/fs.h> 13 + #include <linux/file.h> 14 + #include <linux/slab.h> 15 + #include <linux/uaccess.h> 16 + #include <linux/nospec.h> 17 + #include <linux/io_uring.h> 18 + #include <linux/io_uring_types.h> 19 + 20 + #include "io_uring.h" 21 + #include "opdef.h" 22 + #include "tctx.h" 23 + #include "rsrc.h" 24 + #include "sqpoll.h" 25 + #include "register.h" 26 + #include "cancel.h" 27 + #include "kbuf.h" 28 + 29 + #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ 30 + IORING_REGISTER_LAST + IORING_OP_LAST) 31 + 32 + static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, 33 + unsigned int eventfd_async) 34 + { 35 + struct io_ev_fd *ev_fd; 36 + __s32 __user *fds = arg; 37 + int fd; 38 + 39 + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, 40 + lockdep_is_held(&ctx->uring_lock)); 41 + if (ev_fd) 42 + return -EBUSY; 43 + 44 + if (copy_from_user(&fd, fds, sizeof(*fds))) 45 + return -EFAULT; 46 + 47 + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); 48 + if (!ev_fd) 49 + return -ENOMEM; 50 + 51 + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); 52 + if (IS_ERR(ev_fd->cq_ev_fd)) { 53 + int ret = PTR_ERR(ev_fd->cq_ev_fd); 54 + kfree(ev_fd); 55 + return ret; 56 + } 57 + 58 + spin_lock(&ctx->completion_lock); 59 + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; 60 + spin_unlock(&ctx->completion_lock); 61 + 62 + ev_fd->eventfd_async = eventfd_async; 63 + ctx->has_evfd = true; 64 + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); 65 + atomic_set(&ev_fd->refs, 1); 66 + atomic_set(&ev_fd->ops, 0); 67 + return 0; 68 + } 69 + 70 + int io_eventfd_unregister(struct io_ring_ctx *ctx) 71 + { 72 + struct io_ev_fd *ev_fd; 73 + 74 + ev_fd = 
rcu_dereference_protected(ctx->io_ev_fd, 75 + lockdep_is_held(&ctx->uring_lock)); 76 + if (ev_fd) { 77 + ctx->has_evfd = false; 78 + rcu_assign_pointer(ctx->io_ev_fd, NULL); 79 + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) 80 + call_rcu(&ev_fd->rcu, io_eventfd_ops); 81 + return 0; 82 + } 83 + 84 + return -ENXIO; 85 + } 86 + 87 + static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, 88 + unsigned nr_args) 89 + { 90 + struct io_uring_probe *p; 91 + size_t size; 92 + int i, ret; 93 + 94 + size = struct_size(p, ops, nr_args); 95 + if (size == SIZE_MAX) 96 + return -EOVERFLOW; 97 + p = kzalloc(size, GFP_KERNEL); 98 + if (!p) 99 + return -ENOMEM; 100 + 101 + ret = -EFAULT; 102 + if (copy_from_user(p, arg, size)) 103 + goto out; 104 + ret = -EINVAL; 105 + if (memchr_inv(p, 0, size)) 106 + goto out; 107 + 108 + p->last_op = IORING_OP_LAST - 1; 109 + if (nr_args > IORING_OP_LAST) 110 + nr_args = IORING_OP_LAST; 111 + 112 + for (i = 0; i < nr_args; i++) { 113 + p->ops[i].op = i; 114 + if (!io_issue_defs[i].not_supported) 115 + p->ops[i].flags = IO_URING_OP_SUPPORTED; 116 + } 117 + p->ops_len = i; 118 + 119 + ret = 0; 120 + if (copy_to_user(arg, p, size)) 121 + ret = -EFAULT; 122 + out: 123 + kfree(p); 124 + return ret; 125 + } 126 + 127 + int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) 128 + { 129 + const struct cred *creds; 130 + 131 + creds = xa_erase(&ctx->personalities, id); 132 + if (creds) { 133 + put_cred(creds); 134 + return 0; 135 + } 136 + 137 + return -EINVAL; 138 + } 139 + 140 + 141 + static int io_register_personality(struct io_ring_ctx *ctx) 142 + { 143 + const struct cred *creds; 144 + u32 id; 145 + int ret; 146 + 147 + creds = get_current_cred(); 148 + 149 + ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, 150 + XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); 151 + if (ret < 0) { 152 + put_cred(creds); 153 + return ret; 154 + } 155 + return id; 156 + } 157 + 158 + static __cold int 
io_register_restrictions(struct io_ring_ctx *ctx, 159 + void __user *arg, unsigned int nr_args) 160 + { 161 + struct io_uring_restriction *res; 162 + size_t size; 163 + int i, ret; 164 + 165 + /* Restrictions allowed only if rings started disabled */ 166 + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 167 + return -EBADFD; 168 + 169 + /* We allow only a single restrictions registration */ 170 + if (ctx->restrictions.registered) 171 + return -EBUSY; 172 + 173 + if (!arg || nr_args > IORING_MAX_RESTRICTIONS) 174 + return -EINVAL; 175 + 176 + size = array_size(nr_args, sizeof(*res)); 177 + if (size == SIZE_MAX) 178 + return -EOVERFLOW; 179 + 180 + res = memdup_user(arg, size); 181 + if (IS_ERR(res)) 182 + return PTR_ERR(res); 183 + 184 + ret = 0; 185 + 186 + for (i = 0; i < nr_args; i++) { 187 + switch (res[i].opcode) { 188 + case IORING_RESTRICTION_REGISTER_OP: 189 + if (res[i].register_op >= IORING_REGISTER_LAST) { 190 + ret = -EINVAL; 191 + goto out; 192 + } 193 + 194 + __set_bit(res[i].register_op, 195 + ctx->restrictions.register_op); 196 + break; 197 + case IORING_RESTRICTION_SQE_OP: 198 + if (res[i].sqe_op >= IORING_OP_LAST) { 199 + ret = -EINVAL; 200 + goto out; 201 + } 202 + 203 + __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); 204 + break; 205 + case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: 206 + ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; 207 + break; 208 + case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: 209 + ctx->restrictions.sqe_flags_required = res[i].sqe_flags; 210 + break; 211 + default: 212 + ret = -EINVAL; 213 + goto out; 214 + } 215 + } 216 + 217 + out: 218 + /* Reset all restrictions if an error happened */ 219 + if (ret != 0) 220 + memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); 221 + else 222 + ctx->restrictions.registered = true; 223 + 224 + kfree(res); 225 + return ret; 226 + } 227 + 228 + static int io_register_enable_rings(struct io_ring_ctx *ctx) 229 + { 230 + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 231 + return 
-EBADFD; 232 + 233 + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { 234 + WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); 235 + /* 236 + * Lazy activation attempts would fail if it was polled before 237 + * submitter_task is set. 238 + */ 239 + if (wq_has_sleeper(&ctx->poll_wq)) 240 + io_activate_pollwq(ctx); 241 + } 242 + 243 + if (ctx->restrictions.registered) 244 + ctx->restricted = 1; 245 + 246 + ctx->flags &= ~IORING_SETUP_R_DISABLED; 247 + if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) 248 + wake_up(&ctx->sq_data->wait); 249 + return 0; 250 + } 251 + 252 + static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx, 253 + cpumask_var_t new_mask) 254 + { 255 + int ret; 256 + 257 + if (!(ctx->flags & IORING_SETUP_SQPOLL)) { 258 + ret = io_wq_cpu_affinity(current->io_uring, new_mask); 259 + } else { 260 + mutex_unlock(&ctx->uring_lock); 261 + ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); 262 + mutex_lock(&ctx->uring_lock); 263 + } 264 + 265 + return ret; 266 + } 267 + 268 + static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, 269 + void __user *arg, unsigned len) 270 + { 271 + cpumask_var_t new_mask; 272 + int ret; 273 + 274 + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 275 + return -ENOMEM; 276 + 277 + cpumask_clear(new_mask); 278 + if (len > cpumask_size()) 279 + len = cpumask_size(); 280 + 281 + if (in_compat_syscall()) { 282 + ret = compat_get_bitmap(cpumask_bits(new_mask), 283 + (const compat_ulong_t __user *)arg, 284 + len * 8 /* CHAR_BIT */); 285 + } else { 286 + ret = copy_from_user(new_mask, arg, len); 287 + } 288 + 289 + if (ret) { 290 + free_cpumask_var(new_mask); 291 + return -EFAULT; 292 + } 293 + 294 + ret = __io_register_iowq_aff(ctx, new_mask); 295 + free_cpumask_var(new_mask); 296 + return ret; 297 + } 298 + 299 + static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) 300 + { 301 + return __io_register_iowq_aff(ctx, NULL); 302 + } 303 + 304 + static __cold int 
io_register_iowq_max_workers(struct io_ring_ctx *ctx, 305 + void __user *arg) 306 + __must_hold(&ctx->uring_lock) 307 + { 308 + struct io_tctx_node *node; 309 + struct io_uring_task *tctx = NULL; 310 + struct io_sq_data *sqd = NULL; 311 + __u32 new_count[2]; 312 + int i, ret; 313 + 314 + if (copy_from_user(new_count, arg, sizeof(new_count))) 315 + return -EFAULT; 316 + for (i = 0; i < ARRAY_SIZE(new_count); i++) 317 + if (new_count[i] > INT_MAX) 318 + return -EINVAL; 319 + 320 + if (ctx->flags & IORING_SETUP_SQPOLL) { 321 + sqd = ctx->sq_data; 322 + if (sqd) { 323 + /* 324 + * Observe the correct sqd->lock -> ctx->uring_lock 325 + * ordering. Fine to drop uring_lock here, we hold 326 + * a ref to the ctx. 327 + */ 328 + refcount_inc(&sqd->refs); 329 + mutex_unlock(&ctx->uring_lock); 330 + mutex_lock(&sqd->lock); 331 + mutex_lock(&ctx->uring_lock); 332 + if (sqd->thread) 333 + tctx = sqd->thread->io_uring; 334 + } 335 + } else { 336 + tctx = current->io_uring; 337 + } 338 + 339 + BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); 340 + 341 + for (i = 0; i < ARRAY_SIZE(new_count); i++) 342 + if (new_count[i]) 343 + ctx->iowq_limits[i] = new_count[i]; 344 + ctx->iowq_limits_set = true; 345 + 346 + if (tctx && tctx->io_wq) { 347 + ret = io_wq_max_workers(tctx->io_wq, new_count); 348 + if (ret) 349 + goto err; 350 + } else { 351 + memset(new_count, 0, sizeof(new_count)); 352 + } 353 + 354 + if (sqd) { 355 + mutex_unlock(&sqd->lock); 356 + io_put_sq_data(sqd); 357 + } 358 + 359 + if (copy_to_user(arg, new_count, sizeof(new_count))) 360 + return -EFAULT; 361 + 362 + /* that's it for SQPOLL, only the SQPOLL task creates requests */ 363 + if (sqd) 364 + return 0; 365 + 366 + /* now propagate the restriction to all registered users */ 367 + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 368 + struct io_uring_task *tctx = node->task->io_uring; 369 + 370 + if (WARN_ON_ONCE(!tctx->io_wq)) 371 + continue; 372 + 373 + for (i = 0; i < ARRAY_SIZE(new_count); i++) 
374 + new_count[i] = ctx->iowq_limits[i]; 375 + /* ignore errors, it always returns zero anyway */ 376 + (void)io_wq_max_workers(tctx->io_wq, new_count); 377 + } 378 + return 0; 379 + err: 380 + if (sqd) { 381 + mutex_unlock(&sqd->lock); 382 + io_put_sq_data(sqd); 383 + } 384 + return ret; 385 + } 386 + 387 + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, 388 + void __user *arg, unsigned nr_args) 389 + __releases(ctx->uring_lock) 390 + __acquires(ctx->uring_lock) 391 + { 392 + int ret; 393 + 394 + /* 395 + * We don't quiesce the refs for register anymore and so it can't be 396 + * dying as we're holding a file ref here. 397 + */ 398 + if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) 399 + return -ENXIO; 400 + 401 + if (ctx->submitter_task && ctx->submitter_task != current) 402 + return -EEXIST; 403 + 404 + if (ctx->restricted) { 405 + opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); 406 + if (!test_bit(opcode, ctx->restrictions.register_op)) 407 + return -EACCES; 408 + } 409 + 410 + switch (opcode) { 411 + case IORING_REGISTER_BUFFERS: 412 + ret = -EFAULT; 413 + if (!arg) 414 + break; 415 + ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); 416 + break; 417 + case IORING_UNREGISTER_BUFFERS: 418 + ret = -EINVAL; 419 + if (arg || nr_args) 420 + break; 421 + ret = io_sqe_buffers_unregister(ctx); 422 + break; 423 + case IORING_REGISTER_FILES: 424 + ret = -EFAULT; 425 + if (!arg) 426 + break; 427 + ret = io_sqe_files_register(ctx, arg, nr_args, NULL); 428 + break; 429 + case IORING_UNREGISTER_FILES: 430 + ret = -EINVAL; 431 + if (arg || nr_args) 432 + break; 433 + ret = io_sqe_files_unregister(ctx); 434 + break; 435 + case IORING_REGISTER_FILES_UPDATE: 436 + ret = io_register_files_update(ctx, arg, nr_args); 437 + break; 438 + case IORING_REGISTER_EVENTFD: 439 + ret = -EINVAL; 440 + if (nr_args != 1) 441 + break; 442 + ret = io_eventfd_register(ctx, arg, 0); 443 + break; 444 + case IORING_REGISTER_EVENTFD_ASYNC: 445 + ret = 
-EINVAL; 446 + if (nr_args != 1) 447 + break; 448 + ret = io_eventfd_register(ctx, arg, 1); 449 + break; 450 + case IORING_UNREGISTER_EVENTFD: 451 + ret = -EINVAL; 452 + if (arg || nr_args) 453 + break; 454 + ret = io_eventfd_unregister(ctx); 455 + break; 456 + case IORING_REGISTER_PROBE: 457 + ret = -EINVAL; 458 + if (!arg || nr_args > 256) 459 + break; 460 + ret = io_probe(ctx, arg, nr_args); 461 + break; 462 + case IORING_REGISTER_PERSONALITY: 463 + ret = -EINVAL; 464 + if (arg || nr_args) 465 + break; 466 + ret = io_register_personality(ctx); 467 + break; 468 + case IORING_UNREGISTER_PERSONALITY: 469 + ret = -EINVAL; 470 + if (arg) 471 + break; 472 + ret = io_unregister_personality(ctx, nr_args); 473 + break; 474 + case IORING_REGISTER_ENABLE_RINGS: 475 + ret = -EINVAL; 476 + if (arg || nr_args) 477 + break; 478 + ret = io_register_enable_rings(ctx); 479 + break; 480 + case IORING_REGISTER_RESTRICTIONS: 481 + ret = io_register_restrictions(ctx, arg, nr_args); 482 + break; 483 + case IORING_REGISTER_FILES2: 484 + ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); 485 + break; 486 + case IORING_REGISTER_FILES_UPDATE2: 487 + ret = io_register_rsrc_update(ctx, arg, nr_args, 488 + IORING_RSRC_FILE); 489 + break; 490 + case IORING_REGISTER_BUFFERS2: 491 + ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); 492 + break; 493 + case IORING_REGISTER_BUFFERS_UPDATE: 494 + ret = io_register_rsrc_update(ctx, arg, nr_args, 495 + IORING_RSRC_BUFFER); 496 + break; 497 + case IORING_REGISTER_IOWQ_AFF: 498 + ret = -EINVAL; 499 + if (!arg || !nr_args) 500 + break; 501 + ret = io_register_iowq_aff(ctx, arg, nr_args); 502 + break; 503 + case IORING_UNREGISTER_IOWQ_AFF: 504 + ret = -EINVAL; 505 + if (arg || nr_args) 506 + break; 507 + ret = io_unregister_iowq_aff(ctx); 508 + break; 509 + case IORING_REGISTER_IOWQ_MAX_WORKERS: 510 + ret = -EINVAL; 511 + if (!arg || nr_args != 2) 512 + break; 513 + ret = io_register_iowq_max_workers(ctx, arg); 514 + break; 515 + 
case IORING_REGISTER_RING_FDS: 516 + ret = io_ringfd_register(ctx, arg, nr_args); 517 + break; 518 + case IORING_UNREGISTER_RING_FDS: 519 + ret = io_ringfd_unregister(ctx, arg, nr_args); 520 + break; 521 + case IORING_REGISTER_PBUF_RING: 522 + ret = -EINVAL; 523 + if (!arg || nr_args != 1) 524 + break; 525 + ret = io_register_pbuf_ring(ctx, arg); 526 + break; 527 + case IORING_UNREGISTER_PBUF_RING: 528 + ret = -EINVAL; 529 + if (!arg || nr_args != 1) 530 + break; 531 + ret = io_unregister_pbuf_ring(ctx, arg); 532 + break; 533 + case IORING_REGISTER_SYNC_CANCEL: 534 + ret = -EINVAL; 535 + if (!arg || nr_args != 1) 536 + break; 537 + ret = io_sync_cancel(ctx, arg); 538 + break; 539 + case IORING_REGISTER_FILE_ALLOC_RANGE: 540 + ret = -EINVAL; 541 + if (!arg || nr_args) 542 + break; 543 + ret = io_register_file_alloc_range(ctx, arg); 544 + break; 545 + case IORING_REGISTER_PBUF_STATUS: 546 + ret = -EINVAL; 547 + if (!arg || nr_args != 1) 548 + break; 549 + ret = io_register_pbuf_status(ctx, arg); 550 + break; 551 + default: 552 + ret = -EINVAL; 553 + break; 554 + } 555 + 556 + return ret; 557 + } 558 + 559 + SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, 560 + void __user *, arg, unsigned int, nr_args) 561 + { 562 + struct io_ring_ctx *ctx; 563 + long ret = -EBADF; 564 + struct file *file; 565 + bool use_registered_ring; 566 + 567 + use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); 568 + opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; 569 + 570 + if (opcode >= IORING_REGISTER_LAST) 571 + return -EINVAL; 572 + 573 + if (use_registered_ring) { 574 + /* 575 + * Ring fd has been registered via IORING_REGISTER_RING_FDS, we 576 + * need only dereference our task private array to find it. 
577 + */ 578 + struct io_uring_task *tctx = current->io_uring; 579 + 580 + if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) 581 + return -EINVAL; 582 + fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); 583 + file = tctx->registered_rings[fd]; 584 + if (unlikely(!file)) 585 + return -EBADF; 586 + } else { 587 + file = fget(fd); 588 + if (unlikely(!file)) 589 + return -EBADF; 590 + ret = -EOPNOTSUPP; 591 + if (!io_is_uring_fops(file)) 592 + goto out_fput; 593 + } 594 + 595 + ctx = file->private_data; 596 + 597 + mutex_lock(&ctx->uring_lock); 598 + ret = __io_uring_register(ctx, opcode, arg, nr_args); 599 + mutex_unlock(&ctx->uring_lock); 600 + trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); 601 + out_fput: 602 + if (!use_registered_ring) 603 + fput(file); 604 + return ret; 605 + }

+8

io_uring/register.h

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef IORING_REGISTER_H 3 + #define IORING_REGISTER_H 4 + 5 + int io_eventfd_unregister(struct io_ring_ctx *ctx); 6 + int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); 7 + 8 + #endif

+4 -165

io_uring/rsrc.c

··· 24 24 }; 25 25 26 26 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); 27 - static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); 28 27 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, 29 28 struct io_mapped_ubuf **pimu, 30 29 struct page **last_hpage); ··· 156 157 157 158 switch (node->type) { 158 159 case IORING_RSRC_FILE: 159 - io_rsrc_file_put(node->ctx, prsrc); 160 + fput(prsrc->file); 160 161 break; 161 162 case IORING_RSRC_BUFFER: 162 163 io_rsrc_buf_put(node->ctx, prsrc); ··· 401 402 break; 402 403 } 403 404 /* 404 - * Don't allow io_uring instances to be registered. If 405 - * UNIX isn't enabled, then this causes a reference 406 - * cycle and this instance can never get freed. If UNIX 407 - * is enabled we'll handle it just fine, but there's 408 - * still no point in allowing a ring fd as it doesn't 409 - * support regular read/write anyway. 405 + * Don't allow io_uring instances to be registered. 
410 406 */ 411 407 if (io_is_uring_fops(file)) { 412 408 fput(file); 413 409 err = -EBADF; 414 - break; 415 - } 416 - err = io_scm_file_account(ctx, file); 417 - if (err) { 418 - fput(file); 419 410 break; 420 411 } 421 412 *io_get_tag_slot(data, i) = tag; ··· 664 675 for (i = 0; i < ctx->nr_user_files; i++) { 665 676 struct file *file = io_file_from_index(&ctx->file_table, i); 666 677 667 - /* skip scm accounted files, they'll be freed by ->ring_sock */ 668 - if (!file || io_file_need_scm(file)) 678 + if (!file) 669 679 continue; 670 680 io_file_bitmap_clear(&ctx->file_table, i); 671 681 fput(file); 672 682 } 673 683 674 - #if defined(CONFIG_UNIX) 675 - if (ctx->ring_sock) { 676 - struct sock *sock = ctx->ring_sock->sk; 677 - struct sk_buff *skb; 678 - 679 - while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) 680 - kfree_skb(skb); 681 - } 682 - #endif 683 684 io_free_file_tables(&ctx->file_table); 684 685 io_file_table_set_alloc_range(ctx, 0, 0); 685 686 io_rsrc_data_free(ctx->file_data); ··· 695 716 if (!ret) 696 717 __io_sqe_files_unregister(ctx); 697 718 return ret; 698 - } 699 - 700 - /* 701 - * Ensure the UNIX gc is aware of our file set, so we are certain that 702 - * the io_uring can be safely unregistered on process exit, even if we have 703 - * loops in the file referencing. We account only files that can hold other 704 - * files because otherwise they can't form a loop and so are not interesting 705 - * for GC. 706 - */ 707 - int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) 708 - { 709 - #if defined(CONFIG_UNIX) 710 - struct sock *sk = ctx->ring_sock->sk; 711 - struct sk_buff_head *head = &sk->sk_receive_queue; 712 - struct scm_fp_list *fpl; 713 - struct sk_buff *skb; 714 - 715 - if (likely(!io_file_need_scm(file))) 716 - return 0; 717 - 718 - /* 719 - * See if we can merge this file into an existing skb SCM_RIGHTS 720 - * file set. If there's no room, fall back to allocating a new skb 721 - * and filling it in. 
722 - */ 723 - spin_lock_irq(&head->lock); 724 - skb = skb_peek(head); 725 - if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) 726 - __skb_unlink(skb, head); 727 - else 728 - skb = NULL; 729 - spin_unlock_irq(&head->lock); 730 - 731 - if (!skb) { 732 - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); 733 - if (!fpl) 734 - return -ENOMEM; 735 - 736 - skb = alloc_skb(0, GFP_KERNEL); 737 - if (!skb) { 738 - kfree(fpl); 739 - return -ENOMEM; 740 - } 741 - 742 - fpl->user = get_uid(current_user()); 743 - fpl->max = SCM_MAX_FD; 744 - fpl->count = 0; 745 - 746 - UNIXCB(skb).fp = fpl; 747 - skb->sk = sk; 748 - skb->destructor = io_uring_destruct_scm; 749 - refcount_add(skb->truesize, &sk->sk_wmem_alloc); 750 - } 751 - 752 - fpl = UNIXCB(skb).fp; 753 - fpl->fp[fpl->count++] = get_file(file); 754 - unix_inflight(fpl->user, file); 755 - skb_queue_head(head, skb); 756 - fput(file); 757 - #endif 758 - return 0; 759 - } 760 - 761 - static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file) 762 - { 763 - #if defined(CONFIG_UNIX) 764 - struct sock *sock = ctx->ring_sock->sk; 765 - struct sk_buff_head list, *head = &sock->sk_receive_queue; 766 - struct sk_buff *skb; 767 - int i; 768 - 769 - __skb_queue_head_init(&list); 770 - 771 - /* 772 - * Find the skb that holds this file in its SCM_RIGHTS. When found, 773 - * remove this entry and rearrange the file array. 
774 - */ 775 - skb = skb_dequeue(head); 776 - while (skb) { 777 - struct scm_fp_list *fp; 778 - 779 - fp = UNIXCB(skb).fp; 780 - for (i = 0; i < fp->count; i++) { 781 - int left; 782 - 783 - if (fp->fp[i] != file) 784 - continue; 785 - 786 - unix_notinflight(fp->user, fp->fp[i]); 787 - left = fp->count - 1 - i; 788 - if (left) { 789 - memmove(&fp->fp[i], &fp->fp[i + 1], 790 - left * sizeof(struct file *)); 791 - } 792 - fp->count--; 793 - if (!fp->count) { 794 - kfree_skb(skb); 795 - skb = NULL; 796 - } else { 797 - __skb_queue_tail(&list, skb); 798 - } 799 - fput(file); 800 - file = NULL; 801 - break; 802 - } 803 - 804 - if (!file) 805 - break; 806 - 807 - __skb_queue_tail(&list, skb); 808 - 809 - skb = skb_dequeue(head); 810 - } 811 - 812 - if (skb_peek(&list)) { 813 - spin_lock_irq(&head->lock); 814 - while ((skb = __skb_dequeue(&list)) != NULL) 815 - __skb_queue_tail(head, skb); 816 - spin_unlock_irq(&head->lock); 817 - } 818 - #endif 819 - } 820 - 821 - static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 822 - { 823 - struct file *file = prsrc->file; 824 - 825 - if (likely(!io_file_need_scm(file))) 826 - fput(file); 827 - else 828 - io_rsrc_file_scm_put(ctx, file); 829 719 } 830 720 831 721 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, ··· 745 897 goto fail; 746 898 747 899 /* 748 - * Don't allow io_uring instances to be registered. If UNIX 749 - * isn't enabled, then this causes a reference cycle and this 750 - * instance can never get freed. If UNIX is enabled we'll 751 - * handle it just fine, but there's still no point in allowing 752 - * a ring fd as it doesn't support regular read/write anyway. 900 + * Don't allow io_uring instances to be registered. 753 901 */ 754 902 if (io_is_uring_fops(file)) { 755 - fput(file); 756 - goto fail; 757 - } 758 - ret = io_scm_file_account(ctx, file); 759 - if (ret) { 760 903 fput(file); 761 904 goto fail; 762 905 }

-15

io_uring/rsrc.h

··· 75 75 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, 76 76 unsigned nr_args, u64 __user *tags); 77 77 78 - int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file); 79 - 80 - static inline bool io_file_need_scm(struct file *filp) 81 - { 82 - return false; 83 - } 84 - 85 - static inline int io_scm_file_account(struct io_ring_ctx *ctx, 86 - struct file *file) 87 - { 88 - if (likely(!io_file_need_scm(file))) 89 - return 0; 90 - return __io_scm_file_account(ctx, file); 91 - } 92 - 93 78 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, 94 79 unsigned nr_args); 95 80 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,

+8 -4

io_uring/rw.c

··· 10 10 #include <linux/poll.h> 11 11 #include <linux/nospec.h> 12 12 #include <linux/compat.h> 13 - #include <linux/io_uring.h> 13 + #include <linux/io_uring/cmd.h> 14 14 15 15 #include <uapi/linux/io_uring.h> 16 16 ··· 589 589 struct iovec *iov; 590 590 int ret; 591 591 592 + iorw->bytes_done = 0; 593 + iorw->free_iovec = NULL; 594 + 592 595 /* submission path, ->uring_lock should already be taken */ 593 596 ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); 594 597 if (unlikely(ret < 0)) 595 598 return ret; 596 599 597 - iorw->bytes_done = 0; 598 - iorw->free_iovec = iov; 599 - if (iov) 600 + if (iov) { 601 + iorw->free_iovec = iov; 600 602 req->flags |= REQ_F_NEED_CLEANUP; 603 + } 604 + 601 605 return 0; 602 606 } 603 607

+1 -14

io_uring/uring_cmd.c

··· 2 2 #include <linux/kernel.h> 3 3 #include <linux/errno.h> 4 4 #include <linux/file.h> 5 - #include <linux/io_uring.h> 5 + #include <linux/io_uring/cmd.h> 6 6 #include <linux/security.h> 7 7 #include <linux/nospec.h> 8 8 ··· 52 52 } 53 53 EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); 54 54 55 - struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) 56 - { 57 - return cmd_to_io_kiocb(cmd)->task; 58 - } 59 - EXPORT_SYMBOL_GPL(io_uring_cmd_get_task); 60 - 61 55 static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) 62 56 { 63 57 struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); ··· 71 77 __io_req_task_work_add(req, flags); 72 78 } 73 79 EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task); 74 - 75 - void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, 76 - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) 77 - { 78 - __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); 79 - } 80 - EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy); 81 80 82 81 static inline void io_req_set_cqe32_extra(struct io_kiocb *req, 83 82 u64 extra1, u64 extra2)

+1 -1

net/core/scm.c

··· 105 105 if (fd < 0 || !(file = fget_raw(fd))) 106 106 return -EBADF; 107 107 /* don't allow io_uring files */ 108 - if (io_uring_get_socket(file)) { 108 + if (io_is_uring_fops(file)) { 109 109 fput(file); 110 110 return -EINVAL; 111 111 }

+1 -3

net/unix/scm.c

··· 35 35 /* PF_UNIX ? */ 36 36 if (s && ops && ops->family == PF_UNIX) 37 37 u_sock = s; 38 - } else { 39 - /* Could be an io_uring instance */ 40 - u_sock = io_uring_get_socket(filp); 41 38 } 39 + 42 40 return u_sock; 43 41 } 44 42 EXPORT_SYMBOL(unix_get_socket);

+1 -1

security/selinux/hooks.c

··· 92 92 #include <uapi/linux/mount.h> 93 93 #include <linux/fsnotify.h> 94 94 #include <linux/fanotify.h> 95 - #include <linux/io_uring.h> 95 + #include <linux/io_uring/cmd.h> 96 96 #include <uapi/linux/lsm.h> 97 97 98 98 #include "avc.h"

+1 -1

security/smack/smack_lsm.c

··· 43 43 #include <linux/fs_context.h> 44 44 #include <linux/fs_parser.h> 45 45 #include <linux/watch_queue.h> 46 - #include <linux/io_uring.h> 46 + #include <linux/io_uring/cmd.h> 47 47 #include <uapi/linux/lsm.h> 48 48 #include "smack.h" 49 49