Merge tag 'for-5.11/io_uring-2020-12-14' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
"Fairly light set of changes this time around, and mostly some bits
that were pushed out to 5.11 instead of 5.10, fixes/cleanups, and a
few features. In particular:

- Cleanups around iovec import (David Laight, Pavel)

- Add timeout support for io_uring_enter(2), which enables us to
clean up liburing and avoid submitting a timeout SQE from the
completion path.

The big win here is that it allows setups that split SQ and CQ
handling into separate threads to avoid locking, as the CQ side
no longer needs to submit a timeout SQE while waiting for events
(Hao Xu). See the sketches after this list.

- Add support for socket shutdown, and for renameat/unlinkat (also
covered in the sketches after this list).

- SQPOLL cleanups and improvements (Xiaoguang Wang)

- Allow SQPOLL setups with just CAP_SYS_NICE rather than requiring
CAP_SYS_ADMIN, and enable regular (non-fixed) files to be used.

- Cancelation improvements (Pavel)

- Fixed file reference improvements (Pavel)

- IOPOLL related race fixes (Pavel)

- Lots of other little fixes and cleanups (mostly Pavel)"
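
As a userspace illustration of the io_uring_enter(2) timeout support, here
is a minimal sketch. It is not part of the diff below and assumes a liburing
build recent enough to provide io_uring_wait_cqe_timeout() and to route the
timeout through io_uring_enter(2) when the kernel supports it (older
liburing versions fall back to submitting a timeout SQE instead):

/*
 * Sketch: a CQ-side thread reaps completions with a timeout, without
 * queueing a timeout SQE of its own.
 */
#include <errno.h>
#include <stdio.h>
#include <liburing.h>

static int reap_one(struct io_uring *ring)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_cqe *cqe;
	int ret;

	ret = io_uring_wait_cqe_timeout(ring, &cqe, &ts);
	if (ret == -ETIME)
		return 0;	/* timed out, nothing completed */
	if (ret < 0)
		return ret;	/* real error */

	printf("cqe: user_data=%llu res=%d\n",
	       (unsigned long long) cqe->user_data, cqe->res);
	io_uring_cqe_seen(ring, cqe);
	return 1;
}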
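
Another sketch covers the new request types. It assumes the matching
liburing prep helpers (io_uring_prep_shutdown(), io_uring_prep_renameat(),
io_uring_prep_unlinkat()); the file names are made up and NULL checks on
io_uring_get_sqe() are omitted for brevity:

#include <fcntl.h>		/* AT_FDCWD */
#include <sys/socket.h>		/* SHUT_RDWR */
#include <liburing.h>

static int queue_misc_ops(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);		/* IORING_OP_SHUTDOWN */
	io_uring_prep_shutdown(sqe, sockfd, SHUT_RDWR);

	sqe = io_uring_get_sqe(ring);		/* IORING_OP_RENAMEAT */
	io_uring_prep_renameat(sqe, AT_FDCWD, "old.txt",
			       AT_FDCWD, "new.txt", 0);

	sqe = io_uring_get_sqe(ring);		/* IORING_OP_UNLINKAT */
	io_uring_prep_unlinkat(sqe, AT_FDCWD, "scratch.txt", 0);

	/* One submission covers all three; completions arrive on the CQ. */
	return io_uring_submit(ring);
}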

* tag 'for-5.11/io_uring-2020-12-14' of git://git.kernel.dk/linux-block: (43 commits)
io_uring: fix io_cqring_events()'s noflush
io_uring: fix racy IOPOLL flush overflow
io_uring: fix racy IOPOLL completions
io_uring: always let io_iopoll_complete() complete polled io
io_uring: add timeout update
io_uring: restructure io_timeout_cancel()
io_uring: fix files cancellation
io_uring: use bottom half safe lock for fixed file data
io_uring: fix miscounting ios_left
io_uring: change submit file state invariant
io_uring: check kthread stopped flag when sq thread is unparked
io_uring: share fixed_file_refs b/w multiple rsrcs
io_uring: replace inflight_wait with tctx->wait
io_uring: don't take fs for recvmsg/sendmsg
io_uring: only wake up sq thread while current task is in io worker context
io_uring: don't acquire uring_lock twice
io_uring: initialize 'timeout' properly in io_sq_thread()
io_uring: refactor io_sq_thread() handling
io_uring: always batch cancel in *cancel_files()
io_uring: pass files into kill timeouts/poll
...

+829 -595
+2
fs/internal.h
···
 long do_rmdir(int dfd, struct filename *name);
 long do_unlinkat(int dfd, struct filename *name);
 int may_linkat(struct path *link);
+int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
+		 struct filename *newname, unsigned int flags);
 
 /*
  * namespace.c
-10
fs/io-wq.c
···
 	return IO_WQ_CANCEL_NOTFOUND;
 }
 
-static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
-{
-	return work == data;
-}
-
-enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
-{
-	return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false);
-}
-
 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 {
 	int ret = -ENOMEM, node;
-1
fs/io-wq.h
···
 }
 
 void io_wq_cancel_all(struct io_wq *wq);
-enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
 
 typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
+775 -562
fs/io_uring.c
··· 245 246 struct task_struct *thread; 247 struct wait_queue_head wait; 248 }; 249 250 struct io_ring_ctx { ··· 287 struct list_head timeout_list; 288 struct list_head cq_overflow_list; 289 290 - wait_queue_head_t inflight_wait; 291 struct io_uring_sqe *sq_sqes; 292 } ____cacheline_aligned_in_smp; 293 ··· 311 struct io_sq_data *sq_data; /* if using sq thread polling */ 312 313 struct wait_queue_head sqo_sq_wait; 314 - struct wait_queue_entry sqo_wait_entry; 315 struct list_head sqd_list; 316 317 /* ··· 395 */ 396 struct io_poll_iocb { 397 struct file *file; 398 - union { 399 - struct wait_queue_head *head; 400 - u64 addr; 401 - }; 402 __poll_t events; 403 bool done; 404 bool canceled; 405 struct wait_queue_entry wait; 406 }; 407 408 struct io_close { ··· 446 u32 off; 447 u32 target_seq; 448 struct list_head list; 449 }; 450 451 struct io_timeout_rem { 452 struct file *file; 453 u64 addr; 454 }; 455 456 struct io_rw { ··· 549 struct statx __user *buffer; 550 }; 551 552 struct io_completion { 553 struct file *file; 554 struct list_head list; ··· 604 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, 605 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, 606 607 - REQ_F_LINK_HEAD_BIT, 608 REQ_F_FAIL_LINK_BIT, 609 REQ_F_INFLIGHT_BIT, 610 REQ_F_CUR_POS_BIT, ··· 635 /* IOSQE_BUFFER_SELECT */ 636 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), 637 638 - /* head of a link */ 639 - REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT), 640 /* fail rest of links */ 641 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), 642 /* on inflight list */ ··· 677 struct file *file; 678 struct io_rw rw; 679 struct io_poll_iocb poll; 680 struct io_accept accept; 681 struct io_sync sync; 682 struct io_cancel cancel; ··· 694 struct io_splice splice; 695 struct io_provide_buf pbuf; 696 struct io_statx statx; 697 /* use only after cleaning per-op data, see io_clean_op() */ 698 struct io_completion compl; 699 }; ··· 716 struct task_struct *task; 717 u64 user_data; 718 719 - struct list_head link_list; 720 721 /* 722 * 1. used with ctx->iopoll_list with reads/writes 723 * 2. to track reqs with ->files (see io_op_def::file_table) 724 */ 725 struct list_head inflight_entry; 726 - 727 - struct percpu_ref *fixed_file_refs; 728 struct callback_head task_work; 729 /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ 730 struct hlist_node hash_node; ··· 754 void *reqs[IO_IOPOLL_BATCH]; 755 unsigned int free_reqs; 756 757 /* 758 * Batch completion logic 759 */ ··· 766 */ 767 struct file *file; 768 unsigned int fd; 769 - unsigned int has_refs; 770 unsigned int ios_left; 771 }; 772 ··· 788 unsigned buffer_select : 1; 789 /* must always have async data allocated */ 790 unsigned needs_async_data : 1; 791 /* size of async data needed, if any */ 792 unsigned short async_size; 793 unsigned work_flags; ··· 803 .pollin = 1, 804 .buffer_select = 1, 805 .needs_async_data = 1, 806 .async_size = sizeof(struct io_async_rw), 807 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 808 }, ··· 813 .unbound_nonreg_file = 1, 814 .pollout = 1, 815 .needs_async_data = 1, 816 .async_size = sizeof(struct io_async_rw), 817 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 818 IO_WQ_WORK_FSIZE, ··· 826 .needs_file = 1, 827 .unbound_nonreg_file = 1, 828 .pollin = 1, 829 .async_size = sizeof(struct io_async_rw), 830 .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, 831 }, ··· 835 .hash_reg_file = 1, 836 .unbound_nonreg_file = 1, 837 .pollout = 1, 838 .async_size = sizeof(struct io_async_rw), 839 .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | 840 IO_WQ_WORK_MM, ··· 855 .pollout = 1, 856 .needs_async_data = 1, 857 .async_size = sizeof(struct io_async_msghdr), 858 - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 859 - IO_WQ_WORK_FS, 860 }, 861 [IORING_OP_RECVMSG] = { 862 .needs_file = 1, ··· 864 .buffer_select = 1, 865 .needs_async_data = 1, 866 .async_size = sizeof(struct io_async_msghdr), 867 - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 868 - IO_WQ_WORK_FS, 869 }, 870 [IORING_OP_TIMEOUT] = { 871 .needs_async_data = 1, 872 .async_size = sizeof(struct io_timeout_data), 873 .work_flags = IO_WQ_WORK_MM, 874 }, 875 - [IORING_OP_TIMEOUT_REMOVE] = {}, 876 [IORING_OP_ACCEPT] = { 877 .needs_file = 1, 878 .unbound_nonreg_file = 1, ··· 901 }, 902 [IORING_OP_OPENAT] = { 903 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | 904 - IO_WQ_WORK_FS, 905 }, 906 [IORING_OP_CLOSE] = { 907 .needs_file = 1, ··· 920 .unbound_nonreg_file = 1, 921 .pollin = 1, 922 .buffer_select = 1, 923 .async_size = sizeof(struct io_async_rw), 924 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 925 }, ··· 928 .needs_file = 1, 929 .unbound_nonreg_file = 1, 930 .pollout = 1, 931 .async_size = sizeof(struct io_async_rw), 932 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 933 IO_WQ_WORK_FSIZE, ··· 955 }, 956 [IORING_OP_OPENAT2] = { 957 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | 958 - IO_WQ_WORK_BLKCG, 959 }, 960 [IORING_OP_EPOLL_CTL] = { 961 .unbound_nonreg_file = 1, ··· 973 .needs_file = 1, 974 .hash_reg_file = 1, 975 .unbound_nonreg_file = 1, 976 }, 977 }; 978 ··· 1034 } 1035 EXPORT_SYMBOL(io_uring_get_socket); 1036 1037 static inline void io_clean_op(struct io_kiocb *req) 1038 { 1039 if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | ··· 1044 __io_clean_op(req); 1045 } 1046 1047 - static void io_sq_thread_drop_mm(void) 1048 { 1049 struct mm_struct *mm = current->mm; 1050 1051 if (mm) { ··· 1084 mmput(mm); 1085 current->mm = NULL; 1086 } 1087 } 1088 1089 static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) ··· 1146 return -EFAULT; 1147 } 1148 1149 - static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, 1150 - struct io_kiocb *req) 1151 { 1152 - if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM)) 1153 - return 0; 1154 - return __io_sq_thread_acquire_mm(ctx); 1155 } 1156 1157 static 
void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, ··· 1307 INIT_LIST_HEAD(&ctx->iopoll_list); 1308 INIT_LIST_HEAD(&ctx->defer_list); 1309 INIT_LIST_HEAD(&ctx->timeout_list); 1310 - init_waitqueue_head(&ctx->inflight_wait); 1311 spin_lock_init(&ctx->inflight_lock); 1312 INIT_LIST_HEAD(&ctx->inflight_list); 1313 INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work); ··· 1548 { 1549 struct io_kiocb *cur; 1550 1551 - io_prep_async_work(req); 1552 - if (req->flags & REQ_F_LINK_HEAD) 1553 - list_for_each_entry(cur, &req->link_list, link_list) 1554 - io_prep_async_work(cur); 1555 } 1556 1557 static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) ··· 1590 } 1591 } 1592 1593 - static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) 1594 - { 1595 - struct io_ring_ctx *ctx = req->ctx; 1596 - 1597 - if (!tsk || req->task == tsk) 1598 - return true; 1599 - if (ctx->flags & IORING_SETUP_SQPOLL) { 1600 - if (ctx->sq_data && req->task == ctx->sq_data->thread) 1601 - return true; 1602 - } 1603 - return false; 1604 - } 1605 - 1606 /* 1607 * Returns true if we found and killed one or more timeouts 1608 */ 1609 - static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk) 1610 { 1611 struct io_kiocb *req, *tmp; 1612 int canceled = 0; 1613 1614 spin_lock_irq(&ctx->completion_lock); 1615 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { 1616 - if (io_task_match(req, tsk)) { 1617 io_kill_timeout(req); 1618 canceled++; 1619 } ··· 1712 } 1713 } 1714 1715 - static inline bool __io_match_files(struct io_kiocb *req, 1716 - struct files_struct *files) 1717 - { 1718 - return ((req->flags & REQ_F_WORK_INITIALIZED) && 1719 - (req->work.flags & IO_WQ_WORK_FILES)) && 1720 - req->work.identity->files == files; 1721 - } 1722 - 1723 - static bool io_match_files(struct io_kiocb *req, 1724 - struct files_struct *files) 1725 - { 1726 - struct io_kiocb *link; 1727 - 1728 - if (!files) 1729 - return true; 1730 - if (__io_match_files(req, files)) 1731 - return true; 1732 - if (req->flags & REQ_F_LINK_HEAD) { 1733 - list_for_each_entry(link, &req->link_list, link_list) { 1734 - if (__io_match_files(link, files)) 1735 - return true; 1736 - } 1737 - } 1738 - return false; 1739 - } 1740 - 1741 /* Returns true if there are no backlogged entries after the flush */ 1742 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, 1743 struct task_struct *tsk, ··· 1739 1740 cqe = NULL; 1741 list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { 1742 - if (tsk && req->task != tsk) 1743 - continue; 1744 - if (!io_match_files(req, files)) 1745 continue; 1746 1747 cqe = io_get_cqring(ctx); ··· 1935 static inline void io_put_file(struct io_kiocb *req, struct file *file, 1936 bool fixed) 1937 { 1938 - if (fixed) 1939 - percpu_ref_put(req->fixed_file_refs); 1940 - else 1941 fput(file); 1942 } 1943 ··· 1947 kfree(req->async_data); 1948 if (req->file) 1949 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); 1950 - 1951 io_req_clean_work(req); 1952 } 1953 ··· 1971 percpu_ref_put(&ctx->refs); 1972 } 1973 1974 static void io_kill_linked_timeout(struct io_kiocb *req) 1975 { 1976 struct io_ring_ctx *ctx = req->ctx; ··· 1987 unsigned long flags; 1988 1989 spin_lock_irqsave(&ctx->completion_lock, flags); 1990 - link = list_first_entry_or_null(&req->link_list, struct io_kiocb, 1991 - link_list); 1992 /* 1993 * Can happen if a linked timeout fired and link had been like 1994 * req -> link t-out -> link t-out [-> ...] 
··· 1997 struct io_timeout_data *io = link->async_data; 1998 int ret; 1999 2000 - list_del_init(&link->link_list); 2001 ret = hrtimer_try_to_cancel(&io->timer); 2002 if (ret != -1) { 2003 io_cqring_fill_event(link, -ECANCELED); ··· 2015 } 2016 } 2017 2018 - static struct io_kiocb *io_req_link_next(struct io_kiocb *req) 2019 - { 2020 - struct io_kiocb *nxt; 2021 2022 - /* 2023 - * The list should never be empty when we are called here. But could 2024 - * potentially happen if the chain is messed up, check to be on the 2025 - * safe side. 2026 - */ 2027 - if (unlikely(list_empty(&req->link_list))) 2028 - return NULL; 2029 - 2030 - nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); 2031 - list_del_init(&req->link_list); 2032 - if (!list_empty(&nxt->link_list)) 2033 - nxt->flags |= REQ_F_LINK_HEAD; 2034 - return nxt; 2035 - } 2036 - 2037 - /* 2038 - * Called if REQ_F_LINK_HEAD is set, and we fail the head request 2039 - */ 2040 static void io_fail_links(struct io_kiocb *req) 2041 { 2042 struct io_ring_ctx *ctx = req->ctx; 2043 unsigned long flags; 2044 2045 spin_lock_irqsave(&ctx->completion_lock, flags); 2046 - while (!list_empty(&req->link_list)) { 2047 - struct io_kiocb *link = list_first_entry(&req->link_list, 2048 - struct io_kiocb, link_list); 2049 2050 - list_del_init(&link->link_list); 2051 trace_io_uring_fail_link(req, link); 2052 - 2053 io_cqring_fill_event(link, -ECANCELED); 2054 2055 /* ··· 2042 io_put_req_deferred(link, 2); 2043 else 2044 io_double_put_req(link); 2045 } 2046 - 2047 io_commit_cqring(ctx); 2048 spin_unlock_irqrestore(&ctx->completion_lock, flags); 2049 ··· 2052 2053 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) 2054 { 2055 - req->flags &= ~REQ_F_LINK_HEAD; 2056 if (req->flags & REQ_F_LINK_TIMEOUT) 2057 io_kill_linked_timeout(req); 2058 ··· 2061 * dependencies to the next request. In case of failure, fail the rest 2062 * of the chain. 2063 */ 2064 - if (likely(!(req->flags & REQ_F_FAIL_LINK))) 2065 - return io_req_link_next(req); 2066 io_fail_links(req); 2067 return NULL; 2068 } 2069 2070 - static struct io_kiocb *io_req_find_next(struct io_kiocb *req) 2071 { 2072 - if (likely(!(req->flags & REQ_F_LINK_HEAD))) 2073 return NULL; 2074 return __io_req_find_next(req); 2075 } ··· 2132 { 2133 struct io_ring_ctx *ctx = req->ctx; 2134 2135 - if (!__io_sq_thread_acquire_mm(ctx)) { 2136 mutex_lock(&ctx->uring_lock); 2137 __io_queue_sqe(req, NULL); 2138 mutex_unlock(&ctx->uring_lock); ··· 2169 } 2170 } 2171 2172 - static void io_queue_next(struct io_kiocb *req) 2173 { 2174 struct io_kiocb *nxt = io_req_find_next(req); 2175 ··· 2226 io_free_req(req); 2227 return; 2228 } 2229 - if (req->flags & REQ_F_LINK_HEAD) 2230 - io_queue_next(req); 2231 2232 if (req->task != rb->task) { 2233 if (rb->task) { ··· 2328 * we wake up the task, and the next invocation will flush the 2329 * entries. We cannot safely to it from here. 2330 */ 2331 - if (noflush && !list_empty(&ctx->cq_overflow_list)) 2332 return -1U; 2333 2334 io_cqring_overflow_flush(ctx, false, NULL, NULL); ··· 2675 if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) 2676 return false; 2677 2678 - ret = io_sq_thread_acquire_mm(req->ctx, req); 2679 2680 if (io_resubmit_prep(req, ret)) { 2681 refcount_inc(&req->refs); ··· 2723 * find it from a io_iopoll_getevents() thread before the issuer is done 2724 * accessing the kiocb cookie. 
2725 */ 2726 - static void io_iopoll_req_issued(struct io_kiocb *req) 2727 { 2728 struct io_ring_ctx *ctx = req->ctx; 2729 ··· 2752 else 2753 list_add_tail(&req->inflight_entry, &ctx->iopoll_list); 2754 2755 - if ((ctx->flags & IORING_SETUP_SQPOLL) && 2756 wq_has_sleeper(&ctx->sq_data->wait)) 2757 wake_up(&ctx->sq_data->wait); 2758 } 2759 2760 - static void __io_state_file_put(struct io_submit_state *state) 2761 { 2762 - if (state->has_refs) 2763 - fput_many(state->file, state->has_refs); 2764 - state->file = NULL; 2765 } 2766 2767 static inline void io_state_file_put(struct io_submit_state *state) 2768 { 2769 - if (state->file) 2770 __io_state_file_put(state); 2771 } 2772 ··· 2784 if (!state) 2785 return fget(fd); 2786 2787 - if (state->file) { 2788 if (state->fd == fd) { 2789 - state->has_refs--; 2790 return state->file; 2791 } 2792 __io_state_file_put(state); 2793 } 2794 state->file = fget_many(fd, state->ios_left); 2795 - if (!state->file) 2796 return NULL; 2797 2798 state->fd = fd; 2799 - state->has_refs = state->ios_left - 1; 2800 return state->file; 2801 } 2802 ··· 3151 return __io_iov_buffer_select(req, iov, needs_lock); 3152 } 3153 3154 - static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, 3155 struct iovec **iovec, struct iov_iter *iter, 3156 bool needs_lock) 3157 { ··· 3180 3181 ret = import_single_range(rw, buf, sqe_len, *iovec, iter); 3182 *iovec = NULL; 3183 - return ret < 0 ? ret : sqe_len; 3184 } 3185 3186 if (req->flags & REQ_F_BUFFER_SELECT) { ··· 3195 3196 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, 3197 req->ctx->compat); 3198 - } 3199 - 3200 - static ssize_t io_import_iovec(int rw, struct io_kiocb *req, 3201 - struct iovec **iovec, struct iov_iter *iter, 3202 - bool needs_lock) 3203 - { 3204 - struct io_async_rw *iorw = req->async_data; 3205 - 3206 - if (!iorw) 3207 - return __io_import_iovec(rw, req, iovec, iter, needs_lock); 3208 - *iovec = NULL; 3209 - return iov_iter_count(&iorw->iter); 3210 } 3211 3212 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) ··· 3320 struct iovec *iov = iorw->fast_iov; 3321 ssize_t ret; 3322 3323 - ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false); 3324 if (unlikely(ret < 0)) 3325 return ret; 3326 ··· 3453 struct iov_iter __iter, *iter = &__iter; 3454 struct io_async_rw *rw = req->async_data; 3455 ssize_t io_size, ret, ret2; 3456 - size_t iov_count; 3457 bool no_async; 3458 3459 - if (rw) 3460 iter = &rw->iter; 3461 - 3462 - ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); 3463 - if (ret < 0) 3464 - return ret; 3465 - iov_count = iov_iter_count(iter); 3466 - io_size = ret; 3467 req->result = io_size; 3468 ret = 0; 3469 ··· 3479 if (no_async) 3480 goto copy_iov; 3481 3482 - ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count); 3483 if (unlikely(ret)) 3484 goto out_free; 3485 ··· 3498 if (req->file->f_flags & O_NONBLOCK) 3499 goto done; 3500 /* some cases will consume bytes even on error returns */ 3501 - iov_iter_revert(iter, iov_count - iov_iter_count(iter)); 3502 ret = 0; 3503 goto copy_iov; 3504 } else if (ret < 0) { ··· 3581 struct kiocb *kiocb = &req->rw.kiocb; 3582 struct iov_iter __iter, *iter = &__iter; 3583 struct io_async_rw *rw = req->async_data; 3584 - size_t iov_count; 3585 ssize_t ret, ret2, io_size; 3586 3587 - if (rw) 3588 iter = &rw->iter; 3589 - 3590 - ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); 3591 - if (ret < 0) 3592 - return ret; 3593 - iov_count = iov_iter_count(iter); 3594 - io_size = ret; 3595 req->result = 
io_size; 3596 3597 /* Ensure we clear previously set non-block flag */ ··· 3609 (req->flags & REQ_F_ISREG)) 3610 goto copy_iov; 3611 3612 - ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count); 3613 if (unlikely(ret)) 3614 goto out_free; 3615 ··· 3652 } else { 3653 copy_iov: 3654 /* some cases will consume bytes even on error returns */ 3655 - iov_iter_revert(iter, iov_count - iov_iter_count(iter)); 3656 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); 3657 if (!ret) 3658 return -EAGAIN; ··· 3662 if (iovec) 3663 kfree(iovec); 3664 return ret; 3665 } 3666 3667 static int __io_splice_prep(struct io_kiocb *req, ··· 4007 { 4008 u64 flags, mode; 4009 4010 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 4011 return -EINVAL; 4012 mode = READ_ONCE(sqe->len); 4013 flags = READ_ONCE(sqe->open_flags); ··· 4021 size_t len; 4022 int ret; 4023 4024 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 4025 return -EINVAL; 4026 how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4027 len = READ_ONCE(sqe->len); ··· 4151 head = idr_find(&ctx->io_buffer_idr, p->bgid); 4152 if (head) 4153 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); 4154 - 4155 - io_ring_submit_lock(ctx, !force_nonblock); 4156 if (ret < 0) 4157 req_set_fail_links(req); 4158 - __io_req_complete(req, ret, 0, cs); 4159 return 0; 4160 } 4161 ··· 4246 } 4247 } 4248 out: 4249 - io_ring_submit_unlock(ctx, !force_nonblock); 4250 if (ret < 0) 4251 req_set_fail_links(req); 4252 - __io_req_complete(req, ret, 0, cs); 4253 return 0; 4254 } 4255 ··· 4428 io_req_init_async(req); 4429 req->work.flags |= IO_WQ_WORK_NO_CANCEL; 4430 4431 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 4432 return -EINVAL; 4433 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || 4434 sqe->rw_flags || sqe->buf_index) ··· 4910 { 4911 struct io_accept *accept = &req->accept; 4912 4913 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 4914 return -EINVAL; 4915 if (sqe->ioprio || sqe->len || sqe->buf_index) 4916 return -EINVAL; ··· 4951 struct io_connect *conn = &req->connect; 4952 struct io_async_connect *io = req->async_data; 4953 4954 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 4955 return -EINVAL; 4956 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) 4957 return -EINVAL; ··· 5486 /* 5487 * Returns true if we found and killed one or more poll requests 5488 */ 5489 - static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) 5490 { 5491 struct hlist_node *tmp; 5492 struct io_kiocb *req; ··· 5499 5500 list = &ctx->cancel_hash[i]; 5501 hlist_for_each_entry_safe(req, tmp, list, hash_node) { 5502 - if (io_task_match(req, tsk)) 5503 posted += io_poll_remove_one(req); 5504 } 5505 } ··· 5537 sqe->poll_events) 5538 return -EINVAL; 5539 5540 - req->poll.addr = READ_ONCE(sqe->addr); 5541 return 0; 5542 } 5543 ··· 5548 static int io_poll_remove(struct io_kiocb *req) 5549 { 5550 struct io_ring_ctx *ctx = req->ctx; 5551 - u64 addr; 5552 int ret; 5553 5554 - addr = req->poll.addr; 5555 spin_lock_irq(&ctx->completion_lock); 5556 - ret = io_poll_cancel(ctx, addr); 5557 spin_unlock_irq(&ctx->completion_lock); 5558 5559 if (ret < 0) ··· 5644 return HRTIMER_NORESTART; 5645 } 5646 5647 - static int __io_timeout_cancel(struct io_kiocb *req) 5648 { 5649 - struct io_timeout_data *io = req->async_data; 5650 - int ret; 5651 - 5652 - ret = hrtimer_try_to_cancel(&io->timer); 5653 - if (ret 
== -1) 5654 - return -EALREADY; 5655 - list_del_init(&req->timeout.list); 5656 - 5657 - req_set_fail_links(req); 5658 - io_cqring_fill_event(req, -ECANCELED); 5659 - io_put_req_deferred(req, 1); 5660 - return 0; 5661 - } 5662 - 5663 - static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 5664 - { 5665 struct io_kiocb *req; 5666 int ret = -ENOENT; 5667 ··· 5659 } 5660 5661 if (ret == -ENOENT) 5662 - return ret; 5663 5664 - return __io_timeout_cancel(req); 5665 } 5666 5667 static int io_timeout_remove_prep(struct io_kiocb *req, 5668 const struct io_uring_sqe *sqe) 5669 { 5670 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5671 return -EINVAL; 5672 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 5673 return -EINVAL; 5674 - if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags) 5675 return -EINVAL; 5676 5677 - req->timeout_rem.addr = READ_ONCE(sqe->addr); 5678 return 0; 5679 } 5680 ··· 5732 */ 5733 static int io_timeout_remove(struct io_kiocb *req) 5734 { 5735 struct io_ring_ctx *ctx = req->ctx; 5736 int ret; 5737 5738 spin_lock_irq(&ctx->completion_lock); 5739 - ret = io_timeout_cancel(ctx, req->timeout_rem.addr); 5740 5741 io_cqring_fill_event(req, ret); 5742 io_commit_cqring(ctx); ··· 6024 return io_remove_buffers_prep(req, sqe); 6025 case IORING_OP_TEE: 6026 return io_tee_prep(req, sqe); 6027 } 6028 6029 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", ··· 6051 { 6052 struct io_kiocb *pos; 6053 struct io_ring_ctx *ctx = req->ctx; 6054 - u32 total_submitted, nr_reqs = 1; 6055 6056 - if (req->flags & REQ_F_LINK_HEAD) 6057 - list_for_each_entry(pos, &req->link_list, link_list) 6058 - nr_reqs++; 6059 6060 total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; 6061 return total_submitted - nr_reqs; ··· 6106 static void io_req_drop_files(struct io_kiocb *req) 6107 { 6108 struct io_ring_ctx *ctx = req->ctx; 6109 unsigned long flags; 6110 6111 spin_lock_irqsave(&ctx->inflight_lock, flags); 6112 list_del(&req->inflight_entry); 6113 - if (waitqueue_active(&ctx->inflight_wait)) 6114 - wake_up(&ctx->inflight_wait); 6115 spin_unlock_irqrestore(&ctx->inflight_lock, flags); 6116 req->flags &= ~REQ_F_INFLIGHT; 6117 put_files_struct(req->work.identity->files); ··· 6166 case IORING_OP_OPENAT2: 6167 if (req->open.filename) 6168 putname(req->open.filename); 6169 break; 6170 } 6171 req->flags &= ~REQ_F_NEED_CLEANUP; ··· 6280 case IORING_OP_TEE: 6281 ret = io_tee(req, force_nonblock); 6282 break; 6283 default: 6284 ret = -EINVAL; 6285 break; ··· 6305 if (in_async) 6306 mutex_lock(&ctx->uring_lock); 6307 6308 - io_iopoll_req_issued(req); 6309 6310 if (in_async) 6311 mutex_unlock(&ctx->uring_lock); ··· 6345 } 6346 6347 if (ret) { 6348 - req_set_fail_links(req); 6349 - io_req_complete(req, ret); 6350 } 6351 6352 return io_steal_work(req); ··· 6383 return NULL; 6384 fd = array_index_nospec(fd, ctx->nr_user_files); 6385 file = io_file_from_index(ctx, fd); 6386 - if (file) { 6387 - req->fixed_file_refs = &ctx->file_data->node->refs; 6388 - percpu_ref_get(req->fixed_file_refs); 6389 - } 6390 } else { 6391 trace_io_uring_file_get(ctx, fd); 6392 file = __io_file_get(state, fd); ··· 6392 return file; 6393 } 6394 6395 - static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, 6396 - int fd) 6397 - { 6398 - bool fixed; 6399 - 6400 - fixed = (req->flags & REQ_F_FIXED_FILE) != 0; 6401 - if (unlikely(!fixed && io_async_submit(req->ctx))) 6402 - return -EBADF; 6403 - 6404 - req->file = io_file_get(state, req, fd, fixed); 6405 - 
if (req->file || io_op_defs[req->opcode].needs_file_no_error) 6406 - return 0; 6407 - return -EBADF; 6408 - } 6409 - 6410 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) 6411 { 6412 struct io_timeout_data *data = container_of(timer, 6413 struct io_timeout_data, timer); 6414 - struct io_kiocb *req = data->req; 6415 struct io_ring_ctx *ctx = req->ctx; 6416 - struct io_kiocb *prev = NULL; 6417 unsigned long flags; 6418 6419 spin_lock_irqsave(&ctx->completion_lock, flags); 6420 6421 /* 6422 * We don't expect the list to be empty, that will only happen if we 6423 * race with the completion of the linked work. 6424 */ 6425 - if (!list_empty(&req->link_list)) { 6426 - prev = list_entry(req->link_list.prev, struct io_kiocb, 6427 - link_list); 6428 - if (refcount_inc_not_zero(&prev->refs)) 6429 - list_del_init(&req->link_list); 6430 - else 6431 - prev = NULL; 6432 - } 6433 - 6434 spin_unlock_irqrestore(&ctx->completion_lock, flags); 6435 6436 if (prev) { ··· 6427 static void __io_queue_linked_timeout(struct io_kiocb *req) 6428 { 6429 /* 6430 - * If the list is now empty, then our linked request finished before 6431 - * we got a chance to setup the timer 6432 */ 6433 - if (!list_empty(&req->link_list)) { 6434 struct io_timeout_data *data = req->async_data; 6435 6436 data->timer.function = io_link_timeout_fn; ··· 6453 6454 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) 6455 { 6456 - struct io_kiocb *nxt; 6457 6458 - if (!(req->flags & REQ_F_LINK_HEAD)) 6459 - return NULL; 6460 - if (req->flags & REQ_F_LINK_TIMEOUT) 6461 - return NULL; 6462 - 6463 - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, 6464 - link_list); 6465 - if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) 6466 return NULL; 6467 6468 nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; 6469 req->flags |= REQ_F_LINK_TIMEOUT; 6470 return nxt; ··· 6565 io_queue_sqe(req, NULL, cs); 6566 } 6567 6568 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, 6569 - struct io_kiocb **link, struct io_comp_state *cs) 6570 { 6571 struct io_ring_ctx *ctx = req->ctx; 6572 int ret; ··· 6583 * submitted sync once the chain is complete. If none of those 6584 * conditions are true (normal request), then just queue it. 
6585 */ 6586 - if (*link) { 6587 - struct io_kiocb *head = *link; 6588 6589 /* 6590 * Taking sequential execution of a link, draining both sides ··· 6604 return ret; 6605 } 6606 trace_io_uring_link(ctx, req, head); 6607 - list_add_tail(&req->link_list, &head->link_list); 6608 6609 /* last request of a link, enqueue the link */ 6610 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { 6611 io_queue_link_head(head, cs); 6612 - *link = NULL; 6613 } 6614 } else { 6615 if (unlikely(ctx->drain_next)) { ··· 6618 ctx->drain_next = 0; 6619 } 6620 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 6621 - req->flags |= REQ_F_LINK_HEAD; 6622 - INIT_LIST_HEAD(&req->link_list); 6623 - 6624 ret = io_req_defer_prep(req, sqe); 6625 if (unlikely(ret)) 6626 req->flags |= REQ_F_FAIL_LINK; 6627 - *link = req; 6628 } else { 6629 io_queue_sqe(req, sqe, cs); 6630 } ··· 6638 { 6639 if (!list_empty(&state->comp.list)) 6640 io_submit_flush_completions(&state->comp); 6641 - blk_finish_plug(&state->plug); 6642 io_state_file_put(state); 6643 if (state->free_reqs) 6644 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); ··· 6651 static void io_submit_state_start(struct io_submit_state *state, 6652 struct io_ring_ctx *ctx, unsigned int max_ios) 6653 { 6654 - blk_start_plug(&state->plug); 6655 state->comp.nr = 0; 6656 INIT_LIST_HEAD(&state->comp.list); 6657 state->comp.ctx = ctx; 6658 state->free_reqs = 0; 6659 - state->file = NULL; 6660 state->ios_left = max_ios; 6661 } 6662 ··· 6751 req->file = NULL; 6752 req->ctx = ctx; 6753 req->flags = 0; 6754 /* one is dropped after submission, the other at completion */ 6755 refcount_set(&req->refs, 2); 6756 req->task = current; ··· 6761 if (unlikely(req->opcode >= IORING_OP_LAST)) 6762 return -EINVAL; 6763 6764 - if (unlikely(io_sq_thread_acquire_mm(ctx, req))) 6765 return -EFAULT; 6766 6767 sqe_flags = READ_ONCE(sqe->flags); ··· 6794 /* same numerical values with corresponding REQ_F_*, safe to copy */ 6795 req->flags |= sqe_flags; 6796 6797 - if (!io_op_defs[req->opcode].needs_file) 6798 - return 0; 6799 6800 - ret = io_req_set_file(state, req, READ_ONCE(sqe->fd)); 6801 state->ios_left--; 6802 return ret; 6803 } ··· 6821 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) 6822 { 6823 struct io_submit_state state; 6824 - struct io_kiocb *link = NULL; 6825 int i, submitted = 0; 6826 6827 /* if we have a backlog and couldn't flush it all, return BUSY */ ··· 6841 refcount_add(nr, &current->usage); 6842 6843 io_submit_state_start(&state, ctx, nr); 6844 6845 for (i = 0; i < nr; i++) { 6846 const struct io_uring_sqe *sqe; ··· 6887 percpu_counter_sub(&tctx->inflight, unused); 6888 put_task_struct_many(current, unused); 6889 } 6890 - if (link) 6891 - io_queue_link_head(link, &state.comp); 6892 io_submit_state_end(&state); 6893 6894 /* Commit SQ ring head once we've consumed and submitted all SQEs */ ··· 6912 spin_unlock_irq(&ctx->completion_lock); 6913 } 6914 6915 - static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode, 6916 - int sync, void *key) 6917 { 6918 - struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry); 6919 - int ret; 6920 - 6921 - ret = autoremove_wake_function(wqe, mode, sync, key); 6922 - if (ret) { 6923 - unsigned long flags; 6924 - 6925 - spin_lock_irqsave(&ctx->completion_lock, flags); 6926 - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; 6927 - spin_unlock_irqrestore(&ctx->completion_lock, flags); 6928 - } 6929 - return ret; 6930 - } 6931 - 6932 - enum sq_ret { 6933 - SQT_IDLE = 1, 6934 - 
SQT_SPIN = 2, 6935 - SQT_DID_WORK = 4, 6936 - }; 6937 - 6938 - static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx, 6939 - unsigned long start_jiffies, bool cap_entries) 6940 - { 6941 - unsigned long timeout = start_jiffies + ctx->sq_thread_idle; 6942 - struct io_sq_data *sqd = ctx->sq_data; 6943 unsigned int to_submit; 6944 int ret = 0; 6945 6946 - again: 6947 - if (!list_empty(&ctx->iopoll_list)) { 6948 - unsigned nr_events = 0; 6949 - 6950 - mutex_lock(&ctx->uring_lock); 6951 - if (!list_empty(&ctx->iopoll_list) && !need_resched()) 6952 - io_do_iopoll(ctx, &nr_events, 0); 6953 - mutex_unlock(&ctx->uring_lock); 6954 - } 6955 - 6956 to_submit = io_sqring_entries(ctx); 6957 - 6958 - /* 6959 - * If submit got -EBUSY, flag us as needing the application 6960 - * to enter the kernel to reap and flush events. 6961 - */ 6962 - if (!to_submit || ret == -EBUSY || need_resched()) { 6963 - /* 6964 - * Drop cur_mm before scheduling, we can't hold it for 6965 - * long periods (or over schedule()). Do this before 6966 - * adding ourselves to the waitqueue, as the unuse/drop 6967 - * may sleep. 6968 - */ 6969 - io_sq_thread_drop_mm(); 6970 - 6971 - /* 6972 - * We're polling. If we're within the defined idle 6973 - * period, then let us spin without work before going 6974 - * to sleep. The exception is if we got EBUSY doing 6975 - * more IO, we should wait for the application to 6976 - * reap events and wake us up. 6977 - */ 6978 - if (!list_empty(&ctx->iopoll_list) || need_resched() || 6979 - (!time_after(jiffies, timeout) && ret != -EBUSY && 6980 - !percpu_ref_is_dying(&ctx->refs))) 6981 - return SQT_SPIN; 6982 - 6983 - prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry, 6984 - TASK_INTERRUPTIBLE); 6985 - 6986 - /* 6987 - * While doing polled IO, before going to sleep, we need 6988 - * to check if there are new reqs added to iopoll_list, 6989 - * it is because reqs may have been punted to io worker 6990 - * and will be added to iopoll_list later, hence check 6991 - * the iopoll_list again. 
6992 - */ 6993 - if ((ctx->flags & IORING_SETUP_IOPOLL) && 6994 - !list_empty_careful(&ctx->iopoll_list)) { 6995 - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); 6996 - goto again; 6997 - } 6998 - 6999 - to_submit = io_sqring_entries(ctx); 7000 - if (!to_submit || ret == -EBUSY) 7001 - return SQT_IDLE; 7002 - } 7003 - 7004 - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); 7005 - io_ring_clear_wakeup_flag(ctx); 7006 - 7007 /* if we're handling multiple rings, cap submit size for fairness */ 7008 if (cap_entries && to_submit > 8) 7009 to_submit = 8; 7010 7011 - mutex_lock(&ctx->uring_lock); 7012 - if (likely(!percpu_ref_is_dying(&ctx->refs))) 7013 - ret = io_submit_sqes(ctx, to_submit); 7014 - mutex_unlock(&ctx->uring_lock); 7015 7016 if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) 7017 wake_up(&ctx->sqo_sq_wait); 7018 7019 - return SQT_DID_WORK; 7020 } 7021 7022 static void io_sqd_init_new(struct io_sq_data *sqd) ··· 6959 6960 while (!list_empty(&sqd->ctx_new_list)) { 6961 ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list); 6962 - init_wait(&ctx->sqo_wait_entry); 6963 - ctx->sqo_wait_entry.func = io_sq_wake_function; 6964 list_move_tail(&ctx->sqd_list, &sqd->ctx_list); 6965 complete(&ctx->sq_thread_comp); 6966 } 6967 } 6968 6969 static int io_sq_thread(void *data) 6970 { 6971 struct cgroup_subsys_state *cur_css = NULL; 6972 const struct cred *old_cred = NULL; 6973 struct io_sq_data *sqd = data; 6974 struct io_ring_ctx *ctx; 6975 - unsigned long start_jiffies; 6976 6977 - start_jiffies = jiffies; 6978 while (!kthread_should_stop()) { 6979 - enum sq_ret ret = 0; 6980 - bool cap_entries; 6981 6982 /* 6983 * Any changes to the sqd lists are synchronized through the 6984 * kthread parking. This synchronizes the thread vs users, 6985 * the users are synchronized on the sqd->ctx_lock. 6986 */ 6987 - if (kthread_should_park()) 6988 kthread_parkme(); 6989 6990 - if (unlikely(!list_empty(&sqd->ctx_new_list))) 6991 io_sqd_init_new(sqd); 6992 6993 cap_entries = !list_is_singular(&sqd->ctx_list); 6994 - 6995 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 6996 if (current->cred != ctx->creds) { 6997 if (old_cred) ··· 7021 current->sessionid = ctx->sessionid; 7022 #endif 7023 7024 - ret |= __io_sq_thread(ctx, start_jiffies, cap_entries); 7025 7026 - io_sq_thread_drop_mm(); 7027 } 7028 7029 - if (ret & SQT_SPIN) { 7030 io_run_task_work(); 7031 cond_resched(); 7032 - } else if (ret == SQT_IDLE) { 7033 - if (kthread_should_park()) 7034 - continue; 7035 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7036 io_ring_set_wakeup_flag(ctx); 7037 schedule(); 7038 - start_jiffies = jiffies; 7039 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7040 io_ring_clear_wakeup_flag(ctx); 7041 } 7042 } 7043 7044 io_run_task_work(); ··· 7072 io_sq_thread_unassociate_blkcg(); 7073 if (old_cred) 7074 revert_creds(old_cred); 7075 7076 kthread_parkme(); 7077 ··· 7132 * application must reap them itself, as they reside on the shared cq ring. 
7133 */ 7134 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, 7135 - const sigset_t __user *sig, size_t sigsz) 7136 { 7137 struct io_wait_queue iowq = { 7138 .wq = { ··· 7145 .to_wait = min_events, 7146 }; 7147 struct io_rings *rings = ctx->rings; 7148 int ret = 0; 7149 7150 do { ··· 7169 return ret; 7170 } 7171 7172 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 7173 trace_io_uring_cqring_wait(ctx, min_events); 7174 do { ··· 7188 break; 7189 if (io_should_wake(&iowq, false)) 7190 break; 7191 - schedule(); 7192 } while (1); 7193 finish_wait(&ctx->wait, &iowq.wq); 7194 ··· 7245 if (!data) 7246 return -ENXIO; 7247 7248 - spin_lock(&data->lock); 7249 ref_node = data->node; 7250 - spin_unlock(&data->lock); 7251 if (ref_node) 7252 percpu_ref_kill(&ref_node->refs); 7253 ··· 7370 7371 mutex_lock(&sqd->ctx_lock); 7372 list_del(&ctx->sqd_list); 7373 mutex_unlock(&sqd->ctx_lock); 7374 7375 - if (sqd->thread) { 7376 - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); 7377 io_sq_thread_unpark(sqd); 7378 - } 7379 7380 io_put_sq_data(sqd); 7381 ctx->sq_data = NULL; ··· 7629 data = ref_node->file_data; 7630 ctx = data->ctx; 7631 7632 - spin_lock(&data->lock); 7633 ref_node->done = true; 7634 7635 while (!list_empty(&data->ref_list)) { ··· 7641 list_del(&ref_node->node); 7642 first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist); 7643 } 7644 - spin_unlock(&data->lock); 7645 7646 if (percpu_ref_is_dying(&data->refs)) 7647 delay = 0; ··· 7764 } 7765 7766 file_data->node = ref_node; 7767 - spin_lock(&file_data->lock); 7768 list_add_tail(&ref_node->node, &file_data->ref_list); 7769 - spin_unlock(&file_data->lock); 7770 percpu_ref_get(&file_data->refs); 7771 return ret; 7772 out_fput: ··· 7923 7924 if (needs_switch) { 7925 percpu_ref_kill(&data->node->refs); 7926 - spin_lock(&data->lock); 7927 list_add_tail(&ref_node->node, &data->ref_list); 7928 data->node = ref_node; 7929 - spin_unlock(&data->lock); 7930 percpu_ref_get(&ctx->file_data->refs); 7931 } else 7932 destroy_fixed_file_ref_node(ref_node); ··· 8054 struct io_sq_data *sqd; 8055 8056 ret = -EPERM; 8057 - if (!capable(CAP_SYS_ADMIN)) 8058 goto err; 8059 8060 sqd = io_get_sq_data(p); ··· 8640 * as nobody else will be looking for them. 
8641 */ 8642 do { 8643 - if (ctx->rings) 8644 - io_cqring_overflow_flush(ctx, true, NULL, NULL); 8645 io_iopoll_try_reap_events(ctx); 8646 } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); 8647 io_ring_ctx_free(ctx); ··· 8649 { 8650 mutex_lock(&ctx->uring_lock); 8651 percpu_ref_kill(&ctx->refs); 8652 mutex_unlock(&ctx->uring_lock); 8653 8654 - io_kill_timeouts(ctx, NULL); 8655 - io_poll_remove_all(ctx, NULL); 8656 8657 if (ctx->io_wq) 8658 io_wq_cancel_all(ctx->io_wq); 8659 8660 /* if we failed setting up the ctx, we might not have any rings */ 8661 - if (ctx->rings) 8662 - io_cqring_overflow_flush(ctx, true, NULL, NULL); 8663 io_iopoll_try_reap_events(ctx); 8664 idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); 8665 ··· 8690 return 0; 8691 } 8692 8693 - static bool io_wq_files_match(struct io_wq_work *work, void *data) 8694 - { 8695 - struct files_struct *files = data; 8696 8697 - return !files || ((work->flags & IO_WQ_WORK_FILES) && 8698 - work->identity->files == files); 8699 - } 8700 - 8701 - /* 8702 - * Returns true if 'preq' is the link parent of 'req' 8703 - */ 8704 - static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) 8705 - { 8706 - struct io_kiocb *link; 8707 - 8708 - if (!(preq->flags & REQ_F_LINK_HEAD)) 8709 - return false; 8710 - 8711 - list_for_each_entry(link, &preq->link_list, link_list) { 8712 - if (link == req) 8713 - return true; 8714 - } 8715 - 8716 - return false; 8717 - } 8718 - 8719 - /* 8720 - * We're looking to cancel 'req' because it's holding on to our files, but 8721 - * 'req' could be a link to another request. See if it is, and cancel that 8722 - * parent request if so. 8723 - */ 8724 - static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req) 8725 - { 8726 - struct hlist_node *tmp; 8727 - struct io_kiocb *preq; 8728 - bool found = false; 8729 - int i; 8730 - 8731 - spin_lock_irq(&ctx->completion_lock); 8732 - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 8733 - struct hlist_head *list; 8734 - 8735 - list = &ctx->cancel_hash[i]; 8736 - hlist_for_each_entry_safe(preq, tmp, list, hash_node) { 8737 - found = io_match_link(preq, req); 8738 - if (found) { 8739 - io_poll_remove_one(preq); 8740 - break; 8741 - } 8742 - } 8743 - } 8744 - spin_unlock_irq(&ctx->completion_lock); 8745 - return found; 8746 - } 8747 - 8748 - static bool io_timeout_remove_link(struct io_ring_ctx *ctx, 8749 - struct io_kiocb *req) 8750 - { 8751 - struct io_kiocb *preq; 8752 - bool found = false; 8753 - 8754 - spin_lock_irq(&ctx->completion_lock); 8755 - list_for_each_entry(preq, &ctx->timeout_list, timeout.list) { 8756 - found = io_match_link(preq, req); 8757 - if (found) { 8758 - __io_timeout_cancel(preq); 8759 - break; 8760 - } 8761 - } 8762 - spin_unlock_irq(&ctx->completion_lock); 8763 - return found; 8764 - } 8765 - 8766 - static bool io_cancel_link_cb(struct io_wq_work *work, void *data) 8767 { 8768 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 8769 bool ret; 8770 8771 - if (req->flags & REQ_F_LINK_TIMEOUT) { 8772 unsigned long flags; 8773 struct io_ring_ctx *ctx = req->ctx; 8774 8775 /* protect against races with linked timeouts */ 8776 spin_lock_irqsave(&ctx->completion_lock, flags); 8777 - ret = io_match_link(req, data); 8778 spin_unlock_irqrestore(&ctx->completion_lock, flags); 8779 } else { 8780 - ret = io_match_link(req, data); 8781 } 8782 return ret; 8783 - } 8784 - 8785 - static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) 8786 - { 8787 - enum io_wq_cancel 
cret; 8788 - 8789 - /* cancel this particular work, if it's running */ 8790 - cret = io_wq_cancel_work(ctx->io_wq, &req->work); 8791 - if (cret != IO_WQ_CANCEL_NOTFOUND) 8792 - return; 8793 - 8794 - /* find links that hold this pending, cancel those */ 8795 - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true); 8796 - if (cret != IO_WQ_CANCEL_NOTFOUND) 8797 - return; 8798 - 8799 - /* if we have a poll link holding this pending, cancel that */ 8800 - if (io_poll_remove_link(ctx, req)) 8801 - return; 8802 - 8803 - /* final option, timeout link is holding this req pending */ 8804 - io_timeout_remove_link(ctx, req); 8805 } 8806 8807 static void io_cancel_defer_files(struct io_ring_ctx *ctx, ··· 8724 8725 spin_lock_irq(&ctx->completion_lock); 8726 list_for_each_entry_reverse(de, &ctx->defer_list, list) { 8727 - if (io_task_match(de->req, task) && 8728 - io_match_files(de->req, files)) { 8729 list_cut_position(&list, &ctx->defer_list, &de->list); 8730 break; 8731 } ··· 8741 } 8742 } 8743 8744 - /* 8745 - * Returns true if we found and killed one or more files pinning requests 8746 - */ 8747 - static bool io_uring_cancel_files(struct io_ring_ctx *ctx, 8748 struct files_struct *files) 8749 { 8750 - if (list_empty_careful(&ctx->inflight_list)) 8751 - return false; 8752 - 8753 - /* cancel all at once, should be faster than doing it one by one*/ 8754 - io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); 8755 - 8756 while (!list_empty_careful(&ctx->inflight_list)) { 8757 - struct io_kiocb *cancel_req = NULL, *req; 8758 DEFINE_WAIT(wait); 8759 8760 spin_lock_irq(&ctx->inflight_lock); 8761 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { 8762 - if (files && (req->work.flags & IO_WQ_WORK_FILES) && 8763 req->work.identity->files != files) 8764 continue; 8765 - /* req is being completed, ignore */ 8766 - if (!refcount_inc_not_zero(&req->refs)) 8767 - continue; 8768 - cancel_req = req; 8769 break; 8770 } 8771 - if (cancel_req) 8772 - prepare_to_wait(&ctx->inflight_wait, &wait, 8773 - TASK_UNINTERRUPTIBLE); 8774 spin_unlock_irq(&ctx->inflight_lock); 8775 8776 /* We need to keep going until we don't find a matching req */ 8777 - if (!cancel_req) 8778 break; 8779 - /* cancel this request, or head link requests */ 8780 - io_attempt_cancel(ctx, cancel_req); 8781 - io_put_req(cancel_req); 8782 /* cancellations _may_ trigger task work */ 8783 io_run_task_work(); 8784 schedule(); 8785 - finish_wait(&ctx->inflight_wait, &wait); 8786 } 8787 - 8788 - return true; 8789 } 8790 8791 - static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 8792 { 8793 - struct io_kiocb *req = container_of(work, struct io_kiocb, work); 8794 - struct task_struct *task = data; 8795 - 8796 - return io_task_match(req, task); 8797 - } 8798 - 8799 - static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, 8800 - struct task_struct *task, 8801 - struct files_struct *files) 8802 - { 8803 - bool ret; 8804 - 8805 - ret = io_uring_cancel_files(ctx, files); 8806 - if (!files) { 8807 enum io_wq_cancel cret; 8808 8809 - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true); 8810 if (cret != IO_WQ_CANCEL_NOTFOUND) 8811 ret = true; 8812 ··· 8798 } 8799 } 8800 8801 - ret |= io_poll_remove_all(ctx, task); 8802 - ret |= io_kill_timeouts(ctx, task); 8803 } 8804 - 8805 - return ret; 8806 } 8807 8808 /* ··· 8823 io_sq_thread_park(ctx->sq_data); 8824 } 8825 8826 - if (files) 8827 - io_cancel_defer_files(ctx, NULL, files); 8828 - else 8829 - io_cancel_defer_files(ctx, task, NULL); 8830 - 
8831 io_cqring_overflow_flush(ctx, true, task, files); 8832 8833 - while (__io_uring_cancel_task_requests(ctx, task, files)) { 8834 - io_run_task_work(); 8835 - cond_resched(); 8836 - } 8837 8838 if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { 8839 atomic_dec(&task->io_uring->in_idle); ··· 9089 finish_wait(&ctx->sqo_sq_wait, &wait); 9090 } 9091 9092 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, 9093 - u32, min_complete, u32, flags, const sigset_t __user *, sig, 9094 - size_t, sigsz) 9095 { 9096 struct io_ring_ctx *ctx; 9097 long ret = -EBADF; ··· 9131 io_run_task_work(); 9132 9133 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 9134 - IORING_ENTER_SQ_WAIT)) 9135 return -EINVAL; 9136 9137 f = fdget(fd); ··· 9158 */ 9159 ret = 0; 9160 if (ctx->flags & IORING_SETUP_SQPOLL) { 9161 if (!list_empty_careful(&ctx->cq_overflow_list)) 9162 io_cqring_overflow_flush(ctx, false, NULL, NULL); 9163 if (flags & IORING_ENTER_SQ_WAKEUP) 9164 wake_up(&ctx->sq_data->wait); 9165 if (flags & IORING_ENTER_SQ_WAIT) ··· 9179 goto out; 9180 } 9181 if (flags & IORING_ENTER_GETEVENTS) { 9182 min_complete = min(min_complete, ctx->cq_entries); 9183 9184 /* ··· 9198 !(ctx->flags & IORING_SETUP_SQPOLL)) { 9199 ret = io_iopoll_check(ctx, min_complete); 9200 } else { 9201 - ret = io_cqring_wait(ctx, min_complete, sig, sigsz); 9202 } 9203 } 9204 ··· 9566 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | 9567 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | 9568 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | 9569 - IORING_FEAT_POLL_32BITS; 9570 9571 if (copy_to_user(params, p, sizeof(*p))) { 9572 ret = -EFAULT;
··· 245 246 struct task_struct *thread; 247 struct wait_queue_head wait; 248 + 249 + unsigned sq_thread_idle; 250 }; 251 252 struct io_ring_ctx { ··· 285 struct list_head timeout_list; 286 struct list_head cq_overflow_list; 287 288 struct io_uring_sqe *sq_sqes; 289 } ____cacheline_aligned_in_smp; 290 ··· 310 struct io_sq_data *sq_data; /* if using sq thread polling */ 311 312 struct wait_queue_head sqo_sq_wait; 313 struct list_head sqd_list; 314 315 /* ··· 395 */ 396 struct io_poll_iocb { 397 struct file *file; 398 + struct wait_queue_head *head; 399 __poll_t events; 400 bool done; 401 bool canceled; 402 struct wait_queue_entry wait; 403 + }; 404 + 405 + struct io_poll_remove { 406 + struct file *file; 407 + u64 addr; 408 }; 409 410 struct io_close { ··· 444 u32 off; 445 u32 target_seq; 446 struct list_head list; 447 + /* head of the link, used by linked timeouts only */ 448 + struct io_kiocb *head; 449 }; 450 451 struct io_timeout_rem { 452 struct file *file; 453 u64 addr; 454 + 455 + /* timeout update */ 456 + struct timespec64 ts; 457 + u32 flags; 458 }; 459 460 struct io_rw { ··· 541 struct statx __user *buffer; 542 }; 543 544 + struct io_shutdown { 545 + struct file *file; 546 + int how; 547 + }; 548 + 549 + struct io_rename { 550 + struct file *file; 551 + int old_dfd; 552 + int new_dfd; 553 + struct filename *oldpath; 554 + struct filename *newpath; 555 + int flags; 556 + }; 557 + 558 + struct io_unlink { 559 + struct file *file; 560 + int dfd; 561 + int flags; 562 + struct filename *filename; 563 + }; 564 + 565 struct io_completion { 566 struct file *file; 567 struct list_head list; ··· 575 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, 576 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, 577 578 REQ_F_FAIL_LINK_BIT, 579 REQ_F_INFLIGHT_BIT, 580 REQ_F_CUR_POS_BIT, ··· 607 /* IOSQE_BUFFER_SELECT */ 608 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), 609 610 /* fail rest of links */ 611 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), 612 /* on inflight list */ ··· 651 struct file *file; 652 struct io_rw rw; 653 struct io_poll_iocb poll; 654 + struct io_poll_remove poll_remove; 655 struct io_accept accept; 656 struct io_sync sync; 657 struct io_cancel cancel; ··· 667 struct io_splice splice; 668 struct io_provide_buf pbuf; 669 struct io_statx statx; 670 + struct io_shutdown shutdown; 671 + struct io_rename rename; 672 + struct io_unlink unlink; 673 /* use only after cleaning per-op data, see io_clean_op() */ 674 struct io_completion compl; 675 }; ··· 686 struct task_struct *task; 687 u64 user_data; 688 689 + struct io_kiocb *link; 690 + struct percpu_ref *fixed_file_refs; 691 692 /* 693 * 1. used with ctx->iopoll_list with reads/writes 694 * 2. to track reqs with ->files (see io_op_def::file_table) 695 */ 696 struct list_head inflight_entry; 697 struct callback_head task_work; 698 /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ 699 struct hlist_node hash_node; ··· 725 void *reqs[IO_IOPOLL_BATCH]; 726 unsigned int free_reqs; 727 728 + bool plug_started; 729 + 730 /* 731 * Batch completion logic 732 */ ··· 735 */ 736 struct file *file; 737 unsigned int fd; 738 + unsigned int file_refs; 739 unsigned int ios_left; 740 }; 741 ··· 757 unsigned buffer_select : 1; 758 /* must always have async data allocated */ 759 unsigned needs_async_data : 1; 760 + /* should block plug */ 761 + unsigned plug : 1; 762 /* size of async data needed, if any */ 763 unsigned short async_size; 764 unsigned work_flags; ··· 770 .pollin = 1, 771 .buffer_select = 1, 772 .needs_async_data = 1, 773 + .plug = 1, 774 .async_size = sizeof(struct io_async_rw), 775 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 776 }, ··· 779 .unbound_nonreg_file = 1, 780 .pollout = 1, 781 .needs_async_data = 1, 782 + .plug = 1, 783 .async_size = sizeof(struct io_async_rw), 784 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 785 IO_WQ_WORK_FSIZE, ··· 791 .needs_file = 1, 792 .unbound_nonreg_file = 1, 793 .pollin = 1, 794 + .plug = 1, 795 .async_size = sizeof(struct io_async_rw), 796 .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, 797 }, ··· 799 .hash_reg_file = 1, 800 .unbound_nonreg_file = 1, 801 .pollout = 1, 802 + .plug = 1, 803 .async_size = sizeof(struct io_async_rw), 804 .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | 805 IO_WQ_WORK_MM, ··· 818 .pollout = 1, 819 .needs_async_data = 1, 820 .async_size = sizeof(struct io_async_msghdr), 821 + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 822 }, 823 [IORING_OP_RECVMSG] = { 824 .needs_file = 1, ··· 828 .buffer_select = 1, 829 .needs_async_data = 1, 830 .async_size = sizeof(struct io_async_msghdr), 831 + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 832 }, 833 [IORING_OP_TIMEOUT] = { 834 .needs_async_data = 1, 835 .async_size = sizeof(struct io_timeout_data), 836 .work_flags = IO_WQ_WORK_MM, 837 }, 838 + [IORING_OP_TIMEOUT_REMOVE] = { 839 + /* used by timeout updates' prep() */ 840 + .work_flags = IO_WQ_WORK_MM, 841 + }, 842 [IORING_OP_ACCEPT] = { 843 .needs_file = 1, 844 .unbound_nonreg_file = 1, ··· 863 }, 864 [IORING_OP_OPENAT] = { 865 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | 866 + IO_WQ_WORK_FS | IO_WQ_WORK_MM, 867 }, 868 [IORING_OP_CLOSE] = { 869 .needs_file = 1, ··· 882 .unbound_nonreg_file = 1, 883 .pollin = 1, 884 .buffer_select = 1, 885 + .plug = 1, 886 .async_size = sizeof(struct io_async_rw), 887 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 888 }, ··· 889 .needs_file = 1, 890 .unbound_nonreg_file = 1, 891 .pollout = 1, 892 + .plug = 1, 893 .async_size = sizeof(struct io_async_rw), 894 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 895 IO_WQ_WORK_FSIZE, ··· 915 }, 916 [IORING_OP_OPENAT2] = { 917 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | 918 + IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, 919 }, 920 [IORING_OP_EPOLL_CTL] = { 921 .unbound_nonreg_file = 1, ··· 933 .needs_file = 1, 934 .hash_reg_file = 1, 935 .unbound_nonreg_file = 1, 936 + }, 937 + [IORING_OP_SHUTDOWN] = { 938 + .needs_file = 1, 939 + }, 940 + [IORING_OP_RENAMEAT] = { 941 + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | 942 + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, 943 + }, 944 + [IORING_OP_UNLINKAT] = { 945 + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | 946 + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, 947 }, 948 }; 949 ··· 983 } 984 EXPORT_SYMBOL(io_uring_get_socket); 985 986 + #define io_for_each_link(pos, head) \ 987 + for (pos = (head); pos; pos = pos->link) 988 + 989 static inline void 
io_clean_op(struct io_kiocb *req) 990 { 991 if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | ··· 990 __io_clean_op(req); 991 } 992 993 + static inline void io_set_resource_node(struct io_kiocb *req) 994 { 995 + struct io_ring_ctx *ctx = req->ctx; 996 + 997 + if (!req->fixed_file_refs) { 998 + req->fixed_file_refs = &ctx->file_data->node->refs; 999 + percpu_ref_get(req->fixed_file_refs); 1000 + } 1001 + } 1002 + 1003 + static bool io_match_task(struct io_kiocb *head, 1004 + struct task_struct *task, 1005 + struct files_struct *files) 1006 + { 1007 + struct io_kiocb *req; 1008 + 1009 + if (task && head->task != task) 1010 + return false; 1011 + if (!files) 1012 + return true; 1013 + 1014 + io_for_each_link(req, head) { 1015 + if ((req->flags & REQ_F_WORK_INITIALIZED) && 1016 + (req->work.flags & IO_WQ_WORK_FILES) && 1017 + req->work.identity->files == files) 1018 + return true; 1019 + } 1020 + return false; 1021 + } 1022 + 1023 + static void io_sq_thread_drop_mm_files(void) 1024 + { 1025 + struct files_struct *files = current->files; 1026 struct mm_struct *mm = current->mm; 1027 1028 if (mm) { ··· 999 mmput(mm); 1000 current->mm = NULL; 1001 } 1002 + if (files) { 1003 + struct nsproxy *nsproxy = current->nsproxy; 1004 + 1005 + task_lock(current); 1006 + current->files = NULL; 1007 + current->nsproxy = NULL; 1008 + task_unlock(current); 1009 + put_files_struct(files); 1010 + put_nsproxy(nsproxy); 1011 + } 1012 + } 1013 + 1014 + static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) 1015 + { 1016 + if (!current->files) { 1017 + struct files_struct *files; 1018 + struct nsproxy *nsproxy; 1019 + 1020 + task_lock(ctx->sqo_task); 1021 + files = ctx->sqo_task->files; 1022 + if (!files) { 1023 + task_unlock(ctx->sqo_task); 1024 + return -EOWNERDEAD; 1025 + } 1026 + atomic_inc(&files->count); 1027 + get_nsproxy(ctx->sqo_task->nsproxy); 1028 + nsproxy = ctx->sqo_task->nsproxy; 1029 + task_unlock(ctx->sqo_task); 1030 + 1031 + task_lock(current); 1032 + current->files = files; 1033 + current->nsproxy = nsproxy; 1034 + task_unlock(current); 1035 + } 1036 + return 0; 1037 } 1038 1039 static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) ··· 1026 return -EFAULT; 1027 } 1028 1029 + static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, 1030 + struct io_kiocb *req) 1031 { 1032 + const struct io_op_def *def = &io_op_defs[req->opcode]; 1033 + int ret; 1034 + 1035 + if (def->work_flags & IO_WQ_WORK_MM) { 1036 + ret = __io_sq_thread_acquire_mm(ctx); 1037 + if (unlikely(ret)) 1038 + return ret; 1039 + } 1040 + 1041 + if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) { 1042 + ret = __io_sq_thread_acquire_files(ctx); 1043 + if (unlikely(ret)) 1044 + return ret; 1045 + } 1046 + 1047 + return 0; 1048 } 1049 1050 static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, ··· 1174 INIT_LIST_HEAD(&ctx->iopoll_list); 1175 INIT_LIST_HEAD(&ctx->defer_list); 1176 INIT_LIST_HEAD(&ctx->timeout_list); 1177 spin_lock_init(&ctx->inflight_lock); 1178 INIT_LIST_HEAD(&ctx->inflight_list); 1179 INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work); ··· 1416 { 1417 struct io_kiocb *cur; 1418 1419 + io_for_each_link(cur, req) 1420 + io_prep_async_work(cur); 1421 } 1422 1423 static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) ··· 1460 } 1461 } 1462 1463 /* 1464 * Returns true if we found and killed one or more timeouts 1465 */ 1466 + static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, 1467 + struct files_struct *files) 1468 { 1469 
struct io_kiocb *req, *tmp; 1470 int canceled = 0; 1471 1472 spin_lock_irq(&ctx->completion_lock); 1473 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { 1474 + if (io_match_task(req, tsk, files)) { 1475 io_kill_timeout(req); 1476 canceled++; 1477 } ··· 1594 } 1595 } 1596 1597 /* Returns true if there are no backlogged entries after the flush */ 1598 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, 1599 struct task_struct *tsk, ··· 1647 1648 cqe = NULL; 1649 list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { 1650 + if (!io_match_task(req, tsk, files)) 1651 continue; 1652 1653 cqe = io_get_cqring(ctx); ··· 1845 static inline void io_put_file(struct io_kiocb *req, struct file *file, 1846 bool fixed) 1847 { 1848 + if (!fixed) 1849 fput(file); 1850 } 1851 ··· 1859 kfree(req->async_data); 1860 if (req->file) 1861 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); 1862 + if (req->fixed_file_refs) 1863 + percpu_ref_put(req->fixed_file_refs); 1864 io_req_clean_work(req); 1865 } 1866 ··· 1882 percpu_ref_put(&ctx->refs); 1883 } 1884 1885 + static inline void io_remove_next_linked(struct io_kiocb *req) 1886 + { 1887 + struct io_kiocb *nxt = req->link; 1888 + 1889 + req->link = nxt->link; 1890 + nxt->link = NULL; 1891 + } 1892 + 1893 static void io_kill_linked_timeout(struct io_kiocb *req) 1894 { 1895 struct io_ring_ctx *ctx = req->ctx; ··· 1890 unsigned long flags; 1891 1892 spin_lock_irqsave(&ctx->completion_lock, flags); 1893 + link = req->link; 1894 + 1895 /* 1896 * Can happen if a linked timeout fired and link had been like 1897 * req -> link t-out -> link t-out [-> ...] ··· 1900 struct io_timeout_data *io = link->async_data; 1901 int ret; 1902 1903 + io_remove_next_linked(req); 1904 + link->timeout.head = NULL; 1905 ret = hrtimer_try_to_cancel(&io->timer); 1906 if (ret != -1) { 1907 io_cqring_fill_event(link, -ECANCELED); ··· 1917 } 1918 } 1919 1920 1921 static void io_fail_links(struct io_kiocb *req) 1922 { 1923 + struct io_kiocb *link, *nxt; 1924 struct io_ring_ctx *ctx = req->ctx; 1925 unsigned long flags; 1926 1927 spin_lock_irqsave(&ctx->completion_lock, flags); 1928 + link = req->link; 1929 + req->link = NULL; 1930 1931 + while (link) { 1932 + nxt = link->link; 1933 + link->link = NULL; 1934 + 1935 trace_io_uring_fail_link(req, link); 1936 io_cqring_fill_event(link, -ECANCELED); 1937 1938 /* ··· 1963 io_put_req_deferred(link, 2); 1964 else 1965 io_double_put_req(link); 1966 + link = nxt; 1967 } 1968 io_commit_cqring(ctx); 1969 spin_unlock_irqrestore(&ctx->completion_lock, flags); 1970 ··· 1973 1974 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) 1975 { 1976 if (req->flags & REQ_F_LINK_TIMEOUT) 1977 io_kill_linked_timeout(req); 1978 ··· 1983 * dependencies to the next request. In case of failure, fail the rest 1984 * of the chain. 
1985 */ 1986 + if (likely(!(req->flags & REQ_F_FAIL_LINK))) { 1987 + struct io_kiocb *nxt = req->link; 1988 + 1989 + req->link = NULL; 1990 + return nxt; 1991 + } 1992 io_fail_links(req); 1993 return NULL; 1994 } 1995 1996 + static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) 1997 { 1998 + if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT))) 1999 return NULL; 2000 return __io_req_find_next(req); 2001 } ··· 2050 { 2051 struct io_ring_ctx *ctx = req->ctx; 2052 2053 + if (!__io_sq_thread_acquire_mm(ctx) && 2054 + !__io_sq_thread_acquire_files(ctx)) { 2055 mutex_lock(&ctx->uring_lock); 2056 __io_queue_sqe(req, NULL); 2057 mutex_unlock(&ctx->uring_lock); ··· 2086 } 2087 } 2088 2089 + static inline void io_queue_next(struct io_kiocb *req) 2090 { 2091 struct io_kiocb *nxt = io_req_find_next(req); 2092 ··· 2143 io_free_req(req); 2144 return; 2145 } 2146 + io_queue_next(req); 2147 2148 if (req->task != rb->task) { 2149 if (rb->task) { ··· 2246 * we wake up the task, and the next invocation will flush the 2247 * entries. We cannot safely to it from here. 2248 */ 2249 + if (noflush) 2250 return -1U; 2251 2252 io_cqring_overflow_flush(ctx, false, NULL, NULL); ··· 2593 if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) 2594 return false; 2595 2596 + ret = io_sq_thread_acquire_mm_files(req->ctx, req); 2597 2598 if (io_resubmit_prep(req, ret)) { 2599 refcount_inc(&req->refs); ··· 2641 * find it from a io_iopoll_getevents() thread before the issuer is done 2642 * accessing the kiocb cookie. 2643 */ 2644 + static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) 2645 { 2646 struct io_ring_ctx *ctx = req->ctx; 2647 ··· 2670 else 2671 list_add_tail(&req->inflight_entry, &ctx->iopoll_list); 2672 2673 + /* 2674 + * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread 2675 + * task context or in io worker task context. If current task context is 2676 + * sq thread, we don't need to check whether should wake up sq thread. 
2677 + */ 2678 + if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) && 2679 wq_has_sleeper(&ctx->sq_data->wait)) 2680 wake_up(&ctx->sq_data->wait); 2681 } 2682 2683 + static inline void __io_state_file_put(struct io_submit_state *state) 2684 { 2685 + fput_many(state->file, state->file_refs); 2686 + state->file_refs = 0; 2687 } 2688 2689 static inline void io_state_file_put(struct io_submit_state *state) 2690 { 2691 + if (state->file_refs) 2692 __io_state_file_put(state); 2693 } 2694 ··· 2698 if (!state) 2699 return fget(fd); 2700 2701 + if (state->file_refs) { 2702 if (state->fd == fd) { 2703 + state->file_refs--; 2704 return state->file; 2705 } 2706 __io_state_file_put(state); 2707 } 2708 state->file = fget_many(fd, state->ios_left); 2709 + if (unlikely(!state->file)) 2710 return NULL; 2711 2712 state->fd = fd; 2713 + state->file_refs = state->ios_left - 1; 2714 return state->file; 2715 } 2716 ··· 3065 return __io_iov_buffer_select(req, iov, needs_lock); 3066 } 3067 3068 + static ssize_t io_import_iovec(int rw, struct io_kiocb *req, 3069 struct iovec **iovec, struct iov_iter *iter, 3070 bool needs_lock) 3071 { ··· 3094 3095 ret = import_single_range(rw, buf, sqe_len, *iovec, iter); 3096 *iovec = NULL; 3097 + return ret; 3098 } 3099 3100 if (req->flags & REQ_F_BUFFER_SELECT) { ··· 3109 3110 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, 3111 req->ctx->compat); 3112 } 3113 3114 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) ··· 3246 struct iovec *iov = iorw->fast_iov; 3247 ssize_t ret; 3248 3249 + ret = io_import_iovec(rw, req, &iov, &iorw->iter, false); 3250 if (unlikely(ret < 0)) 3251 return ret; 3252 ··· 3379 struct iov_iter __iter, *iter = &__iter; 3380 struct io_async_rw *rw = req->async_data; 3381 ssize_t io_size, ret, ret2; 3382 bool no_async; 3383 3384 + if (rw) { 3385 iter = &rw->iter; 3386 + iovec = NULL; 3387 + } else { 3388 + ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); 3389 + if (ret < 0) 3390 + return ret; 3391 + } 3392 + io_size = iov_iter_count(iter); 3393 req->result = io_size; 3394 ret = 0; 3395 ··· 3405 if (no_async) 3406 goto copy_iov; 3407 3408 + ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size); 3409 if (unlikely(ret)) 3410 goto out_free; 3411 ··· 3424 if (req->file->f_flags & O_NONBLOCK) 3425 goto done; 3426 /* some cases will consume bytes even on error returns */ 3427 + iov_iter_revert(iter, io_size - iov_iter_count(iter)); 3428 ret = 0; 3429 goto copy_iov; 3430 } else if (ret < 0) { ··· 3507 struct kiocb *kiocb = &req->rw.kiocb; 3508 struct iov_iter __iter, *iter = &__iter; 3509 struct io_async_rw *rw = req->async_data; 3510 ssize_t ret, ret2, io_size; 3511 3512 + if (rw) { 3513 iter = &rw->iter; 3514 + iovec = NULL; 3515 + } else { 3516 + ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); 3517 + if (ret < 0) 3518 + return ret; 3519 + } 3520 + io_size = iov_iter_count(iter); 3521 req->result = io_size; 3522 3523 /* Ensure we clear previously set non-block flag */ ··· 3535 (req->flags & REQ_F_ISREG)) 3536 goto copy_iov; 3537 3538 + ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size); 3539 if (unlikely(ret)) 3540 goto out_free; 3541 ··· 3578 } else { 3579 copy_iov: 3580 /* some cases will consume bytes even on error returns */ 3581 + iov_iter_revert(iter, io_size - iov_iter_count(iter)); 3582 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); 3583 if (!ret) 3584 return -EAGAIN; ··· 3588 if (iovec) 3589 kfree(iovec); 3590 return ret; 3591 + } 3592 + 3593 + 
static int io_renameat_prep(struct io_kiocb *req, 3594 + const struct io_uring_sqe *sqe) 3595 + { 3596 + struct io_rename *ren = &req->rename; 3597 + const char __user *oldf, *newf; 3598 + 3599 + if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3600 + return -EBADF; 3601 + 3602 + ren->old_dfd = READ_ONCE(sqe->fd); 3603 + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3604 + newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3605 + ren->new_dfd = READ_ONCE(sqe->len); 3606 + ren->flags = READ_ONCE(sqe->rename_flags); 3607 + 3608 + ren->oldpath = getname(oldf); 3609 + if (IS_ERR(ren->oldpath)) 3610 + return PTR_ERR(ren->oldpath); 3611 + 3612 + ren->newpath = getname(newf); 3613 + if (IS_ERR(ren->newpath)) { 3614 + putname(ren->oldpath); 3615 + return PTR_ERR(ren->newpath); 3616 + } 3617 + 3618 + req->flags |= REQ_F_NEED_CLEANUP; 3619 + return 0; 3620 + } 3621 + 3622 + static int io_renameat(struct io_kiocb *req, bool force_nonblock) 3623 + { 3624 + struct io_rename *ren = &req->rename; 3625 + int ret; 3626 + 3627 + if (force_nonblock) 3628 + return -EAGAIN; 3629 + 3630 + ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, 3631 + ren->newpath, ren->flags); 3632 + 3633 + req->flags &= ~REQ_F_NEED_CLEANUP; 3634 + if (ret < 0) 3635 + req_set_fail_links(req); 3636 + io_req_complete(req, ret); 3637 + return 0; 3638 + } 3639 + 3640 + static int io_unlinkat_prep(struct io_kiocb *req, 3641 + const struct io_uring_sqe *sqe) 3642 + { 3643 + struct io_unlink *un = &req->unlink; 3644 + const char __user *fname; 3645 + 3646 + if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3647 + return -EBADF; 3648 + 3649 + un->dfd = READ_ONCE(sqe->fd); 3650 + 3651 + un->flags = READ_ONCE(sqe->unlink_flags); 3652 + if (un->flags & ~AT_REMOVEDIR) 3653 + return -EINVAL; 3654 + 3655 + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3656 + un->filename = getname(fname); 3657 + if (IS_ERR(un->filename)) 3658 + return PTR_ERR(un->filename); 3659 + 3660 + req->flags |= REQ_F_NEED_CLEANUP; 3661 + return 0; 3662 + } 3663 + 3664 + static int io_unlinkat(struct io_kiocb *req, bool force_nonblock) 3665 + { 3666 + struct io_unlink *un = &req->unlink; 3667 + int ret; 3668 + 3669 + if (force_nonblock) 3670 + return -EAGAIN; 3671 + 3672 + if (un->flags & AT_REMOVEDIR) 3673 + ret = do_rmdir(un->dfd, un->filename); 3674 + else 3675 + ret = do_unlinkat(un->dfd, un->filename); 3676 + 3677 + req->flags &= ~REQ_F_NEED_CLEANUP; 3678 + if (ret < 0) 3679 + req_set_fail_links(req); 3680 + io_req_complete(req, ret); 3681 + return 0; 3682 + } 3683 + 3684 + static int io_shutdown_prep(struct io_kiocb *req, 3685 + const struct io_uring_sqe *sqe) 3686 + { 3687 + #if defined(CONFIG_NET) 3688 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3689 + return -EINVAL; 3690 + if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || 3691 + sqe->buf_index) 3692 + return -EINVAL; 3693 + 3694 + req->shutdown.how = READ_ONCE(sqe->len); 3695 + return 0; 3696 + #else 3697 + return -EOPNOTSUPP; 3698 + #endif 3699 + } 3700 + 3701 + static int io_shutdown(struct io_kiocb *req, bool force_nonblock) 3702 + { 3703 + #if defined(CONFIG_NET) 3704 + struct socket *sock; 3705 + int ret; 3706 + 3707 + if (force_nonblock) 3708 + return -EAGAIN; 3709 + 3710 + sock = sock_from_file(req->file); 3711 + if (unlikely(!sock)) 3712 + return -ENOTSOCK; 3713 + 3714 + ret = __sys_shutdown_sock(sock, req->shutdown.how); 3715 + io_req_complete(req, ret); 3716 + return 0; 3717 + #else 3718 + return -EOPNOTSUPP; 3719 + #endif 3720 } 3721 3722 static int __io_splice_prep(struct io_kiocb 
*req, ··· 3804 { 3805 u64 flags, mode; 3806 3807 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3808 return -EINVAL; 3809 mode = READ_ONCE(sqe->len); 3810 flags = READ_ONCE(sqe->open_flags); ··· 3818 size_t len; 3819 int ret; 3820 3821 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3822 return -EINVAL; 3823 how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3824 len = READ_ONCE(sqe->len); ··· 3948 head = idr_find(&ctx->io_buffer_idr, p->bgid); 3949 if (head) 3950 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); 3951 if (ret < 0) 3952 req_set_fail_links(req); 3953 + 3954 + /* need to hold the lock to complete IOPOLL requests */ 3955 + if (ctx->flags & IORING_SETUP_IOPOLL) { 3956 + __io_req_complete(req, ret, 0, cs); 3957 + io_ring_submit_unlock(ctx, !force_nonblock); 3958 + } else { 3959 + io_ring_submit_unlock(ctx, !force_nonblock); 3960 + __io_req_complete(req, ret, 0, cs); 3961 + } 3962 return 0; 3963 } 3964 ··· 4037 } 4038 } 4039 out: 4040 if (ret < 0) 4041 req_set_fail_links(req); 4042 + 4043 + /* need to hold the lock to complete IOPOLL requests */ 4044 + if (ctx->flags & IORING_SETUP_IOPOLL) { 4045 + __io_req_complete(req, ret, 0, cs); 4046 + io_ring_submit_unlock(ctx, !force_nonblock); 4047 + } else { 4048 + io_ring_submit_unlock(ctx, !force_nonblock); 4049 + __io_req_complete(req, ret, 0, cs); 4050 + } 4051 return 0; 4052 } 4053 ··· 4212 io_req_init_async(req); 4213 req->work.flags |= IO_WQ_WORK_NO_CANCEL; 4214 4215 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4216 return -EINVAL; 4217 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || 4218 sqe->rw_flags || sqe->buf_index) ··· 4694 { 4695 struct io_accept *accept = &req->accept; 4696 4697 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4698 return -EINVAL; 4699 if (sqe->ioprio || sqe->len || sqe->buf_index) 4700 return -EINVAL; ··· 4735 struct io_connect *conn = &req->connect; 4736 struct io_async_connect *io = req->async_data; 4737 4738 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4739 return -EINVAL; 4740 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) 4741 return -EINVAL; ··· 5270 /* 5271 * Returns true if we found and killed one or more poll requests 5272 */ 5273 + static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, 5274 + struct files_struct *files) 5275 { 5276 struct hlist_node *tmp; 5277 struct io_kiocb *req; ··· 5282 5283 list = &ctx->cancel_hash[i]; 5284 hlist_for_each_entry_safe(req, tmp, list, hash_node) { 5285 + if (io_match_task(req, tsk, files)) 5286 posted += io_poll_remove_one(req); 5287 } 5288 } ··· 5320 sqe->poll_events) 5321 return -EINVAL; 5322 5323 + req->poll_remove.addr = READ_ONCE(sqe->addr); 5324 return 0; 5325 } 5326 ··· 5331 static int io_poll_remove(struct io_kiocb *req) 5332 { 5333 struct io_ring_ctx *ctx = req->ctx; 5334 int ret; 5335 5336 spin_lock_irq(&ctx->completion_lock); 5337 + ret = io_poll_cancel(ctx, req->poll_remove.addr); 5338 spin_unlock_irq(&ctx->completion_lock); 5339 5340 if (ret < 0) ··· 5429 return HRTIMER_NORESTART; 5430 } 5431 5432 + static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, 5433 + __u64 user_data) 5434 { 5435 + struct io_timeout_data *io; 5436 struct io_kiocb *req; 5437 int ret = -ENOENT; 5438 ··· 5458 } 5459 5460 if (ret == -ENOENT) 5461 + return ERR_PTR(ret); 5462 5463 + io = req->async_data; 5464 + ret = hrtimer_try_to_cancel(&io->timer); 5465 + if (ret == -1) 5466 + return ERR_PTR(-EALREADY); 5467 + list_del_init(&req->timeout.list); 5468 + return req; 5469 + 
} 5470 + 5471 + static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 5472 + { 5473 + struct io_kiocb *req = io_timeout_extract(ctx, user_data); 5474 + 5475 + if (IS_ERR(req)) 5476 + return PTR_ERR(req); 5477 + 5478 + req_set_fail_links(req); 5479 + io_cqring_fill_event(req, -ECANCELED); 5480 + io_put_req_deferred(req, 1); 5481 + return 0; 5482 + } 5483 + 5484 + static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 5485 + struct timespec64 *ts, enum hrtimer_mode mode) 5486 + { 5487 + struct io_kiocb *req = io_timeout_extract(ctx, user_data); 5488 + struct io_timeout_data *data; 5489 + 5490 + if (IS_ERR(req)) 5491 + return PTR_ERR(req); 5492 + 5493 + req->timeout.off = 0; /* noseq */ 5494 + data = req->async_data; 5495 + list_add_tail(&req->timeout.list, &ctx->timeout_list); 5496 + hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode); 5497 + data->timer.function = io_timeout_fn; 5498 + hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); 5499 + return 0; 5500 } 5501 5502 static int io_timeout_remove_prep(struct io_kiocb *req, 5503 const struct io_uring_sqe *sqe) 5504 { 5505 + struct io_timeout_rem *tr = &req->timeout_rem; 5506 + 5507 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5508 return -EINVAL; 5509 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 5510 return -EINVAL; 5511 + if (sqe->ioprio || sqe->buf_index || sqe->len) 5512 return -EINVAL; 5513 5514 + tr->addr = READ_ONCE(sqe->addr); 5515 + tr->flags = READ_ONCE(sqe->timeout_flags); 5516 + if (tr->flags & IORING_TIMEOUT_UPDATE) { 5517 + if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS)) 5518 + return -EINVAL; 5519 + if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) 5520 + return -EFAULT; 5521 + } else if (tr->flags) { 5522 + /* timeout removal doesn't support flags */ 5523 + return -EINVAL; 5524 + } 5525 + 5526 return 0; 5527 } 5528 ··· 5482 */ 5483 static int io_timeout_remove(struct io_kiocb *req) 5484 { 5485 + struct io_timeout_rem *tr = &req->timeout_rem; 5486 struct io_ring_ctx *ctx = req->ctx; 5487 int ret; 5488 5489 spin_lock_irq(&ctx->completion_lock); 5490 + if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) { 5491 + enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS) 5492 + ? 
HRTIMER_MODE_ABS : HRTIMER_MODE_REL; 5493 + 5494 + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); 5495 + } else { 5496 + ret = io_timeout_cancel(ctx, tr->addr); 5497 + } 5498 5499 io_cqring_fill_event(req, ret); 5500 io_commit_cqring(ctx); ··· 5766 return io_remove_buffers_prep(req, sqe); 5767 case IORING_OP_TEE: 5768 return io_tee_prep(req, sqe); 5769 + case IORING_OP_SHUTDOWN: 5770 + return io_shutdown_prep(req, sqe); 5771 + case IORING_OP_RENAMEAT: 5772 + return io_renameat_prep(req, sqe); 5773 + case IORING_OP_UNLINKAT: 5774 + return io_unlinkat_prep(req, sqe); 5775 } 5776 5777 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", ··· 5787 { 5788 struct io_kiocb *pos; 5789 struct io_ring_ctx *ctx = req->ctx; 5790 + u32 total_submitted, nr_reqs = 0; 5791 5792 + io_for_each_link(pos, req) 5793 + nr_reqs++; 5794 5795 total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; 5796 return total_submitted - nr_reqs; ··· 5843 static void io_req_drop_files(struct io_kiocb *req) 5844 { 5845 struct io_ring_ctx *ctx = req->ctx; 5846 + struct io_uring_task *tctx = req->task->io_uring; 5847 unsigned long flags; 5848 5849 spin_lock_irqsave(&ctx->inflight_lock, flags); 5850 list_del(&req->inflight_entry); 5851 + if (atomic_read(&tctx->in_idle)) 5852 + wake_up(&tctx->wait); 5853 spin_unlock_irqrestore(&ctx->inflight_lock, flags); 5854 req->flags &= ~REQ_F_INFLIGHT; 5855 put_files_struct(req->work.identity->files); ··· 5902 case IORING_OP_OPENAT2: 5903 if (req->open.filename) 5904 putname(req->open.filename); 5905 + break; 5906 + case IORING_OP_RENAMEAT: 5907 + putname(req->rename.oldpath); 5908 + putname(req->rename.newpath); 5909 + break; 5910 + case IORING_OP_UNLINKAT: 5911 + putname(req->unlink.filename); 5912 break; 5913 } 5914 req->flags &= ~REQ_F_NEED_CLEANUP; ··· 6009 case IORING_OP_TEE: 6010 ret = io_tee(req, force_nonblock); 6011 break; 6012 + case IORING_OP_SHUTDOWN: 6013 + ret = io_shutdown(req, force_nonblock); 6014 + break; 6015 + case IORING_OP_RENAMEAT: 6016 + ret = io_renameat(req, force_nonblock); 6017 + break; 6018 + case IORING_OP_UNLINKAT: 6019 + ret = io_unlinkat(req, force_nonblock); 6020 + break; 6021 default: 6022 ret = -EINVAL; 6023 break; ··· 6025 if (in_async) 6026 mutex_lock(&ctx->uring_lock); 6027 6028 + io_iopoll_req_issued(req, in_async); 6029 6030 if (in_async) 6031 mutex_unlock(&ctx->uring_lock); ··· 6065 } 6066 6067 if (ret) { 6068 + /* 6069 + * io_iopoll_complete() does not hold completion_lock to complete 6070 + * polled io, so here for polled io, just mark it done and still let 6071 + * io_iopoll_complete() complete it. 
6072 + */ 6073 + if (req->ctx->flags & IORING_SETUP_IOPOLL) { 6074 + struct kiocb *kiocb = &req->rw.kiocb; 6075 + 6076 + kiocb_done(kiocb, ret, NULL); 6077 + } else { 6078 + req_set_fail_links(req); 6079 + io_req_complete(req, ret); 6080 + } 6081 } 6082 6083 return io_steal_work(req); ··· 6092 return NULL; 6093 fd = array_index_nospec(fd, ctx->nr_user_files); 6094 file = io_file_from_index(ctx, fd); 6095 + io_set_resource_node(req); 6096 } else { 6097 trace_io_uring_file_get(ctx, fd); 6098 file = __io_file_get(state, fd); ··· 6104 return file; 6105 } 6106 6107 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) 6108 { 6109 struct io_timeout_data *data = container_of(timer, 6110 struct io_timeout_data, timer); 6111 + struct io_kiocb *prev, *req = data->req; 6112 struct io_ring_ctx *ctx = req->ctx; 6113 unsigned long flags; 6114 6115 spin_lock_irqsave(&ctx->completion_lock, flags); 6116 + prev = req->timeout.head; 6117 + req->timeout.head = NULL; 6118 6119 /* 6120 * We don't expect the list to be empty, that will only happen if we 6121 * race with the completion of the linked work. 6122 */ 6123 + if (prev && refcount_inc_not_zero(&prev->refs)) 6124 + io_remove_next_linked(prev); 6125 + else 6126 + prev = NULL; 6127 spin_unlock_irqrestore(&ctx->completion_lock, flags); 6128 6129 if (prev) { ··· 6158 static void __io_queue_linked_timeout(struct io_kiocb *req) 6159 { 6160 /* 6161 + * If the back reference is NULL, then our linked request finished 6162 + * before we got a chance to setup the timer 6163 */ 6164 + if (req->timeout.head) { 6165 struct io_timeout_data *data = req->async_data; 6166 6167 data->timer.function = io_link_timeout_fn; ··· 6184 6185 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) 6186 { 6187 + struct io_kiocb *nxt = req->link; 6188 6189 + if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) || 6190 + nxt->opcode != IORING_OP_LINK_TIMEOUT) 6191 return NULL; 6192 6193 + nxt->timeout.head = req; 6194 nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; 6195 req->flags |= REQ_F_LINK_TIMEOUT; 6196 return nxt; ··· 6301 io_queue_sqe(req, NULL, cs); 6302 } 6303 6304 + struct io_submit_link { 6305 + struct io_kiocb *head; 6306 + struct io_kiocb *last; 6307 + }; 6308 + 6309 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, 6310 + struct io_submit_link *link, struct io_comp_state *cs) 6311 { 6312 struct io_ring_ctx *ctx = req->ctx; 6313 int ret; ··· 6314 * submitted sync once the chain is complete. If none of those 6315 * conditions are true (normal request), then just queue it. 
6316 */ 6317 + if (link->head) { 6318 + struct io_kiocb *head = link->head; 6319 6320 /* 6321 * Taking sequential execution of a link, draining both sides ··· 6335 return ret; 6336 } 6337 trace_io_uring_link(ctx, req, head); 6338 + link->last->link = req; 6339 + link->last = req; 6340 6341 /* last request of a link, enqueue the link */ 6342 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { 6343 io_queue_link_head(head, cs); 6344 + link->head = NULL; 6345 } 6346 } else { 6347 if (unlikely(ctx->drain_next)) { ··· 6348 ctx->drain_next = 0; 6349 } 6350 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 6351 ret = io_req_defer_prep(req, sqe); 6352 if (unlikely(ret)) 6353 req->flags |= REQ_F_FAIL_LINK; 6354 + link->head = req; 6355 + link->last = req; 6356 } else { 6357 io_queue_sqe(req, sqe, cs); 6358 } ··· 6370 { 6371 if (!list_empty(&state->comp.list)) 6372 io_submit_flush_completions(&state->comp); 6373 + if (state->plug_started) 6374 + blk_finish_plug(&state->plug); 6375 io_state_file_put(state); 6376 if (state->free_reqs) 6377 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); ··· 6382 static void io_submit_state_start(struct io_submit_state *state, 6383 struct io_ring_ctx *ctx, unsigned int max_ios) 6384 { 6385 + state->plug_started = false; 6386 state->comp.nr = 0; 6387 INIT_LIST_HEAD(&state->comp.list); 6388 state->comp.ctx = ctx; 6389 state->free_reqs = 0; 6390 + state->file_refs = 0; 6391 state->ios_left = max_ios; 6392 } 6393 ··· 6482 req->file = NULL; 6483 req->ctx = ctx; 6484 req->flags = 0; 6485 + req->link = NULL; 6486 + req->fixed_file_refs = NULL; 6487 /* one is dropped after submission, the other at completion */ 6488 refcount_set(&req->refs, 2); 6489 req->task = current; ··· 6490 if (unlikely(req->opcode >= IORING_OP_LAST)) 6491 return -EINVAL; 6492 6493 + if (unlikely(io_sq_thread_acquire_mm_files(ctx, req))) 6494 return -EFAULT; 6495 6496 sqe_flags = READ_ONCE(sqe->flags); ··· 6523 /* same numerical values with corresponding REQ_F_*, safe to copy */ 6524 req->flags |= sqe_flags; 6525 6526 + /* 6527 + * Plug now if we have more than 1 IO left after this, and the target 6528 + * is potentially a read/write to block based storage. 
6529 + */ 6530 + if (!state->plug_started && state->ios_left > 1 && 6531 + io_op_defs[req->opcode].plug) { 6532 + blk_start_plug(&state->plug); 6533 + state->plug_started = true; 6534 + } 6535 6536 + ret = 0; 6537 + if (io_op_defs[req->opcode].needs_file) { 6538 + bool fixed = req->flags & REQ_F_FIXED_FILE; 6539 + 6540 + req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); 6541 + if (unlikely(!req->file && 6542 + !io_op_defs[req->opcode].needs_file_no_error)) 6543 + ret = -EBADF; 6544 + } 6545 + 6546 state->ios_left--; 6547 return ret; 6548 } ··· 6534 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) 6535 { 6536 struct io_submit_state state; 6537 + struct io_submit_link link; 6538 int i, submitted = 0; 6539 6540 /* if we have a backlog and couldn't flush it all, return BUSY */ ··· 6554 refcount_add(nr, &current->usage); 6555 6556 io_submit_state_start(&state, ctx, nr); 6557 + link.head = NULL; 6558 6559 for (i = 0; i < nr; i++) { 6560 const struct io_uring_sqe *sqe; ··· 6599 percpu_counter_sub(&tctx->inflight, unused); 6600 put_task_struct_many(current, unused); 6601 } 6602 + if (link.head) 6603 + io_queue_link_head(link.head, &state.comp); 6604 io_submit_state_end(&state); 6605 6606 /* Commit SQ ring head once we've consumed and submitted all SQEs */ ··· 6624 spin_unlock_irq(&ctx->completion_lock); 6625 } 6626 6627 + static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) 6628 { 6629 unsigned int to_submit; 6630 int ret = 0; 6631 6632 to_submit = io_sqring_entries(ctx); 6633 /* if we're handling multiple rings, cap submit size for fairness */ 6634 if (cap_entries && to_submit > 8) 6635 to_submit = 8; 6636 6637 + if (!list_empty(&ctx->iopoll_list) || to_submit) { 6638 + unsigned nr_events = 0; 6639 + 6640 + mutex_lock(&ctx->uring_lock); 6641 + if (!list_empty(&ctx->iopoll_list)) 6642 + io_do_iopoll(ctx, &nr_events, 0); 6643 + 6644 + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs))) 6645 + ret = io_submit_sqes(ctx, to_submit); 6646 + mutex_unlock(&ctx->uring_lock); 6647 + } 6648 6649 if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) 6650 wake_up(&ctx->sqo_sq_wait); 6651 6652 + return ret; 6653 + } 6654 + 6655 + static void io_sqd_update_thread_idle(struct io_sq_data *sqd) 6656 + { 6657 + struct io_ring_ctx *ctx; 6658 + unsigned sq_thread_idle = 0; 6659 + 6660 + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 6661 + if (sq_thread_idle < ctx->sq_thread_idle) 6662 + sq_thread_idle = ctx->sq_thread_idle; 6663 + } 6664 + 6665 + sqd->sq_thread_idle = sq_thread_idle; 6666 } 6667 6668 static void io_sqd_init_new(struct io_sq_data *sqd) ··· 6737 6738 while (!list_empty(&sqd->ctx_new_list)) { 6739 ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list); 6740 list_move_tail(&ctx->sqd_list, &sqd->ctx_list); 6741 complete(&ctx->sq_thread_comp); 6742 } 6743 + 6744 + io_sqd_update_thread_idle(sqd); 6745 } 6746 6747 static int io_sq_thread(void *data) 6748 { 6749 struct cgroup_subsys_state *cur_css = NULL; 6750 + struct files_struct *old_files = current->files; 6751 + struct nsproxy *old_nsproxy = current->nsproxy; 6752 const struct cred *old_cred = NULL; 6753 struct io_sq_data *sqd = data; 6754 struct io_ring_ctx *ctx; 6755 + unsigned long timeout = 0; 6756 + DEFINE_WAIT(wait); 6757 6758 + task_lock(current); 6759 + current->files = NULL; 6760 + current->nsproxy = NULL; 6761 + task_unlock(current); 6762 + 6763 while (!kthread_should_stop()) { 6764 + int ret; 6765 + bool cap_entries, sqt_spin, needs_sched; 6766 6767 /* 
6768 * Any changes to the sqd lists are synchronized through the 6769 * kthread parking. This synchronizes the thread vs users, 6770 * the users are synchronized on the sqd->ctx_lock. 6771 */ 6772 + if (kthread_should_park()) { 6773 kthread_parkme(); 6774 + /* 6775 + * When sq thread is unparked, in case the previous park operation 6776 + * comes from io_put_sq_data(), which means that sq thread is going 6777 + * to be stopped, so here needs to have a check. 6778 + */ 6779 + if (kthread_should_stop()) 6780 + break; 6781 + } 6782 6783 + if (unlikely(!list_empty(&sqd->ctx_new_list))) { 6784 io_sqd_init_new(sqd); 6785 + timeout = jiffies + sqd->sq_thread_idle; 6786 + } 6787 6788 + sqt_spin = false; 6789 cap_entries = !list_is_singular(&sqd->ctx_list); 6790 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 6791 if (current->cred != ctx->creds) { 6792 if (old_cred) ··· 6782 current->sessionid = ctx->sessionid; 6783 #endif 6784 6785 + ret = __io_sq_thread(ctx, cap_entries); 6786 + if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) 6787 + sqt_spin = true; 6788 6789 + io_sq_thread_drop_mm_files(); 6790 } 6791 6792 + if (sqt_spin || !time_after(jiffies, timeout)) { 6793 io_run_task_work(); 6794 cond_resched(); 6795 + if (sqt_spin) 6796 + timeout = jiffies + sqd->sq_thread_idle; 6797 + continue; 6798 + } 6799 + 6800 + if (kthread_should_park()) 6801 + continue; 6802 + 6803 + needs_sched = true; 6804 + prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); 6805 + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 6806 + if ((ctx->flags & IORING_SETUP_IOPOLL) && 6807 + !list_empty_careful(&ctx->iopoll_list)) { 6808 + needs_sched = false; 6809 + break; 6810 + } 6811 + if (io_sqring_entries(ctx)) { 6812 + needs_sched = false; 6813 + break; 6814 + } 6815 + } 6816 + 6817 + if (needs_sched) { 6818 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 6819 io_ring_set_wakeup_flag(ctx); 6820 + 6821 schedule(); 6822 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 6823 io_ring_clear_wakeup_flag(ctx); 6824 } 6825 + 6826 + finish_wait(&sqd->wait, &wait); 6827 + timeout = jiffies + sqd->sq_thread_idle; 6828 } 6829 6830 io_run_task_work(); ··· 6808 io_sq_thread_unassociate_blkcg(); 6809 if (old_cred) 6810 revert_creds(old_cred); 6811 + 6812 + task_lock(current); 6813 + current->files = old_files; 6814 + current->nsproxy = old_nsproxy; 6815 + task_unlock(current); 6816 6817 kthread_parkme(); 6818 ··· 6863 * application must reap them itself, as they reside on the shared cq ring. 
6864 */ 6865 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, 6866 + const sigset_t __user *sig, size_t sigsz, 6867 + struct __kernel_timespec __user *uts) 6868 { 6869 struct io_wait_queue iowq = { 6870 .wq = { ··· 6875 .to_wait = min_events, 6876 }; 6877 struct io_rings *rings = ctx->rings; 6878 + struct timespec64 ts; 6879 + signed long timeout = 0; 6880 int ret = 0; 6881 6882 do { ··· 6897 return ret; 6898 } 6899 6900 + if (uts) { 6901 + if (get_timespec64(&ts, uts)) 6902 + return -EFAULT; 6903 + timeout = timespec64_to_jiffies(&ts); 6904 + } 6905 + 6906 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 6907 trace_io_uring_cqring_wait(ctx, min_events); 6908 do { ··· 6910 break; 6911 if (io_should_wake(&iowq, false)) 6912 break; 6913 + if (uts) { 6914 + timeout = schedule_timeout(timeout); 6915 + if (timeout == 0) { 6916 + ret = -ETIME; 6917 + break; 6918 + } 6919 + } else { 6920 + schedule(); 6921 + } 6922 } while (1); 6923 finish_wait(&ctx->wait, &iowq.wq); 6924 ··· 6959 if (!data) 6960 return -ENXIO; 6961 6962 + spin_lock_bh(&data->lock); 6963 ref_node = data->node; 6964 + spin_unlock_bh(&data->lock); 6965 if (ref_node) 6966 percpu_ref_kill(&ref_node->refs); 6967 ··· 7084 7085 mutex_lock(&sqd->ctx_lock); 7086 list_del(&ctx->sqd_list); 7087 + io_sqd_update_thread_idle(sqd); 7088 mutex_unlock(&sqd->ctx_lock); 7089 7090 + if (sqd->thread) 7091 io_sq_thread_unpark(sqd); 7092 7093 io_put_sq_data(sqd); 7094 ctx->sq_data = NULL; ··· 7344 data = ref_node->file_data; 7345 ctx = data->ctx; 7346 7347 + spin_lock_bh(&data->lock); 7348 ref_node->done = true; 7349 7350 while (!list_empty(&data->ref_list)) { ··· 7356 list_del(&ref_node->node); 7357 first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist); 7358 } 7359 + spin_unlock_bh(&data->lock); 7360 7361 if (percpu_ref_is_dying(&data->refs)) 7362 delay = 0; ··· 7479 } 7480 7481 file_data->node = ref_node; 7482 + spin_lock_bh(&file_data->lock); 7483 list_add_tail(&ref_node->node, &file_data->ref_list); 7484 + spin_unlock_bh(&file_data->lock); 7485 percpu_ref_get(&file_data->refs); 7486 return ret; 7487 out_fput: ··· 7638 7639 if (needs_switch) { 7640 percpu_ref_kill(&data->node->refs); 7641 + spin_lock_bh(&data->lock); 7642 list_add_tail(&ref_node->node, &data->ref_list); 7643 data->node = ref_node; 7644 + spin_unlock_bh(&data->lock); 7645 percpu_ref_get(&ctx->file_data->refs); 7646 } else 7647 destroy_fixed_file_ref_node(ref_node); ··· 7769 struct io_sq_data *sqd; 7770 7771 ret = -EPERM; 7772 + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) 7773 goto err; 7774 7775 sqd = io_get_sq_data(p); ··· 8355 * as nobody else will be looking for them. 
8356 */ 8357 do { 8358 io_iopoll_try_reap_events(ctx); 8359 } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); 8360 io_ring_ctx_free(ctx); ··· 8366 { 8367 mutex_lock(&ctx->uring_lock); 8368 percpu_ref_kill(&ctx->refs); 8369 + if (ctx->rings) 8370 + io_cqring_overflow_flush(ctx, true, NULL, NULL); 8371 mutex_unlock(&ctx->uring_lock); 8372 8373 + io_kill_timeouts(ctx, NULL, NULL); 8374 + io_poll_remove_all(ctx, NULL, NULL); 8375 8376 if (ctx->io_wq) 8377 io_wq_cancel_all(ctx->io_wq); 8378 8379 /* if we failed setting up the ctx, we might not have any rings */ 8380 io_iopoll_try_reap_events(ctx); 8381 idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); 8382 ··· 8407 return 0; 8408 } 8409 8410 + struct io_task_cancel { 8411 + struct task_struct *task; 8412 + struct files_struct *files; 8413 + }; 8414 8415 + static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 8416 { 8417 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 8418 + struct io_task_cancel *cancel = data; 8419 bool ret; 8420 8421 + if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) { 8422 unsigned long flags; 8423 struct io_ring_ctx *ctx = req->ctx; 8424 8425 /* protect against races with linked timeouts */ 8426 spin_lock_irqsave(&ctx->completion_lock, flags); 8427 + ret = io_match_task(req, cancel->task, cancel->files); 8428 spin_unlock_irqrestore(&ctx->completion_lock, flags); 8429 } else { 8430 + ret = io_match_task(req, cancel->task, cancel->files); 8431 } 8432 return ret; 8433 } 8434 8435 static void io_cancel_defer_files(struct io_ring_ctx *ctx, ··· 8530 8531 spin_lock_irq(&ctx->completion_lock); 8532 list_for_each_entry_reverse(de, &ctx->defer_list, list) { 8533 + if (io_match_task(de->req, task, files)) { 8534 list_cut_position(&list, &ctx->defer_list, &de->list); 8535 break; 8536 } ··· 8548 } 8549 } 8550 8551 + static void io_uring_cancel_files(struct io_ring_ctx *ctx, 8552 + struct task_struct *task, 8553 struct files_struct *files) 8554 { 8555 while (!list_empty_careful(&ctx->inflight_list)) { 8556 + struct io_task_cancel cancel = { .task = task, .files = files }; 8557 + struct io_kiocb *req; 8558 DEFINE_WAIT(wait); 8559 + bool found = false; 8560 8561 spin_lock_irq(&ctx->inflight_lock); 8562 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { 8563 + if (req->task != task || 8564 req->work.identity->files != files) 8565 continue; 8566 + found = true; 8567 break; 8568 } 8569 + if (found) 8570 + prepare_to_wait(&task->io_uring->wait, &wait, 8571 + TASK_UNINTERRUPTIBLE); 8572 spin_unlock_irq(&ctx->inflight_lock); 8573 8574 /* We need to keep going until we don't find a matching req */ 8575 + if (!found) 8576 break; 8577 + 8578 + io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); 8579 + io_poll_remove_all(ctx, task, files); 8580 + io_kill_timeouts(ctx, task, files); 8581 /* cancellations _may_ trigger task work */ 8582 io_run_task_work(); 8583 schedule(); 8584 + finish_wait(&task->io_uring->wait, &wait); 8585 } 8586 } 8587 8588 + static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, 8589 + struct task_struct *task) 8590 { 8591 + while (1) { 8592 + struct io_task_cancel cancel = { .task = task, .files = NULL, }; 8593 enum io_wq_cancel cret; 8594 + bool ret = false; 8595 8596 + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); 8597 if (cret != IO_WQ_CANCEL_NOTFOUND) 8598 ret = true; 8599 ··· 8625 } 8626 } 8627 8628 + ret |= io_poll_remove_all(ctx, task, NULL); 8629 + ret |= io_kill_timeouts(ctx, task, NULL); 
8630 + if (!ret) 8631 + break; 8632 + io_run_task_work(); 8633 + cond_resched(); 8634 } 8635 } 8636 8637 /* ··· 8648 io_sq_thread_park(ctx->sq_data); 8649 } 8650 8651 + io_cancel_defer_files(ctx, task, files); 8652 + io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); 8653 io_cqring_overflow_flush(ctx, true, task, files); 8654 + io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); 8655 8656 + if (!files) 8657 + __io_uring_cancel_task_requests(ctx, task); 8658 + else 8659 + io_uring_cancel_files(ctx, task, files); 8660 8661 if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { 8662 atomic_dec(&task->io_uring->in_idle); ··· 8916 finish_wait(&ctx->sqo_sq_wait, &wait); 8917 } 8918 8919 + static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, 8920 + struct __kernel_timespec __user **ts, 8921 + const sigset_t __user **sig) 8922 + { 8923 + struct io_uring_getevents_arg arg; 8924 + 8925 + /* 8926 + * If EXT_ARG isn't set, then we have no timespec and the argp pointer 8927 + * is just a pointer to the sigset_t. 8928 + */ 8929 + if (!(flags & IORING_ENTER_EXT_ARG)) { 8930 + *sig = (const sigset_t __user *) argp; 8931 + *ts = NULL; 8932 + return 0; 8933 + } 8934 + 8935 + /* 8936 + * EXT_ARG is set - ensure we agree on the size of it and copy in our 8937 + * timespec and sigset_t pointers if good. 8938 + */ 8939 + if (*argsz != sizeof(arg)) 8940 + return -EINVAL; 8941 + if (copy_from_user(&arg, argp, sizeof(arg))) 8942 + return -EFAULT; 8943 + *sig = u64_to_user_ptr(arg.sigmask); 8944 + *argsz = arg.sigmask_sz; 8945 + *ts = u64_to_user_ptr(arg.ts); 8946 + return 0; 8947 + } 8948 + 8949 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, 8950 + u32, min_complete, u32, flags, const void __user *, argp, 8951 + size_t, argsz) 8952 { 8953 struct io_ring_ctx *ctx; 8954 long ret = -EBADF; ··· 8928 io_run_task_work(); 8929 8930 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 8931 + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)) 8932 return -EINVAL; 8933 8934 f = fdget(fd); ··· 8955 */ 8956 ret = 0; 8957 if (ctx->flags & IORING_SETUP_SQPOLL) { 8958 + io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); 8959 if (!list_empty_careful(&ctx->cq_overflow_list)) 8960 io_cqring_overflow_flush(ctx, false, NULL, NULL); 8961 + io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); 8962 if (flags & IORING_ENTER_SQ_WAKEUP) 8963 wake_up(&ctx->sq_data->wait); 8964 if (flags & IORING_ENTER_SQ_WAIT) ··· 8974 goto out; 8975 } 8976 if (flags & IORING_ENTER_GETEVENTS) { 8977 + const sigset_t __user *sig; 8978 + struct __kernel_timespec __user *ts; 8979 + 8980 + ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 8981 + if (unlikely(ret)) 8982 + goto out; 8983 + 8984 min_complete = min(min_complete, ctx->cq_entries); 8985 8986 /* ··· 8986 !(ctx->flags & IORING_SETUP_SQPOLL)) { 8987 ret = io_iopoll_check(ctx, min_complete); 8988 } else { 8989 + ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); 8990 } 8991 } 8992 ··· 9354 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | 9355 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | 9356 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | 9357 + IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | 9358 + IORING_FEAT_EXT_ARG; 9359 9360 if (copy_to_user(params, p, sizeof(*p))) { 9361 ret = -EFAULT;
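A note on the timeout update added above: io_timeout_remove_prep() reads the target timeout's user_data from sqe->addr, the new expiration from sqe->addr2, and the mode from sqe->timeout_flags, with len/ioprio/buf_index required to be zero. Below is a minimal userspace sketch that fills the raw SQE fields by hand, since liburing may not yet ship a dedicated prep helper at this point; update_timeout() and the 0xdeadbeef user_data are illustrative, not part of this series.

/* Hedged sketch: rearm an already-queued IORING_OP_TIMEOUT in place.
 * 'target_user_data' is the user_data of the original timeout SQE.
 * Assumes an initialized ring and headers defining IORING_TIMEOUT_UPDATE. */
#include <errno.h>
#include <string.h>
#include <liburing.h>

static int update_timeout(struct io_uring *ring, __u64 target_user_data,
			  struct __kernel_timespec *new_ts)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EBUSY;
	memset(sqe, 0, sizeof(*sqe));           /* len/ioprio/buf_index must be 0 */
	sqe->opcode = IORING_OP_TIMEOUT_REMOVE;
	sqe->fd = -1;                           /* no file needed                 */
	sqe->addr = target_user_data;           /* which timeout to update        */
	sqe->addr2 = (unsigned long) new_ts;    /* new expiration                 */
	sqe->timeout_flags = IORING_TIMEOUT_UPDATE; /* or-in IORING_TIMEOUT_ABS   */
	sqe->user_data = 0xdeadbeef;            /* CQE for the update itself      */
	return io_uring_submit(ring);
}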
+22 -18
fs/namei.c
··· 4346 } 4347 EXPORT_SYMBOL(vfs_rename); 4348 4349 - static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, 4350 - const char __user *newname, unsigned int flags) 4351 { 4352 struct dentry *old_dentry, *new_dentry; 4353 struct dentry *trap; ··· 4355 struct qstr old_last, new_last; 4356 int old_type, new_type; 4357 struct inode *delegated_inode = NULL; 4358 - struct filename *from; 4359 - struct filename *to; 4360 unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; 4361 bool should_retry = false; 4362 - int error; 4363 4364 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 4365 - return -EINVAL; 4366 4367 if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && 4368 (flags & RENAME_EXCHANGE)) 4369 - return -EINVAL; 4370 4371 if (flags & RENAME_EXCHANGE) 4372 target_flags = 0; 4373 4374 retry: 4375 - from = filename_parentat(olddfd, getname(oldname), lookup_flags, 4376 - &old_path, &old_last, &old_type); 4377 if (IS_ERR(from)) { 4378 error = PTR_ERR(from); 4379 - goto exit; 4380 } 4381 4382 - to = filename_parentat(newdfd, getname(newname), lookup_flags, 4383 - &new_path, &new_last, &new_type); 4384 if (IS_ERR(to)) { 4385 error = PTR_ERR(to); 4386 goto exit1; ··· 4471 if (retry_estale(error, lookup_flags)) 4472 should_retry = true; 4473 path_put(&new_path); 4474 - putname(to); 4475 exit1: 4476 path_put(&old_path); 4477 - putname(from); 4478 if (should_retry) { 4479 should_retry = false; 4480 lookup_flags |= LOOKUP_REVAL; 4481 goto retry; 4482 } 4483 - exit: 4484 return error; 4485 } 4486 4487 SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, 4488 int, newdfd, const char __user *, newname, unsigned int, flags) 4489 { 4490 - return do_renameat2(olddfd, oldname, newdfd, newname, flags); 4491 } 4492 4493 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, 4494 int, newdfd, const char __user *, newname) 4495 { 4496 - return do_renameat2(olddfd, oldname, newdfd, newname, 0); 4497 } 4498 4499 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) 4500 { 4501 - return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); 4502 } 4503 4504 int readlink_copy(char __user *buffer, int buflen, const char *link)
··· 4346 } 4347 EXPORT_SYMBOL(vfs_rename); 4348 4349 + int do_renameat2(int olddfd, struct filename *from, int newdfd, 4350 + struct filename *to, unsigned int flags) 4351 { 4352 struct dentry *old_dentry, *new_dentry; 4353 struct dentry *trap; ··· 4355 struct qstr old_last, new_last; 4356 int old_type, new_type; 4357 struct inode *delegated_inode = NULL; 4358 unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; 4359 bool should_retry = false; 4360 + int error = -EINVAL; 4361 4362 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 4363 + goto put_both; 4364 4365 if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && 4366 (flags & RENAME_EXCHANGE)) 4367 + goto put_both; 4368 4369 if (flags & RENAME_EXCHANGE) 4370 target_flags = 0; 4371 4372 retry: 4373 + from = filename_parentat(olddfd, from, lookup_flags, &old_path, 4374 + &old_last, &old_type); 4375 if (IS_ERR(from)) { 4376 error = PTR_ERR(from); 4377 + goto put_new; 4378 } 4379 4380 + to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, 4381 + &new_type); 4382 if (IS_ERR(to)) { 4383 error = PTR_ERR(to); 4384 goto exit1; ··· 4473 if (retry_estale(error, lookup_flags)) 4474 should_retry = true; 4475 path_put(&new_path); 4476 exit1: 4477 path_put(&old_path); 4478 if (should_retry) { 4479 should_retry = false; 4480 lookup_flags |= LOOKUP_REVAL; 4481 goto retry; 4482 } 4483 + put_both: 4484 + if (!IS_ERR(from)) 4485 + putname(from); 4486 + put_new: 4487 + if (!IS_ERR(to)) 4488 + putname(to); 4489 return error; 4490 } 4491 4492 SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, 4493 int, newdfd, const char __user *, newname, unsigned int, flags) 4494 { 4495 + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 4496 + flags); 4497 } 4498 4499 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, 4500 int, newdfd, const char __user *, newname) 4501 { 4502 + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 4503 + 0); 4504 } 4505 4506 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) 4507 { 4508 + return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, 4509 + getname(newname), 0); 4510 } 4511 4512 int readlink_copy(char __user *buffer, int buflen, const char *link)
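The do_renameat2() rework above is the plumbing that lets io_renameat() hand over pre-resolved struct filename pointers. From userspace, the new rename/unlink opcodes are driven through ordinary SQE fields; below is a hedged sketch of the layout as read by io_renameat_prep() and io_unlinkat_prep() in the io_uring.c hunk. The prep_renameat()/prep_unlinkat() helper names are illustrative only, and the rename_flags/unlink_flags members assume SQE definitions at least as new as the uapi change later in this series.

/* Hedged sketch: fill raw SQEs for the new rename/unlink opcodes.
 * Not liburing API; callers still set user_data and submit the ring. */
#include <fcntl.h>
#include <string.h>
#include <liburing.h>

static void prep_renameat(struct io_uring_sqe *sqe, int olddfd, const char *oldpath,
			  int newdfd, const char *newpath, unsigned int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_RENAMEAT;
	sqe->fd = olddfd;                      /* old dirfd */
	sqe->addr = (unsigned long) oldpath;   /* old path  */
	sqe->len = newdfd;                     /* new dirfd */
	sqe->addr2 = (unsigned long) newpath;  /* new path  */
	sqe->rename_flags = flags;             /* RENAME_* as for renameat2(2) */
}

static void prep_unlinkat(struct io_uring_sqe *sqe, int dfd, const char *path,
			  unsigned int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_UNLINKAT;
	sqe->fd = dfd;                         /* dirfd             */
	sqe->addr = (unsigned long) path;      /* path to remove    */
	sqe->unlink_flags = flags;             /* 0 or AT_REMOVEDIR */
}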
+1
include/linux/socket.h
··· 436 int __user *usockaddr_len); 437 extern int __sys_socketpair(int family, int type, int protocol, 438 int __user *usockvec); 439 extern int __sys_shutdown(int fd, int how); 440 441 extern struct ns_common *get_net_ns(struct ns_common *ns);
··· 436 int __user *usockaddr_len); 437 extern int __sys_socketpair(int family, int type, int protocol, 438 int __user *usockvec); 439 + extern int __sys_shutdown_sock(struct socket *sock, int how); 440 extern int __sys_shutdown(int fd, int how); 441 442 extern struct ns_common *get_net_ns(struct ns_common *ns);
+1 -1
include/linux/syscalls.h
··· 317 struct io_uring_params __user *p); 318 asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, 319 u32 min_complete, u32 flags, 320 - const sigset_t __user *sig, size_t sigsz); 321 asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op, 322 void __user *arg, unsigned int nr_args); 323
··· 317 struct io_uring_params __user *p); 318 asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, 319 u32 min_complete, u32 flags, 320 + const void __user *argp, size_t argsz); 321 asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op, 322 void __user *arg, unsigned int nr_args); 323
+16
include/uapi/linux/io_uring.h
··· 42 __u32 statx_flags; 43 __u32 fadvise_advice; 44 __u32 splice_flags; 45 }; 46 __u64 user_data; /* data to be passed back at completion time */ 47 union { ··· 134 IORING_OP_PROVIDE_BUFFERS, 135 IORING_OP_REMOVE_BUFFERS, 136 IORING_OP_TEE, 137 138 /* this goes last, obviously */ 139 IORING_OP_LAST, ··· 151 * sqe->timeout_flags 152 */ 153 #define IORING_TIMEOUT_ABS (1U << 0) 154 155 /* 156 * sqe->splice_flags ··· 232 #define IORING_ENTER_GETEVENTS (1U << 0) 233 #define IORING_ENTER_SQ_WAKEUP (1U << 1) 234 #define IORING_ENTER_SQ_WAIT (1U << 2) 235 236 /* 237 * Passed in for io_uring_setup(2). Copied back with updated info on success ··· 260 #define IORING_FEAT_CUR_PERSONALITY (1U << 4) 261 #define IORING_FEAT_FAST_POLL (1U << 5) 262 #define IORING_FEAT_POLL_32BITS (1U << 6) 263 264 /* 265 * io_uring_register(2) opcodes and arguments ··· 336 IORING_RESTRICTION_SQE_FLAGS_REQUIRED = 3, 337 338 IORING_RESTRICTION_LAST 339 }; 340 341 #endif
··· 42 __u32 statx_flags; 43 __u32 fadvise_advice; 44 __u32 splice_flags; 45 + __u32 rename_flags; 46 + __u32 unlink_flags; 47 }; 48 __u64 user_data; /* data to be passed back at completion time */ 49 union { ··· 132 IORING_OP_PROVIDE_BUFFERS, 133 IORING_OP_REMOVE_BUFFERS, 134 IORING_OP_TEE, 135 + IORING_OP_SHUTDOWN, 136 + IORING_OP_RENAMEAT, 137 + IORING_OP_UNLINKAT, 138 139 /* this goes last, obviously */ 140 IORING_OP_LAST, ··· 146 * sqe->timeout_flags 147 */ 148 #define IORING_TIMEOUT_ABS (1U << 0) 149 + #define IORING_TIMEOUT_UPDATE (1U << 1) 150 151 /* 152 * sqe->splice_flags ··· 226 #define IORING_ENTER_GETEVENTS (1U << 0) 227 #define IORING_ENTER_SQ_WAKEUP (1U << 1) 228 #define IORING_ENTER_SQ_WAIT (1U << 2) 229 + #define IORING_ENTER_EXT_ARG (1U << 3) 230 231 /* 232 * Passed in for io_uring_setup(2). Copied back with updated info on success ··· 253 #define IORING_FEAT_CUR_PERSONALITY (1U << 4) 254 #define IORING_FEAT_FAST_POLL (1U << 5) 255 #define IORING_FEAT_POLL_32BITS (1U << 6) 256 + #define IORING_FEAT_SQPOLL_NONFIXED (1U << 7) 257 + #define IORING_FEAT_EXT_ARG (1U << 8) 258 259 /* 260 * io_uring_register(2) opcodes and arguments ··· 327 IORING_RESTRICTION_SQE_FLAGS_REQUIRED = 3, 328 329 IORING_RESTRICTION_LAST 330 + }; 331 + 332 + struct io_uring_getevents_arg { 333 + __u64 sigmask; 334 + __u32 sigmask_sz; 335 + __u32 pad; 336 + __u64 ts; 337 }; 338 339 #endif
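The EXT_ARG additions above pair with io_get_ext_arg() in the io_uring.c hunk: when IORING_ENTER_EXT_ARG is set, the last two io_uring_enter(2) arguments become a pointer to struct io_uring_getevents_arg and its size, which is how a CQ-side thread can now wait with a timeout without submitting a timeout SQE. A hedged sketch of waiting for one completion with a 1-second timeout by invoking the syscall directly follows, assuming headers that already carry these definitions and a ring whose io_uring_setup() reported IORING_FEAT_EXT_ARG; wait_cqe_timeout() is an illustrative name.

/* Hedged sketch: wait up to 1s for a CQE using IORING_ENTER_EXT_ARG. */
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <linux/time_types.h>

static int wait_cqe_timeout(int ring_fd)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_getevents_arg arg = {
		.sigmask	= 0,                   /* no sigmask swap  */
		.sigmask_sz	= 0,
		.ts		= (unsigned long) &ts, /* relative timeout */
	};
	int ret;

	ret = syscall(__NR_io_uring_enter, ring_fd, 0, 1,
		      IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		      &arg, sizeof(arg));
	return ret < 0 ? -errno : ret;                 /* -ETIME on timeout */
}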
+12 -3
net/socket.c
··· 2175 * Shutdown a socket. 2176 */ 2177 2178 int __sys_shutdown(int fd, int how) 2179 { 2180 int err, fput_needed; ··· 2193 2194 sock = sockfd_lookup_light(fd, &err, &fput_needed); 2195 if (sock != NULL) { 2196 - err = security_socket_shutdown(sock, how); 2197 - if (!err) 2198 - err = sock->ops->shutdown(sock, how); 2199 fput_light(sock->file, fput_needed); 2200 } 2201 return err;
··· 2175 * Shutdown a socket. 2176 */ 2177 2178 + int __sys_shutdown_sock(struct socket *sock, int how) 2179 + { 2180 + int err; 2181 + 2182 + err = security_socket_shutdown(sock, how); 2183 + if (!err) 2184 + err = sock->ops->shutdown(sock, how); 2185 + 2186 + return err; 2187 + } 2188 + 2189 int __sys_shutdown(int fd, int how) 2190 { 2191 int err, fput_needed; ··· 2182 2183 sock = sockfd_lookup_light(fd, &err, &fput_needed); 2184 if (sock != NULL) { 2185 + err = __sys_shutdown_sock(sock, how); 2186 fput_light(sock->file, fput_needed); 2187 } 2188 return err;
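__sys_shutdown_sock() above is the fd-free entry point that io_shutdown() calls once it has resolved the request's file to a socket. For completeness, a hedged sketch of queueing the matching IORING_OP_SHUTDOWN request, with the field layout taken from io_shutdown_prep(); queue_shutdown() and the user_data value are illustrative only.

/* Hedged sketch: async shutdown(2) of a connected socket via io_uring. */
#include <errno.h>
#include <string.h>
#include <sys/socket.h>
#include <liburing.h>

static int queue_shutdown(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EBUSY;
	memset(sqe, 0, sizeof(*sqe)); /* ioprio/off/addr/rw_flags/buf_index must be 0 */
	sqe->opcode = IORING_OP_SHUTDOWN;
	sqe->fd = sockfd;             /* the socket                */
	sqe->len = SHUT_WR;           /* 'how', as for shutdown(2) */
	sqe->user_data = 1;
	return io_uring_submit(ring);
}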