Merge tag 'for-5.11/io_uring-2020-12-14' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
"Fairly light set of changes this time around, and mostly some bits
that were pushed out to 5.11 instead of 5.10, fixes/cleanups, and a
few features. In particular:

- Cleanups around iovec import (David Laight, Pavel)

- Add timeout support for io_uring_enter(2), which enables us to
clean up liburing and avoid submitting a timeout SQE from the
completion path.

The big win here is that it allows setups that split SQ and CQ
handling into separate threads to avoid locking, as the CQ side
no longer needs to submit a timeout SQE while waiting for events
(Hao Xu). See the sketches after this list.

- Add support for socket shutdown, and for renameat/unlinkat (also
covered in the sketches after this list).

- SQPOLL cleanups and improvements (Xiaoguang Wang)

- Allow SQPOLL setups with just CAP_SYS_NICE rather than requiring
CAP_SYS_ADMIN, and enable regular (non-fixed) files to be used.

- Cancelation improvements (Pavel)

- Fixed file reference improvements (Pavel)

- IOPOLL related race fixes (Pavel)

- Lots of other little fixes and cleanups (mostly Pavel)"
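
As a userspace illustration of the io_uring_enter(2) timeout support, here
is a minimal sketch. It is not part of the diff below and assumes a liburing
build recent enough to provide io_uring_wait_cqe_timeout() and to route the
timeout through io_uring_enter(2) when the kernel supports it (older
liburing versions fall back to submitting a timeout SQE instead):

/*
 * Sketch: a CQ-side thread reaps completions with a timeout, without
 * queueing a timeout SQE of its own.
 */
#include <errno.h>
#include <stdio.h>
#include <liburing.h>

static int reap_one(struct io_uring *ring)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_cqe *cqe;
	int ret;

	ret = io_uring_wait_cqe_timeout(ring, &cqe, &ts);
	if (ret == -ETIME)
		return 0;	/* timed out, nothing completed */
	if (ret < 0)
		return ret;	/* real error */

	printf("cqe: user_data=%llu res=%d\n",
	       (unsigned long long) cqe->user_data, cqe->res);
	io_uring_cqe_seen(ring, cqe);
	return 1;
}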
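
Another sketch covers the new request types. It assumes the matching
liburing prep helpers (io_uring_prep_shutdown(), io_uring_prep_renameat(),
io_uring_prep_unlinkat()); the file names are made up and NULL checks on
io_uring_get_sqe() are omitted for brevity:

#include <fcntl.h>		/* AT_FDCWD */
#include <sys/socket.h>		/* SHUT_RDWR */
#include <liburing.h>

static int queue_misc_ops(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);		/* IORING_OP_SHUTDOWN */
	io_uring_prep_shutdown(sqe, sockfd, SHUT_RDWR);

	sqe = io_uring_get_sqe(ring);		/* IORING_OP_RENAMEAT */
	io_uring_prep_renameat(sqe, AT_FDCWD, "old.txt",
			       AT_FDCWD, "new.txt", 0);

	sqe = io_uring_get_sqe(ring);		/* IORING_OP_UNLINKAT */
	io_uring_prep_unlinkat(sqe, AT_FDCWD, "scratch.txt", 0);

	/* One submission covers all three; completions arrive on the CQ. */
	return io_uring_submit(ring);
}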

* tag 'for-5.11/io_uring-2020-12-14' of git://git.kernel.dk/linux-block: (43 commits)
io_uring: fix io_cqring_events()'s noflush
io_uring: fix racy IOPOLL flush overflow
io_uring: fix racy IOPOLL completions
io_uring: always let io_iopoll_complete() complete polled io
io_uring: add timeout update
io_uring: restructure io_timeout_cancel()
io_uring: fix files cancellation
io_uring: use bottom half safe lock for fixed file data
io_uring: fix miscounting ios_left
io_uring: change submit file state invariant
io_uring: check kthread stopped flag when sq thread is unparked
io_uring: share fixed_file_refs b/w multiple rsrcs
io_uring: replace inflight_wait with tctx->wait
io_uring: don't take fs for recvmsg/sendmsg
io_uring: only wake up sq thread while current task is in io worker context
io_uring: don't acquire uring_lock twice
io_uring: initialize 'timeout' properly in io_sq_thread()
io_uring: refactor io_sq_thread() handling
io_uring: always batch cancel in *cancel_files()
io_uring: pass files into kill timeouts/poll
...

+829 -595
+2
fs/internal.h
···
 long do_rmdir(int dfd, struct filename *name);
 long do_unlinkat(int dfd, struct filename *name);
 int may_linkat(struct path *link);
+int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
+		 struct filename *newname, unsigned int flags);
 
 /*
  * namespace.c
-10
fs/io-wq.c
···
 	return IO_WQ_CANCEL_NOTFOUND;
 }
 
-static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
-{
-	return work == data;
-}
-
-enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
-{
-	return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false);
-}
-
 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 {
 	int ret = -ENOMEM, node;
-1
fs/io-wq.h
···
 }
 
 void io_wq_cancel_all(struct io_wq *wq);
-enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
 
 typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
+775 -562
fs/io_uring.c
··· 245 246 struct task_struct *thread; 247 struct wait_queue_head wait; 248 }; 249 250 struct io_ring_ctx { ··· 287 struct list_head timeout_list; 288 struct list_head cq_overflow_list; 289 290 - wait_queue_head_t inflight_wait; 291 struct io_uring_sqe *sq_sqes; 292 } ____cacheline_aligned_in_smp; 293 ··· 311 struct io_sq_data *sq_data; /* if using sq thread polling */ 312 313 struct wait_queue_head sqo_sq_wait; 314 - struct wait_queue_entry sqo_wait_entry; 315 struct list_head sqd_list; 316 317 /* ··· 395 */ 396 struct io_poll_iocb { 397 struct file *file; 398 - union { 399 - struct wait_queue_head *head; 400 - u64 addr; 401 - }; 402 __poll_t events; 403 bool done; 404 bool canceled; 405 struct wait_queue_entry wait; 406 }; 407 408 struct io_close { ··· 446 u32 off; 447 u32 target_seq; 448 struct list_head list; 449 }; 450 451 struct io_timeout_rem { 452 struct file *file; 453 u64 addr; 454 }; 455 456 struct io_rw { ··· 549 struct statx __user *buffer; 550 }; 551 552 struct io_completion { 553 struct file *file; 554 struct list_head list; ··· 604 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, 605 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, 606 607 - REQ_F_LINK_HEAD_BIT, 608 REQ_F_FAIL_LINK_BIT, 609 REQ_F_INFLIGHT_BIT, 610 REQ_F_CUR_POS_BIT, ··· 635 /* IOSQE_BUFFER_SELECT */ 636 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), 637 638 - /* head of a link */ 639 - REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT), 640 /* fail rest of links */ 641 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), 642 /* on inflight list */ ··· 677 struct file *file; 678 struct io_rw rw; 679 struct io_poll_iocb poll; 680 struct io_accept accept; 681 struct io_sync sync; 682 struct io_cancel cancel; ··· 694 struct io_splice splice; 695 struct io_provide_buf pbuf; 696 struct io_statx statx; 697 /* use only after cleaning per-op data, see io_clean_op() */ 698 struct io_completion compl; 699 }; ··· 716 struct task_struct *task; 717 u64 user_data; 718 719 - struct list_head link_list; 720 721 /* 722 * 1. used with ctx->iopoll_list with reads/writes 723 * 2. to track reqs with ->files (see io_op_def::file_table) 724 */ 725 struct list_head inflight_entry; 726 - 727 - struct percpu_ref *fixed_file_refs; 728 struct callback_head task_work; 729 /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ 730 struct hlist_node hash_node; ··· 754 void *reqs[IO_IOPOLL_BATCH]; 755 unsigned int free_reqs; 756 757 /* 758 * Batch completion logic 759 */ ··· 766 */ 767 struct file *file; 768 unsigned int fd; 769 - unsigned int has_refs; 770 unsigned int ios_left; 771 }; 772 ··· 788 unsigned buffer_select : 1; 789 /* must always have async data allocated */ 790 unsigned needs_async_data : 1; 791 /* size of async data needed, if any */ 792 unsigned short async_size; 793 unsigned work_flags; ··· 803 .pollin = 1, 804 .buffer_select = 1, 805 .needs_async_data = 1, 806 .async_size = sizeof(struct io_async_rw), 807 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 808 }, ··· 813 .unbound_nonreg_file = 1, 814 .pollout = 1, 815 .needs_async_data = 1, 816 .async_size = sizeof(struct io_async_rw), 817 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 818 IO_WQ_WORK_FSIZE, ··· 826 .needs_file = 1, 827 .unbound_nonreg_file = 1, 828 .pollin = 1, 829 .async_size = sizeof(struct io_async_rw), 830 .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, 831 }, ··· 835 .hash_reg_file = 1, 836 .unbound_nonreg_file = 1, 837 .pollout = 1, 838 .async_size = sizeof(struct io_async_rw), 839 .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | 840 IO_WQ_WORK_MM, ··· 855 .pollout = 1, 856 .needs_async_data = 1, 857 .async_size = sizeof(struct io_async_msghdr), 858 - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 859 - IO_WQ_WORK_FS, 860 }, 861 [IORING_OP_RECVMSG] = { 862 .needs_file = 1, ··· 864 .buffer_select = 1, 865 .needs_async_data = 1, 866 .async_size = sizeof(struct io_async_msghdr), 867 - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 868 - IO_WQ_WORK_FS, 869 }, 870 [IORING_OP_TIMEOUT] = { 871 .needs_async_data = 1, 872 .async_size = sizeof(struct io_timeout_data), 873 .work_flags = IO_WQ_WORK_MM, 874 }, 875 - [IORING_OP_TIMEOUT_REMOVE] = {}, 876 [IORING_OP_ACCEPT] = { 877 .needs_file = 1, 878 .unbound_nonreg_file = 1, ··· 901 }, 902 [IORING_OP_OPENAT] = { 903 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | 904 - IO_WQ_WORK_FS, 905 }, 906 [IORING_OP_CLOSE] = { 907 .needs_file = 1, ··· 920 .unbound_nonreg_file = 1, 921 .pollin = 1, 922 .buffer_select = 1, 923 .async_size = sizeof(struct io_async_rw), 924 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 925 }, ··· 928 .needs_file = 1, 929 .unbound_nonreg_file = 1, 930 .pollout = 1, 931 .async_size = sizeof(struct io_async_rw), 932 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 933 IO_WQ_WORK_FSIZE, ··· 955 }, 956 [IORING_OP_OPENAT2] = { 957 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | 958 - IO_WQ_WORK_BLKCG, 959 }, 960 [IORING_OP_EPOLL_CTL] = { 961 .unbound_nonreg_file = 1, ··· 973 .needs_file = 1, 974 .hash_reg_file = 1, 975 .unbound_nonreg_file = 1, 976 }, 977 }; 978 ··· 1034 } 1035 EXPORT_SYMBOL(io_uring_get_socket); 1036 1037 static inline void io_clean_op(struct io_kiocb *req) 1038 { 1039 if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | ··· 1044 __io_clean_op(req); 1045 } 1046 1047 - static void io_sq_thread_drop_mm(void) 1048 { 1049 struct mm_struct *mm = current->mm; 1050 1051 if (mm) { ··· 1084 mmput(mm); 1085 current->mm = NULL; 1086 } 1087 } 1088 1089 static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) ··· 1146 return -EFAULT; 1147 } 1148 1149 - static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, 1150 - struct io_kiocb *req) 1151 { 1152 - if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM)) 1153 - return 0; 1154 - return __io_sq_thread_acquire_mm(ctx); 1155 } 1156 1157 static 
void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, ··· 1307 INIT_LIST_HEAD(&ctx->iopoll_list); 1308 INIT_LIST_HEAD(&ctx->defer_list); 1309 INIT_LIST_HEAD(&ctx->timeout_list); 1310 - init_waitqueue_head(&ctx->inflight_wait); 1311 spin_lock_init(&ctx->inflight_lock); 1312 INIT_LIST_HEAD(&ctx->inflight_list); 1313 INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work); ··· 1548 { 1549 struct io_kiocb *cur; 1550 1551 - io_prep_async_work(req); 1552 - if (req->flags & REQ_F_LINK_HEAD) 1553 - list_for_each_entry(cur, &req->link_list, link_list) 1554 - io_prep_async_work(cur); 1555 } 1556 1557 static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) ··· 1590 } 1591 } 1592 1593 - static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) 1594 - { 1595 - struct io_ring_ctx *ctx = req->ctx; 1596 - 1597 - if (!tsk || req->task == tsk) 1598 - return true; 1599 - if (ctx->flags & IORING_SETUP_SQPOLL) { 1600 - if (ctx->sq_data && req->task == ctx->sq_data->thread) 1601 - return true; 1602 - } 1603 - return false; 1604 - } 1605 - 1606 /* 1607 * Returns true if we found and killed one or more timeouts 1608 */ 1609 - static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk) 1610 { 1611 struct io_kiocb *req, *tmp; 1612 int canceled = 0; 1613 1614 spin_lock_irq(&ctx->completion_lock); 1615 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { 1616 - if (io_task_match(req, tsk)) { 1617 io_kill_timeout(req); 1618 canceled++; 1619 } ··· 1712 } 1713 } 1714 1715 - static inline bool __io_match_files(struct io_kiocb *req, 1716 - struct files_struct *files) 1717 - { 1718 - return ((req->flags & REQ_F_WORK_INITIALIZED) && 1719 - (req->work.flags & IO_WQ_WORK_FILES)) && 1720 - req->work.identity->files == files; 1721 - } 1722 - 1723 - static bool io_match_files(struct io_kiocb *req, 1724 - struct files_struct *files) 1725 - { 1726 - struct io_kiocb *link; 1727 - 1728 - if (!files) 1729 - return true; 1730 - if (__io_match_files(req, files)) 1731 - return true; 1732 - if (req->flags & REQ_F_LINK_HEAD) { 1733 - list_for_each_entry(link, &req->link_list, link_list) { 1734 - if (__io_match_files(link, files)) 1735 - return true; 1736 - } 1737 - } 1738 - return false; 1739 - } 1740 - 1741 /* Returns true if there are no backlogged entries after the flush */ 1742 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, 1743 struct task_struct *tsk, ··· 1739 1740 cqe = NULL; 1741 list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { 1742 - if (tsk && req->task != tsk) 1743 - continue; 1744 - if (!io_match_files(req, files)) 1745 continue; 1746 1747 cqe = io_get_cqring(ctx); ··· 1935 static inline void io_put_file(struct io_kiocb *req, struct file *file, 1936 bool fixed) 1937 { 1938 - if (fixed) 1939 - percpu_ref_put(req->fixed_file_refs); 1940 - else 1941 fput(file); 1942 } 1943 ··· 1947 kfree(req->async_data); 1948 if (req->file) 1949 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); 1950 - 1951 io_req_clean_work(req); 1952 } 1953 ··· 1971 percpu_ref_put(&ctx->refs); 1972 } 1973 1974 static void io_kill_linked_timeout(struct io_kiocb *req) 1975 { 1976 struct io_ring_ctx *ctx = req->ctx; ··· 1987 unsigned long flags; 1988 1989 spin_lock_irqsave(&ctx->completion_lock, flags); 1990 - link = list_first_entry_or_null(&req->link_list, struct io_kiocb, 1991 - link_list); 1992 /* 1993 * Can happen if a linked timeout fired and link had been like 1994 * req -> link t-out -> link t-out [-> ...] 
··· 1997 struct io_timeout_data *io = link->async_data; 1998 int ret; 1999 2000 - list_del_init(&link->link_list); 2001 ret = hrtimer_try_to_cancel(&io->timer); 2002 if (ret != -1) { 2003 io_cqring_fill_event(link, -ECANCELED); ··· 2015 } 2016 } 2017 2018 - static struct io_kiocb *io_req_link_next(struct io_kiocb *req) 2019 - { 2020 - struct io_kiocb *nxt; 2021 2022 - /* 2023 - * The list should never be empty when we are called here. But could 2024 - * potentially happen if the chain is messed up, check to be on the 2025 - * safe side. 2026 - */ 2027 - if (unlikely(list_empty(&req->link_list))) 2028 - return NULL; 2029 - 2030 - nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); 2031 - list_del_init(&req->link_list); 2032 - if (!list_empty(&nxt->link_list)) 2033 - nxt->flags |= REQ_F_LINK_HEAD; 2034 - return nxt; 2035 - } 2036 - 2037 - /* 2038 - * Called if REQ_F_LINK_HEAD is set, and we fail the head request 2039 - */ 2040 static void io_fail_links(struct io_kiocb *req) 2041 { 2042 struct io_ring_ctx *ctx = req->ctx; 2043 unsigned long flags; 2044 2045 spin_lock_irqsave(&ctx->completion_lock, flags); 2046 - while (!list_empty(&req->link_list)) { 2047 - struct io_kiocb *link = list_first_entry(&req->link_list, 2048 - struct io_kiocb, link_list); 2049 2050 - list_del_init(&link->link_list); 2051 trace_io_uring_fail_link(req, link); 2052 - 2053 io_cqring_fill_event(link, -ECANCELED); 2054 2055 /* ··· 2042 io_put_req_deferred(link, 2); 2043 else 2044 io_double_put_req(link); 2045 } 2046 - 2047 io_commit_cqring(ctx); 2048 spin_unlock_irqrestore(&ctx->completion_lock, flags); 2049 ··· 2052 2053 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) 2054 { 2055 - req->flags &= ~REQ_F_LINK_HEAD; 2056 if (req->flags & REQ_F_LINK_TIMEOUT) 2057 io_kill_linked_timeout(req); 2058 ··· 2061 * dependencies to the next request. In case of failure, fail the rest 2062 * of the chain. 2063 */ 2064 - if (likely(!(req->flags & REQ_F_FAIL_LINK))) 2065 - return io_req_link_next(req); 2066 io_fail_links(req); 2067 return NULL; 2068 } 2069 2070 - static struct io_kiocb *io_req_find_next(struct io_kiocb *req) 2071 { 2072 - if (likely(!(req->flags & REQ_F_LINK_HEAD))) 2073 return NULL; 2074 return __io_req_find_next(req); 2075 } ··· 2132 { 2133 struct io_ring_ctx *ctx = req->ctx; 2134 2135 - if (!__io_sq_thread_acquire_mm(ctx)) { 2136 mutex_lock(&ctx->uring_lock); 2137 __io_queue_sqe(req, NULL); 2138 mutex_unlock(&ctx->uring_lock); ··· 2169 } 2170 } 2171 2172 - static void io_queue_next(struct io_kiocb *req) 2173 { 2174 struct io_kiocb *nxt = io_req_find_next(req); 2175 ··· 2226 io_free_req(req); 2227 return; 2228 } 2229 - if (req->flags & REQ_F_LINK_HEAD) 2230 - io_queue_next(req); 2231 2232 if (req->task != rb->task) { 2233 if (rb->task) { ··· 2328 * we wake up the task, and the next invocation will flush the 2329 * entries. We cannot safely to it from here. 2330 */ 2331 - if (noflush && !list_empty(&ctx->cq_overflow_list)) 2332 return -1U; 2333 2334 io_cqring_overflow_flush(ctx, false, NULL, NULL); ··· 2675 if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) 2676 return false; 2677 2678 - ret = io_sq_thread_acquire_mm(req->ctx, req); 2679 2680 if (io_resubmit_prep(req, ret)) { 2681 refcount_inc(&req->refs); ··· 2723 * find it from a io_iopoll_getevents() thread before the issuer is done 2724 * accessing the kiocb cookie. 
2725 */ 2726 - static void io_iopoll_req_issued(struct io_kiocb *req) 2727 { 2728 struct io_ring_ctx *ctx = req->ctx; 2729 ··· 2752 else 2753 list_add_tail(&req->inflight_entry, &ctx->iopoll_list); 2754 2755 - if ((ctx->flags & IORING_SETUP_SQPOLL) && 2756 wq_has_sleeper(&ctx->sq_data->wait)) 2757 wake_up(&ctx->sq_data->wait); 2758 } 2759 2760 - static void __io_state_file_put(struct io_submit_state *state) 2761 { 2762 - if (state->has_refs) 2763 - fput_many(state->file, state->has_refs); 2764 - state->file = NULL; 2765 } 2766 2767 static inline void io_state_file_put(struct io_submit_state *state) 2768 { 2769 - if (state->file) 2770 __io_state_file_put(state); 2771 } 2772 ··· 2784 if (!state) 2785 return fget(fd); 2786 2787 - if (state->file) { 2788 if (state->fd == fd) { 2789 - state->has_refs--; 2790 return state->file; 2791 } 2792 __io_state_file_put(state); 2793 } 2794 state->file = fget_many(fd, state->ios_left); 2795 - if (!state->file) 2796 return NULL; 2797 2798 state->fd = fd; 2799 - state->has_refs = state->ios_left - 1; 2800 return state->file; 2801 } 2802 ··· 3151 return __io_iov_buffer_select(req, iov, needs_lock); 3152 } 3153 3154 - static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, 3155 struct iovec **iovec, struct iov_iter *iter, 3156 bool needs_lock) 3157 { ··· 3180 3181 ret = import_single_range(rw, buf, sqe_len, *iovec, iter); 3182 *iovec = NULL; 3183 - return ret < 0 ? ret : sqe_len; 3184 } 3185 3186 if (req->flags & REQ_F_BUFFER_SELECT) { ··· 3195 3196 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, 3197 req->ctx->compat); 3198 - } 3199 - 3200 - static ssize_t io_import_iovec(int rw, struct io_kiocb *req, 3201 - struct iovec **iovec, struct iov_iter *iter, 3202 - bool needs_lock) 3203 - { 3204 - struct io_async_rw *iorw = req->async_data; 3205 - 3206 - if (!iorw) 3207 - return __io_import_iovec(rw, req, iovec, iter, needs_lock); 3208 - *iovec = NULL; 3209 - return iov_iter_count(&iorw->iter); 3210 } 3211 3212 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) ··· 3320 struct iovec *iov = iorw->fast_iov; 3321 ssize_t ret; 3322 3323 - ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false); 3324 if (unlikely(ret < 0)) 3325 return ret; 3326 ··· 3453 struct iov_iter __iter, *iter = &__iter; 3454 struct io_async_rw *rw = req->async_data; 3455 ssize_t io_size, ret, ret2; 3456 - size_t iov_count; 3457 bool no_async; 3458 3459 - if (rw) 3460 iter = &rw->iter; 3461 - 3462 - ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); 3463 - if (ret < 0) 3464 - return ret; 3465 - iov_count = iov_iter_count(iter); 3466 - io_size = ret; 3467 req->result = io_size; 3468 ret = 0; 3469 ··· 3479 if (no_async) 3480 goto copy_iov; 3481 3482 - ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count); 3483 if (unlikely(ret)) 3484 goto out_free; 3485 ··· 3498 if (req->file->f_flags & O_NONBLOCK) 3499 goto done; 3500 /* some cases will consume bytes even on error returns */ 3501 - iov_iter_revert(iter, iov_count - iov_iter_count(iter)); 3502 ret = 0; 3503 goto copy_iov; 3504 } else if (ret < 0) { ··· 3581 struct kiocb *kiocb = &req->rw.kiocb; 3582 struct iov_iter __iter, *iter = &__iter; 3583 struct io_async_rw *rw = req->async_data; 3584 - size_t iov_count; 3585 ssize_t ret, ret2, io_size; 3586 3587 - if (rw) 3588 iter = &rw->iter; 3589 - 3590 - ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); 3591 - if (ret < 0) 3592 - return ret; 3593 - iov_count = iov_iter_count(iter); 3594 - io_size = ret; 3595 req->result = 
io_size; 3596 3597 /* Ensure we clear previously set non-block flag */ ··· 3609 (req->flags & REQ_F_ISREG)) 3610 goto copy_iov; 3611 3612 - ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count); 3613 if (unlikely(ret)) 3614 goto out_free; 3615 ··· 3652 } else { 3653 copy_iov: 3654 /* some cases will consume bytes even on error returns */ 3655 - iov_iter_revert(iter, iov_count - iov_iter_count(iter)); 3656 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); 3657 if (!ret) 3658 return -EAGAIN; ··· 3662 if (iovec) 3663 kfree(iovec); 3664 return ret; 3665 } 3666 3667 static int __io_splice_prep(struct io_kiocb *req, ··· 4007 { 4008 u64 flags, mode; 4009 4010 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 4011 return -EINVAL; 4012 mode = READ_ONCE(sqe->len); 4013 flags = READ_ONCE(sqe->open_flags); ··· 4021 size_t len; 4022 int ret; 4023 4024 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 4025 return -EINVAL; 4026 how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4027 len = READ_ONCE(sqe->len); ··· 4151 head = idr_find(&ctx->io_buffer_idr, p->bgid); 4152 if (head) 4153 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); 4154 - 4155 - io_ring_submit_lock(ctx, !force_nonblock); 4156 if (ret < 0) 4157 req_set_fail_links(req); 4158 - __io_req_complete(req, ret, 0, cs); 4159 return 0; 4160 } 4161 ··· 4246 } 4247 } 4248 out: 4249 - io_ring_submit_unlock(ctx, !force_nonblock); 4250 if (ret < 0) 4251 req_set_fail_links(req); 4252 - __io_req_complete(req, ret, 0, cs); 4253 return 0; 4254 } 4255 ··· 4428 io_req_init_async(req); 4429 req->work.flags |= IO_WQ_WORK_NO_CANCEL; 4430 4431 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 4432 return -EINVAL; 4433 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || 4434 sqe->rw_flags || sqe->buf_index) ··· 4910 { 4911 struct io_accept *accept = &req->accept; 4912 4913 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 4914 return -EINVAL; 4915 if (sqe->ioprio || sqe->len || sqe->buf_index) 4916 return -EINVAL; ··· 4951 struct io_connect *conn = &req->connect; 4952 struct io_async_connect *io = req->async_data; 4953 4954 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 4955 return -EINVAL; 4956 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) 4957 return -EINVAL; ··· 5486 /* 5487 * Returns true if we found and killed one or more poll requests 5488 */ 5489 - static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) 5490 { 5491 struct hlist_node *tmp; 5492 struct io_kiocb *req; ··· 5499 5500 list = &ctx->cancel_hash[i]; 5501 hlist_for_each_entry_safe(req, tmp, list, hash_node) { 5502 - if (io_task_match(req, tsk)) 5503 posted += io_poll_remove_one(req); 5504 } 5505 } ··· 5537 sqe->poll_events) 5538 return -EINVAL; 5539 5540 - req->poll.addr = READ_ONCE(sqe->addr); 5541 return 0; 5542 } 5543 ··· 5548 static int io_poll_remove(struct io_kiocb *req) 5549 { 5550 struct io_ring_ctx *ctx = req->ctx; 5551 - u64 addr; 5552 int ret; 5553 5554 - addr = req->poll.addr; 5555 spin_lock_irq(&ctx->completion_lock); 5556 - ret = io_poll_cancel(ctx, addr); 5557 spin_unlock_irq(&ctx->completion_lock); 5558 5559 if (ret < 0) ··· 5644 return HRTIMER_NORESTART; 5645 } 5646 5647 - static int __io_timeout_cancel(struct io_kiocb *req) 5648 { 5649 - struct io_timeout_data *io = req->async_data; 5650 - int ret; 5651 - 5652 - ret = hrtimer_try_to_cancel(&io->timer); 5653 - if (ret 
== -1) 5654 - return -EALREADY; 5655 - list_del_init(&req->timeout.list); 5656 - 5657 - req_set_fail_links(req); 5658 - io_cqring_fill_event(req, -ECANCELED); 5659 - io_put_req_deferred(req, 1); 5660 - return 0; 5661 - } 5662 - 5663 - static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 5664 - { 5665 struct io_kiocb *req; 5666 int ret = -ENOENT; 5667 ··· 5659 } 5660 5661 if (ret == -ENOENT) 5662 - return ret; 5663 5664 - return __io_timeout_cancel(req); 5665 } 5666 5667 static int io_timeout_remove_prep(struct io_kiocb *req, 5668 const struct io_uring_sqe *sqe) 5669 { 5670 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5671 return -EINVAL; 5672 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 5673 return -EINVAL; 5674 - if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags) 5675 return -EINVAL; 5676 5677 - req->timeout_rem.addr = READ_ONCE(sqe->addr); 5678 return 0; 5679 } 5680 ··· 5732 */ 5733 static int io_timeout_remove(struct io_kiocb *req) 5734 { 5735 struct io_ring_ctx *ctx = req->ctx; 5736 int ret; 5737 5738 spin_lock_irq(&ctx->completion_lock); 5739 - ret = io_timeout_cancel(ctx, req->timeout_rem.addr); 5740 5741 io_cqring_fill_event(req, ret); 5742 io_commit_cqring(ctx); ··· 6024 return io_remove_buffers_prep(req, sqe); 6025 case IORING_OP_TEE: 6026 return io_tee_prep(req, sqe); 6027 } 6028 6029 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", ··· 6051 { 6052 struct io_kiocb *pos; 6053 struct io_ring_ctx *ctx = req->ctx; 6054 - u32 total_submitted, nr_reqs = 1; 6055 6056 - if (req->flags & REQ_F_LINK_HEAD) 6057 - list_for_each_entry(pos, &req->link_list, link_list) 6058 - nr_reqs++; 6059 6060 total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; 6061 return total_submitted - nr_reqs; ··· 6106 static void io_req_drop_files(struct io_kiocb *req) 6107 { 6108 struct io_ring_ctx *ctx = req->ctx; 6109 unsigned long flags; 6110 6111 spin_lock_irqsave(&ctx->inflight_lock, flags); 6112 list_del(&req->inflight_entry); 6113 - if (waitqueue_active(&ctx->inflight_wait)) 6114 - wake_up(&ctx->inflight_wait); 6115 spin_unlock_irqrestore(&ctx->inflight_lock, flags); 6116 req->flags &= ~REQ_F_INFLIGHT; 6117 put_files_struct(req->work.identity->files); ··· 6166 case IORING_OP_OPENAT2: 6167 if (req->open.filename) 6168 putname(req->open.filename); 6169 break; 6170 } 6171 req->flags &= ~REQ_F_NEED_CLEANUP; ··· 6280 case IORING_OP_TEE: 6281 ret = io_tee(req, force_nonblock); 6282 break; 6283 default: 6284 ret = -EINVAL; 6285 break; ··· 6305 if (in_async) 6306 mutex_lock(&ctx->uring_lock); 6307 6308 - io_iopoll_req_issued(req); 6309 6310 if (in_async) 6311 mutex_unlock(&ctx->uring_lock); ··· 6345 } 6346 6347 if (ret) { 6348 - req_set_fail_links(req); 6349 - io_req_complete(req, ret); 6350 } 6351 6352 return io_steal_work(req); ··· 6383 return NULL; 6384 fd = array_index_nospec(fd, ctx->nr_user_files); 6385 file = io_file_from_index(ctx, fd); 6386 - if (file) { 6387 - req->fixed_file_refs = &ctx->file_data->node->refs; 6388 - percpu_ref_get(req->fixed_file_refs); 6389 - } 6390 } else { 6391 trace_io_uring_file_get(ctx, fd); 6392 file = __io_file_get(state, fd); ··· 6392 return file; 6393 } 6394 6395 - static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, 6396 - int fd) 6397 - { 6398 - bool fixed; 6399 - 6400 - fixed = (req->flags & REQ_F_FIXED_FILE) != 0; 6401 - if (unlikely(!fixed && io_async_submit(req->ctx))) 6402 - return -EBADF; 6403 - 6404 - req->file = io_file_get(state, req, fd, fixed); 6405 - 
if (req->file || io_op_defs[req->opcode].needs_file_no_error) 6406 - return 0; 6407 - return -EBADF; 6408 - } 6409 - 6410 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) 6411 { 6412 struct io_timeout_data *data = container_of(timer, 6413 struct io_timeout_data, timer); 6414 - struct io_kiocb *req = data->req; 6415 struct io_ring_ctx *ctx = req->ctx; 6416 - struct io_kiocb *prev = NULL; 6417 unsigned long flags; 6418 6419 spin_lock_irqsave(&ctx->completion_lock, flags); 6420 6421 /* 6422 * We don't expect the list to be empty, that will only happen if we 6423 * race with the completion of the linked work. 6424 */ 6425 - if (!list_empty(&req->link_list)) { 6426 - prev = list_entry(req->link_list.prev, struct io_kiocb, 6427 - link_list); 6428 - if (refcount_inc_not_zero(&prev->refs)) 6429 - list_del_init(&req->link_list); 6430 - else 6431 - prev = NULL; 6432 - } 6433 - 6434 spin_unlock_irqrestore(&ctx->completion_lock, flags); 6435 6436 if (prev) { ··· 6427 static void __io_queue_linked_timeout(struct io_kiocb *req) 6428 { 6429 /* 6430 - * If the list is now empty, then our linked request finished before 6431 - * we got a chance to setup the timer 6432 */ 6433 - if (!list_empty(&req->link_list)) { 6434 struct io_timeout_data *data = req->async_data; 6435 6436 data->timer.function = io_link_timeout_fn; ··· 6453 6454 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) 6455 { 6456 - struct io_kiocb *nxt; 6457 6458 - if (!(req->flags & REQ_F_LINK_HEAD)) 6459 - return NULL; 6460 - if (req->flags & REQ_F_LINK_TIMEOUT) 6461 - return NULL; 6462 - 6463 - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, 6464 - link_list); 6465 - if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) 6466 return NULL; 6467 6468 nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; 6469 req->flags |= REQ_F_LINK_TIMEOUT; 6470 return nxt; ··· 6565 io_queue_sqe(req, NULL, cs); 6566 } 6567 6568 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, 6569 - struct io_kiocb **link, struct io_comp_state *cs) 6570 { 6571 struct io_ring_ctx *ctx = req->ctx; 6572 int ret; ··· 6583 * submitted sync once the chain is complete. If none of those 6584 * conditions are true (normal request), then just queue it. 
6585 */ 6586 - if (*link) { 6587 - struct io_kiocb *head = *link; 6588 6589 /* 6590 * Taking sequential execution of a link, draining both sides ··· 6604 return ret; 6605 } 6606 trace_io_uring_link(ctx, req, head); 6607 - list_add_tail(&req->link_list, &head->link_list); 6608 6609 /* last request of a link, enqueue the link */ 6610 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { 6611 io_queue_link_head(head, cs); 6612 - *link = NULL; 6613 } 6614 } else { 6615 if (unlikely(ctx->drain_next)) { ··· 6618 ctx->drain_next = 0; 6619 } 6620 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 6621 - req->flags |= REQ_F_LINK_HEAD; 6622 - INIT_LIST_HEAD(&req->link_list); 6623 - 6624 ret = io_req_defer_prep(req, sqe); 6625 if (unlikely(ret)) 6626 req->flags |= REQ_F_FAIL_LINK; 6627 - *link = req; 6628 } else { 6629 io_queue_sqe(req, sqe, cs); 6630 } ··· 6638 { 6639 if (!list_empty(&state->comp.list)) 6640 io_submit_flush_completions(&state->comp); 6641 - blk_finish_plug(&state->plug); 6642 io_state_file_put(state); 6643 if (state->free_reqs) 6644 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); ··· 6651 static void io_submit_state_start(struct io_submit_state *state, 6652 struct io_ring_ctx *ctx, unsigned int max_ios) 6653 { 6654 - blk_start_plug(&state->plug); 6655 state->comp.nr = 0; 6656 INIT_LIST_HEAD(&state->comp.list); 6657 state->comp.ctx = ctx; 6658 state->free_reqs = 0; 6659 - state->file = NULL; 6660 state->ios_left = max_ios; 6661 } 6662 ··· 6751 req->file = NULL; 6752 req->ctx = ctx; 6753 req->flags = 0; 6754 /* one is dropped after submission, the other at completion */ 6755 refcount_set(&req->refs, 2); 6756 req->task = current; ··· 6761 if (unlikely(req->opcode >= IORING_OP_LAST)) 6762 return -EINVAL; 6763 6764 - if (unlikely(io_sq_thread_acquire_mm(ctx, req))) 6765 return -EFAULT; 6766 6767 sqe_flags = READ_ONCE(sqe->flags); ··· 6794 /* same numerical values with corresponding REQ_F_*, safe to copy */ 6795 req->flags |= sqe_flags; 6796 6797 - if (!io_op_defs[req->opcode].needs_file) 6798 - return 0; 6799 6800 - ret = io_req_set_file(state, req, READ_ONCE(sqe->fd)); 6801 state->ios_left--; 6802 return ret; 6803 } ··· 6821 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) 6822 { 6823 struct io_submit_state state; 6824 - struct io_kiocb *link = NULL; 6825 int i, submitted = 0; 6826 6827 /* if we have a backlog and couldn't flush it all, return BUSY */ ··· 6841 refcount_add(nr, &current->usage); 6842 6843 io_submit_state_start(&state, ctx, nr); 6844 6845 for (i = 0; i < nr; i++) { 6846 const struct io_uring_sqe *sqe; ··· 6887 percpu_counter_sub(&tctx->inflight, unused); 6888 put_task_struct_many(current, unused); 6889 } 6890 - if (link) 6891 - io_queue_link_head(link, &state.comp); 6892 io_submit_state_end(&state); 6893 6894 /* Commit SQ ring head once we've consumed and submitted all SQEs */ ··· 6912 spin_unlock_irq(&ctx->completion_lock); 6913 } 6914 6915 - static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode, 6916 - int sync, void *key) 6917 { 6918 - struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry); 6919 - int ret; 6920 - 6921 - ret = autoremove_wake_function(wqe, mode, sync, key); 6922 - if (ret) { 6923 - unsigned long flags; 6924 - 6925 - spin_lock_irqsave(&ctx->completion_lock, flags); 6926 - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; 6927 - spin_unlock_irqrestore(&ctx->completion_lock, flags); 6928 - } 6929 - return ret; 6930 - } 6931 - 6932 - enum sq_ret { 6933 - SQT_IDLE = 1, 6934 - 
SQT_SPIN = 2, 6935 - SQT_DID_WORK = 4, 6936 - }; 6937 - 6938 - static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx, 6939 - unsigned long start_jiffies, bool cap_entries) 6940 - { 6941 - unsigned long timeout = start_jiffies + ctx->sq_thread_idle; 6942 - struct io_sq_data *sqd = ctx->sq_data; 6943 unsigned int to_submit; 6944 int ret = 0; 6945 6946 - again: 6947 - if (!list_empty(&ctx->iopoll_list)) { 6948 - unsigned nr_events = 0; 6949 - 6950 - mutex_lock(&ctx->uring_lock); 6951 - if (!list_empty(&ctx->iopoll_list) && !need_resched()) 6952 - io_do_iopoll(ctx, &nr_events, 0); 6953 - mutex_unlock(&ctx->uring_lock); 6954 - } 6955 - 6956 to_submit = io_sqring_entries(ctx); 6957 - 6958 - /* 6959 - * If submit got -EBUSY, flag us as needing the application 6960 - * to enter the kernel to reap and flush events. 6961 - */ 6962 - if (!to_submit || ret == -EBUSY || need_resched()) { 6963 - /* 6964 - * Drop cur_mm before scheduling, we can't hold it for 6965 - * long periods (or over schedule()). Do this before 6966 - * adding ourselves to the waitqueue, as the unuse/drop 6967 - * may sleep. 6968 - */ 6969 - io_sq_thread_drop_mm(); 6970 - 6971 - /* 6972 - * We're polling. If we're within the defined idle 6973 - * period, then let us spin without work before going 6974 - * to sleep. The exception is if we got EBUSY doing 6975 - * more IO, we should wait for the application to 6976 - * reap events and wake us up. 6977 - */ 6978 - if (!list_empty(&ctx->iopoll_list) || need_resched() || 6979 - (!time_after(jiffies, timeout) && ret != -EBUSY && 6980 - !percpu_ref_is_dying(&ctx->refs))) 6981 - return SQT_SPIN; 6982 - 6983 - prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry, 6984 - TASK_INTERRUPTIBLE); 6985 - 6986 - /* 6987 - * While doing polled IO, before going to sleep, we need 6988 - * to check if there are new reqs added to iopoll_list, 6989 - * it is because reqs may have been punted to io worker 6990 - * and will be added to iopoll_list later, hence check 6991 - * the iopoll_list again. 
6992 - */ 6993 - if ((ctx->flags & IORING_SETUP_IOPOLL) && 6994 - !list_empty_careful(&ctx->iopoll_list)) { 6995 - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); 6996 - goto again; 6997 - } 6998 - 6999 - to_submit = io_sqring_entries(ctx); 7000 - if (!to_submit || ret == -EBUSY) 7001 - return SQT_IDLE; 7002 - } 7003 - 7004 - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); 7005 - io_ring_clear_wakeup_flag(ctx); 7006 - 7007 /* if we're handling multiple rings, cap submit size for fairness */ 7008 if (cap_entries && to_submit > 8) 7009 to_submit = 8; 7010 7011 - mutex_lock(&ctx->uring_lock); 7012 - if (likely(!percpu_ref_is_dying(&ctx->refs))) 7013 - ret = io_submit_sqes(ctx, to_submit); 7014 - mutex_unlock(&ctx->uring_lock); 7015 7016 if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) 7017 wake_up(&ctx->sqo_sq_wait); 7018 7019 - return SQT_DID_WORK; 7020 } 7021 7022 static void io_sqd_init_new(struct io_sq_data *sqd) ··· 6959 6960 while (!list_empty(&sqd->ctx_new_list)) { 6961 ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list); 6962 - init_wait(&ctx->sqo_wait_entry); 6963 - ctx->sqo_wait_entry.func = io_sq_wake_function; 6964 list_move_tail(&ctx->sqd_list, &sqd->ctx_list); 6965 complete(&ctx->sq_thread_comp); 6966 } 6967 } 6968 6969 static int io_sq_thread(void *data) 6970 { 6971 struct cgroup_subsys_state *cur_css = NULL; 6972 const struct cred *old_cred = NULL; 6973 struct io_sq_data *sqd = data; 6974 struct io_ring_ctx *ctx; 6975 - unsigned long start_jiffies; 6976 6977 - start_jiffies = jiffies; 6978 while (!kthread_should_stop()) { 6979 - enum sq_ret ret = 0; 6980 - bool cap_entries; 6981 6982 /* 6983 * Any changes to the sqd lists are synchronized through the 6984 * kthread parking. This synchronizes the thread vs users, 6985 * the users are synchronized on the sqd->ctx_lock. 6986 */ 6987 - if (kthread_should_park()) 6988 kthread_parkme(); 6989 6990 - if (unlikely(!list_empty(&sqd->ctx_new_list))) 6991 io_sqd_init_new(sqd); 6992 6993 cap_entries = !list_is_singular(&sqd->ctx_list); 6994 - 6995 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 6996 if (current->cred != ctx->creds) { 6997 if (old_cred) ··· 7021 current->sessionid = ctx->sessionid; 7022 #endif 7023 7024 - ret |= __io_sq_thread(ctx, start_jiffies, cap_entries); 7025 7026 - io_sq_thread_drop_mm(); 7027 } 7028 7029 - if (ret & SQT_SPIN) { 7030 io_run_task_work(); 7031 cond_resched(); 7032 - } else if (ret == SQT_IDLE) { 7033 - if (kthread_should_park()) 7034 - continue; 7035 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7036 io_ring_set_wakeup_flag(ctx); 7037 schedule(); 7038 - start_jiffies = jiffies; 7039 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7040 io_ring_clear_wakeup_flag(ctx); 7041 } 7042 } 7043 7044 io_run_task_work(); ··· 7072 io_sq_thread_unassociate_blkcg(); 7073 if (old_cred) 7074 revert_creds(old_cred); 7075 7076 kthread_parkme(); 7077 ··· 7132 * application must reap them itself, as they reside on the shared cq ring. 
7133 */ 7134 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, 7135 - const sigset_t __user *sig, size_t sigsz) 7136 { 7137 struct io_wait_queue iowq = { 7138 .wq = { ··· 7145 .to_wait = min_events, 7146 }; 7147 struct io_rings *rings = ctx->rings; 7148 int ret = 0; 7149 7150 do { ··· 7169 return ret; 7170 } 7171 7172 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 7173 trace_io_uring_cqring_wait(ctx, min_events); 7174 do { ··· 7188 break; 7189 if (io_should_wake(&iowq, false)) 7190 break; 7191 - schedule(); 7192 } while (1); 7193 finish_wait(&ctx->wait, &iowq.wq); 7194 ··· 7245 if (!data) 7246 return -ENXIO; 7247 7248 - spin_lock(&data->lock); 7249 ref_node = data->node; 7250 - spin_unlock(&data->lock); 7251 if (ref_node) 7252 percpu_ref_kill(&ref_node->refs); 7253 ··· 7370 7371 mutex_lock(&sqd->ctx_lock); 7372 list_del(&ctx->sqd_list); 7373 mutex_unlock(&sqd->ctx_lock); 7374 7375 - if (sqd->thread) { 7376 - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); 7377 io_sq_thread_unpark(sqd); 7378 - } 7379 7380 io_put_sq_data(sqd); 7381 ctx->sq_data = NULL; ··· 7629 data = ref_node->file_data; 7630 ctx = data->ctx; 7631 7632 - spin_lock(&data->lock); 7633 ref_node->done = true; 7634 7635 while (!list_empty(&data->ref_list)) { ··· 7641 list_del(&ref_node->node); 7642 first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist); 7643 } 7644 - spin_unlock(&data->lock); 7645 7646 if (percpu_ref_is_dying(&data->refs)) 7647 delay = 0; ··· 7764 } 7765 7766 file_data->node = ref_node; 7767 - spin_lock(&file_data->lock); 7768 list_add_tail(&ref_node->node, &file_data->ref_list); 7769 - spin_unlock(&file_data->lock); 7770 percpu_ref_get(&file_data->refs); 7771 return ret; 7772 out_fput: ··· 7923 7924 if (needs_switch) { 7925 percpu_ref_kill(&data->node->refs); 7926 - spin_lock(&data->lock); 7927 list_add_tail(&ref_node->node, &data->ref_list); 7928 data->node = ref_node; 7929 - spin_unlock(&data->lock); 7930 percpu_ref_get(&ctx->file_data->refs); 7931 } else 7932 destroy_fixed_file_ref_node(ref_node); ··· 8054 struct io_sq_data *sqd; 8055 8056 ret = -EPERM; 8057 - if (!capable(CAP_SYS_ADMIN)) 8058 goto err; 8059 8060 sqd = io_get_sq_data(p); ··· 8640 * as nobody else will be looking for them. 
8641 */ 8642 do { 8643 - if (ctx->rings) 8644 - io_cqring_overflow_flush(ctx, true, NULL, NULL); 8645 io_iopoll_try_reap_events(ctx); 8646 } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); 8647 io_ring_ctx_free(ctx); ··· 8649 { 8650 mutex_lock(&ctx->uring_lock); 8651 percpu_ref_kill(&ctx->refs); 8652 mutex_unlock(&ctx->uring_lock); 8653 8654 - io_kill_timeouts(ctx, NULL); 8655 - io_poll_remove_all(ctx, NULL); 8656 8657 if (ctx->io_wq) 8658 io_wq_cancel_all(ctx->io_wq); 8659 8660 /* if we failed setting up the ctx, we might not have any rings */ 8661 - if (ctx->rings) 8662 - io_cqring_overflow_flush(ctx, true, NULL, NULL); 8663 io_iopoll_try_reap_events(ctx); 8664 idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); 8665 ··· 8690 return 0; 8691 } 8692 8693 - static bool io_wq_files_match(struct io_wq_work *work, void *data) 8694 - { 8695 - struct files_struct *files = data; 8696 8697 - return !files || ((work->flags & IO_WQ_WORK_FILES) && 8698 - work->identity->files == files); 8699 - } 8700 - 8701 - /* 8702 - * Returns true if 'preq' is the link parent of 'req' 8703 - */ 8704 - static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) 8705 - { 8706 - struct io_kiocb *link; 8707 - 8708 - if (!(preq->flags & REQ_F_LINK_HEAD)) 8709 - return false; 8710 - 8711 - list_for_each_entry(link, &preq->link_list, link_list) { 8712 - if (link == req) 8713 - return true; 8714 - } 8715 - 8716 - return false; 8717 - } 8718 - 8719 - /* 8720 - * We're looking to cancel 'req' because it's holding on to our files, but 8721 - * 'req' could be a link to another request. See if it is, and cancel that 8722 - * parent request if so. 8723 - */ 8724 - static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req) 8725 - { 8726 - struct hlist_node *tmp; 8727 - struct io_kiocb *preq; 8728 - bool found = false; 8729 - int i; 8730 - 8731 - spin_lock_irq(&ctx->completion_lock); 8732 - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 8733 - struct hlist_head *list; 8734 - 8735 - list = &ctx->cancel_hash[i]; 8736 - hlist_for_each_entry_safe(preq, tmp, list, hash_node) { 8737 - found = io_match_link(preq, req); 8738 - if (found) { 8739 - io_poll_remove_one(preq); 8740 - break; 8741 - } 8742 - } 8743 - } 8744 - spin_unlock_irq(&ctx->completion_lock); 8745 - return found; 8746 - } 8747 - 8748 - static bool io_timeout_remove_link(struct io_ring_ctx *ctx, 8749 - struct io_kiocb *req) 8750 - { 8751 - struct io_kiocb *preq; 8752 - bool found = false; 8753 - 8754 - spin_lock_irq(&ctx->completion_lock); 8755 - list_for_each_entry(preq, &ctx->timeout_list, timeout.list) { 8756 - found = io_match_link(preq, req); 8757 - if (found) { 8758 - __io_timeout_cancel(preq); 8759 - break; 8760 - } 8761 - } 8762 - spin_unlock_irq(&ctx->completion_lock); 8763 - return found; 8764 - } 8765 - 8766 - static bool io_cancel_link_cb(struct io_wq_work *work, void *data) 8767 { 8768 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 8769 bool ret; 8770 8771 - if (req->flags & REQ_F_LINK_TIMEOUT) { 8772 unsigned long flags; 8773 struct io_ring_ctx *ctx = req->ctx; 8774 8775 /* protect against races with linked timeouts */ 8776 spin_lock_irqsave(&ctx->completion_lock, flags); 8777 - ret = io_match_link(req, data); 8778 spin_unlock_irqrestore(&ctx->completion_lock, flags); 8779 } else { 8780 - ret = io_match_link(req, data); 8781 } 8782 return ret; 8783 - } 8784 - 8785 - static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) 8786 - { 8787 - enum io_wq_cancel 
cret; 8788 - 8789 - /* cancel this particular work, if it's running */ 8790 - cret = io_wq_cancel_work(ctx->io_wq, &req->work); 8791 - if (cret != IO_WQ_CANCEL_NOTFOUND) 8792 - return; 8793 - 8794 - /* find links that hold this pending, cancel those */ 8795 - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true); 8796 - if (cret != IO_WQ_CANCEL_NOTFOUND) 8797 - return; 8798 - 8799 - /* if we have a poll link holding this pending, cancel that */ 8800 - if (io_poll_remove_link(ctx, req)) 8801 - return; 8802 - 8803 - /* final option, timeout link is holding this req pending */ 8804 - io_timeout_remove_link(ctx, req); 8805 } 8806 8807 static void io_cancel_defer_files(struct io_ring_ctx *ctx, ··· 8724 8725 spin_lock_irq(&ctx->completion_lock); 8726 list_for_each_entry_reverse(de, &ctx->defer_list, list) { 8727 - if (io_task_match(de->req, task) && 8728 - io_match_files(de->req, files)) { 8729 list_cut_position(&list, &ctx->defer_list, &de->list); 8730 break; 8731 } ··· 8741 } 8742 } 8743 8744 - /* 8745 - * Returns true if we found and killed one or more files pinning requests 8746 - */ 8747 - static bool io_uring_cancel_files(struct io_ring_ctx *ctx, 8748 struct files_struct *files) 8749 { 8750 - if (list_empty_careful(&ctx->inflight_list)) 8751 - return false; 8752 - 8753 - /* cancel all at once, should be faster than doing it one by one*/ 8754 - io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); 8755 - 8756 while (!list_empty_careful(&ctx->inflight_list)) { 8757 - struct io_kiocb *cancel_req = NULL, *req; 8758 DEFINE_WAIT(wait); 8759 8760 spin_lock_irq(&ctx->inflight_lock); 8761 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { 8762 - if (files && (req->work.flags & IO_WQ_WORK_FILES) && 8763 req->work.identity->files != files) 8764 continue; 8765 - /* req is being completed, ignore */ 8766 - if (!refcount_inc_not_zero(&req->refs)) 8767 - continue; 8768 - cancel_req = req; 8769 break; 8770 } 8771 - if (cancel_req) 8772 - prepare_to_wait(&ctx->inflight_wait, &wait, 8773 - TASK_UNINTERRUPTIBLE); 8774 spin_unlock_irq(&ctx->inflight_lock); 8775 8776 /* We need to keep going until we don't find a matching req */ 8777 - if (!cancel_req) 8778 break; 8779 - /* cancel this request, or head link requests */ 8780 - io_attempt_cancel(ctx, cancel_req); 8781 - io_put_req(cancel_req); 8782 /* cancellations _may_ trigger task work */ 8783 io_run_task_work(); 8784 schedule(); 8785 - finish_wait(&ctx->inflight_wait, &wait); 8786 } 8787 - 8788 - return true; 8789 } 8790 8791 - static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 8792 { 8793 - struct io_kiocb *req = container_of(work, struct io_kiocb, work); 8794 - struct task_struct *task = data; 8795 - 8796 - return io_task_match(req, task); 8797 - } 8798 - 8799 - static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, 8800 - struct task_struct *task, 8801 - struct files_struct *files) 8802 - { 8803 - bool ret; 8804 - 8805 - ret = io_uring_cancel_files(ctx, files); 8806 - if (!files) { 8807 enum io_wq_cancel cret; 8808 8809 - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true); 8810 if (cret != IO_WQ_CANCEL_NOTFOUND) 8811 ret = true; 8812 ··· 8798 } 8799 } 8800 8801 - ret |= io_poll_remove_all(ctx, task); 8802 - ret |= io_kill_timeouts(ctx, task); 8803 } 8804 - 8805 - return ret; 8806 } 8807 8808 /* ··· 8823 io_sq_thread_park(ctx->sq_data); 8824 } 8825 8826 - if (files) 8827 - io_cancel_defer_files(ctx, NULL, files); 8828 - else 8829 - io_cancel_defer_files(ctx, task, NULL); 8830 - 
8831 io_cqring_overflow_flush(ctx, true, task, files); 8832 8833 - while (__io_uring_cancel_task_requests(ctx, task, files)) { 8834 - io_run_task_work(); 8835 - cond_resched(); 8836 - } 8837 8838 if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { 8839 atomic_dec(&task->io_uring->in_idle); ··· 9089 finish_wait(&ctx->sqo_sq_wait, &wait); 9090 } 9091 9092 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, 9093 - u32, min_complete, u32, flags, const sigset_t __user *, sig, 9094 - size_t, sigsz) 9095 { 9096 struct io_ring_ctx *ctx; 9097 long ret = -EBADF; ··· 9131 io_run_task_work(); 9132 9133 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 9134 - IORING_ENTER_SQ_WAIT)) 9135 return -EINVAL; 9136 9137 f = fdget(fd); ··· 9158 */ 9159 ret = 0; 9160 if (ctx->flags & IORING_SETUP_SQPOLL) { 9161 if (!list_empty_careful(&ctx->cq_overflow_list)) 9162 io_cqring_overflow_flush(ctx, false, NULL, NULL); 9163 if (flags & IORING_ENTER_SQ_WAKEUP) 9164 wake_up(&ctx->sq_data->wait); 9165 if (flags & IORING_ENTER_SQ_WAIT) ··· 9179 goto out; 9180 } 9181 if (flags & IORING_ENTER_GETEVENTS) { 9182 min_complete = min(min_complete, ctx->cq_entries); 9183 9184 /* ··· 9198 !(ctx->flags & IORING_SETUP_SQPOLL)) { 9199 ret = io_iopoll_check(ctx, min_complete); 9200 } else { 9201 - ret = io_cqring_wait(ctx, min_complete, sig, sigsz); 9202 } 9203 } 9204 ··· 9566 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | 9567 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | 9568 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | 9569 - IORING_FEAT_POLL_32BITS; 9570 9571 if (copy_to_user(params, p, sizeof(*p))) { 9572 ret = -EFAULT;
··· 245 246 struct task_struct *thread; 247 struct wait_queue_head wait; 248 + 249 + unsigned sq_thread_idle; 250 }; 251 252 struct io_ring_ctx { ··· 285 struct list_head timeout_list; 286 struct list_head cq_overflow_list; 287 288 struct io_uring_sqe *sq_sqes; 289 } ____cacheline_aligned_in_smp; 290 ··· 310 struct io_sq_data *sq_data; /* if using sq thread polling */ 311 312 struct wait_queue_head sqo_sq_wait; 313 struct list_head sqd_list; 314 315 /* ··· 395 */ 396 struct io_poll_iocb { 397 struct file *file; 398 + struct wait_queue_head *head; 399 __poll_t events; 400 bool done; 401 bool canceled; 402 struct wait_queue_entry wait; 403 + }; 404 + 405 + struct io_poll_remove { 406 + struct file *file; 407 + u64 addr; 408 }; 409 410 struct io_close { ··· 444 u32 off; 445 u32 target_seq; 446 struct list_head list; 447 + /* head of the link, used by linked timeouts only */ 448 + struct io_kiocb *head; 449 }; 450 451 struct io_timeout_rem { 452 struct file *file; 453 u64 addr; 454 + 455 + /* timeout update */ 456 + struct timespec64 ts; 457 + u32 flags; 458 }; 459 460 struct io_rw { ··· 541 struct statx __user *buffer; 542 }; 543 544 + struct io_shutdown { 545 + struct file *file; 546 + int how; 547 + }; 548 + 549 + struct io_rename { 550 + struct file *file; 551 + int old_dfd; 552 + int new_dfd; 553 + struct filename *oldpath; 554 + struct filename *newpath; 555 + int flags; 556 + }; 557 + 558 + struct io_unlink { 559 + struct file *file; 560 + int dfd; 561 + int flags; 562 + struct filename *filename; 563 + }; 564 + 565 struct io_completion { 566 struct file *file; 567 struct list_head list; ··· 575 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, 576 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, 577 578 REQ_F_FAIL_LINK_BIT, 579 REQ_F_INFLIGHT_BIT, 580 REQ_F_CUR_POS_BIT, ··· 607 /* IOSQE_BUFFER_SELECT */ 608 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), 609 610 /* fail rest of links */ 611 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), 612 /* on inflight list */ ··· 651 struct file *file; 652 struct io_rw rw; 653 struct io_poll_iocb poll; 654 + struct io_poll_remove poll_remove; 655 struct io_accept accept; 656 struct io_sync sync; 657 struct io_cancel cancel; ··· 667 struct io_splice splice; 668 struct io_provide_buf pbuf; 669 struct io_statx statx; 670 + struct io_shutdown shutdown; 671 + struct io_rename rename; 672 + struct io_unlink unlink; 673 /* use only after cleaning per-op data, see io_clean_op() */ 674 struct io_completion compl; 675 }; ··· 686 struct task_struct *task; 687 u64 user_data; 688 689 + struct io_kiocb *link; 690 + struct percpu_ref *fixed_file_refs; 691 692 /* 693 * 1. used with ctx->iopoll_list with reads/writes 694 * 2. to track reqs with ->files (see io_op_def::file_table) 695 */ 696 struct list_head inflight_entry; 697 struct callback_head task_work; 698 /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ 699 struct hlist_node hash_node; ··· 725 void *reqs[IO_IOPOLL_BATCH]; 726 unsigned int free_reqs; 727 728 + bool plug_started; 729 + 730 /* 731 * Batch completion logic 732 */ ··· 735 */ 736 struct file *file; 737 unsigned int fd; 738 + unsigned int file_refs; 739 unsigned int ios_left; 740 }; 741 ··· 757 unsigned buffer_select : 1; 758 /* must always have async data allocated */ 759 unsigned needs_async_data : 1; 760 + /* should block plug */ 761 + unsigned plug : 1; 762 /* size of async data needed, if any */ 763 unsigned short async_size; 764 unsigned work_flags; ··· 770 .pollin = 1, 771 .buffer_select = 1, 772 .needs_async_data = 1, 773 + .plug = 1, 774 .async_size = sizeof(struct io_async_rw), 775 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 776 }, ··· 779 .unbound_nonreg_file = 1, 780 .pollout = 1, 781 .needs_async_data = 1, 782 + .plug = 1, 783 .async_size = sizeof(struct io_async_rw), 784 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 785 IO_WQ_WORK_FSIZE, ··· 791 .needs_file = 1, 792 .unbound_nonreg_file = 1, 793 .pollin = 1, 794 + .plug = 1, 795 .async_size = sizeof(struct io_async_rw), 796 .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, 797 }, ··· 799 .hash_reg_file = 1, 800 .unbound_nonreg_file = 1, 801 .pollout = 1, 802 + .plug = 1, 803 .async_size = sizeof(struct io_async_rw), 804 .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | 805 IO_WQ_WORK_MM, ··· 818 .pollout = 1, 819 .needs_async_data = 1, 820 .async_size = sizeof(struct io_async_msghdr), 821 + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 822 }, 823 [IORING_OP_RECVMSG] = { 824 .needs_file = 1, ··· 828 .buffer_select = 1, 829 .needs_async_data = 1, 830 .async_size = sizeof(struct io_async_msghdr), 831 + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 832 }, 833 [IORING_OP_TIMEOUT] = { 834 .needs_async_data = 1, 835 .async_size = sizeof(struct io_timeout_data), 836 .work_flags = IO_WQ_WORK_MM, 837 }, 838 + [IORING_OP_TIMEOUT_REMOVE] = { 839 + /* used by timeout updates' prep() */ 840 + .work_flags = IO_WQ_WORK_MM, 841 + }, 842 [IORING_OP_ACCEPT] = { 843 .needs_file = 1, 844 .unbound_nonreg_file = 1, ··· 863 }, 864 [IORING_OP_OPENAT] = { 865 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | 866 + IO_WQ_WORK_FS | IO_WQ_WORK_MM, 867 }, 868 [IORING_OP_CLOSE] = { 869 .needs_file = 1, ··· 882 .unbound_nonreg_file = 1, 883 .pollin = 1, 884 .buffer_select = 1, 885 + .plug = 1, 886 .async_size = sizeof(struct io_async_rw), 887 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 888 }, ··· 889 .needs_file = 1, 890 .unbound_nonreg_file = 1, 891 .pollout = 1, 892 + .plug = 1, 893 .async_size = sizeof(struct io_async_rw), 894 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 895 IO_WQ_WORK_FSIZE, ··· 915 }, 916 [IORING_OP_OPENAT2] = { 917 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | 918 + IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, 919 }, 920 [IORING_OP_EPOLL_CTL] = { 921 .unbound_nonreg_file = 1, ··· 933 .needs_file = 1, 934 .hash_reg_file = 1, 935 .unbound_nonreg_file = 1, 936 + }, 937 + [IORING_OP_SHUTDOWN] = { 938 + .needs_file = 1, 939 + }, 940 + [IORING_OP_RENAMEAT] = { 941 + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | 942 + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, 943 + }, 944 + [IORING_OP_UNLINKAT] = { 945 + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | 946 + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, 947 }, 948 }; 949 ··· 983 } 984 EXPORT_SYMBOL(io_uring_get_socket); 985 986 + #define io_for_each_link(pos, head) \ 987 + for (pos = (head); pos; pos = pos->link) 988 + 989 static inline void 
io_clean_op(struct io_kiocb *req) 990 { 991 if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | ··· 990 __io_clean_op(req); 991 } 992 993 + static inline void io_set_resource_node(struct io_kiocb *req) 994 { 995 + struct io_ring_ctx *ctx = req->ctx; 996 + 997 + if (!req->fixed_file_refs) { 998 + req->fixed_file_refs = &ctx->file_data->node->refs; 999 + percpu_ref_get(req->fixed_file_refs); 1000 + } 1001 + } 1002 + 1003 + static bool io_match_task(struct io_kiocb *head, 1004 + struct task_struct *task, 1005 + struct files_struct *files) 1006 + { 1007 + struct io_kiocb *req; 1008 + 1009 + if (task && head->task != task) 1010 + return false; 1011 + if (!files) 1012 + return true; 1013 + 1014 + io_for_each_link(req, head) { 1015 + if ((req->flags & REQ_F_WORK_INITIALIZED) && 1016 + (req->work.flags & IO_WQ_WORK_FILES) && 1017 + req->work.identity->files == files) 1018 + return true; 1019 + } 1020 + return false; 1021 + } 1022 + 1023 + static void io_sq_thread_drop_mm_files(void) 1024 + { 1025 + struct files_struct *files = current->files; 1026 struct mm_struct *mm = current->mm; 1027 1028 if (mm) { ··· 999 mmput(mm); 1000 current->mm = NULL; 1001 } 1002 + if (files) { 1003 + struct nsproxy *nsproxy = current->nsproxy; 1004 + 1005 + task_lock(current); 1006 + current->files = NULL; 1007 + current->nsproxy = NULL; 1008 + task_unlock(current); 1009 + put_files_struct(files); 1010 + put_nsproxy(nsproxy); 1011 + } 1012 + } 1013 + 1014 + static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) 1015 + { 1016 + if (!current->files) { 1017 + struct files_struct *files; 1018 + struct nsproxy *nsproxy; 1019 + 1020 + task_lock(ctx->sqo_task); 1021 + files = ctx->sqo_task->files; 1022 + if (!files) { 1023 + task_unlock(ctx->sqo_task); 1024 + return -EOWNERDEAD; 1025 + } 1026 + atomic_inc(&files->count); 1027 + get_nsproxy(ctx->sqo_task->nsproxy); 1028 + nsproxy = ctx->sqo_task->nsproxy; 1029 + task_unlock(ctx->sqo_task); 1030 + 1031 + task_lock(current); 1032 + current->files = files; 1033 + current->nsproxy = nsproxy; 1034 + task_unlock(current); 1035 + } 1036 + return 0; 1037 } 1038 1039 static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) ··· 1026 return -EFAULT; 1027 } 1028 1029 + static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, 1030 + struct io_kiocb *req) 1031 { 1032 + const struct io_op_def *def = &io_op_defs[req->opcode]; 1033 + int ret; 1034 + 1035 + if (def->work_flags & IO_WQ_WORK_MM) { 1036 + ret = __io_sq_thread_acquire_mm(ctx); 1037 + if (unlikely(ret)) 1038 + return ret; 1039 + } 1040 + 1041 + if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) { 1042 + ret = __io_sq_thread_acquire_files(ctx); 1043 + if (unlikely(ret)) 1044 + return ret; 1045 + } 1046 + 1047 + return 0; 1048 } 1049 1050 static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, ··· 1174 INIT_LIST_HEAD(&ctx->iopoll_list); 1175 INIT_LIST_HEAD(&ctx->defer_list); 1176 INIT_LIST_HEAD(&ctx->timeout_list); 1177 spin_lock_init(&ctx->inflight_lock); 1178 INIT_LIST_HEAD(&ctx->inflight_list); 1179 INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work); ··· 1416 { 1417 struct io_kiocb *cur; 1418 1419 + io_for_each_link(cur, req) 1420 + io_prep_async_work(cur); 1421 } 1422 1423 static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) ··· 1460 } 1461 } 1462 1463 /* 1464 * Returns true if we found and killed one or more timeouts 1465 */ 1466 + static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, 1467 + struct files_struct *files) 1468 { 1469 
struct io_kiocb *req, *tmp; 1470 int canceled = 0; 1471 1472 spin_lock_irq(&ctx->completion_lock); 1473 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { 1474 + if (io_match_task(req, tsk, files)) { 1475 io_kill_timeout(req); 1476 canceled++; 1477 } ··· 1594 } 1595 } 1596 1597 /* Returns true if there are no backlogged entries after the flush */ 1598 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, 1599 struct task_struct *tsk, ··· 1647 1648 cqe = NULL; 1649 list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { 1650 + if (!io_match_task(req, tsk, files)) 1651 continue; 1652 1653 cqe = io_get_cqring(ctx); ··· 1845 static inline void io_put_file(struct io_kiocb *req, struct file *file, 1846 bool fixed) 1847 { 1848 + if (!fixed) 1849 fput(file); 1850 } 1851 ··· 1859 kfree(req->async_data); 1860 if (req->file) 1861 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); 1862 + if (req->fixed_file_refs) 1863 + percpu_ref_put(req->fixed_file_refs); 1864 io_req_clean_work(req); 1865 } 1866 ··· 1882 percpu_ref_put(&ctx->refs); 1883 } 1884 1885 + static inline void io_remove_next_linked(struct io_kiocb *req) 1886 + { 1887 + struct io_kiocb *nxt = req->link; 1888 + 1889 + req->link = nxt->link; 1890 + nxt->link = NULL; 1891 + } 1892 + 1893 static void io_kill_linked_timeout(struct io_kiocb *req) 1894 { 1895 struct io_ring_ctx *ctx = req->ctx; ··· 1890 unsigned long flags; 1891 1892 spin_lock_irqsave(&ctx->completion_lock, flags); 1893 + link = req->link; 1894 + 1895 /* 1896 * Can happen if a linked timeout fired and link had been like 1897 * req -> link t-out -> link t-out [-> ...] ··· 1900 struct io_timeout_data *io = link->async_data; 1901 int ret; 1902 1903 + io_remove_next_linked(req); 1904 + link->timeout.head = NULL; 1905 ret = hrtimer_try_to_cancel(&io->timer); 1906 if (ret != -1) { 1907 io_cqring_fill_event(link, -ECANCELED); ··· 1917 } 1918 } 1919 1920 1921 static void io_fail_links(struct io_kiocb *req) 1922 { 1923 + struct io_kiocb *link, *nxt; 1924 struct io_ring_ctx *ctx = req->ctx; 1925 unsigned long flags; 1926 1927 spin_lock_irqsave(&ctx->completion_lock, flags); 1928 + link = req->link; 1929 + req->link = NULL; 1930 1931 + while (link) { 1932 + nxt = link->link; 1933 + link->link = NULL; 1934 + 1935 trace_io_uring_fail_link(req, link); 1936 io_cqring_fill_event(link, -ECANCELED); 1937 1938 /* ··· 1963 io_put_req_deferred(link, 2); 1964 else 1965 io_double_put_req(link); 1966 + link = nxt; 1967 } 1968 io_commit_cqring(ctx); 1969 spin_unlock_irqrestore(&ctx->completion_lock, flags); 1970 ··· 1973 1974 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) 1975 { 1976 if (req->flags & REQ_F_LINK_TIMEOUT) 1977 io_kill_linked_timeout(req); 1978 ··· 1983 * dependencies to the next request. In case of failure, fail the rest 1984 * of the chain. 
1985 */ 1986 + if (likely(!(req->flags & REQ_F_FAIL_LINK))) { 1987 + struct io_kiocb *nxt = req->link; 1988 + 1989 + req->link = NULL; 1990 + return nxt; 1991 + } 1992 io_fail_links(req); 1993 return NULL; 1994 } 1995 1996 + static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) 1997 { 1998 + if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT))) 1999 return NULL; 2000 return __io_req_find_next(req); 2001 } ··· 2050 { 2051 struct io_ring_ctx *ctx = req->ctx; 2052 2053 + if (!__io_sq_thread_acquire_mm(ctx) && 2054 + !__io_sq_thread_acquire_files(ctx)) { 2055 mutex_lock(&ctx->uring_lock); 2056 __io_queue_sqe(req, NULL); 2057 mutex_unlock(&ctx->uring_lock); ··· 2086 } 2087 } 2088 2089 + static inline void io_queue_next(struct io_kiocb *req) 2090 { 2091 struct io_kiocb *nxt = io_req_find_next(req); 2092 ··· 2143 io_free_req(req); 2144 return; 2145 } 2146 + io_queue_next(req); 2147 2148 if (req->task != rb->task) { 2149 if (rb->task) { ··· 2246 * we wake up the task, and the next invocation will flush the 2247 * entries. We cannot safely to it from here. 2248 */ 2249 + if (noflush) 2250 return -1U; 2251 2252 io_cqring_overflow_flush(ctx, false, NULL, NULL); ··· 2593 if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) 2594 return false; 2595 2596 + ret = io_sq_thread_acquire_mm_files(req->ctx, req); 2597 2598 if (io_resubmit_prep(req, ret)) { 2599 refcount_inc(&req->refs); ··· 2641 * find it from a io_iopoll_getevents() thread before the issuer is done 2642 * accessing the kiocb cookie. 2643 */ 2644 + static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) 2645 { 2646 struct io_ring_ctx *ctx = req->ctx; 2647 ··· 2670 else 2671 list_add_tail(&req->inflight_entry, &ctx->iopoll_list); 2672 2673 + /* 2674 + * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread 2675 + * task context or in io worker task context. If current task context is 2676 + * sq thread, we don't need to check whether should wake up sq thread. 
2677 + */ 2678 + if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) && 2679 wq_has_sleeper(&ctx->sq_data->wait)) 2680 wake_up(&ctx->sq_data->wait); 2681 } 2682 2683 + static inline void __io_state_file_put(struct io_submit_state *state) 2684 { 2685 + fput_many(state->file, state->file_refs); 2686 + state->file_refs = 0; 2687 } 2688 2689 static inline void io_state_file_put(struct io_submit_state *state) 2690 { 2691 + if (state->file_refs) 2692 __io_state_file_put(state); 2693 } 2694 ··· 2698 if (!state) 2699 return fget(fd); 2700 2701 + if (state->file_refs) { 2702 if (state->fd == fd) { 2703 + state->file_refs--; 2704 return state->file; 2705 } 2706 __io_state_file_put(state); 2707 } 2708 state->file = fget_many(fd, state->ios_left); 2709 + if (unlikely(!state->file)) 2710 return NULL; 2711 2712 state->fd = fd; 2713 + state->file_refs = state->ios_left - 1; 2714 return state->file; 2715 } 2716 ··· 3065 return __io_iov_buffer_select(req, iov, needs_lock); 3066 } 3067 3068 + static ssize_t io_import_iovec(int rw, struct io_kiocb *req, 3069 struct iovec **iovec, struct iov_iter *iter, 3070 bool needs_lock) 3071 { ··· 3094 3095 ret = import_single_range(rw, buf, sqe_len, *iovec, iter); 3096 *iovec = NULL; 3097 + return ret; 3098 } 3099 3100 if (req->flags & REQ_F_BUFFER_SELECT) { ··· 3109 3110 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, 3111 req->ctx->compat); 3112 } 3113 3114 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) ··· 3246 struct iovec *iov = iorw->fast_iov; 3247 ssize_t ret; 3248 3249 + ret = io_import_iovec(rw, req, &iov, &iorw->iter, false); 3250 if (unlikely(ret < 0)) 3251 return ret; 3252 ··· 3379 struct iov_iter __iter, *iter = &__iter; 3380 struct io_async_rw *rw = req->async_data; 3381 ssize_t io_size, ret, ret2; 3382 bool no_async; 3383 3384 + if (rw) { 3385 iter = &rw->iter; 3386 + iovec = NULL; 3387 + } else { 3388 + ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); 3389 + if (ret < 0) 3390 + return ret; 3391 + } 3392 + io_size = iov_iter_count(iter); 3393 req->result = io_size; 3394 ret = 0; 3395 ··· 3405 if (no_async) 3406 goto copy_iov; 3407 3408 + ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size); 3409 if (unlikely(ret)) 3410 goto out_free; 3411 ··· 3424 if (req->file->f_flags & O_NONBLOCK) 3425 goto done; 3426 /* some cases will consume bytes even on error returns */ 3427 + iov_iter_revert(iter, io_size - iov_iter_count(iter)); 3428 ret = 0; 3429 goto copy_iov; 3430 } else if (ret < 0) { ··· 3507 struct kiocb *kiocb = &req->rw.kiocb; 3508 struct iov_iter __iter, *iter = &__iter; 3509 struct io_async_rw *rw = req->async_data; 3510 ssize_t ret, ret2, io_size; 3511 3512 + if (rw) { 3513 iter = &rw->iter; 3514 + iovec = NULL; 3515 + } else { 3516 + ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); 3517 + if (ret < 0) 3518 + return ret; 3519 + } 3520 + io_size = iov_iter_count(iter); 3521 req->result = io_size; 3522 3523 /* Ensure we clear previously set non-block flag */ ··· 3535 (req->flags & REQ_F_ISREG)) 3536 goto copy_iov; 3537 3538 + ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size); 3539 if (unlikely(ret)) 3540 goto out_free; 3541 ··· 3578 } else { 3579 copy_iov: 3580 /* some cases will consume bytes even on error returns */ 3581 + iov_iter_revert(iter, io_size - iov_iter_count(iter)); 3582 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); 3583 if (!ret) 3584 return -EAGAIN; ··· 3588 if (iovec) 3589 kfree(iovec); 3590 return ret; 3591 + } 3592 + 3593 + 
static int io_renameat_prep(struct io_kiocb *req, 3594 + const struct io_uring_sqe *sqe) 3595 + { 3596 + struct io_rename *ren = &req->rename; 3597 + const char __user *oldf, *newf; 3598 + 3599 + if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3600 + return -EBADF; 3601 + 3602 + ren->old_dfd = READ_ONCE(sqe->fd); 3603 + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3604 + newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3605 + ren->new_dfd = READ_ONCE(sqe->len); 3606 + ren->flags = READ_ONCE(sqe->rename_flags); 3607 + 3608 + ren->oldpath = getname(oldf); 3609 + if (IS_ERR(ren->oldpath)) 3610 + return PTR_ERR(ren->oldpath); 3611 + 3612 + ren->newpath = getname(newf); 3613 + if (IS_ERR(ren->newpath)) { 3614 + putname(ren->oldpath); 3615 + return PTR_ERR(ren->newpath); 3616 + } 3617 + 3618 + req->flags |= REQ_F_NEED_CLEANUP; 3619 + return 0; 3620 + } 3621 + 3622 + static int io_renameat(struct io_kiocb *req, bool force_nonblock) 3623 + { 3624 + struct io_rename *ren = &req->rename; 3625 + int ret; 3626 + 3627 + if (force_nonblock) 3628 + return -EAGAIN; 3629 + 3630 + ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, 3631 + ren->newpath, ren->flags); 3632 + 3633 + req->flags &= ~REQ_F_NEED_CLEANUP; 3634 + if (ret < 0) 3635 + req_set_fail_links(req); 3636 + io_req_complete(req, ret); 3637 + return 0; 3638 + } 3639 + 3640 + static int io_unlinkat_prep(struct io_kiocb *req, 3641 + const struct io_uring_sqe *sqe) 3642 + { 3643 + struct io_unlink *un = &req->unlink; 3644 + const char __user *fname; 3645 + 3646 + if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3647 + return -EBADF; 3648 + 3649 + un->dfd = READ_ONCE(sqe->fd); 3650 + 3651 + un->flags = READ_ONCE(sqe->unlink_flags); 3652 + if (un->flags & ~AT_REMOVEDIR) 3653 + return -EINVAL; 3654 + 3655 + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3656 + un->filename = getname(fname); 3657 + if (IS_ERR(un->filename)) 3658 + return PTR_ERR(un->filename); 3659 + 3660 + req->flags |= REQ_F_NEED_CLEANUP; 3661 + return 0; 3662 + } 3663 + 3664 + static int io_unlinkat(struct io_kiocb *req, bool force_nonblock) 3665 + { 3666 + struct io_unlink *un = &req->unlink; 3667 + int ret; 3668 + 3669 + if (force_nonblock) 3670 + return -EAGAIN; 3671 + 3672 + if (un->flags & AT_REMOVEDIR) 3673 + ret = do_rmdir(un->dfd, un->filename); 3674 + else 3675 + ret = do_unlinkat(un->dfd, un->filename); 3676 + 3677 + req->flags &= ~REQ_F_NEED_CLEANUP; 3678 + if (ret < 0) 3679 + req_set_fail_links(req); 3680 + io_req_complete(req, ret); 3681 + return 0; 3682 + } 3683 + 3684 + static int io_shutdown_prep(struct io_kiocb *req, 3685 + const struct io_uring_sqe *sqe) 3686 + { 3687 + #if defined(CONFIG_NET) 3688 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3689 + return -EINVAL; 3690 + if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || 3691 + sqe->buf_index) 3692 + return -EINVAL; 3693 + 3694 + req->shutdown.how = READ_ONCE(sqe->len); 3695 + return 0; 3696 + #else 3697 + return -EOPNOTSUPP; 3698 + #endif 3699 + } 3700 + 3701 + static int io_shutdown(struct io_kiocb *req, bool force_nonblock) 3702 + { 3703 + #if defined(CONFIG_NET) 3704 + struct socket *sock; 3705 + int ret; 3706 + 3707 + if (force_nonblock) 3708 + return -EAGAIN; 3709 + 3710 + sock = sock_from_file(req->file); 3711 + if (unlikely(!sock)) 3712 + return -ENOTSOCK; 3713 + 3714 + ret = __sys_shutdown_sock(sock, req->shutdown.how); 3715 + io_req_complete(req, ret); 3716 + return 0; 3717 + #else 3718 + return -EOPNOTSUPP; 3719 + #endif 3720 } 3721 3722 static int __io_splice_prep(struct io_kiocb 
*req, ··· 3804 { 3805 u64 flags, mode; 3806 3807 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3808 return -EINVAL; 3809 mode = READ_ONCE(sqe->len); 3810 flags = READ_ONCE(sqe->open_flags); ··· 3818 size_t len; 3819 int ret; 3820 3821 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3822 return -EINVAL; 3823 how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3824 len = READ_ONCE(sqe->len); ··· 3948 head = idr_find(&ctx->io_buffer_idr, p->bgid); 3949 if (head) 3950 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); 3951 if (ret < 0) 3952 req_set_fail_links(req); 3953 + 3954 + /* need to hold the lock to complete IOPOLL requests */ 3955 + if (ctx->flags & IORING_SETUP_IOPOLL) { 3956 + __io_req_complete(req, ret, 0, cs); 3957 + io_ring_submit_unlock(ctx, !force_nonblock); 3958 + } else { 3959 + io_ring_submit_unlock(ctx, !force_nonblock); 3960 + __io_req_complete(req, ret, 0, cs); 3961 + } 3962 return 0; 3963 } 3964 ··· 4037 } 4038 } 4039 out: 4040 if (ret < 0) 4041 req_set_fail_links(req); 4042 + 4043 + /* need to hold the lock to complete IOPOLL requests */ 4044 + if (ctx->flags & IORING_SETUP_IOPOLL) { 4045 + __io_req_complete(req, ret, 0, cs); 4046 + io_ring_submit_unlock(ctx, !force_nonblock); 4047 + } else { 4048 + io_ring_submit_unlock(ctx, !force_nonblock); 4049 + __io_req_complete(req, ret, 0, cs); 4050 + } 4051 return 0; 4052 } 4053 ··· 4212 io_req_init_async(req); 4213 req->work.flags |= IO_WQ_WORK_NO_CANCEL; 4214 4215 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4216 return -EINVAL; 4217 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || 4218 sqe->rw_flags || sqe->buf_index) ··· 4694 { 4695 struct io_accept *accept = &req->accept; 4696 4697 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4698 return -EINVAL; 4699 if (sqe->ioprio || sqe->len || sqe->buf_index) 4700 return -EINVAL; ··· 4735 struct io_connect *conn = &req->connect; 4736 struct io_async_connect *io = req->async_data; 4737 4738 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4739 return -EINVAL; 4740 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) 4741 return -EINVAL; ··· 5270 /* 5271 * Returns true if we found and killed one or more poll requests 5272 */ 5273 + static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, 5274 + struct files_struct *files) 5275 { 5276 struct hlist_node *tmp; 5277 struct io_kiocb *req; ··· 5282 5283 list = &ctx->cancel_hash[i]; 5284 hlist_for_each_entry_safe(req, tmp, list, hash_node) { 5285 + if (io_match_task(req, tsk, files)) 5286 posted += io_poll_remove_one(req); 5287 } 5288 } ··· 5320 sqe->poll_events) 5321 return -EINVAL; 5322 5323 + req->poll_remove.addr = READ_ONCE(sqe->addr); 5324 return 0; 5325 } 5326 ··· 5331 static int io_poll_remove(struct io_kiocb *req) 5332 { 5333 struct io_ring_ctx *ctx = req->ctx; 5334 int ret; 5335 5336 spin_lock_irq(&ctx->completion_lock); 5337 + ret = io_poll_cancel(ctx, req->poll_remove.addr); 5338 spin_unlock_irq(&ctx->completion_lock); 5339 5340 if (ret < 0) ··· 5429 return HRTIMER_NORESTART; 5430 } 5431 5432 + static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, 5433 + __u64 user_data) 5434 { 5435 + struct io_timeout_data *io; 5436 struct io_kiocb *req; 5437 int ret = -ENOENT; 5438 ··· 5458 } 5459 5460 if (ret == -ENOENT) 5461 + return ERR_PTR(ret); 5462 5463 + io = req->async_data; 5464 + ret = hrtimer_try_to_cancel(&io->timer); 5465 + if (ret == -1) 5466 + return ERR_PTR(-EALREADY); 5467 + list_del_init(&req->timeout.list); 5468 + return req; 5469 + 
} 5470 + 5471 + static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 5472 + { 5473 + struct io_kiocb *req = io_timeout_extract(ctx, user_data); 5474 + 5475 + if (IS_ERR(req)) 5476 + return PTR_ERR(req); 5477 + 5478 + req_set_fail_links(req); 5479 + io_cqring_fill_event(req, -ECANCELED); 5480 + io_put_req_deferred(req, 1); 5481 + return 0; 5482 + } 5483 + 5484 + static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 5485 + struct timespec64 *ts, enum hrtimer_mode mode) 5486 + { 5487 + struct io_kiocb *req = io_timeout_extract(ctx, user_data); 5488 + struct io_timeout_data *data; 5489 + 5490 + if (IS_ERR(req)) 5491 + return PTR_ERR(req); 5492 + 5493 + req->timeout.off = 0; /* noseq */ 5494 + data = req->async_data; 5495 + list_add_tail(&req->timeout.list, &ctx->timeout_list); 5496 + hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode); 5497 + data->timer.function = io_timeout_fn; 5498 + hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); 5499 + return 0; 5500 } 5501 5502 static int io_timeout_remove_prep(struct io_kiocb *req, 5503 const struct io_uring_sqe *sqe) 5504 { 5505 + struct io_timeout_rem *tr = &req->timeout_rem; 5506 + 5507 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5508 return -EINVAL; 5509 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 5510 return -EINVAL; 5511 + if (sqe->ioprio || sqe->buf_index || sqe->len) 5512 return -EINVAL; 5513 5514 + tr->addr = READ_ONCE(sqe->addr); 5515 + tr->flags = READ_ONCE(sqe->timeout_flags); 5516 + if (tr->flags & IORING_TIMEOUT_UPDATE) { 5517 + if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS)) 5518 + return -EINVAL; 5519 + if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) 5520 + return -EFAULT; 5521 + } else if (tr->flags) { 5522 + /* timeout removal doesn't support flags */ 5523 + return -EINVAL; 5524 + } 5525 + 5526 return 0; 5527 } 5528 ··· 5482 */ 5483 static int io_timeout_remove(struct io_kiocb *req) 5484 { 5485 + struct io_timeout_rem *tr = &req->timeout_rem; 5486 struct io_ring_ctx *ctx = req->ctx; 5487 int ret; 5488 5489 spin_lock_irq(&ctx->completion_lock); 5490 + if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) { 5491 + enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS) 5492 + ? 
HRTIMER_MODE_ABS : HRTIMER_MODE_REL; 5493 + 5494 + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); 5495 + } else { 5496 + ret = io_timeout_cancel(ctx, tr->addr); 5497 + } 5498 5499 io_cqring_fill_event(req, ret); 5500 io_commit_cqring(ctx); ··· 5766 return io_remove_buffers_prep(req, sqe); 5767 case IORING_OP_TEE: 5768 return io_tee_prep(req, sqe); 5769 + case IORING_OP_SHUTDOWN: 5770 + return io_shutdown_prep(req, sqe); 5771 + case IORING_OP_RENAMEAT: 5772 + return io_renameat_prep(req, sqe); 5773 + case IORING_OP_UNLINKAT: 5774 + return io_unlinkat_prep(req, sqe); 5775 } 5776 5777 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", ··· 5787 { 5788 struct io_kiocb *pos; 5789 struct io_ring_ctx *ctx = req->ctx; 5790 + u32 total_submitted, nr_reqs = 0; 5791 5792 + io_for_each_link(pos, req) 5793 + nr_reqs++; 5794 5795 total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; 5796 return total_submitted - nr_reqs; ··· 5843 static void io_req_drop_files(struct io_kiocb *req) 5844 { 5845 struct io_ring_ctx *ctx = req->ctx; 5846 + struct io_uring_task *tctx = req->task->io_uring; 5847 unsigned long flags; 5848 5849 spin_lock_irqsave(&ctx->inflight_lock, flags); 5850 list_del(&req->inflight_entry); 5851 + if (atomic_read(&tctx->in_idle)) 5852 + wake_up(&tctx->wait); 5853 spin_unlock_irqrestore(&ctx->inflight_lock, flags); 5854 req->flags &= ~REQ_F_INFLIGHT; 5855 put_files_struct(req->work.identity->files); ··· 5902 case IORING_OP_OPENAT2: 5903 if (req->open.filename) 5904 putname(req->open.filename); 5905 + break; 5906 + case IORING_OP_RENAMEAT: 5907 + putname(req->rename.oldpath); 5908 + putname(req->rename.newpath); 5909 + break; 5910 + case IORING_OP_UNLINKAT: 5911 + putname(req->unlink.filename); 5912 break; 5913 } 5914 req->flags &= ~REQ_F_NEED_CLEANUP; ··· 6009 case IORING_OP_TEE: 6010 ret = io_tee(req, force_nonblock); 6011 break; 6012 + case IORING_OP_SHUTDOWN: 6013 + ret = io_shutdown(req, force_nonblock); 6014 + break; 6015 + case IORING_OP_RENAMEAT: 6016 + ret = io_renameat(req, force_nonblock); 6017 + break; 6018 + case IORING_OP_UNLINKAT: 6019 + ret = io_unlinkat(req, force_nonblock); 6020 + break; 6021 default: 6022 ret = -EINVAL; 6023 break; ··· 6025 if (in_async) 6026 mutex_lock(&ctx->uring_lock); 6027 6028 + io_iopoll_req_issued(req, in_async); 6029 6030 if (in_async) 6031 mutex_unlock(&ctx->uring_lock); ··· 6065 } 6066 6067 if (ret) { 6068 + /* 6069 + * io_iopoll_complete() does not hold completion_lock to complete 6070 + * polled io, so here for polled io, just mark it done and still let 6071 + * io_iopoll_complete() complete it. 
6072 + */ 6073 + if (req->ctx->flags & IORING_SETUP_IOPOLL) { 6074 + struct kiocb *kiocb = &req->rw.kiocb; 6075 + 6076 + kiocb_done(kiocb, ret, NULL); 6077 + } else { 6078 + req_set_fail_links(req); 6079 + io_req_complete(req, ret); 6080 + } 6081 } 6082 6083 return io_steal_work(req); ··· 6092 return NULL; 6093 fd = array_index_nospec(fd, ctx->nr_user_files); 6094 file = io_file_from_index(ctx, fd); 6095 + io_set_resource_node(req); 6096 } else { 6097 trace_io_uring_file_get(ctx, fd); 6098 file = __io_file_get(state, fd); ··· 6104 return file; 6105 } 6106 6107 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) 6108 { 6109 struct io_timeout_data *data = container_of(timer, 6110 struct io_timeout_data, timer); 6111 + struct io_kiocb *prev, *req = data->req; 6112 struct io_ring_ctx *ctx = req->ctx; 6113 unsigned long flags; 6114 6115 spin_lock_irqsave(&ctx->completion_lock, flags); 6116 + prev = req->timeout.head; 6117 + req->timeout.head = NULL; 6118 6119 /* 6120 * We don't expect the list to be empty, that will only happen if we 6121 * race with the completion of the linked work. 6122 */ 6123 + if (prev && refcount_inc_not_zero(&prev->refs)) 6124 + io_remove_next_linked(prev); 6125 + else 6126 + prev = NULL; 6127 spin_unlock_irqrestore(&ctx->completion_lock, flags); 6128 6129 if (prev) { ··· 6158 static void __io_queue_linked_timeout(struct io_kiocb *req) 6159 { 6160 /* 6161 + * If the back reference is NULL, then our linked request finished 6162 + * before we got a chance to setup the timer 6163 */ 6164 + if (req->timeout.head) { 6165 struct io_timeout_data *data = req->async_data; 6166 6167 data->timer.function = io_link_timeout_fn; ··· 6184 6185 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) 6186 { 6187 + struct io_kiocb *nxt = req->link; 6188 6189 + if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) || 6190 + nxt->opcode != IORING_OP_LINK_TIMEOUT) 6191 return NULL; 6192 6193 + nxt->timeout.head = req; 6194 nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; 6195 req->flags |= REQ_F_LINK_TIMEOUT; 6196 return nxt; ··· 6301 io_queue_sqe(req, NULL, cs); 6302 } 6303 6304 + struct io_submit_link { 6305 + struct io_kiocb *head; 6306 + struct io_kiocb *last; 6307 + }; 6308 + 6309 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, 6310 + struct io_submit_link *link, struct io_comp_state *cs) 6311 { 6312 struct io_ring_ctx *ctx = req->ctx; 6313 int ret; ··· 6314 * submitted sync once the chain is complete. If none of those 6315 * conditions are true (normal request), then just queue it. 
6316 */ 6317 + if (link->head) { 6318 + struct io_kiocb *head = link->head; 6319 6320 /* 6321 * Taking sequential execution of a link, draining both sides ··· 6335 return ret; 6336 } 6337 trace_io_uring_link(ctx, req, head); 6338 + link->last->link = req; 6339 + link->last = req; 6340 6341 /* last request of a link, enqueue the link */ 6342 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { 6343 io_queue_link_head(head, cs); 6344 + link->head = NULL; 6345 } 6346 } else { 6347 if (unlikely(ctx->drain_next)) { ··· 6348 ctx->drain_next = 0; 6349 } 6350 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 6351 ret = io_req_defer_prep(req, sqe); 6352 if (unlikely(ret)) 6353 req->flags |= REQ_F_FAIL_LINK; 6354 + link->head = req; 6355 + link->last = req; 6356 } else { 6357 io_queue_sqe(req, sqe, cs); 6358 } ··· 6370 { 6371 if (!list_empty(&state->comp.list)) 6372 io_submit_flush_completions(&state->comp); 6373 + if (state->plug_started) 6374 + blk_finish_plug(&state->plug); 6375 io_state_file_put(state); 6376 if (state->free_reqs) 6377 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); ··· 6382 static void io_submit_state_start(struct io_submit_state *state, 6383 struct io_ring_ctx *ctx, unsigned int max_ios) 6384 { 6385 + state->plug_started = false; 6386 state->comp.nr = 0; 6387 INIT_LIST_HEAD(&state->comp.list); 6388 state->comp.ctx = ctx; 6389 state->free_reqs = 0; 6390 + state->file_refs = 0; 6391 state->ios_left = max_ios; 6392 } 6393 ··· 6482 req->file = NULL; 6483 req->ctx = ctx; 6484 req->flags = 0; 6485 + req->link = NULL; 6486 + req->fixed_file_refs = NULL; 6487 /* one is dropped after submission, the other at completion */ 6488 refcount_set(&req->refs, 2); 6489 req->task = current; ··· 6490 if (unlikely(req->opcode >= IORING_OP_LAST)) 6491 return -EINVAL; 6492 6493 + if (unlikely(io_sq_thread_acquire_mm_files(ctx, req))) 6494 return -EFAULT; 6495 6496 sqe_flags = READ_ONCE(sqe->flags); ··· 6523 /* same numerical values with corresponding REQ_F_*, safe to copy */ 6524 req->flags |= sqe_flags; 6525 6526 + /* 6527 + * Plug now if we have more than 1 IO left after this, and the target 6528 + * is potentially a read/write to block based storage. 
6529 + */ 6530 + if (!state->plug_started && state->ios_left > 1 && 6531 + io_op_defs[req->opcode].plug) { 6532 + blk_start_plug(&state->plug); 6533 + state->plug_started = true; 6534 + } 6535 6536 + ret = 0; 6537 + if (io_op_defs[req->opcode].needs_file) { 6538 + bool fixed = req->flags & REQ_F_FIXED_FILE; 6539 + 6540 + req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); 6541 + if (unlikely(!req->file && 6542 + !io_op_defs[req->opcode].needs_file_no_error)) 6543 + ret = -EBADF; 6544 + } 6545 + 6546 state->ios_left--; 6547 return ret; 6548 } ··· 6534 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) 6535 { 6536 struct io_submit_state state; 6537 + struct io_submit_link link; 6538 int i, submitted = 0; 6539 6540 /* if we have a backlog and couldn't flush it all, return BUSY */ ··· 6554 refcount_add(nr, &current->usage); 6555 6556 io_submit_state_start(&state, ctx, nr); 6557 + link.head = NULL; 6558 6559 for (i = 0; i < nr; i++) { 6560 const struct io_uring_sqe *sqe; ··· 6599 percpu_counter_sub(&tctx->inflight, unused); 6600 put_task_struct_many(current, unused); 6601 } 6602 + if (link.head) 6603 + io_queue_link_head(link.head, &state.comp); 6604 io_submit_state_end(&state); 6605 6606 /* Commit SQ ring head once we've consumed and submitted all SQEs */ ··· 6624 spin_unlock_irq(&ctx->completion_lock); 6625 } 6626 6627 + static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) 6628 { 6629 unsigned int to_submit; 6630 int ret = 0; 6631 6632 to_submit = io_sqring_entries(ctx); 6633 /* if we're handling multiple rings, cap submit size for fairness */ 6634 if (cap_entries && to_submit > 8) 6635 to_submit = 8; 6636 6637 + if (!list_empty(&ctx->iopoll_list) || to_submit) { 6638 + unsigned nr_events = 0; 6639 + 6640 + mutex_lock(&ctx->uring_lock); 6641 + if (!list_empty(&ctx->iopoll_list)) 6642 + io_do_iopoll(ctx, &nr_events, 0); 6643 + 6644 + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs))) 6645 + ret = io_submit_sqes(ctx, to_submit); 6646 + mutex_unlock(&ctx->uring_lock); 6647 + } 6648 6649 if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) 6650 wake_up(&ctx->sqo_sq_wait); 6651 6652 + return ret; 6653 + } 6654 + 6655 + static void io_sqd_update_thread_idle(struct io_sq_data *sqd) 6656 + { 6657 + struct io_ring_ctx *ctx; 6658 + unsigned sq_thread_idle = 0; 6659 + 6660 + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 6661 + if (sq_thread_idle < ctx->sq_thread_idle) 6662 + sq_thread_idle = ctx->sq_thread_idle; 6663 + } 6664 + 6665 + sqd->sq_thread_idle = sq_thread_idle; 6666 } 6667 6668 static void io_sqd_init_new(struct io_sq_data *sqd) ··· 6737 6738 while (!list_empty(&sqd->ctx_new_list)) { 6739 ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list); 6740 list_move_tail(&ctx->sqd_list, &sqd->ctx_list); 6741 complete(&ctx->sq_thread_comp); 6742 } 6743 + 6744 + io_sqd_update_thread_idle(sqd); 6745 } 6746 6747 static int io_sq_thread(void *data) 6748 { 6749 struct cgroup_subsys_state *cur_css = NULL; 6750 + struct files_struct *old_files = current->files; 6751 + struct nsproxy *old_nsproxy = current->nsproxy; 6752 const struct cred *old_cred = NULL; 6753 struct io_sq_data *sqd = data; 6754 struct io_ring_ctx *ctx; 6755 + unsigned long timeout = 0; 6756 + DEFINE_WAIT(wait); 6757 6758 + task_lock(current); 6759 + current->files = NULL; 6760 + current->nsproxy = NULL; 6761 + task_unlock(current); 6762 + 6763 while (!kthread_should_stop()) { 6764 + int ret; 6765 + bool cap_entries, sqt_spin, needs_sched; 6766 6767 /* 
6768 * Any changes to the sqd lists are synchronized through the 6769 * kthread parking. This synchronizes the thread vs users, 6770 * the users are synchronized on the sqd->ctx_lock. 6771 */ 6772 + if (kthread_should_park()) { 6773 kthread_parkme(); 6774 + /* 6775 + * When sq thread is unparked, in case the previous park operation 6776 + * comes from io_put_sq_data(), which means that sq thread is going 6777 + * to be stopped, so here needs to have a check. 6778 + */ 6779 + if (kthread_should_stop()) 6780 + break; 6781 + } 6782 6783 + if (unlikely(!list_empty(&sqd->ctx_new_list))) { 6784 io_sqd_init_new(sqd); 6785 + timeout = jiffies + sqd->sq_thread_idle; 6786 + } 6787 6788 + sqt_spin = false; 6789 cap_entries = !list_is_singular(&sqd->ctx_list); 6790 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 6791 if (current->cred != ctx->creds) { 6792 if (old_cred) ··· 6782 current->sessionid = ctx->sessionid; 6783 #endif 6784 6785 + ret = __io_sq_thread(ctx, cap_entries); 6786 + if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) 6787 + sqt_spin = true; 6788 6789 + io_sq_thread_drop_mm_files(); 6790 } 6791 6792 + if (sqt_spin || !time_after(jiffies, timeout)) { 6793 io_run_task_work(); 6794 cond_resched(); 6795 + if (sqt_spin) 6796 + timeout = jiffies + sqd->sq_thread_idle; 6797 + continue; 6798 + } 6799 + 6800 + if (kthread_should_park()) 6801 + continue; 6802 + 6803 + needs_sched = true; 6804 + prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); 6805 + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 6806 + if ((ctx->flags & IORING_SETUP_IOPOLL) && 6807 + !list_empty_careful(&ctx->iopoll_list)) { 6808 + needs_sched = false; 6809 + break; 6810 + } 6811 + if (io_sqring_entries(ctx)) { 6812 + needs_sched = false; 6813 + break; 6814 + } 6815 + } 6816 + 6817 + if (needs_sched) { 6818 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 6819 io_ring_set_wakeup_flag(ctx); 6820 + 6821 schedule(); 6822 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 6823 io_ring_clear_wakeup_flag(ctx); 6824 } 6825 + 6826 + finish_wait(&sqd->wait, &wait); 6827 + timeout = jiffies + sqd->sq_thread_idle; 6828 } 6829 6830 io_run_task_work(); ··· 6808 io_sq_thread_unassociate_blkcg(); 6809 if (old_cred) 6810 revert_creds(old_cred); 6811 + 6812 + task_lock(current); 6813 + current->files = old_files; 6814 + current->nsproxy = old_nsproxy; 6815 + task_unlock(current); 6816 6817 kthread_parkme(); 6818 ··· 6863 * application must reap them itself, as they reside on the shared cq ring. 
6864 */ 6865 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, 6866 + const sigset_t __user *sig, size_t sigsz, 6867 + struct __kernel_timespec __user *uts) 6868 { 6869 struct io_wait_queue iowq = { 6870 .wq = { ··· 6875 .to_wait = min_events, 6876 }; 6877 struct io_rings *rings = ctx->rings; 6878 + struct timespec64 ts; 6879 + signed long timeout = 0; 6880 int ret = 0; 6881 6882 do { ··· 6897 return ret; 6898 } 6899 6900 + if (uts) { 6901 + if (get_timespec64(&ts, uts)) 6902 + return -EFAULT; 6903 + timeout = timespec64_to_jiffies(&ts); 6904 + } 6905 + 6906 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 6907 trace_io_uring_cqring_wait(ctx, min_events); 6908 do { ··· 6910 break; 6911 if (io_should_wake(&iowq, false)) 6912 break; 6913 + if (uts) { 6914 + timeout = schedule_timeout(timeout); 6915 + if (timeout == 0) { 6916 + ret = -ETIME; 6917 + break; 6918 + } 6919 + } else { 6920 + schedule(); 6921 + } 6922 } while (1); 6923 finish_wait(&ctx->wait, &iowq.wq); 6924 ··· 6959 if (!data) 6960 return -ENXIO; 6961 6962 + spin_lock_bh(&data->lock); 6963 ref_node = data->node; 6964 + spin_unlock_bh(&data->lock); 6965 if (ref_node) 6966 percpu_ref_kill(&ref_node->refs); 6967 ··· 7084 7085 mutex_lock(&sqd->ctx_lock); 7086 list_del(&ctx->sqd_list); 7087 + io_sqd_update_thread_idle(sqd); 7088 mutex_unlock(&sqd->ctx_lock); 7089 7090 + if (sqd->thread) 7091 io_sq_thread_unpark(sqd); 7092 7093 io_put_sq_data(sqd); 7094 ctx->sq_data = NULL; ··· 7344 data = ref_node->file_data; 7345 ctx = data->ctx; 7346 7347 + spin_lock_bh(&data->lock); 7348 ref_node->done = true; 7349 7350 while (!list_empty(&data->ref_list)) { ··· 7356 list_del(&ref_node->node); 7357 first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist); 7358 } 7359 + spin_unlock_bh(&data->lock); 7360 7361 if (percpu_ref_is_dying(&data->refs)) 7362 delay = 0; ··· 7479 } 7480 7481 file_data->node = ref_node; 7482 + spin_lock_bh(&file_data->lock); 7483 list_add_tail(&ref_node->node, &file_data->ref_list); 7484 + spin_unlock_bh(&file_data->lock); 7485 percpu_ref_get(&file_data->refs); 7486 return ret; 7487 out_fput: ··· 7638 7639 if (needs_switch) { 7640 percpu_ref_kill(&data->node->refs); 7641 + spin_lock_bh(&data->lock); 7642 list_add_tail(&ref_node->node, &data->ref_list); 7643 data->node = ref_node; 7644 + spin_unlock_bh(&data->lock); 7645 percpu_ref_get(&ctx->file_data->refs); 7646 } else 7647 destroy_fixed_file_ref_node(ref_node); ··· 7769 struct io_sq_data *sqd; 7770 7771 ret = -EPERM; 7772 + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) 7773 goto err; 7774 7775 sqd = io_get_sq_data(p); ··· 8355 * as nobody else will be looking for them. 
8356 */ 8357 do { 8358 io_iopoll_try_reap_events(ctx); 8359 } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); 8360 io_ring_ctx_free(ctx); ··· 8366 { 8367 mutex_lock(&ctx->uring_lock); 8368 percpu_ref_kill(&ctx->refs); 8369 + if (ctx->rings) 8370 + io_cqring_overflow_flush(ctx, true, NULL, NULL); 8371 mutex_unlock(&ctx->uring_lock); 8372 8373 + io_kill_timeouts(ctx, NULL, NULL); 8374 + io_poll_remove_all(ctx, NULL, NULL); 8375 8376 if (ctx->io_wq) 8377 io_wq_cancel_all(ctx->io_wq); 8378 8379 /* if we failed setting up the ctx, we might not have any rings */ 8380 io_iopoll_try_reap_events(ctx); 8381 idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); 8382 ··· 8407 return 0; 8408 } 8409 8410 + struct io_task_cancel { 8411 + struct task_struct *task; 8412 + struct files_struct *files; 8413 + }; 8414 8415 + static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 8416 { 8417 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 8418 + struct io_task_cancel *cancel = data; 8419 bool ret; 8420 8421 + if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) { 8422 unsigned long flags; 8423 struct io_ring_ctx *ctx = req->ctx; 8424 8425 /* protect against races with linked timeouts */ 8426 spin_lock_irqsave(&ctx->completion_lock, flags); 8427 + ret = io_match_task(req, cancel->task, cancel->files); 8428 spin_unlock_irqrestore(&ctx->completion_lock, flags); 8429 } else { 8430 + ret = io_match_task(req, cancel->task, cancel->files); 8431 } 8432 return ret; 8433 } 8434 8435 static void io_cancel_defer_files(struct io_ring_ctx *ctx, ··· 8530 8531 spin_lock_irq(&ctx->completion_lock); 8532 list_for_each_entry_reverse(de, &ctx->defer_list, list) { 8533 + if (io_match_task(de->req, task, files)) { 8534 list_cut_position(&list, &ctx->defer_list, &de->list); 8535 break; 8536 } ··· 8548 } 8549 } 8550 8551 + static void io_uring_cancel_files(struct io_ring_ctx *ctx, 8552 + struct task_struct *task, 8553 struct files_struct *files) 8554 { 8555 while (!list_empty_careful(&ctx->inflight_list)) { 8556 + struct io_task_cancel cancel = { .task = task, .files = files }; 8557 + struct io_kiocb *req; 8558 DEFINE_WAIT(wait); 8559 + bool found = false; 8560 8561 spin_lock_irq(&ctx->inflight_lock); 8562 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { 8563 + if (req->task != task || 8564 req->work.identity->files != files) 8565 continue; 8566 + found = true; 8567 break; 8568 } 8569 + if (found) 8570 + prepare_to_wait(&task->io_uring->wait, &wait, 8571 + TASK_UNINTERRUPTIBLE); 8572 spin_unlock_irq(&ctx->inflight_lock); 8573 8574 /* We need to keep going until we don't find a matching req */ 8575 + if (!found) 8576 break; 8577 + 8578 + io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); 8579 + io_poll_remove_all(ctx, task, files); 8580 + io_kill_timeouts(ctx, task, files); 8581 /* cancellations _may_ trigger task work */ 8582 io_run_task_work(); 8583 schedule(); 8584 + finish_wait(&task->io_uring->wait, &wait); 8585 } 8586 } 8587 8588 + static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, 8589 + struct task_struct *task) 8590 { 8591 + while (1) { 8592 + struct io_task_cancel cancel = { .task = task, .files = NULL, }; 8593 enum io_wq_cancel cret; 8594 + bool ret = false; 8595 8596 + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); 8597 if (cret != IO_WQ_CANCEL_NOTFOUND) 8598 ret = true; 8599 ··· 8625 } 8626 } 8627 8628 + ret |= io_poll_remove_all(ctx, task, NULL); 8629 + ret |= io_kill_timeouts(ctx, task, NULL); 
8630 + if (!ret) 8631 + break; 8632 + io_run_task_work(); 8633 + cond_resched(); 8634 } 8635 } 8636 8637 /* ··· 8648 io_sq_thread_park(ctx->sq_data); 8649 } 8650 8651 + io_cancel_defer_files(ctx, task, files); 8652 + io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); 8653 io_cqring_overflow_flush(ctx, true, task, files); 8654 + io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); 8655 8656 + if (!files) 8657 + __io_uring_cancel_task_requests(ctx, task); 8658 + else 8659 + io_uring_cancel_files(ctx, task, files); 8660 8661 if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { 8662 atomic_dec(&task->io_uring->in_idle); ··· 8916 finish_wait(&ctx->sqo_sq_wait, &wait); 8917 } 8918 8919 + static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, 8920 + struct __kernel_timespec __user **ts, 8921 + const sigset_t __user **sig) 8922 + { 8923 + struct io_uring_getevents_arg arg; 8924 + 8925 + /* 8926 + * If EXT_ARG isn't set, then we have no timespec and the argp pointer 8927 + * is just a pointer to the sigset_t. 8928 + */ 8929 + if (!(flags & IORING_ENTER_EXT_ARG)) { 8930 + *sig = (const sigset_t __user *) argp; 8931 + *ts = NULL; 8932 + return 0; 8933 + } 8934 + 8935 + /* 8936 + * EXT_ARG is set - ensure we agree on the size of it and copy in our 8937 + * timespec and sigset_t pointers if good. 8938 + */ 8939 + if (*argsz != sizeof(arg)) 8940 + return -EINVAL; 8941 + if (copy_from_user(&arg, argp, sizeof(arg))) 8942 + return -EFAULT; 8943 + *sig = u64_to_user_ptr(arg.sigmask); 8944 + *argsz = arg.sigmask_sz; 8945 + *ts = u64_to_user_ptr(arg.ts); 8946 + return 0; 8947 + } 8948 + 8949 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, 8950 + u32, min_complete, u32, flags, const void __user *, argp, 8951 + size_t, argsz) 8952 { 8953 struct io_ring_ctx *ctx; 8954 long ret = -EBADF; ··· 8928 io_run_task_work(); 8929 8930 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 8931 + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)) 8932 return -EINVAL; 8933 8934 f = fdget(fd); ··· 8955 */ 8956 ret = 0; 8957 if (ctx->flags & IORING_SETUP_SQPOLL) { 8958 + io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); 8959 if (!list_empty_careful(&ctx->cq_overflow_list)) 8960 io_cqring_overflow_flush(ctx, false, NULL, NULL); 8961 + io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); 8962 if (flags & IORING_ENTER_SQ_WAKEUP) 8963 wake_up(&ctx->sq_data->wait); 8964 if (flags & IORING_ENTER_SQ_WAIT) ··· 8974 goto out; 8975 } 8976 if (flags & IORING_ENTER_GETEVENTS) { 8977 + const sigset_t __user *sig; 8978 + struct __kernel_timespec __user *ts; 8979 + 8980 + ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 8981 + if (unlikely(ret)) 8982 + goto out; 8983 + 8984 min_complete = min(min_complete, ctx->cq_entries); 8985 8986 /* ··· 8986 !(ctx->flags & IORING_SETUP_SQPOLL)) { 8987 ret = io_iopoll_check(ctx, min_complete); 8988 } else { 8989 + ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); 8990 } 8991 } 8992 ··· 9354 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | 9355 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | 9356 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | 9357 + IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | 9358 + IORING_FEAT_EXT_ARG; 9359 9360 if (copy_to_user(params, p, sizeof(*p))) { 9361 ret = -EFAULT;
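A note on the timeout update added above: io_timeout_remove_prep() reads the target timeout's user_data from sqe->addr, the new expiration from sqe->addr2, and the mode from sqe->timeout_flags, with len/ioprio/buf_index required to be zero. Below is a minimal userspace sketch that fills the raw SQE fields by hand, since liburing may not yet ship a dedicated prep helper at this point; update_timeout() and the 0xdeadbeef user_data are illustrative, not part of this series.

/* Hedged sketch: rearm an already-queued IORING_OP_TIMEOUT in place.
 * 'target_user_data' is the user_data of the original timeout SQE.
 * Assumes an initialized ring and headers defining IORING_TIMEOUT_UPDATE. */
#include <errno.h>
#include <string.h>
#include <liburing.h>

static int update_timeout(struct io_uring *ring, __u64 target_user_data,
			  struct __kernel_timespec *new_ts)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EBUSY;
	memset(sqe, 0, sizeof(*sqe));           /* len/ioprio/buf_index must be 0 */
	sqe->opcode = IORING_OP_TIMEOUT_REMOVE;
	sqe->fd = -1;                           /* no file needed                 */
	sqe->addr = target_user_data;           /* which timeout to update        */
	sqe->addr2 = (unsigned long) new_ts;    /* new expiration                 */
	sqe->timeout_flags = IORING_TIMEOUT_UPDATE; /* or-in IORING_TIMEOUT_ABS   */
	sqe->user_data = 0xdeadbeef;            /* CQE for the update itself      */
	return io_uring_submit(ring);
}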
+22 -18
fs/namei.c
··· 4346 } 4347 EXPORT_SYMBOL(vfs_rename); 4348 4349 - static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, 4350 - const char __user *newname, unsigned int flags) 4351 { 4352 struct dentry *old_dentry, *new_dentry; 4353 struct dentry *trap; ··· 4355 struct qstr old_last, new_last; 4356 int old_type, new_type; 4357 struct inode *delegated_inode = NULL; 4358 - struct filename *from; 4359 - struct filename *to; 4360 unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; 4361 bool should_retry = false; 4362 - int error; 4363 4364 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 4365 - return -EINVAL; 4366 4367 if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && 4368 (flags & RENAME_EXCHANGE)) 4369 - return -EINVAL; 4370 4371 if (flags & RENAME_EXCHANGE) 4372 target_flags = 0; 4373 4374 retry: 4375 - from = filename_parentat(olddfd, getname(oldname), lookup_flags, 4376 - &old_path, &old_last, &old_type); 4377 if (IS_ERR(from)) { 4378 error = PTR_ERR(from); 4379 - goto exit; 4380 } 4381 4382 - to = filename_parentat(newdfd, getname(newname), lookup_flags, 4383 - &new_path, &new_last, &new_type); 4384 if (IS_ERR(to)) { 4385 error = PTR_ERR(to); 4386 goto exit1; ··· 4471 if (retry_estale(error, lookup_flags)) 4472 should_retry = true; 4473 path_put(&new_path); 4474 - putname(to); 4475 exit1: 4476 path_put(&old_path); 4477 - putname(from); 4478 if (should_retry) { 4479 should_retry = false; 4480 lookup_flags |= LOOKUP_REVAL; 4481 goto retry; 4482 } 4483 - exit: 4484 return error; 4485 } 4486 4487 SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, 4488 int, newdfd, const char __user *, newname, unsigned int, flags) 4489 { 4490 - return do_renameat2(olddfd, oldname, newdfd, newname, flags); 4491 } 4492 4493 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, 4494 int, newdfd, const char __user *, newname) 4495 { 4496 - return do_renameat2(olddfd, oldname, newdfd, newname, 0); 4497 } 4498 4499 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) 4500 { 4501 - return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); 4502 } 4503 4504 int readlink_copy(char __user *buffer, int buflen, const char *link)
··· 4346 } 4347 EXPORT_SYMBOL(vfs_rename); 4348 4349 + int do_renameat2(int olddfd, struct filename *from, int newdfd, 4350 + struct filename *to, unsigned int flags) 4351 { 4352 struct dentry *old_dentry, *new_dentry; 4353 struct dentry *trap; ··· 4355 struct qstr old_last, new_last; 4356 int old_type, new_type; 4357 struct inode *delegated_inode = NULL; 4358 unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; 4359 bool should_retry = false; 4360 + int error = -EINVAL; 4361 4362 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 4363 + goto put_both; 4364 4365 if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && 4366 (flags & RENAME_EXCHANGE)) 4367 + goto put_both; 4368 4369 if (flags & RENAME_EXCHANGE) 4370 target_flags = 0; 4371 4372 retry: 4373 + from = filename_parentat(olddfd, from, lookup_flags, &old_path, 4374 + &old_last, &old_type); 4375 if (IS_ERR(from)) { 4376 error = PTR_ERR(from); 4377 + goto put_new; 4378 } 4379 4380 + to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, 4381 + &new_type); 4382 if (IS_ERR(to)) { 4383 error = PTR_ERR(to); 4384 goto exit1; ··· 4473 if (retry_estale(error, lookup_flags)) 4474 should_retry = true; 4475 path_put(&new_path); 4476 exit1: 4477 path_put(&old_path); 4478 if (should_retry) { 4479 should_retry = false; 4480 lookup_flags |= LOOKUP_REVAL; 4481 goto retry; 4482 } 4483 + put_both: 4484 + if (!IS_ERR(from)) 4485 + putname(from); 4486 + put_new: 4487 + if (!IS_ERR(to)) 4488 + putname(to); 4489 return error; 4490 } 4491 4492 SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, 4493 int, newdfd, const char __user *, newname, unsigned int, flags) 4494 { 4495 + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 4496 + flags); 4497 } 4498 4499 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, 4500 int, newdfd, const char __user *, newname) 4501 { 4502 + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 4503 + 0); 4504 } 4505 4506 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) 4507 { 4508 + return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, 4509 + getname(newname), 0); 4510 } 4511 4512 int readlink_copy(char __user *buffer, int buflen, const char *link)
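The do_renameat2() rework above is the plumbing that lets io_renameat() hand over pre-resolved struct filename pointers. From userspace, the new rename/unlink opcodes are driven through ordinary SQE fields; below is a hedged sketch of the layout as read by io_renameat_prep() and io_unlinkat_prep() in the io_uring.c hunk. The prep_renameat()/prep_unlinkat() helper names are illustrative only, and the rename_flags/unlink_flags members assume SQE definitions at least as new as the uapi change later in this series.

/* Hedged sketch: fill raw SQEs for the new rename/unlink opcodes.
 * Not liburing API; callers still set user_data and submit the ring. */
#include <fcntl.h>
#include <string.h>
#include <liburing.h>

static void prep_renameat(struct io_uring_sqe *sqe, int olddfd, const char *oldpath,
			  int newdfd, const char *newpath, unsigned int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_RENAMEAT;
	sqe->fd = olddfd;                      /* old dirfd */
	sqe->addr = (unsigned long) oldpath;   /* old path  */
	sqe->len = newdfd;                     /* new dirfd */
	sqe->addr2 = (unsigned long) newpath;  /* new path  */
	sqe->rename_flags = flags;             /* RENAME_* as for renameat2(2) */
}

static void prep_unlinkat(struct io_uring_sqe *sqe, int dfd, const char *path,
			  unsigned int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_UNLINKAT;
	sqe->fd = dfd;                         /* dirfd             */
	sqe->addr = (unsigned long) path;      /* path to remove    */
	sqe->unlink_flags = flags;             /* 0 or AT_REMOVEDIR */
}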
+1
include/linux/socket.h
··· 436 int __user *usockaddr_len); 437 extern int __sys_socketpair(int family, int type, int protocol, 438 int __user *usockvec); 439 extern int __sys_shutdown(int fd, int how); 440 441 extern struct ns_common *get_net_ns(struct ns_common *ns);
··· 436 int __user *usockaddr_len); 437 extern int __sys_socketpair(int family, int type, int protocol, 438 int __user *usockvec); 439 + extern int __sys_shutdown_sock(struct socket *sock, int how); 440 extern int __sys_shutdown(int fd, int how); 441 442 extern struct ns_common *get_net_ns(struct ns_common *ns);
+1 -1
include/linux/syscalls.h
··· 317 struct io_uring_params __user *p); 318 asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, 319 u32 min_complete, u32 flags, 320 - const sigset_t __user *sig, size_t sigsz); 321 asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op, 322 void __user *arg, unsigned int nr_args); 323
··· 317 struct io_uring_params __user *p); 318 asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, 319 u32 min_complete, u32 flags, 320 + const void __user *argp, size_t argsz); 321 asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op, 322 void __user *arg, unsigned int nr_args); 323
+16
include/uapi/linux/io_uring.h
··· 42 __u32 statx_flags; 43 __u32 fadvise_advice; 44 __u32 splice_flags; 45 }; 46 __u64 user_data; /* data to be passed back at completion time */ 47 union { ··· 134 IORING_OP_PROVIDE_BUFFERS, 135 IORING_OP_REMOVE_BUFFERS, 136 IORING_OP_TEE, 137 138 /* this goes last, obviously */ 139 IORING_OP_LAST, ··· 151 * sqe->timeout_flags 152 */ 153 #define IORING_TIMEOUT_ABS (1U << 0) 154 155 /* 156 * sqe->splice_flags ··· 232 #define IORING_ENTER_GETEVENTS (1U << 0) 233 #define IORING_ENTER_SQ_WAKEUP (1U << 1) 234 #define IORING_ENTER_SQ_WAIT (1U << 2) 235 236 /* 237 * Passed in for io_uring_setup(2). Copied back with updated info on success ··· 260 #define IORING_FEAT_CUR_PERSONALITY (1U << 4) 261 #define IORING_FEAT_FAST_POLL (1U << 5) 262 #define IORING_FEAT_POLL_32BITS (1U << 6) 263 264 /* 265 * io_uring_register(2) opcodes and arguments ··· 336 IORING_RESTRICTION_SQE_FLAGS_REQUIRED = 3, 337 338 IORING_RESTRICTION_LAST 339 }; 340 341 #endif
··· 42 __u32 statx_flags; 43 __u32 fadvise_advice; 44 __u32 splice_flags; 45 + __u32 rename_flags; 46 + __u32 unlink_flags; 47 }; 48 __u64 user_data; /* data to be passed back at completion time */ 49 union { ··· 132 IORING_OP_PROVIDE_BUFFERS, 133 IORING_OP_REMOVE_BUFFERS, 134 IORING_OP_TEE, 135 + IORING_OP_SHUTDOWN, 136 + IORING_OP_RENAMEAT, 137 + IORING_OP_UNLINKAT, 138 139 /* this goes last, obviously */ 140 IORING_OP_LAST, ··· 146 * sqe->timeout_flags 147 */ 148 #define IORING_TIMEOUT_ABS (1U << 0) 149 + #define IORING_TIMEOUT_UPDATE (1U << 1) 150 151 /* 152 * sqe->splice_flags ··· 226 #define IORING_ENTER_GETEVENTS (1U << 0) 227 #define IORING_ENTER_SQ_WAKEUP (1U << 1) 228 #define IORING_ENTER_SQ_WAIT (1U << 2) 229 + #define IORING_ENTER_EXT_ARG (1U << 3) 230 231 /* 232 * Passed in for io_uring_setup(2). Copied back with updated info on success ··· 253 #define IORING_FEAT_CUR_PERSONALITY (1U << 4) 254 #define IORING_FEAT_FAST_POLL (1U << 5) 255 #define IORING_FEAT_POLL_32BITS (1U << 6) 256 + #define IORING_FEAT_SQPOLL_NONFIXED (1U << 7) 257 + #define IORING_FEAT_EXT_ARG (1U << 8) 258 259 /* 260 * io_uring_register(2) opcodes and arguments ··· 327 IORING_RESTRICTION_SQE_FLAGS_REQUIRED = 3, 328 329 IORING_RESTRICTION_LAST 330 + }; 331 + 332 + struct io_uring_getevents_arg { 333 + __u64 sigmask; 334 + __u32 sigmask_sz; 335 + __u32 pad; 336 + __u64 ts; 337 }; 338 339 #endif
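The EXT_ARG additions above pair with io_get_ext_arg() in the io_uring.c hunk: when IORING_ENTER_EXT_ARG is set, the last two io_uring_enter(2) arguments become a pointer to struct io_uring_getevents_arg and its size, which is how a CQ-side thread can now wait with a timeout without submitting a timeout SQE. A hedged sketch of waiting for one completion with a 1-second timeout by invoking the syscall directly follows, assuming headers that already carry these definitions and a ring whose io_uring_setup() reported IORING_FEAT_EXT_ARG; wait_cqe_timeout() is an illustrative name.

/* Hedged sketch: wait up to 1s for a CQE using IORING_ENTER_EXT_ARG. */
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <linux/time_types.h>

static int wait_cqe_timeout(int ring_fd)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_getevents_arg arg = {
		.sigmask	= 0,                   /* no sigmask swap  */
		.sigmask_sz	= 0,
		.ts		= (unsigned long) &ts, /* relative timeout */
	};
	int ret;

	ret = syscall(__NR_io_uring_enter, ring_fd, 0, 1,
		      IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		      &arg, sizeof(arg));
	return ret < 0 ? -errno : ret;                 /* -ETIME on timeout */
}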
+12 -3
net/socket.c
··· 2175 * Shutdown a socket. 2176 */ 2177 2178 int __sys_shutdown(int fd, int how) 2179 { 2180 int err, fput_needed; ··· 2193 2194 sock = sockfd_lookup_light(fd, &err, &fput_needed); 2195 if (sock != NULL) { 2196 - err = security_socket_shutdown(sock, how); 2197 - if (!err) 2198 - err = sock->ops->shutdown(sock, how); 2199 fput_light(sock->file, fput_needed); 2200 } 2201 return err;
··· 2175 * Shutdown a socket. 2176 */ 2177 2178 + int __sys_shutdown_sock(struct socket *sock, int how) 2179 + { 2180 + int err; 2181 + 2182 + err = security_socket_shutdown(sock, how); 2183 + if (!err) 2184 + err = sock->ops->shutdown(sock, how); 2185 + 2186 + return err; 2187 + } 2188 + 2189 int __sys_shutdown(int fd, int how) 2190 { 2191 int err, fput_needed; ··· 2182 2183 sock = sockfd_lookup_light(fd, &err, &fput_needed); 2184 if (sock != NULL) { 2185 + err = __sys_shutdown_sock(sock, how); 2186 fput_light(sock->file, fput_needed); 2187 } 2188 return err;
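__sys_shutdown_sock() above is the fd-free entry point that io_shutdown() calls once it has resolved the request's file to a socket. For completeness, a hedged sketch of queueing the matching IORING_OP_SHUTDOWN request, with the field layout taken from io_shutdown_prep(); queue_shutdown() and the user_data value are illustrative only.

/* Hedged sketch: async shutdown(2) of a connected socket via io_uring. */
#include <errno.h>
#include <string.h>
#include <sys/socket.h>
#include <liburing.h>

static int queue_shutdown(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EBUSY;
	memset(sqe, 0, sizeof(*sqe)); /* ioprio/off/addr/rw_flags/buf_index must be 0 */
	sqe->opcode = IORING_OP_SHUTDOWN;
	sqe->fd = sockfd;             /* the socket                */
	sqe->len = SHUT_WR;           /* 'how', as for shutdown(2) */
	sqe->user_data = 1;
	return io_uring_submit(ring);
}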