Merge tag 'io_uring-5.9-2020-08-15' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:
"A few differerent things in here.

Seems like syzbot got some more io_uring bits wired up, and we got a
handful of reports; the associated fixes are in here.

General fixes too, and a lot of them marked for stable.

Lastly, a bit of fallout from the async buffered reads, where we now
more easily trigger short reads. Some applications don't really like
that, so the io_read() code now handles short reads internally, and
got a cleanup along the way so that it's now easier to read (and
documented). We're now passing tests that failed before"
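
For illustration only (this sketch is not from the series; it assumes liburing and its io_uring_prep_read() helper, and the read_full() name is made up for the example): an application that wanted to cope with short buffered reads on its own would resubmit from the new offset, roughly like this. With the internal retry added in this pull, a single submission should already complete with the full amount, so a loop like this is mostly defensive.

/*
 * Minimal liburing sketch: resubmit a buffered read until the requested
 * length, EOF, or a real error is reached.
 */
#include <liburing.h>
#include <unistd.h>
#include <errno.h>

static ssize_t read_full(struct io_uring *ring, int fd, char *buf,
			 size_t len, off_t off)
{
	size_t done = 0;

	while (done < len) {
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
		struct io_uring_cqe *cqe;
		int ret;

		if (!sqe)
			return -EBUSY;
		io_uring_prep_read(sqe, fd, buf + done, len - done, off + done);
		io_uring_submit(ring);

		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0)
			return ret;
		ret = cqe->res;
		io_uring_cqe_seen(ring, cqe);

		if (ret < 0)
			return ret;	/* real error, e.g. -EAGAIN or -EIO */
		if (ret == 0)
			break;		/* EOF */
		done += ret;		/* short read: advance and resubmit */
	}
	return done;
}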

* tag 'io_uring-5.9-2020-08-15' of git://git.kernel.dk/linux-block:
io_uring: short circuit -EAGAIN for blocking read attempt
io_uring: sanitize double poll handling
io_uring: internally retry short reads
io_uring: retain iov_iter state over io_read/io_write calls
task_work: only grab task signal lock when needed
io_uring: enable lookup of links holding inflight files
io_uring: fail poll arm on queue proc failure
io_uring: hold 'ctx' reference around task_work queue + execute
fs: RWF_NOWAIT should imply IOCB_NOIO
io_uring: defer file table grabbing request cleanup for locked requests
io_uring: add missing REQ_F_COMP_LOCKED for nested requests
io_uring: fix recursive completion locking on overflow flush
io_uring: use TWA_SIGNAL for task_work unconditionally
io_uring: account locked memory before potential error case
io_uring: set ctx sq/cq entry count earlier
io_uring: Fix NULL pointer dereference in loop_rw_iter()
io_uring: add comments on how the async buffered read retry works
io_uring: io_async_buf_func() need not test page bit

Changed files: +409 -156

fs/io_uring.c: +386 -153
··· 508 508 509 509 struct io_async_rw { 510 510 struct iovec fast_iov[UIO_FASTIOV]; 511 - struct iovec *iov; 512 - ssize_t nr_segs; 513 - ssize_t size; 511 + const struct iovec *free_iovec; 512 + struct iov_iter iter; 513 + size_t bytes_done; 514 514 struct wait_page_queue wpq; 515 515 }; 516 516 ··· 898 898 static void io_double_put_req(struct io_kiocb *req); 899 899 static void __io_double_put_req(struct io_kiocb *req); 900 900 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); 901 + static void __io_queue_linked_timeout(struct io_kiocb *req); 901 902 static void io_queue_linked_timeout(struct io_kiocb *req); 902 903 static int __io_sqe_files_update(struct io_ring_ctx *ctx, 903 904 struct io_uring_files_update *ip, ··· 915 914 static ssize_t io_import_iovec(int rw, struct io_kiocb *req, 916 915 struct iovec **iovec, struct iov_iter *iter, 917 916 bool needs_lock); 918 - static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, 919 - struct iovec *iovec, struct iovec *fast_iov, 920 - struct iov_iter *iter); 917 + static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, 918 + const struct iovec *fast_iov, 919 + struct iov_iter *iter, bool force); 921 920 922 921 static struct kmem_cache *req_cachep; 923 922 ··· 1108 1107 } 1109 1108 } 1110 1109 1111 - static void io_req_clean_work(struct io_kiocb *req) 1110 + /* 1111 + * Returns true if we need to defer file table putting. This can only happen 1112 + * from the error path with REQ_F_COMP_LOCKED set. 1113 + */ 1114 + static bool io_req_clean_work(struct io_kiocb *req) 1112 1115 { 1113 1116 if (!(req->flags & REQ_F_WORK_INITIALIZED)) 1114 - return; 1117 + return false; 1118 + 1119 + req->flags &= ~REQ_F_WORK_INITIALIZED; 1115 1120 1116 1121 if (req->work.mm) { 1117 1122 mmdrop(req->work.mm); ··· 1130 1123 if (req->work.fs) { 1131 1124 struct fs_struct *fs = req->work.fs; 1132 1125 1126 + if (req->flags & REQ_F_COMP_LOCKED) 1127 + return true; 1128 + 1133 1129 spin_lock(&req->work.fs->lock); 1134 1130 if (--fs->users) 1135 1131 fs = NULL; ··· 1141 1131 free_fs_struct(fs); 1142 1132 req->work.fs = NULL; 1143 1133 } 1144 - req->flags &= ~REQ_F_WORK_INITIALIZED; 1134 + 1135 + return false; 1145 1136 } 1146 1137 1147 1138 static void io_prep_async_work(struct io_kiocb *req) ··· 1190 1179 io_prep_async_work(cur); 1191 1180 } 1192 1181 1193 - static void __io_queue_async_work(struct io_kiocb *req) 1182 + static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) 1194 1183 { 1195 1184 struct io_ring_ctx *ctx = req->ctx; 1196 1185 struct io_kiocb *link = io_prep_linked_timeout(req); ··· 1198 1187 trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, 1199 1188 &req->work, req->flags); 1200 1189 io_wq_enqueue(ctx->io_wq, &req->work); 1201 - 1202 - if (link) 1203 - io_queue_linked_timeout(link); 1190 + return link; 1204 1191 } 1205 1192 1206 1193 static void io_queue_async_work(struct io_kiocb *req) 1207 1194 { 1195 + struct io_kiocb *link; 1196 + 1208 1197 /* init ->work of the whole link before punting */ 1209 1198 io_prep_async_link(req); 1210 - __io_queue_async_work(req); 1199 + link = __io_queue_async_work(req); 1200 + 1201 + if (link) 1202 + io_queue_linked_timeout(link); 1211 1203 } 1212 1204 1213 1205 static void io_kill_timeout(struct io_kiocb *req) ··· 1243 1229 do { 1244 1230 struct io_defer_entry *de = list_first_entry(&ctx->defer_list, 1245 1231 struct io_defer_entry, list); 1232 + struct io_kiocb *link; 1246 1233 1247 1234 if (req_need_defer(de->req, de->seq)) 
1248 1235 break; 1249 1236 list_del_init(&de->list); 1250 1237 /* punt-init is done before queueing for defer */ 1251 - __io_queue_async_work(de->req); 1238 + link = __io_queue_async_work(de->req); 1239 + if (link) { 1240 + __io_queue_linked_timeout(link); 1241 + /* drop submission reference */ 1242 + link->flags |= REQ_F_COMP_LOCKED; 1243 + io_put_req(link); 1244 + } 1252 1245 kfree(de); 1253 1246 } while (!list_empty(&ctx->defer_list)); 1254 1247 } ··· 1554 1533 fput(file); 1555 1534 } 1556 1535 1557 - static void io_dismantle_req(struct io_kiocb *req) 1536 + static bool io_dismantle_req(struct io_kiocb *req) 1558 1537 { 1559 1538 io_clean_op(req); 1560 1539 ··· 1562 1541 kfree(req->io); 1563 1542 if (req->file) 1564 1543 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); 1565 - io_req_clean_work(req); 1566 1544 1567 1545 if (req->flags & REQ_F_INFLIGHT) { 1568 1546 struct io_ring_ctx *ctx = req->ctx; ··· 1573 1553 wake_up(&ctx->inflight_wait); 1574 1554 spin_unlock_irqrestore(&ctx->inflight_lock, flags); 1575 1555 } 1556 + 1557 + return io_req_clean_work(req); 1576 1558 } 1577 1559 1578 - static void __io_free_req(struct io_kiocb *req) 1560 + static void __io_free_req_finish(struct io_kiocb *req) 1579 1561 { 1580 - struct io_ring_ctx *ctx; 1562 + struct io_ring_ctx *ctx = req->ctx; 1581 1563 1582 - io_dismantle_req(req); 1583 1564 __io_put_req_task(req); 1584 - ctx = req->ctx; 1585 1565 if (likely(!io_is_fallback_req(req))) 1586 1566 kmem_cache_free(req_cachep, req); 1587 1567 else 1588 1568 clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req); 1589 1569 percpu_ref_put(&ctx->refs); 1570 + } 1571 + 1572 + static void io_req_task_file_table_put(struct callback_head *cb) 1573 + { 1574 + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); 1575 + struct fs_struct *fs = req->work.fs; 1576 + 1577 + spin_lock(&req->work.fs->lock); 1578 + if (--fs->users) 1579 + fs = NULL; 1580 + spin_unlock(&req->work.fs->lock); 1581 + if (fs) 1582 + free_fs_struct(fs); 1583 + req->work.fs = NULL; 1584 + __io_free_req_finish(req); 1585 + } 1586 + 1587 + static void __io_free_req(struct io_kiocb *req) 1588 + { 1589 + if (!io_dismantle_req(req)) { 1590 + __io_free_req_finish(req); 1591 + } else { 1592 + int ret; 1593 + 1594 + init_task_work(&req->task_work, io_req_task_file_table_put); 1595 + ret = task_work_add(req->task, &req->task_work, TWA_RESUME); 1596 + if (unlikely(ret)) { 1597 + struct task_struct *tsk; 1598 + 1599 + tsk = io_wq_get_task(req->ctx->io_wq); 1600 + task_work_add(tsk, &req->task_work, 0); 1601 + } 1602 + } 1590 1603 } 1591 1604 1592 1605 static bool io_link_cancel_timeout(struct io_kiocb *req) ··· 1651 1598 return false; 1652 1599 1653 1600 list_del_init(&link->link_list); 1601 + link->flags |= REQ_F_COMP_LOCKED; 1654 1602 wake_ev = io_link_cancel_timeout(link); 1655 1603 req->flags &= ~REQ_F_LINK_TIMEOUT; 1656 1604 return wake_ev; ··· 1710 1656 trace_io_uring_fail_link(req, link); 1711 1657 1712 1658 io_cqring_fill_event(link, -ECANCELED); 1659 + link->flags |= REQ_F_COMP_LOCKED; 1713 1660 __io_double_put_req(link); 1714 1661 req->flags &= ~REQ_F_LINK_TIMEOUT; 1715 1662 } ··· 1765 1710 { 1766 1711 struct task_struct *tsk = req->task; 1767 1712 struct io_ring_ctx *ctx = req->ctx; 1768 - int ret, notify = TWA_RESUME; 1713 + int ret, notify; 1769 1714 1770 1715 /* 1771 - * SQPOLL kernel thread doesn't need notification, just a wakeup. 
1772 - * If we're not using an eventfd, then TWA_RESUME is always fine, 1773 - * as we won't have dependencies between request completions for 1774 - * other kernel wait conditions. 1716 + * SQPOLL kernel thread doesn't need notification, just a wakeup. For 1717 + * all other cases, use TWA_SIGNAL unconditionally to ensure we're 1718 + * processing task_work. There's no reliable way to tell if TWA_RESUME 1719 + * will do the job. 1775 1720 */ 1776 - if (ctx->flags & IORING_SETUP_SQPOLL) 1777 - notify = 0; 1778 - else if (ctx->cq_ev_fd) 1721 + notify = 0; 1722 + if (!(ctx->flags & IORING_SETUP_SQPOLL)) 1779 1723 notify = TWA_SIGNAL; 1780 1724 1781 1725 ret = task_work_add(tsk, cb, notify); 1782 1726 if (!ret) 1783 1727 wake_up_process(tsk); 1728 + 1784 1729 return ret; 1785 1730 } 1786 1731 ··· 1821 1766 static void io_req_task_submit(struct callback_head *cb) 1822 1767 { 1823 1768 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); 1769 + struct io_ring_ctx *ctx = req->ctx; 1824 1770 1825 1771 __io_req_task_submit(req); 1772 + percpu_ref_put(&ctx->refs); 1826 1773 } 1827 1774 1828 1775 static void io_req_task_queue(struct io_kiocb *req) ··· 1832 1775 int ret; 1833 1776 1834 1777 init_task_work(&req->task_work, io_req_task_submit); 1778 + percpu_ref_get(&req->ctx->refs); 1835 1779 1836 1780 ret = io_req_task_work_add(req, &req->task_work); 1837 1781 if (unlikely(ret)) { ··· 1913 1855 req->flags &= ~REQ_F_TASK_PINNED; 1914 1856 } 1915 1857 1916 - io_dismantle_req(req); 1858 + WARN_ON_ONCE(io_dismantle_req(req)); 1917 1859 rb->reqs[rb->to_free++] = req; 1918 1860 if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) 1919 1861 __io_req_free_batch_flush(req->ctx, rb); ··· 2299 2241 ret = io_import_iovec(rw, req, &iovec, &iter, false); 2300 2242 if (ret < 0) 2301 2243 goto end_req; 2302 - ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter); 2244 + ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false); 2303 2245 if (!ret) 2304 2246 return true; 2305 2247 kfree(iovec); ··· 2321 2263 refcount_inc(&req->refs); 2322 2264 io_queue_async_work(req); 2323 2265 } 2266 + 2267 + percpu_ref_put(&ctx->refs); 2324 2268 } 2325 2269 #endif 2326 2270 ··· 2335 2275 return false; 2336 2276 2337 2277 init_task_work(&req->task_work, io_rw_resubmit); 2278 + percpu_ref_get(&req->ctx->refs); 2279 + 2338 2280 ret = io_req_task_work_add(req, &req->task_work); 2339 2281 if (!ret) 2340 2282 return true; ··· 2589 2527 { 2590 2528 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2591 2529 2530 + /* add previously done IO, if any */ 2531 + if (req->io && req->io->rw.bytes_done > 0) { 2532 + if (ret < 0) 2533 + ret = req->io->rw.bytes_done; 2534 + else 2535 + ret += req->io->rw.bytes_done; 2536 + } 2537 + 2592 2538 if (req->flags & REQ_F_CUR_POS) 2593 2539 req->file->f_pos = kiocb->ki_pos; 2594 2540 if (ret >= 0 && kiocb->ki_complete == io_complete_rw) ··· 2828 2758 ssize_t ret; 2829 2759 u8 opcode; 2830 2760 2761 + if (req->io) { 2762 + struct io_async_rw *iorw = &req->io->rw; 2763 + 2764 + *iovec = NULL; 2765 + return iov_iter_count(&iorw->iter); 2766 + } 2767 + 2831 2768 opcode = req->opcode; 2832 2769 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { 2833 2770 *iovec = NULL; ··· 2858 2781 ret = import_single_range(rw, buf, sqe_len, *iovec, iter); 2859 2782 *iovec = NULL; 2860 2783 return ret < 0 ? 
ret : sqe_len; 2861 - } 2862 - 2863 - if (req->io) { 2864 - struct io_async_rw *iorw = &req->io->rw; 2865 - 2866 - iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size); 2867 - *iovec = NULL; 2868 - return iorw->size; 2869 2784 } 2870 2785 2871 2786 if (req->flags & REQ_F_BUFFER_SELECT) { ··· 2937 2868 return ret; 2938 2869 } 2939 2870 2940 - static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, 2941 - struct iovec *iovec, struct iovec *fast_iov, 2942 - struct iov_iter *iter) 2871 + static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, 2872 + const struct iovec *fast_iov, struct iov_iter *iter) 2943 2873 { 2944 2874 struct io_async_rw *rw = &req->io->rw; 2945 2875 2946 - rw->nr_segs = iter->nr_segs; 2947 - rw->size = io_size; 2876 + memcpy(&rw->iter, iter, sizeof(*iter)); 2877 + rw->free_iovec = NULL; 2878 + rw->bytes_done = 0; 2879 + /* can only be fixed buffers, no need to do anything */ 2880 + if (iter->type == ITER_BVEC) 2881 + return; 2948 2882 if (!iovec) { 2949 - rw->iov = rw->fast_iov; 2950 - if (rw->iov != fast_iov) 2951 - memcpy(rw->iov, fast_iov, 2883 + unsigned iov_off = 0; 2884 + 2885 + rw->iter.iov = rw->fast_iov; 2886 + if (iter->iov != fast_iov) { 2887 + iov_off = iter->iov - fast_iov; 2888 + rw->iter.iov += iov_off; 2889 + } 2890 + if (rw->fast_iov != fast_iov) 2891 + memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, 2952 2892 sizeof(struct iovec) * iter->nr_segs); 2953 2893 } else { 2954 - rw->iov = iovec; 2894 + rw->free_iovec = iovec; 2955 2895 req->flags |= REQ_F_NEED_CLEANUP; 2956 2896 } 2957 2897 } ··· 2979 2901 return __io_alloc_async_ctx(req); 2980 2902 } 2981 2903 2982 - static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, 2983 - struct iovec *iovec, struct iovec *fast_iov, 2984 - struct iov_iter *iter) 2904 + static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, 2905 + const struct iovec *fast_iov, 2906 + struct iov_iter *iter, bool force) 2985 2907 { 2986 - if (!io_op_defs[req->opcode].async_ctx) 2908 + if (!force && !io_op_defs[req->opcode].async_ctx) 2987 2909 return 0; 2988 2910 if (!req->io) { 2989 2911 if (__io_alloc_async_ctx(req)) 2990 2912 return -ENOMEM; 2991 2913 2992 - io_req_map_rw(req, io_size, iovec, fast_iov, iter); 2914 + io_req_map_rw(req, iovec, fast_iov, iter); 2993 2915 } 2994 2916 return 0; 2995 2917 } ··· 2997 2919 static inline int io_rw_prep_async(struct io_kiocb *req, int rw, 2998 2920 bool force_nonblock) 2999 2921 { 3000 - struct io_async_ctx *io = req->io; 3001 - struct iov_iter iter; 2922 + struct io_async_rw *iorw = &req->io->rw; 3002 2923 ssize_t ret; 3003 2924 3004 - io->rw.iov = io->rw.fast_iov; 2925 + iorw->iter.iov = iorw->fast_iov; 2926 + /* reset ->io around the iovec import, we don't want to use it */ 3005 2927 req->io = NULL; 3006 - ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock); 3007 - req->io = io; 2928 + ret = io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov, 2929 + &iorw->iter, !force_nonblock); 2930 + req->io = container_of(iorw, struct io_async_ctx, rw); 3008 2931 if (unlikely(ret < 0)) 3009 2932 return ret; 3010 2933 3011 - io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); 2934 + io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter); 3012 2935 return 0; 3013 2936 } 3014 2937 ··· 3031 2952 return io_rw_prep_async(req, READ, force_nonblock); 3032 2953 } 3033 2954 2955 + /* 2956 + * This is our waitqueue callback handler, registered through lock_page_async() 2957 + * when we initially tried 
to do the IO with the iocb armed our waitqueue. 2958 + * This gets called when the page is unlocked, and we generally expect that to 2959 + * happen when the page IO is completed and the page is now uptodate. This will 2960 + * queue a task_work based retry of the operation, attempting to copy the data 2961 + * again. If the latter fails because the page was NOT uptodate, then we will 2962 + * do a thread based blocking retry of the operation. That's the unexpected 2963 + * slow path. 2964 + */ 3034 2965 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, 3035 2966 int sync, void *arg) 3036 2967 { ··· 3054 2965 if (!wake_page_match(wpq, key)) 3055 2966 return 0; 3056 2967 3057 - /* Stop waking things up if the page is locked again */ 3058 - if (test_bit(key->bit_nr, &key->page->flags)) 3059 - return -1; 3060 - 3061 2968 list_del_init(&wait->entry); 3062 2969 3063 2970 init_task_work(&req->task_work, io_req_task_submit); 2971 + percpu_ref_get(&req->ctx->refs); 2972 + 3064 2973 /* submit ref gets dropped, acquire a new one */ 3065 2974 refcount_inc(&req->refs); 3066 2975 ret = io_req_task_work_add(req, &req->task_work); ··· 3095 3008 return -EOPNOTSUPP; 3096 3009 } 3097 3010 3098 - 3011 + /* 3012 + * This controls whether a given IO request should be armed for async page 3013 + * based retry. If we return false here, the request is handed to the async 3014 + * worker threads for retry. If we're doing buffered reads on a regular file, 3015 + * we prepare a private wait_page_queue entry and retry the operation. This 3016 + * will either succeed because the page is now uptodate and unlocked, or it 3017 + * will register a callback when the page is unlocked at IO completion. Through 3018 + * that callback, io_uring uses task_work to setup a retry of the operation. 3019 + * That retry will attempt the buffered read again. The retry will generally 3020 + * succeed, or in rare cases where it fails, we then fall back to using the 3021 + * async worker threads for a blocking retry. 
3022 + */ 3099 3023 static bool io_rw_should_retry(struct io_kiocb *req) 3100 3024 { 3101 3025 struct kiocb *kiocb = &req->rw.kiocb; ··· 3116 3018 if (req->flags & REQ_F_NOWAIT) 3117 3019 return false; 3118 3020 3119 - /* already tried, or we're doing O_DIRECT */ 3120 - if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ)) 3021 + /* Only for buffered IO */ 3022 + if (kiocb->ki_flags & IOCB_DIRECT) 3121 3023 return false; 3122 3024 /* 3123 3025 * just use poll if we can, and don't attempt if the fs doesn't 3124 3026 * support callback based unlocks 3125 3027 */ 3126 3028 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) 3127 - return false; 3128 - 3129 - /* 3130 - * If request type doesn't require req->io to defer in general, 3131 - * we need to allocate it here 3132 - */ 3133 - if (!req->io && __io_alloc_async_ctx(req)) 3134 3029 return false; 3135 3030 3136 3031 ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq, ··· 3140 3049 { 3141 3050 if (req->file->f_op->read_iter) 3142 3051 return call_read_iter(req->file, &req->rw.kiocb, iter); 3143 - return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); 3052 + else if (req->file->f_op->read) 3053 + return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); 3054 + else 3055 + return -EINVAL; 3144 3056 } 3145 3057 3146 3058 static int io_read(struct io_kiocb *req, bool force_nonblock, ··· 3151 3057 { 3152 3058 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 3153 3059 struct kiocb *kiocb = &req->rw.kiocb; 3154 - struct iov_iter iter; 3155 - size_t iov_count; 3060 + struct iov_iter __iter, *iter = &__iter; 3156 3061 ssize_t io_size, ret, ret2; 3157 - unsigned long nr_segs; 3062 + size_t iov_count; 3158 3063 3159 - ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); 3064 + if (req->io) 3065 + iter = &req->io->rw.iter; 3066 + 3067 + ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); 3160 3068 if (ret < 0) 3161 3069 return ret; 3162 3070 io_size = ret; 3163 3071 req->result = io_size; 3072 + ret = 0; 3164 3073 3165 3074 /* Ensure we clear previously set non-block flag */ 3166 3075 if (!force_nonblock) ··· 3173 3076 if (force_nonblock && !io_file_supports_async(req->file, READ)) 3174 3077 goto copy_iov; 3175 3078 3176 - iov_count = iov_iter_count(&iter); 3177 - nr_segs = iter.nr_segs; 3079 + iov_count = iov_iter_count(iter); 3178 3080 ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); 3179 3081 if (unlikely(ret)) 3180 3082 goto out_free; 3181 3083 3182 - ret2 = io_iter_do_read(req, &iter); 3084 + ret = io_iter_do_read(req, iter); 3183 3085 3184 - /* Catch -EAGAIN return for forced non-blocking submission */ 3185 - if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) { 3186 - kiocb_done(kiocb, ret2, cs); 3187 - } else { 3188 - iter.count = iov_count; 3189 - iter.nr_segs = nr_segs; 3190 - copy_iov: 3191 - ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, 3192 - &iter); 3086 + if (!ret) { 3087 + goto done; 3088 + } else if (ret == -EIOCBQUEUED) { 3089 + ret = 0; 3090 + goto out_free; 3091 + } else if (ret == -EAGAIN) { 3092 + if (!force_nonblock) 3093 + goto done; 3094 + ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); 3193 3095 if (ret) 3194 3096 goto out_free; 3195 - /* it's copied and will be cleaned with ->io */ 3196 - iovec = NULL; 3197 - /* if we can retry, do so with the callbacks armed */ 3198 - if (io_rw_should_retry(req)) { 3199 - ret2 = io_iter_do_read(req, &iter); 3200 - if (ret2 == -EIOCBQUEUED) { 3201 - goto out_free; 3202 - } else 
if (ret2 != -EAGAIN) { 3203 - kiocb_done(kiocb, ret2, cs); 3204 - goto out_free; 3205 - } 3206 - } 3097 + return -EAGAIN; 3098 + } else if (ret < 0) { 3099 + goto out_free; 3100 + } 3101 + 3102 + /* read it all, or we did blocking attempt. no retry. */ 3103 + if (!iov_iter_count(iter) || !force_nonblock || 3104 + (req->file->f_flags & O_NONBLOCK)) 3105 + goto done; 3106 + 3107 + io_size -= ret; 3108 + copy_iov: 3109 + ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true); 3110 + if (ret2) { 3111 + ret = ret2; 3112 + goto out_free; 3113 + } 3114 + /* it's copied and will be cleaned with ->io */ 3115 + iovec = NULL; 3116 + /* now use our persistent iterator, if we aren't already */ 3117 + iter = &req->io->rw.iter; 3118 + retry: 3119 + req->io->rw.bytes_done += ret; 3120 + /* if we can retry, do so with the callbacks armed */ 3121 + if (!io_rw_should_retry(req)) { 3207 3122 kiocb->ki_flags &= ~IOCB_WAITQ; 3208 3123 return -EAGAIN; 3209 3124 } 3125 + 3126 + /* 3127 + * Now retry read with the IOCB_WAITQ parts set in the iocb. If we 3128 + * get -EIOCBQUEUED, then we'll get a notification when the desired 3129 + * page gets unlocked. We can also get a partial read here, and if we 3130 + * do, then just retry at the new offset. 3131 + */ 3132 + ret = io_iter_do_read(req, iter); 3133 + if (ret == -EIOCBQUEUED) { 3134 + ret = 0; 3135 + goto out_free; 3136 + } else if (ret > 0 && ret < io_size) { 3137 + /* we got some bytes, but not all. retry. */ 3138 + goto retry; 3139 + } 3140 + done: 3141 + kiocb_done(kiocb, ret, cs); 3142 + ret = 0; 3210 3143 out_free: 3211 3144 if (iovec) 3212 3145 kfree(iovec); ··· 3266 3139 { 3267 3140 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 3268 3141 struct kiocb *kiocb = &req->rw.kiocb; 3269 - struct iov_iter iter; 3142 + struct iov_iter __iter, *iter = &__iter; 3270 3143 size_t iov_count; 3271 3144 ssize_t ret, ret2, io_size; 3272 - unsigned long nr_segs; 3273 3145 3274 - ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); 3146 + if (req->io) 3147 + iter = &req->io->rw.iter; 3148 + 3149 + ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); 3275 3150 if (ret < 0) 3276 3151 return ret; 3277 3152 io_size = ret; ··· 3292 3163 (req->flags & REQ_F_ISREG)) 3293 3164 goto copy_iov; 3294 3165 3295 - iov_count = iov_iter_count(&iter); 3296 - nr_segs = iter.nr_segs; 3166 + iov_count = iov_iter_count(iter); 3297 3167 ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); 3298 3168 if (unlikely(ret)) 3299 3169 goto out_free; ··· 3313 3185 kiocb->ki_flags |= IOCB_WRITE; 3314 3186 3315 3187 if (req->file->f_op->write_iter) 3316 - ret2 = call_write_iter(req->file, kiocb, &iter); 3188 + ret2 = call_write_iter(req->file, kiocb, iter); 3189 + else if (req->file->f_op->write) 3190 + ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter); 3317 3191 else 3318 - ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); 3192 + ret2 = -EINVAL; 3319 3193 3320 3194 /* 3321 3195 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. 
Just ··· 3328 3198 if (!force_nonblock || ret2 != -EAGAIN) { 3329 3199 kiocb_done(kiocb, ret2, cs); 3330 3200 } else { 3331 - iter.count = iov_count; 3332 - iter.nr_segs = nr_segs; 3333 3201 copy_iov: 3334 - ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, 3335 - &iter); 3336 - if (ret) 3337 - goto out_free; 3338 - /* it's copied and will be cleaned with ->io */ 3339 - iovec = NULL; 3340 - return -EAGAIN; 3202 + ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); 3203 + if (!ret) 3204 + return -EAGAIN; 3341 3205 } 3342 3206 out_free: 3343 3207 if (iovec) ··· 4612 4488 4613 4489 req->result = mask; 4614 4490 init_task_work(&req->task_work, func); 4491 + percpu_ref_get(&req->ctx->refs); 4492 + 4615 4493 /* 4616 4494 * If this fails, then the task is exiting. When a task exits, the 4617 4495 * work gets canceled, so just cancel this request as well instead ··· 4652 4526 return false; 4653 4527 } 4654 4528 4655 - static void io_poll_remove_double(struct io_kiocb *req, void *data) 4529 + static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) 4656 4530 { 4657 - struct io_poll_iocb *poll = data; 4531 + /* pure poll stashes this in ->io, poll driven retry elsewhere */ 4532 + if (req->opcode == IORING_OP_POLL_ADD) 4533 + return (struct io_poll_iocb *) req->io; 4534 + return req->apoll->double_poll; 4535 + } 4536 + 4537 + static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) 4538 + { 4539 + if (req->opcode == IORING_OP_POLL_ADD) 4540 + return &req->poll; 4541 + return &req->apoll->poll; 4542 + } 4543 + 4544 + static void io_poll_remove_double(struct io_kiocb *req) 4545 + { 4546 + struct io_poll_iocb *poll = io_poll_get_double(req); 4658 4547 4659 4548 lockdep_assert_held(&req->ctx->completion_lock); 4660 4549 ··· 4689 4548 { 4690 4549 struct io_ring_ctx *ctx = req->ctx; 4691 4550 4692 - io_poll_remove_double(req, req->io); 4551 + io_poll_remove_double(req); 4693 4552 req->poll.done = true; 4694 4553 io_cqring_fill_event(req, error ? 
error : mangle_poll(mask)); 4695 4554 io_commit_cqring(ctx); ··· 4716 4575 static void io_poll_task_func(struct callback_head *cb) 4717 4576 { 4718 4577 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); 4578 + struct io_ring_ctx *ctx = req->ctx; 4719 4579 struct io_kiocb *nxt = NULL; 4720 4580 4721 4581 io_poll_task_handler(req, &nxt); 4722 4582 if (nxt) 4723 4583 __io_req_task_submit(nxt); 4584 + percpu_ref_put(&ctx->refs); 4724 4585 } 4725 4586 4726 4587 static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, 4727 4588 int sync, void *key) 4728 4589 { 4729 4590 struct io_kiocb *req = wait->private; 4730 - struct io_poll_iocb *poll = req->apoll->double_poll; 4591 + struct io_poll_iocb *poll = io_poll_get_single(req); 4731 4592 __poll_t mask = key_to_poll(key); 4732 4593 4733 4594 /* for instances that support it check for an event match first: */ ··· 4743 4600 done = list_empty(&poll->wait.entry); 4744 4601 if (!done) 4745 4602 list_del_init(&poll->wait.entry); 4603 + /* make sure double remove sees this as being gone */ 4604 + wait->private = NULL; 4746 4605 spin_unlock(&poll->head->lock); 4747 4606 if (!done) 4748 4607 __io_async_wake(req, poll, mask, io_poll_task_func); ··· 4820 4675 4821 4676 if (io_poll_rewait(req, &apoll->poll)) { 4822 4677 spin_unlock_irq(&ctx->completion_lock); 4678 + percpu_ref_put(&ctx->refs); 4823 4679 return; 4824 4680 } 4825 4681 ··· 4828 4682 if (hash_hashed(&req->hash_node)) 4829 4683 hash_del(&req->hash_node); 4830 4684 4831 - io_poll_remove_double(req, apoll->double_poll); 4685 + io_poll_remove_double(req); 4832 4686 spin_unlock_irq(&ctx->completion_lock); 4833 4687 4834 4688 if (!READ_ONCE(apoll->poll.canceled)) ··· 4836 4690 else 4837 4691 __io_req_task_cancel(req, -ECANCELED); 4838 4692 4693 + percpu_ref_put(&ctx->refs); 4839 4694 kfree(apoll->double_poll); 4840 4695 kfree(apoll); 4841 4696 } ··· 4938 4791 4939 4792 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, 4940 4793 io_async_wake); 4941 - if (ret) { 4942 - io_poll_remove_double(req, apoll->double_poll); 4794 + if (ret || ipt.error) { 4795 + io_poll_remove_double(req); 4943 4796 spin_unlock_irq(&ctx->completion_lock); 4944 4797 kfree(apoll->double_poll); 4945 4798 kfree(apoll); ··· 4971 4824 { 4972 4825 bool do_complete; 4973 4826 4827 + io_poll_remove_double(req); 4828 + 4974 4829 if (req->opcode == IORING_OP_POLL_ADD) { 4975 - io_poll_remove_double(req, req->io); 4976 4830 do_complete = __io_poll_remove_one(req, &req->poll); 4977 4831 } else { 4978 4832 struct async_poll *apoll = req->apoll; 4979 - 4980 - io_poll_remove_double(req, apoll->double_poll); 4981 4833 4982 4834 /* non-poll requests have submit ref still */ 4983 4835 do_complete = __io_poll_remove_one(req, &apoll->poll); ··· 4991 4845 io_cqring_fill_event(req, -ECANCELED); 4992 4846 io_commit_cqring(req->ctx); 4993 4847 req->flags |= REQ_F_COMP_LOCKED; 4848 + req_set_fail_links(req); 4994 4849 io_put_req(req); 4995 4850 } 4996 4851 ··· 5164 5017 return HRTIMER_NORESTART; 5165 5018 } 5166 5019 5020 + static int __io_timeout_cancel(struct io_kiocb *req) 5021 + { 5022 + int ret; 5023 + 5024 + list_del_init(&req->timeout.list); 5025 + 5026 + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); 5027 + if (ret == -1) 5028 + return -EALREADY; 5029 + 5030 + req_set_fail_links(req); 5031 + req->flags |= REQ_F_COMP_LOCKED; 5032 + io_cqring_fill_event(req, -ECANCELED); 5033 + io_put_req(req); 5034 + return 0; 5035 + } 5036 + 5167 5037 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 
user_data) 5168 5038 { 5169 5039 struct io_kiocb *req; ··· 5188 5024 5189 5025 list_for_each_entry(req, &ctx->timeout_list, timeout.list) { 5190 5026 if (user_data == req->user_data) { 5191 - list_del_init(&req->timeout.list); 5192 5027 ret = 0; 5193 5028 break; 5194 5029 } ··· 5196 5033 if (ret == -ENOENT) 5197 5034 return ret; 5198 5035 5199 - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); 5200 - if (ret == -1) 5201 - return -EALREADY; 5202 - 5203 - req_set_fail_links(req); 5204 - io_cqring_fill_event(req, -ECANCELED); 5205 - io_put_req(req); 5206 - return 0; 5036 + return __io_timeout_cancel(req); 5207 5037 } 5208 5038 5209 5039 static int io_timeout_remove_prep(struct io_kiocb *req, ··· 5637 5481 case IORING_OP_WRITEV: 5638 5482 case IORING_OP_WRITE_FIXED: 5639 5483 case IORING_OP_WRITE: 5640 - if (io->rw.iov != io->rw.fast_iov) 5641 - kfree(io->rw.iov); 5484 + if (io->rw.free_iovec) 5485 + kfree(io->rw.free_iovec); 5642 5486 break; 5643 5487 case IORING_OP_RECVMSG: 5644 5488 case IORING_OP_SENDMSG: ··· 6073 5917 return HRTIMER_NORESTART; 6074 5918 } 6075 5919 6076 - static void io_queue_linked_timeout(struct io_kiocb *req) 5920 + static void __io_queue_linked_timeout(struct io_kiocb *req) 6077 5921 { 6078 - struct io_ring_ctx *ctx = req->ctx; 6079 - 6080 5922 /* 6081 5923 * If the list is now empty, then our linked request finished before 6082 5924 * we got a chance to setup the timer 6083 5925 */ 6084 - spin_lock_irq(&ctx->completion_lock); 6085 5926 if (!list_empty(&req->link_list)) { 6086 5927 struct io_timeout_data *data = &req->io->timeout; 6087 5928 ··· 6086 5933 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), 6087 5934 data->mode); 6088 5935 } 5936 + } 5937 + 5938 + static void io_queue_linked_timeout(struct io_kiocb *req) 5939 + { 5940 + struct io_ring_ctx *ctx = req->ctx; 5941 + 5942 + spin_lock_irq(&ctx->completion_lock); 5943 + __io_queue_linked_timeout(req); 6089 5944 spin_unlock_irq(&ctx->completion_lock); 6090 5945 6091 5946 /* drop submission reference */ ··· 7998 7837 return work->files == files; 7999 7838 } 8000 7839 7840 + /* 7841 + * Returns true if 'preq' is the link parent of 'req' 7842 + */ 7843 + static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) 7844 + { 7845 + struct io_kiocb *link; 7846 + 7847 + if (!(preq->flags & REQ_F_LINK_HEAD)) 7848 + return false; 7849 + 7850 + list_for_each_entry(link, &preq->link_list, link_list) { 7851 + if (link == req) 7852 + return true; 7853 + } 7854 + 7855 + return false; 7856 + } 7857 + 7858 + /* 7859 + * We're looking to cancel 'req' because it's holding on to our files, but 7860 + * 'req' could be a link to another request. See if it is, and cancel that 7861 + * parent request if so. 
7862 + */ 7863 + static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req) 7864 + { 7865 + struct hlist_node *tmp; 7866 + struct io_kiocb *preq; 7867 + bool found = false; 7868 + int i; 7869 + 7870 + spin_lock_irq(&ctx->completion_lock); 7871 + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 7872 + struct hlist_head *list; 7873 + 7874 + list = &ctx->cancel_hash[i]; 7875 + hlist_for_each_entry_safe(preq, tmp, list, hash_node) { 7876 + found = io_match_link(preq, req); 7877 + if (found) { 7878 + io_poll_remove_one(preq); 7879 + break; 7880 + } 7881 + } 7882 + } 7883 + spin_unlock_irq(&ctx->completion_lock); 7884 + return found; 7885 + } 7886 + 7887 + static bool io_timeout_remove_link(struct io_ring_ctx *ctx, 7888 + struct io_kiocb *req) 7889 + { 7890 + struct io_kiocb *preq; 7891 + bool found = false; 7892 + 7893 + spin_lock_irq(&ctx->completion_lock); 7894 + list_for_each_entry(preq, &ctx->timeout_list, timeout.list) { 7895 + found = io_match_link(preq, req); 7896 + if (found) { 7897 + __io_timeout_cancel(preq); 7898 + break; 7899 + } 7900 + } 7901 + spin_unlock_irq(&ctx->completion_lock); 7902 + return found; 7903 + } 7904 + 8001 7905 static void io_uring_cancel_files(struct io_ring_ctx *ctx, 8002 7906 struct files_struct *files) 8003 7907 { ··· 8117 7891 } 8118 7892 } else { 8119 7893 io_wq_cancel_work(ctx->io_wq, &cancel_req->work); 7894 + /* could be a link, check and remove if it is */ 7895 + if (!io_poll_remove_link(ctx, cancel_req)) 7896 + io_timeout_remove_link(ctx, cancel_req); 8120 7897 io_put_req(cancel_req); 8121 7898 } 8122 7899 ··· 8400 8171 struct io_rings *rings; 8401 8172 size_t size, sq_array_offset; 8402 8173 8174 + /* make sure these are sane, as we already accounted them */ 8175 + ctx->sq_entries = p->sq_entries; 8176 + ctx->cq_entries = p->cq_entries; 8177 + 8403 8178 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); 8404 8179 if (size == SIZE_MAX) 8405 8180 return -EOVERFLOW; ··· 8420 8187 rings->cq_ring_entries = p->cq_entries; 8421 8188 ctx->sq_mask = rings->sq_ring_mask; 8422 8189 ctx->cq_mask = rings->cq_ring_mask; 8423 - ctx->sq_entries = rings->sq_ring_entries; 8424 - ctx->cq_entries = rings->cq_ring_entries; 8425 8190 8426 8191 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); 8427 8192 if (size == SIZE_MAX) { ··· 8548 8317 ctx->user = user; 8549 8318 ctx->creds = get_current_cred(); 8550 8319 8320 + /* 8321 + * Account memory _before_ installing the file descriptor. Once 8322 + * the descriptor is installed, it can get closed at any time. Also 8323 + * do this before hitting the general error path, as ring freeing 8324 + * will un-account as well. 8325 + */ 8326 + io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), 8327 + ACCT_LOCKED); 8328 + ctx->limit_mem = limit_mem; 8329 + 8551 8330 ret = io_allocate_scq_urings(ctx, p); 8552 8331 if (ret) 8553 8332 goto err; ··· 8593 8352 ret = -EFAULT; 8594 8353 goto err; 8595 8354 } 8596 - 8597 - /* 8598 - * Account memory _before_ installing the file descriptor. Once 8599 - * the descriptor is installed, it can get closed at any time. 8600 - */ 8601 - io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries), 8602 - ACCT_LOCKED); 8603 - ctx->limit_mem = limit_mem; 8604 8355 8605 8356 /* 8606 8357 * Install ring fd as the very last thing, so we don't risk someone

include/linux/fs.h: +1 -1
@@ -3322,7 +3322,7 @@
 	if (flags & RWF_NOWAIT) {
 		if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
 			return -EOPNOTSUPP;
-		kiocb_flags |= IOCB_NOWAIT;
+		kiocb_flags |= IOCB_NOWAIT | IOCB_NOIO;
 	}
 	if (flags & RWF_HIPRI)
 		kiocb_flags |= IOCB_HIPRI;
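
For context, RWF_NOWAIT is what reaches this code from preadv2()/pwritev2(). An illustrative userspace sketch (not part of the patch) of the contract the IOCB_NOIO addition tightens: if the data is not already cached, the call fails with EAGAIN instead of kicking off I/O, and the caller falls back to a blocking read.

/*
 * Illustrative only: handle the -EAGAIN an RWF_NOWAIT read returns when
 * satisfying it would require I/O (no read-ahead is started either, now
 * that IOCB_NOIO is implied).
 */
#define _GNU_SOURCE
#include <sys/uio.h>
#include <errno.h>

static ssize_t read_nowait_or_block(int fd, void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	ssize_t ret;

	ret = preadv2(fd, &iov, 1, off, RWF_NOWAIT);
	if (ret >= 0 || errno != EAGAIN)
		return ret;
	/* data not resident in the page cache; do a normal blocking read */
	return preadv2(fd, &iov, 1, off, 0);
}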

kernel/signal.c: +15 -1
@@ -2541,7 +2541,21 @@
 
 relock:
 	spin_lock_irq(&sighand->siglock);
-	current->jobctl &= ~JOBCTL_TASK_WORK;
+	/*
+	 * Make sure we can safely read ->jobctl() in task_work add. As Oleg
+	 * states:
+	 *
+	 * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we
+	 * roughly have
+	 *
+	 *	task_work_add:				get_signal:
+	 *	STORE(task->task_works, new_work);	STORE(task->jobctl);
+	 *	mb();					mb();
+	 *	LOAD(task->jobctl);			LOAD(task->task_works);
+	 *
+	 * and we can rely on STORE-MB-LOAD [ in task_work_add].
+	 */
+	smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK);
 	if (unlikely(current->task_works)) {
 		spin_unlock_irq(&sighand->siglock);
 		task_work_run();

kernel/task_work.c: +7 -1
@@ -42,7 +42,13 @@
 		set_notify_resume(task);
 		break;
 	case TWA_SIGNAL:
-		if (lock_task_sighand(task, &flags)) {
+		/*
+		 * Only grab the sighand lock if we don't already have some
+		 * task_work pending. This pairs with the smp_store_mb()
+		 * in get_signal(), see comment there.
+		 */
+		if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) &&
+		    lock_task_sighand(task, &flags)) {
 			task->jobctl |= JOBCTL_TASK_WORK;
 			signal_wake_up(task, 0);
 			unlock_task_sighand(task, &flags);