io_uring: internally retry short reads

We've seen a few cases of applications not handling short reads properly,
which is understandable, as short reads aren't really expected if the
application isn't doing non-blocking IO.

Now that we retain the iov_iter over retries, we can implement internal
retry pretty trivially. This ensures that we don't return a short read,
even for buffered reads on page cache conflicts.
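
For illustration only (not part of this patch): the short read handling
that applications otherwise have to carry themselves, and that the kernel
now performs internally by retaining the iov_iter and accounting the bytes
already done, looks roughly like the userspace sketch below. The helper
name read_full() is made up for this example, and it uses plain read(2)
rather than io_uring:

  #include <errno.h>
  #include <unistd.h>

  /*
   * Illustrative only: retry and accumulate until the full amount is
   * read, EOF is hit, or a real error occurs. This is the loop that
   * buffered io_uring reads no longer need in the application.
   */
  static ssize_t read_full(int fd, void *buf, size_t len)
  {
          size_t bytes_done = 0;

          while (bytes_done < len) {
                  ssize_t ret = read(fd, (char *)buf + bytes_done,
                                     len - bytes_done);

                  if (ret < 0) {
                          if (errno == EINTR)
                                  continue;
                          /* surface partial progress if we have any */
                          return bytes_done ? (ssize_t)bytes_done : -1;
                  }
                  if (ret == 0)
                          break;                  /* EOF */
                  bytes_done += ret;              /* short read, go again */
          }
          return bytes_done;
  }

With this change, a buffered read submitted through io_uring shouldn't
need such a loop in the application: partial progress is stored in
req->io->rw.bytes_done and added back to the completion result, as the
"add previously done IO" hunk below shows.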

Clean up the deep nesting and hard-to-read nature of io_read() as well;
it's much more straightforward to read and understand now. Also add a
few comments explaining the logic.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

---
 fs/io_uring.c | 109 ++++++++++++++++++++++++++++----------------
 1 file changed, 70 insertions(+), 39 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -510,5 +510,6 @@
         struct iovec            fast_iov[UIO_FASTIOV];
         const struct iovec      *free_iovec;
         struct iov_iter         iter;
+        size_t                  bytes_done;
         struct wait_page_queue  wpq;
 };
@@ -917,6 +916,6 @@
                                bool needs_lock);
 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
                              const struct iovec *fast_iov,
-                             struct iov_iter *iter);
+                             struct iov_iter *iter, bool force);
 
 static struct kmem_cache *req_cachep;
@@ -2299,7 +2298,7 @@
         ret = io_import_iovec(rw, req, &iovec, &iter, false);
         if (ret < 0)
                 goto end_req;
-        ret = io_setup_async_rw(req, iovec, inline_vecs, &iter);
+        ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
         if (!ret)
                 return true;
         kfree(iovec);
@@ -2588,6 +2587,14 @@
                        struct io_comp_state *cs)
 {
         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+
+        /* add previously done IO, if any */
+        if (req->io && req->io->rw.bytes_done > 0) {
+                if (ret < 0)
+                        ret = req->io->rw.bytes_done;
+                else
+                        ret += req->io->rw.bytes_done;
+        }
 
         if (req->flags & REQ_F_CUR_POS)
                 req->file->f_pos = kiocb->ki_pos;
@@ -2944,6 +2935,7 @@
 
         memcpy(&rw->iter, iter, sizeof(*iter));
         rw->free_iovec = NULL;
+        rw->bytes_done = 0;
         /* can only be fixed buffers, no need to do anything */
         if (iter->type == ITER_BVEC)
                 return;
@@ -2981,9 +2971,9 @@
 
 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
                              const struct iovec *fast_iov,
-                             struct iov_iter *iter)
+                             struct iov_iter *iter, bool force)
 {
-        if (!io_op_defs[req->opcode].async_ctx)
+        if (!force && !io_op_defs[req->opcode].async_ctx)
                 return 0;
         if (!req->io) {
                 if (__io_alloc_async_ctx(req))
@@ -3107,8 +3097,7 @@
  * succeed, or in rare cases where it fails, we then fall back to using the
  * async worker threads for a blocking retry.
  */
-static bool io_rw_should_retry(struct io_kiocb *req, struct iovec *iovec,
-                               struct iovec *fast_iov, struct iov_iter *iter)
+static bool io_rw_should_retry(struct io_kiocb *req)
 {
         struct kiocb *kiocb = &req->rw.kiocb;
         int ret;
@@ -3116,8 +3107,8 @@
         if (req->flags & REQ_F_NOWAIT)
                 return false;
 
-        /* already tried, or we're doing O_DIRECT */
-        if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ))
+        /* Only for buffered IO */
+        if (kiocb->ki_flags & IOCB_DIRECT)
                 return false;
         /*
          * just use poll if we can, and don't attempt if the fs doesn't
@@ -3125,16 +3116,6 @@
          */
         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
                 return false;
-
-        /*
-         * If request type doesn't require req->io to defer in general,
-         * we need to allocate it here
-         */
-        if (!req->io) {
-                if (__io_alloc_async_ctx(req))
-                        return false;
-                io_req_map_rw(req, iovec, fast_iov, iter);
-        }
 
         ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
                                          io_async_buf_func, req);
@@ -3152,8 +3153,8 @@
         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
         struct kiocb *kiocb = &req->rw.kiocb;
         struct iov_iter __iter, *iter = &__iter;
+        ssize_t io_size, ret, ret2;
         size_t iov_count;
-        ssize_t io_size, ret, ret2 = 0;
 
         if (req->io)
                 iter = &req->io->rw.iter;
@@ -3163,6 +3164,7 @@
                 return ret;
         io_size = ret;
         req->result = io_size;
+        ret = 0;
 
         /* Ensure we clear previously set non-block flag */
         if (!force_nonblock)
@@ -3178,31 +3178,62 @@
         if (unlikely(ret))
                 goto out_free;
 
-        ret2 = io_iter_do_read(req, iter);
+        ret = io_iter_do_read(req, iter);
 
-        /* Catch -EAGAIN return for forced non-blocking submission */
-        if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
-                kiocb_done(kiocb, ret2, cs);
-        } else {
-copy_iov:
-                ret = io_setup_async_rw(req, iovec, inline_vecs, iter);
+        if (!ret) {
+                goto done;
+        } else if (ret == -EIOCBQUEUED) {
+                ret = 0;
+                goto out_free;
+        } else if (ret == -EAGAIN) {
+                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
                 if (ret)
                         goto out_free;
-                /* it's copied and will be cleaned with ->io */
-                iovec = NULL;
-                /* if we can retry, do so with the callbacks armed */
-                if (io_rw_should_retry(req, iovec, inline_vecs, iter)) {
-                        ret2 = io_iter_do_read(req, iter);
-                        if (ret2 == -EIOCBQUEUED) {
-                                goto out_free;
-                        } else if (ret2 != -EAGAIN) {
-                                kiocb_done(kiocb, ret2, cs);
-                                goto out_free;
-                        }
-                }
+                return -EAGAIN;
+        } else if (ret < 0) {
+                goto out_free;
+        }
+
+        /* read it all, or we did blocking attempt. no retry. */
+        if (!iov_iter_count(iter) || !force_nonblock)
+                goto done;
+
+        io_size -= ret;
+copy_iov:
+        ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+        if (ret2) {
+                ret = ret2;
+                goto out_free;
+        }
+        /* it's copied and will be cleaned with ->io */
+        iovec = NULL;
+        /* now use our persistent iterator, if we aren't already */
+        iter = &req->io->rw.iter;
+retry:
+        req->io->rw.bytes_done += ret;
+        /* if we can retry, do so with the callbacks armed */
+        if (!io_rw_should_retry(req)) {
                 kiocb->ki_flags &= ~IOCB_WAITQ;
                 return -EAGAIN;
         }
+
+        /*
+         * Now retry read with the IOCB_WAITQ parts set in the iocb. If we
+         * get -EIOCBQUEUED, then we'll get a notification when the desired
+         * page gets unlocked. We can also get a partial read here, and if we
+         * do, then just retry at the new offset.
+         */
+        ret = io_iter_do_read(req, iter);
+        if (ret == -EIOCBQUEUED) {
+                ret = 0;
+                goto out_free;
+        } else if (ret > 0 && ret < io_size) {
+                /* we got some bytes, but not all. retry. */
+                goto retry;
+        }
+done:
+        kiocb_done(kiocb, ret, cs);
+        ret = 0;
 out_free:
         if (iovec)
                 kfree(iovec);
@@ -3326,7 +3295,7 @@
                 kiocb_done(kiocb, ret2, cs);
         } else {
 copy_iov:
-                ret = io_setup_async_rw(req, iovec, inline_vecs, iter);
+                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
                 if (!ret)
                         return -EAGAIN;
         }