Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-6.1/io_uring-2022-10-03' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:

- Add support for more directly managed task_work running.

This is beneficial for real world applications that end up issuing
lots of system calls as part of handling work. Normal task_work will
always execute as we transition in and out of the kernel, even for
"unrelated" system calls. It's more efficient to defer the handling
of io_uring's deferred work until the application wants it to be run,
generally in batches.

As part of ongoing work to write an io_uring network backend for
Thrift, this has been shown to greatly improve performance. (Dylan)

- Add IOPOLL support for passthrough (Kanchan)

- Improvements and fixes to the send zero-copy support (Pavel)

- Partial IO handling fixes (Pavel)

- CQE ordering fixes around CQ ring overflow (Pavel)

- Support sendto() for non-zc as well (Pavel)

- Support sendmsg for zerocopy (Pavel)

- Networking iov_iter fix (Stefan)

- Misc fixes and cleanups (Pavel, me)

* tag 'for-6.1/io_uring-2022-10-03' of git://git.kernel.dk/linux: (56 commits)
io_uring/net: fix notif cqe reordering
io_uring/net: don't update msg_name if not provided
io_uring: don't gate task_work run on TIF_NOTIFY_SIGNAL
io_uring/rw: defer fsnotify calls to task context
io_uring/net: fix fast_iov assignment in io_setup_async_msg()
io_uring/net: fix non-zc send with address
io_uring/net: don't skip notifs for failed requests
io_uring/rw: don't lose short results on io_setup_async_rw()
io_uring/rw: fix unexpected link breakage
io_uring/net: fix cleanup double free free_iov init
io_uring: fix CQE reordering
io_uring/net: fix UAF in io_sendrecv_fail()
selftest/net: adjust io_uring sendzc notif handling
io_uring: ensure local task_work marks task as running
io_uring/net: zerocopy sendmsg
io_uring/net: combine fail handlers
io_uring/net: rename io_sendzc()
io_uring/net: support non-zerocopy sendto
io_uring/net: refactor io_setup_async_addr
io_uring/net: don't lose partial send_zc on fail
...

+860 -327
+2 -1
block/blk-mq.c
··· 1233 1233 complete(&wait->done); 1234 1234 } 1235 1235 1236 - static bool blk_rq_is_poll(struct request *rq) 1236 + bool blk_rq_is_poll(struct request *rq) 1237 1237 { 1238 1238 if (!rq->mq_hctx) 1239 1239 return false; ··· 1243 1243 return false; 1244 1244 return true; 1245 1245 } 1246 + EXPORT_SYMBOL_GPL(blk_rq_is_poll); 1246 1247 1247 1248 static void blk_rq_poll_completion(struct request *rq, struct completion *wait) 1248 1249 {
+1
drivers/nvme/host/core.c
··· 3976 3976 .unlocked_ioctl = nvme_ns_chr_ioctl, 3977 3977 .compat_ioctl = compat_ptr_ioctl, 3978 3978 .uring_cmd = nvme_ns_chr_uring_cmd, 3979 + .uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll, 3979 3980 }; 3980 3981 3981 3982 static int nvme_add_ns_cdev(struct nvme_ns *ns)
+72 -5
drivers/nvme/host/ioctl.c
··· 391 391 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 392 392 /* extract bio before reusing the same field for request */ 393 393 struct bio *bio = pdu->bio; 394 + void *cookie = READ_ONCE(ioucmd->cookie); 394 395 395 396 pdu->req = req; 396 397 req->bio = bio; 397 - /* this takes care of moving rest of completion-work to task context */ 398 - io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb); 398 + 399 + /* 400 + * For iopoll, complete it directly. 401 + * Otherwise, move the completion to task work. 402 + */ 403 + if (cookie != NULL && blk_rq_is_poll(req)) 404 + nvme_uring_task_cb(ioucmd); 405 + else 406 + io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb); 399 407 } 400 408 401 409 static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, ··· 453 445 rq_flags = REQ_NOWAIT; 454 446 blk_flags = BLK_MQ_REQ_NOWAIT; 455 447 } 448 + if (issue_flags & IO_URING_F_IOPOLL) 449 + rq_flags |= REQ_POLLED; 456 450 451 + retry: 457 452 req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr), 458 453 d.data_len, nvme_to_user_ptr(d.metadata), 459 454 d.metadata_len, 0, &meta, d.timeout_ms ? 
··· 467 456 req->end_io = nvme_uring_cmd_end_io; 468 457 req->end_io_data = ioucmd; 469 458 459 + if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) { 460 + if (unlikely(!req->bio)) { 461 + /* we can't poll this, so alloc regular req instead */ 462 + blk_mq_free_request(req); 463 + rq_flags &= ~REQ_POLLED; 464 + goto retry; 465 + } else { 466 + WRITE_ONCE(ioucmd->cookie, req->bio); 467 + req->bio->bi_opf |= REQ_POLLED; 468 + } 469 + } 470 470 /* to free bio on completion, as req->bio will be null at that time */ 471 471 pdu->bio = req->bio; 472 472 pdu->meta = meta; ··· 581 559 582 560 static int nvme_uring_cmd_checks(unsigned int issue_flags) 583 561 { 584 - /* IOPOLL not supported yet */ 585 - if (issue_flags & IO_URING_F_IOPOLL) 586 - return -EOPNOTSUPP; 587 562 588 563 /* NVMe passthrough requires big SQE/CQE support */ 589 564 if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) != ··· 623 604 return nvme_ns_uring_cmd(ns, ioucmd, issue_flags); 624 605 } 625 606 607 + int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, 608 + struct io_comp_batch *iob, 609 + unsigned int poll_flags) 610 + { 611 + struct bio *bio; 612 + int ret = 0; 613 + struct nvme_ns *ns; 614 + struct request_queue *q; 615 + 616 + rcu_read_lock(); 617 + bio = READ_ONCE(ioucmd->cookie); 618 + ns = container_of(file_inode(ioucmd->file)->i_cdev, 619 + struct nvme_ns, cdev); 620 + q = ns->queue; 621 + if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev) 622 + ret = bio_poll(bio, iob, poll_flags); 623 + rcu_read_unlock(); 624 + return ret; 625 + } 626 626 #ifdef CONFIG_NVME_MULTIPATH 627 627 static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, 628 628 void __user *argp, struct nvme_ns_head *head, int srcu_idx) ··· 723 685 srcu_read_unlock(&head->srcu, srcu_idx); 724 686 return ret; 725 687 } 688 + 689 + int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, 690 + struct io_comp_batch *iob, 691 + unsigned int poll_flags) 692 
+ { 693 + struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; 694 + struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); 695 + int srcu_idx = srcu_read_lock(&head->srcu); 696 + struct nvme_ns *ns = nvme_find_path(head); 697 + struct bio *bio; 698 + int ret = 0; 699 + struct request_queue *q; 700 + 701 + if (ns) { 702 + rcu_read_lock(); 703 + bio = READ_ONCE(ioucmd->cookie); 704 + q = ns->queue; 705 + if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio 706 + && bio->bi_bdev) 707 + ret = bio_poll(bio, iob, poll_flags); 708 + rcu_read_unlock(); 709 + } 710 + srcu_read_unlock(&head->srcu, srcu_idx); 711 + return ret; 712 + } 726 713 #endif /* CONFIG_NVME_MULTIPATH */ 727 714 728 715 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) 729 716 { 730 717 struct nvme_ctrl *ctrl = ioucmd->file->private_data; 731 718 int ret; 719 + 720 + /* IOPOLL not supported yet */ 721 + if (issue_flags & IO_URING_F_IOPOLL) 722 + return -EOPNOTSUPP; 732 723 733 724 ret = nvme_uring_cmd_checks(issue_flags); 734 725 if (ret)
+1
drivers/nvme/host/multipath.c
··· 439 439 .unlocked_ioctl = nvme_ns_head_chr_ioctl, 440 440 .compat_ioctl = compat_ptr_ioctl, 441 441 .uring_cmd = nvme_ns_head_chr_uring_cmd, 442 + .uring_cmd_iopoll = nvme_ns_head_chr_uring_cmd_iopoll, 442 443 }; 443 444 444 445 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
+4
drivers/nvme/host/nvme.h
··· 821 821 unsigned long arg); 822 822 long nvme_dev_ioctl(struct file *file, unsigned int cmd, 823 823 unsigned long arg); 824 + int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, 825 + struct io_comp_batch *iob, unsigned int poll_flags); 826 + int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, 827 + struct io_comp_batch *iob, unsigned int poll_flags); 824 828 int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, 825 829 unsigned int issue_flags); 826 830 int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
+7 -3
fs/eventfd.c
··· 69 69 * it returns false, the eventfd_signal() call should be deferred to a 70 70 * safe context. 71 71 */ 72 - if (WARN_ON_ONCE(current->in_eventfd_signal)) 72 + if (WARN_ON_ONCE(current->in_eventfd)) 73 73 return 0; 74 74 75 75 spin_lock_irqsave(&ctx->wqh.lock, flags); 76 - current->in_eventfd_signal = 1; 76 + current->in_eventfd = 1; 77 77 if (ULLONG_MAX - ctx->count < n) 78 78 n = ULLONG_MAX - ctx->count; 79 79 ctx->count += n; 80 80 if (waitqueue_active(&ctx->wqh)) 81 81 wake_up_locked_poll(&ctx->wqh, EPOLLIN); 82 - current->in_eventfd_signal = 0; 82 + current->in_eventfd = 0; 83 83 spin_unlock_irqrestore(&ctx->wqh.lock, flags); 84 84 85 85 return n; ··· 253 253 __set_current_state(TASK_RUNNING); 254 254 } 255 255 eventfd_ctx_do_read(ctx, &ucnt); 256 + current->in_eventfd = 1; 256 257 if (waitqueue_active(&ctx->wqh)) 257 258 wake_up_locked_poll(&ctx->wqh, EPOLLOUT); 259 + current->in_eventfd = 0; 258 260 spin_unlock_irq(&ctx->wqh.lock); 259 261 if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt))) 260 262 return -EFAULT; ··· 303 301 } 304 302 if (likely(res > 0)) { 305 303 ctx->count += ucnt; 304 + current->in_eventfd = 1; 306 305 if (waitqueue_active(&ctx->wqh)) 307 306 wake_up_locked_poll(&ctx->wqh, EPOLLIN); 307 + current->in_eventfd = 0; 308 308 } 309 309 spin_unlock_irq(&ctx->wqh.lock); 310 310
+1
include/linux/blk-mq.h
··· 980 980 int blk_rq_append_bio(struct request *rq, struct bio *bio); 981 981 void blk_execute_rq_nowait(struct request *rq, bool at_head); 982 982 blk_status_t blk_execute_rq(struct request *rq, bool at_head); 983 + bool blk_rq_is_poll(struct request *rq); 983 984 984 985 struct req_iterator { 985 986 struct bvec_iter iter;
+1 -1
include/linux/eventfd.h
··· 46 46 47 47 static inline bool eventfd_signal_allowed(void) 48 48 { 49 - return !current->in_eventfd_signal; 49 + return !current->in_eventfd; 50 50 } 51 51 52 52 #else /* CONFIG_EVENTFD */
+2
include/linux/fs.h
··· 2133 2133 loff_t len, unsigned int remap_flags); 2134 2134 int (*fadvise)(struct file *, loff_t, loff_t, int); 2135 2135 int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); 2136 + int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, 2137 + unsigned int poll_flags); 2136 2138 } __randomize_layout; 2137 2139 2138 2140 struct inode_operations {
+6 -2
include/linux/io_uring.h
··· 20 20 struct io_uring_cmd { 21 21 struct file *file; 22 22 const void *cmd; 23 - /* callback to defer completions to task context */ 24 - void (*task_work_cb)(struct io_uring_cmd *cmd); 23 + union { 24 + /* callback to defer completions to task context */ 25 + void (*task_work_cb)(struct io_uring_cmd *cmd); 26 + /* used for polled completion */ 27 + void *cookie; 28 + }; 25 29 u32 cmd_op; 26 30 u32 pad; 27 31 u8 pdu[32]; /* available inline for free use */
+4
include/linux/io_uring_types.h
··· 184 184 struct eventfd_ctx *cq_ev_fd; 185 185 unsigned int eventfd_async: 1; 186 186 struct rcu_head rcu; 187 + atomic_t refs; 188 + atomic_t ops; 187 189 }; 188 190 189 191 struct io_alloc_cache { ··· 302 300 struct io_wq_work_list iopoll_list; 303 301 struct io_hash_table cancel_table; 304 302 bool poll_multi_queue; 303 + 304 + struct llist_head work_llist; 305 305 306 306 struct list_head io_buffers_comp; 307 307 } ____cacheline_aligned_in_smp;
+1 -1
include/linux/sched.h
··· 936 936 #endif 937 937 #ifdef CONFIG_EVENTFD 938 938 /* Recursion prevention for eventfd_signal() */ 939 - unsigned in_eventfd_signal:1; 939 + unsigned in_eventfd:1; 940 940 #endif 941 941 #ifdef CONFIG_IOMMU_SVA 942 942 unsigned pasid_activated:1;
+29
include/trace/events/io_uring.h
··· 655 655 __entry->wanted, __entry->got) 656 656 ); 657 657 658 + /* 659 + * io_uring_local_work_run - ran ring local task work 660 + * 661 + * @tctx: pointer to a io_uring_ctx 662 + * @count: how many functions it ran 663 + * @loops: how many loops it ran 664 + * 665 + */ 666 + TRACE_EVENT(io_uring_local_work_run, 667 + 668 + TP_PROTO(void *ctx, int count, unsigned int loops), 669 + 670 + TP_ARGS(ctx, count, loops), 671 + 672 + TP_STRUCT__entry ( 673 + __field(void *, ctx ) 674 + __field(int, count ) 675 + __field(unsigned int, loops ) 676 + ), 677 + 678 + TP_fast_assign( 679 + __entry->ctx = ctx; 680 + __entry->count = count; 681 + __entry->loops = loops; 682 + ), 683 + 684 + TP_printk("ring %p, count %d, loops %u", __entry->ctx, __entry->count, __entry->loops) 685 + ); 686 + 658 687 #endif /* _TRACE_IO_URING_H */ 659 688 660 689 /* This part must be outside protection */
+8
include/uapi/linux/io_uring.h
··· 157 157 */ 158 158 #define IORING_SETUP_SINGLE_ISSUER (1U << 12) 159 159 160 + /* 161 + * Defer running task work to get events. 162 + * Rather than running bits of task work whenever the task transitions 163 + * try to do it just before it is needed. 164 + */ 165 + #define IORING_SETUP_DEFER_TASKRUN (1U << 13) 166 + 160 167 enum io_uring_op { 161 168 IORING_OP_NOP, 162 169 IORING_OP_READV, ··· 213 206 IORING_OP_SOCKET, 214 207 IORING_OP_URING_CMD, 215 208 IORING_OP_SEND_ZC, 209 + IORING_OP_SENDMSG_ZC, 216 210 217 211 /* this goes last, obviously */ 218 212 IORING_OP_LAST,
+1 -1
io_uring/cancel.c
··· 292 292 break; 293 293 294 294 mutex_unlock(&ctx->uring_lock); 295 - ret = io_run_task_work_sig(); 295 + ret = io_run_task_work_sig(ctx); 296 296 if (ret < 0) { 297 297 mutex_lock(&ctx->uring_lock); 298 298 break;
+32 -16
io_uring/fdinfo.c
··· 60 60 unsigned int cq_head = READ_ONCE(r->cq.head); 61 61 unsigned int cq_tail = READ_ONCE(r->cq.tail); 62 62 unsigned int cq_shift = 0; 63 + unsigned int sq_shift = 0; 63 64 unsigned int sq_entries, cq_entries; 64 65 bool has_lock; 65 - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); 66 66 unsigned int i; 67 67 68 - if (is_cqe32) 68 + if (ctx->flags & IORING_SETUP_CQE32) 69 69 cq_shift = 1; 70 + if (ctx->flags & IORING_SETUP_SQE128) 71 + sq_shift = 1; 70 72 71 73 /* 72 74 * we may get imprecise sqe and cqe info if uring is actively running ··· 84 82 seq_printf(m, "CqHead:\t%u\n", cq_head); 85 83 seq_printf(m, "CqTail:\t%u\n", cq_tail); 86 84 seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); 87 - seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); 85 + seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head); 88 86 sq_entries = min(sq_tail - sq_head, ctx->sq_entries); 89 87 for (i = 0; i < sq_entries; i++) { 90 88 unsigned int entry = i + sq_head; 91 - unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); 92 89 struct io_uring_sqe *sqe; 90 + unsigned int sq_idx; 93 91 92 + sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); 94 93 if (sq_idx > sq_mask) 95 94 continue; 96 - sqe = &ctx->sq_sqes[sq_idx]; 97 - seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", 98 - sq_idx, sqe->opcode, sqe->fd, sqe->flags, 99 - sqe->user_data); 95 + sqe = &ctx->sq_sqes[sq_idx << 1]; 96 + seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, " 97 + "addr:0x%llx, rw_flags:0x%x, buf_index:%d " 98 + "user_data:%llu", 99 + sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd, 100 + sqe->flags, (unsigned long long) sqe->off, 101 + (unsigned long long) sqe->addr, sqe->rw_flags, 102 + sqe->buf_index, sqe->user_data); 103 + if (sq_shift) { 104 + u64 *sqeb = (void *) (sqe + 1); 105 + int size = sizeof(struct io_uring_sqe) / sizeof(u64); 106 + int j; 107 + 108 + for (j = 0; j < size; j++) { 109 + seq_printf(m, ", e%d:0x%llx", j, 110 + (unsigned long 
long) *sqeb); 111 + sqeb++; 112 + } 113 + } 114 + seq_printf(m, "\n"); 100 115 } 101 116 seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); 102 117 cq_entries = min(cq_tail - cq_head, ctx->cq_entries); ··· 121 102 unsigned int entry = i + cq_head; 122 103 struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; 123 104 124 - if (!is_cqe32) { 125 - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", 105 + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x", 126 106 entry & cq_mask, cqe->user_data, cqe->res, 127 107 cqe->flags); 128 - } else { 129 - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " 130 - "extra1:%llu, extra2:%llu\n", 131 - entry & cq_mask, cqe->user_data, cqe->res, 132 - cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); 133 - } 108 + if (cq_shift) 109 + seq_printf(m, ", extra1:%llu, extra2:%llu\n", 110 + cqe->big_cqe[0], cqe->big_cqe[1]); 111 + seq_printf(m, "\n"); 134 112 } 135 113 136 114 /*
+245 -61
io_uring/io_uring.c
··· 125 125 IO_CHECK_CQ_DROPPED_BIT, 126 126 }; 127 127 128 + enum { 129 + IO_EVENTFD_OP_SIGNAL_BIT, 130 + IO_EVENTFD_OP_FREE_BIT, 131 + }; 132 + 128 133 struct io_defer_entry { 129 134 struct list_head list; 130 135 struct io_kiocb *req; ··· 147 142 static void io_dismantle_req(struct io_kiocb *req); 148 143 static void io_clean_op(struct io_kiocb *req); 149 144 static void io_queue_sqe(struct io_kiocb *req); 150 - 145 + static void io_move_task_work_from_local(struct io_ring_ctx *ctx); 151 146 static void __io_submit_flush_completions(struct io_ring_ctx *ctx); 152 147 153 148 static struct kmem_cache *req_cachep; ··· 321 316 INIT_LIST_HEAD(&ctx->rsrc_ref_list); 322 317 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); 323 318 init_llist_head(&ctx->rsrc_put_llist); 319 + init_llist_head(&ctx->work_llist); 324 320 INIT_LIST_HEAD(&ctx->tctx_list); 325 321 ctx->submit_state.free_list.next = NULL; 326 322 INIT_WQ_LIST(&ctx->locked_free_list); ··· 483 477 } 484 478 } 485 479 480 + 481 + static void io_eventfd_ops(struct rcu_head *rcu) 482 + { 483 + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); 484 + int ops = atomic_xchg(&ev_fd->ops, 0); 485 + 486 + if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) 487 + eventfd_signal(ev_fd->cq_ev_fd, 1); 488 + 489 + /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback 490 + * ordering in a race but if references are 0 we know we have to free 491 + * it regardless. 492 + */ 493 + if (atomic_dec_and_test(&ev_fd->refs)) { 494 + eventfd_ctx_put(ev_fd->cq_ev_fd); 495 + kfree(ev_fd); 496 + } 497 + } 498 + 486 499 static void io_eventfd_signal(struct io_ring_ctx *ctx) 487 500 { 488 - struct io_ev_fd *ev_fd; 489 - bool skip; 490 - 491 - spin_lock(&ctx->completion_lock); 492 - /* 493 - * Eventfd should only get triggered when at least one event has been 494 - * posted. Some applications rely on the eventfd notification count only 495 - * changing IFF a new CQE has been added to the CQ ring. 
There's no 496 - * depedency on 1:1 relationship between how many times this function is 497 - * called (and hence the eventfd count) and number of CQEs posted to the 498 - * CQ ring. 499 - */ 500 - skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; 501 - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; 502 - spin_unlock(&ctx->completion_lock); 503 - if (skip) 504 - return; 501 + struct io_ev_fd *ev_fd = NULL; 505 502 506 503 rcu_read_lock(); 507 504 /* ··· 522 513 goto out; 523 514 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) 524 515 goto out; 516 + if (ev_fd->eventfd_async && !io_wq_current_is_worker()) 517 + goto out; 525 518 526 - if (!ev_fd->eventfd_async || io_wq_current_is_worker()) 519 + if (likely(eventfd_signal_allowed())) { 527 520 eventfd_signal(ev_fd->cq_ev_fd, 1); 521 + } else { 522 + atomic_inc(&ev_fd->refs); 523 + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) 524 + call_rcu(&ev_fd->rcu, io_eventfd_ops); 525 + else 526 + atomic_dec(&ev_fd->refs); 527 + } 528 + 528 529 out: 529 530 rcu_read_unlock(); 531 + } 532 + 533 + static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) 534 + { 535 + bool skip; 536 + 537 + spin_lock(&ctx->completion_lock); 538 + 539 + /* 540 + * Eventfd should only get triggered when at least one event has been 541 + * posted. Some applications rely on the eventfd notification count 542 + * only changing IFF a new CQE has been added to the CQ ring. There's 543 + * no depedency on 1:1 relationship between how many times this 544 + * function is called (and hence the eventfd count) and number of CQEs 545 + * posted to the CQ ring. 
546 + */ 547 + skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; 548 + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; 549 + spin_unlock(&ctx->completion_lock); 550 + if (skip) 551 + return; 552 + 553 + io_eventfd_signal(ctx); 530 554 } 531 555 532 556 void __io_commit_cqring_flush(struct io_ring_ctx *ctx) ··· 573 531 spin_unlock(&ctx->completion_lock); 574 532 } 575 533 if (ctx->has_evfd) 576 - io_eventfd_signal(ctx); 534 + io_eventfd_flush_signal(ctx); 577 535 } 578 536 579 537 static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) ··· 609 567 610 568 io_cq_lock(ctx); 611 569 while (!list_empty(&ctx->cq_overflow_list)) { 612 - struct io_uring_cqe *cqe = io_get_cqe(ctx); 570 + struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true); 613 571 struct io_overflow_cqe *ocqe; 614 572 615 573 if (!cqe && !force) ··· 736 694 * control dependency is enough as we're using WRITE_ONCE to 737 695 * fill the cq entry 738 696 */ 739 - struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) 697 + struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) 740 698 { 741 699 struct io_rings *rings = ctx->rings; 742 700 unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); 743 701 unsigned int free, queued, len; 744 702 703 + /* 704 + * Posting into the CQ when there are pending overflowed CQEs may break 705 + * ordering guarantees, which will affect links, F_MORE users and more. 706 + * Force overflow the completion. 
707 + */ 708 + if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) 709 + return NULL; 745 710 746 711 /* userspace may cheat modifying the tail, be safe and do min */ 747 712 queued = min(__io_cqring_events(ctx), ctx->cq_entries); ··· 872 823 873 824 void io_req_complete_failed(struct io_kiocb *req, s32 res) 874 825 { 826 + const struct io_op_def *def = &io_op_defs[req->opcode]; 827 + 875 828 req_set_fail(req); 876 829 io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); 830 + if (def->fail) 831 + def->fail(req); 877 832 io_req_complete_post(req); 878 833 } 879 834 ··· 1100 1047 trace_io_uring_task_work_run(tctx, count, loops); 1101 1048 } 1102 1049 1103 - void io_req_task_work_add(struct io_kiocb *req) 1050 + static void io_req_local_work_add(struct io_kiocb *req) 1051 + { 1052 + struct io_ring_ctx *ctx = req->ctx; 1053 + 1054 + if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) 1055 + return; 1056 + 1057 + if (unlikely(atomic_read(&req->task->io_uring->in_idle))) { 1058 + io_move_task_work_from_local(ctx); 1059 + return; 1060 + } 1061 + 1062 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1063 + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1064 + 1065 + if (ctx->has_evfd) 1066 + io_eventfd_signal(ctx); 1067 + io_cqring_wake(ctx); 1068 + 1069 + } 1070 + 1071 + static inline void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) 1104 1072 { 1105 1073 struct io_uring_task *tctx = req->task->io_uring; 1106 1074 struct io_ring_ctx *ctx = req->ctx; 1107 1075 struct llist_node *node; 1108 - bool running; 1109 1076 1110 - running = !llist_add(&req->io_task_work.node, &tctx->task_list); 1077 + if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 1078 + io_req_local_work_add(req); 1079 + return; 1080 + } 1111 1081 1112 1082 /* task_work already pending, we're done */ 1113 - if (running) 1083 + if (!llist_add(&req->io_task_work.node, &tctx->task_list)) 1114 1084 return; 1115 1085 1116 1086 if (ctx->flags & 
IORING_SETUP_TASKRUN_FLAG) ··· 1151 1075 &req->ctx->fallback_llist)) 1152 1076 schedule_delayed_work(&req->ctx->fallback_work, 1); 1153 1077 } 1078 + } 1079 + 1080 + void io_req_task_work_add(struct io_kiocb *req) 1081 + { 1082 + __io_req_task_work_add(req, true); 1083 + } 1084 + 1085 + static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) 1086 + { 1087 + struct llist_node *node; 1088 + 1089 + node = llist_del_all(&ctx->work_llist); 1090 + while (node) { 1091 + struct io_kiocb *req = container_of(node, struct io_kiocb, 1092 + io_task_work.node); 1093 + 1094 + node = node->next; 1095 + __io_req_task_work_add(req, false); 1096 + } 1097 + } 1098 + 1099 + int __io_run_local_work(struct io_ring_ctx *ctx, bool locked) 1100 + { 1101 + struct llist_node *node; 1102 + struct llist_node fake; 1103 + struct llist_node *current_final = NULL; 1104 + int ret; 1105 + unsigned int loops = 1; 1106 + 1107 + if (unlikely(ctx->submitter_task != current)) 1108 + return -EEXIST; 1109 + 1110 + node = io_llist_xchg(&ctx->work_llist, &fake); 1111 + ret = 0; 1112 + again: 1113 + while (node != current_final) { 1114 + struct llist_node *next = node->next; 1115 + struct io_kiocb *req = container_of(node, struct io_kiocb, 1116 + io_task_work.node); 1117 + prefetch(container_of(next, struct io_kiocb, io_task_work.node)); 1118 + req->io_task_work.func(req, &locked); 1119 + ret++; 1120 + node = next; 1121 + } 1122 + 1123 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1124 + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1125 + 1126 + node = io_llist_cmpxchg(&ctx->work_llist, &fake, NULL); 1127 + if (node != &fake) { 1128 + loops++; 1129 + current_final = &fake; 1130 + node = io_llist_xchg(&ctx->work_llist, &fake); 1131 + goto again; 1132 + } 1133 + 1134 + if (locked) 1135 + io_submit_flush_completions(ctx); 1136 + trace_io_uring_local_work_run(ctx, ret, loops); 1137 + return ret; 1138 + 1139 + } 1140 + 1141 + int io_run_local_work(struct io_ring_ctx *ctx) 1142 + { 1143 
+ bool locked; 1144 + int ret; 1145 + 1146 + if (llist_empty(&ctx->work_llist)) 1147 + return 0; 1148 + 1149 + __set_current_state(TASK_RUNNING); 1150 + locked = mutex_trylock(&ctx->uring_lock); 1151 + ret = __io_run_local_work(ctx, locked); 1152 + if (locked) 1153 + mutex_unlock(&ctx->uring_lock); 1154 + 1155 + return ret; 1154 1156 } 1155 1157 1156 1158 static void io_req_tw_post(struct io_kiocb *req, bool *locked) ··· 1337 1183 struct io_wq_work_node *node, *prev; 1338 1184 struct io_submit_state *state = &ctx->submit_state; 1339 1185 1340 - spin_lock(&ctx->completion_lock); 1186 + io_cq_lock(ctx); 1341 1187 wq_list_for_each(node, prev, &state->compl_reqs) { 1342 1188 struct io_kiocb *req = container_of(node, struct io_kiocb, 1343 1189 comp_list); ··· 1408 1254 int ret = 0; 1409 1255 unsigned long check_cq; 1410 1256 1257 + if (!io_allowed_run_tw(ctx)) 1258 + return -EEXIST; 1259 + 1411 1260 check_cq = READ_ONCE(ctx->check_cq); 1412 1261 if (unlikely(check_cq)) { 1413 1262 if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) ··· 1441 1284 * forever, while the workqueue is stuck trying to acquire the 1442 1285 * very same mutex. 
1443 1286 */ 1444 - if (wq_list_empty(&ctx->iopoll_list)) { 1287 + if (wq_list_empty(&ctx->iopoll_list) || 1288 + io_task_work_pending(ctx)) { 1445 1289 u32 tail = ctx->cached_cq_tail; 1446 1290 1447 - mutex_unlock(&ctx->uring_lock); 1448 - io_run_task_work(); 1449 - mutex_lock(&ctx->uring_lock); 1291 + if (!llist_empty(&ctx->work_llist)) 1292 + __io_run_local_work(ctx, true); 1450 1293 1294 + if (task_work_pending(current) || 1295 + wq_list_empty(&ctx->iopoll_list)) { 1296 + mutex_unlock(&ctx->uring_lock); 1297 + io_run_task_work(); 1298 + mutex_lock(&ctx->uring_lock); 1299 + } 1451 1300 /* some requests don't go through iopoll_list */ 1452 1301 if (tail != ctx->cached_cq_tail || 1453 1302 wq_list_empty(&ctx->iopoll_list)) ··· 1895 1732 io_req_task_queue(req); 1896 1733 break; 1897 1734 case IO_APOLL_ABORTED: 1898 - /* 1899 - * Queued up for async execution, worker will release 1900 - * submit reference when the iocb is actually submitted. 1901 - */ 1902 1735 io_kbuf_recycle(req, 0); 1903 1736 io_queue_iowq(req, NULL); 1904 1737 break; ··· 2308 2149 unsigned nr_timeouts; 2309 2150 }; 2310 2151 2152 + static inline bool io_has_work(struct io_ring_ctx *ctx) 2153 + { 2154 + return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || 2155 + ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && 2156 + !llist_empty(&ctx->work_llist)); 2157 + } 2158 + 2311 2159 static inline bool io_should_wake(struct io_wait_queue *iowq) 2312 2160 { 2313 2161 struct io_ring_ctx *ctx = iowq->ctx; ··· 2333 2167 { 2334 2168 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, 2335 2169 wq); 2170 + struct io_ring_ctx *ctx = iowq->ctx; 2336 2171 2337 2172 /* 2338 2173 * Cannot safely flush overflowed CQEs from here, ensure we wake up 2339 2174 * the task, and the next invocation will do it. 
2340 2175 */ 2341 - if (io_should_wake(iowq) || 2342 - test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq)) 2176 + if (io_should_wake(iowq) || io_has_work(ctx)) 2343 2177 return autoremove_wake_function(curr, mode, wake_flags, key); 2344 2178 return -1; 2345 2179 } 2346 2180 2347 - int io_run_task_work_sig(void) 2181 + int io_run_task_work_sig(struct io_ring_ctx *ctx) 2348 2182 { 2349 - if (io_run_task_work()) 2183 + if (io_run_task_work_ctx(ctx) > 0) 2350 2184 return 1; 2351 2185 if (task_sigpending(current)) 2352 2186 return -EINTR; ··· 2362 2196 unsigned long check_cq; 2363 2197 2364 2198 /* make sure we run task_work before checking for signals */ 2365 - ret = io_run_task_work_sig(); 2199 + ret = io_run_task_work_sig(ctx); 2366 2200 if (ret || io_should_wake(iowq)) 2367 2201 return ret; 2368 2202 ··· 2392 2226 ktime_t timeout = KTIME_MAX; 2393 2227 int ret; 2394 2228 2229 + if (!io_allowed_run_tw(ctx)) 2230 + return -EEXIST; 2231 + 2395 2232 do { 2233 + /* always run at least 1 task work to process local work */ 2234 + ret = io_run_task_work_ctx(ctx); 2235 + if (ret < 0) 2236 + return ret; 2396 2237 io_cqring_overflow_flush(ctx); 2238 + 2397 2239 if (io_cqring_events(ctx) >= min_events) 2398 2240 return 0; 2399 - if (!io_run_task_work()) 2400 - break; 2401 - } while (1); 2241 + } while (ret > 0); 2402 2242 2403 2243 if (sig) { 2404 2244 #ifdef CONFIG_COMPAT ··· 2538 2366 ev_fd->eventfd_async = eventfd_async; 2539 2367 ctx->has_evfd = true; 2540 2368 rcu_assign_pointer(ctx->io_ev_fd, ev_fd); 2369 + atomic_set(&ev_fd->refs, 1); 2370 + atomic_set(&ev_fd->ops, 0); 2541 2371 return 0; 2542 - } 2543 - 2544 - static void io_eventfd_put(struct rcu_head *rcu) 2545 - { 2546 - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); 2547 - 2548 - eventfd_ctx_put(ev_fd->cq_ev_fd); 2549 - kfree(ev_fd); 2550 2372 } 2551 2373 2552 2374 static int io_eventfd_unregister(struct io_ring_ctx *ctx) ··· 2552 2386 if (ev_fd) { 2553 2387 ctx->has_evfd = false; 2554 2388 
rcu_assign_pointer(ctx->io_ev_fd, NULL); 2555 - call_rcu(&ev_fd->rcu, io_eventfd_put); 2389 + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) 2390 + call_rcu(&ev_fd->rcu, io_eventfd_ops); 2556 2391 return 0; 2557 2392 } 2558 2393 ··· 2676 2509 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this 2677 2510 * pushs them to do the flush. 2678 2511 */ 2679 - if (io_cqring_events(ctx) || 2680 - test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) 2512 + 2513 + if (io_cqring_events(ctx) || io_has_work(ctx)) 2681 2514 mask |= EPOLLIN | EPOLLRDNORM; 2682 2515 2683 2516 return mask; ··· 2740 2573 * as nobody else will be looking for them. 2741 2574 */ 2742 2575 do { 2576 + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) 2577 + io_move_task_work_from_local(ctx); 2578 + 2743 2579 while (io_uring_try_cancel_requests(ctx, NULL, true)) 2744 2580 cond_resched(); 2745 2581 ··· 2940 2770 } 2941 2771 } 2942 2772 2773 + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) 2774 + ret |= io_run_local_work(ctx) > 0; 2943 2775 ret |= io_cancel_defer_files(ctx, task, cancel_all); 2944 2776 mutex_lock(&ctx->uring_lock); 2945 2777 ret |= io_poll_remove_all(ctx, task, cancel_all); 2946 2778 mutex_unlock(&ctx->uring_lock); 2947 2779 ret |= io_kill_timeouts(ctx, task, cancel_all); 2948 2780 if (task) 2949 - ret |= io_run_task_work(); 2781 + ret |= io_run_task_work() > 0; 2950 2782 return ret; 2951 2783 } 2952 2784 ··· 3164 2992 struct fd f; 3165 2993 long ret; 3166 2994 3167 - io_run_task_work(); 3168 - 3169 2995 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 3170 2996 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | 3171 2997 IORING_ENTER_REGISTERED_RING))) ··· 3233 3063 goto iopoll_locked; 3234 3064 mutex_unlock(&ctx->uring_lock); 3235 3065 } 3066 + 3236 3067 if (flags & IORING_ENTER_GETEVENTS) { 3237 3068 int ret2; 3069 + 3238 3070 if (ctx->syscall_iopoll) { 3239 3071 /* 3240 3072 * We disallow the app entering submit/complete with ··· 3465 3293 if 
(ctx->flags & IORING_SETUP_SQPOLL) { 3466 3294 /* IPI related flags don't make sense with SQPOLL */ 3467 3295 if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | 3468 - IORING_SETUP_TASKRUN_FLAG)) 3296 + IORING_SETUP_TASKRUN_FLAG | 3297 + IORING_SETUP_DEFER_TASKRUN)) 3469 3298 goto err; 3470 3299 ctx->notify_method = TWA_SIGNAL_NO_IPI; 3471 3300 } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { 3472 3301 ctx->notify_method = TWA_SIGNAL_NO_IPI; 3473 3302 } else { 3474 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 3303 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG && 3304 + !(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 3475 3305 goto err; 3476 3306 ctx->notify_method = TWA_SIGNAL; 3307 + } 3308 + 3309 + /* 3310 + * For DEFER_TASKRUN we require the completion task to be the same as the 3311 + * submission task. This implies that there is only one submitter, so enforce 3312 + * that. 3313 + */ 3314 + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN && 3315 + !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) { 3316 + goto err; 3477 3317 } 3478 3318 3479 3319 /* ··· 3592 3408 IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | 3593 3409 IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | 3594 3410 IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | 3595 - IORING_SETUP_SINGLE_ISSUER)) 3411 + IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN)) 3596 3412 return -EINVAL; 3597 3413 3598 3414 return io_uring_create(entries, &p, params); ··· 4058 3874 4059 3875 ctx = f.file->private_data; 4060 3876 4061 - io_run_task_work(); 3877 + io_run_task_work_ctx(ctx); 4062 3878 4063 3879 mutex_lock(&ctx->uring_lock); 4064 3880 ret = __io_uring_register(ctx, opcode, arg, nr_args);
+51 -11
io_uring/io_uring.h
··· 24 24 IOU_STOP_MULTISHOT = -ECANCELED, 25 25 }; 26 26 27 - struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx); 27 + struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); 28 28 bool io_req_cqe_overflow(struct io_kiocb *req); 29 - int io_run_task_work_sig(void); 29 + int io_run_task_work_sig(struct io_ring_ctx *ctx); 30 + int __io_run_local_work(struct io_ring_ctx *ctx, bool locked); 31 + int io_run_local_work(struct io_ring_ctx *ctx); 30 32 void io_req_complete_failed(struct io_kiocb *req, s32 res); 31 33 void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); 32 34 void io_req_complete_post(struct io_kiocb *req); ··· 93 91 94 92 void io_cq_unlock_post(struct io_ring_ctx *ctx); 95 93 96 - static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) 94 + static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, 95 + bool overflow) 97 96 { 98 97 if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { 99 98 struct io_uring_cqe *cqe = ctx->cqe_cached; ··· 106 103 return cqe; 107 104 } 108 105 109 - return __io_get_cqe(ctx); 106 + return __io_get_cqe(ctx, overflow); 107 + } 108 + 109 + static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) 110 + { 111 + return io_get_cqe_overflow(ctx, false); 110 112 } 111 113 112 114 static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, ··· 229 221 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; 230 222 } 231 223 232 - static inline bool io_run_task_work(void) 224 + static inline int io_run_task_work(void) 233 225 { 234 - if (test_thread_flag(TIF_NOTIFY_SIGNAL)) { 226 + if (task_work_pending(current)) { 227 + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) 228 + clear_notify_signal(); 235 229 __set_current_state(TASK_RUNNING); 236 - clear_notify_signal(); 237 - if (task_work_pending(current)) 238 - task_work_run(); 239 - return true; 230 + task_work_run(); 231 + return 1; 240 232 } 241 233 242 - return false; 234 + return 0; 235 + } 
236 + 237 + static inline bool io_task_work_pending(struct io_ring_ctx *ctx) 238 + { 239 + return test_thread_flag(TIF_NOTIFY_SIGNAL) || 240 + !wq_list_empty(&ctx->work_llist); 241 + } 242 + 243 + static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx) 244 + { 245 + int ret = 0; 246 + int ret2; 247 + 248 + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) 249 + ret = io_run_local_work(ctx); 250 + 251 + /* want to run this after in case more is added */ 252 + ret2 = io_run_task_work(); 253 + 254 + /* Try propagate error in favour of if tasks were run, 255 + * but still make sure to run them if requested 256 + */ 257 + if (ret >= 0) 258 + ret += ret2; 259 + 260 + return ret; 243 261 } 244 262 245 263 static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) ··· 333 299 334 300 node = wq_stack_extract(&ctx->submit_state.free_list); 335 301 return container_of(node, struct io_kiocb, comp_list); 302 + } 303 + 304 + static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) 305 + { 306 + return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) || 307 + ctx->submitter_task == current); 336 308 } 337 309 338 310 #endif
-12
io_uring/kbuf.h
··· 86 86 87 87 static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) 88 88 { 89 - /* 90 - * READV uses fields in `struct io_rw` (len/addr) to stash the selected 91 - * buffer data. However if that buffer is recycled the original request 92 - * data stored in addr is lost. Therefore forbid recycling for now. 93 - */ 94 - if (req->opcode == IORING_OP_READV) { 95 - if ((req->flags & REQ_F_BUFFER_RING) && req->buf_list) { 96 - req->buf_list->head++; 97 - req->buf_list = NULL; 98 - } 99 - return; 100 - } 101 89 if (req->flags & REQ_F_BUFFER_SELECTED) 102 90 io_kbuf_recycle_legacy(req, issue_flags); 103 91 if (req->flags & REQ_F_BUFFER_RING)
+222 -86
io_uring/net.c
··· 55 55 struct user_msghdr __user *umsg; 56 56 void __user *buf; 57 57 }; 58 + unsigned len; 59 + unsigned done_io; 58 60 unsigned msg_flags; 59 - unsigned flags; 60 - size_t len; 61 - size_t done_io; 62 - }; 63 - 64 - struct io_sendzc { 65 - struct file *file; 66 - void __user *buf; 67 - size_t len; 68 - unsigned msg_flags; 69 - unsigned flags; 70 - unsigned addr_len; 61 + u16 flags; 62 + /* initialised and used only by !msg send variants */ 63 + u16 addr_len; 71 64 void __user *addr; 72 - size_t done_io; 65 + /* used only for send zerocopy */ 73 66 struct io_kiocb *notif; 74 67 }; 75 68 ··· 119 126 } 120 127 } 121 128 122 - static struct io_async_msghdr *io_recvmsg_alloc_async(struct io_kiocb *req, 123 - unsigned int issue_flags) 129 + static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req, 130 + unsigned int issue_flags) 124 131 { 125 132 struct io_ring_ctx *ctx = req->ctx; 126 133 struct io_cache_entry *entry; 134 + struct io_async_msghdr *hdr; 127 135 128 136 if (!(issue_flags & IO_URING_F_UNLOCKED) && 129 137 (entry = io_alloc_cache_get(&ctx->netmsg_cache)) != NULL) { 130 - struct io_async_msghdr *hdr; 131 - 132 138 hdr = container_of(entry, struct io_async_msghdr, cache); 139 + hdr->free_iov = NULL; 133 140 req->flags |= REQ_F_ASYNC_DATA; 134 141 req->async_data = hdr; 135 142 return hdr; 136 143 } 137 144 138 - if (!io_alloc_async_data(req)) 139 - return req->async_data; 140 - 145 + if (!io_alloc_async_data(req)) { 146 + hdr = req->async_data; 147 + hdr->free_iov = NULL; 148 + return hdr; 149 + } 141 150 return NULL; 151 + } 152 + 153 + static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req) 154 + { 155 + /* ->prep_async is always called from the submission context */ 156 + return io_msg_alloc_async(req, 0); 142 157 } 143 158 144 159 static int io_setup_async_msg(struct io_kiocb *req, ··· 157 156 158 157 if (req_has_async_data(req)) 159 158 return -EAGAIN; 160 - async_msg = io_recvmsg_alloc_async(req, 
issue_flags); 159 + async_msg = io_msg_alloc_async(req, issue_flags); 161 160 if (!async_msg) { 162 161 kfree(kmsg->free_iov); 163 162 return -ENOMEM; 164 163 } 165 164 req->flags |= REQ_F_NEED_CLEANUP; 166 165 memcpy(async_msg, kmsg, sizeof(*kmsg)); 167 - async_msg->msg.msg_name = &async_msg->addr; 166 + if (async_msg->msg.msg_name) 167 + async_msg->msg.msg_name = &async_msg->addr; 168 168 /* if were using fast_iov, set it to the new one */ 169 - if (!async_msg->free_iov) 170 - async_msg->msg.msg_iter.iov = async_msg->fast_iov; 169 + if (!kmsg->free_iov) { 170 + size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov; 171 + async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx]; 172 + } 171 173 172 174 return -EAGAIN; 173 175 } ··· 186 182 &iomsg->free_iov); 187 183 } 188 184 189 - int io_sendzc_prep_async(struct io_kiocb *req) 185 + int io_send_prep_async(struct io_kiocb *req) 190 186 { 191 - struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc); 187 + struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); 192 188 struct io_async_msghdr *io; 193 189 int ret; 194 190 195 191 if (!zc->addr || req_has_async_data(req)) 196 192 return 0; 197 - if (io_alloc_async_data(req)) 193 + io = io_msg_alloc_async_prep(req); 194 + if (!io) 198 195 return -ENOMEM; 199 - 200 - io = req->async_data; 201 196 ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr); 202 197 return ret; 203 198 } 204 199 205 200 static int io_setup_async_addr(struct io_kiocb *req, 206 - struct sockaddr_storage *addr, 201 + struct sockaddr_storage *addr_storage, 207 202 unsigned int issue_flags) 208 203 { 204 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 209 205 struct io_async_msghdr *io; 210 206 211 - if (!addr || req_has_async_data(req)) 207 + if (!sr->addr || req_has_async_data(req)) 212 208 return -EAGAIN; 213 - if (io_alloc_async_data(req)) 209 + io = io_msg_alloc_async(req, issue_flags); 210 + if (!io) 214 211 return -ENOMEM; 215 - io = 
req->async_data; 216 - memcpy(&io->addr, addr, sizeof(io->addr)); 212 + memcpy(&io->addr, addr_storage, sizeof(io->addr)); 217 213 return -EAGAIN; 218 214 } 219 215 ··· 221 217 { 222 218 int ret; 223 219 220 + if (!io_msg_alloc_async_prep(req)) 221 + return -ENOMEM; 224 222 ret = io_sendmsg_copy_hdr(req, req->async_data); 225 223 if (!ret) 226 224 req->flags |= REQ_F_NEED_CLEANUP; ··· 240 234 { 241 235 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 242 236 243 - if (unlikely(sqe->file_index || sqe->addr2)) 237 + if (req->opcode == IORING_OP_SEND) { 238 + if (READ_ONCE(sqe->__pad3[0])) 239 + return -EINVAL; 240 + sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 241 + sr->addr_len = READ_ONCE(sqe->addr_len); 242 + } else if (sqe->addr2 || sqe->file_index) { 244 243 return -EINVAL; 244 + } 245 245 246 246 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 247 247 sr->len = READ_ONCE(sqe->len); ··· 303 291 if (ret < min_ret) { 304 292 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 305 293 return io_setup_async_msg(req, kmsg, issue_flags); 306 - if (ret == -ERESTARTSYS) 307 - ret = -EINTR; 308 294 if (ret > 0 && io_net_retry(sock, flags)) { 309 295 sr->done_io += ret; 310 296 req->flags |= REQ_F_PARTIAL_IO; 311 297 return io_setup_async_msg(req, kmsg, issue_flags); 312 298 } 299 + if (ret == -ERESTARTSYS) 300 + ret = -EINTR; 313 301 req_set_fail(req); 314 302 } 315 303 /* fast path, check for non-NULL to avoid function call */ ··· 327 315 328 316 int io_send(struct io_kiocb *req, unsigned int issue_flags) 329 317 { 318 + struct sockaddr_storage __address; 330 319 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 331 320 struct msghdr msg; 332 321 struct iovec iov; ··· 336 323 int min_ret = 0; 337 324 int ret; 338 325 326 + msg.msg_name = NULL; 327 + msg.msg_control = NULL; 328 + msg.msg_controllen = 0; 329 + msg.msg_namelen = 0; 330 + msg.msg_ubuf = NULL; 331 + 332 + if (sr->addr) { 333 + if (req_has_async_data(req)) { 334 + 
struct io_async_msghdr *io = req->async_data; 335 + 336 + msg.msg_name = &io->addr; 337 + } else { 338 + ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address); 339 + if (unlikely(ret < 0)) 340 + return ret; 341 + msg.msg_name = (struct sockaddr *)&__address; 342 + } 343 + msg.msg_namelen = sr->addr_len; 344 + } 345 + 339 346 if (!(req->flags & REQ_F_POLLED) && 340 347 (sr->flags & IORING_RECVSEND_POLL_FIRST)) 341 - return -EAGAIN; 348 + return io_setup_async_addr(req, &__address, issue_flags); 342 349 343 350 sock = sock_from_file(req->file); 344 351 if (unlikely(!sock)) ··· 367 334 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); 368 335 if (unlikely(ret)) 369 336 return ret; 370 - 371 - msg.msg_name = NULL; 372 - msg.msg_control = NULL; 373 - msg.msg_controllen = 0; 374 - msg.msg_namelen = 0; 375 - msg.msg_ubuf = NULL; 376 337 377 338 flags = sr->msg_flags; 378 339 if (issue_flags & IO_URING_F_NONBLOCK) ··· 378 351 ret = sock_sendmsg(sock, &msg); 379 352 if (ret < min_ret) { 380 353 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 381 - return -EAGAIN; 382 - if (ret == -ERESTARTSYS) 383 - ret = -EINTR; 354 + return io_setup_async_addr(req, &__address, issue_flags); 355 + 384 356 if (ret > 0 && io_net_retry(sock, flags)) { 385 357 sr->len -= ret; 386 358 sr->buf += ret; 387 359 sr->done_io += ret; 388 360 req->flags |= REQ_F_PARTIAL_IO; 389 - return -EAGAIN; 361 + return io_setup_async_addr(req, &__address, issue_flags); 390 362 } 363 + if (ret == -ERESTARTSYS) 364 + ret = -EINTR; 391 365 req_set_fail(req); 392 366 } 393 367 if (ret >= 0) ··· 482 454 483 455 if (msg.msg_iovlen == 0) { 484 456 sr->len = 0; 485 - iomsg->free_iov = NULL; 486 457 } else if (msg.msg_iovlen > 1) { 487 458 return -EINVAL; 488 459 } else { ··· 492 465 if (clen < 0) 493 466 return -EINVAL; 494 467 sr->len = clen; 495 - iomsg->free_iov = NULL; 496 468 } 497 469 498 470 if (req->flags & REQ_F_APOLL_MULTISHOT) { ··· 530 504 { 531 505 int ret; 532 506 
507 + if (!io_msg_alloc_async_prep(req)) 508 + return -ENOMEM; 533 509 ret = io_recvmsg_copy_hdr(req, req->async_data); 534 510 if (!ret) 535 511 req->flags |= REQ_F_NEED_CLEANUP; ··· 779 751 } 780 752 return ret; 781 753 } 782 - if (ret == -ERESTARTSYS) 783 - ret = -EINTR; 784 754 if (ret > 0 && io_net_retry(sock, flags)) { 785 755 sr->done_io += ret; 786 756 req->flags |= REQ_F_PARTIAL_IO; 787 757 return io_setup_async_msg(req, kmsg, issue_flags); 788 758 } 759 + if (ret == -ERESTARTSYS) 760 + ret = -EINTR; 789 761 req_set_fail(req); 790 762 } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { 791 763 req_set_fail(req); ··· 875 847 876 848 return -EAGAIN; 877 849 } 878 - if (ret == -ERESTARTSYS) 879 - ret = -EINTR; 880 850 if (ret > 0 && io_net_retry(sock, flags)) { 881 851 sr->len -= ret; 882 852 sr->buf += ret; ··· 882 856 req->flags |= REQ_F_PARTIAL_IO; 883 857 return -EAGAIN; 884 858 } 859 + if (ret == -ERESTARTSYS) 860 + ret = -EINTR; 885 861 req_set_fail(req); 886 862 } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { 887 863 out_free: ··· 907 879 return ret; 908 880 } 909 881 910 - void io_sendzc_cleanup(struct io_kiocb *req) 882 + void io_send_zc_cleanup(struct io_kiocb *req) 911 883 { 912 - struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc); 884 + struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); 885 + struct io_async_msghdr *io; 913 886 914 - zc->notif->flags |= REQ_F_CQE_SKIP; 915 - io_notif_flush(zc->notif); 916 - zc->notif = NULL; 887 + if (req_has_async_data(req)) { 888 + io = req->async_data; 889 + /* might be ->fast_iov if *msg_copy_hdr failed */ 890 + if (io->free_iov != io->fast_iov) 891 + kfree(io->free_iov); 892 + } 893 + if (zc->notif) { 894 + io_notif_flush(zc->notif); 895 + zc->notif = NULL; 896 + } 917 897 } 918 898 919 - int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 899 + int io_send_zc_prep(struct io_kiocb *req, const 
struct io_uring_sqe *sqe) 920 900 { 921 - struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc); 901 + struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); 922 902 struct io_ring_ctx *ctx = req->ctx; 923 903 struct io_kiocb *notif; 924 904 925 - if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3) || 926 - READ_ONCE(sqe->__pad3[0])) 905 + if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) 927 906 return -EINVAL; 928 907 /* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ 929 908 if (req->flags & REQ_F_CQE_SKIP) ··· 957 922 io_req_set_rsrc_node(notif, ctx, 0); 958 923 } 959 924 925 + if (req->opcode == IORING_OP_SEND_ZC) { 926 + if (READ_ONCE(sqe->__pad3[0])) 927 + return -EINVAL; 928 + zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 929 + zc->addr_len = READ_ONCE(sqe->addr_len); 930 + } else { 931 + if (unlikely(sqe->addr2 || sqe->file_index)) 932 + return -EINVAL; 933 + if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF)) 934 + return -EINVAL; 935 + } 936 + 960 937 zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 961 938 zc->len = READ_ONCE(sqe->len); 962 939 zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 963 940 if (zc->msg_flags & MSG_DONTWAIT) 964 941 req->flags |= REQ_F_NOWAIT; 965 942 966 - zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 967 - zc->addr_len = READ_ONCE(sqe->addr_len); 968 943 zc->done_io = 0; 969 944 970 945 #ifdef CONFIG_COMPAT ··· 982 937 zc->msg_flags |= MSG_CMSG_COMPAT; 983 938 #endif 984 939 return 0; 940 + } 941 + 942 + static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb, 943 + struct iov_iter *from, size_t length) 944 + { 945 + skb_zcopy_downgrade_managed(skb); 946 + return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); 985 947 } 986 948 987 949 static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, ··· 1001 949 ssize_t copied = 0; 1002 950 unsigned long truesize = 0; 1003 951 1004 - if (!shinfo->nr_frags) 952 + if (!frag) 1005 953 shinfo->flags |= 
SKBFL_MANAGED_FRAG_REFS; 1006 - 1007 - if (!skb_zcopy_managed(skb) || !iov_iter_is_bvec(from)) { 1008 - skb_zcopy_downgrade_managed(skb); 954 + else if (unlikely(!skb_zcopy_managed(skb))) 1009 955 return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); 1010 - } 1011 956 1012 957 bi.bi_size = min(from->count, length); 1013 958 bi.bi_bvec_done = from->iov_offset; ··· 1042 993 return ret; 1043 994 } 1044 995 1045 - int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) 996 + int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) 1046 997 { 1047 - struct sockaddr_storage __address, *addr = NULL; 1048 - struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc); 998 + struct sockaddr_storage __address; 999 + struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); 1049 1000 struct msghdr msg; 1050 1001 struct iovec iov; 1051 1002 struct socket *sock; 1052 - unsigned msg_flags, cflags; 1003 + unsigned msg_flags; 1053 1004 int ret, min_ret = 0; 1054 1005 1055 1006 sock = sock_from_file(req->file); ··· 1065 1016 if (req_has_async_data(req)) { 1066 1017 struct io_async_msghdr *io = req->async_data; 1067 1018 1068 - msg.msg_name = addr = &io->addr; 1019 + msg.msg_name = &io->addr; 1069 1020 } else { 1070 1021 ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address); 1071 1022 if (unlikely(ret < 0)) 1072 1023 return ret; 1073 1024 msg.msg_name = (struct sockaddr *)&__address; 1074 - addr = &__address; 1075 1025 } 1076 1026 msg.msg_namelen = zc->addr_len; 1077 1027 } 1078 1028 1079 1029 if (!(req->flags & REQ_F_POLLED) && 1080 1030 (zc->flags & IORING_RECVSEND_POLL_FIRST)) 1081 - return io_setup_async_addr(req, addr, issue_flags); 1031 + return io_setup_async_addr(req, &__address, issue_flags); 1082 1032 1083 1033 if (zc->flags & IORING_RECVSEND_FIXED_BUF) { 1084 1034 ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu, 1085 1035 (u64)(uintptr_t)zc->buf, zc->len); 1086 1036 if (unlikely(ret)) 1087 1037 return ret; 1038 + msg.sg_from_iter = 
io_sg_from_iter; 1088 1039 } else { 1089 1040 ret = import_single_range(WRITE, zc->buf, zc->len, &iov, 1090 1041 &msg.msg_iter); ··· 1093 1044 ret = io_notif_account_mem(zc->notif, zc->len); 1094 1045 if (unlikely(ret)) 1095 1046 return ret; 1047 + msg.sg_from_iter = io_sg_from_iter_iovec; 1096 1048 } 1097 1049 1098 1050 msg_flags = zc->msg_flags | MSG_ZEROCOPY; ··· 1104 1054 1105 1055 msg.msg_flags = msg_flags; 1106 1056 msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; 1107 - msg.sg_from_iter = io_sg_from_iter; 1108 1057 ret = sock_sendmsg(sock, &msg); 1109 1058 1110 1059 if (unlikely(ret < min_ret)) { 1111 1060 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 1112 - return io_setup_async_addr(req, addr, issue_flags); 1061 + return io_setup_async_addr(req, &__address, issue_flags); 1113 1062 1114 1063 if (ret > 0 && io_net_retry(sock, msg.msg_flags)) { 1115 1064 zc->len -= ret; 1116 1065 zc->buf += ret; 1117 1066 zc->done_io += ret; 1118 1067 req->flags |= REQ_F_PARTIAL_IO; 1119 - return io_setup_async_addr(req, addr, issue_flags); 1068 + return io_setup_async_addr(req, &__address, issue_flags); 1120 1069 } 1121 - if (ret < 0 && !zc->done_io) 1122 - zc->notif->flags |= REQ_F_CQE_SKIP; 1123 1070 if (ret == -ERESTARTSYS) 1124 1071 ret = -EINTR; 1125 1072 req_set_fail(req); ··· 1127 1080 else if (zc->done_io) 1128 1081 ret = zc->done_io; 1129 1082 1130 - io_notif_flush(zc->notif); 1131 - req->flags &= ~REQ_F_NEED_CLEANUP; 1132 - cflags = ret >= 0 ? 
IORING_CQE_F_MORE : 0; 1133 - io_req_set_res(req, ret, cflags); 1083 + /* 1084 + * If we're in io-wq we can't rely on tw ordering guarantees, defer 1085 + * flushing notif to io_send_zc_cleanup() 1086 + */ 1087 + if (!(issue_flags & IO_URING_F_UNLOCKED)) { 1088 + io_notif_flush(zc->notif); 1089 + req->flags &= ~REQ_F_NEED_CLEANUP; 1090 + } 1091 + io_req_set_res(req, ret, IORING_CQE_F_MORE); 1134 1092 return IOU_OK; 1093 + } 1094 + 1095 + int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) 1096 + { 1097 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1098 + struct io_async_msghdr iomsg, *kmsg; 1099 + struct socket *sock; 1100 + unsigned flags; 1101 + int ret, min_ret = 0; 1102 + 1103 + sock = sock_from_file(req->file); 1104 + if (unlikely(!sock)) 1105 + return -ENOTSOCK; 1106 + 1107 + if (req_has_async_data(req)) { 1108 + kmsg = req->async_data; 1109 + } else { 1110 + ret = io_sendmsg_copy_hdr(req, &iomsg); 1111 + if (ret) 1112 + return ret; 1113 + kmsg = &iomsg; 1114 + } 1115 + 1116 + if (!(req->flags & REQ_F_POLLED) && 1117 + (sr->flags & IORING_RECVSEND_POLL_FIRST)) 1118 + return io_setup_async_msg(req, kmsg, issue_flags); 1119 + 1120 + flags = sr->msg_flags | MSG_ZEROCOPY; 1121 + if (issue_flags & IO_URING_F_NONBLOCK) 1122 + flags |= MSG_DONTWAIT; 1123 + if (flags & MSG_WAITALL) 1124 + min_ret = iov_iter_count(&kmsg->msg.msg_iter); 1125 + 1126 + kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; 1127 + kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; 1128 + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 1129 + 1130 + if (unlikely(ret < min_ret)) { 1131 + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 1132 + return io_setup_async_msg(req, kmsg, issue_flags); 1133 + 1134 + if (ret > 0 && io_net_retry(sock, flags)) { 1135 + sr->done_io += ret; 1136 + req->flags |= REQ_F_PARTIAL_IO; 1137 + return io_setup_async_msg(req, kmsg, issue_flags); 1138 + } 1139 + if (ret == -ERESTARTSYS) 1140 + ret = -EINTR; 1141 + 
req_set_fail(req); 1142 + } 1143 + /* fast path, check for non-NULL to avoid function call */ 1144 + if (kmsg->free_iov) { 1145 + kfree(kmsg->free_iov); 1146 + kmsg->free_iov = NULL; 1147 + } 1148 + 1149 + io_netmsg_recycle(req, issue_flags); 1150 + if (ret >= 0) 1151 + ret += sr->done_io; 1152 + else if (sr->done_io) 1153 + ret = sr->done_io; 1154 + 1155 + /* 1156 + * If we're in io-wq we can't rely on tw ordering guarantees, defer 1157 + * flushing notif to io_send_zc_cleanup() 1158 + */ 1159 + if (!(issue_flags & IO_URING_F_UNLOCKED)) { 1160 + io_notif_flush(sr->notif); 1161 + req->flags &= ~REQ_F_NEED_CLEANUP; 1162 + } 1163 + io_req_set_res(req, ret, IORING_CQE_F_MORE); 1164 + return IOU_OK; 1165 + } 1166 + 1167 + void io_sendrecv_fail(struct io_kiocb *req) 1168 + { 1169 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1170 + 1171 + if (req->flags & REQ_F_PARTIAL_IO) 1172 + req->cqe.res = sr->done_io; 1173 + 1174 + if ((req->flags & REQ_F_NEED_CLEANUP) && 1175 + (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC)) 1176 + req->cqe.flags |= IORING_CQE_F_MORE; 1135 1177 } 1136 1178 1137 1179 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+8 -4
io_uring/net.h
··· 31 31 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 32 32 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags); 33 33 34 - int io_sendzc_prep_async(struct io_kiocb *req); 35 34 int io_sendmsg_prep_async(struct io_kiocb *req); 36 35 void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req); 37 36 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 38 37 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags); 38 + 39 39 int io_send(struct io_kiocb *req, unsigned int issue_flags); 40 + int io_send_prep_async(struct io_kiocb *req); 40 41 41 42 int io_recvmsg_prep_async(struct io_kiocb *req); 42 43 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 43 44 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags); 44 45 int io_recv(struct io_kiocb *req, unsigned int issue_flags); 46 + 47 + void io_sendrecv_fail(struct io_kiocb *req); 45 48 46 49 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 47 50 int io_accept(struct io_kiocb *req, unsigned int issue_flags); ··· 56 53 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 57 54 int io_connect(struct io_kiocb *req, unsigned int issue_flags); 58 55 59 - int io_sendzc(struct io_kiocb *req, unsigned int issue_flags); 60 - int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 61 - void io_sendzc_cleanup(struct io_kiocb *req); 56 + int io_send_zc(struct io_kiocb *req, unsigned int issue_flags); 57 + int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); 58 + int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 59 + void io_send_zc_cleanup(struct io_kiocb *req); 62 60 63 61 void io_netmsg_cache_free(struct io_cache_entry *entry); 64 62 #else
+40 -4
io_uring/opdef.c
··· 69 69 .issue = io_read, 70 70 .prep_async = io_readv_prep_async, 71 71 .cleanup = io_readv_writev_cleanup, 72 + .fail = io_rw_fail, 72 73 }, 73 74 [IORING_OP_WRITEV] = { 74 75 .needs_file = 1, ··· 86 85 .issue = io_write, 87 86 .prep_async = io_writev_prep_async, 88 87 .cleanup = io_readv_writev_cleanup, 88 + .fail = io_rw_fail, 89 89 }, 90 90 [IORING_OP_FSYNC] = { 91 91 .needs_file = 1, ··· 107 105 .name = "READ_FIXED", 108 106 .prep = io_prep_rw, 109 107 .issue = io_read, 108 + .fail = io_rw_fail, 110 109 }, 111 110 [IORING_OP_WRITE_FIXED] = { 112 111 .needs_file = 1, ··· 122 119 .name = "WRITE_FIXED", 123 120 .prep = io_prep_rw, 124 121 .issue = io_write, 122 + .fail = io_rw_fail, 125 123 }, 126 124 [IORING_OP_POLL_ADD] = { 127 125 .needs_file = 1, ··· 150 146 .unbound_nonreg_file = 1, 151 147 .pollout = 1, 152 148 .ioprio = 1, 149 + .manual_alloc = 1, 153 150 .name = "SENDMSG", 154 151 #if defined(CONFIG_NET) 155 152 .async_size = sizeof(struct io_async_msghdr), ··· 158 153 .issue = io_sendmsg, 159 154 .prep_async = io_sendmsg_prep_async, 160 155 .cleanup = io_sendmsg_recvmsg_cleanup, 156 + .fail = io_sendrecv_fail, 161 157 #else 162 158 .prep = io_eopnotsupp_prep, 163 159 #endif ··· 169 163 .pollin = 1, 170 164 .buffer_select = 1, 171 165 .ioprio = 1, 166 + .manual_alloc = 1, 172 167 .name = "RECVMSG", 173 168 #if defined(CONFIG_NET) 174 169 .async_size = sizeof(struct io_async_msghdr), ··· 177 170 .issue = io_recvmsg, 178 171 .prep_async = io_recvmsg_prep_async, 179 172 .cleanup = io_sendmsg_recvmsg_cleanup, 173 + .fail = io_sendrecv_fail, 180 174 #else 181 175 .prep = io_eopnotsupp_prep, 182 176 #endif ··· 281 273 .name = "READ", 282 274 .prep = io_prep_rw, 283 275 .issue = io_read, 276 + .fail = io_rw_fail, 284 277 }, 285 278 [IORING_OP_WRITE] = { 286 279 .needs_file = 1, ··· 296 287 .name = "WRITE", 297 288 .prep = io_prep_rw, 298 289 .issue = io_write, 290 + .fail = io_rw_fail, 299 291 }, 300 292 [IORING_OP_FADVISE] = { 301 293 .needs_file = 1, ··· 
316 306 .pollout = 1, 317 307 .audit_skip = 1, 318 308 .ioprio = 1, 309 + .manual_alloc = 1, 319 310 .name = "SEND", 320 311 #if defined(CONFIG_NET) 312 + .async_size = sizeof(struct io_async_msghdr), 321 313 .prep = io_sendmsg_prep, 322 314 .issue = io_send, 315 + .fail = io_sendrecv_fail, 316 + .prep_async = io_send_prep_async, 323 317 #else 324 318 .prep = io_eopnotsupp_prep, 325 319 #endif ··· 339 325 #if defined(CONFIG_NET) 340 326 .prep = io_recvmsg_prep, 341 327 .issue = io_recv, 328 + .fail = io_sendrecv_fail, 342 329 #else 343 330 .prep = io_eopnotsupp_prep, 344 331 #endif ··· 480 465 .needs_file = 1, 481 466 .plug = 1, 482 467 .name = "URING_CMD", 468 + .iopoll = 1, 483 469 .async_size = uring_cmd_pdu_size(1), 484 470 .prep = io_uring_cmd_prep, 485 471 .issue = io_uring_cmd, ··· 496 480 .manual_alloc = 1, 497 481 #if defined(CONFIG_NET) 498 482 .async_size = sizeof(struct io_async_msghdr), 499 - .prep = io_sendzc_prep, 500 - .issue = io_sendzc, 501 - .prep_async = io_sendzc_prep_async, 502 - .cleanup = io_sendzc_cleanup, 483 + .prep = io_send_zc_prep, 484 + .issue = io_send_zc, 485 + .prep_async = io_send_prep_async, 486 + .cleanup = io_send_zc_cleanup, 487 + .fail = io_sendrecv_fail, 488 + #else 489 + .prep = io_eopnotsupp_prep, 490 + #endif 491 + }, 492 + [IORING_OP_SENDMSG_ZC] = { 493 + .name = "SENDMSG_ZC", 494 + .needs_file = 1, 495 + .unbound_nonreg_file = 1, 496 + .pollout = 1, 497 + .audit_skip = 1, 498 + .ioprio = 1, 499 + .manual_alloc = 1, 500 + #if defined(CONFIG_NET) 501 + .async_size = sizeof(struct io_async_msghdr), 502 + .prep = io_send_zc_prep, 503 + .issue = io_sendmsg_zc, 504 + .prep_async = io_sendmsg_prep_async, 505 + .cleanup = io_send_zc_cleanup, 506 + .fail = io_sendrecv_fail, 503 507 #else 504 508 .prep = io_eopnotsupp_prep, 505 509 #endif
+1
io_uring/opdef.h
··· 36 36 int (*issue)(struct io_kiocb *, unsigned int); 37 37 int (*prep_async)(struct io_kiocb *); 38 38 void (*cleanup)(struct io_kiocb *); 39 + void (*fail)(struct io_kiocb *); 39 40 }; 40 41 41 42 extern const struct io_op_def io_op_defs[];
+1 -1
io_uring/rsrc.c
··· 341 341 flush_delayed_work(&ctx->rsrc_put_work); 342 342 reinit_completion(&data->done); 343 343 344 - ret = io_run_task_work_sig(); 344 + ret = io_run_task_work_sig(ctx); 345 345 mutex_lock(&ctx->uring_lock); 346 346 } while (ret >= 0); 347 347 data->quiesce = false;
+93 -96
io_uring/rw.c
··· 33 33 return req->flags & REQ_F_SUPPORT_NOWAIT; 34 34 } 35 35 36 + #ifdef CONFIG_COMPAT 37 + static int io_iov_compat_buffer_select_prep(struct io_rw *rw) 38 + { 39 + struct compat_iovec __user *uiov; 40 + compat_ssize_t clen; 41 + 42 + uiov = u64_to_user_ptr(rw->addr); 43 + if (!access_ok(uiov, sizeof(*uiov))) 44 + return -EFAULT; 45 + if (__get_user(clen, &uiov->iov_len)) 46 + return -EFAULT; 47 + if (clen < 0) 48 + return -EINVAL; 49 + 50 + rw->len = clen; 51 + return 0; 52 + } 53 + #endif 54 + 55 + static int io_iov_buffer_select_prep(struct io_kiocb *req) 56 + { 57 + struct iovec __user *uiov; 58 + struct iovec iov; 59 + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 60 + 61 + if (rw->len != 1) 62 + return -EINVAL; 63 + 64 + #ifdef CONFIG_COMPAT 65 + if (req->ctx->compat) 66 + return io_iov_compat_buffer_select_prep(rw); 67 + #endif 68 + 69 + uiov = u64_to_user_ptr(rw->addr); 70 + if (copy_from_user(&iov, uiov, sizeof(*uiov))) 71 + return -EFAULT; 72 + rw->len = iov.iov_len; 73 + return 0; 74 + } 75 + 36 76 int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) 37 77 { 38 78 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); ··· 109 69 rw->addr = READ_ONCE(sqe->addr); 110 70 rw->len = READ_ONCE(sqe->len); 111 71 rw->flags = READ_ONCE(sqe->rw_flags); 72 + 73 + /* Have to do this validation here, as this is in io_read() rw->len might 74 + * have chanaged due to buffer selection 75 + */ 76 + if (req->opcode == IORING_OP_READV && req->flags & REQ_F_BUFFER_SELECT) { 77 + ret = io_iov_buffer_select_prep(req); 78 + if (ret) 79 + return ret; 80 + } 81 + 112 82 return 0; 113 83 } 114 84 ··· 236 186 237 187 static bool __io_complete_rw_common(struct io_kiocb *req, long res) 238 188 { 239 - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 240 - 241 - if (rw->kiocb.ki_flags & IOCB_WRITE) { 242 - kiocb_end_write(req); 243 - fsnotify_modify(req->file); 244 - } else { 245 - fsnotify_access(req->file); 246 - } 247 189 if (unlikely(res != 
req->cqe.res)) { 248 190 if ((res == -EAGAIN || res == -EOPNOTSUPP) && 249 191 io_rw_should_reissue(req)) { ··· 262 220 return res; 263 221 } 264 222 223 + static void io_req_rw_complete(struct io_kiocb *req, bool *locked) 224 + { 225 + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 226 + 227 + if (rw->kiocb.ki_flags & IOCB_WRITE) { 228 + kiocb_end_write(req); 229 + fsnotify_modify(req->file); 230 + } else { 231 + fsnotify_access(req->file); 232 + } 233 + 234 + io_req_task_complete(req, locked); 235 + } 236 + 265 237 static void io_complete_rw(struct kiocb *kiocb, long res) 266 238 { 267 239 struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); ··· 284 228 if (__io_complete_rw_common(req, res)) 285 229 return; 286 230 io_req_set_res(req, io_fixup_rw_res(req, res), 0); 287 - req->io_task_work.func = io_req_task_complete; 231 + req->io_task_work.func = io_req_rw_complete; 288 232 io_req_task_work_add(req); 289 233 } 290 234 ··· 335 279 return IOU_ISSUE_SKIP_COMPLETE; 336 280 } 337 281 338 - #ifdef CONFIG_COMPAT 339 - static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, 340 - unsigned int issue_flags) 341 - { 342 - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 343 - struct compat_iovec __user *uiov; 344 - compat_ssize_t clen; 345 - void __user *buf; 346 - size_t len; 347 - 348 - uiov = u64_to_user_ptr(rw->addr); 349 - if (!access_ok(uiov, sizeof(*uiov))) 350 - return -EFAULT; 351 - if (__get_user(clen, &uiov->iov_len)) 352 - return -EFAULT; 353 - if (clen < 0) 354 - return -EINVAL; 355 - 356 - len = clen; 357 - buf = io_buffer_select(req, &len, issue_flags); 358 - if (!buf) 359 - return -ENOBUFS; 360 - rw->addr = (unsigned long) buf; 361 - iov[0].iov_base = buf; 362 - rw->len = iov[0].iov_len = (compat_size_t) len; 363 - return 0; 364 - } 365 - #endif 366 - 367 - static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 368 - unsigned int issue_flags) 369 - { 370 - struct io_rw *rw = io_kiocb_to_cmd(req, 
struct io_rw); 371 - struct iovec __user *uiov = u64_to_user_ptr(rw->addr); 372 - void __user *buf; 373 - ssize_t len; 374 - 375 - if (copy_from_user(iov, uiov, sizeof(*uiov))) 376 - return -EFAULT; 377 - 378 - len = iov[0].iov_len; 379 - if (len < 0) 380 - return -EINVAL; 381 - buf = io_buffer_select(req, &len, issue_flags); 382 - if (!buf) 383 - return -ENOBUFS; 384 - rw->addr = (unsigned long) buf; 385 - iov[0].iov_base = buf; 386 - rw->len = iov[0].iov_len = len; 387 - return 0; 388 - } 389 - 390 - static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 391 - unsigned int issue_flags) 392 - { 393 - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 394 - 395 - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { 396 - iov[0].iov_base = u64_to_user_ptr(rw->addr); 397 - iov[0].iov_len = rw->len; 398 - return 0; 399 - } 400 - if (rw->len != 1) 401 - return -EINVAL; 402 - 403 - #ifdef CONFIG_COMPAT 404 - if (req->ctx->compat) 405 - return io_compat_import(req, iov, issue_flags); 406 - #endif 407 - 408 - return __io_iov_buffer_select(req, iov, issue_flags); 409 - } 410 - 411 282 static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, 412 283 struct io_rw_state *s, 413 284 unsigned int issue_flags) ··· 357 374 buf = u64_to_user_ptr(rw->addr); 358 375 sqe_len = rw->len; 359 376 360 - if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { 377 + if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE || 378 + (req->flags & REQ_F_BUFFER_SELECT)) { 361 379 if (io_do_buffer_select(req)) { 362 380 buf = io_buffer_select(req, &sqe_len, issue_flags); 363 381 if (!buf) ··· 374 390 } 375 391 376 392 iovec = s->fast_iov; 377 - if (req->flags & REQ_F_BUFFER_SELECT) { 378 - ret = io_iov_buffer_select(req, iovec, issue_flags); 379 - if (ret) 380 - return ERR_PTR(ret); 381 - iov_iter_init(iter, ddir, iovec, 1, iovec->iov_len); 382 - return NULL; 383 - } 384 - 385 393 ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, 
&iovec, iter, 386 394 req->ctx->compat); 387 395 if (unlikely(ret < 0)) ··· 770 794 iov_iter_restore(&s->iter, &s->iter_state); 771 795 772 796 ret2 = io_setup_async_rw(req, iovec, s, true); 773 - if (ret2) 774 - return ret2; 775 - 776 797 iovec = NULL; 798 + if (ret2) { 799 + ret = ret > 0 ? ret : ret2; 800 + goto done; 801 + } 802 + 777 803 io = req->async_data; 778 804 s = &io->s; 779 805 /* ··· 801 823 return -EAGAIN; 802 824 } 803 825 826 + req->cqe.res = iov_iter_count(&s->iter); 804 827 /* 805 828 * Now retry read with the IOCB_WAITQ parts set in the iocb. If 806 829 * we get -EIOCBQUEUED, then we'll get a notification when the ··· 963 984 io_cqring_wake(ctx); 964 985 } 965 986 987 + void io_rw_fail(struct io_kiocb *req) 988 + { 989 + int res; 990 + 991 + res = io_fixup_rw_res(req, req->cqe.res); 992 + io_req_set_res(req, res, req->cqe.flags); 993 + } 994 + 966 995 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) 967 996 { 968 997 struct io_wq_work_node *pos, *start, *prev; ··· 987 1000 988 1001 wq_list_for_each(pos, start, &ctx->iopoll_list) { 989 1002 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); 990 - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 1003 + struct file *file = req->file; 991 1004 int ret; 992 1005 993 1006 /* ··· 998 1011 if (READ_ONCE(req->iopoll_completed)) 999 1012 break; 1000 1013 1001 - ret = rw->kiocb.ki_filp->f_op->iopoll(&rw->kiocb, &iob, poll_flags); 1014 + if (req->opcode == IORING_OP_URING_CMD) { 1015 + struct io_uring_cmd *ioucmd; 1016 + 1017 + ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); 1018 + ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob, 1019 + poll_flags); 1020 + } else { 1021 + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 1022 + 1023 + ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags); 1024 + } 1002 1025 if (unlikely(ret < 0)) 1003 1026 return ret; 1004 1027 else if (ret)
+1
io_uring/rw.h
··· 21 21 int io_write(struct io_kiocb *req, unsigned int issue_flags); 22 22 int io_writev_prep_async(struct io_kiocb *req); 23 23 void io_readv_writev_cleanup(struct io_kiocb *req); 24 + void io_rw_fail(struct io_kiocb *req);
+3 -10
io_uring/timeout.c
··· 149 149 nxt->link = NULL; 150 150 } 151 151 152 - bool io_disarm_next(struct io_kiocb *req) 152 + void io_disarm_next(struct io_kiocb *req) 153 153 __must_hold(&req->ctx->completion_lock) 154 154 { 155 155 struct io_kiocb *link = NULL; 156 - bool posted = false; 157 156 158 157 if (req->flags & REQ_F_ARM_LTIMEOUT) { 159 158 link = req->link; ··· 160 161 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { 161 162 io_remove_next_linked(req); 162 163 io_req_tw_post_queue(link, -ECANCELED, 0); 163 - posted = true; 164 164 } 165 165 } else if (req->flags & REQ_F_LINK_TIMEOUT) { 166 166 struct io_ring_ctx *ctx = req->ctx; ··· 167 169 spin_lock_irq(&ctx->timeout_lock); 168 170 link = io_disarm_linked_timeout(req); 169 171 spin_unlock_irq(&ctx->timeout_lock); 170 - if (link) { 171 - posted = true; 172 + if (link) 172 173 io_req_tw_post_queue(link, -ECANCELED, 0); 173 - } 174 174 } 175 175 if (unlikely((req->flags & REQ_F_FAIL) && 176 - !(req->flags & REQ_F_HARDLINK))) { 177 - posted |= (req->link != NULL); 176 + !(req->flags & REQ_F_HARDLINK))) 178 177 io_fail_links(req); 179 - } 180 - return posted; 181 178 } 182 179 183 180 struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
+1 -1
io_uring/timeout.h
··· 27 27 __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, 28 28 bool cancel_all); 29 29 void io_queue_linked_timeout(struct io_kiocb *req); 30 - bool io_disarm_next(struct io_kiocb *req); 30 + void io_disarm_next(struct io_kiocb *req); 31 31 32 32 int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 33 33 int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+9 -2
io_uring/uring_cmd.c
··· 50 50 io_req_set_res(req, ret, 0); 51 51 if (req->ctx->flags & IORING_SETUP_CQE32) 52 52 io_req_set_cqe32_extra(req, res2, 0); 53 - __io_req_complete(req, 0); 53 + if (req->ctx->flags & IORING_SETUP_IOPOLL) 54 + /* order with io_iopoll_req_issued() checking ->iopoll_complete */ 55 + smp_store_release(&req->iopoll_completed, 1); 56 + else 57 + __io_req_complete(req, 0); 54 58 } 55 59 EXPORT_SYMBOL_GPL(io_uring_cmd_done); 56 60 ··· 101 97 issue_flags |= IO_URING_F_SQE128; 102 98 if (ctx->flags & IORING_SETUP_CQE32) 103 99 issue_flags |= IO_URING_F_CQE32; 104 - if (ctx->flags & IORING_SETUP_IOPOLL) 100 + if (ctx->flags & IORING_SETUP_IOPOLL) { 105 101 issue_flags |= IO_URING_F_IOPOLL; 102 + req->iopoll_completed = 0; 103 + WRITE_ONCE(ioucmd->cookie, NULL); 104 + } 106 105 107 106 if (req_has_async_data(req)) 108 107 ioucmd->cmd = req->async_data;
+13 -9
tools/testing/selftests/net/io_uring_zerocopy_tx.c
··· 400 400 cfg_payload_len, msg_flags); 401 401 sqe->user_data = NONZC_TAG; 402 402 } else { 403 - compl_cqes++; 404 403 io_uring_prep_sendzc(sqe, fd, payload, 405 404 cfg_payload_len, 406 405 msg_flags, zc_flags); ··· 429 430 if (cqe->flags & IORING_CQE_F_NOTIF) { 430 431 if (cqe->flags & IORING_CQE_F_MORE) 431 432 error(1, -EINVAL, "invalid notif flags"); 433 + if (compl_cqes <= 0) 434 + error(1, -EINVAL, "notification mismatch"); 432 435 compl_cqes--; 433 436 i--; 434 - } else if (cqe->res <= 0) { 435 - if (cqe->flags & IORING_CQE_F_MORE) 436 - error(1, cqe->res, "more with a failed send"); 437 - error(1, cqe->res, "send failed"); 438 - } else { 439 - if (cqe->user_data == ZC_TAG && 440 - !(cqe->flags & IORING_CQE_F_MORE)) 441 - error(1, cqe->res, "missing more flag"); 437 + io_uring_cqe_seen(&ring); 438 + continue; 439 + } 440 + if (cqe->flags & IORING_CQE_F_MORE) { 441 + if (cqe->user_data != ZC_TAG) 442 + error(1, cqe->res, "unexpected F_MORE"); 443 + compl_cqes++; 444 + } 445 + if (cqe->res >= 0) { 442 446 packets++; 443 447 bytes += cqe->res; 448 + } else if (cqe->res != -EAGAIN) { 449 + error(1, cqe->res, "send failed"); 444 450 } 445 451 io_uring_cqe_seen(&ring); 446 452 }