Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
"Here are the io_uring changes for this merge window. Light on new
features this time around (just splice + buffer selection), lots of
cleanups, fixes, and improvements to existing support. In particular,
this contains:

- Cleanup fixed file update handling for stack fallback (Hillf)

- Re-work of how pollable async IO is handled, we no longer require
thread offload to handle that. Instead we rely on poll to drive
this, with task_work execution.

- In conjunction with the above, allow expendable buffer selection,
so that poll+recv (for example) no longer has to be a split
operation.

- Make sure we honor RLIMIT_FSIZE for buffered writes

- Add support for splice (Pavel)

- Linked work inheritance fixes and optimizations (Pavel)

- Async work fixes and cleanups (Pavel)

- Improve io-wq locking (Pavel)

- Hashed link write improvements (Pavel)

- SETUP_IOPOLL|SETUP_SQPOLL improvements (Xiaoguang)"

* tag 'for-5.7/io_uring-2020-03-29' of git://git.kernel.dk/linux-block: (54 commits)
io_uring: cleanup io_alloc_async_ctx()
io_uring: fix missing 'return' in comment
io-wq: handle hashed writes in chains
io-uring: drop 'free_pfile' in struct io_file_put
io-uring: drop completion when removing file
io_uring: Fix ->data corruption on re-enqueue
io-wq: close cancel gap for hashed linked work
io_uring: make spdxcheck.py happy
io_uring: honor original task RLIMIT_FSIZE
io-wq: hash dependent work
io-wq: split hashing and enqueueing
io-wq: don't resched if there is no work
io-wq: remove duplicated cancel code
io_uring: fix truncated async read/readv and write/writev retry
io_uring: dual license io_uring.h uapi header
io_uring: io_uring_enter(2) don't poll while SETUP_IOPOLL|SETUP_SQPOLL enabled
io_uring: Fix unused function warnings
io_uring: add end-of-bits marker and build time verify it
io_uring: provide means of removing buffers
io_uring: add IOSQE_BUFFER_SELECT support for IORING_OP_RECVMSG
...

+1834 -864
+164 -208
fs/io-wq.c
··· 69 69 #define IO_WQ_HASH_ORDER 5 70 70 #endif 71 71 72 + #define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) 73 + 72 74 struct io_wqe_acct { 73 75 unsigned nr_workers; 74 76 unsigned max_workers; ··· 100 98 struct list_head all_list; 101 99 102 100 struct io_wq *wq; 101 + struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; 103 102 }; 104 103 105 104 /* ··· 110 107 struct io_wqe **wqes; 111 108 unsigned long state; 112 109 113 - get_work_fn *get_work; 114 - put_work_fn *put_work; 110 + free_work_fn *free_work; 115 111 116 112 struct task_struct *manager; 117 113 struct user_struct *user; ··· 378 376 return __io_worker_unuse(wqe, worker); 379 377 } 380 378 381 - static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) 379 + static inline unsigned int io_get_work_hash(struct io_wq_work *work) 380 + { 381 + return work->flags >> IO_WQ_HASH_SHIFT; 382 + } 383 + 384 + static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) 382 385 __must_hold(wqe->lock) 383 386 { 384 387 struct io_wq_work_node *node, *prev; 385 - struct io_wq_work *work; 388 + struct io_wq_work *work, *tail; 389 + unsigned int hash; 386 390 387 391 wq_list_for_each(node, prev, &wqe->work_list) { 388 392 work = container_of(node, struct io_wq_work, list); 389 393 390 394 /* not hashed, can run anytime */ 391 - if (!(work->flags & IO_WQ_WORK_HASHED)) { 392 - wq_node_del(&wqe->work_list, node, prev); 395 + if (!io_wq_is_hashed(work)) { 396 + wq_list_del(&wqe->work_list, node, prev); 393 397 return work; 394 398 } 395 399 396 400 /* hashed, can run if not already running */ 397 - *hash = work->flags >> IO_WQ_HASH_SHIFT; 398 - if (!(wqe->hash_map & BIT_ULL(*hash))) { 399 - wqe->hash_map |= BIT_ULL(*hash); 400 - wq_node_del(&wqe->work_list, node, prev); 401 + hash = io_get_work_hash(work); 402 + if (!(wqe->hash_map & BIT(hash))) { 403 + wqe->hash_map |= BIT(hash); 404 + /* all items with this hash lie in [work, tail] */ 405 + tail = wqe->hash_tail[hash]; 406 + 
wqe->hash_tail[hash] = NULL; 407 + wq_list_cut(&wqe->work_list, &tail->list, prev); 401 408 return work; 402 409 } 403 410 } ··· 451 440 worker->saved_creds = old_creds; 452 441 } 453 442 443 + static void io_impersonate_work(struct io_worker *worker, 444 + struct io_wq_work *work) 445 + { 446 + if (work->files && current->files != work->files) { 447 + task_lock(current); 448 + current->files = work->files; 449 + task_unlock(current); 450 + } 451 + if (work->fs && current->fs != work->fs) 452 + current->fs = work->fs; 453 + if (work->mm != worker->mm) 454 + io_wq_switch_mm(worker, work); 455 + if (worker->cur_creds != work->creds) 456 + io_wq_switch_creds(worker, work); 457 + } 458 + 459 + static void io_assign_current_work(struct io_worker *worker, 460 + struct io_wq_work *work) 461 + { 462 + if (work) { 463 + /* flush pending signals before assigning new work */ 464 + if (signal_pending(current)) 465 + flush_signals(current); 466 + cond_resched(); 467 + } 468 + 469 + spin_lock_irq(&worker->lock); 470 + worker->cur_work = work; 471 + spin_unlock_irq(&worker->lock); 472 + } 473 + 474 + static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); 475 + 454 476 static void io_worker_handle_work(struct io_worker *worker) 455 477 __releases(wqe->lock) 456 478 { 457 - struct io_wq_work *work, *old_work = NULL, *put_work = NULL; 458 479 struct io_wqe *wqe = worker->wqe; 459 480 struct io_wq *wq = wqe->wq; 460 481 461 482 do { 462 - unsigned hash = -1U; 463 - 483 + struct io_wq_work *work; 484 + unsigned int hash; 485 + get_next: 464 486 /* 465 487 * If we got some work, mark us as busy. If we didn't, but 466 488 * the list isn't empty, it means we stalled on hashed work. ··· 501 457 * can't make progress, any work completion or insertion will 502 458 * clear the stalled flag. 
503 459 */ 504 - work = io_get_next_work(wqe, &hash); 460 + work = io_get_next_work(wqe); 505 461 if (work) 506 462 __io_worker_busy(wqe, worker, work); 507 463 else if (!wq_list_empty(&wqe->work_list)) 508 464 wqe->flags |= IO_WQE_FLAG_STALLED; 509 465 510 466 spin_unlock_irq(&wqe->lock); 511 - if (put_work && wq->put_work) 512 - wq->put_work(old_work); 513 467 if (!work) 514 468 break; 515 - next: 516 - /* flush any pending signals before assigning new work */ 517 - if (signal_pending(current)) 518 - flush_signals(current); 469 + io_assign_current_work(worker, work); 519 470 520 - cond_resched(); 471 + /* handle a whole dependent link */ 472 + do { 473 + struct io_wq_work *old_work, *next_hashed, *linked; 521 474 522 - spin_lock_irq(&worker->lock); 523 - worker->cur_work = work; 524 - spin_unlock_irq(&worker->lock); 475 + next_hashed = wq_next_work(work); 476 + io_impersonate_work(worker, work); 477 + /* 478 + * OK to set IO_WQ_WORK_CANCEL even for uncancellable 479 + * work, the worker function will do the right thing. 480 + */ 481 + if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) 482 + work->flags |= IO_WQ_WORK_CANCEL; 525 483 526 - if (work->flags & IO_WQ_WORK_CB) 527 - work->func(&work); 484 + hash = io_get_work_hash(work); 485 + linked = old_work = work; 486 + linked->func(&linked); 487 + linked = (old_work == linked) ? NULL : linked; 528 488 529 - if (work->files && current->files != work->files) { 530 - task_lock(current); 531 - current->files = work->files; 532 - task_unlock(current); 533 - } 534 - if (work->fs && current->fs != work->fs) 535 - current->fs = work->fs; 536 - if (work->mm != worker->mm) 537 - io_wq_switch_mm(worker, work); 538 - if (worker->cur_creds != work->creds) 539 - io_wq_switch_creds(worker, work); 540 - /* 541 - * OK to set IO_WQ_WORK_CANCEL even for uncancellable work, 542 - * the worker function will do the right thing. 
543 - */ 544 - if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) 545 - work->flags |= IO_WQ_WORK_CANCEL; 546 - if (worker->mm) 547 - work->flags |= IO_WQ_WORK_HAS_MM; 489 + work = next_hashed; 490 + if (!work && linked && !io_wq_is_hashed(linked)) { 491 + work = linked; 492 + linked = NULL; 493 + } 494 + io_assign_current_work(worker, work); 495 + wq->free_work(old_work); 548 496 549 - if (wq->get_work) { 550 - put_work = work; 551 - wq->get_work(work); 552 - } 497 + if (linked) 498 + io_wqe_enqueue(wqe, linked); 553 499 554 - old_work = work; 555 - work->func(&work); 556 - 557 - spin_lock_irq(&worker->lock); 558 - worker->cur_work = NULL; 559 - spin_unlock_irq(&worker->lock); 500 + if (hash != -1U && !next_hashed) { 501 + spin_lock_irq(&wqe->lock); 502 + wqe->hash_map &= ~BIT_ULL(hash); 503 + wqe->flags &= ~IO_WQE_FLAG_STALLED; 504 + /* dependent work is not hashed */ 505 + hash = -1U; 506 + /* skip unnecessary unlock-lock wqe->lock */ 507 + if (!work) 508 + goto get_next; 509 + spin_unlock_irq(&wqe->lock); 510 + } 511 + } while (work); 560 512 561 513 spin_lock_irq(&wqe->lock); 562 - 563 - if (hash != -1U) { 564 - wqe->hash_map &= ~BIT_ULL(hash); 565 - wqe->flags &= ~IO_WQE_FLAG_STALLED; 566 - } 567 - if (work && work != old_work) { 568 - spin_unlock_irq(&wqe->lock); 569 - 570 - if (put_work && wq->put_work) { 571 - wq->put_work(put_work); 572 - put_work = NULL; 573 - } 574 - 575 - /* dependent work not hashed */ 576 - hash = -1U; 577 - goto next; 578 - } 579 514 } while (1); 580 515 } 581 516 ··· 770 747 return true; 771 748 } 772 749 773 - static void io_run_cancel(struct io_wq_work *work) 750 + static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) 774 751 { 752 + struct io_wq *wq = wqe->wq; 753 + 775 754 do { 776 755 struct io_wq_work *old_work = work; 777 756 778 757 work->flags |= IO_WQ_WORK_CANCEL; 779 758 work->func(&work); 780 759 work = (work == old_work) ? 
NULL : work; 760 + wq->free_work(old_work); 781 761 } while (work); 762 + } 763 + 764 + static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) 765 + { 766 + unsigned int hash; 767 + struct io_wq_work *tail; 768 + 769 + if (!io_wq_is_hashed(work)) { 770 + append: 771 + wq_list_add_tail(&work->list, &wqe->work_list); 772 + return; 773 + } 774 + 775 + hash = io_get_work_hash(work); 776 + tail = wqe->hash_tail[hash]; 777 + wqe->hash_tail[hash] = work; 778 + if (!tail) 779 + goto append; 780 + 781 + wq_list_add_after(&work->list, &tail->list, &wqe->work_list); 782 782 } 783 783 784 784 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) ··· 817 771 * It's close enough to not be an issue, fork() has the same delay. 818 772 */ 819 773 if (unlikely(!io_wq_can_queue(wqe, acct, work))) { 820 - io_run_cancel(work); 774 + io_run_cancel(work, wqe); 821 775 return; 822 776 } 823 777 824 778 work_flags = work->flags; 825 779 spin_lock_irqsave(&wqe->lock, flags); 826 - wq_list_add_tail(&work->list, &wqe->work_list); 780 + io_wqe_insert_work(wqe, work); 827 781 wqe->flags &= ~IO_WQE_FLAG_STALLED; 828 782 spin_unlock_irqrestore(&wqe->lock, flags); 829 783 ··· 840 794 } 841 795 842 796 /* 843 - * Enqueue work, hashed by some key. Work items that hash to the same value 844 - * will not be done in parallel. Used to limit concurrent writes, generally 845 - * hashed by inode. 797 + * Work items that hash to the same value will not be done in parallel. 798 + * Used to limit concurrent writes, generally hashed by inode. 
846 799 */ 847 - void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val) 800 + void io_wq_hash_work(struct io_wq_work *work, void *val) 848 801 { 849 - struct io_wqe *wqe = wq->wqes[numa_node_id()]; 850 - unsigned bit; 851 - 802 + unsigned int bit; 852 803 853 804 bit = hash_ptr(val, IO_WQ_HASH_ORDER); 854 805 work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); 855 - io_wqe_enqueue(wqe, work); 856 806 } 857 807 858 808 static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) ··· 898 856 } 899 857 900 858 struct io_cb_cancel_data { 901 - struct io_wqe *wqe; 902 - work_cancel_fn *cancel; 903 - void *caller_data; 859 + work_cancel_fn *fn; 860 + void *data; 904 861 }; 905 862 906 - static bool io_work_cancel(struct io_worker *worker, void *cancel_data) 863 + static bool io_wq_worker_cancel(struct io_worker *worker, void *data) 907 864 { 908 - struct io_cb_cancel_data *data = cancel_data; 865 + struct io_cb_cancel_data *match = data; 909 866 unsigned long flags; 910 867 bool ret = false; 911 868 ··· 915 874 spin_lock_irqsave(&worker->lock, flags); 916 875 if (worker->cur_work && 917 876 !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) && 918 - data->cancel(worker->cur_work, data->caller_data)) { 919 - send_sig(SIGINT, worker->task, 1); 920 - ret = true; 921 - } 922 - spin_unlock_irqrestore(&worker->lock, flags); 923 - 924 - return ret; 925 - } 926 - 927 - static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe, 928 - work_cancel_fn *cancel, 929 - void *cancel_data) 930 - { 931 - struct io_cb_cancel_data data = { 932 - .wqe = wqe, 933 - .cancel = cancel, 934 - .caller_data = cancel_data, 935 - }; 936 - struct io_wq_work_node *node, *prev; 937 - struct io_wq_work *work; 938 - unsigned long flags; 939 - bool found = false; 940 - 941 - spin_lock_irqsave(&wqe->lock, flags); 942 - wq_list_for_each(node, prev, &wqe->work_list) { 943 - work = container_of(node, struct io_wq_work, list); 944 - 945 - if (cancel(work, 
cancel_data)) { 946 - wq_node_del(&wqe->work_list, node, prev); 947 - found = true; 948 - break; 949 - } 950 - } 951 - spin_unlock_irqrestore(&wqe->lock, flags); 952 - 953 - if (found) { 954 - io_run_cancel(work); 955 - return IO_WQ_CANCEL_OK; 956 - } 957 - 958 - rcu_read_lock(); 959 - found = io_wq_for_each_worker(wqe, io_work_cancel, &data); 960 - rcu_read_unlock(); 961 - return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; 962 - } 963 - 964 - enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, 965 - void *data) 966 - { 967 - enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; 968 - int node; 969 - 970 - for_each_node(node) { 971 - struct io_wqe *wqe = wq->wqes[node]; 972 - 973 - ret = io_wqe_cancel_cb_work(wqe, cancel, data); 974 - if (ret != IO_WQ_CANCEL_NOTFOUND) 975 - break; 976 - } 977 - 978 - return ret; 979 - } 980 - 981 - struct work_match { 982 - bool (*fn)(struct io_wq_work *, void *data); 983 - void *data; 984 - }; 985 - 986 - static bool io_wq_worker_cancel(struct io_worker *worker, void *data) 987 - { 988 - struct work_match *match = data; 989 - unsigned long flags; 990 - bool ret = false; 991 - 992 - spin_lock_irqsave(&worker->lock, flags); 993 - if (match->fn(worker->cur_work, match->data) && 994 - !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) { 877 + match->fn(worker->cur_work, match->data)) { 995 878 send_sig(SIGINT, worker->task, 1); 996 879 ret = true; 997 880 } ··· 925 960 } 926 961 927 962 static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, 928 - struct work_match *match) 963 + struct io_cb_cancel_data *match) 929 964 { 930 965 struct io_wq_work_node *node, *prev; 931 966 struct io_wq_work *work; ··· 942 977 work = container_of(node, struct io_wq_work, list); 943 978 944 979 if (match->fn(work, match->data)) { 945 - wq_node_del(&wqe->work_list, node, prev); 980 + wq_list_del(&wqe->work_list, node, prev); 946 981 found = true; 947 982 break; 948 983 } ··· 950 985 spin_unlock_irqrestore(&wqe->lock, 
flags); 951 986 952 987 if (found) { 953 - io_run_cancel(work); 988 + io_run_cancel(work, wqe); 954 989 return IO_WQ_CANCEL_OK; 955 990 } 956 991 ··· 966 1001 return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; 967 1002 } 968 1003 969 - static bool io_wq_work_match(struct io_wq_work *work, void *data) 1004 + enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, 1005 + void *data) 1006 + { 1007 + struct io_cb_cancel_data match = { 1008 + .fn = cancel, 1009 + .data = data, 1010 + }; 1011 + enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; 1012 + int node; 1013 + 1014 + for_each_node(node) { 1015 + struct io_wqe *wqe = wq->wqes[node]; 1016 + 1017 + ret = io_wqe_cancel_work(wqe, &match); 1018 + if (ret != IO_WQ_CANCEL_NOTFOUND) 1019 + break; 1020 + } 1021 + 1022 + return ret; 1023 + } 1024 + 1025 + static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) 970 1026 { 971 1027 return work == data; 972 1028 } 973 1029 974 1030 enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) 975 1031 { 976 - struct work_match match = { 977 - .fn = io_wq_work_match, 978 - .data = cwork 979 - }; 980 - enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; 981 - int node; 982 - 983 - cwork->flags |= IO_WQ_WORK_CANCEL; 984 - 985 - for_each_node(node) { 986 - struct io_wqe *wqe = wq->wqes[node]; 987 - 988 - ret = io_wqe_cancel_work(wqe, &match); 989 - if (ret != IO_WQ_CANCEL_NOTFOUND) 990 - break; 991 - } 992 - 993 - return ret; 1032 + return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork); 994 1033 } 995 1034 996 1035 static bool io_wq_pid_match(struct io_wq_work *work, void *data) 997 1036 { 998 1037 pid_t pid = (pid_t) (unsigned long) data; 999 1038 1000 - if (work) 1001 - return work->task_pid == pid; 1002 - return false; 1039 + return work->task_pid == pid; 1003 1040 } 1004 1041 1005 1042 enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) 1006 1043 { 1007 - struct work_match match = { 1008 - .fn = 
io_wq_pid_match, 1009 - .data = (void *) (unsigned long) pid 1010 - }; 1011 - enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; 1012 - int node; 1044 + void *data = (void *) (unsigned long) pid; 1013 1045 1014 - for_each_node(node) { 1015 - struct io_wqe *wqe = wq->wqes[node]; 1016 - 1017 - ret = io_wqe_cancel_work(wqe, &match); 1018 - if (ret != IO_WQ_CANCEL_NOTFOUND) 1019 - break; 1020 - } 1021 - 1022 - return ret; 1046 + return io_wq_cancel_cb(wq, io_wq_pid_match, data); 1023 1047 } 1024 1048 1025 1049 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) 1026 1050 { 1027 1051 int ret = -ENOMEM, node; 1028 1052 struct io_wq *wq; 1053 + 1054 + if (WARN_ON_ONCE(!data->free_work)) 1055 + return ERR_PTR(-EINVAL); 1029 1056 1030 1057 wq = kzalloc(sizeof(*wq), GFP_KERNEL); 1031 1058 if (!wq) ··· 1029 1072 return ERR_PTR(-ENOMEM); 1030 1073 } 1031 1074 1032 - wq->get_work = data->get_work; 1033 - wq->put_work = data->put_work; 1075 + wq->free_work = data->free_work; 1034 1076 1035 1077 /* caller must already hold a reference to this */ 1036 1078 wq->user = data->user; ··· 1086 1130 1087 1131 bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) 1088 1132 { 1089 - if (data->get_work != wq->get_work || data->put_work != wq->put_work) 1133 + if (data->free_work != wq->free_work) 1090 1134 return false; 1091 1135 1092 1136 return refcount_inc_not_zero(&wq->use_refs);
+46 -19
fs/io-wq.h
··· 5 5 6 6 enum { 7 7 IO_WQ_WORK_CANCEL = 1, 8 - IO_WQ_WORK_HAS_MM = 2, 9 8 IO_WQ_WORK_HASHED = 4, 10 9 IO_WQ_WORK_UNBOUND = 32, 11 - IO_WQ_WORK_CB = 128, 12 10 IO_WQ_WORK_NO_CANCEL = 256, 13 11 IO_WQ_WORK_CONCURRENT = 512, 14 12 ··· 28 30 struct io_wq_work_node *last; 29 31 }; 30 32 33 + static inline void wq_list_add_after(struct io_wq_work_node *node, 34 + struct io_wq_work_node *pos, 35 + struct io_wq_work_list *list) 36 + { 37 + struct io_wq_work_node *next = pos->next; 38 + 39 + pos->next = node; 40 + node->next = next; 41 + if (!next) 42 + list->last = node; 43 + } 44 + 31 45 static inline void wq_list_add_tail(struct io_wq_work_node *node, 32 46 struct io_wq_work_list *list) 33 47 { ··· 52 42 } 53 43 } 54 44 55 - static inline void wq_node_del(struct io_wq_work_list *list, 45 + static inline void wq_list_cut(struct io_wq_work_list *list, 46 + struct io_wq_work_node *last, 47 + struct io_wq_work_node *prev) 48 + { 49 + /* first in the list, if prev==NULL */ 50 + if (!prev) 51 + WRITE_ONCE(list->first, last->next); 52 + else 53 + prev->next = last->next; 54 + 55 + if (last == list->last) 56 + list->last = prev; 57 + last->next = NULL; 58 + } 59 + 60 + static inline void wq_list_del(struct io_wq_work_list *list, 56 61 struct io_wq_work_node *node, 57 62 struct io_wq_work_node *prev) 58 63 { 59 - if (node == list->first) 60 - WRITE_ONCE(list->first, node->next); 61 - if (node == list->last) 62 - list->last = prev; 63 - if (prev) 64 - prev->next = node->next; 65 - node->next = NULL; 64 + wq_list_cut(list, node, prev); 66 65 } 67 66 68 67 #define wq_list_for_each(pos, prv, head) \ ··· 84 65 } while (0) 85 66 86 67 struct io_wq_work { 87 - union { 88 - struct io_wq_work_node list; 89 - void *data; 90 - }; 68 + struct io_wq_work_node list; 91 69 void (*func)(struct io_wq_work **); 92 70 struct files_struct *files; 93 71 struct mm_struct *mm; ··· 99 83 *(work) = (struct io_wq_work){ .func = _func }; \ 100 84 } while (0) \ 101 85 102 - typedef void 
(get_work_fn)(struct io_wq_work *); 103 - typedef void (put_work_fn)(struct io_wq_work *); 86 + static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) 87 + { 88 + if (!work->list.next) 89 + return NULL; 90 + 91 + return container_of(work->list.next, struct io_wq_work, list); 92 + } 93 + 94 + typedef void (free_work_fn)(struct io_wq_work *); 104 95 105 96 struct io_wq_data { 106 97 struct user_struct *user; 107 98 108 - get_work_fn *get_work; 109 - put_work_fn *put_work; 99 + free_work_fn *free_work; 110 100 }; 111 101 112 102 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); ··· 120 98 void io_wq_destroy(struct io_wq *wq); 121 99 122 100 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); 123 - void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val); 101 + void io_wq_hash_work(struct io_wq_work *work, void *val); 102 + 103 + static inline bool io_wq_is_hashed(struct io_wq_work *work) 104 + { 105 + return work->flags & IO_WQ_WORK_HASHED; 106 + } 124 107 125 108 void io_wq_cancel_all(struct io_wq *wq); 126 109 enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
+1412 -615
fs/io_uring.c
··· 44 44 #include <linux/errno.h> 45 45 #include <linux/syscalls.h> 46 46 #include <linux/compat.h> 47 + #include <net/compat.h> 47 48 #include <linux/refcount.h> 48 49 #include <linux/uio.h> 49 50 #include <linux/bits.h> ··· 77 76 #include <linux/fadvise.h> 78 77 #include <linux/eventpoll.h> 79 78 #include <linux/fs_struct.h> 79 + #include <linux/splice.h> 80 + #include <linux/task_work.h> 80 81 81 82 #define CREATE_TRACE_POINTS 82 83 #include <trace/events/io_uring.h> ··· 196 193 struct completion done; 197 194 }; 198 195 196 + struct io_buffer { 197 + struct list_head list; 198 + __u64 addr; 199 + __s32 len; 200 + __u16 bid; 201 + }; 202 + 199 203 struct io_ring_ctx { 200 204 struct { 201 205 struct percpu_ref refs; ··· 280 270 struct socket *ring_sock; 281 271 #endif 282 272 273 + struct idr io_buffer_idr; 274 + 283 275 struct idr personality_idr; 284 276 285 277 struct { ··· 302 290 303 291 struct { 304 292 spinlock_t completion_lock; 305 - struct llist_head poll_llist; 306 293 307 294 /* 308 295 * ->poll_list is protected by the ctx->uring_lock for ··· 397 386 void __user *buf; 398 387 }; 399 388 int msg_flags; 389 + int bgid; 400 390 size_t len; 391 + struct io_buffer *kbuf; 401 392 }; 402 393 403 394 struct io_open { ··· 443 430 struct epoll_event event; 444 431 }; 445 432 433 + struct io_splice { 434 + struct file *file_out; 435 + struct file *file_in; 436 + loff_t off_out; 437 + loff_t off_in; 438 + u64 len; 439 + unsigned int flags; 440 + }; 441 + 442 + struct io_provide_buf { 443 + struct file *file; 444 + __u64 addr; 445 + __s32 len; 446 + __u32 bgid; 447 + __u16 nbufs; 448 + __u16 bid; 449 + }; 450 + 446 451 struct io_async_connect { 447 452 struct sockaddr_storage address; 448 453 }; ··· 495 464 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, 496 465 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, 497 466 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, 467 + REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, 498 468 499 469 REQ_F_LINK_NEXT_BIT, 500 470 
REQ_F_FAIL_LINK_BIT, ··· 511 479 REQ_F_COMP_LOCKED_BIT, 512 480 REQ_F_NEED_CLEANUP_BIT, 513 481 REQ_F_OVERFLOW_BIT, 482 + REQ_F_POLLED_BIT, 483 + REQ_F_BUFFER_SELECTED_BIT, 484 + 485 + /* not a real bit, just to check we're not overflowing the space */ 486 + __REQ_F_LAST_BIT, 514 487 }; 515 488 516 489 enum { ··· 529 492 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), 530 493 /* IOSQE_ASYNC */ 531 494 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), 495 + /* IOSQE_BUFFER_SELECT */ 496 + REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), 532 497 533 498 /* already grabbed next link */ 534 499 REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), ··· 560 521 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), 561 522 /* in overflow list */ 562 523 REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT), 524 + /* already went through poll handler */ 525 + REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), 526 + /* buffer already selected */ 527 + REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), 528 + }; 529 + 530 + struct async_poll { 531 + struct io_poll_iocb poll; 532 + struct io_wq_work work; 563 533 }; 564 534 565 535 /* ··· 594 546 struct io_fadvise fadvise; 595 547 struct io_madvise madvise; 596 548 struct io_epoll epoll; 549 + struct io_splice splice; 550 + struct io_provide_buf pbuf; 597 551 }; 598 552 599 553 struct io_async_ctx *io; 600 - /* 601 - * llist_node is only used for poll deferred completions 602 - */ 603 - struct llist_node llist_node; 604 - bool in_async; 605 554 bool needs_fixed_file; 606 555 u8 opcode; 607 556 608 557 struct io_ring_ctx *ctx; 609 - union { 610 - struct list_head list; 611 - struct hlist_node hash_node; 612 - }; 613 - struct list_head link_list; 558 + struct list_head list; 614 559 unsigned int flags; 615 560 refcount_t refs; 561 + union { 562 + struct task_struct *task; 563 + unsigned long fsize; 564 + }; 616 565 u64 user_data; 617 566 u32 result; 618 567 u32 sequence; 619 568 569 + struct list_head link_list; 570 + 620 571 struct list_head inflight_entry; 621 572 
622 - struct io_wq_work work; 573 + union { 574 + /* 575 + * Only commands that never go async can use the below fields, 576 + * obviously. Right now only IORING_OP_POLL_ADD uses them, and 577 + * async armed poll handlers for regular commands. The latter 578 + * restore the work, if needed. 579 + */ 580 + struct { 581 + struct callback_head task_work; 582 + struct hlist_node hash_node; 583 + struct async_poll *apoll; 584 + int cflags; 585 + }; 586 + struct io_wq_work work; 587 + }; 623 588 }; 624 589 625 590 #define IO_PLUG_THRESHOLD 2 ··· 676 615 unsigned file_table : 1; 677 616 /* needs ->fs */ 678 617 unsigned needs_fs : 1; 618 + /* set if opcode supports polled "wait" */ 619 + unsigned pollin : 1; 620 + unsigned pollout : 1; 621 + /* op supports buffer selection */ 622 + unsigned buffer_select : 1; 679 623 }; 680 624 681 625 static const struct io_op_def io_op_defs[] = { ··· 690 624 .needs_mm = 1, 691 625 .needs_file = 1, 692 626 .unbound_nonreg_file = 1, 627 + .pollin = 1, 628 + .buffer_select = 1, 693 629 }, 694 630 [IORING_OP_WRITEV] = { 695 631 .async_ctx = 1, ··· 699 631 .needs_file = 1, 700 632 .hash_reg_file = 1, 701 633 .unbound_nonreg_file = 1, 634 + .pollout = 1, 702 635 }, 703 636 [IORING_OP_FSYNC] = { 704 637 .needs_file = 1, ··· 707 638 [IORING_OP_READ_FIXED] = { 708 639 .needs_file = 1, 709 640 .unbound_nonreg_file = 1, 641 + .pollin = 1, 710 642 }, 711 643 [IORING_OP_WRITE_FIXED] = { 712 644 .needs_file = 1, 713 645 .hash_reg_file = 1, 714 646 .unbound_nonreg_file = 1, 647 + .pollout = 1, 715 648 }, 716 649 [IORING_OP_POLL_ADD] = { 717 650 .needs_file = 1, ··· 729 658 .needs_file = 1, 730 659 .unbound_nonreg_file = 1, 731 660 .needs_fs = 1, 661 + .pollout = 1, 732 662 }, 733 663 [IORING_OP_RECVMSG] = { 734 664 .async_ctx = 1, ··· 737 665 .needs_file = 1, 738 666 .unbound_nonreg_file = 1, 739 667 .needs_fs = 1, 668 + .pollin = 1, 669 + .buffer_select = 1, 740 670 }, 741 671 [IORING_OP_TIMEOUT] = { 742 672 .async_ctx = 1, ··· 750 676 .needs_file = 
1, 751 677 .unbound_nonreg_file = 1, 752 678 .file_table = 1, 679 + .pollin = 1, 753 680 }, 754 681 [IORING_OP_ASYNC_CANCEL] = {}, 755 682 [IORING_OP_LINK_TIMEOUT] = { ··· 762 687 .needs_mm = 1, 763 688 .needs_file = 1, 764 689 .unbound_nonreg_file = 1, 690 + .pollout = 1, 765 691 }, 766 692 [IORING_OP_FALLOCATE] = { 767 693 .needs_file = 1, ··· 791 715 .needs_mm = 1, 792 716 .needs_file = 1, 793 717 .unbound_nonreg_file = 1, 718 + .pollin = 1, 719 + .buffer_select = 1, 794 720 }, 795 721 [IORING_OP_WRITE] = { 796 722 .needs_mm = 1, 797 723 .needs_file = 1, 798 724 .unbound_nonreg_file = 1, 725 + .pollout = 1, 799 726 }, 800 727 [IORING_OP_FADVISE] = { 801 728 .needs_file = 1, ··· 810 731 .needs_mm = 1, 811 732 .needs_file = 1, 812 733 .unbound_nonreg_file = 1, 734 + .pollout = 1, 813 735 }, 814 736 [IORING_OP_RECV] = { 815 737 .needs_mm = 1, 816 738 .needs_file = 1, 817 739 .unbound_nonreg_file = 1, 740 + .pollin = 1, 741 + .buffer_select = 1, 818 742 }, 819 743 [IORING_OP_OPENAT2] = { 820 744 .needs_file = 1, ··· 829 747 .unbound_nonreg_file = 1, 830 748 .file_table = 1, 831 749 }, 750 + [IORING_OP_SPLICE] = { 751 + .needs_file = 1, 752 + .hash_reg_file = 1, 753 + .unbound_nonreg_file = 1, 754 + }, 755 + [IORING_OP_PROVIDE_BUFFERS] = {}, 756 + [IORING_OP_REMOVE_BUFFERS] = {}, 832 757 }; 833 758 834 759 static void io_wq_submit_work(struct io_wq_work **workptr); ··· 850 761 static int io_grab_files(struct io_kiocb *req); 851 762 static void io_ring_file_ref_flush(struct fixed_file_data *data); 852 763 static void io_cleanup_req(struct io_kiocb *req); 764 + static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, 765 + int fd, struct file **out_file, bool fixed); 766 + static void __io_queue_sqe(struct io_kiocb *req, 767 + const struct io_uring_sqe *sqe); 853 768 854 769 static struct kmem_cache *req_cachep; 855 770 ··· 920 827 INIT_LIST_HEAD(&ctx->cq_overflow_list); 921 828 init_completion(&ctx->completions[0]); 922 829 
init_completion(&ctx->completions[1]); 830 + idr_init(&ctx->io_buffer_idr); 923 831 idr_init(&ctx->personality_idr); 924 832 mutex_init(&ctx->uring_lock); 925 833 init_waitqueue_head(&ctx->wait); 926 834 spin_lock_init(&ctx->completion_lock); 927 - init_llist_head(&ctx->poll_llist); 928 835 INIT_LIST_HEAD(&ctx->poll_list); 929 836 INIT_LIST_HEAD(&ctx->defer_list); 930 837 INIT_LIST_HEAD(&ctx->timeout_list); ··· 1045 952 } 1046 953 } 1047 954 1048 - static inline bool io_prep_async_work(struct io_kiocb *req, 955 + static inline void io_prep_async_work(struct io_kiocb *req, 1049 956 struct io_kiocb **link) 1050 957 { 1051 958 const struct io_op_def *def = &io_op_defs[req->opcode]; 1052 - bool do_hashed = false; 1053 959 1054 960 if (req->flags & REQ_F_ISREG) { 1055 961 if (def->hash_reg_file) 1056 - do_hashed = true; 962 + io_wq_hash_work(&req->work, file_inode(req->file)); 1057 963 } else { 1058 964 if (def->unbound_nonreg_file) 1059 965 req->work.flags |= IO_WQ_WORK_UNBOUND; ··· 1061 969 io_req_work_grab_env(req, def); 1062 970 1063 971 *link = io_prep_linked_timeout(req); 1064 - return do_hashed; 1065 972 } 1066 973 1067 974 static inline void io_queue_async_work(struct io_kiocb *req) 1068 975 { 1069 976 struct io_ring_ctx *ctx = req->ctx; 1070 977 struct io_kiocb *link; 1071 - bool do_hashed; 1072 978 1073 - do_hashed = io_prep_async_work(req, &link); 979 + io_prep_async_work(req, &link); 1074 980 1075 - trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work, 1076 - req->flags); 1077 - if (!do_hashed) { 1078 - io_wq_enqueue(ctx->io_wq, &req->work); 1079 - } else { 1080 - io_wq_enqueue_hashed(ctx->io_wq, &req->work, 1081 - file_inode(req->file)); 1082 - } 981 + trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, 982 + &req->work, req->flags); 983 + io_wq_enqueue(ctx->io_wq, &req->work); 1083 984 1084 985 if (link) 1085 986 io_queue_linked_timeout(link); ··· 1139 1054 return false; 1140 1055 if (!ctx->eventfd_async) 1141 1056 return 
true; 1142 - return io_wq_current_is_worker() || in_interrupt(); 1057 + return io_wq_current_is_worker(); 1143 1058 } 1144 1059 1145 - static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev) 1060 + static void io_cqring_ev_posted(struct io_ring_ctx *ctx) 1146 1061 { 1147 1062 if (waitqueue_active(&ctx->wait)) 1148 1063 wake_up(&ctx->wait); 1149 1064 if (waitqueue_active(&ctx->sqo_wait)) 1150 1065 wake_up(&ctx->sqo_wait); 1151 - if (trigger_ev) 1066 + if (io_should_trigger_evfd(ctx)) 1152 1067 eventfd_signal(ctx->cq_ev_fd, 1); 1153 - } 1154 - 1155 - static void io_cqring_ev_posted(struct io_ring_ctx *ctx) 1156 - { 1157 - __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx)); 1158 1068 } 1159 1069 1160 1070 /* Returns true if there are no backlogged entries after the flush */ ··· 1188 1108 if (cqe) { 1189 1109 WRITE_ONCE(cqe->user_data, req->user_data); 1190 1110 WRITE_ONCE(cqe->res, req->result); 1191 - WRITE_ONCE(cqe->flags, 0); 1111 + WRITE_ONCE(cqe->flags, req->cflags); 1192 1112 } else { 1193 1113 WRITE_ONCE(ctx->rings->cq_overflow, 1194 1114 atomic_inc_return(&ctx->cached_cq_overflow)); ··· 1212 1132 return cqe != NULL; 1213 1133 } 1214 1134 1215 - static void io_cqring_fill_event(struct io_kiocb *req, long res) 1135 + static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) 1216 1136 { 1217 1137 struct io_ring_ctx *ctx = req->ctx; 1218 1138 struct io_uring_cqe *cqe; ··· 1228 1148 if (likely(cqe)) { 1229 1149 WRITE_ONCE(cqe->user_data, req->user_data); 1230 1150 WRITE_ONCE(cqe->res, res); 1231 - WRITE_ONCE(cqe->flags, 0); 1151 + WRITE_ONCE(cqe->flags, cflags); 1232 1152 } else if (ctx->cq_overflow_flushed) { 1233 1153 WRITE_ONCE(ctx->rings->cq_overflow, 1234 1154 atomic_inc_return(&ctx->cached_cq_overflow)); ··· 1240 1160 req->flags |= REQ_F_OVERFLOW; 1241 1161 refcount_inc(&req->refs); 1242 1162 req->result = res; 1163 + req->cflags = cflags; 1243 1164 list_add_tail(&req->list, &ctx->cq_overflow_list); 1244 1165 } 
1245 1166 } 1246 1167 1247 - static void io_cqring_add_event(struct io_kiocb *req, long res) 1168 + static void io_cqring_fill_event(struct io_kiocb *req, long res) 1169 + { 1170 + __io_cqring_fill_event(req, res, 0); 1171 + } 1172 + 1173 + static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags) 1248 1174 { 1249 1175 struct io_ring_ctx *ctx = req->ctx; 1250 1176 unsigned long flags; 1251 1177 1252 1178 spin_lock_irqsave(&ctx->completion_lock, flags); 1253 - io_cqring_fill_event(req, res); 1179 + __io_cqring_fill_event(req, res, cflags); 1254 1180 io_commit_cqring(ctx); 1255 1181 spin_unlock_irqrestore(&ctx->completion_lock, flags); 1256 1182 1257 1183 io_cqring_ev_posted(ctx); 1184 + } 1185 + 1186 + static void io_cqring_add_event(struct io_kiocb *req, long res) 1187 + { 1188 + __io_cqring_add_event(req, res, 0); 1258 1189 } 1259 1190 1260 1191 static inline bool io_is_fallback_req(struct io_kiocb *req) ··· 1337 1246 return NULL; 1338 1247 } 1339 1248 1249 + static inline void io_put_file(struct io_kiocb *req, struct file *file, 1250 + bool fixed) 1251 + { 1252 + if (fixed) 1253 + percpu_ref_put(&req->ctx->file_data->refs); 1254 + else 1255 + fput(file); 1256 + } 1257 + 1340 1258 static void __io_req_do_free(struct io_kiocb *req) 1341 1259 { 1342 1260 if (likely(!io_is_fallback_req(req))) ··· 1356 1256 1357 1257 static void __io_req_aux_free(struct io_kiocb *req) 1358 1258 { 1359 - struct io_ring_ctx *ctx = req->ctx; 1360 - 1361 1259 if (req->flags & REQ_F_NEED_CLEANUP) 1362 1260 io_cleanup_req(req); 1363 1261 1364 1262 kfree(req->io); 1365 - if (req->file) { 1366 - if (req->flags & REQ_F_FIXED_FILE) 1367 - percpu_ref_put(&ctx->file_data->refs); 1368 - else 1369 - fput(req->file); 1370 - } 1263 + if (req->file) 1264 + io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); 1371 1265 1372 1266 io_req_work_drop_env(req); 1373 1267 } ··· 1568 1474 io_queue_async_work(nxt); 1569 1475 } 1570 1476 1477 + static void io_link_work_cb(struct 
io_wq_work **workptr) 1478 + { 1479 + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 1480 + struct io_kiocb *link; 1481 + 1482 + link = list_first_entry(&req->link_list, struct io_kiocb, link_list); 1483 + io_queue_linked_timeout(link); 1484 + io_wq_submit_work(workptr); 1485 + } 1486 + 1487 + static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) 1488 + { 1489 + struct io_kiocb *link; 1490 + const struct io_op_def *def = &io_op_defs[nxt->opcode]; 1491 + 1492 + if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file) 1493 + io_wq_hash_work(&nxt->work, file_inode(nxt->file)); 1494 + 1495 + *workptr = &nxt->work; 1496 + link = io_prep_linked_timeout(nxt); 1497 + if (link) 1498 + nxt->work.func = io_link_work_cb; 1499 + } 1500 + 1571 1501 /* 1572 1502 * Drop reference to request, return next in chain (if there is one) if this 1573 1503 * was the last reference to this request. ··· 1609 1491 { 1610 1492 if (refcount_dec_and_test(&req->refs)) 1611 1493 io_free_req(req); 1494 + } 1495 + 1496 + static void io_steal_work(struct io_kiocb *req, 1497 + struct io_wq_work **workptr) 1498 + { 1499 + /* 1500 + * It's in an io-wq worker, so there always should be at least 1501 + * one reference, which will be dropped in io_put_work() just 1502 + * after the current handler returns. 1503 + * 1504 + * It also means, that if the counter dropped to 1, then there is 1505 + * no asynchronous users left, so it's safe to steal the next work. 
1506 + */ 1507 + if (refcount_read(&req->refs) == 1) { 1508 + struct io_kiocb *nxt = NULL; 1509 + 1510 + io_req_find_next(req, &nxt); 1511 + if (nxt) 1512 + io_wq_assign_next(workptr, nxt); 1513 + } 1612 1514 } 1613 1515 1614 1516 /* ··· 1692 1554 return true; 1693 1555 } 1694 1556 1557 + static int io_put_kbuf(struct io_kiocb *req) 1558 + { 1559 + struct io_buffer *kbuf; 1560 + int cflags; 1561 + 1562 + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 1563 + cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; 1564 + cflags |= IORING_CQE_F_BUFFER; 1565 + req->rw.addr = 0; 1566 + kfree(kbuf); 1567 + return cflags; 1568 + } 1569 + 1695 1570 /* 1696 1571 * Find and free completed poll iocbs 1697 1572 */ ··· 1716 1565 1717 1566 rb.to_free = rb.need_iter = 0; 1718 1567 while (!list_empty(done)) { 1568 + int cflags = 0; 1569 + 1719 1570 req = list_first_entry(done, struct io_kiocb, list); 1720 1571 list_del(&req->list); 1721 1572 1722 - io_cqring_fill_event(req, req->result); 1573 + if (req->flags & REQ_F_BUFFER_SELECTED) 1574 + cflags = io_put_kbuf(req); 1575 + 1576 + __io_cqring_fill_event(req, req->result, cflags); 1723 1577 (*nr_events)++; 1724 1578 1725 1579 if (refcount_dec_and_test(&req->refs) && ··· 1733 1577 } 1734 1578 1735 1579 io_commit_cqring(ctx); 1580 + if (ctx->flags & IORING_SETUP_SQPOLL) 1581 + io_cqring_ev_posted(ctx); 1736 1582 io_free_req_many(ctx, &rb); 1737 1583 } 1738 1584 ··· 1901 1743 static void io_complete_rw_common(struct kiocb *kiocb, long res) 1902 1744 { 1903 1745 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 1746 + int cflags = 0; 1904 1747 1905 1748 if (kiocb->ki_flags & IOCB_WRITE) 1906 1749 kiocb_end_write(req); 1907 1750 1908 1751 if (res != req->result) 1909 1752 req_set_fail_links(req); 1910 - io_cqring_add_event(req, res); 1753 + if (req->flags & REQ_F_BUFFER_SELECTED) 1754 + cflags = io_put_kbuf(req); 1755 + __io_cqring_add_event(req, res, cflags); 1911 1756 } 1912 1757 1913 1758 static void 
io_complete_rw(struct kiocb *kiocb, long res, long res2) ··· 1919 1758 1920 1759 io_complete_rw_common(kiocb, res); 1921 1760 io_put_req(req); 1922 - } 1923 - 1924 - static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) 1925 - { 1926 - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 1927 - struct io_kiocb *nxt = NULL; 1928 - 1929 - io_complete_rw_common(kiocb, res); 1930 - io_put_req_find_next(req, &nxt); 1931 - 1932 - return nxt; 1933 1761 } 1934 1762 1935 1763 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) ··· 1991 1841 * assuming most submissions are for one file, or at least that each file 1992 1842 * has more than one submission. 1993 1843 */ 1994 - static struct file *io_file_get(struct io_submit_state *state, int fd) 1844 + static struct file *__io_file_get(struct io_submit_state *state, int fd) 1995 1845 { 1996 1846 if (!state) 1997 1847 return fget(fd); ··· 2088 1938 2089 1939 req->rw.addr = READ_ONCE(sqe->addr); 2090 1940 req->rw.len = READ_ONCE(sqe->len); 2091 - /* we own ->private, reuse it for the buffer index */ 1941 + /* we own ->private, reuse it for the buffer index / buffer ID */ 2092 1942 req->rw.kiocb.private = (void *) (unsigned long) 2093 1943 READ_ONCE(sqe->buf_index); 2094 1944 return 0; ··· 2115 1965 } 2116 1966 } 2117 1967 2118 - static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, 2119 - bool in_async) 1968 + static void kiocb_done(struct kiocb *kiocb, ssize_t ret) 2120 1969 { 2121 1970 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2122 1971 2123 1972 if (req->flags & REQ_F_CUR_POS) 2124 1973 req->file->f_pos = kiocb->ki_pos; 2125 - if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) 2126 - *nxt = __io_complete_rw(kiocb, ret); 1974 + if (ret >= 0 && kiocb->ki_complete == io_complete_rw) 1975 + io_complete_rw(kiocb, ret, 0); 2127 1976 else 2128 1977 io_rw_done(kiocb, ret); 2129 1978 } ··· 2201 2052 return 
len; 2202 2053 } 2203 2054 2055 + static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) 2056 + { 2057 + if (needs_lock) 2058 + mutex_unlock(&ctx->uring_lock); 2059 + } 2060 + 2061 + static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) 2062 + { 2063 + /* 2064 + * "Normal" inline submissions always hold the uring_lock, since we 2065 + * grab it from the system call. Same is true for the SQPOLL offload. 2066 + * The only exception is when we've detached the request and issue it 2067 + * from an async worker thread, grab the lock for that case. 2068 + */ 2069 + if (needs_lock) 2070 + mutex_lock(&ctx->uring_lock); 2071 + } 2072 + 2073 + static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, 2074 + int bgid, struct io_buffer *kbuf, 2075 + bool needs_lock) 2076 + { 2077 + struct io_buffer *head; 2078 + 2079 + if (req->flags & REQ_F_BUFFER_SELECTED) 2080 + return kbuf; 2081 + 2082 + io_ring_submit_lock(req->ctx, needs_lock); 2083 + 2084 + lockdep_assert_held(&req->ctx->uring_lock); 2085 + 2086 + head = idr_find(&req->ctx->io_buffer_idr, bgid); 2087 + if (head) { 2088 + if (!list_empty(&head->list)) { 2089 + kbuf = list_last_entry(&head->list, struct io_buffer, 2090 + list); 2091 + list_del(&kbuf->list); 2092 + } else { 2093 + kbuf = head; 2094 + idr_remove(&req->ctx->io_buffer_idr, bgid); 2095 + } 2096 + if (*len > kbuf->len) 2097 + *len = kbuf->len; 2098 + } else { 2099 + kbuf = ERR_PTR(-ENOBUFS); 2100 + } 2101 + 2102 + io_ring_submit_unlock(req->ctx, needs_lock); 2103 + 2104 + return kbuf; 2105 + } 2106 + 2107 + static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, 2108 + bool needs_lock) 2109 + { 2110 + struct io_buffer *kbuf; 2111 + int bgid; 2112 + 2113 + kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 2114 + bgid = (int) (unsigned long) req->rw.kiocb.private; 2115 + kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); 2116 + if (IS_ERR(kbuf)) 2117 + return kbuf; 2118 
+ req->rw.addr = (u64) (unsigned long) kbuf; 2119 + req->flags |= REQ_F_BUFFER_SELECTED; 2120 + return u64_to_user_ptr(kbuf->addr); 2121 + } 2122 + 2123 + #ifdef CONFIG_COMPAT 2124 + static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, 2125 + bool needs_lock) 2126 + { 2127 + struct compat_iovec __user *uiov; 2128 + compat_ssize_t clen; 2129 + void __user *buf; 2130 + ssize_t len; 2131 + 2132 + uiov = u64_to_user_ptr(req->rw.addr); 2133 + if (!access_ok(uiov, sizeof(*uiov))) 2134 + return -EFAULT; 2135 + if (__get_user(clen, &uiov->iov_len)) 2136 + return -EFAULT; 2137 + if (clen < 0) 2138 + return -EINVAL; 2139 + 2140 + len = clen; 2141 + buf = io_rw_buffer_select(req, &len, needs_lock); 2142 + if (IS_ERR(buf)) 2143 + return PTR_ERR(buf); 2144 + iov[0].iov_base = buf; 2145 + iov[0].iov_len = (compat_size_t) len; 2146 + return 0; 2147 + } 2148 + #endif 2149 + 2150 + static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 2151 + bool needs_lock) 2152 + { 2153 + struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); 2154 + void __user *buf; 2155 + ssize_t len; 2156 + 2157 + if (copy_from_user(iov, uiov, sizeof(*uiov))) 2158 + return -EFAULT; 2159 + 2160 + len = iov[0].iov_len; 2161 + if (len < 0) 2162 + return -EINVAL; 2163 + buf = io_rw_buffer_select(req, &len, needs_lock); 2164 + if (IS_ERR(buf)) 2165 + return PTR_ERR(buf); 2166 + iov[0].iov_base = buf; 2167 + iov[0].iov_len = len; 2168 + return 0; 2169 + } 2170 + 2171 + static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 2172 + bool needs_lock) 2173 + { 2174 + if (req->flags & REQ_F_BUFFER_SELECTED) 2175 + return 0; 2176 + if (!req->rw.len) 2177 + return 0; 2178 + else if (req->rw.len > 1) 2179 + return -EINVAL; 2180 + 2181 + #ifdef CONFIG_COMPAT 2182 + if (req->ctx->compat) 2183 + return io_compat_import(req, iov, needs_lock); 2184 + #endif 2185 + 2186 + return __io_iov_buffer_select(req, iov, needs_lock); 2187 + } 2188 + 2204 2189 static 
ssize_t io_import_iovec(int rw, struct io_kiocb *req, 2205 - struct iovec **iovec, struct iov_iter *iter) 2190 + struct iovec **iovec, struct iov_iter *iter, 2191 + bool needs_lock) 2206 2192 { 2207 2193 void __user *buf = u64_to_user_ptr(req->rw.addr); 2208 2194 size_t sqe_len = req->rw.len; 2195 + ssize_t ret; 2209 2196 u8 opcode; 2210 2197 2211 2198 opcode = req->opcode; ··· 2350 2065 return io_import_fixed(req, rw, iter); 2351 2066 } 2352 2067 2353 - /* buffer index only valid with fixed read/write */ 2354 - if (req->rw.kiocb.private) 2068 + /* buffer index only valid with fixed read/write, or buffer select */ 2069 + if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT)) 2355 2070 return -EINVAL; 2356 2071 2357 2072 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { 2358 - ssize_t ret; 2073 + if (req->flags & REQ_F_BUFFER_SELECT) { 2074 + buf = io_rw_buffer_select(req, &sqe_len, needs_lock); 2075 + if (IS_ERR(buf)) { 2076 + *iovec = NULL; 2077 + return PTR_ERR(buf); 2078 + } 2079 + req->rw.len = sqe_len; 2080 + } 2081 + 2359 2082 ret = import_single_range(rw, buf, sqe_len, *iovec, iter); 2360 2083 *iovec = NULL; 2361 2084 return ret < 0 ? 
ret : sqe_len; ··· 2377 2084 if (iorw->iov == iorw->fast_iov) 2378 2085 *iovec = NULL; 2379 2086 return iorw->size; 2087 + } 2088 + 2089 + if (req->flags & REQ_F_BUFFER_SELECT) { 2090 + ret = io_iov_buffer_select(req, *iovec, needs_lock); 2091 + if (!ret) { 2092 + ret = (*iovec)->iov_len; 2093 + iov_iter_init(iter, rw, *iovec, 1, ret); 2094 + } 2095 + *iovec = NULL; 2096 + return ret; 2380 2097 } 2381 2098 2382 2099 #ifdef CONFIG_COMPAT ··· 2472 2169 } 2473 2170 } 2474 2171 2172 + static inline int __io_alloc_async_ctx(struct io_kiocb *req) 2173 + { 2174 + req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); 2175 + return req->io == NULL; 2176 + } 2177 + 2475 2178 static int io_alloc_async_ctx(struct io_kiocb *req) 2476 2179 { 2477 2180 if (!io_op_defs[req->opcode].async_ctx) 2478 2181 return 0; 2479 - req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); 2480 - return req->io == NULL; 2182 + 2183 + return __io_alloc_async_ctx(req); 2481 2184 } 2482 2185 2483 2186 static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, ··· 2493 2184 if (!io_op_defs[req->opcode].async_ctx) 2494 2185 return 0; 2495 2186 if (!req->io) { 2496 - if (io_alloc_async_ctx(req)) 2187 + if (__io_alloc_async_ctx(req)) 2497 2188 return -ENOMEM; 2498 2189 2499 2190 io_req_map_rw(req, io_size, iovec, fast_iov, iter); ··· 2522 2213 io = req->io; 2523 2214 io->rw.iov = io->rw.fast_iov; 2524 2215 req->io = NULL; 2525 - ret = io_import_iovec(READ, req, &io->rw.iov, &iter); 2216 + ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock); 2526 2217 req->io = io; 2527 2218 if (ret < 0) 2528 2219 return ret; ··· 2531 2222 return 0; 2532 2223 } 2533 2224 2534 - static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, 2535 - bool force_nonblock) 2225 + static int io_read(struct io_kiocb *req, bool force_nonblock) 2536 2226 { 2537 2227 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 2538 2228 struct kiocb *kiocb = &req->rw.kiocb; ··· 2539 2231 size_t iov_count; 2540 2232 
ssize_t io_size, ret; 2541 2233 2542 - ret = io_import_iovec(READ, req, &iovec, &iter); 2234 + ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock); 2543 2235 if (ret < 0) 2544 2236 return ret; 2545 2237 2546 2238 /* Ensure we clear previously set non-block flag */ 2547 2239 if (!force_nonblock) 2548 - req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; 2240 + kiocb->ki_flags &= ~IOCB_NOWAIT; 2549 2241 2550 2242 req->result = 0; 2551 2243 io_size = ret; ··· 2556 2248 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so 2557 2249 * we know to async punt it even if it was opened O_NONBLOCK 2558 2250 */ 2559 - if (force_nonblock && !io_file_supports_async(req->file)) { 2560 - req->flags |= REQ_F_MUST_PUNT; 2251 + if (force_nonblock && !io_file_supports_async(req->file)) 2561 2252 goto copy_iov; 2562 - } 2563 2253 2564 2254 iov_count = iov_iter_count(&iter); 2565 2255 ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); ··· 2571 2265 2572 2266 /* Catch -EAGAIN return for forced non-blocking submission */ 2573 2267 if (!force_nonblock || ret2 != -EAGAIN) { 2574 - kiocb_done(kiocb, ret2, nxt, req->in_async); 2268 + kiocb_done(kiocb, ret2); 2575 2269 } else { 2576 2270 copy_iov: 2577 2271 ret = io_setup_async_rw(req, io_size, iovec, 2578 2272 inline_vecs, &iter); 2579 2273 if (ret) 2580 2274 goto out_free; 2275 + /* any defer here is final, must blocking retry */ 2276 + if (!(req->flags & REQ_F_NOWAIT)) 2277 + req->flags |= REQ_F_MUST_PUNT; 2581 2278 return -EAGAIN; 2582 2279 } 2583 2280 } ··· 2604 2295 if (unlikely(!(req->file->f_mode & FMODE_WRITE))) 2605 2296 return -EBADF; 2606 2297 2298 + req->fsize = rlimit(RLIMIT_FSIZE); 2299 + 2607 2300 /* either don't need iovec imported or already have it */ 2608 2301 if (!req->io || req->flags & REQ_F_NEED_CLEANUP) 2609 2302 return 0; ··· 2613 2302 io = req->io; 2614 2303 io->rw.iov = io->rw.fast_iov; 2615 2304 req->io = NULL; 2616 - ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter); 2305 + ret = 
io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock); 2617 2306 req->io = io; 2618 2307 if (ret < 0) 2619 2308 return ret; ··· 2622 2311 return 0; 2623 2312 } 2624 2313 2625 - static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, 2626 - bool force_nonblock) 2314 + static int io_write(struct io_kiocb *req, bool force_nonblock) 2627 2315 { 2628 2316 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 2629 2317 struct kiocb *kiocb = &req->rw.kiocb; ··· 2630 2320 size_t iov_count; 2631 2321 ssize_t ret, io_size; 2632 2322 2633 - ret = io_import_iovec(WRITE, req, &iovec, &iter); 2323 + ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock); 2634 2324 if (ret < 0) 2635 2325 return ret; 2636 2326 ··· 2647 2337 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so 2648 2338 * we know to async punt it even if it was opened O_NONBLOCK 2649 2339 */ 2650 - if (force_nonblock && !io_file_supports_async(req->file)) { 2651 - req->flags |= REQ_F_MUST_PUNT; 2340 + if (force_nonblock && !io_file_supports_async(req->file)) 2652 2341 goto copy_iov; 2653 - } 2654 2342 2655 2343 /* file path doesn't support NOWAIT for non-direct_IO */ 2656 2344 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && ··· 2675 2367 } 2676 2368 kiocb->ki_flags |= IOCB_WRITE; 2677 2369 2370 + if (!force_nonblock) 2371 + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; 2372 + 2678 2373 if (req->file->f_op->write_iter) 2679 2374 ret2 = call_write_iter(req->file, kiocb, &iter); 2680 2375 else 2681 2376 ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); 2377 + 2378 + if (!force_nonblock) 2379 + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; 2380 + 2682 2381 /* 2683 - * Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just 2382 + * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just 2684 2383 * retry them without IOCB_NOWAIT. 
2685 2384 */ 2686 2385 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) 2687 2386 ret2 = -EAGAIN; 2688 2387 if (!force_nonblock || ret2 != -EAGAIN) { 2689 - kiocb_done(kiocb, ret2, nxt, req->in_async); 2388 + kiocb_done(kiocb, ret2); 2690 2389 } else { 2691 2390 copy_iov: 2692 2391 ret = io_setup_async_rw(req, io_size, iovec, 2693 2392 inline_vecs, &iter); 2694 2393 if (ret) 2695 2394 goto out_free; 2395 + /* any defer here is final, must blocking retry */ 2396 + req->flags |= REQ_F_MUST_PUNT; 2696 2397 return -EAGAIN; 2697 2398 } 2698 2399 } ··· 2709 2392 req->flags &= ~REQ_F_NEED_CLEANUP; 2710 2393 kfree(iovec); 2711 2394 return ret; 2395 + } 2396 + 2397 + static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2398 + { 2399 + struct io_splice* sp = &req->splice; 2400 + unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; 2401 + int ret; 2402 + 2403 + if (req->flags & REQ_F_NEED_CLEANUP) 2404 + return 0; 2405 + 2406 + sp->file_in = NULL; 2407 + sp->off_in = READ_ONCE(sqe->splice_off_in); 2408 + sp->off_out = READ_ONCE(sqe->off); 2409 + sp->len = READ_ONCE(sqe->len); 2410 + sp->flags = READ_ONCE(sqe->splice_flags); 2411 + 2412 + if (unlikely(sp->flags & ~valid_flags)) 2413 + return -EINVAL; 2414 + 2415 + ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in, 2416 + (sp->flags & SPLICE_F_FD_IN_FIXED)); 2417 + if (ret) 2418 + return ret; 2419 + req->flags |= REQ_F_NEED_CLEANUP; 2420 + 2421 + if (!S_ISREG(file_inode(sp->file_in)->i_mode)) 2422 + req->work.flags |= IO_WQ_WORK_UNBOUND; 2423 + 2424 + return 0; 2425 + } 2426 + 2427 + static bool io_splice_punt(struct file *file) 2428 + { 2429 + if (get_pipe_info(file)) 2430 + return false; 2431 + if (!io_file_supports_async(file)) 2432 + return true; 2433 + return !(file->f_mode & O_NONBLOCK); 2434 + } 2435 + 2436 + static int io_splice(struct io_kiocb *req, bool force_nonblock) 2437 + { 2438 + struct io_splice *sp = &req->splice; 2439 + struct file *in = 
sp->file_in; 2440 + struct file *out = sp->file_out; 2441 + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 2442 + loff_t *poff_in, *poff_out; 2443 + long ret; 2444 + 2445 + if (force_nonblock) { 2446 + if (io_splice_punt(in) || io_splice_punt(out)) 2447 + return -EAGAIN; 2448 + flags |= SPLICE_F_NONBLOCK; 2449 + } 2450 + 2451 + poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; 2452 + poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; 2453 + ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); 2454 + if (force_nonblock && ret == -EAGAIN) 2455 + return -EAGAIN; 2456 + 2457 + io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); 2458 + req->flags &= ~REQ_F_NEED_CLEANUP; 2459 + 2460 + io_cqring_add_event(req, ret); 2461 + if (ret != sp->len) 2462 + req_set_fail_links(req); 2463 + io_put_req(req); 2464 + return 0; 2712 2465 } 2713 2466 2714 2467 /* ··· 2829 2442 return false; 2830 2443 } 2831 2444 2832 - static void io_link_work_cb(struct io_wq_work **workptr) 2445 + static void __io_fsync(struct io_kiocb *req) 2833 2446 { 2834 - struct io_wq_work *work = *workptr; 2835 - struct io_kiocb *link = work->data; 2836 - 2837 - io_queue_linked_timeout(link); 2838 - work->func = io_wq_submit_work; 2839 - } 2840 - 2841 - static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) 2842 - { 2843 - struct io_kiocb *link; 2844 - 2845 - io_prep_async_work(nxt, &link); 2846 - *workptr = &nxt->work; 2847 - if (link) { 2848 - nxt->work.flags |= IO_WQ_WORK_CB; 2849 - nxt->work.func = io_link_work_cb; 2850 - nxt->work.data = link; 2851 - } 2852 - } 2853 - 2854 - static void io_fsync_finish(struct io_wq_work **workptr) 2855 - { 2856 - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 2857 2447 loff_t end = req->sync.off + req->sync.len; 2858 - struct io_kiocb *nxt = NULL; 2859 2448 int ret; 2860 - 2861 - if (io_req_cancelled(req)) 2862 - return; 2863 2449 2864 2450 ret = vfs_fsync_range(req->file, req->sync.off, 2865 2451 
end > 0 ? end : LLONG_MAX, ··· 2840 2480 if (ret < 0) 2841 2481 req_set_fail_links(req); 2842 2482 io_cqring_add_event(req, ret); 2843 - io_put_req_find_next(req, &nxt); 2844 - if (nxt) 2845 - io_wq_assign_next(workptr, nxt); 2483 + io_put_req(req); 2846 2484 } 2847 2485 2848 - static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, 2849 - bool force_nonblock) 2486 + static void io_fsync_finish(struct io_wq_work **workptr) 2850 2487 { 2851 - struct io_wq_work *work, *old_work; 2488 + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 2852 2489 2490 + if (io_req_cancelled(req)) 2491 + return; 2492 + __io_fsync(req); 2493 + io_steal_work(req, workptr); 2494 + } 2495 + 2496 + static int io_fsync(struct io_kiocb *req, bool force_nonblock) 2497 + { 2853 2498 /* fsync always requires a blocking context */ 2854 2499 if (force_nonblock) { 2855 - io_put_req(req); 2856 2500 req->work.func = io_fsync_finish; 2857 2501 return -EAGAIN; 2858 2502 } 2859 - 2860 - work = old_work = &req->work; 2861 - io_fsync_finish(&work); 2862 - if (work && work != old_work) 2863 - *nxt = container_of(work, struct io_kiocb, work); 2503 + __io_fsync(req); 2864 2504 return 0; 2505 + } 2506 + 2507 + static void __io_fallocate(struct io_kiocb *req) 2508 + { 2509 + int ret; 2510 + 2511 + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; 2512 + ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, 2513 + req->sync.len); 2514 + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; 2515 + if (ret < 0) 2516 + req_set_fail_links(req); 2517 + io_cqring_add_event(req, ret); 2518 + io_put_req(req); 2865 2519 } 2866 2520 2867 2521 static void io_fallocate_finish(struct io_wq_work **workptr) 2868 2522 { 2869 2523 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 2870 - struct io_kiocb *nxt = NULL; 2871 - int ret; 2872 2524 2873 2525 if (io_req_cancelled(req)) 2874 2526 return; 2875 - 2876 - ret = vfs_fallocate(req->file, req->sync.mode, 
req->sync.off, 2877 - req->sync.len); 2878 - if (ret < 0) 2879 - req_set_fail_links(req); 2880 - io_cqring_add_event(req, ret); 2881 - io_put_req_find_next(req, &nxt); 2882 - if (nxt) 2883 - io_wq_assign_next(workptr, nxt); 2527 + __io_fallocate(req); 2528 + io_steal_work(req, workptr); 2884 2529 } 2885 2530 2886 2531 static int io_fallocate_prep(struct io_kiocb *req, ··· 2897 2532 req->sync.off = READ_ONCE(sqe->off); 2898 2533 req->sync.len = READ_ONCE(sqe->addr); 2899 2534 req->sync.mode = READ_ONCE(sqe->len); 2535 + req->fsize = rlimit(RLIMIT_FSIZE); 2900 2536 return 0; 2901 2537 } 2902 2538 2903 - static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, 2904 - bool force_nonblock) 2539 + static int io_fallocate(struct io_kiocb *req, bool force_nonblock) 2905 2540 { 2906 - struct io_wq_work *work, *old_work; 2907 - 2908 2541 /* fallocate always requiring blocking context */ 2909 2542 if (force_nonblock) { 2910 - io_put_req(req); 2911 2543 req->work.func = io_fallocate_finish; 2912 2544 return -EAGAIN; 2913 2545 } 2914 2546 2915 - work = old_work = &req->work; 2916 - io_fallocate_finish(&work); 2917 - if (work && work != old_work) 2918 - *nxt = container_of(work, struct io_kiocb, work); 2919 - 2547 + __io_fallocate(req); 2920 2548 return 0; 2921 2549 } 2922 2550 ··· 2984 2626 return 0; 2985 2627 } 2986 2628 2987 - static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, 2988 - bool force_nonblock) 2629 + static int io_openat2(struct io_kiocb *req, bool force_nonblock) 2989 2630 { 2990 2631 struct open_flags op; 2991 2632 struct file *file; ··· 3015 2658 if (ret < 0) 3016 2659 req_set_fail_links(req); 3017 2660 io_cqring_add_event(req, ret); 3018 - io_put_req_find_next(req, nxt); 2661 + io_put_req(req); 3019 2662 return 0; 3020 2663 } 3021 2664 3022 - static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, 3023 - bool force_nonblock) 2665 + static int io_openat(struct io_kiocb *req, bool force_nonblock) 3024 2666 { 3025 2667 
req->open.how = build_open_how(req->open.how.flags, req->open.how.mode); 3026 - return io_openat2(req, nxt, force_nonblock); 2668 + return io_openat2(req, force_nonblock); 2669 + } 2670 + 2671 + static int io_remove_buffers_prep(struct io_kiocb *req, 2672 + const struct io_uring_sqe *sqe) 2673 + { 2674 + struct io_provide_buf *p = &req->pbuf; 2675 + u64 tmp; 2676 + 2677 + if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off) 2678 + return -EINVAL; 2679 + 2680 + tmp = READ_ONCE(sqe->fd); 2681 + if (!tmp || tmp > USHRT_MAX) 2682 + return -EINVAL; 2683 + 2684 + memset(p, 0, sizeof(*p)); 2685 + p->nbufs = tmp; 2686 + p->bgid = READ_ONCE(sqe->buf_group); 2687 + return 0; 2688 + } 2689 + 2690 + static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, 2691 + int bgid, unsigned nbufs) 2692 + { 2693 + unsigned i = 0; 2694 + 2695 + /* shouldn't happen */ 2696 + if (!nbufs) 2697 + return 0; 2698 + 2699 + /* the head kbuf is the list itself */ 2700 + while (!list_empty(&buf->list)) { 2701 + struct io_buffer *nxt; 2702 + 2703 + nxt = list_first_entry(&buf->list, struct io_buffer, list); 2704 + list_del(&nxt->list); 2705 + kfree(nxt); 2706 + if (++i == nbufs) 2707 + return i; 2708 + } 2709 + i++; 2710 + kfree(buf); 2711 + idr_remove(&ctx->io_buffer_idr, bgid); 2712 + 2713 + return i; 2714 + } 2715 + 2716 + static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) 2717 + { 2718 + struct io_provide_buf *p = &req->pbuf; 2719 + struct io_ring_ctx *ctx = req->ctx; 2720 + struct io_buffer *head; 2721 + int ret = 0; 2722 + 2723 + io_ring_submit_lock(ctx, !force_nonblock); 2724 + 2725 + lockdep_assert_held(&ctx->uring_lock); 2726 + 2727 + ret = -ENOENT; 2728 + head = idr_find(&ctx->io_buffer_idr, p->bgid); 2729 + if (head) 2730 + ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); 2731 + 2732 + io_ring_submit_lock(ctx, !force_nonblock); 2733 + if (ret < 0) 2734 + req_set_fail_links(req); 2735 + io_cqring_add_event(req, ret); 2736 
+ io_put_req(req); 2737 + return 0; 2738 + } 2739 + 2740 + static int io_provide_buffers_prep(struct io_kiocb *req, 2741 + const struct io_uring_sqe *sqe) 2742 + { 2743 + struct io_provide_buf *p = &req->pbuf; 2744 + u64 tmp; 2745 + 2746 + if (sqe->ioprio || sqe->rw_flags) 2747 + return -EINVAL; 2748 + 2749 + tmp = READ_ONCE(sqe->fd); 2750 + if (!tmp || tmp > USHRT_MAX) 2751 + return -E2BIG; 2752 + p->nbufs = tmp; 2753 + p->addr = READ_ONCE(sqe->addr); 2754 + p->len = READ_ONCE(sqe->len); 2755 + 2756 + if (!access_ok(u64_to_user_ptr(p->addr), p->len)) 2757 + return -EFAULT; 2758 + 2759 + p->bgid = READ_ONCE(sqe->buf_group); 2760 + tmp = READ_ONCE(sqe->off); 2761 + if (tmp > USHRT_MAX) 2762 + return -E2BIG; 2763 + p->bid = tmp; 2764 + return 0; 2765 + } 2766 + 2767 + static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) 2768 + { 2769 + struct io_buffer *buf; 2770 + u64 addr = pbuf->addr; 2771 + int i, bid = pbuf->bid; 2772 + 2773 + for (i = 0; i < pbuf->nbufs; i++) { 2774 + buf = kmalloc(sizeof(*buf), GFP_KERNEL); 2775 + if (!buf) 2776 + break; 2777 + 2778 + buf->addr = addr; 2779 + buf->len = pbuf->len; 2780 + buf->bid = bid; 2781 + addr += pbuf->len; 2782 + bid++; 2783 + if (!*head) { 2784 + INIT_LIST_HEAD(&buf->list); 2785 + *head = buf; 2786 + } else { 2787 + list_add_tail(&buf->list, &(*head)->list); 2788 + } 2789 + } 2790 + 2791 + return i ? 
i : -ENOMEM; 2792 + } 2793 + 2794 + static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock) 2795 + { 2796 + struct io_provide_buf *p = &req->pbuf; 2797 + struct io_ring_ctx *ctx = req->ctx; 2798 + struct io_buffer *head, *list; 2799 + int ret = 0; 2800 + 2801 + io_ring_submit_lock(ctx, !force_nonblock); 2802 + 2803 + lockdep_assert_held(&ctx->uring_lock); 2804 + 2805 + list = head = idr_find(&ctx->io_buffer_idr, p->bgid); 2806 + 2807 + ret = io_add_buffers(p, &head); 2808 + if (ret < 0) 2809 + goto out; 2810 + 2811 + if (!list) { 2812 + ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1, 2813 + GFP_KERNEL); 2814 + if (ret < 0) { 2815 + __io_remove_buffers(ctx, head, p->bgid, -1U); 2816 + goto out; 2817 + } 2818 + } 2819 + out: 2820 + io_ring_submit_unlock(ctx, !force_nonblock); 2821 + if (ret < 0) 2822 + req_set_fail_links(req); 2823 + io_cqring_add_event(req, ret); 2824 + io_put_req(req); 2825 + return 0; 3027 2826 } 3028 2827 3029 2828 static int io_epoll_ctl_prep(struct io_kiocb *req, ··· 3207 2694 #endif 3208 2695 } 3209 2696 3210 - static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, 3211 - bool force_nonblock) 2697 + static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) 3212 2698 { 3213 2699 #if defined(CONFIG_EPOLL) 3214 2700 struct io_epoll *ie = &req->epoll; ··· 3220 2708 if (ret < 0) 3221 2709 req_set_fail_links(req); 3222 2710 io_cqring_add_event(req, ret); 3223 - io_put_req_find_next(req, nxt); 2711 + io_put_req(req); 3224 2712 return 0; 3225 2713 #else 3226 2714 return -EOPNOTSUPP; ··· 3242 2730 #endif 3243 2731 } 3244 2732 3245 - static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, 3246 - bool force_nonblock) 2733 + static int io_madvise(struct io_kiocb *req, bool force_nonblock) 3247 2734 { 3248 2735 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 3249 2736 struct io_madvise *ma = &req->madvise; ··· 3255 2744 if (ret < 0) 3256 2745 req_set_fail_links(req); 3257 2746 
io_cqring_add_event(req, ret); 3258 - io_put_req_find_next(req, nxt); 2747 + io_put_req(req); 3259 2748 return 0; 3260 2749 #else 3261 2750 return -EOPNOTSUPP; ··· 3273 2762 return 0; 3274 2763 } 3275 2764 3276 - static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, 3277 - bool force_nonblock) 2765 + static int io_fadvise(struct io_kiocb *req, bool force_nonblock) 3278 2766 { 3279 2767 struct io_fadvise *fa = &req->fadvise; 3280 2768 int ret; ··· 3293 2783 if (ret < 0) 3294 2784 req_set_fail_links(req); 3295 2785 io_cqring_add_event(req, ret); 3296 - io_put_req_find_next(req, nxt); 2786 + io_put_req(req); 3297 2787 return 0; 3298 2788 } 3299 2789 ··· 3330 2820 return 0; 3331 2821 } 3332 2822 3333 - static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt, 3334 - bool force_nonblock) 2823 + static int io_statx(struct io_kiocb *req, bool force_nonblock) 3335 2824 { 3336 2825 struct io_open *ctx = &req->open; 3337 2826 unsigned lookup_flags; ··· 3367 2858 if (ret < 0) 3368 2859 req_set_fail_links(req); 3369 2860 io_cqring_add_event(req, ret); 3370 - io_put_req_find_next(req, nxt); 2861 + io_put_req(req); 3371 2862 return 0; 3372 2863 } 3373 2864 ··· 3394 2885 } 3395 2886 3396 2887 /* only called when __close_fd_get_file() is done */ 3397 - static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt) 2888 + static void __io_close_finish(struct io_kiocb *req) 3398 2889 { 3399 2890 int ret; 3400 2891 ··· 3403 2894 req_set_fail_links(req); 3404 2895 io_cqring_add_event(req, ret); 3405 2896 fput(req->close.put_file); 3406 - io_put_req_find_next(req, nxt); 2897 + io_put_req(req); 3407 2898 } 3408 2899 3409 2900 static void io_close_finish(struct io_wq_work **workptr) 3410 2901 { 3411 2902 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 3412 - struct io_kiocb *nxt = NULL; 3413 2903 3414 2904 /* not cancellable, don't do io_req_cancelled() */ 3415 - __io_close_finish(req, &nxt); 3416 - if (nxt) 3417 - 
io_wq_assign_next(workptr, nxt); 2905 + __io_close_finish(req); 2906 + io_steal_work(req, workptr); 3418 2907 } 3419 2908 3420 - static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, 3421 - bool force_nonblock) 2909 + static int io_close(struct io_kiocb *req, bool force_nonblock) 3422 2910 { 3423 2911 int ret; 3424 2912 ··· 3425 2919 return ret; 3426 2920 3427 2921 /* if the file has a flush method, be safe and punt to async */ 3428 - if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) 3429 - goto eagain; 2922 + if (req->close.put_file->f_op->flush && force_nonblock) { 2923 + /* submission ref will be dropped, take it for async */ 2924 + refcount_inc(&req->refs); 2925 + 2926 + req->work.func = io_close_finish; 2927 + /* 2928 + * Do manual async queue here to avoid grabbing files - we don't 2929 + * need the files, and it'll cause io_close_finish() to close 2930 + * the file again and cause a double CQE entry for this request 2931 + */ 2932 + io_queue_async_work(req); 2933 + return 0; 2934 + } 3430 2935 3431 2936 /* 3432 2937 * No ->flush(), safely close from here and just punt the 3433 2938 * fput() to async context. 
3434 2939 */ 3435 - __io_close_finish(req, nxt); 3436 - return 0; 3437 - eagain: 3438 - req->work.func = io_close_finish; 3439 - /* 3440 - * Do manual async queue here to avoid grabbing files - we don't 3441 - * need the files, and it'll cause io_close_finish() to close 3442 - * the file again and cause a double CQE entry for this request 3443 - */ 3444 - io_queue_async_work(req); 2940 + __io_close_finish(req); 3445 2941 return 0; 3446 2942 } 3447 2943 ··· 3465 2957 return 0; 3466 2958 } 3467 2959 3468 - static void io_sync_file_range_finish(struct io_wq_work **workptr) 2960 + static void __io_sync_file_range(struct io_kiocb *req) 3469 2961 { 3470 - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 3471 - struct io_kiocb *nxt = NULL; 3472 2962 int ret; 3473 - 3474 - if (io_req_cancelled(req)) 3475 - return; 3476 2963 3477 2964 ret = sync_file_range(req->file, req->sync.off, req->sync.len, 3478 2965 req->sync.flags); 3479 2966 if (ret < 0) 3480 2967 req_set_fail_links(req); 3481 2968 io_cqring_add_event(req, ret); 3482 - io_put_req_find_next(req, &nxt); 2969 + io_put_req(req); 2970 + } 2971 + 2972 + 2973 + static void io_sync_file_range_finish(struct io_wq_work **workptr) 2974 + { 2975 + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 2976 + struct io_kiocb *nxt = NULL; 2977 + 2978 + if (io_req_cancelled(req)) 2979 + return; 2980 + __io_sync_file_range(req); 2981 + io_put_req(req); /* put submission ref */ 3483 2982 if (nxt) 3484 2983 io_wq_assign_next(workptr, nxt); 3485 2984 } 3486 2985 3487 - static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, 3488 - bool force_nonblock) 2986 + static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) 3489 2987 { 3490 - struct io_wq_work *work, *old_work; 3491 - 3492 2988 /* sync_file_range always requires a blocking context */ 3493 2989 if (force_nonblock) { 3494 - io_put_req(req); 3495 2990 req->work.func = io_sync_file_range_finish; 3496 2991 
return -EAGAIN; 3497 2992 } 3498 2993 3499 - work = old_work = &req->work; 3500 - io_sync_file_range_finish(&work); 3501 - if (work && work != old_work) 3502 - *nxt = container_of(work, struct io_kiocb, work); 2994 + __io_sync_file_range(req); 3503 2995 return 0; 2996 + } 2997 + 2998 + #if defined(CONFIG_NET) 2999 + static int io_setup_async_msg(struct io_kiocb *req, 3000 + struct io_async_msghdr *kmsg) 3001 + { 3002 + if (req->io) 3003 + return -EAGAIN; 3004 + if (io_alloc_async_ctx(req)) { 3005 + if (kmsg->iov != kmsg->fast_iov) 3006 + kfree(kmsg->iov); 3007 + return -ENOMEM; 3008 + } 3009 + req->flags |= REQ_F_NEED_CLEANUP; 3010 + memcpy(&req->io->msg, kmsg, sizeof(*kmsg)); 3011 + return -EAGAIN; 3504 3012 } 3505 3013 3506 3014 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3507 3015 { 3508 - #if defined(CONFIG_NET) 3509 3016 struct io_sr_msg *sr = &req->sr_msg; 3510 3017 struct io_async_ctx *io = req->io; 3511 3018 int ret; ··· 3546 3023 if (!ret) 3547 3024 req->flags |= REQ_F_NEED_CLEANUP; 3548 3025 return ret; 3549 - #else 3550 - return -EOPNOTSUPP; 3551 - #endif 3552 3026 } 3553 3027 3554 - static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, 3555 - bool force_nonblock) 3028 + static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) 3556 3029 { 3557 - #if defined(CONFIG_NET) 3558 3030 struct io_async_msghdr *kmsg = NULL; 3559 3031 struct socket *sock; 3560 3032 int ret; ··· 3589 3071 flags |= MSG_DONTWAIT; 3590 3072 3591 3073 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 3592 - if (force_nonblock && ret == -EAGAIN) { 3593 - if (req->io) 3594 - return -EAGAIN; 3595 - if (io_alloc_async_ctx(req)) { 3596 - if (kmsg->iov != kmsg->fast_iov) 3597 - kfree(kmsg->iov); 3598 - return -ENOMEM; 3599 - } 3600 - req->flags |= REQ_F_NEED_CLEANUP; 3601 - memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); 3602 - return -EAGAIN; 3603 - } 3074 + if (force_nonblock && ret == -EAGAIN) 3075 + return io_setup_async_msg(req, 
kmsg); 3604 3076 if (ret == -ERESTARTSYS) 3605 3077 ret = -EINTR; 3606 3078 } ··· 3601 3093 io_cqring_add_event(req, ret); 3602 3094 if (ret < 0) 3603 3095 req_set_fail_links(req); 3604 - io_put_req_find_next(req, nxt); 3096 + io_put_req(req); 3605 3097 return 0; 3606 - #else 3607 - return -EOPNOTSUPP; 3608 - #endif 3609 3098 } 3610 3099 3611 - static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, 3612 - bool force_nonblock) 3100 + static int io_send(struct io_kiocb *req, bool force_nonblock) 3613 3101 { 3614 - #if defined(CONFIG_NET) 3615 3102 struct socket *sock; 3616 3103 int ret; 3617 3104 ··· 3647 3144 io_cqring_add_event(req, ret); 3648 3145 if (ret < 0) 3649 3146 req_set_fail_links(req); 3650 - io_put_req_find_next(req, nxt); 3147 + io_put_req(req); 3651 3148 return 0; 3652 - #else 3653 - return -EOPNOTSUPP; 3149 + } 3150 + 3151 + static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) 3152 + { 3153 + struct io_sr_msg *sr = &req->sr_msg; 3154 + struct iovec __user *uiov; 3155 + size_t iov_len; 3156 + int ret; 3157 + 3158 + ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr, 3159 + &uiov, &iov_len); 3160 + if (ret) 3161 + return ret; 3162 + 3163 + if (req->flags & REQ_F_BUFFER_SELECT) { 3164 + if (iov_len > 1) 3165 + return -EINVAL; 3166 + if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov))) 3167 + return -EFAULT; 3168 + sr->len = io->msg.iov[0].iov_len; 3169 + iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1, 3170 + sr->len); 3171 + io->msg.iov = NULL; 3172 + } else { 3173 + ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV, 3174 + &io->msg.iov, &io->msg.msg.msg_iter); 3175 + if (ret > 0) 3176 + ret = 0; 3177 + } 3178 + 3179 + return ret; 3180 + } 3181 + 3182 + #ifdef CONFIG_COMPAT 3183 + static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, 3184 + struct io_async_ctx *io) 3185 + { 3186 + struct compat_msghdr __user *msg_compat; 3187 + struct io_sr_msg *sr = &req->sr_msg; 3188 + struct 
compat_iovec __user *uiov; 3189 + compat_uptr_t ptr; 3190 + compat_size_t len; 3191 + int ret; 3192 + 3193 + msg_compat = (struct compat_msghdr __user *) sr->msg; 3194 + ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr, 3195 + &ptr, &len); 3196 + if (ret) 3197 + return ret; 3198 + 3199 + uiov = compat_ptr(ptr); 3200 + if (req->flags & REQ_F_BUFFER_SELECT) { 3201 + compat_ssize_t clen; 3202 + 3203 + if (len > 1) 3204 + return -EINVAL; 3205 + if (!access_ok(uiov, sizeof(*uiov))) 3206 + return -EFAULT; 3207 + if (__get_user(clen, &uiov->iov_len)) 3208 + return -EFAULT; 3209 + if (clen < 0) 3210 + return -EINVAL; 3211 + sr->len = io->msg.iov[0].iov_len; 3212 + io->msg.iov = NULL; 3213 + } else { 3214 + ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV, 3215 + &io->msg.iov, 3216 + &io->msg.msg.msg_iter); 3217 + if (ret < 0) 3218 + return ret; 3219 + } 3220 + 3221 + return 0; 3222 + } 3654 3223 #endif 3224 + 3225 + static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) 3226 + { 3227 + io->msg.iov = io->msg.fast_iov; 3228 + 3229 + #ifdef CONFIG_COMPAT 3230 + if (req->ctx->compat) 3231 + return __io_compat_recvmsg_copy_hdr(req, io); 3232 + #endif 3233 + 3234 + return __io_recvmsg_copy_hdr(req, io); 3235 + } 3236 + 3237 + static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, 3238 + int *cflags, bool needs_lock) 3239 + { 3240 + struct io_sr_msg *sr = &req->sr_msg; 3241 + struct io_buffer *kbuf; 3242 + 3243 + if (!(req->flags & REQ_F_BUFFER_SELECT)) 3244 + return NULL; 3245 + 3246 + kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); 3247 + if (IS_ERR(kbuf)) 3248 + return kbuf; 3249 + 3250 + sr->kbuf = kbuf; 3251 + req->flags |= REQ_F_BUFFER_SELECTED; 3252 + 3253 + *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; 3254 + *cflags |= IORING_CQE_F_BUFFER; 3255 + return kbuf; 3655 3256 } 3656 3257 3657 3258 static int io_recvmsg_prep(struct io_kiocb *req, 3658 3259 const struct io_uring_sqe *sqe) 3659 
3260 { 3660 - #if defined(CONFIG_NET) 3661 3261 struct io_sr_msg *sr = &req->sr_msg; 3662 3262 struct io_async_ctx *io = req->io; 3663 3263 int ret; ··· 3768 3162 sr->msg_flags = READ_ONCE(sqe->msg_flags); 3769 3163 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3770 3164 sr->len = READ_ONCE(sqe->len); 3165 + sr->bgid = READ_ONCE(sqe->buf_group); 3771 3166 3772 3167 #ifdef CONFIG_COMPAT 3773 3168 if (req->ctx->compat) ··· 3781 3174 if (req->flags & REQ_F_NEED_CLEANUP) 3782 3175 return 0; 3783 3176 3784 - io->msg.iov = io->msg.fast_iov; 3785 - ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, 3786 - &io->msg.uaddr, &io->msg.iov); 3177 + ret = io_recvmsg_copy_hdr(req, io); 3787 3178 if (!ret) 3788 3179 req->flags |= REQ_F_NEED_CLEANUP; 3789 3180 return ret; 3790 - #else 3791 - return -EOPNOTSUPP; 3792 - #endif 3793 3181 } 3794 3182 3795 - static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, 3796 - bool force_nonblock) 3183 + static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) 3797 3184 { 3798 - #if defined(CONFIG_NET) 3799 3185 struct io_async_msghdr *kmsg = NULL; 3800 3186 struct socket *sock; 3801 - int ret; 3187 + int ret, cflags = 0; 3802 3188 3803 3189 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3804 3190 return -EINVAL; 3805 3191 3806 3192 sock = sock_from_file(req->file, &ret); 3807 3193 if (sock) { 3194 + struct io_buffer *kbuf; 3808 3195 struct io_async_ctx io; 3809 3196 unsigned flags; 3810 3197 ··· 3810 3209 kmsg->iov = kmsg->fast_iov; 3811 3210 kmsg->msg.msg_iter.iov = kmsg->iov; 3812 3211 } else { 3813 - struct io_sr_msg *sr = &req->sr_msg; 3814 - 3815 3212 kmsg = &io.msg; 3816 3213 kmsg->msg.msg_name = &io.msg.addr; 3817 3214 3818 - io.msg.iov = io.msg.fast_iov; 3819 - ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg, 3820 - sr->msg_flags, &io.msg.uaddr, 3821 - &io.msg.iov); 3215 + ret = io_recvmsg_copy_hdr(req, &io); 3822 3216 if (ret) 3823 3217 return ret; 3218 + } 3219 + 3220 + kbuf = 
io_recv_buffer_select(req, &cflags, !force_nonblock); 3221 + if (IS_ERR(kbuf)) { 3222 + return PTR_ERR(kbuf); 3223 + } else if (kbuf) { 3224 + kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 3225 + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, 3226 + 1, req->sr_msg.len); 3824 3227 } 3825 3228 3826 3229 flags = req->sr_msg.msg_flags; ··· 3835 3230 3836 3231 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, 3837 3232 kmsg->uaddr, flags); 3838 - if (force_nonblock && ret == -EAGAIN) { 3839 - if (req->io) 3840 - return -EAGAIN; 3841 - if (io_alloc_async_ctx(req)) { 3842 - if (kmsg->iov != kmsg->fast_iov) 3843 - kfree(kmsg->iov); 3844 - return -ENOMEM; 3845 - } 3846 - memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); 3847 - req->flags |= REQ_F_NEED_CLEANUP; 3848 - return -EAGAIN; 3849 - } 3233 + if (force_nonblock && ret == -EAGAIN) 3234 + return io_setup_async_msg(req, kmsg); 3850 3235 if (ret == -ERESTARTSYS) 3851 3236 ret = -EINTR; 3852 3237 } ··· 3844 3249 if (kmsg && kmsg->iov != kmsg->fast_iov) 3845 3250 kfree(kmsg->iov); 3846 3251 req->flags &= ~REQ_F_NEED_CLEANUP; 3847 - io_cqring_add_event(req, ret); 3252 + __io_cqring_add_event(req, ret, cflags); 3848 3253 if (ret < 0) 3849 3254 req_set_fail_links(req); 3850 - io_put_req_find_next(req, nxt); 3255 + io_put_req(req); 3851 3256 return 0; 3852 - #else 3853 - return -EOPNOTSUPP; 3854 - #endif 3855 3257 } 3856 3258 3857 - static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, 3858 - bool force_nonblock) 3259 + static int io_recv(struct io_kiocb *req, bool force_nonblock) 3859 3260 { 3860 - #if defined(CONFIG_NET) 3261 + struct io_buffer *kbuf = NULL; 3861 3262 struct socket *sock; 3862 - int ret; 3263 + int ret, cflags = 0; 3863 3264 3864 3265 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3865 3266 return -EINVAL; ··· 3863 3272 sock = sock_from_file(req->file, &ret); 3864 3273 if (sock) { 3865 3274 struct io_sr_msg *sr = &req->sr_msg; 3275 + void __user *buf = sr->buf; 3866 
3276 struct msghdr msg; 3867 3277 struct iovec iov; 3868 3278 unsigned flags; 3869 3279 3870 - ret = import_single_range(READ, sr->buf, sr->len, &iov, 3871 - &msg.msg_iter); 3872 - if (ret) 3873 - return ret; 3280 + kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); 3281 + if (IS_ERR(kbuf)) 3282 + return PTR_ERR(kbuf); 3283 + else if (kbuf) 3284 + buf = u64_to_user_ptr(kbuf->addr); 3874 3285 3286 + ret = import_single_range(READ, buf, sr->len, &iov, 3287 + &msg.msg_iter); 3288 + if (ret) { 3289 + kfree(kbuf); 3290 + return ret; 3291 + } 3292 + 3293 + req->flags |= REQ_F_NEED_CLEANUP; 3875 3294 msg.msg_name = NULL; 3876 3295 msg.msg_control = NULL; 3877 3296 msg.msg_controllen = 0; ··· 3902 3301 ret = -EINTR; 3903 3302 } 3904 3303 3905 - io_cqring_add_event(req, ret); 3304 + kfree(kbuf); 3305 + req->flags &= ~REQ_F_NEED_CLEANUP; 3306 + __io_cqring_add_event(req, ret, cflags); 3906 3307 if (ret < 0) 3907 3308 req_set_fail_links(req); 3908 - io_put_req_find_next(req, nxt); 3309 + io_put_req(req); 3909 3310 return 0; 3910 - #else 3911 - return -EOPNOTSUPP; 3912 - #endif 3913 3311 } 3914 - 3915 3312 3916 3313 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3917 3314 { 3918 - #if defined(CONFIG_NET) 3919 3315 struct io_accept *accept = &req->accept; 3920 3316 3921 3317 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) ··· 3925 3327 accept->flags = READ_ONCE(sqe->accept_flags); 3926 3328 accept->nofile = rlimit(RLIMIT_NOFILE); 3927 3329 return 0; 3928 - #else 3929 - return -EOPNOTSUPP; 3930 - #endif 3931 3330 } 3932 3331 3933 - #if defined(CONFIG_NET) 3934 - static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, 3935 - bool force_nonblock) 3332 + static int __io_accept(struct io_kiocb *req, bool force_nonblock) 3936 3333 { 3937 3334 struct io_accept *accept = &req->accept; 3938 3335 unsigned file_flags; ··· 3944 3351 if (ret < 0) 3945 3352 req_set_fail_links(req); 3946 3353 
io_cqring_add_event(req, ret); 3947 - io_put_req_find_next(req, nxt); 3354 + io_put_req(req); 3948 3355 return 0; 3949 3356 } 3950 3357 3951 3358 static void io_accept_finish(struct io_wq_work **workptr) 3952 3359 { 3953 3360 struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 3954 - struct io_kiocb *nxt = NULL; 3955 3361 3956 3362 if (io_req_cancelled(req)) 3957 3363 return; 3958 - __io_accept(req, &nxt, false); 3959 - if (nxt) 3960 - io_wq_assign_next(workptr, nxt); 3364 + __io_accept(req, false); 3365 + io_steal_work(req, workptr); 3961 3366 } 3962 - #endif 3963 3367 3964 - static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, 3965 - bool force_nonblock) 3368 + static int io_accept(struct io_kiocb *req, bool force_nonblock) 3966 3369 { 3967 - #if defined(CONFIG_NET) 3968 3370 int ret; 3969 3371 3970 - ret = __io_accept(req, nxt, force_nonblock); 3372 + ret = __io_accept(req, force_nonblock); 3971 3373 if (ret == -EAGAIN && force_nonblock) { 3972 3374 req->work.func = io_accept_finish; 3973 - io_put_req(req); 3974 3375 return -EAGAIN; 3975 3376 } 3976 3377 return 0; 3977 - #else 3978 - return -EOPNOTSUPP; 3979 - #endif 3980 3378 } 3981 3379 3982 3380 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3983 3381 { 3984 - #if defined(CONFIG_NET) 3985 3382 struct io_connect *conn = &req->connect; 3986 3383 struct io_async_ctx *io = req->io; 3987 3384 ··· 3988 3405 3989 3406 return move_addr_to_kernel(conn->addr, conn->addr_len, 3990 3407 &io->connect.address); 3991 - #else 3992 - return -EOPNOTSUPP; 3993 - #endif 3994 3408 } 3995 3409 3996 - static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, 3997 - bool force_nonblock) 3410 + static int io_connect(struct io_kiocb *req, bool force_nonblock) 3998 3411 { 3999 - #if defined(CONFIG_NET) 4000 3412 struct io_async_ctx __io, *io; 4001 3413 unsigned file_flags; 4002 3414 int ret; ··· 4027 3449 if (ret < 0) 4028 3450 req_set_fail_links(req); 4029 3451 
io_cqring_add_event(req, ret); 4030 - io_put_req_find_next(req, nxt); 3452 + io_put_req(req); 4031 3453 return 0; 4032 - #else 3454 + } 3455 + #else /* !CONFIG_NET */ 3456 + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3457 + { 4033 3458 return -EOPNOTSUPP; 4034 - #endif 4035 3459 } 4036 3460 4037 - static void io_poll_remove_one(struct io_kiocb *req) 3461 + static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) 4038 3462 { 4039 - struct io_poll_iocb *poll = &req->poll; 3463 + return -EOPNOTSUPP; 3464 + } 3465 + 3466 + static int io_send(struct io_kiocb *req, bool force_nonblock) 3467 + { 3468 + return -EOPNOTSUPP; 3469 + } 3470 + 3471 + static int io_recvmsg_prep(struct io_kiocb *req, 3472 + const struct io_uring_sqe *sqe) 3473 + { 3474 + return -EOPNOTSUPP; 3475 + } 3476 + 3477 + static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) 3478 + { 3479 + return -EOPNOTSUPP; 3480 + } 3481 + 3482 + static int io_recv(struct io_kiocb *req, bool force_nonblock) 3483 + { 3484 + return -EOPNOTSUPP; 3485 + } 3486 + 3487 + static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3488 + { 3489 + return -EOPNOTSUPP; 3490 + } 3491 + 3492 + static int io_accept(struct io_kiocb *req, bool force_nonblock) 3493 + { 3494 + return -EOPNOTSUPP; 3495 + } 3496 + 3497 + static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3498 + { 3499 + return -EOPNOTSUPP; 3500 + } 3501 + 3502 + static int io_connect(struct io_kiocb *req, bool force_nonblock) 3503 + { 3504 + return -EOPNOTSUPP; 3505 + } 3506 + #endif /* CONFIG_NET */ 3507 + 3508 + struct io_poll_table { 3509 + struct poll_table_struct pt; 3510 + struct io_kiocb *req; 3511 + int error; 3512 + }; 3513 + 3514 + static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, 3515 + struct wait_queue_head *head) 3516 + { 3517 + if (unlikely(poll->head)) { 3518 + pt->error = -EINVAL; 3519 + return; 3520 + } 3521 + 3522 + 
pt->error = 0; 3523 + poll->head = head; 3524 + add_wait_queue(head, &poll->wait); 3525 + } 3526 + 3527 + static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, 3528 + struct poll_table_struct *p) 3529 + { 3530 + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 3531 + 3532 + __io_queue_proc(&pt->req->apoll->poll, pt, head); 3533 + } 3534 + 3535 + static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, 3536 + __poll_t mask, task_work_func_t func) 3537 + { 3538 + struct task_struct *tsk; 3539 + 3540 + /* for instances that support it check for an event match first: */ 3541 + if (mask && !(mask & poll->events)) 3542 + return 0; 3543 + 3544 + trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); 3545 + 3546 + list_del_init(&poll->wait.entry); 3547 + 3548 + tsk = req->task; 3549 + req->result = mask; 3550 + init_task_work(&req->task_work, func); 3551 + /* 3552 + * If this fails, then the task is exiting. If that is the case, then 3553 + * the exit check will ultimately cancel these work items. Hence we 3554 + * don't need to check here and handle it specifically. 
3555 + */ 3556 + task_work_add(tsk, &req->task_work, true); 3557 + wake_up_process(tsk); 3558 + return 1; 3559 + } 3560 + 3561 + static void io_async_task_func(struct callback_head *cb) 3562 + { 3563 + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); 3564 + struct async_poll *apoll = req->apoll; 3565 + struct io_ring_ctx *ctx = req->ctx; 3566 + 3567 + trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); 3568 + 3569 + WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry)); 3570 + 3571 + if (hash_hashed(&req->hash_node)) { 3572 + spin_lock_irq(&ctx->completion_lock); 3573 + hash_del(&req->hash_node); 3574 + spin_unlock_irq(&ctx->completion_lock); 3575 + } 3576 + 3577 + /* restore ->work in case we need to retry again */ 3578 + memcpy(&req->work, &apoll->work, sizeof(req->work)); 3579 + 3580 + __set_current_state(TASK_RUNNING); 3581 + mutex_lock(&ctx->uring_lock); 3582 + __io_queue_sqe(req, NULL); 3583 + mutex_unlock(&ctx->uring_lock); 3584 + 3585 + kfree(apoll); 3586 + } 3587 + 3588 + static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 3589 + void *key) 3590 + { 3591 + struct io_kiocb *req = wait->private; 3592 + struct io_poll_iocb *poll = &req->apoll->poll; 3593 + 3594 + trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, 3595 + key_to_poll(key)); 3596 + 3597 + return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); 3598 + } 3599 + 3600 + static void io_poll_req_insert(struct io_kiocb *req) 3601 + { 3602 + struct io_ring_ctx *ctx = req->ctx; 3603 + struct hlist_head *list; 3604 + 3605 + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; 3606 + hlist_add_head(&req->hash_node, list); 3607 + } 3608 + 3609 + static __poll_t __io_arm_poll_handler(struct io_kiocb *req, 3610 + struct io_poll_iocb *poll, 3611 + struct io_poll_table *ipt, __poll_t mask, 3612 + wait_queue_func_t wake_func) 3613 + __acquires(&ctx->completion_lock) 3614 + { 3615 + struct 
io_ring_ctx *ctx = req->ctx; 3616 + bool cancel = false; 3617 + 3618 + poll->file = req->file; 3619 + poll->head = NULL; 3620 + poll->done = poll->canceled = false; 3621 + poll->events = mask; 3622 + 3623 + ipt->pt._key = mask; 3624 + ipt->req = req; 3625 + ipt->error = -EINVAL; 3626 + 3627 + INIT_LIST_HEAD(&poll->wait.entry); 3628 + init_waitqueue_func_entry(&poll->wait, wake_func); 3629 + poll->wait.private = req; 3630 + 3631 + mask = vfs_poll(req->file, &ipt->pt) & poll->events; 3632 + 3633 + spin_lock_irq(&ctx->completion_lock); 3634 + if (likely(poll->head)) { 3635 + spin_lock(&poll->head->lock); 3636 + if (unlikely(list_empty(&poll->wait.entry))) { 3637 + if (ipt->error) 3638 + cancel = true; 3639 + ipt->error = 0; 3640 + mask = 0; 3641 + } 3642 + if (mask || ipt->error) 3643 + list_del_init(&poll->wait.entry); 3644 + else if (cancel) 3645 + WRITE_ONCE(poll->canceled, true); 3646 + else if (!poll->done) /* actually waiting for an event */ 3647 + io_poll_req_insert(req); 3648 + spin_unlock(&poll->head->lock); 3649 + } 3650 + 3651 + return mask; 3652 + } 3653 + 3654 + static bool io_arm_poll_handler(struct io_kiocb *req) 3655 + { 3656 + const struct io_op_def *def = &io_op_defs[req->opcode]; 3657 + struct io_ring_ctx *ctx = req->ctx; 3658 + struct async_poll *apoll; 3659 + struct io_poll_table ipt; 3660 + __poll_t mask, ret; 3661 + 3662 + if (!req->file || !file_can_poll(req->file)) 3663 + return false; 3664 + if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED)) 3665 + return false; 3666 + if (!def->pollin && !def->pollout) 3667 + return false; 3668 + 3669 + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); 3670 + if (unlikely(!apoll)) 3671 + return false; 3672 + 3673 + req->flags |= REQ_F_POLLED; 3674 + memcpy(&apoll->work, &req->work, sizeof(req->work)); 3675 + 3676 + /* 3677 + * Don't need a reference here, as we're adding it to the task 3678 + * task_works list. If the task exits, the list is pruned. 
3679 + */ 3680 + req->task = current; 3681 + req->apoll = apoll; 3682 + INIT_HLIST_NODE(&req->hash_node); 3683 + 3684 + mask = 0; 3685 + if (def->pollin) 3686 + mask |= POLLIN | POLLRDNORM; 3687 + if (def->pollout) 3688 + mask |= POLLOUT | POLLWRNORM; 3689 + mask |= POLLERR | POLLPRI; 3690 + 3691 + ipt.pt._qproc = io_async_queue_proc; 3692 + 3693 + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, 3694 + io_async_wake); 3695 + if (ret) { 3696 + ipt.error = 0; 3697 + apoll->poll.done = true; 3698 + spin_unlock_irq(&ctx->completion_lock); 3699 + memcpy(&req->work, &apoll->work, sizeof(req->work)); 3700 + kfree(apoll); 3701 + return false; 3702 + } 3703 + spin_unlock_irq(&ctx->completion_lock); 3704 + trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask, 3705 + apoll->poll.events); 3706 + return true; 3707 + } 3708 + 3709 + static bool __io_poll_remove_one(struct io_kiocb *req, 3710 + struct io_poll_iocb *poll) 3711 + { 3712 + bool do_complete = false; 4040 3713 4041 3714 spin_lock(&poll->head->lock); 4042 3715 WRITE_ONCE(poll->canceled, true); 4043 3716 if (!list_empty(&poll->wait.entry)) { 4044 3717 list_del_init(&poll->wait.entry); 4045 - io_queue_async_work(req); 3718 + do_complete = true; 4046 3719 } 4047 3720 spin_unlock(&poll->head->lock); 3721 + return do_complete; 3722 + } 3723 + 3724 + static bool io_poll_remove_one(struct io_kiocb *req) 3725 + { 3726 + bool do_complete; 3727 + 3728 + if (req->opcode == IORING_OP_POLL_ADD) { 3729 + do_complete = __io_poll_remove_one(req, &req->poll); 3730 + } else { 3731 + /* non-poll requests have submit ref still */ 3732 + do_complete = __io_poll_remove_one(req, &req->apoll->poll); 3733 + if (do_complete) 3734 + io_put_req(req); 3735 + } 3736 + 4048 3737 hash_del(&req->hash_node); 3738 + 3739 + if (do_complete) { 3740 + io_cqring_fill_event(req, -ECANCELED); 3741 + io_commit_cqring(req->ctx); 3742 + req->flags |= REQ_F_COMP_LOCKED; 3743 + io_put_req(req); 3744 + } 3745 + 3746 + return do_complete; 4049 
3747 } 4050 3748 4051 3749 static void io_poll_remove_all(struct io_ring_ctx *ctx) ··· 4339 3485 io_poll_remove_one(req); 4340 3486 } 4341 3487 spin_unlock_irq(&ctx->completion_lock); 3488 + 3489 + io_cqring_ev_posted(ctx); 4342 3490 } 4343 3491 4344 3492 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) ··· 4350 3494 4351 3495 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; 4352 3496 hlist_for_each_entry(req, list, hash_node) { 4353 - if (sqe_addr == req->user_data) { 4354 - io_poll_remove_one(req); 3497 + if (sqe_addr != req->user_data) 3498 + continue; 3499 + if (io_poll_remove_one(req)) 4355 3500 return 0; 4356 - } 3501 + return -EALREADY; 4357 3502 } 4358 3503 4359 3504 return -ENOENT; ··· 4400 3543 struct io_ring_ctx *ctx = req->ctx; 4401 3544 4402 3545 req->poll.done = true; 4403 - if (error) 4404 - io_cqring_fill_event(req, error); 4405 - else 4406 - io_cqring_fill_event(req, mangle_poll(mask)); 3546 + io_cqring_fill_event(req, error ? error : mangle_poll(mask)); 4407 3547 io_commit_cqring(ctx); 4408 3548 } 4409 3549 4410 - static void io_poll_complete_work(struct io_wq_work **workptr) 3550 + static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) 4411 3551 { 4412 - struct io_wq_work *work = *workptr; 4413 - struct io_kiocb *req = container_of(work, struct io_kiocb, work); 4414 - struct io_poll_iocb *poll = &req->poll; 4415 - struct poll_table_struct pt = { ._key = poll->events }; 4416 3552 struct io_ring_ctx *ctx = req->ctx; 4417 - struct io_kiocb *nxt = NULL; 4418 - __poll_t mask = 0; 4419 - int ret = 0; 4420 3553 4421 - if (work->flags & IO_WQ_WORK_CANCEL) { 4422 - WRITE_ONCE(poll->canceled, true); 4423 - ret = -ECANCELED; 4424 - } else if (READ_ONCE(poll->canceled)) { 4425 - ret = -ECANCELED; 4426 - } 4427 - 4428 - if (ret != -ECANCELED) 4429 - mask = vfs_poll(poll->file, &pt) & poll->events; 4430 - 4431 - /* 4432 - * Note that ->ki_cancel callers also delete iocb from active_reqs after 4433 - 
* calling ->ki_cancel. We need the ctx_lock roundtrip here to 4434 - * synchronize with them. In the cancellation case the list_del_init 4435 - * itself is not actually needed, but harmless so we keep it in to 4436 - * avoid further branches in the fast path. 4437 - */ 4438 3554 spin_lock_irq(&ctx->completion_lock); 4439 - if (!mask && ret != -ECANCELED) { 4440 - add_wait_queue(poll->head, &poll->wait); 4441 - spin_unlock_irq(&ctx->completion_lock); 4442 - return; 4443 - } 4444 3555 hash_del(&req->hash_node); 4445 - io_poll_complete(req, mask, ret); 3556 + io_poll_complete(req, req->result, 0); 3557 + req->flags |= REQ_F_COMP_LOCKED; 3558 + io_put_req_find_next(req, nxt); 4446 3559 spin_unlock_irq(&ctx->completion_lock); 4447 3560 4448 3561 io_cqring_ev_posted(ctx); 4449 - 4450 - if (ret < 0) 4451 - req_set_fail_links(req); 4452 - io_put_req_find_next(req, &nxt); 4453 - if (nxt) 4454 - io_wq_assign_next(workptr, nxt); 4455 3562 } 4456 3563 4457 - static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) 3564 + static void io_poll_task_func(struct callback_head *cb) 4458 3565 { 4459 - struct io_kiocb *req, *tmp; 4460 - struct req_batch rb; 3566 + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); 3567 + struct io_kiocb *nxt = NULL; 4461 3568 4462 - rb.to_free = rb.need_iter = 0; 4463 - spin_lock_irq(&ctx->completion_lock); 4464 - llist_for_each_entry_safe(req, tmp, nodes, llist_node) { 4465 - hash_del(&req->hash_node); 4466 - io_poll_complete(req, req->result, 0); 3569 + io_poll_task_handler(req, &nxt); 3570 + if (nxt) { 3571 + struct io_ring_ctx *ctx = nxt->ctx; 4467 3572 4468 - if (refcount_dec_and_test(&req->refs) && 4469 - !io_req_multi_free(&rb, req)) { 4470 - req->flags |= REQ_F_COMP_LOCKED; 4471 - io_free_req(req); 4472 - } 3573 + mutex_lock(&ctx->uring_lock); 3574 + __io_queue_sqe(nxt, NULL); 3575 + mutex_unlock(&ctx->uring_lock); 4473 3576 } 4474 - spin_unlock_irq(&ctx->completion_lock); 4475 - 4476 - 
io_cqring_ev_posted(ctx); 4477 - io_free_req_many(ctx, &rb); 4478 - } 4479 - 4480 - static void io_poll_flush(struct io_wq_work **workptr) 4481 - { 4482 - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 4483 - struct llist_node *nodes; 4484 - 4485 - nodes = llist_del_all(&req->ctx->poll_llist); 4486 - if (nodes) 4487 - __io_poll_flush(req->ctx, nodes); 4488 - } 4489 - 4490 - static void io_poll_trigger_evfd(struct io_wq_work **workptr) 4491 - { 4492 - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 4493 - 4494 - eventfd_signal(req->ctx->cq_ev_fd, 1); 4495 - io_put_req(req); 4496 3577 } 4497 3578 4498 3579 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 4499 3580 void *key) 4500 3581 { 4501 - struct io_poll_iocb *poll = wait->private; 4502 - struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); 4503 - struct io_ring_ctx *ctx = req->ctx; 4504 - __poll_t mask = key_to_poll(key); 3582 + struct io_kiocb *req = wait->private; 3583 + struct io_poll_iocb *poll = &req->poll; 4505 3584 4506 - /* for instances that support it check for an event match first: */ 4507 - if (mask && !(mask & poll->events)) 4508 - return 0; 4509 - 4510 - list_del_init(&poll->wait.entry); 4511 - 4512 - /* 4513 - * Run completion inline if we can. We're using trylock here because 4514 - * we are violating the completion_lock -> poll wq lock ordering. 4515 - * If we have a link timeout we're going to need the completion_lock 4516 - * for finalizing the request, mark us as having grabbed that already. 
4517 - */ 4518 - if (mask) { 4519 - unsigned long flags; 4520 - 4521 - if (llist_empty(&ctx->poll_llist) && 4522 - spin_trylock_irqsave(&ctx->completion_lock, flags)) { 4523 - bool trigger_ev; 4524 - 4525 - hash_del(&req->hash_node); 4526 - io_poll_complete(req, mask, 0); 4527 - 4528 - trigger_ev = io_should_trigger_evfd(ctx); 4529 - if (trigger_ev && eventfd_signal_count()) { 4530 - trigger_ev = false; 4531 - req->work.func = io_poll_trigger_evfd; 4532 - } else { 4533 - req->flags |= REQ_F_COMP_LOCKED; 4534 - io_put_req(req); 4535 - req = NULL; 4536 - } 4537 - spin_unlock_irqrestore(&ctx->completion_lock, flags); 4538 - __io_cqring_ev_posted(ctx, trigger_ev); 4539 - } else { 4540 - req->result = mask; 4541 - req->llist_node.next = NULL; 4542 - /* if the list wasn't empty, we're done */ 4543 - if (!llist_add(&req->llist_node, &ctx->poll_llist)) 4544 - req = NULL; 4545 - else 4546 - req->work.func = io_poll_flush; 4547 - } 4548 - } 4549 - if (req) 4550 - io_queue_async_work(req); 4551 - 4552 - return 1; 3585 + return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); 4553 3586 } 4554 - 4555 - struct io_poll_table { 4556 - struct poll_table_struct pt; 4557 - struct io_kiocb *req; 4558 - int error; 4559 - }; 4560 3587 4561 3588 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, 4562 3589 struct poll_table_struct *p) 4563 3590 { 4564 3591 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 4565 3592 4566 - if (unlikely(pt->req->poll.head)) { 4567 - pt->error = -EINVAL; 4568 - return; 4569 - } 4570 - 4571 - pt->error = 0; 4572 - pt->req->poll.head = head; 4573 - add_wait_queue(head, &pt->req->poll.wait); 4574 - } 4575 - 4576 - static void io_poll_req_insert(struct io_kiocb *req) 4577 - { 4578 - struct io_ring_ctx *ctx = req->ctx; 4579 - struct hlist_head *list; 4580 - 4581 - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; 4582 - hlist_add_head(&req->hash_node, list); 3593 + 
__io_queue_proc(&pt->req->poll, pt, head); 4583 3594 } 4584 3595 4585 3596 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) ··· 4464 3739 4465 3740 events = READ_ONCE(sqe->poll_events); 4466 3741 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; 3742 + 3743 + /* 3744 + * Don't need a reference here, as we're adding it to the task 3745 + * task_works list. If the task exits, the list is pruned. 3746 + */ 3747 + req->task = current; 4467 3748 return 0; 4468 3749 } 4469 3750 4470 - static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) 3751 + static int io_poll_add(struct io_kiocb *req) 4471 3752 { 4472 3753 struct io_poll_iocb *poll = &req->poll; 4473 3754 struct io_ring_ctx *ctx = req->ctx; 4474 3755 struct io_poll_table ipt; 4475 - bool cancel = false; 4476 3756 __poll_t mask; 4477 3757 4478 - INIT_IO_WORK(&req->work, io_poll_complete_work); 4479 3758 INIT_HLIST_NODE(&req->hash_node); 4480 - 4481 - poll->head = NULL; 4482 - poll->done = false; 4483 - poll->canceled = false; 4484 - 4485 - ipt.pt._qproc = io_poll_queue_proc; 4486 - ipt.pt._key = poll->events; 4487 - ipt.req = req; 4488 - ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ 4489 - 4490 - /* initialized the list so that we can do list_empty checks */ 4491 - INIT_LIST_HEAD(&poll->wait.entry); 4492 - init_waitqueue_func_entry(&poll->wait, io_poll_wake); 4493 - poll->wait.private = poll; 4494 - 4495 3759 INIT_LIST_HEAD(&req->list); 3760 + ipt.pt._qproc = io_poll_queue_proc; 4496 3761 4497 - mask = vfs_poll(poll->file, &ipt.pt) & poll->events; 3762 + mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, 3763 + io_poll_wake); 4498 3764 4499 - spin_lock_irq(&ctx->completion_lock); 4500 - if (likely(poll->head)) { 4501 - spin_lock(&poll->head->lock); 4502 - if (unlikely(list_empty(&poll->wait.entry))) { 4503 - if (ipt.error) 4504 - cancel = true; 4505 - ipt.error = 0; 4506 - mask = 0; 4507 - } 4508 - if (mask || ipt.error) 4509 - 
list_del_init(&poll->wait.entry); 4510 - else if (cancel) 4511 - WRITE_ONCE(poll->canceled, true); 4512 - else if (!poll->done) /* actually waiting for an event */ 4513 - io_poll_req_insert(req); 4514 - spin_unlock(&poll->head->lock); 4515 - } 4516 3765 if (mask) { /* no async, we'd stolen it */ 4517 3766 ipt.error = 0; 4518 3767 io_poll_complete(req, mask, 0); ··· 4495 3796 4496 3797 if (mask) { 4497 3798 io_cqring_ev_posted(ctx); 4498 - io_put_req_find_next(req, nxt); 3799 + io_put_req(req); 4499 3800 } 4500 3801 return ipt.error; 4501 3802 } ··· 4744 4045 4745 4046 static void io_async_find_and_cancel(struct io_ring_ctx *ctx, 4746 4047 struct io_kiocb *req, __u64 sqe_addr, 4747 - struct io_kiocb **nxt, int success_ret) 4048 + int success_ret) 4748 4049 { 4749 4050 unsigned long flags; 4750 4051 int ret; ··· 4770 4071 4771 4072 if (ret < 0) 4772 4073 req_set_fail_links(req); 4773 - io_put_req_find_next(req, nxt); 4074 + io_put_req(req); 4774 4075 } 4775 4076 4776 4077 static int io_async_cancel_prep(struct io_kiocb *req, ··· 4786 4087 return 0; 4787 4088 } 4788 4089 4789 - static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) 4090 + static int io_async_cancel(struct io_kiocb *req) 4790 4091 { 4791 4092 struct io_ring_ctx *ctx = req->ctx; 4792 4093 4793 - io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); 4094 + io_async_find_and_cancel(ctx, req, req->cancel.addr, 0); 4794 4095 return 0; 4795 4096 } 4796 4097 ··· 4925 4226 case IORING_OP_EPOLL_CTL: 4926 4227 ret = io_epoll_ctl_prep(req, sqe); 4927 4228 break; 4229 + case IORING_OP_SPLICE: 4230 + ret = io_splice_prep(req, sqe); 4231 + break; 4232 + case IORING_OP_PROVIDE_BUFFERS: 4233 + ret = io_provide_buffers_prep(req, sqe); 4234 + break; 4235 + case IORING_OP_REMOVE_BUFFERS: 4236 + ret = io_remove_buffers_prep(req, sqe); 4237 + break; 4928 4238 default: 4929 4239 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", 4930 4240 req->opcode); ··· 4980 4272 case IORING_OP_READV: 
4981 4273 case IORING_OP_READ_FIXED: 4982 4274 case IORING_OP_READ: 4275 + if (req->flags & REQ_F_BUFFER_SELECTED) 4276 + kfree((void *)(unsigned long)req->rw.addr); 4277 + /* fallthrough */ 4983 4278 case IORING_OP_WRITEV: 4984 4279 case IORING_OP_WRITE_FIXED: 4985 4280 case IORING_OP_WRITE: 4986 4281 if (io->rw.iov != io->rw.fast_iov) 4987 4282 kfree(io->rw.iov); 4988 4283 break; 4989 - case IORING_OP_SENDMSG: 4990 4284 case IORING_OP_RECVMSG: 4285 + if (req->flags & REQ_F_BUFFER_SELECTED) 4286 + kfree(req->sr_msg.kbuf); 4287 + /* fallthrough */ 4288 + case IORING_OP_SENDMSG: 4991 4289 if (io->msg.iov != io->msg.fast_iov) 4992 4290 kfree(io->msg.iov); 4291 + break; 4292 + case IORING_OP_RECV: 4293 + if (req->flags & REQ_F_BUFFER_SELECTED) 4294 + kfree(req->sr_msg.kbuf); 4993 4295 break; 4994 4296 case IORING_OP_OPENAT: 4995 4297 case IORING_OP_OPENAT2: 4996 4298 case IORING_OP_STATX: 4997 4299 putname(req->open.filename); 4300 + break; 4301 + case IORING_OP_SPLICE: 4302 + io_put_file(req, req->splice.file_in, 4303 + (req->splice.flags & SPLICE_F_FD_IN_FIXED)); 4998 4304 break; 4999 4305 } 5000 4306 ··· 5016 4294 } 5017 4295 5018 4296 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, 5019 - struct io_kiocb **nxt, bool force_nonblock) 4297 + bool force_nonblock) 5020 4298 { 5021 4299 struct io_ring_ctx *ctx = req->ctx; 5022 4300 int ret; ··· 5033 4311 if (ret < 0) 5034 4312 break; 5035 4313 } 5036 - ret = io_read(req, nxt, force_nonblock); 4314 + ret = io_read(req, force_nonblock); 5037 4315 break; 5038 4316 case IORING_OP_WRITEV: 5039 4317 case IORING_OP_WRITE_FIXED: ··· 5043 4321 if (ret < 0) 5044 4322 break; 5045 4323 } 5046 - ret = io_write(req, nxt, force_nonblock); 4324 + ret = io_write(req, force_nonblock); 5047 4325 break; 5048 4326 case IORING_OP_FSYNC: 5049 4327 if (sqe) { ··· 5051 4329 if (ret < 0) 5052 4330 break; 5053 4331 } 5054 - ret = io_fsync(req, nxt, force_nonblock); 4332 + ret = io_fsync(req, force_nonblock); 5055 
4333 break; 5056 4334 case IORING_OP_POLL_ADD: 5057 4335 if (sqe) { ··· 5059 4337 if (ret) 5060 4338 break; 5061 4339 } 5062 - ret = io_poll_add(req, nxt); 4340 + ret = io_poll_add(req); 5063 4341 break; 5064 4342 case IORING_OP_POLL_REMOVE: 5065 4343 if (sqe) { ··· 5075 4353 if (ret < 0) 5076 4354 break; 5077 4355 } 5078 - ret = io_sync_file_range(req, nxt, force_nonblock); 4356 + ret = io_sync_file_range(req, force_nonblock); 5079 4357 break; 5080 4358 case IORING_OP_SENDMSG: 5081 4359 case IORING_OP_SEND: ··· 5085 4363 break; 5086 4364 } 5087 4365 if (req->opcode == IORING_OP_SENDMSG) 5088 - ret = io_sendmsg(req, nxt, force_nonblock); 4366 + ret = io_sendmsg(req, force_nonblock); 5089 4367 else 5090 - ret = io_send(req, nxt, force_nonblock); 4368 + ret = io_send(req, force_nonblock); 5091 4369 break; 5092 4370 case IORING_OP_RECVMSG: 5093 4371 case IORING_OP_RECV: ··· 5097 4375 break; 5098 4376 } 5099 4377 if (req->opcode == IORING_OP_RECVMSG) 5100 - ret = io_recvmsg(req, nxt, force_nonblock); 4378 + ret = io_recvmsg(req, force_nonblock); 5101 4379 else 5102 - ret = io_recv(req, nxt, force_nonblock); 4380 + ret = io_recv(req, force_nonblock); 5103 4381 break; 5104 4382 case IORING_OP_TIMEOUT: 5105 4383 if (sqe) { ··· 5123 4401 if (ret) 5124 4402 break; 5125 4403 } 5126 - ret = io_accept(req, nxt, force_nonblock); 4404 + ret = io_accept(req, force_nonblock); 5127 4405 break; 5128 4406 case IORING_OP_CONNECT: 5129 4407 if (sqe) { ··· 5131 4409 if (ret) 5132 4410 break; 5133 4411 } 5134 - ret = io_connect(req, nxt, force_nonblock); 4412 + ret = io_connect(req, force_nonblock); 5135 4413 break; 5136 4414 case IORING_OP_ASYNC_CANCEL: 5137 4415 if (sqe) { ··· 5139 4417 if (ret) 5140 4418 break; 5141 4419 } 5142 - ret = io_async_cancel(req, nxt); 4420 + ret = io_async_cancel(req); 5143 4421 break; 5144 4422 case IORING_OP_FALLOCATE: 5145 4423 if (sqe) { ··· 5147 4425 if (ret) 5148 4426 break; 5149 4427 } 5150 - ret = io_fallocate(req, nxt, force_nonblock); 4428 + ret = 
io_fallocate(req, force_nonblock); 5151 4429 break; 5152 4430 case IORING_OP_OPENAT: 5153 4431 if (sqe) { ··· 5155 4433 if (ret) 5156 4434 break; 5157 4435 } 5158 - ret = io_openat(req, nxt, force_nonblock); 4436 + ret = io_openat(req, force_nonblock); 5159 4437 break; 5160 4438 case IORING_OP_CLOSE: 5161 4439 if (sqe) { ··· 5163 4441 if (ret) 5164 4442 break; 5165 4443 } 5166 - ret = io_close(req, nxt, force_nonblock); 4444 + ret = io_close(req, force_nonblock); 5167 4445 break; 5168 4446 case IORING_OP_FILES_UPDATE: 5169 4447 if (sqe) { ··· 5179 4457 if (ret) 5180 4458 break; 5181 4459 } 5182 - ret = io_statx(req, nxt, force_nonblock); 4460 + ret = io_statx(req, force_nonblock); 5183 4461 break; 5184 4462 case IORING_OP_FADVISE: 5185 4463 if (sqe) { ··· 5187 4465 if (ret) 5188 4466 break; 5189 4467 } 5190 - ret = io_fadvise(req, nxt, force_nonblock); 4468 + ret = io_fadvise(req, force_nonblock); 5191 4469 break; 5192 4470 case IORING_OP_MADVISE: 5193 4471 if (sqe) { ··· 5195 4473 if (ret) 5196 4474 break; 5197 4475 } 5198 - ret = io_madvise(req, nxt, force_nonblock); 4476 + ret = io_madvise(req, force_nonblock); 5199 4477 break; 5200 4478 case IORING_OP_OPENAT2: 5201 4479 if (sqe) { ··· 5203 4481 if (ret) 5204 4482 break; 5205 4483 } 5206 - ret = io_openat2(req, nxt, force_nonblock); 4484 + ret = io_openat2(req, force_nonblock); 5207 4485 break; 5208 4486 case IORING_OP_EPOLL_CTL: 5209 4487 if (sqe) { ··· 5211 4489 if (ret) 5212 4490 break; 5213 4491 } 5214 - ret = io_epoll_ctl(req, nxt, force_nonblock); 4492 + ret = io_epoll_ctl(req, force_nonblock); 4493 + break; 4494 + case IORING_OP_SPLICE: 4495 + if (sqe) { 4496 + ret = io_splice_prep(req, sqe); 4497 + if (ret < 0) 4498 + break; 4499 + } 4500 + ret = io_splice(req, force_nonblock); 4501 + break; 4502 + case IORING_OP_PROVIDE_BUFFERS: 4503 + if (sqe) { 4504 + ret = io_provide_buffers_prep(req, sqe); 4505 + if (ret) 4506 + break; 4507 + } 4508 + ret = io_provide_buffers(req, force_nonblock); 4509 + break; 4510 
+ case IORING_OP_REMOVE_BUFFERS: 4511 + if (sqe) { 4512 + ret = io_remove_buffers_prep(req, sqe); 4513 + if (ret) 4514 + break; 4515 + } 4516 + ret = io_remove_buffers(req, force_nonblock); 5215 4517 break; 5216 4518 default: 5217 4519 ret = -EINVAL; ··· 5268 4522 { 5269 4523 struct io_wq_work *work = *workptr; 5270 4524 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 5271 - struct io_kiocb *nxt = NULL; 5272 4525 int ret = 0; 5273 4526 5274 4527 /* if NO_CANCEL is set, we must still run the work */ ··· 5277 4532 } 5278 4533 5279 4534 if (!ret) { 5280 - req->in_async = true; 5281 4535 do { 5282 - ret = io_issue_sqe(req, NULL, &nxt, false); 4536 + ret = io_issue_sqe(req, NULL, false); 5283 4537 /* 5284 4538 * We can get EAGAIN for polled IO even though we're 5285 4539 * forcing a sync submission from here, since we can't ··· 5290 4546 } while (1); 5291 4547 } 5292 4548 5293 - /* drop submission reference */ 5294 - io_put_req(req); 5295 - 5296 4549 if (ret) { 5297 4550 req_set_fail_links(req); 5298 4551 io_cqring_add_event(req, ret); 5299 4552 io_put_req(req); 5300 4553 } 5301 4554 5302 - /* if a dependent link is ready, pass it back */ 5303 - if (!ret && nxt) 5304 - io_wq_assign_next(workptr, nxt); 4555 + io_steal_work(req, workptr); 5305 4556 } 5306 4557 5307 4558 static int io_req_needs_file(struct io_kiocb *req, int fd) ··· 5317 4578 return table->files[index & IORING_FILE_TABLE_MASK];; 5318 4579 } 5319 4580 4581 + static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, 4582 + int fd, struct file **out_file, bool fixed) 4583 + { 4584 + struct io_ring_ctx *ctx = req->ctx; 4585 + struct file *file; 4586 + 4587 + if (fixed) { 4588 + if (unlikely(!ctx->file_data || 4589 + (unsigned) fd >= ctx->nr_user_files)) 4590 + return -EBADF; 4591 + fd = array_index_nospec(fd, ctx->nr_user_files); 4592 + file = io_file_from_index(ctx, fd); 4593 + if (!file) 4594 + return -EBADF; 4595 + percpu_ref_get(&ctx->file_data->refs); 4596 + } else { 
4597 + trace_io_uring_file_get(ctx, fd); 4598 + file = __io_file_get(state, fd); 4599 + if (unlikely(!file)) 4600 + return -EBADF; 4601 + } 4602 + 4603 + *out_file = file; 4604 + return 0; 4605 + } 4606 + 5320 4607 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, 5321 4608 const struct io_uring_sqe *sqe) 5322 4609 { 5323 - struct io_ring_ctx *ctx = req->ctx; 5324 4610 unsigned flags; 5325 4611 int fd; 4612 + bool fixed; 5326 4613 5327 4614 flags = READ_ONCE(sqe->flags); 5328 4615 fd = READ_ONCE(sqe->fd); ··· 5356 4591 if (!io_req_needs_file(req, fd)) 5357 4592 return 0; 5358 4593 5359 - if (flags & IOSQE_FIXED_FILE) { 5360 - if (unlikely(!ctx->file_data || 5361 - (unsigned) fd >= ctx->nr_user_files)) 5362 - return -EBADF; 5363 - fd = array_index_nospec(fd, ctx->nr_user_files); 5364 - req->file = io_file_from_index(ctx, fd); 5365 - if (!req->file) 5366 - return -EBADF; 5367 - req->flags |= REQ_F_FIXED_FILE; 5368 - percpu_ref_get(&ctx->file_data->refs); 5369 - } else { 5370 - if (req->needs_fixed_file) 5371 - return -EBADF; 5372 - trace_io_uring_file_get(ctx, fd); 5373 - req->file = io_file_get(state, fd); 5374 - if (unlikely(!req->file)) 5375 - return -EBADF; 5376 - } 4594 + fixed = (flags & IOSQE_FIXED_FILE); 4595 + if (unlikely(!fixed && req->needs_fixed_file)) 4596 + return -EBADF; 5377 4597 5378 - return 0; 4598 + return io_file_get(state, req, fd, &req->file, fixed); 5379 4599 } 5380 4600 5381 4601 static int io_grab_files(struct io_kiocb *req) ··· 5422 4672 5423 4673 if (prev) { 5424 4674 req_set_fail_links(prev); 5425 - io_async_find_and_cancel(ctx, req, prev->user_data, NULL, 5426 - -ETIME); 4675 + io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); 5427 4676 io_put_req(prev); 5428 4677 } else { 5429 4678 io_cqring_add_event(req, -ETIME); ··· 5459 4710 5460 4711 if (!(req->flags & REQ_F_LINK)) 5461 4712 return NULL; 4713 + /* for polled retry, if flag is set, we already went through here */ 4714 + if (req->flags & 
REQ_F_POLLED) 4715 + return NULL; 5462 4716 5463 4717 nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, 5464 4718 link_list); ··· 5475 4723 static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5476 4724 { 5477 4725 struct io_kiocb *linked_timeout; 5478 - struct io_kiocb *nxt = NULL; 4726 + struct io_kiocb *nxt; 5479 4727 const struct cred *old_creds = NULL; 5480 4728 int ret; 5481 4729 ··· 5491 4739 old_creds = override_creds(req->work.creds); 5492 4740 } 5493 4741 5494 - ret = io_issue_sqe(req, sqe, &nxt, true); 4742 + ret = io_issue_sqe(req, sqe, true); 5495 4743 5496 4744 /* 5497 4745 * We async punt it if the file wasn't marked NOWAIT, or if the file ··· 5499 4747 */ 5500 4748 if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || 5501 4749 (req->flags & REQ_F_MUST_PUNT))) { 4750 + if (io_arm_poll_handler(req)) { 4751 + if (linked_timeout) 4752 + io_queue_linked_timeout(linked_timeout); 4753 + goto exit; 4754 + } 5502 4755 punt: 5503 4756 if (io_op_defs[req->opcode].file_table) { 5504 4757 ret = io_grab_files(req); ··· 5516 4759 * submit reference when the iocb is actually submitted. 
5517 4760 */ 5518 4761 io_queue_async_work(req); 5519 - goto done_req; 4762 + goto exit; 5520 4763 } 5521 4764 5522 4765 err: 4766 + nxt = NULL; 5523 4767 /* drop submission reference */ 5524 4768 io_put_req_find_next(req, &nxt); 5525 4769 ··· 5537 4779 req_set_fail_links(req); 5538 4780 io_put_req(req); 5539 4781 } 5540 - done_req: 5541 4782 if (nxt) { 5542 4783 req = nxt; 5543 - nxt = NULL; 5544 4784 5545 4785 if (req->flags & REQ_F_FORCE_ASYNC) 5546 4786 goto punt; 5547 4787 goto again; 5548 4788 } 4789 + exit: 5549 4790 if (old_creds) 5550 4791 revert_creds(old_creds); 5551 4792 } ··· 5586 4829 } 5587 4830 5588 4831 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ 5589 - IOSQE_IO_HARDLINK | IOSQE_ASYNC) 4832 + IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ 4833 + IOSQE_BUFFER_SELECT) 5590 4834 5591 4835 static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, 5592 4836 struct io_submit_state *state, struct io_kiocb **link) ··· 5604 4846 goto err_req; 5605 4847 } 5606 4848 4849 + if ((sqe_flags & IOSQE_BUFFER_SELECT) && 4850 + !io_op_defs[req->opcode].buffer_select) { 4851 + ret = -EOPNOTSUPP; 4852 + goto err_req; 4853 + } 4854 + 5607 4855 id = READ_ONCE(sqe->personality); 5608 4856 if (id) { 5609 4857 req->work.creds = idr_find(&ctx->personality_idr, id); ··· 5621 4857 } 5622 4858 5623 4859 /* same numerical values with corresponding REQ_F_*, safe to copy */ 5624 - req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| 5625 - IOSQE_ASYNC); 4860 + req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK | 4861 + IOSQE_ASYNC | IOSQE_FIXED_FILE | 4862 + IOSQE_BUFFER_SELECT); 5626 4863 5627 4864 ret = io_req_set_file(state, req, sqe); 5628 4865 if (unlikely(ret)) { ··· 5844 5079 *mm = ctx->sqo_mm; 5845 5080 } 5846 5081 5847 - req->in_async = async; 5848 5082 req->needs_fixed_file = async; 5849 5083 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, 5850 5084 true, async); ··· 5927 5163 if 
(!list_empty(&ctx->poll_list) || 5928 5164 (!time_after(jiffies, timeout) && ret != -EBUSY && 5929 5165 !percpu_ref_is_dying(&ctx->refs))) { 5166 + if (current->task_works) 5167 + task_work_run(); 5930 5168 cond_resched(); 5931 5169 continue; 5932 5170 } ··· 5960 5194 finish_wait(&ctx->sqo_wait, &wait); 5961 5195 break; 5962 5196 } 5197 + if (current->task_works) { 5198 + task_work_run(); 5199 + continue; 5200 + } 5963 5201 if (signal_pending(current)) 5964 5202 flush_signals(current); 5965 5203 schedule(); ··· 5982 5212 mutex_unlock(&ctx->uring_lock); 5983 5213 timeout = jiffies + ctx->sq_thread_idle; 5984 5214 } 5215 + 5216 + if (current->task_works) 5217 + task_work_run(); 5985 5218 5986 5219 set_fs(old_fs); 5987 5220 if (cur_mm) { ··· 6050 5277 struct io_rings *rings = ctx->rings; 6051 5278 int ret = 0; 6052 5279 6053 - if (io_cqring_events(ctx, false) >= min_events) 6054 - return 0; 5280 + do { 5281 + if (io_cqring_events(ctx, false) >= min_events) 5282 + return 0; 5283 + if (!current->task_works) 5284 + break; 5285 + task_work_run(); 5286 + } while (1); 6055 5287 6056 5288 if (sig) { 6057 5289 #ifdef CONFIG_COMPAT ··· 6076 5298 do { 6077 5299 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, 6078 5300 TASK_INTERRUPTIBLE); 5301 + if (current->task_works) 5302 + task_work_run(); 6079 5303 if (io_should_wake(&iowq, false)) 6080 5304 break; 6081 5305 schedule(); ··· 6387 5607 struct io_file_put { 6388 5608 struct llist_node llist; 6389 5609 struct file *file; 6390 - struct completion *done; 6391 5610 }; 6392 5611 6393 5612 static void io_ring_file_ref_flush(struct fixed_file_data *data) ··· 6397 5618 while ((node = llist_del_all(&data->put_llist)) != NULL) { 6398 5619 llist_for_each_entry_safe(pfile, tmp, node, llist) { 6399 5620 io_ring_file_put(data->ctx, pfile->file); 6400 - if (pfile->done) 6401 - complete(pfile->done); 6402 - else 6403 - kfree(pfile); 5621 + kfree(pfile); 6404 5622 } 6405 5623 } 6406 5624 } ··· 6592 5816 percpu_ref_get(&data->refs); 6593 5817 
} 6594 5818 6595 - static bool io_queue_file_removal(struct fixed_file_data *data, 5819 + static int io_queue_file_removal(struct fixed_file_data *data, 6596 5820 struct file *file) 6597 5821 { 6598 - struct io_file_put *pfile, pfile_stack; 6599 - DECLARE_COMPLETION_ONSTACK(done); 5822 + struct io_file_put *pfile; 6600 5823 6601 - /* 6602 - * If we fail allocating the struct we need for doing async reomval 6603 - * of this file, just punt to sync and wait for it. 6604 - */ 6605 5824 pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); 6606 - if (!pfile) { 6607 - pfile = &pfile_stack; 6608 - pfile->done = &done; 6609 - } 5825 + if (!pfile) 5826 + return -ENOMEM; 6610 5827 6611 5828 pfile->file = file; 6612 5829 llist_add(&pfile->llist, &data->put_llist); 6613 - 6614 - if (pfile == &pfile_stack) { 6615 - percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); 6616 - wait_for_completion(&done); 6617 - flush_work(&data->ref_work); 6618 - return false; 6619 - } 6620 - 6621 - return true; 5830 + return 0; 6622 5831 } 6623 5832 6624 5833 static int __io_sqe_files_update(struct io_ring_ctx *ctx, ··· 6638 5877 index = i & IORING_FILE_TABLE_MASK; 6639 5878 if (table->files[index]) { 6640 5879 file = io_file_from_index(ctx, index); 5880 + err = io_queue_file_removal(data, file); 5881 + if (err) 5882 + break; 6641 5883 table->files[index] = NULL; 6642 - if (io_queue_file_removal(data, file)) 6643 - ref_switch = true; 5884 + ref_switch = true; 6644 5885 } 6645 5886 if (fd != -1) { 6646 5887 file = fget(fd); ··· 6695 5932 return __io_sqe_files_update(ctx, &up, nr_args); 6696 5933 } 6697 5934 6698 - static void io_put_work(struct io_wq_work *work) 5935 + static void io_free_work(struct io_wq_work *work) 6699 5936 { 6700 5937 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6701 5938 5939 + /* Consider that io_steal_work() relies on this ref */ 6702 5940 io_put_req(req); 6703 - } 6704 - 6705 - static void io_get_work(struct io_wq_work *work) 6706 - { 6707 - struct 
io_kiocb *req = container_of(work, struct io_kiocb, work); 6708 - 6709 - refcount_inc(&req->refs); 6710 5941 } 6711 5942 6712 5943 static int io_init_wq_offload(struct io_ring_ctx *ctx, ··· 6713 5956 int ret = 0; 6714 5957 6715 5958 data.user = ctx->user; 6716 - data.get_work = io_get_work; 6717 - data.put_work = io_put_work; 5959 + data.free_work = io_free_work; 6718 5960 6719 5961 if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { 6720 5962 /* Do QD, or 4 * CPUS, whatever is smallest */ ··· 7115 6359 return -ENXIO; 7116 6360 } 7117 6361 6362 + static int __io_destroy_buffers(int id, void *p, void *data) 6363 + { 6364 + struct io_ring_ctx *ctx = data; 6365 + struct io_buffer *buf = p; 6366 + 6367 + __io_remove_buffers(ctx, buf, id, -1U); 6368 + return 0; 6369 + } 6370 + 6371 + static void io_destroy_buffers(struct io_ring_ctx *ctx) 6372 + { 6373 + idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx); 6374 + idr_destroy(&ctx->io_buffer_idr); 6375 + } 6376 + 7118 6377 static void io_ring_ctx_free(struct io_ring_ctx *ctx) 7119 6378 { 7120 6379 io_finish_async(ctx); ··· 7140 6369 io_sqe_buffer_unregister(ctx); 7141 6370 io_sqe_files_unregister(ctx); 7142 6371 io_eventfd_unregister(ctx); 6372 + io_destroy_buffers(ctx); 7143 6373 idr_destroy(&ctx->personality_idr); 7144 6374 7145 6375 #if defined(CONFIG_UNIX) ··· 7395 6623 int submitted = 0; 7396 6624 struct fd f; 7397 6625 6626 + if (current->task_works) 6627 + task_work_run(); 6628 + 7398 6629 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) 7399 6630 return -EINVAL; 7400 6631 ··· 7444 6669 7445 6670 min_complete = min(min_complete, ctx->cq_entries); 7446 6671 7447 - if (ctx->flags & IORING_SETUP_IOPOLL) { 6672 + /* 6673 + * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user 6674 + * space applications don't need to do io completion events 6675 + * polling again, they can rely on io_sq_thread to do polling 6676 + * work, which can reduce cpu usage and uring_lock contention. 
6677 + */ 6678 + if (ctx->flags & IORING_SETUP_IOPOLL && 6679 + !(ctx->flags & IORING_SETUP_SQPOLL)) { 7448 6680 ret = io_iopoll_check(ctx, &nr_events, min_complete); 7449 6681 } else { 7450 6682 ret = io_cqring_wait(ctx, min_complete, sig, sigsz); ··· 7527 6745 seq_printf(m, "Personalities:\n"); 7528 6746 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m); 7529 6747 } 6748 + seq_printf(m, "PollList:\n"); 6749 + spin_lock_irq(&ctx->completion_lock); 6750 + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 6751 + struct hlist_head *list = &ctx->cancel_hash[i]; 6752 + struct io_kiocb *req; 6753 + 6754 + hlist_for_each_entry(req, list, hash_node) 6755 + seq_printf(m, " op=%d, task_works=%d\n", req->opcode, 6756 + req->task->task_works != NULL); 6757 + } 6758 + spin_unlock_irq(&ctx->completion_lock); 7530 6759 mutex_unlock(&ctx->uring_lock); 7531 6760 } 7532 6761 ··· 7754 6961 7755 6962 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | 7756 6963 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | 7757 - IORING_FEAT_CUR_PERSONALITY; 6964 + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; 7758 6965 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); 7759 6966 return ret; 7760 6967 err: ··· 8032 7239 BUILD_BUG_SQE_ELEM(8, __u64, off); 8033 7240 BUILD_BUG_SQE_ELEM(8, __u64, addr2); 8034 7241 BUILD_BUG_SQE_ELEM(16, __u64, addr); 7242 + BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); 8035 7243 BUILD_BUG_SQE_ELEM(24, __u32, len); 8036 7244 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); 8037 7245 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); ··· 8047 7253 BUILD_BUG_SQE_ELEM(28, __u32, open_flags); 8048 7254 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); 8049 7255 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); 7256 + BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); 8050 7257 BUILD_BUG_SQE_ELEM(32, __u64, user_data); 8051 7258 BUILD_BUG_SQE_ELEM(40, __u16, buf_index); 8052 7259 BUILD_BUG_SQE_ELEM(42, __u16, personality); 7260 + 
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); 8053 7261 8054 7262 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); 7263 + BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); 8055 7264 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); 8056 7265 return 0; 8057 7266 };
+3 -3
fs/splice.c
··· 1109 1109 /* 1110 1110 * Determine where to splice to/from. 1111 1111 */ 1112 - static long do_splice(struct file *in, loff_t __user *off_in, 1113 - struct file *out, loff_t __user *off_out, 1114 - size_t len, unsigned int flags) 1112 + long do_splice(struct file *in, loff_t __user *off_in, 1113 + struct file *out, loff_t __user *off_out, 1114 + size_t len, unsigned int flags) 1115 1115 { 1116 1116 struct pipe_inode_info *ipipe; 1117 1117 struct pipe_inode_info *opipe;
+4
include/linux/socket.h
··· 391 391 struct user_msghdr __user *umsg, unsigned flags, 392 392 struct sockaddr __user **uaddr, 393 393 struct iovec **iov); 394 + extern int __copy_msghdr_from_user(struct msghdr *kmsg, 395 + struct user_msghdr __user *umsg, 396 + struct sockaddr __user **save_addr, 397 + struct iovec __user **uiov, size_t *nsegs); 394 398 395 399 /* helpers which do the actual work for syscalls */ 396 400 extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
+3
include/linux/splice.h
··· 78 78 struct pipe_buffer *); 79 79 extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, 80 80 splice_direct_actor *); 81 + extern long do_splice(struct file *in, loff_t __user *off_in, 82 + struct file *out, loff_t __user *off_out, 83 + size_t len, unsigned int flags); 81 84 82 85 /* 83 86 * for dynamic pipe sizing
+3
include/net/compat.h
··· 38 38 #define compat_mmsghdr mmsghdr 39 39 #endif /* defined(CONFIG_COMPAT) */ 40 40 41 + int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg, 42 + struct sockaddr __user **save_addr, compat_uptr_t *ptr, 43 + compat_size_t *len); 41 44 int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, 42 45 struct sockaddr __user **, struct iovec **); 43 46 struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval);
+103
include/trace/events/io_uring.h
··· 357 357 __entry->force_nonblock, __entry->sq_thread) 358 358 ); 359 359 360 + TRACE_EVENT(io_uring_poll_arm, 361 + 362 + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events), 363 + 364 + TP_ARGS(ctx, opcode, user_data, mask, events), 365 + 366 + TP_STRUCT__entry ( 367 + __field( void *, ctx ) 368 + __field( u8, opcode ) 369 + __field( u64, user_data ) 370 + __field( int, mask ) 371 + __field( int, events ) 372 + ), 373 + 374 + TP_fast_assign( 375 + __entry->ctx = ctx; 376 + __entry->opcode = opcode; 377 + __entry->user_data = user_data; 378 + __entry->mask = mask; 379 + __entry->events = events; 380 + ), 381 + 382 + TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x", 383 + __entry->ctx, __entry->opcode, 384 + (unsigned long long) __entry->user_data, 385 + __entry->mask, __entry->events) 386 + ); 387 + 388 + TRACE_EVENT(io_uring_poll_wake, 389 + 390 + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), 391 + 392 + TP_ARGS(ctx, opcode, user_data, mask), 393 + 394 + TP_STRUCT__entry ( 395 + __field( void *, ctx ) 396 + __field( u8, opcode ) 397 + __field( u64, user_data ) 398 + __field( int, mask ) 399 + ), 400 + 401 + TP_fast_assign( 402 + __entry->ctx = ctx; 403 + __entry->opcode = opcode; 404 + __entry->user_data = user_data; 405 + __entry->mask = mask; 406 + ), 407 + 408 + TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x", 409 + __entry->ctx, __entry->opcode, 410 + (unsigned long long) __entry->user_data, 411 + __entry->mask) 412 + ); 413 + 414 + TRACE_EVENT(io_uring_task_add, 415 + 416 + TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), 417 + 418 + TP_ARGS(ctx, opcode, user_data, mask), 419 + 420 + TP_STRUCT__entry ( 421 + __field( void *, ctx ) 422 + __field( u8, opcode ) 423 + __field( u64, user_data ) 424 + __field( int, mask ) 425 + ), 426 + 427 + TP_fast_assign( 428 + __entry->ctx = ctx; 429 + __entry->opcode = opcode; 430 + __entry->user_data = user_data; 431 + __entry->mask = mask; 432 + ), 433 + 434 + 
TP_printk("ring %p, op %d, data 0x%llx, mask %x", 435 + __entry->ctx, __entry->opcode, 436 + (unsigned long long) __entry->user_data, 437 + __entry->mask) 438 + ); 439 + 440 + TRACE_EVENT(io_uring_task_run, 441 + 442 + TP_PROTO(void *ctx, u8 opcode, u64 user_data), 443 + 444 + TP_ARGS(ctx, opcode, user_data), 445 + 446 + TP_STRUCT__entry ( 447 + __field( void *, ctx ) 448 + __field( u8, opcode ) 449 + __field( u64, user_data ) 450 + ), 451 + 452 + TP_fast_assign( 453 + __entry->ctx = ctx; 454 + __entry->opcode = opcode; 455 + __entry->user_data = user_data; 456 + ), 457 + 458 + TP_printk("ring %p, op %d, data 0x%llx", 459 + __entry->ctx, __entry->opcode, 460 + (unsigned long long) __entry->user_data) 461 + ); 462 + 360 463 #endif /* _TRACE_IO_URING_H */ 361 464 362 465 /* This part must be outside protection */
+38 -4
include/uapi/linux/io_uring.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ 1 + /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ 2 2 /* 3 3 * Header file for the io_uring interface. 4 4 * ··· 23 23 __u64 off; /* offset into file */ 24 24 __u64 addr2; 25 25 }; 26 - __u64 addr; /* pointer to buffer or iovecs */ 26 + union { 27 + __u64 addr; /* pointer to buffer or iovecs */ 28 + __u64 splice_off_in; 29 + }; 27 30 __u32 len; /* buffer size or number of iovecs */ 28 31 union { 29 32 __kernel_rwf_t rw_flags; ··· 40 37 __u32 open_flags; 41 38 __u32 statx_flags; 42 39 __u32 fadvise_advice; 40 + __u32 splice_flags; 43 41 }; 44 42 __u64 user_data; /* data to be passed back at completion time */ 45 43 union { 46 44 struct { 47 - /* index into fixed buffers, if used */ 48 - __u16 buf_index; 45 + /* pack this to avoid bogus arm OABI complaints */ 46 + union { 47 + /* index into fixed buffers, if used */ 48 + __u16 buf_index; 49 + /* for grouped buffer selection */ 50 + __u16 buf_group; 51 + } __attribute__((packed)); 49 52 /* personality to use, if used */ 50 53 __u16 personality; 54 + __s32 splice_fd_in; 51 55 }; 52 56 __u64 __pad2[3]; 53 57 }; ··· 66 56 IOSQE_IO_LINK_BIT, 67 57 IOSQE_IO_HARDLINK_BIT, 68 58 IOSQE_ASYNC_BIT, 59 + IOSQE_BUFFER_SELECT_BIT, 69 60 }; 70 61 71 62 /* ··· 82 71 #define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT) 83 72 /* always go async */ 84 73 #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT) 74 + /* select buffer from sqe->buf_group */ 75 + #define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT) 85 76 86 77 /* 87 78 * io_uring_setup() flags ··· 126 113 IORING_OP_RECV, 127 114 IORING_OP_OPENAT2, 128 115 IORING_OP_EPOLL_CTL, 116 + IORING_OP_SPLICE, 117 + IORING_OP_PROVIDE_BUFFERS, 118 + IORING_OP_REMOVE_BUFFERS, 129 119 130 120 /* this goes last, obviously */ 131 121 IORING_OP_LAST, ··· 145 129 #define IORING_TIMEOUT_ABS (1U << 0) 146 130 147 131 /* 132 + * sqe->splice_flags 133 + * extends splice(2) flags 134 + */ 135 + #define 
SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */ 136 + 137 + /* 148 138 * IO completion data structure (Completion Queue Entry) 149 139 */ 150 140 struct io_uring_cqe { 151 141 __u64 user_data; /* sqe->data submission passed back */ 152 142 __s32 res; /* result code for this event */ 153 143 __u32 flags; 144 + }; 145 + 146 + /* 147 + * cqe->flags 148 + * 149 + * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID 150 + */ 151 + #define IORING_CQE_F_BUFFER (1U << 0) 152 + 153 + enum { 154 + IORING_CQE_BUFFER_SHIFT = 16, 154 155 }; 155 156 156 157 /* ··· 237 204 #define IORING_FEAT_SUBMIT_STABLE (1U << 2) 238 205 #define IORING_FEAT_RW_CUR_POS (1U << 3) 239 206 #define IORING_FEAT_CUR_PERSONALITY (1U << 4) 207 + #define IORING_FEAT_FAST_POLL (1U << 5) 240 208 241 209 /* 242 210 * io_uring_register(2) opcodes and arguments
+14 -4
kernel/task_work.c
··· 97 97 * work->func() can do task_work_add(), do not set 98 98 * work_exited unless the list is empty. 99 99 */ 100 - raw_spin_lock_irq(&task->pi_lock); 101 100 do { 101 + head = NULL; 102 102 work = READ_ONCE(task->task_works); 103 - head = !work && (task->flags & PF_EXITING) ? 104 - &work_exited : NULL; 103 + if (!work) { 104 + if (task->flags & PF_EXITING) 105 + head = &work_exited; 106 + else 107 + break; 108 + } 105 109 } while (cmpxchg(&task->task_works, work, head) != work); 106 - raw_spin_unlock_irq(&task->pi_lock); 107 110 108 111 if (!work) 109 112 break; 113 + /* 114 + * Synchronize with task_work_cancel(). It can not remove 115 + * the first entry == work, cmpxchg(task_works) must fail. 116 + * But it can remove another entry from the ->next list. 117 + */ 118 + raw_spin_lock_irq(&task->pi_lock); 119 + raw_spin_unlock_irq(&task->pi_lock); 110 120 111 121 do { 112 122 next = work->next;
+23 -7
net/compat.c
··· 33 33 #include <linux/uaccess.h> 34 34 #include <net/compat.h> 35 35 36 - int get_compat_msghdr(struct msghdr *kmsg, 37 - struct compat_msghdr __user *umsg, 38 - struct sockaddr __user **save_addr, 39 - struct iovec **iov) 36 + int __get_compat_msghdr(struct msghdr *kmsg, 37 + struct compat_msghdr __user *umsg, 38 + struct sockaddr __user **save_addr, 39 + compat_uptr_t *ptr, compat_size_t *len) 40 40 { 41 41 struct compat_msghdr msg; 42 42 ssize_t err; ··· 79 79 return -EMSGSIZE; 80 80 81 81 kmsg->msg_iocb = NULL; 82 + *ptr = msg.msg_iov; 83 + *len = msg.msg_iovlen; 84 + return 0; 85 + } 82 86 83 - err = compat_import_iovec(save_addr ? READ : WRITE, 84 - compat_ptr(msg.msg_iov), msg.msg_iovlen, 85 - UIO_FASTIOV, iov, &kmsg->msg_iter); 87 + int get_compat_msghdr(struct msghdr *kmsg, 88 + struct compat_msghdr __user *umsg, 89 + struct sockaddr __user **save_addr, 90 + struct iovec **iov) 91 + { 92 + compat_uptr_t ptr; 93 + compat_size_t len; 94 + ssize_t err; 95 + 96 + err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len); 97 + if (err) 98 + return err; 99 + 100 + err = compat_import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr), 101 + len, UIO_FASTIOV, iov, &kmsg->msg_iter); 86 102 return err < 0 ? err : 0; 87 103 } 88 104
+21 -4
net/socket.c
··· 2228 2228 unsigned int name_len; 2229 2229 }; 2230 2230 2231 - static int copy_msghdr_from_user(struct msghdr *kmsg, 2232 - struct user_msghdr __user *umsg, 2233 - struct sockaddr __user **save_addr, 2234 - struct iovec **iov) 2231 + int __copy_msghdr_from_user(struct msghdr *kmsg, 2232 + struct user_msghdr __user *umsg, 2233 + struct sockaddr __user **save_addr, 2234 + struct iovec __user **uiov, size_t *nsegs) 2235 2235 { 2236 2236 struct user_msghdr msg; 2237 2237 ssize_t err; ··· 2273 2273 return -EMSGSIZE; 2274 2274 2275 2275 kmsg->msg_iocb = NULL; 2276 + *uiov = msg.msg_iov; 2277 + *nsegs = msg.msg_iovlen; 2278 + return 0; 2279 + } 2280 + 2281 + static int copy_msghdr_from_user(struct msghdr *kmsg, 2282 + struct user_msghdr __user *umsg, 2283 + struct sockaddr __user **save_addr, 2284 + struct iovec **iov) 2285 + { 2286 + struct user_msghdr msg; 2287 + ssize_t err; 2288 + 2289 + err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov, 2290 + &msg.msg_iovlen); 2291 + if (err) 2292 + return err; 2276 2293 2277 2294 err = import_iovec(save_addr ? READ : WRITE, 2278 2295 msg.msg_iov, msg.msg_iovlen,