Merge tag 'for-5.11/io_uring-2020-12-14' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
"Fairly light set of changes this time around, and mostly some bits
that were pushed out to 5.11 instead of 5.10, fixes/cleanups, and a
few features. In particular:

- Cleanups around iovec import (David Laight, Pavel)

- Add timeout support for io_uring_enter(2), which enables us to
clean up liburing and avoid a timeout sqe submission in the
completion path.

The big win here is that it allows setups that split SQ and CQ
handling into separate threads to avoid locking, as the CQ side
no longer has to submit a timeout request while waiting for
events (Hao Xu)

- Add support for socket shutdown, and renameat/unlinkat.

- SQPOLL cleanups and improvements (Xiaoguang Wang)

- Allow SQPOLL setups for CAP_SYS_NICE, and enable regular
(non-fixed) files to be used.

- Cancelation improvements (Pavel)

- Fixed file reference improvements (Pavel)

- IOPOLL related race fixes (Pavel)

- Lots of other little fixes and cleanups (mostly Pavel)"
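
A rough userspace-side illustration of the io_uring_enter(2) timeout described
in the list above: in the fs/io_uring.c hunks below, io_cqring_wait() gains a
struct __kernel_timespec argument and returns -ETIME when the wait expires; the
syscall plumbing itself is not in the hunks shown here. The sketch simply goes
through liburing's existing io_uring_wait_cqe_timeout() helper, the idea being
that it can be serviced by this native path instead of queueing an internal
IORING_OP_TIMEOUT request from the completion side. Treat it as a sketch under
those assumptions, not as the interface added by this pull.

/* Sketch only: wait for one completion, but give up after two seconds. */
#include <errno.h>
#include <stdio.h>
#include <liburing.h>

static int wait_one_cqe_with_timeout(struct io_uring *ring)
{
	struct __kernel_timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
	struct io_uring_cqe *cqe;
	int ret;

	ret = io_uring_wait_cqe_timeout(ring, &cqe, &ts);
	if (ret == -ETIME)
		return 0;	/* nothing completed within the timeout */
	if (ret < 0)
		return ret;	/* real error from the wait */

	printf("cqe: user_data=%llu res=%d\n",
	       (unsigned long long)cqe->user_data, cqe->res);
	io_uring_cqe_seen(ring, cqe);
	return 1;
}

This is exactly the setup the bullet above is about: a CQ-reaping thread can
block with a deadline without ever touching the SQ ring, so it needs no locking
against the thread doing submissions.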
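
On the SQPOLL side, the capability check in the setup path (visible near the
end of the fs/io_uring.c diff) now accepts CAP_SYS_NICE as an alternative to
CAP_SYS_ADMIN, and the new __io_sq_thread_acquire_files() path is what lets
such rings issue requests against regular, non-registered file descriptors. A
minimal setup sketch, assuming standard liburing (io_uring_queue_init_params()
and struct io_uring_params are not part of this diff):

#include <string.h>
#include <liburing.h>

static int setup_sqpoll_ring(struct io_uring *ring)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQPOLL;
	p.sq_thread_idle = 2000;	/* ms of idle before the SQ thread sleeps */

	/* -EPERM without CAP_SYS_ADMIN or, after this series, CAP_SYS_NICE */
	return io_uring_queue_init_params(8, ring, &p);
}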

* tag 'for-5.11/io_uring-2020-12-14' of git://git.kernel.dk/linux-block: (43 commits)
io_uring: fix io_cqring_events()'s noflush
io_uring: fix racy IOPOLL flush overflow
io_uring: fix racy IOPOLL completions
io_uring: always let io_iopoll_complete() complete polled io
io_uring: add timeout update
io_uring: restructure io_timeout_cancel()
io_uring: fix files cancellation
io_uring: use bottom half safe lock for fixed file data
io_uring: fix miscounting ios_left
io_uring: change submit file state invariant
io_uring: check kthread stopped flag when sq thread is unparked
io_uring: share fixed_file_refs b/w multiple rsrcs
io_uring: replace inflight_wait with tctx->wait
io_uring: don't take fs for recvmsg/sendmsg
io_uring: only wake up sq thread while current task is in io worker context
io_uring: don't acquire uring_lock twice
io_uring: initialize 'timeout' properly in io_sq_thread()
io_uring: refactor io_sq_thread() handling
io_uring: always batch cancel in *cancel_files()
io_uring: pass files into kill timeouts/poll
...
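
The "io_uring: add timeout update" commit above extends IORING_OP_TIMEOUT_REMOVE
so that an armed timeout can be re-armed rather than cancelled: io_timeout_remove_prep()
in the fs/io_uring.c hunks below takes the target's user_data from sqe->addr, the
flags from sqe->timeout_flags, and the new expiry from a struct __kernel_timespec
pointed to by sqe->addr2. A hedged sketch of the raw SQE layout, derived from that
prep code rather than from documentation (it assumes 5.11 uapi headers for
IORING_TIMEOUT_UPDATE; illustrative only):

#include <string.h>
#include <linux/io_uring.h>
#include <linux/time_types.h>

static void prep_timeout_update(struct io_uring_sqe *sqe,
				__u64 timeout_user_data,
				struct __kernel_timespec *new_ts)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_TIMEOUT_REMOVE;
	sqe->addr = timeout_user_data;		/* user_data of the timeout to adjust */
	sqe->addr2 = (unsigned long)new_ts;	/* read via get_timespec64() */
	sqe->timeout_flags = IORING_TIMEOUT_UPDATE;	/* may be OR'ed with IORING_TIMEOUT_ABS */
}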

+829 -595
fs/internal.h  +2
··· 78 78 long do_rmdir(int dfd, struct filename *name); 79 79 long do_unlinkat(int dfd, struct filename *name); 80 80 int may_linkat(struct path *link); 81 + int do_renameat2(int olddfd, struct filename *oldname, int newdfd, 82 + struct filename *newname, unsigned int flags); 81 83 82 84 /* 83 85 * namespace.c
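
The do_renameat2() export above exists so the new IORING_OP_RENAMEAT handler in
fs/io_uring.c can call it; do_unlinkat()/do_rmdir(), used by IORING_OP_UNLINKAT,
were already declared here. For orientation, a sketch of how the corresponding
SQEs are laid out, mirroring io_renameat_prep() and io_unlinkat_prep() further
down in this diff (raw uapi fields, illustrative only; liburing's
io_uring_prep_renameat()/io_uring_prep_unlinkat() wrappers cover the same ground
and are not part of this pull):

#include <fcntl.h>		/* AT_FDCWD, AT_REMOVEDIR */
#include <string.h>
#include <linux/io_uring.h>

static void prep_renameat(struct io_uring_sqe *sqe, int old_dfd,
			  const char *oldpath, int new_dfd,
			  const char *newpath, unsigned int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_RENAMEAT;
	sqe->fd = old_dfd;			/* ren->old_dfd */
	sqe->addr = (unsigned long)oldpath;	/* ren->oldpath, copied via getname() */
	sqe->addr2 = (unsigned long)newpath;	/* ren->newpath */
	sqe->len = new_dfd;			/* ren->new_dfd */
	sqe->rename_flags = flags;		/* renameat2()-style flags */
}

static void prep_unlinkat(struct io_uring_sqe *sqe, int dfd,
			  const char *path, unsigned int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_UNLINKAT;
	sqe->fd = dfd;
	sqe->addr = (unsigned long)path;
	sqe->unlink_flags = flags;	/* only AT_REMOVEDIR is accepted */
}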
fs/io-wq.c  -10
··· 1078 1078 return IO_WQ_CANCEL_NOTFOUND; 1079 1079 } 1080 1080 1081 - static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) 1082 - { 1083 - return work == data; 1084 - } 1085 - 1086 - enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) 1087 - { 1088 - return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false); 1089 - } 1090 - 1091 1081 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) 1092 1082 { 1093 1083 int ret = -ENOMEM, node;
fs/io-wq.h  -1
··· 129 129 } 130 130 131 131 void io_wq_cancel_all(struct io_wq *wq); 132 - enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); 133 132 134 133 typedef bool (work_cancel_fn)(struct io_wq_work *, void *); 135 134
fs/io_uring.c  +775 -562
··· 245 245 246 246 struct task_struct *thread; 247 247 struct wait_queue_head wait; 248 + 249 + unsigned sq_thread_idle; 248 250 }; 249 251 250 252 struct io_ring_ctx { ··· 287 285 struct list_head timeout_list; 288 286 struct list_head cq_overflow_list; 289 287 290 - wait_queue_head_t inflight_wait; 291 288 struct io_uring_sqe *sq_sqes; 292 289 } ____cacheline_aligned_in_smp; 293 290 ··· 311 310 struct io_sq_data *sq_data; /* if using sq thread polling */ 312 311 313 312 struct wait_queue_head sqo_sq_wait; 314 - struct wait_queue_entry sqo_wait_entry; 315 313 struct list_head sqd_list; 316 314 317 315 /* ··· 395 395 */ 396 396 struct io_poll_iocb { 397 397 struct file *file; 398 - union { 399 - struct wait_queue_head *head; 400 - u64 addr; 401 - }; 398 + struct wait_queue_head *head; 402 399 __poll_t events; 403 400 bool done; 404 401 bool canceled; 405 402 struct wait_queue_entry wait; 403 + }; 404 + 405 + struct io_poll_remove { 406 + struct file *file; 407 + u64 addr; 406 408 }; 407 409 408 410 struct io_close { ··· 446 444 u32 off; 447 445 u32 target_seq; 448 446 struct list_head list; 447 + /* head of the link, used by linked timeouts only */ 448 + struct io_kiocb *head; 449 449 }; 450 450 451 451 struct io_timeout_rem { 452 452 struct file *file; 453 453 u64 addr; 454 + 455 + /* timeout update */ 456 + struct timespec64 ts; 457 + u32 flags; 454 458 }; 455 459 456 460 struct io_rw { ··· 549 541 struct statx __user *buffer; 550 542 }; 551 543 544 + struct io_shutdown { 545 + struct file *file; 546 + int how; 547 + }; 548 + 549 + struct io_rename { 550 + struct file *file; 551 + int old_dfd; 552 + int new_dfd; 553 + struct filename *oldpath; 554 + struct filename *newpath; 555 + int flags; 556 + }; 557 + 558 + struct io_unlink { 559 + struct file *file; 560 + int dfd; 561 + int flags; 562 + struct filename *filename; 563 + }; 564 + 552 565 struct io_completion { 553 566 struct file *file; 554 567 struct list_head list; ··· 604 575 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, 605 576 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, 606 577 607 - REQ_F_LINK_HEAD_BIT, 608 578 REQ_F_FAIL_LINK_BIT, 609 579 REQ_F_INFLIGHT_BIT, 610 580 REQ_F_CUR_POS_BIT, ··· 635 607 /* IOSQE_BUFFER_SELECT */ 636 608 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), 637 609 638 - /* head of a link */ 639 - REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT), 640 610 /* fail rest of links */ 641 611 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), 642 612 /* on inflight list */ ··· 677 651 struct file *file; 678 652 struct io_rw rw; 679 653 struct io_poll_iocb poll; 654 + struct io_poll_remove poll_remove; 680 655 struct io_accept accept; 681 656 struct io_sync sync; 682 657 struct io_cancel cancel; ··· 694 667 struct io_splice splice; 695 668 struct io_provide_buf pbuf; 696 669 struct io_statx statx; 670 + struct io_shutdown shutdown; 671 + struct io_rename rename; 672 + struct io_unlink unlink; 697 673 /* use only after cleaning per-op data, see io_clean_op() */ 698 674 struct io_completion compl; 699 675 }; ··· 716 686 struct task_struct *task; 717 687 u64 user_data; 718 688 719 - struct list_head link_list; 689 + struct io_kiocb *link; 690 + struct percpu_ref *fixed_file_refs; 720 691 721 692 /* 722 693 * 1. used with ctx->iopoll_list with reads/writes 723 694 * 2. to track reqs with ->files (see io_op_def::file_table) 724 695 */ 725 696 struct list_head inflight_entry; 726 - 727 - struct percpu_ref *fixed_file_refs; 728 697 struct callback_head task_work; 729 698 /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ 730 699 struct hlist_node hash_node; ··· 754 725 void *reqs[IO_IOPOLL_BATCH]; 755 726 unsigned int free_reqs; 756 727 728 + bool plug_started; 729 + 757 730 /* 758 731 * Batch completion logic 759 732 */ ··· 766 735 */ 767 736 struct file *file; 768 737 unsigned int fd; 769 - unsigned int has_refs; 738 + unsigned int file_refs; 770 739 unsigned int ios_left; 771 740 }; 772 741 ··· 788 757 unsigned buffer_select : 1; 789 758 /* must always have async data allocated */ 790 759 unsigned needs_async_data : 1; 760 + /* should block plug */ 761 + unsigned plug : 1; 791 762 /* size of async data needed, if any */ 792 763 unsigned short async_size; 793 764 unsigned work_flags; ··· 803 770 .pollin = 1, 804 771 .buffer_select = 1, 805 772 .needs_async_data = 1, 773 + .plug = 1, 806 774 .async_size = sizeof(struct io_async_rw), 807 775 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 808 776 }, ··· 813 779 .unbound_nonreg_file = 1, 814 780 .pollout = 1, 815 781 .needs_async_data = 1, 782 + .plug = 1, 816 783 .async_size = sizeof(struct io_async_rw), 817 784 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 818 785 IO_WQ_WORK_FSIZE, ··· 826 791 .needs_file = 1, 827 792 .unbound_nonreg_file = 1, 828 793 .pollin = 1, 794 + .plug = 1, 829 795 .async_size = sizeof(struct io_async_rw), 830 796 .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, 831 797 }, ··· 835 799 .hash_reg_file = 1, 836 800 .unbound_nonreg_file = 1, 837 801 .pollout = 1, 802 + .plug = 1, 838 803 .async_size = sizeof(struct io_async_rw), 839 804 .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | 840 805 IO_WQ_WORK_MM, ··· 855 818 .pollout = 1, 856 819 .needs_async_data = 1, 857 820 .async_size = sizeof(struct io_async_msghdr), 858 - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 859 - IO_WQ_WORK_FS, 821 + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 860 822 }, 861 823 [IORING_OP_RECVMSG] = { 862 824 .needs_file = 1, ··· 864 828 .buffer_select = 1, 865 829 .needs_async_data = 1, 866 830 .async_size = sizeof(struct io_async_msghdr), 867 - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 868 - IO_WQ_WORK_FS, 831 + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 869 832 }, 870 833 [IORING_OP_TIMEOUT] = { 871 834 .needs_async_data = 1, 872 835 .async_size = sizeof(struct io_timeout_data), 873 836 .work_flags = IO_WQ_WORK_MM, 874 837 }, 875 - [IORING_OP_TIMEOUT_REMOVE] = {}, 838 + [IORING_OP_TIMEOUT_REMOVE] = { 839 + /* used by timeout updates' prep() */ 840 + .work_flags = IO_WQ_WORK_MM, 841 + }, 876 842 [IORING_OP_ACCEPT] = { 877 843 .needs_file = 1, 878 844 .unbound_nonreg_file = 1, ··· 901 863 }, 902 864 [IORING_OP_OPENAT] = { 903 865 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | 904 - IO_WQ_WORK_FS, 866 + IO_WQ_WORK_FS | IO_WQ_WORK_MM, 905 867 }, 906 868 [IORING_OP_CLOSE] = { 907 869 .needs_file = 1, ··· 920 882 .unbound_nonreg_file = 1, 921 883 .pollin = 1, 922 884 .buffer_select = 1, 885 + .plug = 1, 923 886 .async_size = sizeof(struct io_async_rw), 924 887 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, 925 888 }, ··· 928 889 .needs_file = 1, 929 890 .unbound_nonreg_file = 1, 930 891 .pollout = 1, 892 + .plug = 1, 931 893 .async_size = sizeof(struct io_async_rw), 932 894 .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | 933 895 IO_WQ_WORK_FSIZE, ··· 955 915 }, 956 916 [IORING_OP_OPENAT2] = { 957 917 .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | 958 - IO_WQ_WORK_BLKCG, 918 + IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, 959 919 }, 960 920 [IORING_OP_EPOLL_CTL] = { 961 921 .unbound_nonreg_file = 1, 
··· 973 933 .needs_file = 1, 974 934 .hash_reg_file = 1, 975 935 .unbound_nonreg_file = 1, 936 + }, 937 + [IORING_OP_SHUTDOWN] = { 938 + .needs_file = 1, 939 + }, 940 + [IORING_OP_RENAMEAT] = { 941 + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | 942 + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, 943 + }, 944 + [IORING_OP_UNLINKAT] = { 945 + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | 946 + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, 976 947 }, 977 948 }; 978 949 ··· 1034 983 } 1035 984 EXPORT_SYMBOL(io_uring_get_socket); 1036 985 986 + #define io_for_each_link(pos, head) \ 987 + for (pos = (head); pos; pos = pos->link) 988 + 1037 989 static inline void io_clean_op(struct io_kiocb *req) 1038 990 { 1039 991 if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | ··· 1044 990 __io_clean_op(req); 1045 991 } 1046 992 1047 - static void io_sq_thread_drop_mm(void) 993 + static inline void io_set_resource_node(struct io_kiocb *req) 1048 994 { 995 + struct io_ring_ctx *ctx = req->ctx; 996 + 997 + if (!req->fixed_file_refs) { 998 + req->fixed_file_refs = &ctx->file_data->node->refs; 999 + percpu_ref_get(req->fixed_file_refs); 1000 + } 1001 + } 1002 + 1003 + static bool io_match_task(struct io_kiocb *head, 1004 + struct task_struct *task, 1005 + struct files_struct *files) 1006 + { 1007 + struct io_kiocb *req; 1008 + 1009 + if (task && head->task != task) 1010 + return false; 1011 + if (!files) 1012 + return true; 1013 + 1014 + io_for_each_link(req, head) { 1015 + if ((req->flags & REQ_F_WORK_INITIALIZED) && 1016 + (req->work.flags & IO_WQ_WORK_FILES) && 1017 + req->work.identity->files == files) 1018 + return true; 1019 + } 1020 + return false; 1021 + } 1022 + 1023 + static void io_sq_thread_drop_mm_files(void) 1024 + { 1025 + struct files_struct *files = current->files; 1049 1026 struct mm_struct *mm = current->mm; 1050 1027 1051 1028 if (mm) { ··· 1084 999 mmput(mm); 1085 1000 current->mm = NULL; 1086 1001 } 1002 + if (files) { 1003 + struct nsproxy *nsproxy = current->nsproxy; 1004 + 1005 + task_lock(current); 1006 + current->files = NULL; 1007 + current->nsproxy = NULL; 1008 + task_unlock(current); 1009 + put_files_struct(files); 1010 + put_nsproxy(nsproxy); 1011 + } 1012 + } 1013 + 1014 + static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) 1015 + { 1016 + if (!current->files) { 1017 + struct files_struct *files; 1018 + struct nsproxy *nsproxy; 1019 + 1020 + task_lock(ctx->sqo_task); 1021 + files = ctx->sqo_task->files; 1022 + if (!files) { 1023 + task_unlock(ctx->sqo_task); 1024 + return -EOWNERDEAD; 1025 + } 1026 + atomic_inc(&files->count); 1027 + get_nsproxy(ctx->sqo_task->nsproxy); 1028 + nsproxy = ctx->sqo_task->nsproxy; 1029 + task_unlock(ctx->sqo_task); 1030 + 1031 + task_lock(current); 1032 + current->files = files; 1033 + current->nsproxy = nsproxy; 1034 + task_unlock(current); 1035 + } 1036 + return 0; 1087 1037 } 1088 1038 1089 1039 static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) ··· 1146 1026 return -EFAULT; 1147 1027 } 1148 1028 1149 - static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, 1150 - struct io_kiocb *req) 1029 + static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, 1030 + struct io_kiocb *req) 1151 1031 { 1152 - if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM)) 1153 - return 0; 1154 - return __io_sq_thread_acquire_mm(ctx); 1032 + const struct io_op_def *def = &io_op_defs[req->opcode]; 1033 + int ret; 1034 + 1035 + if (def->work_flags & IO_WQ_WORK_MM) { 1036 + ret = __io_sq_thread_acquire_mm(ctx); 1037 + if (unlikely(ret)) 
1038 + return ret; 1039 + } 1040 + 1041 + if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) { 1042 + ret = __io_sq_thread_acquire_files(ctx); 1043 + if (unlikely(ret)) 1044 + return ret; 1045 + } 1046 + 1047 + return 0; 1155 1048 } 1156 1049 1157 1050 static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, ··· 1307 1174 INIT_LIST_HEAD(&ctx->iopoll_list); 1308 1175 INIT_LIST_HEAD(&ctx->defer_list); 1309 1176 INIT_LIST_HEAD(&ctx->timeout_list); 1310 - init_waitqueue_head(&ctx->inflight_wait); 1311 1177 spin_lock_init(&ctx->inflight_lock); 1312 1178 INIT_LIST_HEAD(&ctx->inflight_list); 1313 1179 INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work); ··· 1548 1416 { 1549 1417 struct io_kiocb *cur; 1550 1418 1551 - io_prep_async_work(req); 1552 - if (req->flags & REQ_F_LINK_HEAD) 1553 - list_for_each_entry(cur, &req->link_list, link_list) 1554 - io_prep_async_work(cur); 1419 + io_for_each_link(cur, req) 1420 + io_prep_async_work(cur); 1555 1421 } 1556 1422 1557 1423 static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) ··· 1590 1460 } 1591 1461 } 1592 1462 1593 - static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) 1594 - { 1595 - struct io_ring_ctx *ctx = req->ctx; 1596 - 1597 - if (!tsk || req->task == tsk) 1598 - return true; 1599 - if (ctx->flags & IORING_SETUP_SQPOLL) { 1600 - if (ctx->sq_data && req->task == ctx->sq_data->thread) 1601 - return true; 1602 - } 1603 - return false; 1604 - } 1605 - 1606 1463 /* 1607 1464 * Returns true if we found and killed one or more timeouts 1608 1465 */ 1609 - static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk) 1466 + static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, 1467 + struct files_struct *files) 1610 1468 { 1611 1469 struct io_kiocb *req, *tmp; 1612 1470 int canceled = 0; 1613 1471 1614 1472 spin_lock_irq(&ctx->completion_lock); 1615 1473 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { 1616 - if (io_task_match(req, tsk)) { 1474 + if (io_match_task(req, tsk, files)) { 1617 1475 io_kill_timeout(req); 1618 1476 canceled++; 1619 1477 } ··· 1712 1594 } 1713 1595 } 1714 1596 1715 - static inline bool __io_match_files(struct io_kiocb *req, 1716 - struct files_struct *files) 1717 - { 1718 - return ((req->flags & REQ_F_WORK_INITIALIZED) && 1719 - (req->work.flags & IO_WQ_WORK_FILES)) && 1720 - req->work.identity->files == files; 1721 - } 1722 - 1723 - static bool io_match_files(struct io_kiocb *req, 1724 - struct files_struct *files) 1725 - { 1726 - struct io_kiocb *link; 1727 - 1728 - if (!files) 1729 - return true; 1730 - if (__io_match_files(req, files)) 1731 - return true; 1732 - if (req->flags & REQ_F_LINK_HEAD) { 1733 - list_for_each_entry(link, &req->link_list, link_list) { 1734 - if (__io_match_files(link, files)) 1735 - return true; 1736 - } 1737 - } 1738 - return false; 1739 - } 1740 - 1741 1597 /* Returns true if there are no backlogged entries after the flush */ 1742 1598 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, 1743 1599 struct task_struct *tsk, ··· 1739 1647 1740 1648 cqe = NULL; 1741 1649 list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { 1742 - if (tsk && req->task != tsk) 1743 - continue; 1744 - if (!io_match_files(req, files)) 1650 + if (!io_match_task(req, tsk, files)) 1745 1651 continue; 1746 1652 1747 1653 cqe = io_get_cqring(ctx); ··· 1935 1845 static inline void io_put_file(struct io_kiocb *req, struct file *file, 1936 1846 bool fixed) 1937 1847 
{ 1938 - if (fixed) 1939 - percpu_ref_put(req->fixed_file_refs); 1940 - else 1848 + if (!fixed) 1941 1849 fput(file); 1942 1850 } 1943 1851 ··· 1947 1859 kfree(req->async_data); 1948 1860 if (req->file) 1949 1861 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); 1950 - 1862 + if (req->fixed_file_refs) 1863 + percpu_ref_put(req->fixed_file_refs); 1951 1864 io_req_clean_work(req); 1952 1865 } 1953 1866 ··· 1971 1882 percpu_ref_put(&ctx->refs); 1972 1883 } 1973 1884 1885 + static inline void io_remove_next_linked(struct io_kiocb *req) 1886 + { 1887 + struct io_kiocb *nxt = req->link; 1888 + 1889 + req->link = nxt->link; 1890 + nxt->link = NULL; 1891 + } 1892 + 1974 1893 static void io_kill_linked_timeout(struct io_kiocb *req) 1975 1894 { 1976 1895 struct io_ring_ctx *ctx = req->ctx; ··· 1987 1890 unsigned long flags; 1988 1891 1989 1892 spin_lock_irqsave(&ctx->completion_lock, flags); 1990 - link = list_first_entry_or_null(&req->link_list, struct io_kiocb, 1991 - link_list); 1893 + link = req->link; 1894 + 1992 1895 /* 1993 1896 * Can happen if a linked timeout fired and link had been like 1994 1897 * req -> link t-out -> link t-out [-> ...] ··· 1997 1900 struct io_timeout_data *io = link->async_data; 1998 1901 int ret; 1999 1902 2000 - list_del_init(&link->link_list); 1903 + io_remove_next_linked(req); 1904 + link->timeout.head = NULL; 2001 1905 ret = hrtimer_try_to_cancel(&io->timer); 2002 1906 if (ret != -1) { 2003 1907 io_cqring_fill_event(link, -ECANCELED); ··· 2015 1917 } 2016 1918 } 2017 1919 2018 - static struct io_kiocb *io_req_link_next(struct io_kiocb *req) 2019 - { 2020 - struct io_kiocb *nxt; 2021 1920 2022 - /* 2023 - * The list should never be empty when we are called here. But could 2024 - * potentially happen if the chain is messed up, check to be on the 2025 - * safe side. 2026 - */ 2027 - if (unlikely(list_empty(&req->link_list))) 2028 - return NULL; 2029 - 2030 - nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); 2031 - list_del_init(&req->link_list); 2032 - if (!list_empty(&nxt->link_list)) 2033 - nxt->flags |= REQ_F_LINK_HEAD; 2034 - return nxt; 2035 - } 2036 - 2037 - /* 2038 - * Called if REQ_F_LINK_HEAD is set, and we fail the head request 2039 - */ 2040 1921 static void io_fail_links(struct io_kiocb *req) 2041 1922 { 1923 + struct io_kiocb *link, *nxt; 2042 1924 struct io_ring_ctx *ctx = req->ctx; 2043 1925 unsigned long flags; 2044 1926 2045 1927 spin_lock_irqsave(&ctx->completion_lock, flags); 2046 - while (!list_empty(&req->link_list)) { 2047 - struct io_kiocb *link = list_first_entry(&req->link_list, 2048 - struct io_kiocb, link_list); 1928 + link = req->link; 1929 + req->link = NULL; 2049 1930 2050 - list_del_init(&link->link_list); 1931 + while (link) { 1932 + nxt = link->link; 1933 + link->link = NULL; 1934 + 2051 1935 trace_io_uring_fail_link(req, link); 2052 - 2053 1936 io_cqring_fill_event(link, -ECANCELED); 2054 1937 2055 1938 /* ··· 2042 1963 io_put_req_deferred(link, 2); 2043 1964 else 2044 1965 io_double_put_req(link); 1966 + link = nxt; 2045 1967 } 2046 - 2047 1968 io_commit_cqring(ctx); 2048 1969 spin_unlock_irqrestore(&ctx->completion_lock, flags); 2049 1970 ··· 2052 1973 2053 1974 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) 2054 1975 { 2055 - req->flags &= ~REQ_F_LINK_HEAD; 2056 1976 if (req->flags & REQ_F_LINK_TIMEOUT) 2057 1977 io_kill_linked_timeout(req); 2058 1978 ··· 2061 1983 * dependencies to the next request. In case of failure, fail the rest 2062 1984 * of the chain. 
2063 1985 */ 2064 - if (likely(!(req->flags & REQ_F_FAIL_LINK))) 2065 - return io_req_link_next(req); 1986 + if (likely(!(req->flags & REQ_F_FAIL_LINK))) { 1987 + struct io_kiocb *nxt = req->link; 1988 + 1989 + req->link = NULL; 1990 + return nxt; 1991 + } 2066 1992 io_fail_links(req); 2067 1993 return NULL; 2068 1994 } 2069 1995 2070 - static struct io_kiocb *io_req_find_next(struct io_kiocb *req) 1996 + static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) 2071 1997 { 2072 - if (likely(!(req->flags & REQ_F_LINK_HEAD))) 1998 + if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT))) 2073 1999 return NULL; 2074 2000 return __io_req_find_next(req); 2075 2001 } ··· 2132 2050 { 2133 2051 struct io_ring_ctx *ctx = req->ctx; 2134 2052 2135 - if (!__io_sq_thread_acquire_mm(ctx)) { 2053 + if (!__io_sq_thread_acquire_mm(ctx) && 2054 + !__io_sq_thread_acquire_files(ctx)) { 2136 2055 mutex_lock(&ctx->uring_lock); 2137 2056 __io_queue_sqe(req, NULL); 2138 2057 mutex_unlock(&ctx->uring_lock); ··· 2169 2086 } 2170 2087 } 2171 2088 2172 - static void io_queue_next(struct io_kiocb *req) 2089 + static inline void io_queue_next(struct io_kiocb *req) 2173 2090 { 2174 2091 struct io_kiocb *nxt = io_req_find_next(req); 2175 2092 ··· 2226 2143 io_free_req(req); 2227 2144 return; 2228 2145 } 2229 - if (req->flags & REQ_F_LINK_HEAD) 2230 - io_queue_next(req); 2146 + io_queue_next(req); 2231 2147 2232 2148 if (req->task != rb->task) { 2233 2149 if (rb->task) { ··· 2328 2246 * we wake up the task, and the next invocation will flush the 2329 2247 * entries. We cannot safely to it from here. 2330 2248 */ 2331 - if (noflush && !list_empty(&ctx->cq_overflow_list)) 2249 + if (noflush) 2332 2250 return -1U; 2333 2251 2334 2252 io_cqring_overflow_flush(ctx, false, NULL, NULL); ··· 2675 2593 if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) 2676 2594 return false; 2677 2595 2678 - ret = io_sq_thread_acquire_mm(req->ctx, req); 2596 + ret = io_sq_thread_acquire_mm_files(req->ctx, req); 2679 2597 2680 2598 if (io_resubmit_prep(req, ret)) { 2681 2599 refcount_inc(&req->refs); ··· 2723 2641 * find it from a io_iopoll_getevents() thread before the issuer is done 2724 2642 * accessing the kiocb cookie. 2725 2643 */ 2726 - static void io_iopoll_req_issued(struct io_kiocb *req) 2644 + static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) 2727 2645 { 2728 2646 struct io_ring_ctx *ctx = req->ctx; 2729 2647 ··· 2752 2670 else 2753 2671 list_add_tail(&req->inflight_entry, &ctx->iopoll_list); 2754 2672 2755 - if ((ctx->flags & IORING_SETUP_SQPOLL) && 2673 + /* 2674 + * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread 2675 + * task context or in io worker task context. If current task context is 2676 + * sq thread, we don't need to check whether should wake up sq thread. 
2677 + */ 2678 + if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) && 2756 2679 wq_has_sleeper(&ctx->sq_data->wait)) 2757 2680 wake_up(&ctx->sq_data->wait); 2758 2681 } 2759 2682 2760 - static void __io_state_file_put(struct io_submit_state *state) 2683 + static inline void __io_state_file_put(struct io_submit_state *state) 2761 2684 { 2762 - if (state->has_refs) 2763 - fput_many(state->file, state->has_refs); 2764 - state->file = NULL; 2685 + fput_many(state->file, state->file_refs); 2686 + state->file_refs = 0; 2765 2687 } 2766 2688 2767 2689 static inline void io_state_file_put(struct io_submit_state *state) 2768 2690 { 2769 - if (state->file) 2691 + if (state->file_refs) 2770 2692 __io_state_file_put(state); 2771 2693 } 2772 2694 ··· 2784 2698 if (!state) 2785 2699 return fget(fd); 2786 2700 2787 - if (state->file) { 2701 + if (state->file_refs) { 2788 2702 if (state->fd == fd) { 2789 - state->has_refs--; 2703 + state->file_refs--; 2790 2704 return state->file; 2791 2705 } 2792 2706 __io_state_file_put(state); 2793 2707 } 2794 2708 state->file = fget_many(fd, state->ios_left); 2795 - if (!state->file) 2709 + if (unlikely(!state->file)) 2796 2710 return NULL; 2797 2711 2798 2712 state->fd = fd; 2799 - state->has_refs = state->ios_left - 1; 2713 + state->file_refs = state->ios_left - 1; 2800 2714 return state->file; 2801 2715 } 2802 2716 ··· 3151 3065 return __io_iov_buffer_select(req, iov, needs_lock); 3152 3066 } 3153 3067 3154 - static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, 3068 + static ssize_t io_import_iovec(int rw, struct io_kiocb *req, 3155 3069 struct iovec **iovec, struct iov_iter *iter, 3156 3070 bool needs_lock) 3157 3071 { ··· 3180 3094 3181 3095 ret = import_single_range(rw, buf, sqe_len, *iovec, iter); 3182 3096 *iovec = NULL; 3183 - return ret < 0 ? 
ret : sqe_len; 3097 + return ret; 3184 3098 } 3185 3099 3186 3100 if (req->flags & REQ_F_BUFFER_SELECT) { ··· 3195 3109 3196 3110 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, 3197 3111 req->ctx->compat); 3198 - } 3199 - 3200 - static ssize_t io_import_iovec(int rw, struct io_kiocb *req, 3201 - struct iovec **iovec, struct iov_iter *iter, 3202 - bool needs_lock) 3203 - { 3204 - struct io_async_rw *iorw = req->async_data; 3205 - 3206 - if (!iorw) 3207 - return __io_import_iovec(rw, req, iovec, iter, needs_lock); 3208 - *iovec = NULL; 3209 - return iov_iter_count(&iorw->iter); 3210 3112 } 3211 3113 3212 3114 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) ··· 3320 3246 struct iovec *iov = iorw->fast_iov; 3321 3247 ssize_t ret; 3322 3248 3323 - ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false); 3249 + ret = io_import_iovec(rw, req, &iov, &iorw->iter, false); 3324 3250 if (unlikely(ret < 0)) 3325 3251 return ret; 3326 3252 ··· 3453 3379 struct iov_iter __iter, *iter = &__iter; 3454 3380 struct io_async_rw *rw = req->async_data; 3455 3381 ssize_t io_size, ret, ret2; 3456 - size_t iov_count; 3457 3382 bool no_async; 3458 3383 3459 - if (rw) 3384 + if (rw) { 3460 3385 iter = &rw->iter; 3461 - 3462 - ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); 3463 - if (ret < 0) 3464 - return ret; 3465 - iov_count = iov_iter_count(iter); 3466 - io_size = ret; 3386 + iovec = NULL; 3387 + } else { 3388 + ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); 3389 + if (ret < 0) 3390 + return ret; 3391 + } 3392 + io_size = iov_iter_count(iter); 3467 3393 req->result = io_size; 3468 3394 ret = 0; 3469 3395 ··· 3479 3405 if (no_async) 3480 3406 goto copy_iov; 3481 3407 3482 - ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count); 3408 + ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size); 3483 3409 if (unlikely(ret)) 3484 3410 goto out_free; 3485 3411 ··· 3498 3424 if (req->file->f_flags & O_NONBLOCK) 3499 3425 goto done; 3500 3426 /* some cases will consume bytes even on error returns */ 3501 - iov_iter_revert(iter, iov_count - iov_iter_count(iter)); 3427 + iov_iter_revert(iter, io_size - iov_iter_count(iter)); 3502 3428 ret = 0; 3503 3429 goto copy_iov; 3504 3430 } else if (ret < 0) { ··· 3581 3507 struct kiocb *kiocb = &req->rw.kiocb; 3582 3508 struct iov_iter __iter, *iter = &__iter; 3583 3509 struct io_async_rw *rw = req->async_data; 3584 - size_t iov_count; 3585 3510 ssize_t ret, ret2, io_size; 3586 3511 3587 - if (rw) 3512 + if (rw) { 3588 3513 iter = &rw->iter; 3589 - 3590 - ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); 3591 - if (ret < 0) 3592 - return ret; 3593 - iov_count = iov_iter_count(iter); 3594 - io_size = ret; 3514 + iovec = NULL; 3515 + } else { 3516 + ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); 3517 + if (ret < 0) 3518 + return ret; 3519 + } 3520 + io_size = iov_iter_count(iter); 3595 3521 req->result = io_size; 3596 3522 3597 3523 /* Ensure we clear previously set non-block flag */ ··· 3609 3535 (req->flags & REQ_F_ISREG)) 3610 3536 goto copy_iov; 3611 3537 3612 - ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count); 3538 + ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size); 3613 3539 if (unlikely(ret)) 3614 3540 goto out_free; 3615 3541 ··· 3652 3578 } else { 3653 3579 copy_iov: 3654 3580 /* some cases will consume bytes even on error returns */ 3655 - iov_iter_revert(iter, iov_count - iov_iter_count(iter)); 3581 + 
iov_iter_revert(iter, io_size - iov_iter_count(iter)); 3656 3582 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); 3657 3583 if (!ret) 3658 3584 return -EAGAIN; ··· 3662 3588 if (iovec) 3663 3589 kfree(iovec); 3664 3590 return ret; 3591 + } 3592 + 3593 + static int io_renameat_prep(struct io_kiocb *req, 3594 + const struct io_uring_sqe *sqe) 3595 + { 3596 + struct io_rename *ren = &req->rename; 3597 + const char __user *oldf, *newf; 3598 + 3599 + if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3600 + return -EBADF; 3601 + 3602 + ren->old_dfd = READ_ONCE(sqe->fd); 3603 + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3604 + newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3605 + ren->new_dfd = READ_ONCE(sqe->len); 3606 + ren->flags = READ_ONCE(sqe->rename_flags); 3607 + 3608 + ren->oldpath = getname(oldf); 3609 + if (IS_ERR(ren->oldpath)) 3610 + return PTR_ERR(ren->oldpath); 3611 + 3612 + ren->newpath = getname(newf); 3613 + if (IS_ERR(ren->newpath)) { 3614 + putname(ren->oldpath); 3615 + return PTR_ERR(ren->newpath); 3616 + } 3617 + 3618 + req->flags |= REQ_F_NEED_CLEANUP; 3619 + return 0; 3620 + } 3621 + 3622 + static int io_renameat(struct io_kiocb *req, bool force_nonblock) 3623 + { 3624 + struct io_rename *ren = &req->rename; 3625 + int ret; 3626 + 3627 + if (force_nonblock) 3628 + return -EAGAIN; 3629 + 3630 + ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, 3631 + ren->newpath, ren->flags); 3632 + 3633 + req->flags &= ~REQ_F_NEED_CLEANUP; 3634 + if (ret < 0) 3635 + req_set_fail_links(req); 3636 + io_req_complete(req, ret); 3637 + return 0; 3638 + } 3639 + 3640 + static int io_unlinkat_prep(struct io_kiocb *req, 3641 + const struct io_uring_sqe *sqe) 3642 + { 3643 + struct io_unlink *un = &req->unlink; 3644 + const char __user *fname; 3645 + 3646 + if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3647 + return -EBADF; 3648 + 3649 + un->dfd = READ_ONCE(sqe->fd); 3650 + 3651 + un->flags = READ_ONCE(sqe->unlink_flags); 3652 + if (un->flags & ~AT_REMOVEDIR) 3653 + return -EINVAL; 3654 + 3655 + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3656 + un->filename = getname(fname); 3657 + if (IS_ERR(un->filename)) 3658 + return PTR_ERR(un->filename); 3659 + 3660 + req->flags |= REQ_F_NEED_CLEANUP; 3661 + return 0; 3662 + } 3663 + 3664 + static int io_unlinkat(struct io_kiocb *req, bool force_nonblock) 3665 + { 3666 + struct io_unlink *un = &req->unlink; 3667 + int ret; 3668 + 3669 + if (force_nonblock) 3670 + return -EAGAIN; 3671 + 3672 + if (un->flags & AT_REMOVEDIR) 3673 + ret = do_rmdir(un->dfd, un->filename); 3674 + else 3675 + ret = do_unlinkat(un->dfd, un->filename); 3676 + 3677 + req->flags &= ~REQ_F_NEED_CLEANUP; 3678 + if (ret < 0) 3679 + req_set_fail_links(req); 3680 + io_req_complete(req, ret); 3681 + return 0; 3682 + } 3683 + 3684 + static int io_shutdown_prep(struct io_kiocb *req, 3685 + const struct io_uring_sqe *sqe) 3686 + { 3687 + #if defined(CONFIG_NET) 3688 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3689 + return -EINVAL; 3690 + if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || 3691 + sqe->buf_index) 3692 + return -EINVAL; 3693 + 3694 + req->shutdown.how = READ_ONCE(sqe->len); 3695 + return 0; 3696 + #else 3697 + return -EOPNOTSUPP; 3698 + #endif 3699 + } 3700 + 3701 + static int io_shutdown(struct io_kiocb *req, bool force_nonblock) 3702 + { 3703 + #if defined(CONFIG_NET) 3704 + struct socket *sock; 3705 + int ret; 3706 + 3707 + if (force_nonblock) 3708 + return -EAGAIN; 3709 + 3710 + sock = sock_from_file(req->file); 3711 + if 
(unlikely(!sock)) 3712 + return -ENOTSOCK; 3713 + 3714 + ret = __sys_shutdown_sock(sock, req->shutdown.how); 3715 + io_req_complete(req, ret); 3716 + return 0; 3717 + #else 3718 + return -EOPNOTSUPP; 3719 + #endif 3665 3720 } 3666 3721 3667 3722 static int __io_splice_prep(struct io_kiocb *req, ··· 4007 3804 { 4008 3805 u64 flags, mode; 4009 3806 4010 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 3807 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4011 3808 return -EINVAL; 4012 3809 mode = READ_ONCE(sqe->len); 4013 3810 flags = READ_ONCE(sqe->open_flags); ··· 4021 3818 size_t len; 4022 3819 int ret; 4023 3820 4024 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 3821 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4025 3822 return -EINVAL; 4026 3823 how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4027 3824 len = READ_ONCE(sqe->len); ··· 4151 3948 head = idr_find(&ctx->io_buffer_idr, p->bgid); 4152 3949 if (head) 4153 3950 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); 4154 - 4155 - io_ring_submit_lock(ctx, !force_nonblock); 4156 3951 if (ret < 0) 4157 3952 req_set_fail_links(req); 4158 - __io_req_complete(req, ret, 0, cs); 3953 + 3954 + /* need to hold the lock to complete IOPOLL requests */ 3955 + if (ctx->flags & IORING_SETUP_IOPOLL) { 3956 + __io_req_complete(req, ret, 0, cs); 3957 + io_ring_submit_unlock(ctx, !force_nonblock); 3958 + } else { 3959 + io_ring_submit_unlock(ctx, !force_nonblock); 3960 + __io_req_complete(req, ret, 0, cs); 3961 + } 4159 3962 return 0; 4160 3963 } 4161 3964 ··· 4246 4037 } 4247 4038 } 4248 4039 out: 4249 - io_ring_submit_unlock(ctx, !force_nonblock); 4250 4040 if (ret < 0) 4251 4041 req_set_fail_links(req); 4252 - __io_req_complete(req, ret, 0, cs); 4042 + 4043 + /* need to hold the lock to complete IOPOLL requests */ 4044 + if (ctx->flags & IORING_SETUP_IOPOLL) { 4045 + __io_req_complete(req, ret, 0, cs); 4046 + io_ring_submit_unlock(ctx, !force_nonblock); 4047 + } else { 4048 + io_ring_submit_unlock(ctx, !force_nonblock); 4049 + __io_req_complete(req, ret, 0, cs); 4050 + } 4253 4051 return 0; 4254 4052 } 4255 4053 ··· 4428 4212 io_req_init_async(req); 4429 4213 req->work.flags |= IO_WQ_WORK_NO_CANCEL; 4430 4214 4431 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 4215 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4432 4216 return -EINVAL; 4433 4217 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || 4434 4218 sqe->rw_flags || sqe->buf_index) ··· 4910 4694 { 4911 4695 struct io_accept *accept = &req->accept; 4912 4696 4913 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 4697 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4914 4698 return -EINVAL; 4915 4699 if (sqe->ioprio || sqe->len || sqe->buf_index) 4916 4700 return -EINVAL; ··· 4951 4735 struct io_connect *conn = &req->connect; 4952 4736 struct io_async_connect *io = req->async_data; 4953 4737 4954 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 4738 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4955 4739 return -EINVAL; 4956 4740 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) 4957 4741 return -EINVAL; ··· 5486 5270 /* 5487 5271 * Returns true if we found and killed one or more poll requests 5488 5272 */ 5489 - static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) 5273 + static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, 5274 + struct 
files_struct *files) 5490 5275 { 5491 5276 struct hlist_node *tmp; 5492 5277 struct io_kiocb *req; ··· 5499 5282 5500 5283 list = &ctx->cancel_hash[i]; 5501 5284 hlist_for_each_entry_safe(req, tmp, list, hash_node) { 5502 - if (io_task_match(req, tsk)) 5285 + if (io_match_task(req, tsk, files)) 5503 5286 posted += io_poll_remove_one(req); 5504 5287 } 5505 5288 } ··· 5537 5320 sqe->poll_events) 5538 5321 return -EINVAL; 5539 5322 5540 - req->poll.addr = READ_ONCE(sqe->addr); 5323 + req->poll_remove.addr = READ_ONCE(sqe->addr); 5541 5324 return 0; 5542 5325 } 5543 5326 ··· 5548 5331 static int io_poll_remove(struct io_kiocb *req) 5549 5332 { 5550 5333 struct io_ring_ctx *ctx = req->ctx; 5551 - u64 addr; 5552 5334 int ret; 5553 5335 5554 - addr = req->poll.addr; 5555 5336 spin_lock_irq(&ctx->completion_lock); 5556 - ret = io_poll_cancel(ctx, addr); 5337 + ret = io_poll_cancel(ctx, req->poll_remove.addr); 5557 5338 spin_unlock_irq(&ctx->completion_lock); 5558 5339 5559 5340 if (ret < 0) ··· 5644 5429 return HRTIMER_NORESTART; 5645 5430 } 5646 5431 5647 - static int __io_timeout_cancel(struct io_kiocb *req) 5432 + static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, 5433 + __u64 user_data) 5648 5434 { 5649 - struct io_timeout_data *io = req->async_data; 5650 - int ret; 5651 - 5652 - ret = hrtimer_try_to_cancel(&io->timer); 5653 - if (ret == -1) 5654 - return -EALREADY; 5655 - list_del_init(&req->timeout.list); 5656 - 5657 - req_set_fail_links(req); 5658 - io_cqring_fill_event(req, -ECANCELED); 5659 - io_put_req_deferred(req, 1); 5660 - return 0; 5661 - } 5662 - 5663 - static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 5664 - { 5435 + struct io_timeout_data *io; 5665 5436 struct io_kiocb *req; 5666 5437 int ret = -ENOENT; 5667 5438 ··· 5659 5458 } 5660 5459 5661 5460 if (ret == -ENOENT) 5662 - return ret; 5461 + return ERR_PTR(ret); 5663 5462 5664 - return __io_timeout_cancel(req); 5463 + io = req->async_data; 5464 + ret = hrtimer_try_to_cancel(&io->timer); 5465 + if (ret == -1) 5466 + return ERR_PTR(-EALREADY); 5467 + list_del_init(&req->timeout.list); 5468 + return req; 5469 + } 5470 + 5471 + static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 5472 + { 5473 + struct io_kiocb *req = io_timeout_extract(ctx, user_data); 5474 + 5475 + if (IS_ERR(req)) 5476 + return PTR_ERR(req); 5477 + 5478 + req_set_fail_links(req); 5479 + io_cqring_fill_event(req, -ECANCELED); 5480 + io_put_req_deferred(req, 1); 5481 + return 0; 5482 + } 5483 + 5484 + static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 5485 + struct timespec64 *ts, enum hrtimer_mode mode) 5486 + { 5487 + struct io_kiocb *req = io_timeout_extract(ctx, user_data); 5488 + struct io_timeout_data *data; 5489 + 5490 + if (IS_ERR(req)) 5491 + return PTR_ERR(req); 5492 + 5493 + req->timeout.off = 0; /* noseq */ 5494 + data = req->async_data; 5495 + list_add_tail(&req->timeout.list, &ctx->timeout_list); 5496 + hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode); 5497 + data->timer.function = io_timeout_fn; 5498 + hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); 5499 + return 0; 5665 5500 } 5666 5501 5667 5502 static int io_timeout_remove_prep(struct io_kiocb *req, 5668 5503 const struct io_uring_sqe *sqe) 5669 5504 { 5505 + struct io_timeout_rem *tr = &req->timeout_rem; 5506 + 5670 5507 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5671 5508 return -EINVAL; 5672 5509 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 5673 5510 return -EINVAL; 5674 
- if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags) 5511 + if (sqe->ioprio || sqe->buf_index || sqe->len) 5675 5512 return -EINVAL; 5676 5513 5677 - req->timeout_rem.addr = READ_ONCE(sqe->addr); 5514 + tr->addr = READ_ONCE(sqe->addr); 5515 + tr->flags = READ_ONCE(sqe->timeout_flags); 5516 + if (tr->flags & IORING_TIMEOUT_UPDATE) { 5517 + if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS)) 5518 + return -EINVAL; 5519 + if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) 5520 + return -EFAULT; 5521 + } else if (tr->flags) { 5522 + /* timeout removal doesn't support flags */ 5523 + return -EINVAL; 5524 + } 5525 + 5678 5526 return 0; 5679 5527 } 5680 5528 ··· 5732 5482 */ 5733 5483 static int io_timeout_remove(struct io_kiocb *req) 5734 5484 { 5485 + struct io_timeout_rem *tr = &req->timeout_rem; 5735 5486 struct io_ring_ctx *ctx = req->ctx; 5736 5487 int ret; 5737 5488 5738 5489 spin_lock_irq(&ctx->completion_lock); 5739 - ret = io_timeout_cancel(ctx, req->timeout_rem.addr); 5490 + if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) { 5491 + enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS) 5492 + ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; 5493 + 5494 + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); 5495 + } else { 5496 + ret = io_timeout_cancel(ctx, tr->addr); 5497 + } 5740 5498 5741 5499 io_cqring_fill_event(req, ret); 5742 5500 io_commit_cqring(ctx); ··· 6024 5766 return io_remove_buffers_prep(req, sqe); 6025 5767 case IORING_OP_TEE: 6026 5768 return io_tee_prep(req, sqe); 5769 + case IORING_OP_SHUTDOWN: 5770 + return io_shutdown_prep(req, sqe); 5771 + case IORING_OP_RENAMEAT: 5772 + return io_renameat_prep(req, sqe); 5773 + case IORING_OP_UNLINKAT: 5774 + return io_unlinkat_prep(req, sqe); 6027 5775 } 6028 5776 6029 5777 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", ··· 6051 5787 { 6052 5788 struct io_kiocb *pos; 6053 5789 struct io_ring_ctx *ctx = req->ctx; 6054 - u32 total_submitted, nr_reqs = 1; 5790 + u32 total_submitted, nr_reqs = 0; 6055 5791 6056 - if (req->flags & REQ_F_LINK_HEAD) 6057 - list_for_each_entry(pos, &req->link_list, link_list) 6058 - nr_reqs++; 5792 + io_for_each_link(pos, req) 5793 + nr_reqs++; 6059 5794 6060 5795 total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; 6061 5796 return total_submitted - nr_reqs; ··· 6106 5843 static void io_req_drop_files(struct io_kiocb *req) 6107 5844 { 6108 5845 struct io_ring_ctx *ctx = req->ctx; 5846 + struct io_uring_task *tctx = req->task->io_uring; 6109 5847 unsigned long flags; 6110 5848 6111 5849 spin_lock_irqsave(&ctx->inflight_lock, flags); 6112 5850 list_del(&req->inflight_entry); 6113 - if (waitqueue_active(&ctx->inflight_wait)) 6114 - wake_up(&ctx->inflight_wait); 5851 + if (atomic_read(&tctx->in_idle)) 5852 + wake_up(&tctx->wait); 6115 5853 spin_unlock_irqrestore(&ctx->inflight_lock, flags); 6116 5854 req->flags &= ~REQ_F_INFLIGHT; 6117 5855 put_files_struct(req->work.identity->files); ··· 6166 5902 case IORING_OP_OPENAT2: 6167 5903 if (req->open.filename) 6168 5904 putname(req->open.filename); 5905 + break; 5906 + case IORING_OP_RENAMEAT: 5907 + putname(req->rename.oldpath); 5908 + putname(req->rename.newpath); 5909 + break; 5910 + case IORING_OP_UNLINKAT: 5911 + putname(req->unlink.filename); 6169 5912 break; 6170 5913 } 6171 5914 req->flags &= ~REQ_F_NEED_CLEANUP; ··· 6280 6009 case IORING_OP_TEE: 6281 6010 ret = io_tee(req, force_nonblock); 6282 6011 break; 6012 + case IORING_OP_SHUTDOWN: 6013 + ret = io_shutdown(req, force_nonblock); 6014 + 
break; 6015 + case IORING_OP_RENAMEAT: 6016 + ret = io_renameat(req, force_nonblock); 6017 + break; 6018 + case IORING_OP_UNLINKAT: 6019 + ret = io_unlinkat(req, force_nonblock); 6020 + break; 6283 6021 default: 6284 6022 ret = -EINVAL; 6285 6023 break; ··· 6305 6025 if (in_async) 6306 6026 mutex_lock(&ctx->uring_lock); 6307 6027 6308 - io_iopoll_req_issued(req); 6028 + io_iopoll_req_issued(req, in_async); 6309 6029 6310 6030 if (in_async) 6311 6031 mutex_unlock(&ctx->uring_lock); ··· 6345 6065 } 6346 6066 6347 6067 if (ret) { 6348 - req_set_fail_links(req); 6349 - io_req_complete(req, ret); 6068 + /* 6069 + * io_iopoll_complete() does not hold completion_lock to complete 6070 + * polled io, so here for polled io, just mark it done and still let 6071 + * io_iopoll_complete() complete it. 6072 + */ 6073 + if (req->ctx->flags & IORING_SETUP_IOPOLL) { 6074 + struct kiocb *kiocb = &req->rw.kiocb; 6075 + 6076 + kiocb_done(kiocb, ret, NULL); 6077 + } else { 6078 + req_set_fail_links(req); 6079 + io_req_complete(req, ret); 6080 + } 6350 6081 } 6351 6082 6352 6083 return io_steal_work(req); ··· 6383 6092 return NULL; 6384 6093 fd = array_index_nospec(fd, ctx->nr_user_files); 6385 6094 file = io_file_from_index(ctx, fd); 6386 - if (file) { 6387 - req->fixed_file_refs = &ctx->file_data->node->refs; 6388 - percpu_ref_get(req->fixed_file_refs); 6389 - } 6095 + io_set_resource_node(req); 6390 6096 } else { 6391 6097 trace_io_uring_file_get(ctx, fd); 6392 6098 file = __io_file_get(state, fd); ··· 6392 6104 return file; 6393 6105 } 6394 6106 6395 - static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, 6396 - int fd) 6397 - { 6398 - bool fixed; 6399 - 6400 - fixed = (req->flags & REQ_F_FIXED_FILE) != 0; 6401 - if (unlikely(!fixed && io_async_submit(req->ctx))) 6402 - return -EBADF; 6403 - 6404 - req->file = io_file_get(state, req, fd, fixed); 6405 - if (req->file || io_op_defs[req->opcode].needs_file_no_error) 6406 - return 0; 6407 - return -EBADF; 6408 - } 6409 - 6410 6107 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) 6411 6108 { 6412 6109 struct io_timeout_data *data = container_of(timer, 6413 6110 struct io_timeout_data, timer); 6414 - struct io_kiocb *req = data->req; 6111 + struct io_kiocb *prev, *req = data->req; 6415 6112 struct io_ring_ctx *ctx = req->ctx; 6416 - struct io_kiocb *prev = NULL; 6417 6113 unsigned long flags; 6418 6114 6419 6115 spin_lock_irqsave(&ctx->completion_lock, flags); 6116 + prev = req->timeout.head; 6117 + req->timeout.head = NULL; 6420 6118 6421 6119 /* 6422 6120 * We don't expect the list to be empty, that will only happen if we 6423 6121 * race with the completion of the linked work. 
6424 6122 */ 6425 - if (!list_empty(&req->link_list)) { 6426 - prev = list_entry(req->link_list.prev, struct io_kiocb, 6427 - link_list); 6428 - if (refcount_inc_not_zero(&prev->refs)) 6429 - list_del_init(&req->link_list); 6430 - else 6431 - prev = NULL; 6432 - } 6433 - 6123 + if (prev && refcount_inc_not_zero(&prev->refs)) 6124 + io_remove_next_linked(prev); 6125 + else 6126 + prev = NULL; 6434 6127 spin_unlock_irqrestore(&ctx->completion_lock, flags); 6435 6128 6436 6129 if (prev) { ··· 6427 6158 static void __io_queue_linked_timeout(struct io_kiocb *req) 6428 6159 { 6429 6160 /* 6430 - * If the list is now empty, then our linked request finished before 6431 - * we got a chance to setup the timer 6161 + * If the back reference is NULL, then our linked request finished 6162 + * before we got a chance to setup the timer 6432 6163 */ 6433 - if (!list_empty(&req->link_list)) { 6164 + if (req->timeout.head) { 6434 6165 struct io_timeout_data *data = req->async_data; 6435 6166 6436 6167 data->timer.function = io_link_timeout_fn; ··· 6453 6184 6454 6185 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) 6455 6186 { 6456 - struct io_kiocb *nxt; 6187 + struct io_kiocb *nxt = req->link; 6457 6188 6458 - if (!(req->flags & REQ_F_LINK_HEAD)) 6459 - return NULL; 6460 - if (req->flags & REQ_F_LINK_TIMEOUT) 6461 - return NULL; 6462 - 6463 - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, 6464 - link_list); 6465 - if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) 6189 + if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) || 6190 + nxt->opcode != IORING_OP_LINK_TIMEOUT) 6466 6191 return NULL; 6467 6192 6193 + nxt->timeout.head = req; 6468 6194 nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; 6469 6195 req->flags |= REQ_F_LINK_TIMEOUT; 6470 6196 return nxt; ··· 6565 6301 io_queue_sqe(req, NULL, cs); 6566 6302 } 6567 6303 6304 + struct io_submit_link { 6305 + struct io_kiocb *head; 6306 + struct io_kiocb *last; 6307 + }; 6308 + 6568 6309 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, 6569 - struct io_kiocb **link, struct io_comp_state *cs) 6310 + struct io_submit_link *link, struct io_comp_state *cs) 6570 6311 { 6571 6312 struct io_ring_ctx *ctx = req->ctx; 6572 6313 int ret; ··· 6583 6314 * submitted sync once the chain is complete. If none of those 6584 6315 * conditions are true (normal request), then just queue it. 
6585 6316 */ 6586 - if (*link) { 6587 - struct io_kiocb *head = *link; 6317 + if (link->head) { 6318 + struct io_kiocb *head = link->head; 6588 6319 6589 6320 /* 6590 6321 * Taking sequential execution of a link, draining both sides ··· 6604 6335 return ret; 6605 6336 } 6606 6337 trace_io_uring_link(ctx, req, head); 6607 - list_add_tail(&req->link_list, &head->link_list); 6338 + link->last->link = req; 6339 + link->last = req; 6608 6340 6609 6341 /* last request of a link, enqueue the link */ 6610 6342 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { 6611 6343 io_queue_link_head(head, cs); 6612 - *link = NULL; 6344 + link->head = NULL; 6613 6345 } 6614 6346 } else { 6615 6347 if (unlikely(ctx->drain_next)) { ··· 6618 6348 ctx->drain_next = 0; 6619 6349 } 6620 6350 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 6621 - req->flags |= REQ_F_LINK_HEAD; 6622 - INIT_LIST_HEAD(&req->link_list); 6623 - 6624 6351 ret = io_req_defer_prep(req, sqe); 6625 6352 if (unlikely(ret)) 6626 6353 req->flags |= REQ_F_FAIL_LINK; 6627 - *link = req; 6354 + link->head = req; 6355 + link->last = req; 6628 6356 } else { 6629 6357 io_queue_sqe(req, sqe, cs); 6630 6358 } ··· 6638 6370 { 6639 6371 if (!list_empty(&state->comp.list)) 6640 6372 io_submit_flush_completions(&state->comp); 6641 - blk_finish_plug(&state->plug); 6373 + if (state->plug_started) 6374 + blk_finish_plug(&state->plug); 6642 6375 io_state_file_put(state); 6643 6376 if (state->free_reqs) 6644 6377 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); ··· 6651 6382 static void io_submit_state_start(struct io_submit_state *state, 6652 6383 struct io_ring_ctx *ctx, unsigned int max_ios) 6653 6384 { 6654 - blk_start_plug(&state->plug); 6385 + state->plug_started = false; 6655 6386 state->comp.nr = 0; 6656 6387 INIT_LIST_HEAD(&state->comp.list); 6657 6388 state->comp.ctx = ctx; 6658 6389 state->free_reqs = 0; 6659 - state->file = NULL; 6390 + state->file_refs = 0; 6660 6391 state->ios_left = max_ios; 6661 6392 } 6662 6393 ··· 6751 6482 req->file = NULL; 6752 6483 req->ctx = ctx; 6753 6484 req->flags = 0; 6485 + req->link = NULL; 6486 + req->fixed_file_refs = NULL; 6754 6487 /* one is dropped after submission, the other at completion */ 6755 6488 refcount_set(&req->refs, 2); 6756 6489 req->task = current; ··· 6761 6490 if (unlikely(req->opcode >= IORING_OP_LAST)) 6762 6491 return -EINVAL; 6763 6492 6764 - if (unlikely(io_sq_thread_acquire_mm(ctx, req))) 6493 + if (unlikely(io_sq_thread_acquire_mm_files(ctx, req))) 6765 6494 return -EFAULT; 6766 6495 6767 6496 sqe_flags = READ_ONCE(sqe->flags); ··· 6794 6523 /* same numerical values with corresponding REQ_F_*, safe to copy */ 6795 6524 req->flags |= sqe_flags; 6796 6525 6797 - if (!io_op_defs[req->opcode].needs_file) 6798 - return 0; 6526 + /* 6527 + * Plug now if we have more than 1 IO left after this, and the target 6528 + * is potentially a read/write to block based storage. 
6529 + */ 6530 + if (!state->plug_started && state->ios_left > 1 && 6531 + io_op_defs[req->opcode].plug) { 6532 + blk_start_plug(&state->plug); 6533 + state->plug_started = true; 6534 + } 6799 6535 6800 - ret = io_req_set_file(state, req, READ_ONCE(sqe->fd)); 6536 + ret = 0; 6537 + if (io_op_defs[req->opcode].needs_file) { 6538 + bool fixed = req->flags & REQ_F_FIXED_FILE; 6539 + 6540 + req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); 6541 + if (unlikely(!req->file && 6542 + !io_op_defs[req->opcode].needs_file_no_error)) 6543 + ret = -EBADF; 6544 + } 6545 + 6801 6546 state->ios_left--; 6802 6547 return ret; 6803 6548 } ··· 6821 6534 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) 6822 6535 { 6823 6536 struct io_submit_state state; 6824 - struct io_kiocb *link = NULL; 6537 + struct io_submit_link link; 6825 6538 int i, submitted = 0; 6826 6539 6827 6540 /* if we have a backlog and couldn't flush it all, return BUSY */ ··· 6841 6554 refcount_add(nr, &current->usage); 6842 6555 6843 6556 io_submit_state_start(&state, ctx, nr); 6557 + link.head = NULL; 6844 6558 6845 6559 for (i = 0; i < nr; i++) { 6846 6560 const struct io_uring_sqe *sqe; ··· 6887 6599 percpu_counter_sub(&tctx->inflight, unused); 6888 6600 put_task_struct_many(current, unused); 6889 6601 } 6890 - if (link) 6891 - io_queue_link_head(link, &state.comp); 6602 + if (link.head) 6603 + io_queue_link_head(link.head, &state.comp); 6892 6604 io_submit_state_end(&state); 6893 6605 6894 6606 /* Commit SQ ring head once we've consumed and submitted all SQEs */ ··· 6912 6624 spin_unlock_irq(&ctx->completion_lock); 6913 6625 } 6914 6626 6915 - static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode, 6916 - int sync, void *key) 6627 + static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) 6917 6628 { 6918 - struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry); 6919 - int ret; 6920 - 6921 - ret = autoremove_wake_function(wqe, mode, sync, key); 6922 - if (ret) { 6923 - unsigned long flags; 6924 - 6925 - spin_lock_irqsave(&ctx->completion_lock, flags); 6926 - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; 6927 - spin_unlock_irqrestore(&ctx->completion_lock, flags); 6928 - } 6929 - return ret; 6930 - } 6931 - 6932 - enum sq_ret { 6933 - SQT_IDLE = 1, 6934 - SQT_SPIN = 2, 6935 - SQT_DID_WORK = 4, 6936 - }; 6937 - 6938 - static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx, 6939 - unsigned long start_jiffies, bool cap_entries) 6940 - { 6941 - unsigned long timeout = start_jiffies + ctx->sq_thread_idle; 6942 - struct io_sq_data *sqd = ctx->sq_data; 6943 6629 unsigned int to_submit; 6944 6630 int ret = 0; 6945 6631 6946 - again: 6947 - if (!list_empty(&ctx->iopoll_list)) { 6948 - unsigned nr_events = 0; 6949 - 6950 - mutex_lock(&ctx->uring_lock); 6951 - if (!list_empty(&ctx->iopoll_list) && !need_resched()) 6952 - io_do_iopoll(ctx, &nr_events, 0); 6953 - mutex_unlock(&ctx->uring_lock); 6954 - } 6955 - 6956 6632 to_submit = io_sqring_entries(ctx); 6957 - 6958 - /* 6959 - * If submit got -EBUSY, flag us as needing the application 6960 - * to enter the kernel to reap and flush events. 6961 - */ 6962 - if (!to_submit || ret == -EBUSY || need_resched()) { 6963 - /* 6964 - * Drop cur_mm before scheduling, we can't hold it for 6965 - * long periods (or over schedule()). Do this before 6966 - * adding ourselves to the waitqueue, as the unuse/drop 6967 - * may sleep. 6968 - */ 6969 - io_sq_thread_drop_mm(); 6970 - 6971 - /* 6972 - * We're polling. 
If we're within the defined idle 6973 - * period, then let us spin without work before going 6974 - * to sleep. The exception is if we got EBUSY doing 6975 - * more IO, we should wait for the application to 6976 - * reap events and wake us up. 6977 - */ 6978 - if (!list_empty(&ctx->iopoll_list) || need_resched() || 6979 - (!time_after(jiffies, timeout) && ret != -EBUSY && 6980 - !percpu_ref_is_dying(&ctx->refs))) 6981 - return SQT_SPIN; 6982 - 6983 - prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry, 6984 - TASK_INTERRUPTIBLE); 6985 - 6986 - /* 6987 - * While doing polled IO, before going to sleep, we need 6988 - * to check if there are new reqs added to iopoll_list, 6989 - * it is because reqs may have been punted to io worker 6990 - * and will be added to iopoll_list later, hence check 6991 - * the iopoll_list again. 6992 - */ 6993 - if ((ctx->flags & IORING_SETUP_IOPOLL) && 6994 - !list_empty_careful(&ctx->iopoll_list)) { 6995 - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); 6996 - goto again; 6997 - } 6998 - 6999 - to_submit = io_sqring_entries(ctx); 7000 - if (!to_submit || ret == -EBUSY) 7001 - return SQT_IDLE; 7002 - } 7003 - 7004 - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); 7005 - io_ring_clear_wakeup_flag(ctx); 7006 - 7007 6633 /* if we're handling multiple rings, cap submit size for fairness */ 7008 6634 if (cap_entries && to_submit > 8) 7009 6635 to_submit = 8; 7010 6636 7011 - mutex_lock(&ctx->uring_lock); 7012 - if (likely(!percpu_ref_is_dying(&ctx->refs))) 7013 - ret = io_submit_sqes(ctx, to_submit); 7014 - mutex_unlock(&ctx->uring_lock); 6637 + if (!list_empty(&ctx->iopoll_list) || to_submit) { 6638 + unsigned nr_events = 0; 6639 + 6640 + mutex_lock(&ctx->uring_lock); 6641 + if (!list_empty(&ctx->iopoll_list)) 6642 + io_do_iopoll(ctx, &nr_events, 0); 6643 + 6644 + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs))) 6645 + ret = io_submit_sqes(ctx, to_submit); 6646 + mutex_unlock(&ctx->uring_lock); 6647 + } 7015 6648 7016 6649 if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) 7017 6650 wake_up(&ctx->sqo_sq_wait); 7018 6651 7019 - return SQT_DID_WORK; 6652 + return ret; 6653 + } 6654 + 6655 + static void io_sqd_update_thread_idle(struct io_sq_data *sqd) 6656 + { 6657 + struct io_ring_ctx *ctx; 6658 + unsigned sq_thread_idle = 0; 6659 + 6660 + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 6661 + if (sq_thread_idle < ctx->sq_thread_idle) 6662 + sq_thread_idle = ctx->sq_thread_idle; 6663 + } 6664 + 6665 + sqd->sq_thread_idle = sq_thread_idle; 7020 6666 } 7021 6667 7022 6668 static void io_sqd_init_new(struct io_sq_data *sqd) ··· 6959 6737 6960 6738 while (!list_empty(&sqd->ctx_new_list)) { 6961 6739 ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list); 6962 - init_wait(&ctx->sqo_wait_entry); 6963 - ctx->sqo_wait_entry.func = io_sq_wake_function; 6964 6740 list_move_tail(&ctx->sqd_list, &sqd->ctx_list); 6965 6741 complete(&ctx->sq_thread_comp); 6966 6742 } 6743 + 6744 + io_sqd_update_thread_idle(sqd); 6967 6745 } 6968 6746 6969 6747 static int io_sq_thread(void *data) 6970 6748 { 6971 6749 struct cgroup_subsys_state *cur_css = NULL; 6750 + struct files_struct *old_files = current->files; 6751 + struct nsproxy *old_nsproxy = current->nsproxy; 6972 6752 const struct cred *old_cred = NULL; 6973 6753 struct io_sq_data *sqd = data; 6974 6754 struct io_ring_ctx *ctx; 6975 - unsigned long start_jiffies; 6755 + unsigned long timeout = 0; 6756 + DEFINE_WAIT(wait); 6976 6757 6977 - start_jiffies = jiffies; 6758 + task_lock(current); 6759 + 
current->files = NULL; 6760 + current->nsproxy = NULL; 6761 + task_unlock(current); 6762 + 6978 6763 while (!kthread_should_stop()) { 6979 - enum sq_ret ret = 0; 6980 - bool cap_entries; 6764 + int ret; 6765 + bool cap_entries, sqt_spin, needs_sched; 6981 6766 6982 6767 /* 6983 6768 * Any changes to the sqd lists are synchronized through the 6984 6769 * kthread parking. This synchronizes the thread vs users, 6985 6770 * the users are synchronized on the sqd->ctx_lock. 6986 6771 */ 6987 - if (kthread_should_park()) 6772 + if (kthread_should_park()) { 6988 6773 kthread_parkme(); 6774 + /* 6775 + * When sq thread is unparked, in case the previous park operation 6776 + * comes from io_put_sq_data(), which means that sq thread is going 6777 + * to be stopped, so here needs to have a check. 6778 + */ 6779 + if (kthread_should_stop()) 6780 + break; 6781 + } 6989 6782 6990 - if (unlikely(!list_empty(&sqd->ctx_new_list))) 6783 + if (unlikely(!list_empty(&sqd->ctx_new_list))) { 6991 6784 io_sqd_init_new(sqd); 6785 + timeout = jiffies + sqd->sq_thread_idle; 6786 + } 6992 6787 6788 + sqt_spin = false; 6993 6789 cap_entries = !list_is_singular(&sqd->ctx_list); 6994 - 6995 6790 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 6996 6791 if (current->cred != ctx->creds) { 6997 6792 if (old_cred) ··· 7021 6782 current->sessionid = ctx->sessionid; 7022 6783 #endif 7023 6784 7024 - ret |= __io_sq_thread(ctx, start_jiffies, cap_entries); 6785 + ret = __io_sq_thread(ctx, cap_entries); 6786 + if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) 6787 + sqt_spin = true; 7025 6788 7026 - io_sq_thread_drop_mm(); 6789 + io_sq_thread_drop_mm_files(); 7027 6790 } 7028 6791 7029 - if (ret & SQT_SPIN) { 6792 + if (sqt_spin || !time_after(jiffies, timeout)) { 7030 6793 io_run_task_work(); 7031 6794 cond_resched(); 7032 - } else if (ret == SQT_IDLE) { 7033 - if (kthread_should_park()) 7034 - continue; 6795 + if (sqt_spin) 6796 + timeout = jiffies + sqd->sq_thread_idle; 6797 + continue; 6798 + } 6799 + 6800 + if (kthread_should_park()) 6801 + continue; 6802 + 6803 + needs_sched = true; 6804 + prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); 6805 + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 6806 + if ((ctx->flags & IORING_SETUP_IOPOLL) && 6807 + !list_empty_careful(&ctx->iopoll_list)) { 6808 + needs_sched = false; 6809 + break; 6810 + } 6811 + if (io_sqring_entries(ctx)) { 6812 + needs_sched = false; 6813 + break; 6814 + } 6815 + } 6816 + 6817 + if (needs_sched) { 7035 6818 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7036 6819 io_ring_set_wakeup_flag(ctx); 6820 + 7037 6821 schedule(); 7038 - start_jiffies = jiffies; 7039 6822 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7040 6823 io_ring_clear_wakeup_flag(ctx); 7041 6824 } 6825 + 6826 + finish_wait(&sqd->wait, &wait); 6827 + timeout = jiffies + sqd->sq_thread_idle; 7042 6828 } 7043 6829 7044 6830 io_run_task_work(); ··· 7072 6808 io_sq_thread_unassociate_blkcg(); 7073 6809 if (old_cred) 7074 6810 revert_creds(old_cred); 6811 + 6812 + task_lock(current); 6813 + current->files = old_files; 6814 + current->nsproxy = old_nsproxy; 6815 + task_unlock(current); 7075 6816 7076 6817 kthread_parkme(); 7077 6818 ··· 7132 6863 * application must reap them itself, as they reside on the shared cq ring. 
7133 6864 */ 7134 6865 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, 7135 - const sigset_t __user *sig, size_t sigsz) 6866 + const sigset_t __user *sig, size_t sigsz, 6867 + struct __kernel_timespec __user *uts) 7136 6868 { 7137 6869 struct io_wait_queue iowq = { 7138 6870 .wq = { ··· 7145 6875 .to_wait = min_events, 7146 6876 }; 7147 6877 struct io_rings *rings = ctx->rings; 6878 + struct timespec64 ts; 6879 + signed long timeout = 0; 7148 6880 int ret = 0; 7149 6881 7150 6882 do { ··· 7169 6897 return ret; 7170 6898 } 7171 6899 6900 + if (uts) { 6901 + if (get_timespec64(&ts, uts)) 6902 + return -EFAULT; 6903 + timeout = timespec64_to_jiffies(&ts); 6904 + } 6905 + 7172 6906 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 7173 6907 trace_io_uring_cqring_wait(ctx, min_events); 7174 6908 do { ··· 7188 6910 break; 7189 6911 if (io_should_wake(&iowq, false)) 7190 6912 break; 7191 - schedule(); 6913 + if (uts) { 6914 + timeout = schedule_timeout(timeout); 6915 + if (timeout == 0) { 6916 + ret = -ETIME; 6917 + break; 6918 + } 6919 + } else { 6920 + schedule(); 6921 + } 7192 6922 } while (1); 7193 6923 finish_wait(&ctx->wait, &iowq.wq); 7194 6924 ··· 7245 6959 if (!data) 7246 6960 return -ENXIO; 7247 6961 7248 - spin_lock(&data->lock); 6962 + spin_lock_bh(&data->lock); 7249 6963 ref_node = data->node; 7250 - spin_unlock(&data->lock); 6964 + spin_unlock_bh(&data->lock); 7251 6965 if (ref_node) 7252 6966 percpu_ref_kill(&ref_node->refs); 7253 6967 ··· 7370 7084 7371 7085 mutex_lock(&sqd->ctx_lock); 7372 7086 list_del(&ctx->sqd_list); 7087 + io_sqd_update_thread_idle(sqd); 7373 7088 mutex_unlock(&sqd->ctx_lock); 7374 7089 7375 - if (sqd->thread) { 7376 - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); 7090 + if (sqd->thread) 7377 7091 io_sq_thread_unpark(sqd); 7378 - } 7379 7092 7380 7093 io_put_sq_data(sqd); 7381 7094 ctx->sq_data = NULL; ··· 7629 7344 data = ref_node->file_data; 7630 7345 ctx = data->ctx; 7631 7346 7632 - spin_lock(&data->lock); 7347 + spin_lock_bh(&data->lock); 7633 7348 ref_node->done = true; 7634 7349 7635 7350 while (!list_empty(&data->ref_list)) { ··· 7641 7356 list_del(&ref_node->node); 7642 7357 first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist); 7643 7358 } 7644 - spin_unlock(&data->lock); 7359 + spin_unlock_bh(&data->lock); 7645 7360 7646 7361 if (percpu_ref_is_dying(&data->refs)) 7647 7362 delay = 0; ··· 7764 7479 } 7765 7480 7766 7481 file_data->node = ref_node; 7767 - spin_lock(&file_data->lock); 7482 + spin_lock_bh(&file_data->lock); 7768 7483 list_add_tail(&ref_node->node, &file_data->ref_list); 7769 - spin_unlock(&file_data->lock); 7484 + spin_unlock_bh(&file_data->lock); 7770 7485 percpu_ref_get(&file_data->refs); 7771 7486 return ret; 7772 7487 out_fput: ··· 7923 7638 7924 7639 if (needs_switch) { 7925 7640 percpu_ref_kill(&data->node->refs); 7926 - spin_lock(&data->lock); 7641 + spin_lock_bh(&data->lock); 7927 7642 list_add_tail(&ref_node->node, &data->ref_list); 7928 7643 data->node = ref_node; 7929 - spin_unlock(&data->lock); 7644 + spin_unlock_bh(&data->lock); 7930 7645 percpu_ref_get(&ctx->file_data->refs); 7931 7646 } else 7932 7647 destroy_fixed_file_ref_node(ref_node); ··· 8054 7769 struct io_sq_data *sqd; 8055 7770 8056 7771 ret = -EPERM; 8057 - if (!capable(CAP_SYS_ADMIN)) 7772 + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) 8058 7773 goto err; 8059 7774 8060 7775 sqd = io_get_sq_data(p); ··· 8640 8355 * as nobody else will be looking for them. 
8641 8356 */ 8642 8357 do { 8643 - if (ctx->rings) 8644 - io_cqring_overflow_flush(ctx, true, NULL, NULL); 8645 8358 io_iopoll_try_reap_events(ctx); 8646 8359 } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); 8647 8360 io_ring_ctx_free(ctx); ··· 8649 8366 { 8650 8367 mutex_lock(&ctx->uring_lock); 8651 8368 percpu_ref_kill(&ctx->refs); 8369 + if (ctx->rings) 8370 + io_cqring_overflow_flush(ctx, true, NULL, NULL); 8652 8371 mutex_unlock(&ctx->uring_lock); 8653 8372 8654 - io_kill_timeouts(ctx, NULL); 8655 - io_poll_remove_all(ctx, NULL); 8373 + io_kill_timeouts(ctx, NULL, NULL); 8374 + io_poll_remove_all(ctx, NULL, NULL); 8656 8375 8657 8376 if (ctx->io_wq) 8658 8377 io_wq_cancel_all(ctx->io_wq); 8659 8378 8660 8379 /* if we failed setting up the ctx, we might not have any rings */ 8661 - if (ctx->rings) 8662 - io_cqring_overflow_flush(ctx, true, NULL, NULL); 8663 8380 io_iopoll_try_reap_events(ctx); 8664 8381 idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); 8665 8382 ··· 8690 8407 return 0; 8691 8408 } 8692 8409 8693 - static bool io_wq_files_match(struct io_wq_work *work, void *data) 8694 - { 8695 - struct files_struct *files = data; 8410 + struct io_task_cancel { 8411 + struct task_struct *task; 8412 + struct files_struct *files; 8413 + }; 8696 8414 8697 - return !files || ((work->flags & IO_WQ_WORK_FILES) && 8698 - work->identity->files == files); 8699 - } 8700 - 8701 - /* 8702 - * Returns true if 'preq' is the link parent of 'req' 8703 - */ 8704 - static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) 8705 - { 8706 - struct io_kiocb *link; 8707 - 8708 - if (!(preq->flags & REQ_F_LINK_HEAD)) 8709 - return false; 8710 - 8711 - list_for_each_entry(link, &preq->link_list, link_list) { 8712 - if (link == req) 8713 - return true; 8714 - } 8715 - 8716 - return false; 8717 - } 8718 - 8719 - /* 8720 - * We're looking to cancel 'req' because it's holding on to our files, but 8721 - * 'req' could be a link to another request. See if it is, and cancel that 8722 - * parent request if so. 
8723 - */ 8724 - static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req) 8725 - { 8726 - struct hlist_node *tmp; 8727 - struct io_kiocb *preq; 8728 - bool found = false; 8729 - int i; 8730 - 8731 - spin_lock_irq(&ctx->completion_lock); 8732 - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 8733 - struct hlist_head *list; 8734 - 8735 - list = &ctx->cancel_hash[i]; 8736 - hlist_for_each_entry_safe(preq, tmp, list, hash_node) { 8737 - found = io_match_link(preq, req); 8738 - if (found) { 8739 - io_poll_remove_one(preq); 8740 - break; 8741 - } 8742 - } 8743 - } 8744 - spin_unlock_irq(&ctx->completion_lock); 8745 - return found; 8746 - } 8747 - 8748 - static bool io_timeout_remove_link(struct io_ring_ctx *ctx, 8749 - struct io_kiocb *req) 8750 - { 8751 - struct io_kiocb *preq; 8752 - bool found = false; 8753 - 8754 - spin_lock_irq(&ctx->completion_lock); 8755 - list_for_each_entry(preq, &ctx->timeout_list, timeout.list) { 8756 - found = io_match_link(preq, req); 8757 - if (found) { 8758 - __io_timeout_cancel(preq); 8759 - break; 8760 - } 8761 - } 8762 - spin_unlock_irq(&ctx->completion_lock); 8763 - return found; 8764 - } 8765 - 8766 - static bool io_cancel_link_cb(struct io_wq_work *work, void *data) 8415 + static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 8767 8416 { 8768 8417 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 8418 + struct io_task_cancel *cancel = data; 8769 8419 bool ret; 8770 8420 8771 - if (req->flags & REQ_F_LINK_TIMEOUT) { 8421 + if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) { 8772 8422 unsigned long flags; 8773 8423 struct io_ring_ctx *ctx = req->ctx; 8774 8424 8775 8425 /* protect against races with linked timeouts */ 8776 8426 spin_lock_irqsave(&ctx->completion_lock, flags); 8777 - ret = io_match_link(req, data); 8427 + ret = io_match_task(req, cancel->task, cancel->files); 8778 8428 spin_unlock_irqrestore(&ctx->completion_lock, flags); 8779 8429 } else { 8780 - ret = io_match_link(req, data); 8430 + ret = io_match_task(req, cancel->task, cancel->files); 8781 8431 } 8782 8432 return ret; 8783 - } 8784 - 8785 - static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) 8786 - { 8787 - enum io_wq_cancel cret; 8788 - 8789 - /* cancel this particular work, if it's running */ 8790 - cret = io_wq_cancel_work(ctx->io_wq, &req->work); 8791 - if (cret != IO_WQ_CANCEL_NOTFOUND) 8792 - return; 8793 - 8794 - /* find links that hold this pending, cancel those */ 8795 - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true); 8796 - if (cret != IO_WQ_CANCEL_NOTFOUND) 8797 - return; 8798 - 8799 - /* if we have a poll link holding this pending, cancel that */ 8800 - if (io_poll_remove_link(ctx, req)) 8801 - return; 8802 - 8803 - /* final option, timeout link is holding this req pending */ 8804 - io_timeout_remove_link(ctx, req); 8805 8433 } 8806 8434 8807 8435 static void io_cancel_defer_files(struct io_ring_ctx *ctx, ··· 8724 8530 8725 8531 spin_lock_irq(&ctx->completion_lock); 8726 8532 list_for_each_entry_reverse(de, &ctx->defer_list, list) { 8727 - if (io_task_match(de->req, task) && 8728 - io_match_files(de->req, files)) { 8533 + if (io_match_task(de->req, task, files)) { 8729 8534 list_cut_position(&list, &ctx->defer_list, &de->list); 8730 8535 break; 8731 8536 } ··· 8741 8548 } 8742 8549 } 8743 8550 8744 - /* 8745 - * Returns true if we found and killed one or more files pinning requests 8746 - */ 8747 - static bool io_uring_cancel_files(struct io_ring_ctx *ctx, 8551 + static 
void io_uring_cancel_files(struct io_ring_ctx *ctx, 8552 + struct task_struct *task, 8748 8553 struct files_struct *files) 8749 8554 { 8750 - if (list_empty_careful(&ctx->inflight_list)) 8751 - return false; 8752 - 8753 - /* cancel all at once, should be faster than doing it one by one*/ 8754 - io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); 8755 - 8756 8555 while (!list_empty_careful(&ctx->inflight_list)) { 8757 - struct io_kiocb *cancel_req = NULL, *req; 8556 + struct io_task_cancel cancel = { .task = task, .files = files }; 8557 + struct io_kiocb *req; 8758 8558 DEFINE_WAIT(wait); 8559 + bool found = false; 8759 8560 8760 8561 spin_lock_irq(&ctx->inflight_lock); 8761 8562 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { 8762 - if (files && (req->work.flags & IO_WQ_WORK_FILES) && 8563 + if (req->task != task || 8763 8564 req->work.identity->files != files) 8764 8565 continue; 8765 - /* req is being completed, ignore */ 8766 - if (!refcount_inc_not_zero(&req->refs)) 8767 - continue; 8768 - cancel_req = req; 8566 + found = true; 8769 8567 break; 8770 8568 } 8771 - if (cancel_req) 8772 - prepare_to_wait(&ctx->inflight_wait, &wait, 8773 - TASK_UNINTERRUPTIBLE); 8569 + if (found) 8570 + prepare_to_wait(&task->io_uring->wait, &wait, 8571 + TASK_UNINTERRUPTIBLE); 8774 8572 spin_unlock_irq(&ctx->inflight_lock); 8775 8573 8776 8574 /* We need to keep going until we don't find a matching req */ 8777 - if (!cancel_req) 8575 + if (!found) 8778 8576 break; 8779 - /* cancel this request, or head link requests */ 8780 - io_attempt_cancel(ctx, cancel_req); 8781 - io_put_req(cancel_req); 8577 + 8578 + io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); 8579 + io_poll_remove_all(ctx, task, files); 8580 + io_kill_timeouts(ctx, task, files); 8782 8581 /* cancellations _may_ trigger task work */ 8783 8582 io_run_task_work(); 8784 8583 schedule(); 8785 - finish_wait(&ctx->inflight_wait, &wait); 8584 + finish_wait(&task->io_uring->wait, &wait); 8786 8585 } 8787 - 8788 - return true; 8789 8586 } 8790 8587 8791 - static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 8588 + static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, 8589 + struct task_struct *task) 8792 8590 { 8793 - struct io_kiocb *req = container_of(work, struct io_kiocb, work); 8794 - struct task_struct *task = data; 8795 - 8796 - return io_task_match(req, task); 8797 - } 8798 - 8799 - static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, 8800 - struct task_struct *task, 8801 - struct files_struct *files) 8802 - { 8803 - bool ret; 8804 - 8805 - ret = io_uring_cancel_files(ctx, files); 8806 - if (!files) { 8591 + while (1) { 8592 + struct io_task_cancel cancel = { .task = task, .files = NULL, }; 8807 8593 enum io_wq_cancel cret; 8594 + bool ret = false; 8808 8595 8809 - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true); 8596 + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); 8810 8597 if (cret != IO_WQ_CANCEL_NOTFOUND) 8811 8598 ret = true; 8812 8599 ··· 8798 8625 } 8799 8626 } 8800 8627 8801 - ret |= io_poll_remove_all(ctx, task); 8802 - ret |= io_kill_timeouts(ctx, task); 8628 + ret |= io_poll_remove_all(ctx, task, NULL); 8629 + ret |= io_kill_timeouts(ctx, task, NULL); 8630 + if (!ret) 8631 + break; 8632 + io_run_task_work(); 8633 + cond_resched(); 8803 8634 } 8804 - 8805 - return ret; 8806 8635 } 8807 8636 8808 8637 /* ··· 8823 8648 io_sq_thread_park(ctx->sq_data); 8824 8649 } 8825 8650 8826 - if (files) 8827 - 
io_cancel_defer_files(ctx, NULL, files); 8828 - else 8829 - io_cancel_defer_files(ctx, task, NULL); 8830 - 8651 + io_cancel_defer_files(ctx, task, files); 8652 + io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); 8831 8653 io_cqring_overflow_flush(ctx, true, task, files); 8654 + io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); 8832 8655 8833 - while (__io_uring_cancel_task_requests(ctx, task, files)) { 8834 - io_run_task_work(); 8835 - cond_resched(); 8836 - } 8656 + if (!files) 8657 + __io_uring_cancel_task_requests(ctx, task); 8658 + else 8659 + io_uring_cancel_files(ctx, task, files); 8837 8660 8838 8661 if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { 8839 8662 atomic_dec(&task->io_uring->in_idle); ··· 9089 8916 finish_wait(&ctx->sqo_sq_wait, &wait); 9090 8917 } 9091 8918 8919 + static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, 8920 + struct __kernel_timespec __user **ts, 8921 + const sigset_t __user **sig) 8922 + { 8923 + struct io_uring_getevents_arg arg; 8924 + 8925 + /* 8926 + * If EXT_ARG isn't set, then we have no timespec and the argp pointer 8927 + * is just a pointer to the sigset_t. 8928 + */ 8929 + if (!(flags & IORING_ENTER_EXT_ARG)) { 8930 + *sig = (const sigset_t __user *) argp; 8931 + *ts = NULL; 8932 + return 0; 8933 + } 8934 + 8935 + /* 8936 + * EXT_ARG is set - ensure we agree on the size of it and copy in our 8937 + * timespec and sigset_t pointers if good. 8938 + */ 8939 + if (*argsz != sizeof(arg)) 8940 + return -EINVAL; 8941 + if (copy_from_user(&arg, argp, sizeof(arg))) 8942 + return -EFAULT; 8943 + *sig = u64_to_user_ptr(arg.sigmask); 8944 + *argsz = arg.sigmask_sz; 8945 + *ts = u64_to_user_ptr(arg.ts); 8946 + return 0; 8947 + } 8948 + 9092 8949 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, 9093 - u32, min_complete, u32, flags, const sigset_t __user *, sig, 9094 - size_t, sigsz) 8950 + u32, min_complete, u32, flags, const void __user *, argp, 8951 + size_t, argsz) 9095 8952 { 9096 8953 struct io_ring_ctx *ctx; 9097 8954 long ret = -EBADF; ··· 9131 8928 io_run_task_work(); 9132 8929 9133 8930 if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 9134 - IORING_ENTER_SQ_WAIT)) 8931 + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)) 9135 8932 return -EINVAL; 9136 8933 9137 8934 f = fdget(fd); ··· 9158 8955 */ 9159 8956 ret = 0; 9160 8957 if (ctx->flags & IORING_SETUP_SQPOLL) { 8958 + io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); 9161 8959 if (!list_empty_careful(&ctx->cq_overflow_list)) 9162 8960 io_cqring_overflow_flush(ctx, false, NULL, NULL); 8961 + io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); 9163 8962 if (flags & IORING_ENTER_SQ_WAKEUP) 9164 8963 wake_up(&ctx->sq_data->wait); 9165 8964 if (flags & IORING_ENTER_SQ_WAIT) ··· 9179 8974 goto out; 9180 8975 } 9181 8976 if (flags & IORING_ENTER_GETEVENTS) { 8977 + const sigset_t __user *sig; 8978 + struct __kernel_timespec __user *ts; 8979 + 8980 + ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 8981 + if (unlikely(ret)) 8982 + goto out; 8983 + 9182 8984 min_complete = min(min_complete, ctx->cq_entries); 9183 8985 9184 8986 /* ··· 9198 8986 !(ctx->flags & IORING_SETUP_SQPOLL)) { 9199 8987 ret = io_iopoll_check(ctx, min_complete); 9200 8988 } else { 9201 - ret = io_cqring_wait(ctx, min_complete, sig, sigsz); 8989 + ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); 9202 8990 } 9203 8991 } 9204 8992 ··· 9566 9354 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | 9567 
9355 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | 9568 9356 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | 9569 - IORING_FEAT_POLL_32BITS; 9357 + IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | 9358 + IORING_FEAT_EXT_ARG; 9570 9359 9571 9360 if (copy_to_user(params, p, sizeof(*p))) { 9572 9361 ret = -EFAULT;
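
With the io_get_ext_arg()/io_cqring_wait() changes above, a completion-side waiter can give io_uring_enter(2) a timeout directly instead of arming a timeout SQE. A minimal, hedged sketch of the calling convention from userspace, bypassing liburing; it assumes __NR_io_uring_enter is exposed by <sys/syscall.h>, that the installed uapi headers contain this merge, and that ring_fd is an already set-up ring on a kernel advertising IORING_FEAT_EXT_ARG:

    #include <errno.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>
    #include <linux/time_types.h>

    /* Wait for at least one CQE, but give up after two seconds. */
    static int wait_cqe_timeout(int ring_fd)
    {
        struct __kernel_timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
        struct io_uring_getevents_arg arg;
        long ret;

        memset(&arg, 0, sizeof(arg));
        arg.sigmask = 0;                      /* leave the signal mask alone */
        arg.sigmask_sz = 0;
        arg.ts = (uint64_t)(uintptr_t)&ts;

        /* argsz must equal sizeof(arg) or io_get_ext_arg() returns -EINVAL */
        ret = syscall(__NR_io_uring_enter, ring_fd, 0, 1,
                      IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
                      &arg, sizeof(arg));
        return ret < 0 ? -errno : (int)ret;
    }

On expiry the wait fails with ETIME (io_cqring_wait() returns -ETIME when schedule_timeout() runs out), which the sketch folds into a negative errno return.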
+22 -18
fs/namei.c
··· 4346 4346 } 4347 4347 EXPORT_SYMBOL(vfs_rename); 4348 4348 4349 - static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, 4350 - const char __user *newname, unsigned int flags) 4349 + int do_renameat2(int olddfd, struct filename *from, int newdfd, 4350 + struct filename *to, unsigned int flags) 4351 4351 { 4352 4352 struct dentry *old_dentry, *new_dentry; 4353 4353 struct dentry *trap; ··· 4355 4355 struct qstr old_last, new_last; 4356 4356 int old_type, new_type; 4357 4357 struct inode *delegated_inode = NULL; 4358 - struct filename *from; 4359 - struct filename *to; 4360 4358 unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; 4361 4359 bool should_retry = false; 4362 - int error; 4360 + int error = -EINVAL; 4363 4361 4364 4362 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 4365 - return -EINVAL; 4363 + goto put_both; 4366 4364 4367 4365 if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && 4368 4366 (flags & RENAME_EXCHANGE)) 4369 - return -EINVAL; 4367 + goto put_both; 4370 4368 4371 4369 if (flags & RENAME_EXCHANGE) 4372 4370 target_flags = 0; 4373 4371 4374 4372 retry: 4375 - from = filename_parentat(olddfd, getname(oldname), lookup_flags, 4376 - &old_path, &old_last, &old_type); 4373 + from = filename_parentat(olddfd, from, lookup_flags, &old_path, 4374 + &old_last, &old_type); 4377 4375 if (IS_ERR(from)) { 4378 4376 error = PTR_ERR(from); 4379 - goto exit; 4377 + goto put_new; 4380 4378 } 4381 4379 4382 - to = filename_parentat(newdfd, getname(newname), lookup_flags, 4383 - &new_path, &new_last, &new_type); 4380 + to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, 4381 + &new_type); 4384 4382 if (IS_ERR(to)) { 4385 4383 error = PTR_ERR(to); 4386 4384 goto exit1; ··· 4471 4473 if (retry_estale(error, lookup_flags)) 4472 4474 should_retry = true; 4473 4475 path_put(&new_path); 4474 - putname(to); 4475 4476 exit1: 4476 4477 path_put(&old_path); 4477 - putname(from); 4478 4478 if (should_retry) { 4479 4479 should_retry = false; 4480 4480 lookup_flags |= LOOKUP_REVAL; 4481 4481 goto retry; 4482 4482 } 4483 - exit: 4483 + put_both: 4484 + if (!IS_ERR(from)) 4485 + putname(from); 4486 + put_new: 4487 + if (!IS_ERR(to)) 4488 + putname(to); 4484 4489 return error; 4485 4490 } 4486 4491 4487 4492 SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, 4488 4493 int, newdfd, const char __user *, newname, unsigned int, flags) 4489 4494 { 4490 - return do_renameat2(olddfd, oldname, newdfd, newname, flags); 4495 + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 4496 + flags); 4491 4497 } 4492 4498 4493 4499 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, 4494 4500 int, newdfd, const char __user *, newname) 4495 4501 { 4496 - return do_renameat2(olddfd, oldname, newdfd, newname, 0); 4502 + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 4503 + 0); 4497 4504 } 4498 4505 4499 4506 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) 4500 4507 { 4501 - return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); 4508 + return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, 4509 + getname(newname), 0); 4502 4510 } 4503 4511 4504 4512 int readlink_copy(char __user *buffer, int buflen, const char *link)
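
One consequence of the refactor above: do_renameat2() now receives struct filename references and drops them itself on every exit path, including the early flag-validation failures, which is what lets a deferred caller (io_uring's new IORING_OP_RENAMEAT being the motivation) resolve the names at prepare time and hand them over later. A hedged kernel-side sketch of that ownership contract; pending_rename, prep_rename() and issue_rename() are hypothetical names, not the actual io_uring handlers:

    /* Kernel-side sketch; assumes the do_renameat2() declaration is visible
     * to the caller (fs/internal.h in this series). */
    struct pending_rename {
        struct filename *oldpath;
        struct filename *newpath;
        int old_dfd, new_dfd;
        unsigned int flags;
    };

    static int prep_rename(struct pending_rename *pr,
                           const char __user *oldname,
                           const char __user *newname)
    {
        pr->oldpath = getname(oldname);
        if (IS_ERR(pr->oldpath))
            return PTR_ERR(pr->oldpath);

        pr->newpath = getname(newname);
        if (IS_ERR(pr->newpath)) {
            putname(pr->oldpath);
            return PTR_ERR(pr->newpath);
        }
        return 0;
    }

    static int issue_rename(struct pending_rename *pr)
    {
        /* no putname() here: do_renameat2() consumes both references */
        return do_renameat2(pr->old_dfd, pr->oldpath,
                            pr->new_dfd, pr->newpath, pr->flags);
    }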
+1
include/linux/socket.h
··· 436 436 int __user *usockaddr_len); 437 437 extern int __sys_socketpair(int family, int type, int protocol, 438 438 int __user *usockvec); 439 + extern int __sys_shutdown_sock(struct socket *sock, int how); 439 440 extern int __sys_shutdown(int fd, int how); 440 441 441 442 extern struct ns_common *get_net_ns(struct ns_common *ns);
+1 -1
include/linux/syscalls.h
··· 317 317 struct io_uring_params __user *p); 318 318 asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, 319 319 u32 min_complete, u32 flags, 320 - const sigset_t __user *sig, size_t sigsz); 320 + const void __user *argp, size_t argsz); 321 321 asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op, 322 322 void __user *arg, unsigned int nr_args); 323 323
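
The widened prototype above stays compatible with existing callers: without IORING_ENTER_EXT_ARG the kernel keeps treating argp/argsz as a sigset_t pointer and its size (see io_get_ext_arg() in the fs/io_uring.c hunk). A small hedged sketch of that unchanged legacy form, using the kernel sigset size of _NSIG / 8 bytes:

    #include <signal.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>

    /* Old-style wait: install {SIGINT} as the signal mask just for the
     * duration of the sleep, ppoll-style. */
    static int wait_cqe_sigmask(int ring_fd)
    {
        sigset_t mask;

        sigemptyset(&mask);
        sigaddset(&mask, SIGINT);
        return syscall(__NR_io_uring_enter, ring_fd, 0, 1,
                       IORING_ENTER_GETEVENTS, &mask, _NSIG / 8);
    }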
+16
include/uapi/linux/io_uring.h
··· 42 42 __u32 statx_flags; 43 43 __u32 fadvise_advice; 44 44 __u32 splice_flags; 45 + __u32 rename_flags; 46 + __u32 unlink_flags; 45 47 }; 46 48 __u64 user_data; /* data to be passed back at completion time */ 47 49 union { ··· 134 132 IORING_OP_PROVIDE_BUFFERS, 135 133 IORING_OP_REMOVE_BUFFERS, 136 134 IORING_OP_TEE, 135 + IORING_OP_SHUTDOWN, 136 + IORING_OP_RENAMEAT, 137 + IORING_OP_UNLINKAT, 137 138 138 139 /* this goes last, obviously */ 139 140 IORING_OP_LAST, ··· 151 146 * sqe->timeout_flags 152 147 */ 153 148 #define IORING_TIMEOUT_ABS (1U << 0) 149 + #define IORING_TIMEOUT_UPDATE (1U << 1) 154 150 155 151 /* 156 152 * sqe->splice_flags ··· 232 226 #define IORING_ENTER_GETEVENTS (1U << 0) 233 227 #define IORING_ENTER_SQ_WAKEUP (1U << 1) 234 228 #define IORING_ENTER_SQ_WAIT (1U << 2) 229 + #define IORING_ENTER_EXT_ARG (1U << 3) 235 230 236 231 /* 237 232 * Passed in for io_uring_setup(2). Copied back with updated info on success ··· 260 253 #define IORING_FEAT_CUR_PERSONALITY (1U << 4) 261 254 #define IORING_FEAT_FAST_POLL (1U << 5) 262 255 #define IORING_FEAT_POLL_32BITS (1U << 6) 256 + #define IORING_FEAT_SQPOLL_NONFIXED (1U << 7) 257 + #define IORING_FEAT_EXT_ARG (1U << 8) 263 258 264 259 /* 265 260 * io_uring_register(2) opcodes and arguments ··· 336 327 IORING_RESTRICTION_SQE_FLAGS_REQUIRED = 3, 337 328 338 329 IORING_RESTRICTION_LAST 330 + }; 331 + 332 + struct io_uring_getevents_arg { 333 + __u64 sigmask; 334 + __u32 sigmask_sz; 335 + __u32 pad; 336 + __u64 ts; 339 337 }; 340 338 341 339 #endif
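
Both new behaviours are discoverable before use: IORING_FEAT_EXT_ARG and IORING_FEAT_SQPOLL_NONFIXED come back in the feature mask that io_uring_setup(2) copies into struct io_uring_params. A hedged probe sketch (raw syscall again; __NR_io_uring_setup assumed present in <sys/syscall.h>):

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>

    /* Create a plain ring and report which of the new features the
     * running kernel advertises. */
    static int setup_ring(unsigned int entries,
                          int *has_ext_arg, int *has_sqpoll_nonfixed)
    {
        struct io_uring_params p;
        int fd;

        memset(&p, 0, sizeof(p));
        fd = syscall(__NR_io_uring_setup, entries, &p);
        if (fd < 0)
            return -1;              /* errno holds the reason */

        *has_ext_arg = !!(p.features & IORING_FEAT_EXT_ARG);
        *has_sqpoll_nonfixed = !!(p.features & IORING_FEAT_SQPOLL_NONFIXED);
        return fd;
    }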
+12 -3
net/socket.c
··· 2175 2175 * Shutdown a socket. 2176 2176 */ 2177 2177 2178 + int __sys_shutdown_sock(struct socket *sock, int how) 2179 + { 2180 + int err; 2181 + 2182 + err = security_socket_shutdown(sock, how); 2183 + if (!err) 2184 + err = sock->ops->shutdown(sock, how); 2185 + 2186 + return err; 2187 + } 2188 + 2178 2189 int __sys_shutdown(int fd, int how) 2179 2190 { 2180 2191 int err, fput_needed; ··· 2193 2182 2194 2183 sock = sockfd_lookup_light(fd, &err, &fput_needed); 2195 2184 if (sock != NULL) { 2196 - err = security_socket_shutdown(sock, how); 2197 - if (!err) 2198 - err = sock->ops->shutdown(sock, how); 2185 + err = __sys_shutdown_sock(sock, how); 2199 2186 fput_light(sock->file, fput_needed); 2200 2187 } 2201 2188 return err;
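
Factoring __sys_shutdown_sock() out of __sys_shutdown() keeps the fd-based path identical while letting code that already resolved a struct socket (io_uring's new IORING_OP_SHUTDOWN gets its socket from the request's file) skip the lookup. A hedged in-kernel sketch; close_write_side() is a hypothetical helper, not the actual handler:

    #include <linux/net.h>      /* struct socket, enum sock_shutdown_cmd */
    #include <linux/socket.h>   /* __sys_shutdown_sock() */

    /* Half-close the send side of a socket we already hold, without
     * going through sockfd_lookup_light()/fput_light(). */
    static int close_write_side(struct socket *sock)
    {
        return __sys_shutdown_sock(sock, SHUT_WR);
    }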