Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v5.16-rc2 11127 lines 281 kB view raw
1// SPDX-License-Identifier: GPL-2.0 2/* 3 * Shared application/kernel submission and completion ring pairs, for 4 * supporting fast/efficient IO. 5 * 6 * A note on the read/write ordering memory barriers that are matched between 7 * the application and kernel side. 8 * 9 * After the application reads the CQ ring tail, it must use an 10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses 11 * before writing the tail (using smp_load_acquire to read the tail will 12 * do). It also needs a smp_mb() before updating CQ head (ordering the 13 * entry load(s) with the head store), pairing with an implicit barrier 14 * through a control-dependency in io_get_cqe (smp_store_release to 15 * store head will do). Failure to do so could lead to reading invalid 16 * CQ entries. 17 * 18 * Likewise, the application must use an appropriate smp_wmb() before 19 * writing the SQ tail (ordering SQ entry stores with the tail store), 20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release 21 * to store the tail will do). And it needs a barrier ordering the SQ 22 * head load before writing new SQ entries (smp_load_acquire to read 23 * head will do). 24 * 25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application 26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* 27 * updating the SQ tail; a full memory barrier smp_mb() is needed 28 * between. 29 * 30 * Also see the examples in the liburing library: 31 * 32 * git://git.kernel.dk/liburing 33 * 34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens 35 * from data shared between the kernel and application. This is done both 36 * for ordering purposes, but also to ensure that once a value is loaded from 37 * data that the application could potentially modify, it remains stable. 38 * 39 * Copyright (C) 2018-2019 Jens Axboe 40 * Copyright (c) 2018-2019 Christoph Hellwig 41 */ 42#include <linux/kernel.h> 43#include <linux/init.h> 44#include <linux/errno.h> 45#include <linux/syscalls.h> 46#include <linux/compat.h> 47#include <net/compat.h> 48#include <linux/refcount.h> 49#include <linux/uio.h> 50#include <linux/bits.h> 51 52#include <linux/sched/signal.h> 53#include <linux/fs.h> 54#include <linux/file.h> 55#include <linux/fdtable.h> 56#include <linux/mm.h> 57#include <linux/mman.h> 58#include <linux/percpu.h> 59#include <linux/slab.h> 60#include <linux/blkdev.h> 61#include <linux/bvec.h> 62#include <linux/net.h> 63#include <net/sock.h> 64#include <net/af_unix.h> 65#include <net/scm.h> 66#include <linux/anon_inodes.h> 67#include <linux/sched/mm.h> 68#include <linux/uaccess.h> 69#include <linux/nospec.h> 70#include <linux/sizes.h> 71#include <linux/hugetlb.h> 72#include <linux/highmem.h> 73#include <linux/namei.h> 74#include <linux/fsnotify.h> 75#include <linux/fadvise.h> 76#include <linux/eventpoll.h> 77#include <linux/splice.h> 78#include <linux/task_work.h> 79#include <linux/pagemap.h> 80#include <linux/io_uring.h> 81#include <linux/tracehook.h> 82#include <linux/audit.h> 83#include <linux/security.h> 84 85#define CREATE_TRACE_POINTS 86#include <trace/events/io_uring.h> 87 88#include <uapi/linux/io_uring.h> 89 90#include "internal.h" 91#include "io-wq.h" 92 93#define IORING_MAX_ENTRIES 32768 94#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) 95#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 96 97/* only define max */ 98#define IORING_MAX_FIXED_FILES (1U << 15) 99#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ 100 IORING_REGISTER_LAST + IORING_OP_LAST) 101 102#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) 103#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) 104#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) 105 106#define IORING_MAX_REG_BUFFERS (1U << 14) 107 108#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ 109 IOSQE_IO_HARDLINK | IOSQE_ASYNC) 110 111#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN) 112 113#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ 114 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ 115 REQ_F_ASYNC_DATA) 116 117#define IO_TCTX_REFS_CACHE_NR (1U << 10) 118 119struct io_uring { 120 u32 head ____cacheline_aligned_in_smp; 121 u32 tail ____cacheline_aligned_in_smp; 122}; 123 124/* 125 * This data is shared with the application through the mmap at offsets 126 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. 127 * 128 * The offsets to the member fields are published through struct 129 * io_sqring_offsets when calling io_uring_setup. 130 */ 131struct io_rings { 132 /* 133 * Head and tail offsets into the ring; the offsets need to be 134 * masked to get valid indices. 135 * 136 * The kernel controls head of the sq ring and the tail of the cq ring, 137 * and the application controls tail of the sq ring and the head of the 138 * cq ring. 139 */ 140 struct io_uring sq, cq; 141 /* 142 * Bitmasks to apply to head and tail offsets (constant, equals 143 * ring_entries - 1) 144 */ 145 u32 sq_ring_mask, cq_ring_mask; 146 /* Ring sizes (constant, power of 2) */ 147 u32 sq_ring_entries, cq_ring_entries; 148 /* 149 * Number of invalid entries dropped by the kernel due to 150 * invalid index stored in array 151 * 152 * Written by the kernel, shouldn't be modified by the 153 * application (i.e. get number of "new events" by comparing to 154 * cached value). 155 * 156 * After a new SQ head value was read by the application this 157 * counter includes all submissions that were dropped reaching 158 * the new SQ head (and possibly more). 159 */ 160 u32 sq_dropped; 161 /* 162 * Runtime SQ flags 163 * 164 * Written by the kernel, shouldn't be modified by the 165 * application. 166 * 167 * The application needs a full memory barrier before checking 168 * for IORING_SQ_NEED_WAKEUP after updating the sq tail. 169 */ 170 u32 sq_flags; 171 /* 172 * Runtime CQ flags 173 * 174 * Written by the application, shouldn't be modified by the 175 * kernel. 176 */ 177 u32 cq_flags; 178 /* 179 * Number of completion events lost because the queue was full; 180 * this should be avoided by the application by making sure 181 * there are not more requests pending than there is space in 182 * the completion queue. 183 * 184 * Written by the kernel, shouldn't be modified by the 185 * application (i.e. get number of "new events" by comparing to 186 * cached value). 187 * 188 * As completion events come in out of order this counter is not 189 * ordered with any other data. 190 */ 191 u32 cq_overflow; 192 /* 193 * Ring buffer of completion events. 194 * 195 * The kernel writes completion events fresh every time they are 196 * produced, so the application is allowed to modify pending 197 * entries. 198 */ 199 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; 200}; 201 202enum io_uring_cmd_flags { 203 IO_URING_F_COMPLETE_DEFER = 1, 204 IO_URING_F_UNLOCKED = 2, 205 /* int's last bit, sign checks are usually faster than a bit test */ 206 IO_URING_F_NONBLOCK = INT_MIN, 207}; 208 209struct io_mapped_ubuf { 210 u64 ubuf; 211 u64 ubuf_end; 212 unsigned int nr_bvecs; 213 unsigned long acct_pages; 214 struct bio_vec bvec[]; 215}; 216 217struct io_ring_ctx; 218 219struct io_overflow_cqe { 220 struct io_uring_cqe cqe; 221 struct list_head list; 222}; 223 224struct io_fixed_file { 225 /* file * with additional FFS_* flags */ 226 unsigned long file_ptr; 227}; 228 229struct io_rsrc_put { 230 struct list_head list; 231 u64 tag; 232 union { 233 void *rsrc; 234 struct file *file; 235 struct io_mapped_ubuf *buf; 236 }; 237}; 238 239struct io_file_table { 240 struct io_fixed_file *files; 241}; 242 243struct io_rsrc_node { 244 struct percpu_ref refs; 245 struct list_head node; 246 struct list_head rsrc_list; 247 struct io_rsrc_data *rsrc_data; 248 struct llist_node llist; 249 bool done; 250}; 251 252typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); 253 254struct io_rsrc_data { 255 struct io_ring_ctx *ctx; 256 257 u64 **tags; 258 unsigned int nr; 259 rsrc_put_fn *do_put; 260 atomic_t refs; 261 struct completion done; 262 bool quiesce; 263}; 264 265struct io_buffer { 266 struct list_head list; 267 __u64 addr; 268 __u32 len; 269 __u16 bid; 270}; 271 272struct io_restriction { 273 DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); 274 DECLARE_BITMAP(sqe_op, IORING_OP_LAST); 275 u8 sqe_flags_allowed; 276 u8 sqe_flags_required; 277 bool registered; 278}; 279 280enum { 281 IO_SQ_THREAD_SHOULD_STOP = 0, 282 IO_SQ_THREAD_SHOULD_PARK, 283}; 284 285struct io_sq_data { 286 refcount_t refs; 287 atomic_t park_pending; 288 struct mutex lock; 289 290 /* ctx's that are using this sqd */ 291 struct list_head ctx_list; 292 293 struct task_struct *thread; 294 struct wait_queue_head wait; 295 296 unsigned sq_thread_idle; 297 int sq_cpu; 298 pid_t task_pid; 299 pid_t task_tgid; 300 301 unsigned long state; 302 struct completion exited; 303}; 304 305#define IO_COMPL_BATCH 32 306#define IO_REQ_CACHE_SIZE 32 307#define IO_REQ_ALLOC_BATCH 8 308 309struct io_submit_link { 310 struct io_kiocb *head; 311 struct io_kiocb *last; 312}; 313 314struct io_submit_state { 315 /* inline/task_work completion list, under ->uring_lock */ 316 struct io_wq_work_node free_list; 317 /* batch completion logic */ 318 struct io_wq_work_list compl_reqs; 319 struct io_submit_link link; 320 321 bool plug_started; 322 bool need_plug; 323 unsigned short submit_nr; 324 struct blk_plug plug; 325}; 326 327struct io_ring_ctx { 328 /* const or read-mostly hot data */ 329 struct { 330 struct percpu_ref refs; 331 332 struct io_rings *rings; 333 unsigned int flags; 334 unsigned int compat: 1; 335 unsigned int drain_next: 1; 336 unsigned int eventfd_async: 1; 337 unsigned int restricted: 1; 338 unsigned int off_timeout_used: 1; 339 unsigned int drain_active: 1; 340 } ____cacheline_aligned_in_smp; 341 342 /* submission data */ 343 struct { 344 struct mutex uring_lock; 345 346 /* 347 * Ring buffer of indices into array of io_uring_sqe, which is 348 * mmapped by the application using the IORING_OFF_SQES offset. 349 * 350 * This indirection could e.g. be used to assign fixed 351 * io_uring_sqe entries to operations and only submit them to 352 * the queue when needed. 353 * 354 * The kernel modifies neither the indices array nor the entries 355 * array. 356 */ 357 u32 *sq_array; 358 struct io_uring_sqe *sq_sqes; 359 unsigned cached_sq_head; 360 unsigned sq_entries; 361 struct list_head defer_list; 362 363 /* 364 * Fixed resources fast path, should be accessed only under 365 * uring_lock, and updated through io_uring_register(2) 366 */ 367 struct io_rsrc_node *rsrc_node; 368 int rsrc_cached_refs; 369 struct io_file_table file_table; 370 unsigned nr_user_files; 371 unsigned nr_user_bufs; 372 struct io_mapped_ubuf **user_bufs; 373 374 struct io_submit_state submit_state; 375 struct list_head timeout_list; 376 struct list_head ltimeout_list; 377 struct list_head cq_overflow_list; 378 struct xarray io_buffers; 379 struct xarray personalities; 380 u32 pers_next; 381 unsigned sq_thread_idle; 382 } ____cacheline_aligned_in_smp; 383 384 /* IRQ completion list, under ->completion_lock */ 385 struct io_wq_work_list locked_free_list; 386 unsigned int locked_free_nr; 387 388 const struct cred *sq_creds; /* cred used for __io_sq_thread() */ 389 struct io_sq_data *sq_data; /* if using sq thread polling */ 390 391 struct wait_queue_head sqo_sq_wait; 392 struct list_head sqd_list; 393 394 unsigned long check_cq_overflow; 395 396 struct { 397 unsigned cached_cq_tail; 398 unsigned cq_entries; 399 struct eventfd_ctx *cq_ev_fd; 400 struct wait_queue_head cq_wait; 401 unsigned cq_extra; 402 atomic_t cq_timeouts; 403 unsigned cq_last_tm_flush; 404 } ____cacheline_aligned_in_smp; 405 406 struct { 407 spinlock_t completion_lock; 408 409 spinlock_t timeout_lock; 410 411 /* 412 * ->iopoll_list is protected by the ctx->uring_lock for 413 * io_uring instances that don't use IORING_SETUP_SQPOLL. 414 * For SQPOLL, only the single threaded io_sq_thread() will 415 * manipulate the list, hence no extra locking is needed there. 416 */ 417 struct io_wq_work_list iopoll_list; 418 struct hlist_head *cancel_hash; 419 unsigned cancel_hash_bits; 420 bool poll_multi_queue; 421 } ____cacheline_aligned_in_smp; 422 423 struct io_restriction restrictions; 424 425 /* slow path rsrc auxilary data, used by update/register */ 426 struct { 427 struct io_rsrc_node *rsrc_backup_node; 428 struct io_mapped_ubuf *dummy_ubuf; 429 struct io_rsrc_data *file_data; 430 struct io_rsrc_data *buf_data; 431 432 struct delayed_work rsrc_put_work; 433 struct llist_head rsrc_put_llist; 434 struct list_head rsrc_ref_list; 435 spinlock_t rsrc_ref_lock; 436 }; 437 438 /* Keep this last, we don't need it for the fast path */ 439 struct { 440 #if defined(CONFIG_UNIX) 441 struct socket *ring_sock; 442 #endif 443 /* hashed buffered write serialization */ 444 struct io_wq_hash *hash_map; 445 446 /* Only used for accounting purposes */ 447 struct user_struct *user; 448 struct mm_struct *mm_account; 449 450 /* ctx exit and cancelation */ 451 struct llist_head fallback_llist; 452 struct delayed_work fallback_work; 453 struct work_struct exit_work; 454 struct list_head tctx_list; 455 struct completion ref_comp; 456 u32 iowq_limits[2]; 457 bool iowq_limits_set; 458 }; 459}; 460 461struct io_uring_task { 462 /* submission side */ 463 int cached_refs; 464 struct xarray xa; 465 struct wait_queue_head wait; 466 const struct io_ring_ctx *last; 467 struct io_wq *io_wq; 468 struct percpu_counter inflight; 469 atomic_t inflight_tracked; 470 atomic_t in_idle; 471 472 spinlock_t task_lock; 473 struct io_wq_work_list task_list; 474 struct callback_head task_work; 475 bool task_running; 476}; 477 478/* 479 * First field must be the file pointer in all the 480 * iocb unions! See also 'struct kiocb' in <linux/fs.h> 481 */ 482struct io_poll_iocb { 483 struct file *file; 484 struct wait_queue_head *head; 485 __poll_t events; 486 bool done; 487 bool canceled; 488 struct wait_queue_entry wait; 489}; 490 491struct io_poll_update { 492 struct file *file; 493 u64 old_user_data; 494 u64 new_user_data; 495 __poll_t events; 496 bool update_events; 497 bool update_user_data; 498}; 499 500struct io_close { 501 struct file *file; 502 int fd; 503 u32 file_slot; 504}; 505 506struct io_timeout_data { 507 struct io_kiocb *req; 508 struct hrtimer timer; 509 struct timespec64 ts; 510 enum hrtimer_mode mode; 511 u32 flags; 512}; 513 514struct io_accept { 515 struct file *file; 516 struct sockaddr __user *addr; 517 int __user *addr_len; 518 int flags; 519 u32 file_slot; 520 unsigned long nofile; 521}; 522 523struct io_sync { 524 struct file *file; 525 loff_t len; 526 loff_t off; 527 int flags; 528 int mode; 529}; 530 531struct io_cancel { 532 struct file *file; 533 u64 addr; 534}; 535 536struct io_timeout { 537 struct file *file; 538 u32 off; 539 u32 target_seq; 540 struct list_head list; 541 /* head of the link, used by linked timeouts only */ 542 struct io_kiocb *head; 543 /* for linked completions */ 544 struct io_kiocb *prev; 545}; 546 547struct io_timeout_rem { 548 struct file *file; 549 u64 addr; 550 551 /* timeout update */ 552 struct timespec64 ts; 553 u32 flags; 554 bool ltimeout; 555}; 556 557struct io_rw { 558 /* NOTE: kiocb has the file as the first member, so don't do it here */ 559 struct kiocb kiocb; 560 u64 addr; 561 u64 len; 562}; 563 564struct io_connect { 565 struct file *file; 566 struct sockaddr __user *addr; 567 int addr_len; 568}; 569 570struct io_sr_msg { 571 struct file *file; 572 union { 573 struct compat_msghdr __user *umsg_compat; 574 struct user_msghdr __user *umsg; 575 void __user *buf; 576 }; 577 int msg_flags; 578 int bgid; 579 size_t len; 580}; 581 582struct io_open { 583 struct file *file; 584 int dfd; 585 u32 file_slot; 586 struct filename *filename; 587 struct open_how how; 588 unsigned long nofile; 589}; 590 591struct io_rsrc_update { 592 struct file *file; 593 u64 arg; 594 u32 nr_args; 595 u32 offset; 596}; 597 598struct io_fadvise { 599 struct file *file; 600 u64 offset; 601 u32 len; 602 u32 advice; 603}; 604 605struct io_madvise { 606 struct file *file; 607 u64 addr; 608 u32 len; 609 u32 advice; 610}; 611 612struct io_epoll { 613 struct file *file; 614 int epfd; 615 int op; 616 int fd; 617 struct epoll_event event; 618}; 619 620struct io_splice { 621 struct file *file_out; 622 struct file *file_in; 623 loff_t off_out; 624 loff_t off_in; 625 u64 len; 626 unsigned int flags; 627}; 628 629struct io_provide_buf { 630 struct file *file; 631 __u64 addr; 632 __u32 len; 633 __u32 bgid; 634 __u16 nbufs; 635 __u16 bid; 636}; 637 638struct io_statx { 639 struct file *file; 640 int dfd; 641 unsigned int mask; 642 unsigned int flags; 643 const char __user *filename; 644 struct statx __user *buffer; 645}; 646 647struct io_shutdown { 648 struct file *file; 649 int how; 650}; 651 652struct io_rename { 653 struct file *file; 654 int old_dfd; 655 int new_dfd; 656 struct filename *oldpath; 657 struct filename *newpath; 658 int flags; 659}; 660 661struct io_unlink { 662 struct file *file; 663 int dfd; 664 int flags; 665 struct filename *filename; 666}; 667 668struct io_mkdir { 669 struct file *file; 670 int dfd; 671 umode_t mode; 672 struct filename *filename; 673}; 674 675struct io_symlink { 676 struct file *file; 677 int new_dfd; 678 struct filename *oldpath; 679 struct filename *newpath; 680}; 681 682struct io_hardlink { 683 struct file *file; 684 int old_dfd; 685 int new_dfd; 686 struct filename *oldpath; 687 struct filename *newpath; 688 int flags; 689}; 690 691struct io_async_connect { 692 struct sockaddr_storage address; 693}; 694 695struct io_async_msghdr { 696 struct iovec fast_iov[UIO_FASTIOV]; 697 /* points to an allocated iov, if NULL we use fast_iov instead */ 698 struct iovec *free_iov; 699 struct sockaddr __user *uaddr; 700 struct msghdr msg; 701 struct sockaddr_storage addr; 702}; 703 704struct io_rw_state { 705 struct iov_iter iter; 706 struct iov_iter_state iter_state; 707 struct iovec fast_iov[UIO_FASTIOV]; 708}; 709 710struct io_async_rw { 711 struct io_rw_state s; 712 const struct iovec *free_iovec; 713 size_t bytes_done; 714 struct wait_page_queue wpq; 715}; 716 717enum { 718 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, 719 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, 720 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, 721 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, 722 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, 723 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, 724 725 /* first byte is taken by user flags, shift it to not overlap */ 726 REQ_F_FAIL_BIT = 8, 727 REQ_F_INFLIGHT_BIT, 728 REQ_F_CUR_POS_BIT, 729 REQ_F_NOWAIT_BIT, 730 REQ_F_LINK_TIMEOUT_BIT, 731 REQ_F_NEED_CLEANUP_BIT, 732 REQ_F_POLLED_BIT, 733 REQ_F_BUFFER_SELECTED_BIT, 734 REQ_F_COMPLETE_INLINE_BIT, 735 REQ_F_REISSUE_BIT, 736 REQ_F_CREDS_BIT, 737 REQ_F_REFCOUNT_BIT, 738 REQ_F_ARM_LTIMEOUT_BIT, 739 REQ_F_ASYNC_DATA_BIT, 740 /* keep async read/write and isreg together and in order */ 741 REQ_F_SUPPORT_NOWAIT_BIT, 742 REQ_F_ISREG_BIT, 743 744 /* not a real bit, just to check we're not overflowing the space */ 745 __REQ_F_LAST_BIT, 746}; 747 748enum { 749 /* ctx owns file */ 750 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), 751 /* drain existing IO first */ 752 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), 753 /* linked sqes */ 754 REQ_F_LINK = BIT(REQ_F_LINK_BIT), 755 /* doesn't sever on completion < 0 */ 756 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), 757 /* IOSQE_ASYNC */ 758 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), 759 /* IOSQE_BUFFER_SELECT */ 760 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), 761 762 /* fail rest of links */ 763 REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), 764 /* on inflight list, should be cancelled and waited on exit reliably */ 765 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), 766 /* read/write uses file position */ 767 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), 768 /* must not punt to workers */ 769 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), 770 /* has or had linked timeout */ 771 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), 772 /* needs cleanup */ 773 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), 774 /* already went through poll handler */ 775 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), 776 /* buffer already selected */ 777 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), 778 /* completion is deferred through io_comp_state */ 779 REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), 780 /* caller should reissue async */ 781 REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), 782 /* supports async reads/writes */ 783 REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), 784 /* regular file */ 785 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), 786 /* has creds assigned */ 787 REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), 788 /* skip refcounting if not set */ 789 REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), 790 /* there is a linked timeout that has to be armed */ 791 REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), 792 /* ->async_data allocated */ 793 REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), 794}; 795 796struct async_poll { 797 struct io_poll_iocb poll; 798 struct io_poll_iocb *double_poll; 799}; 800 801typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); 802 803struct io_task_work { 804 union { 805 struct io_wq_work_node node; 806 struct llist_node fallback_node; 807 }; 808 io_req_tw_func_t func; 809}; 810 811enum { 812 IORING_RSRC_FILE = 0, 813 IORING_RSRC_BUFFER = 1, 814}; 815 816/* 817 * NOTE! Each of the iocb union members has the file pointer 818 * as the first entry in their struct definition. So you can 819 * access the file pointer through any of the sub-structs, 820 * or directly as just 'ki_filp' in this struct. 821 */ 822struct io_kiocb { 823 union { 824 struct file *file; 825 struct io_rw rw; 826 struct io_poll_iocb poll; 827 struct io_poll_update poll_update; 828 struct io_accept accept; 829 struct io_sync sync; 830 struct io_cancel cancel; 831 struct io_timeout timeout; 832 struct io_timeout_rem timeout_rem; 833 struct io_connect connect; 834 struct io_sr_msg sr_msg; 835 struct io_open open; 836 struct io_close close; 837 struct io_rsrc_update rsrc_update; 838 struct io_fadvise fadvise; 839 struct io_madvise madvise; 840 struct io_epoll epoll; 841 struct io_splice splice; 842 struct io_provide_buf pbuf; 843 struct io_statx statx; 844 struct io_shutdown shutdown; 845 struct io_rename rename; 846 struct io_unlink unlink; 847 struct io_mkdir mkdir; 848 struct io_symlink symlink; 849 struct io_hardlink hardlink; 850 }; 851 852 u8 opcode; 853 /* polled IO has completed */ 854 u8 iopoll_completed; 855 u16 buf_index; 856 unsigned int flags; 857 858 u64 user_data; 859 u32 result; 860 u32 cflags; 861 862 struct io_ring_ctx *ctx; 863 struct task_struct *task; 864 865 struct percpu_ref *fixed_rsrc_refs; 866 /* store used ubuf, so we can prevent reloading */ 867 struct io_mapped_ubuf *imu; 868 869 /* used by request caches, completion batching and iopoll */ 870 struct io_wq_work_node comp_list; 871 atomic_t refs; 872 struct io_kiocb *link; 873 struct io_task_work io_task_work; 874 /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ 875 struct hlist_node hash_node; 876 /* internal polling, see IORING_FEAT_FAST_POLL */ 877 struct async_poll *apoll; 878 /* opcode allocated if it needs to store data for async defer */ 879 void *async_data; 880 struct io_wq_work work; 881 /* custom credentials, valid IFF REQ_F_CREDS is set */ 882 const struct cred *creds; 883 /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ 884 struct io_buffer *kbuf; 885}; 886 887struct io_tctx_node { 888 struct list_head ctx_node; 889 struct task_struct *task; 890 struct io_ring_ctx *ctx; 891}; 892 893struct io_defer_entry { 894 struct list_head list; 895 struct io_kiocb *req; 896 u32 seq; 897}; 898 899struct io_op_def { 900 /* needs req->file assigned */ 901 unsigned needs_file : 1; 902 /* should block plug */ 903 unsigned plug : 1; 904 /* hash wq insertion if file is a regular file */ 905 unsigned hash_reg_file : 1; 906 /* unbound wq insertion if file is a non-regular file */ 907 unsigned unbound_nonreg_file : 1; 908 /* set if opcode supports polled "wait" */ 909 unsigned pollin : 1; 910 unsigned pollout : 1; 911 /* op supports buffer selection */ 912 unsigned buffer_select : 1; 913 /* do prep async if is going to be punted */ 914 unsigned needs_async_setup : 1; 915 /* opcode is not supported by this kernel */ 916 unsigned not_supported : 1; 917 /* skip auditing */ 918 unsigned audit_skip : 1; 919 /* size of async data needed, if any */ 920 unsigned short async_size; 921}; 922 923static const struct io_op_def io_op_defs[] = { 924 [IORING_OP_NOP] = {}, 925 [IORING_OP_READV] = { 926 .needs_file = 1, 927 .unbound_nonreg_file = 1, 928 .pollin = 1, 929 .buffer_select = 1, 930 .needs_async_setup = 1, 931 .plug = 1, 932 .audit_skip = 1, 933 .async_size = sizeof(struct io_async_rw), 934 }, 935 [IORING_OP_WRITEV] = { 936 .needs_file = 1, 937 .hash_reg_file = 1, 938 .unbound_nonreg_file = 1, 939 .pollout = 1, 940 .needs_async_setup = 1, 941 .plug = 1, 942 .audit_skip = 1, 943 .async_size = sizeof(struct io_async_rw), 944 }, 945 [IORING_OP_FSYNC] = { 946 .needs_file = 1, 947 .audit_skip = 1, 948 }, 949 [IORING_OP_READ_FIXED] = { 950 .needs_file = 1, 951 .unbound_nonreg_file = 1, 952 .pollin = 1, 953 .plug = 1, 954 .audit_skip = 1, 955 .async_size = sizeof(struct io_async_rw), 956 }, 957 [IORING_OP_WRITE_FIXED] = { 958 .needs_file = 1, 959 .hash_reg_file = 1, 960 .unbound_nonreg_file = 1, 961 .pollout = 1, 962 .plug = 1, 963 .audit_skip = 1, 964 .async_size = sizeof(struct io_async_rw), 965 }, 966 [IORING_OP_POLL_ADD] = { 967 .needs_file = 1, 968 .unbound_nonreg_file = 1, 969 .audit_skip = 1, 970 }, 971 [IORING_OP_POLL_REMOVE] = { 972 .audit_skip = 1, 973 }, 974 [IORING_OP_SYNC_FILE_RANGE] = { 975 .needs_file = 1, 976 .audit_skip = 1, 977 }, 978 [IORING_OP_SENDMSG] = { 979 .needs_file = 1, 980 .unbound_nonreg_file = 1, 981 .pollout = 1, 982 .needs_async_setup = 1, 983 .async_size = sizeof(struct io_async_msghdr), 984 }, 985 [IORING_OP_RECVMSG] = { 986 .needs_file = 1, 987 .unbound_nonreg_file = 1, 988 .pollin = 1, 989 .buffer_select = 1, 990 .needs_async_setup = 1, 991 .async_size = sizeof(struct io_async_msghdr), 992 }, 993 [IORING_OP_TIMEOUT] = { 994 .audit_skip = 1, 995 .async_size = sizeof(struct io_timeout_data), 996 }, 997 [IORING_OP_TIMEOUT_REMOVE] = { 998 /* used by timeout updates' prep() */ 999 .audit_skip = 1, 1000 }, 1001 [IORING_OP_ACCEPT] = { 1002 .needs_file = 1, 1003 .unbound_nonreg_file = 1, 1004 .pollin = 1, 1005 }, 1006 [IORING_OP_ASYNC_CANCEL] = { 1007 .audit_skip = 1, 1008 }, 1009 [IORING_OP_LINK_TIMEOUT] = { 1010 .audit_skip = 1, 1011 .async_size = sizeof(struct io_timeout_data), 1012 }, 1013 [IORING_OP_CONNECT] = { 1014 .needs_file = 1, 1015 .unbound_nonreg_file = 1, 1016 .pollout = 1, 1017 .needs_async_setup = 1, 1018 .async_size = sizeof(struct io_async_connect), 1019 }, 1020 [IORING_OP_FALLOCATE] = { 1021 .needs_file = 1, 1022 }, 1023 [IORING_OP_OPENAT] = {}, 1024 [IORING_OP_CLOSE] = {}, 1025 [IORING_OP_FILES_UPDATE] = { 1026 .audit_skip = 1, 1027 }, 1028 [IORING_OP_STATX] = { 1029 .audit_skip = 1, 1030 }, 1031 [IORING_OP_READ] = { 1032 .needs_file = 1, 1033 .unbound_nonreg_file = 1, 1034 .pollin = 1, 1035 .buffer_select = 1, 1036 .plug = 1, 1037 .audit_skip = 1, 1038 .async_size = sizeof(struct io_async_rw), 1039 }, 1040 [IORING_OP_WRITE] = { 1041 .needs_file = 1, 1042 .hash_reg_file = 1, 1043 .unbound_nonreg_file = 1, 1044 .pollout = 1, 1045 .plug = 1, 1046 .audit_skip = 1, 1047 .async_size = sizeof(struct io_async_rw), 1048 }, 1049 [IORING_OP_FADVISE] = { 1050 .needs_file = 1, 1051 .audit_skip = 1, 1052 }, 1053 [IORING_OP_MADVISE] = {}, 1054 [IORING_OP_SEND] = { 1055 .needs_file = 1, 1056 .unbound_nonreg_file = 1, 1057 .pollout = 1, 1058 .audit_skip = 1, 1059 }, 1060 [IORING_OP_RECV] = { 1061 .needs_file = 1, 1062 .unbound_nonreg_file = 1, 1063 .pollin = 1, 1064 .buffer_select = 1, 1065 .audit_skip = 1, 1066 }, 1067 [IORING_OP_OPENAT2] = { 1068 }, 1069 [IORING_OP_EPOLL_CTL] = { 1070 .unbound_nonreg_file = 1, 1071 .audit_skip = 1, 1072 }, 1073 [IORING_OP_SPLICE] = { 1074 .needs_file = 1, 1075 .hash_reg_file = 1, 1076 .unbound_nonreg_file = 1, 1077 .audit_skip = 1, 1078 }, 1079 [IORING_OP_PROVIDE_BUFFERS] = { 1080 .audit_skip = 1, 1081 }, 1082 [IORING_OP_REMOVE_BUFFERS] = { 1083 .audit_skip = 1, 1084 }, 1085 [IORING_OP_TEE] = { 1086 .needs_file = 1, 1087 .hash_reg_file = 1, 1088 .unbound_nonreg_file = 1, 1089 .audit_skip = 1, 1090 }, 1091 [IORING_OP_SHUTDOWN] = { 1092 .needs_file = 1, 1093 }, 1094 [IORING_OP_RENAMEAT] = {}, 1095 [IORING_OP_UNLINKAT] = {}, 1096 [IORING_OP_MKDIRAT] = {}, 1097 [IORING_OP_SYMLINKAT] = {}, 1098 [IORING_OP_LINKAT] = {}, 1099}; 1100 1101/* requests with any of those set should undergo io_disarm_next() */ 1102#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 1103 1104static bool io_disarm_next(struct io_kiocb *req); 1105static void io_uring_del_tctx_node(unsigned long index); 1106static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 1107 struct task_struct *task, 1108 bool cancel_all); 1109static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 1110 1111static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, 1112 s32 res, u32 cflags); 1113static void io_put_req(struct io_kiocb *req); 1114static void io_put_req_deferred(struct io_kiocb *req); 1115static void io_dismantle_req(struct io_kiocb *req); 1116static void io_queue_linked_timeout(struct io_kiocb *req); 1117static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, 1118 struct io_uring_rsrc_update2 *up, 1119 unsigned nr_args); 1120static void io_clean_op(struct io_kiocb *req); 1121static struct file *io_file_get(struct io_ring_ctx *ctx, 1122 struct io_kiocb *req, int fd, bool fixed); 1123static void __io_queue_sqe(struct io_kiocb *req); 1124static void io_rsrc_put_work(struct work_struct *work); 1125 1126static void io_req_task_queue(struct io_kiocb *req); 1127static void __io_submit_flush_completions(struct io_ring_ctx *ctx); 1128static int io_req_prep_async(struct io_kiocb *req); 1129 1130static int io_install_fixed_file(struct io_kiocb *req, struct file *file, 1131 unsigned int issue_flags, u32 slot_index); 1132static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); 1133 1134static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); 1135 1136static struct kmem_cache *req_cachep; 1137 1138static const struct file_operations io_uring_fops; 1139 1140struct sock *io_uring_get_socket(struct file *file) 1141{ 1142#if defined(CONFIG_UNIX) 1143 if (file->f_op == &io_uring_fops) { 1144 struct io_ring_ctx *ctx = file->private_data; 1145 1146 return ctx->ring_sock->sk; 1147 } 1148#endif 1149 return NULL; 1150} 1151EXPORT_SYMBOL(io_uring_get_socket); 1152 1153static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) 1154{ 1155 if (!*locked) { 1156 mutex_lock(&ctx->uring_lock); 1157 *locked = true; 1158 } 1159} 1160 1161#define io_for_each_link(pos, head) \ 1162 for (pos = (head); pos; pos = pos->link) 1163 1164/* 1165 * Shamelessly stolen from the mm implementation of page reference checking, 1166 * see commit f958d7b528b1 for details. 1167 */ 1168#define req_ref_zero_or_close_to_overflow(req) \ 1169 ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) 1170 1171static inline bool req_ref_inc_not_zero(struct io_kiocb *req) 1172{ 1173 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 1174 return atomic_inc_not_zero(&req->refs); 1175} 1176 1177static inline bool req_ref_put_and_test(struct io_kiocb *req) 1178{ 1179 if (likely(!(req->flags & REQ_F_REFCOUNT))) 1180 return true; 1181 1182 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 1183 return atomic_dec_and_test(&req->refs); 1184} 1185 1186static inline void req_ref_put(struct io_kiocb *req) 1187{ 1188 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 1189 WARN_ON_ONCE(req_ref_put_and_test(req)); 1190} 1191 1192static inline void req_ref_get(struct io_kiocb *req) 1193{ 1194 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 1195 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 1196 atomic_inc(&req->refs); 1197} 1198 1199static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) 1200{ 1201 if (!wq_list_empty(&ctx->submit_state.compl_reqs)) 1202 __io_submit_flush_completions(ctx); 1203} 1204 1205static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) 1206{ 1207 if (!(req->flags & REQ_F_REFCOUNT)) { 1208 req->flags |= REQ_F_REFCOUNT; 1209 atomic_set(&req->refs, nr); 1210 } 1211} 1212 1213static inline void io_req_set_refcount(struct io_kiocb *req) 1214{ 1215 __io_req_set_refcount(req, 1); 1216} 1217 1218#define IO_RSRC_REF_BATCH 100 1219 1220static inline void io_req_put_rsrc_locked(struct io_kiocb *req, 1221 struct io_ring_ctx *ctx) 1222 __must_hold(&ctx->uring_lock) 1223{ 1224 struct percpu_ref *ref = req->fixed_rsrc_refs; 1225 1226 if (ref) { 1227 if (ref == &ctx->rsrc_node->refs) 1228 ctx->rsrc_cached_refs++; 1229 else 1230 percpu_ref_put(ref); 1231 } 1232} 1233 1234static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) 1235{ 1236 if (req->fixed_rsrc_refs) 1237 percpu_ref_put(req->fixed_rsrc_refs); 1238} 1239 1240static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) 1241 __must_hold(&ctx->uring_lock) 1242{ 1243 if (ctx->rsrc_cached_refs) { 1244 percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs); 1245 ctx->rsrc_cached_refs = 0; 1246 } 1247} 1248 1249static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) 1250 __must_hold(&ctx->uring_lock) 1251{ 1252 ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; 1253 percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); 1254} 1255 1256static inline void io_req_set_rsrc_node(struct io_kiocb *req, 1257 struct io_ring_ctx *ctx) 1258{ 1259 if (!req->fixed_rsrc_refs) { 1260 req->fixed_rsrc_refs = &ctx->rsrc_node->refs; 1261 ctx->rsrc_cached_refs--; 1262 if (unlikely(ctx->rsrc_cached_refs < 0)) 1263 io_rsrc_refs_refill(ctx); 1264 } 1265} 1266 1267static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) 1268{ 1269 bool got = percpu_ref_tryget(ref); 1270 1271 /* already at zero, wait for ->release() */ 1272 if (!got) 1273 wait_for_completion(compl); 1274 percpu_ref_resurrect(ref); 1275 if (got) 1276 percpu_ref_put(ref); 1277} 1278 1279static bool io_match_task(struct io_kiocb *head, struct task_struct *task, 1280 bool cancel_all) 1281{ 1282 struct io_kiocb *req; 1283 1284 if (task && head->task != task) 1285 return false; 1286 if (cancel_all) 1287 return true; 1288 1289 io_for_each_link(req, head) { 1290 if (req->flags & REQ_F_INFLIGHT) 1291 return true; 1292 } 1293 return false; 1294} 1295 1296static inline bool req_has_async_data(struct io_kiocb *req) 1297{ 1298 return req->flags & REQ_F_ASYNC_DATA; 1299} 1300 1301static inline void req_set_fail(struct io_kiocb *req) 1302{ 1303 req->flags |= REQ_F_FAIL; 1304} 1305 1306static inline void req_fail_link_node(struct io_kiocb *req, int res) 1307{ 1308 req_set_fail(req); 1309 req->result = res; 1310} 1311 1312static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) 1313{ 1314 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); 1315 1316 complete(&ctx->ref_comp); 1317} 1318 1319static inline bool io_is_timeout_noseq(struct io_kiocb *req) 1320{ 1321 return !req->timeout.off; 1322} 1323 1324static __cold void io_fallback_req_func(struct work_struct *work) 1325{ 1326 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, 1327 fallback_work.work); 1328 struct llist_node *node = llist_del_all(&ctx->fallback_llist); 1329 struct io_kiocb *req, *tmp; 1330 bool locked = false; 1331 1332 percpu_ref_get(&ctx->refs); 1333 llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) 1334 req->io_task_work.func(req, &locked); 1335 1336 if (locked) { 1337 io_submit_flush_completions(ctx); 1338 mutex_unlock(&ctx->uring_lock); 1339 } 1340 percpu_ref_put(&ctx->refs); 1341} 1342 1343static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) 1344{ 1345 struct io_ring_ctx *ctx; 1346 int hash_bits; 1347 1348 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1349 if (!ctx) 1350 return NULL; 1351 1352 /* 1353 * Use 5 bits less than the max cq entries, that should give us around 1354 * 32 entries per hash list if totally full and uniformly spread. 1355 */ 1356 hash_bits = ilog2(p->cq_entries); 1357 hash_bits -= 5; 1358 if (hash_bits <= 0) 1359 hash_bits = 1; 1360 ctx->cancel_hash_bits = hash_bits; 1361 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), 1362 GFP_KERNEL); 1363 if (!ctx->cancel_hash) 1364 goto err; 1365 __hash_init(ctx->cancel_hash, 1U << hash_bits); 1366 1367 ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); 1368 if (!ctx->dummy_ubuf) 1369 goto err; 1370 /* set invalid range, so io_import_fixed() fails meeting it */ 1371 ctx->dummy_ubuf->ubuf = -1UL; 1372 1373 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 1374 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 1375 goto err; 1376 1377 ctx->flags = p->flags; 1378 init_waitqueue_head(&ctx->sqo_sq_wait); 1379 INIT_LIST_HEAD(&ctx->sqd_list); 1380 INIT_LIST_HEAD(&ctx->cq_overflow_list); 1381 init_completion(&ctx->ref_comp); 1382 xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); 1383 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); 1384 mutex_init(&ctx->uring_lock); 1385 init_waitqueue_head(&ctx->cq_wait); 1386 spin_lock_init(&ctx->completion_lock); 1387 spin_lock_init(&ctx->timeout_lock); 1388 INIT_WQ_LIST(&ctx->iopoll_list); 1389 INIT_LIST_HEAD(&ctx->defer_list); 1390 INIT_LIST_HEAD(&ctx->timeout_list); 1391 INIT_LIST_HEAD(&ctx->ltimeout_list); 1392 spin_lock_init(&ctx->rsrc_ref_lock); 1393 INIT_LIST_HEAD(&ctx->rsrc_ref_list); 1394 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); 1395 init_llist_head(&ctx->rsrc_put_llist); 1396 INIT_LIST_HEAD(&ctx->tctx_list); 1397 ctx->submit_state.free_list.next = NULL; 1398 INIT_WQ_LIST(&ctx->locked_free_list); 1399 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); 1400 INIT_WQ_LIST(&ctx->submit_state.compl_reqs); 1401 return ctx; 1402err: 1403 kfree(ctx->dummy_ubuf); 1404 kfree(ctx->cancel_hash); 1405 kfree(ctx); 1406 return NULL; 1407} 1408 1409static void io_account_cq_overflow(struct io_ring_ctx *ctx) 1410{ 1411 struct io_rings *r = ctx->rings; 1412 1413 WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); 1414 ctx->cq_extra--; 1415} 1416 1417static bool req_need_defer(struct io_kiocb *req, u32 seq) 1418{ 1419 if (unlikely(req->flags & REQ_F_IO_DRAIN)) { 1420 struct io_ring_ctx *ctx = req->ctx; 1421 1422 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; 1423 } 1424 1425 return false; 1426} 1427 1428#define FFS_NOWAIT 0x1UL 1429#define FFS_ISREG 0x2UL 1430#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) 1431 1432static inline bool io_req_ffs_set(struct io_kiocb *req) 1433{ 1434 return req->flags & REQ_F_FIXED_FILE; 1435} 1436 1437static inline void io_req_track_inflight(struct io_kiocb *req) 1438{ 1439 if (!(req->flags & REQ_F_INFLIGHT)) { 1440 req->flags |= REQ_F_INFLIGHT; 1441 atomic_inc(&current->io_uring->inflight_tracked); 1442 } 1443} 1444 1445static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) 1446{ 1447 if (WARN_ON_ONCE(!req->link)) 1448 return NULL; 1449 1450 req->flags &= ~REQ_F_ARM_LTIMEOUT; 1451 req->flags |= REQ_F_LINK_TIMEOUT; 1452 1453 /* linked timeouts should have two refs once prep'ed */ 1454 io_req_set_refcount(req); 1455 __io_req_set_refcount(req->link, 2); 1456 return req->link; 1457} 1458 1459static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) 1460{ 1461 if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) 1462 return NULL; 1463 return __io_prep_linked_timeout(req); 1464} 1465 1466static void io_prep_async_work(struct io_kiocb *req) 1467{ 1468 const struct io_op_def *def = &io_op_defs[req->opcode]; 1469 struct io_ring_ctx *ctx = req->ctx; 1470 1471 if (!(req->flags & REQ_F_CREDS)) { 1472 req->flags |= REQ_F_CREDS; 1473 req->creds = get_current_cred(); 1474 } 1475 1476 req->work.list.next = NULL; 1477 req->work.flags = 0; 1478 if (req->flags & REQ_F_FORCE_ASYNC) 1479 req->work.flags |= IO_WQ_WORK_CONCURRENT; 1480 1481 if (req->flags & REQ_F_ISREG) { 1482 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) 1483 io_wq_hash_work(&req->work, file_inode(req->file)); 1484 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { 1485 if (def->unbound_nonreg_file) 1486 req->work.flags |= IO_WQ_WORK_UNBOUND; 1487 } 1488 1489 switch (req->opcode) { 1490 case IORING_OP_SPLICE: 1491 case IORING_OP_TEE: 1492 if (!S_ISREG(file_inode(req->splice.file_in)->i_mode)) 1493 req->work.flags |= IO_WQ_WORK_UNBOUND; 1494 break; 1495 } 1496} 1497 1498static void io_prep_async_link(struct io_kiocb *req) 1499{ 1500 struct io_kiocb *cur; 1501 1502 if (req->flags & REQ_F_LINK_TIMEOUT) { 1503 struct io_ring_ctx *ctx = req->ctx; 1504 1505 spin_lock(&ctx->completion_lock); 1506 io_for_each_link(cur, req) 1507 io_prep_async_work(cur); 1508 spin_unlock(&ctx->completion_lock); 1509 } else { 1510 io_for_each_link(cur, req) 1511 io_prep_async_work(cur); 1512 } 1513} 1514 1515static inline void io_req_add_compl_list(struct io_kiocb *req) 1516{ 1517 struct io_submit_state *state = &req->ctx->submit_state; 1518 1519 wq_list_add_tail(&req->comp_list, &state->compl_reqs); 1520} 1521 1522static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) 1523{ 1524 struct io_ring_ctx *ctx = req->ctx; 1525 struct io_kiocb *link = io_prep_linked_timeout(req); 1526 struct io_uring_task *tctx = req->task->io_uring; 1527 1528 BUG_ON(!tctx); 1529 BUG_ON(!tctx->io_wq); 1530 1531 /* init ->work of the whole link before punting */ 1532 io_prep_async_link(req); 1533 1534 /* 1535 * Not expected to happen, but if we do have a bug where this _can_ 1536 * happen, catch it here and ensure the request is marked as 1537 * canceled. That will make io-wq go through the usual work cancel 1538 * procedure rather than attempt to run this request (or create a new 1539 * worker for it). 1540 */ 1541 if (WARN_ON_ONCE(!same_thread_group(req->task, current))) 1542 req->work.flags |= IO_WQ_WORK_CANCEL; 1543 1544 trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, 1545 &req->work, req->flags); 1546 io_wq_enqueue(tctx->io_wq, &req->work); 1547 if (link) 1548 io_queue_linked_timeout(link); 1549} 1550 1551static void io_kill_timeout(struct io_kiocb *req, int status) 1552 __must_hold(&req->ctx->completion_lock) 1553 __must_hold(&req->ctx->timeout_lock) 1554{ 1555 struct io_timeout_data *io = req->async_data; 1556 1557 if (hrtimer_try_to_cancel(&io->timer) != -1) { 1558 if (status) 1559 req_set_fail(req); 1560 atomic_set(&req->ctx->cq_timeouts, 1561 atomic_read(&req->ctx->cq_timeouts) + 1); 1562 list_del_init(&req->timeout.list); 1563 io_cqring_fill_event(req->ctx, req->user_data, status, 0); 1564 io_put_req_deferred(req); 1565 } 1566} 1567 1568static __cold void io_queue_deferred(struct io_ring_ctx *ctx) 1569{ 1570 while (!list_empty(&ctx->defer_list)) { 1571 struct io_defer_entry *de = list_first_entry(&ctx->defer_list, 1572 struct io_defer_entry, list); 1573 1574 if (req_need_defer(de->req, de->seq)) 1575 break; 1576 list_del_init(&de->list); 1577 io_req_task_queue(de->req); 1578 kfree(de); 1579 } 1580} 1581 1582static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) 1583 __must_hold(&ctx->completion_lock) 1584{ 1585 u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 1586 1587 spin_lock_irq(&ctx->timeout_lock); 1588 while (!list_empty(&ctx->timeout_list)) { 1589 u32 events_needed, events_got; 1590 struct io_kiocb *req = list_first_entry(&ctx->timeout_list, 1591 struct io_kiocb, timeout.list); 1592 1593 if (io_is_timeout_noseq(req)) 1594 break; 1595 1596 /* 1597 * Since seq can easily wrap around over time, subtract 1598 * the last seq at which timeouts were flushed before comparing. 1599 * Assuming not more than 2^31-1 events have happened since, 1600 * these subtractions won't have wrapped, so we can check if 1601 * target is in [last_seq, current_seq] by comparing the two. 1602 */ 1603 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush; 1604 events_got = seq - ctx->cq_last_tm_flush; 1605 if (events_got < events_needed) 1606 break; 1607 1608 list_del_init(&req->timeout.list); 1609 io_kill_timeout(req, 0); 1610 } 1611 ctx->cq_last_tm_flush = seq; 1612 spin_unlock_irq(&ctx->timeout_lock); 1613} 1614 1615static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx) 1616{ 1617 if (ctx->off_timeout_used) 1618 io_flush_timeouts(ctx); 1619 if (ctx->drain_active) 1620 io_queue_deferred(ctx); 1621} 1622 1623static inline void io_commit_cqring(struct io_ring_ctx *ctx) 1624{ 1625 if (unlikely(ctx->off_timeout_used || ctx->drain_active)) 1626 __io_commit_cqring_flush(ctx); 1627 /* order cqe stores with ring update */ 1628 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); 1629} 1630 1631static inline bool io_sqring_full(struct io_ring_ctx *ctx) 1632{ 1633 struct io_rings *r = ctx->rings; 1634 1635 return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; 1636} 1637 1638static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) 1639{ 1640 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); 1641} 1642 1643static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) 1644{ 1645 struct io_rings *rings = ctx->rings; 1646 unsigned tail, mask = ctx->cq_entries - 1; 1647 1648 /* 1649 * writes to the cq entry need to come after reading head; the 1650 * control dependency is enough as we're using WRITE_ONCE to 1651 * fill the cq entry 1652 */ 1653 if (__io_cqring_events(ctx) == ctx->cq_entries) 1654 return NULL; 1655 1656 tail = ctx->cached_cq_tail++; 1657 return &rings->cqes[tail & mask]; 1658} 1659 1660static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) 1661{ 1662 if (likely(!ctx->cq_ev_fd)) 1663 return false; 1664 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) 1665 return false; 1666 return !ctx->eventfd_async || io_wq_current_is_worker(); 1667} 1668 1669/* 1670 * This should only get called when at least one event has been posted. 1671 * Some applications rely on the eventfd notification count only changing 1672 * IFF a new CQE has been added to the CQ ring. There's no depedency on 1673 * 1:1 relationship between how many times this function is called (and 1674 * hence the eventfd count) and number of CQEs posted to the CQ ring. 1675 */ 1676static void io_cqring_ev_posted(struct io_ring_ctx *ctx) 1677{ 1678 /* 1679 * wake_up_all() may seem excessive, but io_wake_function() and 1680 * io_should_wake() handle the termination of the loop and only 1681 * wake as many waiters as we need to. 1682 */ 1683 if (wq_has_sleeper(&ctx->cq_wait)) 1684 wake_up_all(&ctx->cq_wait); 1685 if (io_should_trigger_evfd(ctx)) 1686 eventfd_signal(ctx->cq_ev_fd, 1); 1687} 1688 1689static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) 1690{ 1691 /* see waitqueue_active() comment */ 1692 smp_mb(); 1693 1694 if (ctx->flags & IORING_SETUP_SQPOLL) { 1695 if (waitqueue_active(&ctx->cq_wait)) 1696 wake_up_all(&ctx->cq_wait); 1697 } 1698 if (io_should_trigger_evfd(ctx)) 1699 eventfd_signal(ctx->cq_ev_fd, 1); 1700} 1701 1702/* Returns true if there are no backlogged entries after the flush */ 1703static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) 1704{ 1705 bool all_flushed, posted; 1706 1707 if (!force && __io_cqring_events(ctx) == ctx->cq_entries) 1708 return false; 1709 1710 posted = false; 1711 spin_lock(&ctx->completion_lock); 1712 while (!list_empty(&ctx->cq_overflow_list)) { 1713 struct io_uring_cqe *cqe = io_get_cqe(ctx); 1714 struct io_overflow_cqe *ocqe; 1715 1716 if (!cqe && !force) 1717 break; 1718 ocqe = list_first_entry(&ctx->cq_overflow_list, 1719 struct io_overflow_cqe, list); 1720 if (cqe) 1721 memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); 1722 else 1723 io_account_cq_overflow(ctx); 1724 1725 posted = true; 1726 list_del(&ocqe->list); 1727 kfree(ocqe); 1728 } 1729 1730 all_flushed = list_empty(&ctx->cq_overflow_list); 1731 if (all_flushed) { 1732 clear_bit(0, &ctx->check_cq_overflow); 1733 WRITE_ONCE(ctx->rings->sq_flags, 1734 ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); 1735 } 1736 1737 if (posted) 1738 io_commit_cqring(ctx); 1739 spin_unlock(&ctx->completion_lock); 1740 if (posted) 1741 io_cqring_ev_posted(ctx); 1742 return all_flushed; 1743} 1744 1745static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) 1746{ 1747 bool ret = true; 1748 1749 if (test_bit(0, &ctx->check_cq_overflow)) { 1750 /* iopoll syncs against uring_lock, not completion_lock */ 1751 if (ctx->flags & IORING_SETUP_IOPOLL) 1752 mutex_lock(&ctx->uring_lock); 1753 ret = __io_cqring_overflow_flush(ctx, false); 1754 if (ctx->flags & IORING_SETUP_IOPOLL) 1755 mutex_unlock(&ctx->uring_lock); 1756 } 1757 1758 return ret; 1759} 1760 1761/* must to be called somewhat shortly after putting a request */ 1762static inline void io_put_task(struct task_struct *task, int nr) 1763{ 1764 struct io_uring_task *tctx = task->io_uring; 1765 1766 if (likely(task == current)) { 1767 tctx->cached_refs += nr; 1768 } else { 1769 percpu_counter_sub(&tctx->inflight, nr); 1770 if (unlikely(atomic_read(&tctx->in_idle))) 1771 wake_up(&tctx->wait); 1772 put_task_struct_many(task, nr); 1773 } 1774} 1775 1776static void io_task_refs_refill(struct io_uring_task *tctx) 1777{ 1778 unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; 1779 1780 percpu_counter_add(&tctx->inflight, refill); 1781 refcount_add(refill, &current->usage); 1782 tctx->cached_refs += refill; 1783} 1784 1785static inline void io_get_task_refs(int nr) 1786{ 1787 struct io_uring_task *tctx = current->io_uring; 1788 1789 tctx->cached_refs -= nr; 1790 if (unlikely(tctx->cached_refs < 0)) 1791 io_task_refs_refill(tctx); 1792} 1793 1794static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, 1795 s32 res, u32 cflags) 1796{ 1797 struct io_overflow_cqe *ocqe; 1798 1799 ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); 1800 if (!ocqe) { 1801 /* 1802 * If we're in ring overflow flush mode, or in task cancel mode, 1803 * or cannot allocate an overflow entry, then we need to drop it 1804 * on the floor. 1805 */ 1806 io_account_cq_overflow(ctx); 1807 return false; 1808 } 1809 if (list_empty(&ctx->cq_overflow_list)) { 1810 set_bit(0, &ctx->check_cq_overflow); 1811 WRITE_ONCE(ctx->rings->sq_flags, 1812 ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); 1813 1814 } 1815 ocqe->cqe.user_data = user_data; 1816 ocqe->cqe.res = res; 1817 ocqe->cqe.flags = cflags; 1818 list_add_tail(&ocqe->list, &ctx->cq_overflow_list); 1819 return true; 1820} 1821 1822static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, 1823 s32 res, u32 cflags) 1824{ 1825 struct io_uring_cqe *cqe; 1826 1827 trace_io_uring_complete(ctx, user_data, res, cflags); 1828 1829 /* 1830 * If we can't get a cq entry, userspace overflowed the 1831 * submission (by quite a lot). Increment the overflow count in 1832 * the ring. 1833 */ 1834 cqe = io_get_cqe(ctx); 1835 if (likely(cqe)) { 1836 WRITE_ONCE(cqe->user_data, user_data); 1837 WRITE_ONCE(cqe->res, res); 1838 WRITE_ONCE(cqe->flags, cflags); 1839 return true; 1840 } 1841 return io_cqring_event_overflow(ctx, user_data, res, cflags); 1842} 1843 1844/* not as hot to bloat with inlining */ 1845static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, 1846 s32 res, u32 cflags) 1847{ 1848 return __io_cqring_fill_event(ctx, user_data, res, cflags); 1849} 1850 1851static void io_req_complete_post(struct io_kiocb *req, s32 res, 1852 u32 cflags) 1853{ 1854 struct io_ring_ctx *ctx = req->ctx; 1855 1856 spin_lock(&ctx->completion_lock); 1857 __io_cqring_fill_event(ctx, req->user_data, res, cflags); 1858 /* 1859 * If we're the last reference to this request, add to our locked 1860 * free_list cache. 1861 */ 1862 if (req_ref_put_and_test(req)) { 1863 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 1864 if (req->flags & IO_DISARM_MASK) 1865 io_disarm_next(req); 1866 if (req->link) { 1867 io_req_task_queue(req->link); 1868 req->link = NULL; 1869 } 1870 } 1871 io_req_put_rsrc(req, ctx); 1872 io_dismantle_req(req); 1873 io_put_task(req->task, 1); 1874 wq_list_add_head(&req->comp_list, &ctx->locked_free_list); 1875 ctx->locked_free_nr++; 1876 } 1877 io_commit_cqring(ctx); 1878 spin_unlock(&ctx->completion_lock); 1879 io_cqring_ev_posted(ctx); 1880} 1881 1882static inline void io_req_complete_state(struct io_kiocb *req, s32 res, 1883 u32 cflags) 1884{ 1885 req->result = res; 1886 req->cflags = cflags; 1887 req->flags |= REQ_F_COMPLETE_INLINE; 1888} 1889 1890static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, 1891 s32 res, u32 cflags) 1892{ 1893 if (issue_flags & IO_URING_F_COMPLETE_DEFER) 1894 io_req_complete_state(req, res, cflags); 1895 else 1896 io_req_complete_post(req, res, cflags); 1897} 1898 1899static inline void io_req_complete(struct io_kiocb *req, s32 res) 1900{ 1901 __io_req_complete(req, 0, res, 0); 1902} 1903 1904static void io_req_complete_failed(struct io_kiocb *req, s32 res) 1905{ 1906 req_set_fail(req); 1907 io_req_complete_post(req, res, 0); 1908} 1909 1910static void io_req_complete_fail_submit(struct io_kiocb *req) 1911{ 1912 /* 1913 * We don't submit, fail them all, for that replace hardlinks with 1914 * normal links. Extra REQ_F_LINK is tolerated. 1915 */ 1916 req->flags &= ~REQ_F_HARDLINK; 1917 req->flags |= REQ_F_LINK; 1918 io_req_complete_failed(req, req->result); 1919} 1920 1921/* 1922 * Don't initialise the fields below on every allocation, but do that in 1923 * advance and keep them valid across allocations. 1924 */ 1925static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) 1926{ 1927 req->ctx = ctx; 1928 req->link = NULL; 1929 req->async_data = NULL; 1930 /* not necessary, but safer to zero */ 1931 req->result = 0; 1932} 1933 1934static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, 1935 struct io_submit_state *state) 1936{ 1937 spin_lock(&ctx->completion_lock); 1938 wq_list_splice(&ctx->locked_free_list, &state->free_list); 1939 ctx->locked_free_nr = 0; 1940 spin_unlock(&ctx->completion_lock); 1941} 1942 1943/* Returns true IFF there are requests in the cache */ 1944static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) 1945{ 1946 struct io_submit_state *state = &ctx->submit_state; 1947 1948 /* 1949 * If we have more than a batch's worth of requests in our IRQ side 1950 * locked cache, grab the lock and move them over to our submission 1951 * side cache. 1952 */ 1953 if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) 1954 io_flush_cached_locked_reqs(ctx, state); 1955 return !!state->free_list.next; 1956} 1957 1958/* 1959 * A request might get retired back into the request caches even before opcode 1960 * handlers and io_issue_sqe() are done with it, e.g. inline completion path. 1961 * Because of that, io_alloc_req() should be called only under ->uring_lock 1962 * and with extra caution to not get a request that is still worked on. 1963 */ 1964static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) 1965 __must_hold(&ctx->uring_lock) 1966{ 1967 struct io_submit_state *state = &ctx->submit_state; 1968 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 1969 void *reqs[IO_REQ_ALLOC_BATCH]; 1970 struct io_kiocb *req; 1971 int ret, i; 1972 1973 if (likely(state->free_list.next || io_flush_cached_reqs(ctx))) 1974 return true; 1975 1976 ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); 1977 1978 /* 1979 * Bulk alloc is all-or-nothing. If we fail to get a batch, 1980 * retry single alloc to be on the safe side. 1981 */ 1982 if (unlikely(ret <= 0)) { 1983 reqs[0] = kmem_cache_alloc(req_cachep, gfp); 1984 if (!reqs[0]) 1985 return false; 1986 ret = 1; 1987 } 1988 1989 percpu_ref_get_many(&ctx->refs, ret); 1990 for (i = 0; i < ret; i++) { 1991 req = reqs[i]; 1992 1993 io_preinit_req(req, ctx); 1994 wq_stack_add_head(&req->comp_list, &state->free_list); 1995 } 1996 return true; 1997} 1998 1999static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) 2000{ 2001 if (unlikely(!ctx->submit_state.free_list.next)) 2002 return __io_alloc_req_refill(ctx); 2003 return true; 2004} 2005 2006static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) 2007{ 2008 struct io_wq_work_node *node; 2009 2010 node = wq_stack_extract(&ctx->submit_state.free_list); 2011 return container_of(node, struct io_kiocb, comp_list); 2012} 2013 2014static inline void io_put_file(struct file *file) 2015{ 2016 if (file) 2017 fput(file); 2018} 2019 2020static inline void io_dismantle_req(struct io_kiocb *req) 2021{ 2022 unsigned int flags = req->flags; 2023 2024 if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) 2025 io_clean_op(req); 2026 if (!(flags & REQ_F_FIXED_FILE)) 2027 io_put_file(req->file); 2028} 2029 2030static __cold void __io_free_req(struct io_kiocb *req) 2031{ 2032 struct io_ring_ctx *ctx = req->ctx; 2033 2034 io_req_put_rsrc(req, ctx); 2035 io_dismantle_req(req); 2036 io_put_task(req->task, 1); 2037 2038 spin_lock(&ctx->completion_lock); 2039 wq_list_add_head(&req->comp_list, &ctx->locked_free_list); 2040 ctx->locked_free_nr++; 2041 spin_unlock(&ctx->completion_lock); 2042} 2043 2044static inline void io_remove_next_linked(struct io_kiocb *req) 2045{ 2046 struct io_kiocb *nxt = req->link; 2047 2048 req->link = nxt->link; 2049 nxt->link = NULL; 2050} 2051 2052static bool io_kill_linked_timeout(struct io_kiocb *req) 2053 __must_hold(&req->ctx->completion_lock) 2054 __must_hold(&req->ctx->timeout_lock) 2055{ 2056 struct io_kiocb *link = req->link; 2057 2058 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { 2059 struct io_timeout_data *io = link->async_data; 2060 2061 io_remove_next_linked(req); 2062 link->timeout.head = NULL; 2063 if (hrtimer_try_to_cancel(&io->timer) != -1) { 2064 list_del(&link->timeout.list); 2065 io_cqring_fill_event(link->ctx, link->user_data, 2066 -ECANCELED, 0); 2067 io_put_req_deferred(link); 2068 return true; 2069 } 2070 } 2071 return false; 2072} 2073 2074static void io_fail_links(struct io_kiocb *req) 2075 __must_hold(&req->ctx->completion_lock) 2076{ 2077 struct io_kiocb *nxt, *link = req->link; 2078 2079 req->link = NULL; 2080 while (link) { 2081 long res = -ECANCELED; 2082 2083 if (link->flags & REQ_F_FAIL) 2084 res = link->result; 2085 2086 nxt = link->link; 2087 link->link = NULL; 2088 2089 trace_io_uring_fail_link(req, link); 2090 io_cqring_fill_event(link->ctx, link->user_data, res, 0); 2091 io_put_req_deferred(link); 2092 link = nxt; 2093 } 2094} 2095 2096static bool io_disarm_next(struct io_kiocb *req) 2097 __must_hold(&req->ctx->completion_lock) 2098{ 2099 bool posted = false; 2100 2101 if (req->flags & REQ_F_ARM_LTIMEOUT) { 2102 struct io_kiocb *link = req->link; 2103 2104 req->flags &= ~REQ_F_ARM_LTIMEOUT; 2105 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { 2106 io_remove_next_linked(req); 2107 io_cqring_fill_event(link->ctx, link->user_data, 2108 -ECANCELED, 0); 2109 io_put_req_deferred(link); 2110 posted = true; 2111 } 2112 } else if (req->flags & REQ_F_LINK_TIMEOUT) { 2113 struct io_ring_ctx *ctx = req->ctx; 2114 2115 spin_lock_irq(&ctx->timeout_lock); 2116 posted = io_kill_linked_timeout(req); 2117 spin_unlock_irq(&ctx->timeout_lock); 2118 } 2119 if (unlikely((req->flags & REQ_F_FAIL) && 2120 !(req->flags & REQ_F_HARDLINK))) { 2121 posted |= (req->link != NULL); 2122 io_fail_links(req); 2123 } 2124 return posted; 2125} 2126 2127static void __io_req_find_next_prep(struct io_kiocb *req) 2128{ 2129 struct io_ring_ctx *ctx = req->ctx; 2130 bool posted; 2131 2132 spin_lock(&ctx->completion_lock); 2133 posted = io_disarm_next(req); 2134 if (posted) 2135 io_commit_cqring(req->ctx); 2136 spin_unlock(&ctx->completion_lock); 2137 if (posted) 2138 io_cqring_ev_posted(ctx); 2139} 2140 2141static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) 2142{ 2143 struct io_kiocb *nxt; 2144 2145 if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) 2146 return NULL; 2147 /* 2148 * If LINK is set, we have dependent requests in this chain. If we 2149 * didn't fail this request, queue the first one up, moving any other 2150 * dependencies to the next request. In case of failure, fail the rest 2151 * of the chain. 2152 */ 2153 if (unlikely(req->flags & IO_DISARM_MASK)) 2154 __io_req_find_next_prep(req); 2155 nxt = req->link; 2156 req->link = NULL; 2157 return nxt; 2158} 2159 2160static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) 2161{ 2162 if (!ctx) 2163 return; 2164 if (*locked) { 2165 io_submit_flush_completions(ctx); 2166 mutex_unlock(&ctx->uring_lock); 2167 *locked = false; 2168 } 2169 percpu_ref_put(&ctx->refs); 2170} 2171 2172static void tctx_task_work(struct callback_head *cb) 2173{ 2174 bool locked = false; 2175 struct io_ring_ctx *ctx = NULL; 2176 struct io_uring_task *tctx = container_of(cb, struct io_uring_task, 2177 task_work); 2178 2179 while (1) { 2180 struct io_wq_work_node *node; 2181 2182 if (!tctx->task_list.first && locked) 2183 io_submit_flush_completions(ctx); 2184 2185 spin_lock_irq(&tctx->task_lock); 2186 node = tctx->task_list.first; 2187 INIT_WQ_LIST(&tctx->task_list); 2188 if (!node) 2189 tctx->task_running = false; 2190 spin_unlock_irq(&tctx->task_lock); 2191 if (!node) 2192 break; 2193 2194 do { 2195 struct io_wq_work_node *next = node->next; 2196 struct io_kiocb *req = container_of(node, struct io_kiocb, 2197 io_task_work.node); 2198 2199 if (req->ctx != ctx) { 2200 ctx_flush_and_put(ctx, &locked); 2201 ctx = req->ctx; 2202 /* if not contended, grab and improve batching */ 2203 locked = mutex_trylock(&ctx->uring_lock); 2204 percpu_ref_get(&ctx->refs); 2205 } 2206 req->io_task_work.func(req, &locked); 2207 node = next; 2208 } while (node); 2209 2210 cond_resched(); 2211 } 2212 2213 ctx_flush_and_put(ctx, &locked); 2214} 2215 2216static void io_req_task_work_add(struct io_kiocb *req) 2217{ 2218 struct task_struct *tsk = req->task; 2219 struct io_uring_task *tctx = tsk->io_uring; 2220 enum task_work_notify_mode notify; 2221 struct io_wq_work_node *node; 2222 unsigned long flags; 2223 bool running; 2224 2225 WARN_ON_ONCE(!tctx); 2226 2227 spin_lock_irqsave(&tctx->task_lock, flags); 2228 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); 2229 running = tctx->task_running; 2230 if (!running) 2231 tctx->task_running = true; 2232 spin_unlock_irqrestore(&tctx->task_lock, flags); 2233 2234 /* task_work already pending, we're done */ 2235 if (running) 2236 return; 2237 2238 /* 2239 * SQPOLL kernel thread doesn't need notification, just a wakeup. For 2240 * all other cases, use TWA_SIGNAL unconditionally to ensure we're 2241 * processing task_work. There's no reliable way to tell if TWA_RESUME 2242 * will do the job. 2243 */ 2244 notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; 2245 if (likely(!task_work_add(tsk, &tctx->task_work, notify))) { 2246 if (notify == TWA_NONE) 2247 wake_up_process(tsk); 2248 return; 2249 } 2250 2251 spin_lock_irqsave(&tctx->task_lock, flags); 2252 tctx->task_running = false; 2253 node = tctx->task_list.first; 2254 INIT_WQ_LIST(&tctx->task_list); 2255 spin_unlock_irqrestore(&tctx->task_lock, flags); 2256 2257 while (node) { 2258 req = container_of(node, struct io_kiocb, io_task_work.node); 2259 node = node->next; 2260 if (llist_add(&req->io_task_work.fallback_node, 2261 &req->ctx->fallback_llist)) 2262 schedule_delayed_work(&req->ctx->fallback_work, 1); 2263 } 2264} 2265 2266static void io_req_task_cancel(struct io_kiocb *req, bool *locked) 2267{ 2268 struct io_ring_ctx *ctx = req->ctx; 2269 2270 /* not needed for normal modes, but SQPOLL depends on it */ 2271 io_tw_lock(ctx, locked); 2272 io_req_complete_failed(req, req->result); 2273} 2274 2275static void io_req_task_submit(struct io_kiocb *req, bool *locked) 2276{ 2277 struct io_ring_ctx *ctx = req->ctx; 2278 2279 io_tw_lock(ctx, locked); 2280 /* req->task == current here, checking PF_EXITING is safe */ 2281 if (likely(!(req->task->flags & PF_EXITING))) 2282 __io_queue_sqe(req); 2283 else 2284 io_req_complete_failed(req, -EFAULT); 2285} 2286 2287static void io_req_task_queue_fail(struct io_kiocb *req, int ret) 2288{ 2289 req->result = ret; 2290 req->io_task_work.func = io_req_task_cancel; 2291 io_req_task_work_add(req); 2292} 2293 2294static void io_req_task_queue(struct io_kiocb *req) 2295{ 2296 req->io_task_work.func = io_req_task_submit; 2297 io_req_task_work_add(req); 2298} 2299 2300static void io_req_task_queue_reissue(struct io_kiocb *req) 2301{ 2302 req->io_task_work.func = io_queue_async_work; 2303 io_req_task_work_add(req); 2304} 2305 2306static inline void io_queue_next(struct io_kiocb *req) 2307{ 2308 struct io_kiocb *nxt = io_req_find_next(req); 2309 2310 if (nxt) 2311 io_req_task_queue(nxt); 2312} 2313 2314static void io_free_req(struct io_kiocb *req) 2315{ 2316 io_queue_next(req); 2317 __io_free_req(req); 2318} 2319 2320static void io_free_req_work(struct io_kiocb *req, bool *locked) 2321{ 2322 io_free_req(req); 2323} 2324 2325static void io_free_batch_list(struct io_ring_ctx *ctx, 2326 struct io_wq_work_node *node) 2327 __must_hold(&ctx->uring_lock) 2328{ 2329 struct task_struct *task = NULL; 2330 int task_refs = 0; 2331 2332 do { 2333 struct io_kiocb *req = container_of(node, struct io_kiocb, 2334 comp_list); 2335 2336 if (unlikely(req->flags & REQ_F_REFCOUNT)) { 2337 node = req->comp_list.next; 2338 if (!req_ref_put_and_test(req)) 2339 continue; 2340 } 2341 2342 io_req_put_rsrc_locked(req, ctx); 2343 io_queue_next(req); 2344 io_dismantle_req(req); 2345 2346 if (req->task != task) { 2347 if (task) 2348 io_put_task(task, task_refs); 2349 task = req->task; 2350 task_refs = 0; 2351 } 2352 task_refs++; 2353 node = req->comp_list.next; 2354 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 2355 } while (node); 2356 2357 if (task) 2358 io_put_task(task, task_refs); 2359} 2360 2361static void __io_submit_flush_completions(struct io_ring_ctx *ctx) 2362 __must_hold(&ctx->uring_lock) 2363{ 2364 struct io_wq_work_node *node, *prev; 2365 struct io_submit_state *state = &ctx->submit_state; 2366 2367 spin_lock(&ctx->completion_lock); 2368 wq_list_for_each(node, prev, &state->compl_reqs) { 2369 struct io_kiocb *req = container_of(node, struct io_kiocb, 2370 comp_list); 2371 2372 __io_cqring_fill_event(ctx, req->user_data, req->result, 2373 req->cflags); 2374 } 2375 io_commit_cqring(ctx); 2376 spin_unlock(&ctx->completion_lock); 2377 io_cqring_ev_posted(ctx); 2378 2379 io_free_batch_list(ctx, state->compl_reqs.first); 2380 INIT_WQ_LIST(&state->compl_reqs); 2381} 2382 2383/* 2384 * Drop reference to request, return next in chain (if there is one) if this 2385 * was the last reference to this request. 2386 */ 2387static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) 2388{ 2389 struct io_kiocb *nxt = NULL; 2390 2391 if (req_ref_put_and_test(req)) { 2392 nxt = io_req_find_next(req); 2393 __io_free_req(req); 2394 } 2395 return nxt; 2396} 2397 2398static inline void io_put_req(struct io_kiocb *req) 2399{ 2400 if (req_ref_put_and_test(req)) 2401 io_free_req(req); 2402} 2403 2404static inline void io_put_req_deferred(struct io_kiocb *req) 2405{ 2406 if (req_ref_put_and_test(req)) { 2407 req->io_task_work.func = io_free_req_work; 2408 io_req_task_work_add(req); 2409 } 2410} 2411 2412static unsigned io_cqring_events(struct io_ring_ctx *ctx) 2413{ 2414 /* See comment at the top of this file */ 2415 smp_rmb(); 2416 return __io_cqring_events(ctx); 2417} 2418 2419static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) 2420{ 2421 struct io_rings *rings = ctx->rings; 2422 2423 /* make sure SQ entry isn't read before tail */ 2424 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; 2425} 2426 2427static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) 2428{ 2429 unsigned int cflags; 2430 2431 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; 2432 cflags |= IORING_CQE_F_BUFFER; 2433 req->flags &= ~REQ_F_BUFFER_SELECTED; 2434 kfree(kbuf); 2435 return cflags; 2436} 2437 2438static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) 2439{ 2440 if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) 2441 return 0; 2442 return io_put_kbuf(req, req->kbuf); 2443} 2444 2445static inline bool io_run_task_work(void) 2446{ 2447 if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) { 2448 __set_current_state(TASK_RUNNING); 2449 tracehook_notify_signal(); 2450 return true; 2451 } 2452 2453 return false; 2454} 2455 2456static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) 2457{ 2458 struct io_wq_work_node *pos, *start, *prev; 2459 unsigned int poll_flags = BLK_POLL_NOSLEEP; 2460 DEFINE_IO_COMP_BATCH(iob); 2461 int nr_events = 0; 2462 2463 /* 2464 * Only spin for completions if we don't have multiple devices hanging 2465 * off our complete list. 2466 */ 2467 if (ctx->poll_multi_queue || force_nonspin) 2468 poll_flags |= BLK_POLL_ONESHOT; 2469 2470 wq_list_for_each(pos, start, &ctx->iopoll_list) { 2471 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); 2472 struct kiocb *kiocb = &req->rw.kiocb; 2473 int ret; 2474 2475 /* 2476 * Move completed and retryable entries to our local lists. 2477 * If we find a request that requires polling, break out 2478 * and complete those lists first, if we have entries there. 2479 */ 2480 if (READ_ONCE(req->iopoll_completed)) 2481 break; 2482 2483 ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags); 2484 if (unlikely(ret < 0)) 2485 return ret; 2486 else if (ret) 2487 poll_flags |= BLK_POLL_ONESHOT; 2488 2489 /* iopoll may have completed current req */ 2490 if (!rq_list_empty(iob.req_list) || 2491 READ_ONCE(req->iopoll_completed)) 2492 break; 2493 } 2494 2495 if (!rq_list_empty(iob.req_list)) 2496 iob.complete(&iob); 2497 else if (!pos) 2498 return 0; 2499 2500 prev = start; 2501 wq_list_for_each_resume(pos, prev) { 2502 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); 2503 2504 /* order with io_complete_rw_iopoll(), e.g. ->result updates */ 2505 if (!smp_load_acquire(&req->iopoll_completed)) 2506 break; 2507 __io_cqring_fill_event(ctx, req->user_data, req->result, 2508 io_put_rw_kbuf(req)); 2509 nr_events++; 2510 } 2511 2512 if (unlikely(!nr_events)) 2513 return 0; 2514 2515 io_commit_cqring(ctx); 2516 io_cqring_ev_posted_iopoll(ctx); 2517 pos = start ? start->next : ctx->iopoll_list.first; 2518 wq_list_cut(&ctx->iopoll_list, prev, start); 2519 io_free_batch_list(ctx, pos); 2520 return nr_events; 2521} 2522 2523/* 2524 * We can't just wait for polled events to come to us, we have to actively 2525 * find and complete them. 2526 */ 2527static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) 2528{ 2529 if (!(ctx->flags & IORING_SETUP_IOPOLL)) 2530 return; 2531 2532 mutex_lock(&ctx->uring_lock); 2533 while (!wq_list_empty(&ctx->iopoll_list)) { 2534 /* let it sleep and repeat later if can't complete a request */ 2535 if (io_do_iopoll(ctx, true) == 0) 2536 break; 2537 /* 2538 * Ensure we allow local-to-the-cpu processing to take place, 2539 * in this case we need to ensure that we reap all events. 2540 * Also let task_work, etc. to progress by releasing the mutex 2541 */ 2542 if (need_resched()) { 2543 mutex_unlock(&ctx->uring_lock); 2544 cond_resched(); 2545 mutex_lock(&ctx->uring_lock); 2546 } 2547 } 2548 mutex_unlock(&ctx->uring_lock); 2549} 2550 2551static int io_iopoll_check(struct io_ring_ctx *ctx, long min) 2552{ 2553 unsigned int nr_events = 0; 2554 int ret = 0; 2555 2556 /* 2557 * We disallow the app entering submit/complete with polling, but we 2558 * still need to lock the ring to prevent racing with polled issue 2559 * that got punted to a workqueue. 2560 */ 2561 mutex_lock(&ctx->uring_lock); 2562 /* 2563 * Don't enter poll loop if we already have events pending. 2564 * If we do, we can potentially be spinning for commands that 2565 * already triggered a CQE (eg in error). 2566 */ 2567 if (test_bit(0, &ctx->check_cq_overflow)) 2568 __io_cqring_overflow_flush(ctx, false); 2569 if (io_cqring_events(ctx)) 2570 goto out; 2571 do { 2572 /* 2573 * If a submit got punted to a workqueue, we can have the 2574 * application entering polling for a command before it gets 2575 * issued. That app will hold the uring_lock for the duration 2576 * of the poll right here, so we need to take a breather every 2577 * now and then to ensure that the issue has a chance to add 2578 * the poll to the issued list. Otherwise we can spin here 2579 * forever, while the workqueue is stuck trying to acquire the 2580 * very same mutex. 2581 */ 2582 if (wq_list_empty(&ctx->iopoll_list)) { 2583 u32 tail = ctx->cached_cq_tail; 2584 2585 mutex_unlock(&ctx->uring_lock); 2586 io_run_task_work(); 2587 mutex_lock(&ctx->uring_lock); 2588 2589 /* some requests don't go through iopoll_list */ 2590 if (tail != ctx->cached_cq_tail || 2591 wq_list_empty(&ctx->iopoll_list)) 2592 break; 2593 } 2594 ret = io_do_iopoll(ctx, !min); 2595 if (ret < 0) 2596 break; 2597 nr_events += ret; 2598 ret = 0; 2599 } while (nr_events < min && !need_resched()); 2600out: 2601 mutex_unlock(&ctx->uring_lock); 2602 return ret; 2603} 2604 2605static void kiocb_end_write(struct io_kiocb *req) 2606{ 2607 /* 2608 * Tell lockdep we inherited freeze protection from submission 2609 * thread. 2610 */ 2611 if (req->flags & REQ_F_ISREG) { 2612 struct super_block *sb = file_inode(req->file)->i_sb; 2613 2614 __sb_writers_acquired(sb, SB_FREEZE_WRITE); 2615 sb_end_write(sb); 2616 } 2617} 2618 2619#ifdef CONFIG_BLOCK 2620static bool io_resubmit_prep(struct io_kiocb *req) 2621{ 2622 struct io_async_rw *rw = req->async_data; 2623 2624 if (!req_has_async_data(req)) 2625 return !io_req_prep_async(req); 2626 iov_iter_restore(&rw->s.iter, &rw->s.iter_state); 2627 return true; 2628} 2629 2630static bool io_rw_should_reissue(struct io_kiocb *req) 2631{ 2632 umode_t mode = file_inode(req->file)->i_mode; 2633 struct io_ring_ctx *ctx = req->ctx; 2634 2635 if (!S_ISBLK(mode) && !S_ISREG(mode)) 2636 return false; 2637 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && 2638 !(ctx->flags & IORING_SETUP_IOPOLL))) 2639 return false; 2640 /* 2641 * If ref is dying, we might be running poll reap from the exit work. 2642 * Don't attempt to reissue from that path, just let it fail with 2643 * -EAGAIN. 2644 */ 2645 if (percpu_ref_is_dying(&ctx->refs)) 2646 return false; 2647 /* 2648 * Play it safe and assume not safe to re-import and reissue if we're 2649 * not in the original thread group (or in task context). 2650 */ 2651 if (!same_thread_group(req->task, current) || !in_task()) 2652 return false; 2653 return true; 2654} 2655#else 2656static bool io_resubmit_prep(struct io_kiocb *req) 2657{ 2658 return false; 2659} 2660static bool io_rw_should_reissue(struct io_kiocb *req) 2661{ 2662 return false; 2663} 2664#endif 2665 2666static bool __io_complete_rw_common(struct io_kiocb *req, long res) 2667{ 2668 if (req->rw.kiocb.ki_flags & IOCB_WRITE) 2669 kiocb_end_write(req); 2670 if (unlikely(res != req->result)) { 2671 if ((res == -EAGAIN || res == -EOPNOTSUPP) && 2672 io_rw_should_reissue(req)) { 2673 req->flags |= REQ_F_REISSUE; 2674 return true; 2675 } 2676 req_set_fail(req); 2677 req->result = res; 2678 } 2679 return false; 2680} 2681 2682static void io_req_task_complete(struct io_kiocb *req, bool *locked) 2683{ 2684 unsigned int cflags = io_put_rw_kbuf(req); 2685 int res = req->result; 2686 2687 if (*locked) { 2688 io_req_complete_state(req, res, cflags); 2689 io_req_add_compl_list(req); 2690 } else { 2691 io_req_complete_post(req, res, cflags); 2692 } 2693} 2694 2695static void __io_complete_rw(struct io_kiocb *req, long res, long res2, 2696 unsigned int issue_flags) 2697{ 2698 if (__io_complete_rw_common(req, res)) 2699 return; 2700 __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req)); 2701} 2702 2703static void io_complete_rw(struct kiocb *kiocb, long res) 2704{ 2705 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2706 2707 if (__io_complete_rw_common(req, res)) 2708 return; 2709 req->result = res; 2710 req->io_task_work.func = io_req_task_complete; 2711 io_req_task_work_add(req); 2712} 2713 2714static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) 2715{ 2716 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2717 2718 if (kiocb->ki_flags & IOCB_WRITE) 2719 kiocb_end_write(req); 2720 if (unlikely(res != req->result)) { 2721 if (res == -EAGAIN && io_rw_should_reissue(req)) { 2722 req->flags |= REQ_F_REISSUE; 2723 return; 2724 } 2725 req->result = res; 2726 } 2727 2728 /* order with io_iopoll_complete() checking ->iopoll_completed */ 2729 smp_store_release(&req->iopoll_completed, 1); 2730} 2731 2732/* 2733 * After the iocb has been issued, it's safe to be found on the poll list. 2734 * Adding the kiocb to the list AFTER submission ensures that we don't 2735 * find it from a io_do_iopoll() thread before the issuer is done 2736 * accessing the kiocb cookie. 2737 */ 2738static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) 2739{ 2740 struct io_ring_ctx *ctx = req->ctx; 2741 const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 2742 2743 /* workqueue context doesn't hold uring_lock, grab it now */ 2744 if (unlikely(needs_lock)) 2745 mutex_lock(&ctx->uring_lock); 2746 2747 /* 2748 * Track whether we have multiple files in our lists. This will impact 2749 * how we do polling eventually, not spinning if we're on potentially 2750 * different devices. 2751 */ 2752 if (wq_list_empty(&ctx->iopoll_list)) { 2753 ctx->poll_multi_queue = false; 2754 } else if (!ctx->poll_multi_queue) { 2755 struct io_kiocb *list_req; 2756 2757 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, 2758 comp_list); 2759 if (list_req->file != req->file) 2760 ctx->poll_multi_queue = true; 2761 } 2762 2763 /* 2764 * For fast devices, IO may have already completed. If it has, add 2765 * it to the front so we find it first. 2766 */ 2767 if (READ_ONCE(req->iopoll_completed)) 2768 wq_list_add_head(&req->comp_list, &ctx->iopoll_list); 2769 else 2770 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); 2771 2772 if (unlikely(needs_lock)) { 2773 /* 2774 * If IORING_SETUP_SQPOLL is enabled, sqes are either handle 2775 * in sq thread task context or in io worker task context. If 2776 * current task context is sq thread, we don't need to check 2777 * whether should wake up sq thread. 2778 */ 2779 if ((ctx->flags & IORING_SETUP_SQPOLL) && 2780 wq_has_sleeper(&ctx->sq_data->wait)) 2781 wake_up(&ctx->sq_data->wait); 2782 2783 mutex_unlock(&ctx->uring_lock); 2784 } 2785} 2786 2787static bool io_bdev_nowait(struct block_device *bdev) 2788{ 2789 return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); 2790} 2791 2792/* 2793 * If we tracked the file through the SCM inflight mechanism, we could support 2794 * any file. For now, just ensure that anything potentially problematic is done 2795 * inline. 2796 */ 2797static bool __io_file_supports_nowait(struct file *file, umode_t mode) 2798{ 2799 if (S_ISBLK(mode)) { 2800 if (IS_ENABLED(CONFIG_BLOCK) && 2801 io_bdev_nowait(I_BDEV(file->f_mapping->host))) 2802 return true; 2803 return false; 2804 } 2805 if (S_ISSOCK(mode)) 2806 return true; 2807 if (S_ISREG(mode)) { 2808 if (IS_ENABLED(CONFIG_BLOCK) && 2809 io_bdev_nowait(file->f_inode->i_sb->s_bdev) && 2810 file->f_op != &io_uring_fops) 2811 return true; 2812 return false; 2813 } 2814 2815 /* any ->read/write should understand O_NONBLOCK */ 2816 if (file->f_flags & O_NONBLOCK) 2817 return true; 2818 return file->f_mode & FMODE_NOWAIT; 2819} 2820 2821/* 2822 * If we tracked the file through the SCM inflight mechanism, we could support 2823 * any file. For now, just ensure that anything potentially problematic is done 2824 * inline. 2825 */ 2826static unsigned int io_file_get_flags(struct file *file) 2827{ 2828 umode_t mode = file_inode(file)->i_mode; 2829 unsigned int res = 0; 2830 2831 if (S_ISREG(mode)) 2832 res |= FFS_ISREG; 2833 if (__io_file_supports_nowait(file, mode)) 2834 res |= FFS_NOWAIT; 2835 return res; 2836} 2837 2838static inline bool io_file_supports_nowait(struct io_kiocb *req) 2839{ 2840 return req->flags & REQ_F_SUPPORT_NOWAIT; 2841} 2842 2843static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2844{ 2845 struct io_ring_ctx *ctx = req->ctx; 2846 struct kiocb *kiocb = &req->rw.kiocb; 2847 struct file *file = req->file; 2848 unsigned ioprio; 2849 int ret; 2850 2851 if (!io_req_ffs_set(req)) 2852 req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; 2853 2854 kiocb->ki_pos = READ_ONCE(sqe->off); 2855 if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) { 2856 req->flags |= REQ_F_CUR_POS; 2857 kiocb->ki_pos = file->f_pos; 2858 } 2859 kiocb->ki_flags = iocb_flags(file); 2860 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); 2861 if (unlikely(ret)) 2862 return ret; 2863 2864 /* 2865 * If the file is marked O_NONBLOCK, still allow retry for it if it 2866 * supports async. Otherwise it's impossible to use O_NONBLOCK files 2867 * reliably. If not, or it IOCB_NOWAIT is set, don't retry. 2868 */ 2869 if ((kiocb->ki_flags & IOCB_NOWAIT) || 2870 ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) 2871 req->flags |= REQ_F_NOWAIT; 2872 2873 if (ctx->flags & IORING_SETUP_IOPOLL) { 2874 if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) 2875 return -EOPNOTSUPP; 2876 2877 kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; 2878 kiocb->ki_complete = io_complete_rw_iopoll; 2879 req->iopoll_completed = 0; 2880 } else { 2881 if (kiocb->ki_flags & IOCB_HIPRI) 2882 return -EINVAL; 2883 kiocb->ki_complete = io_complete_rw; 2884 } 2885 2886 ioprio = READ_ONCE(sqe->ioprio); 2887 if (ioprio) { 2888 ret = ioprio_check_cap(ioprio); 2889 if (ret) 2890 return ret; 2891 2892 kiocb->ki_ioprio = ioprio; 2893 } else { 2894 kiocb->ki_ioprio = get_current_ioprio(); 2895 } 2896 2897 req->imu = NULL; 2898 req->rw.addr = READ_ONCE(sqe->addr); 2899 req->rw.len = READ_ONCE(sqe->len); 2900 req->buf_index = READ_ONCE(sqe->buf_index); 2901 return 0; 2902} 2903 2904static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) 2905{ 2906 switch (ret) { 2907 case -EIOCBQUEUED: 2908 break; 2909 case -ERESTARTSYS: 2910 case -ERESTARTNOINTR: 2911 case -ERESTARTNOHAND: 2912 case -ERESTART_RESTARTBLOCK: 2913 /* 2914 * We can't just restart the syscall, since previously 2915 * submitted sqes may already be in progress. Just fail this 2916 * IO with EINTR. 2917 */ 2918 ret = -EINTR; 2919 fallthrough; 2920 default: 2921 kiocb->ki_complete(kiocb, ret); 2922 } 2923} 2924 2925static void kiocb_done(struct kiocb *kiocb, ssize_t ret, 2926 unsigned int issue_flags) 2927{ 2928 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2929 struct io_async_rw *io = req->async_data; 2930 2931 /* add previously done IO, if any */ 2932 if (req_has_async_data(req) && io->bytes_done > 0) { 2933 if (ret < 0) 2934 ret = io->bytes_done; 2935 else 2936 ret += io->bytes_done; 2937 } 2938 2939 if (req->flags & REQ_F_CUR_POS) 2940 req->file->f_pos = kiocb->ki_pos; 2941 if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) 2942 __io_complete_rw(req, ret, 0, issue_flags); 2943 else 2944 io_rw_done(kiocb, ret); 2945 2946 if (req->flags & REQ_F_REISSUE) { 2947 req->flags &= ~REQ_F_REISSUE; 2948 if (io_resubmit_prep(req)) { 2949 io_req_task_queue_reissue(req); 2950 } else { 2951 unsigned int cflags = io_put_rw_kbuf(req); 2952 struct io_ring_ctx *ctx = req->ctx; 2953 2954 req_set_fail(req); 2955 if (issue_flags & IO_URING_F_UNLOCKED) { 2956 mutex_lock(&ctx->uring_lock); 2957 __io_req_complete(req, issue_flags, ret, cflags); 2958 mutex_unlock(&ctx->uring_lock); 2959 } else { 2960 __io_req_complete(req, issue_flags, ret, cflags); 2961 } 2962 } 2963 } 2964} 2965 2966static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, 2967 struct io_mapped_ubuf *imu) 2968{ 2969 size_t len = req->rw.len; 2970 u64 buf_end, buf_addr = req->rw.addr; 2971 size_t offset; 2972 2973 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) 2974 return -EFAULT; 2975 /* not inside the mapped region */ 2976 if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) 2977 return -EFAULT; 2978 2979 /* 2980 * May not be a start of buffer, set size appropriately 2981 * and advance us to the beginning. 2982 */ 2983 offset = buf_addr - imu->ubuf; 2984 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); 2985 2986 if (offset) { 2987 /* 2988 * Don't use iov_iter_advance() here, as it's really slow for 2989 * using the latter parts of a big fixed buffer - it iterates 2990 * over each segment manually. We can cheat a bit here, because 2991 * we know that: 2992 * 2993 * 1) it's a BVEC iter, we set it up 2994 * 2) all bvecs are PAGE_SIZE in size, except potentially the 2995 * first and last bvec 2996 * 2997 * So just find our index, and adjust the iterator afterwards. 2998 * If the offset is within the first bvec (or the whole first 2999 * bvec, just use iov_iter_advance(). This makes it easier 3000 * since we can just skip the first segment, which may not 3001 * be PAGE_SIZE aligned. 3002 */ 3003 const struct bio_vec *bvec = imu->bvec; 3004 3005 if (offset <= bvec->bv_len) { 3006 iov_iter_advance(iter, offset); 3007 } else { 3008 unsigned long seg_skip; 3009 3010 /* skip first vec */ 3011 offset -= bvec->bv_len; 3012 seg_skip = 1 + (offset >> PAGE_SHIFT); 3013 3014 iter->bvec = bvec + seg_skip; 3015 iter->nr_segs -= seg_skip; 3016 iter->count -= bvec->bv_len + offset; 3017 iter->iov_offset = offset & ~PAGE_MASK; 3018 } 3019 } 3020 3021 return 0; 3022} 3023 3024static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) 3025{ 3026 struct io_mapped_ubuf *imu = req->imu; 3027 u16 index, buf_index = req->buf_index; 3028 3029 if (likely(!imu)) { 3030 struct io_ring_ctx *ctx = req->ctx; 3031 3032 if (unlikely(buf_index >= ctx->nr_user_bufs)) 3033 return -EFAULT; 3034 io_req_set_rsrc_node(req, ctx); 3035 index = array_index_nospec(buf_index, ctx->nr_user_bufs); 3036 imu = READ_ONCE(ctx->user_bufs[index]); 3037 req->imu = imu; 3038 } 3039 return __io_import_fixed(req, rw, iter, imu); 3040} 3041 3042static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) 3043{ 3044 if (needs_lock) 3045 mutex_unlock(&ctx->uring_lock); 3046} 3047 3048static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) 3049{ 3050 /* 3051 * "Normal" inline submissions always hold the uring_lock, since we 3052 * grab it from the system call. Same is true for the SQPOLL offload. 3053 * The only exception is when we've detached the request and issue it 3054 * from an async worker thread, grab the lock for that case. 3055 */ 3056 if (needs_lock) 3057 mutex_lock(&ctx->uring_lock); 3058} 3059 3060static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, 3061 int bgid, unsigned int issue_flags) 3062{ 3063 struct io_buffer *kbuf = req->kbuf; 3064 struct io_buffer *head; 3065 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 3066 3067 if (req->flags & REQ_F_BUFFER_SELECTED) 3068 return kbuf; 3069 3070 io_ring_submit_lock(req->ctx, needs_lock); 3071 3072 lockdep_assert_held(&req->ctx->uring_lock); 3073 3074 head = xa_load(&req->ctx->io_buffers, bgid); 3075 if (head) { 3076 if (!list_empty(&head->list)) { 3077 kbuf = list_last_entry(&head->list, struct io_buffer, 3078 list); 3079 list_del(&kbuf->list); 3080 } else { 3081 kbuf = head; 3082 xa_erase(&req->ctx->io_buffers, bgid); 3083 } 3084 if (*len > kbuf->len) 3085 *len = kbuf->len; 3086 req->flags |= REQ_F_BUFFER_SELECTED; 3087 req->kbuf = kbuf; 3088 } else { 3089 kbuf = ERR_PTR(-ENOBUFS); 3090 } 3091 3092 io_ring_submit_unlock(req->ctx, needs_lock); 3093 return kbuf; 3094} 3095 3096static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, 3097 unsigned int issue_flags) 3098{ 3099 struct io_buffer *kbuf; 3100 u16 bgid; 3101 3102 bgid = req->buf_index; 3103 kbuf = io_buffer_select(req, len, bgid, issue_flags); 3104 if (IS_ERR(kbuf)) 3105 return kbuf; 3106 return u64_to_user_ptr(kbuf->addr); 3107} 3108 3109#ifdef CONFIG_COMPAT 3110static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, 3111 unsigned int issue_flags) 3112{ 3113 struct compat_iovec __user *uiov; 3114 compat_ssize_t clen; 3115 void __user *buf; 3116 ssize_t len; 3117 3118 uiov = u64_to_user_ptr(req->rw.addr); 3119 if (!access_ok(uiov, sizeof(*uiov))) 3120 return -EFAULT; 3121 if (__get_user(clen, &uiov->iov_len)) 3122 return -EFAULT; 3123 if (clen < 0) 3124 return -EINVAL; 3125 3126 len = clen; 3127 buf = io_rw_buffer_select(req, &len, issue_flags); 3128 if (IS_ERR(buf)) 3129 return PTR_ERR(buf); 3130 iov[0].iov_base = buf; 3131 iov[0].iov_len = (compat_size_t) len; 3132 return 0; 3133} 3134#endif 3135 3136static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 3137 unsigned int issue_flags) 3138{ 3139 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); 3140 void __user *buf; 3141 ssize_t len; 3142 3143 if (copy_from_user(iov, uiov, sizeof(*uiov))) 3144 return -EFAULT; 3145 3146 len = iov[0].iov_len; 3147 if (len < 0) 3148 return -EINVAL; 3149 buf = io_rw_buffer_select(req, &len, issue_flags); 3150 if (IS_ERR(buf)) 3151 return PTR_ERR(buf); 3152 iov[0].iov_base = buf; 3153 iov[0].iov_len = len; 3154 return 0; 3155} 3156 3157static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 3158 unsigned int issue_flags) 3159{ 3160 if (req->flags & REQ_F_BUFFER_SELECTED) { 3161 struct io_buffer *kbuf = req->kbuf; 3162 3163 iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 3164 iov[0].iov_len = kbuf->len; 3165 return 0; 3166 } 3167 if (req->rw.len != 1) 3168 return -EINVAL; 3169 3170#ifdef CONFIG_COMPAT 3171 if (req->ctx->compat) 3172 return io_compat_import(req, iov, issue_flags); 3173#endif 3174 3175 return __io_iov_buffer_select(req, iov, issue_flags); 3176} 3177 3178static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, 3179 struct io_rw_state *s, 3180 unsigned int issue_flags) 3181{ 3182 struct iov_iter *iter = &s->iter; 3183 u8 opcode = req->opcode; 3184 struct iovec *iovec; 3185 void __user *buf; 3186 size_t sqe_len; 3187 ssize_t ret; 3188 3189 BUILD_BUG_ON(ERR_PTR(0) != NULL); 3190 3191 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) 3192 return ERR_PTR(io_import_fixed(req, rw, iter)); 3193 3194 /* buffer index only valid with fixed read/write, or buffer select */ 3195 if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) 3196 return ERR_PTR(-EINVAL); 3197 3198 buf = u64_to_user_ptr(req->rw.addr); 3199 sqe_len = req->rw.len; 3200 3201 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { 3202 if (req->flags & REQ_F_BUFFER_SELECT) { 3203 buf = io_rw_buffer_select(req, &sqe_len, issue_flags); 3204 if (IS_ERR(buf)) 3205 return ERR_CAST(buf); 3206 req->rw.len = sqe_len; 3207 } 3208 3209 ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter); 3210 return ERR_PTR(ret); 3211 } 3212 3213 iovec = s->fast_iov; 3214 if (req->flags & REQ_F_BUFFER_SELECT) { 3215 ret = io_iov_buffer_select(req, iovec, issue_flags); 3216 if (!ret) 3217 iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); 3218 return ERR_PTR(ret); 3219 } 3220 3221 ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter, 3222 req->ctx->compat); 3223 if (unlikely(ret < 0)) 3224 return ERR_PTR(ret); 3225 return iovec; 3226} 3227 3228static inline int io_import_iovec(int rw, struct io_kiocb *req, 3229 struct iovec **iovec, struct io_rw_state *s, 3230 unsigned int issue_flags) 3231{ 3232 *iovec = __io_import_iovec(rw, req, s, issue_flags); 3233 if (unlikely(IS_ERR(*iovec))) 3234 return PTR_ERR(*iovec); 3235 3236 iov_iter_save_state(&s->iter, &s->iter_state); 3237 return 0; 3238} 3239 3240static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) 3241{ 3242 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; 3243} 3244 3245/* 3246 * For files that don't have ->read_iter() and ->write_iter(), handle them 3247 * by looping over ->read() or ->write() manually. 3248 */ 3249static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) 3250{ 3251 struct kiocb *kiocb = &req->rw.kiocb; 3252 struct file *file = req->file; 3253 ssize_t ret = 0; 3254 3255 /* 3256 * Don't support polled IO through this interface, and we can't 3257 * support non-blocking either. For the latter, this just causes 3258 * the kiocb to be handled from an async context. 3259 */ 3260 if (kiocb->ki_flags & IOCB_HIPRI) 3261 return -EOPNOTSUPP; 3262 if ((kiocb->ki_flags & IOCB_NOWAIT) && 3263 !(kiocb->ki_filp->f_flags & O_NONBLOCK)) 3264 return -EAGAIN; 3265 3266 while (iov_iter_count(iter)) { 3267 struct iovec iovec; 3268 ssize_t nr; 3269 3270 if (!iov_iter_is_bvec(iter)) { 3271 iovec = iov_iter_iovec(iter); 3272 } else { 3273 iovec.iov_base = u64_to_user_ptr(req->rw.addr); 3274 iovec.iov_len = req->rw.len; 3275 } 3276 3277 if (rw == READ) { 3278 nr = file->f_op->read(file, iovec.iov_base, 3279 iovec.iov_len, io_kiocb_ppos(kiocb)); 3280 } else { 3281 nr = file->f_op->write(file, iovec.iov_base, 3282 iovec.iov_len, io_kiocb_ppos(kiocb)); 3283 } 3284 3285 if (nr < 0) { 3286 if (!ret) 3287 ret = nr; 3288 break; 3289 } 3290 if (!iov_iter_is_bvec(iter)) { 3291 iov_iter_advance(iter, nr); 3292 } else { 3293 req->rw.len -= nr; 3294 req->rw.addr += nr; 3295 } 3296 ret += nr; 3297 if (nr != iovec.iov_len) 3298 break; 3299 } 3300 3301 return ret; 3302} 3303 3304static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, 3305 const struct iovec *fast_iov, struct iov_iter *iter) 3306{ 3307 struct io_async_rw *rw = req->async_data; 3308 3309 memcpy(&rw->s.iter, iter, sizeof(*iter)); 3310 rw->free_iovec = iovec; 3311 rw->bytes_done = 0; 3312 /* can only be fixed buffers, no need to do anything */ 3313 if (iov_iter_is_bvec(iter)) 3314 return; 3315 if (!iovec) { 3316 unsigned iov_off = 0; 3317 3318 rw->s.iter.iov = rw->s.fast_iov; 3319 if (iter->iov != fast_iov) { 3320 iov_off = iter->iov - fast_iov; 3321 rw->s.iter.iov += iov_off; 3322 } 3323 if (rw->s.fast_iov != fast_iov) 3324 memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off, 3325 sizeof(struct iovec) * iter->nr_segs); 3326 } else { 3327 req->flags |= REQ_F_NEED_CLEANUP; 3328 } 3329} 3330 3331static inline bool io_alloc_async_data(struct io_kiocb *req) 3332{ 3333 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); 3334 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); 3335 if (req->async_data) { 3336 req->flags |= REQ_F_ASYNC_DATA; 3337 return false; 3338 } 3339 return true; 3340} 3341 3342static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, 3343 struct io_rw_state *s, bool force) 3344{ 3345 if (!force && !io_op_defs[req->opcode].needs_async_setup) 3346 return 0; 3347 if (!req_has_async_data(req)) { 3348 struct io_async_rw *iorw; 3349 3350 if (io_alloc_async_data(req)) { 3351 kfree(iovec); 3352 return -ENOMEM; 3353 } 3354 3355 io_req_map_rw(req, iovec, s->fast_iov, &s->iter); 3356 iorw = req->async_data; 3357 /* we've copied and mapped the iter, ensure state is saved */ 3358 iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); 3359 } 3360 return 0; 3361} 3362 3363static inline int io_rw_prep_async(struct io_kiocb *req, int rw) 3364{ 3365 struct io_async_rw *iorw = req->async_data; 3366 struct iovec *iov; 3367 int ret; 3368 3369 /* submission path, ->uring_lock should already be taken */ 3370 ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); 3371 if (unlikely(ret < 0)) 3372 return ret; 3373 3374 iorw->bytes_done = 0; 3375 iorw->free_iovec = iov; 3376 if (iov) 3377 req->flags |= REQ_F_NEED_CLEANUP; 3378 return 0; 3379} 3380 3381static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3382{ 3383 if (unlikely(!(req->file->f_mode & FMODE_READ))) 3384 return -EBADF; 3385 return io_prep_rw(req, sqe); 3386} 3387 3388/* 3389 * This is our waitqueue callback handler, registered through __folio_lock_async() 3390 * when we initially tried to do the IO with the iocb armed our waitqueue. 3391 * This gets called when the page is unlocked, and we generally expect that to 3392 * happen when the page IO is completed and the page is now uptodate. This will 3393 * queue a task_work based retry of the operation, attempting to copy the data 3394 * again. If the latter fails because the page was NOT uptodate, then we will 3395 * do a thread based blocking retry of the operation. That's the unexpected 3396 * slow path. 3397 */ 3398static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, 3399 int sync, void *arg) 3400{ 3401 struct wait_page_queue *wpq; 3402 struct io_kiocb *req = wait->private; 3403 struct wait_page_key *key = arg; 3404 3405 wpq = container_of(wait, struct wait_page_queue, wait); 3406 3407 if (!wake_page_match(wpq, key)) 3408 return 0; 3409 3410 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; 3411 list_del_init(&wait->entry); 3412 io_req_task_queue(req); 3413 return 1; 3414} 3415 3416/* 3417 * This controls whether a given IO request should be armed for async page 3418 * based retry. If we return false here, the request is handed to the async 3419 * worker threads for retry. If we're doing buffered reads on a regular file, 3420 * we prepare a private wait_page_queue entry and retry the operation. This 3421 * will either succeed because the page is now uptodate and unlocked, or it 3422 * will register a callback when the page is unlocked at IO completion. Through 3423 * that callback, io_uring uses task_work to setup a retry of the operation. 3424 * That retry will attempt the buffered read again. The retry will generally 3425 * succeed, or in rare cases where it fails, we then fall back to using the 3426 * async worker threads for a blocking retry. 3427 */ 3428static bool io_rw_should_retry(struct io_kiocb *req) 3429{ 3430 struct io_async_rw *rw = req->async_data; 3431 struct wait_page_queue *wait = &rw->wpq; 3432 struct kiocb *kiocb = &req->rw.kiocb; 3433 3434 /* never retry for NOWAIT, we just complete with -EAGAIN */ 3435 if (req->flags & REQ_F_NOWAIT) 3436 return false; 3437 3438 /* Only for buffered IO */ 3439 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) 3440 return false; 3441 3442 /* 3443 * just use poll if we can, and don't attempt if the fs doesn't 3444 * support callback based unlocks 3445 */ 3446 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) 3447 return false; 3448 3449 wait->wait.func = io_async_buf_func; 3450 wait->wait.private = req; 3451 wait->wait.flags = 0; 3452 INIT_LIST_HEAD(&wait->wait.entry); 3453 kiocb->ki_flags |= IOCB_WAITQ; 3454 kiocb->ki_flags &= ~IOCB_NOWAIT; 3455 kiocb->ki_waitq = wait; 3456 return true; 3457} 3458 3459static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) 3460{ 3461 if (likely(req->file->f_op->read_iter)) 3462 return call_read_iter(req->file, &req->rw.kiocb, iter); 3463 else if (req->file->f_op->read) 3464 return loop_rw_iter(READ, req, iter); 3465 else 3466 return -EINVAL; 3467} 3468 3469static bool need_read_all(struct io_kiocb *req) 3470{ 3471 return req->flags & REQ_F_ISREG || 3472 S_ISBLK(file_inode(req->file)->i_mode); 3473} 3474 3475static int io_read(struct io_kiocb *req, unsigned int issue_flags) 3476{ 3477 struct io_rw_state __s, *s = &__s; 3478 struct iovec *iovec; 3479 struct kiocb *kiocb = &req->rw.kiocb; 3480 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 3481 struct io_async_rw *rw; 3482 ssize_t ret, ret2; 3483 3484 if (!req_has_async_data(req)) { 3485 ret = io_import_iovec(READ, req, &iovec, s, issue_flags); 3486 if (unlikely(ret < 0)) 3487 return ret; 3488 } else { 3489 rw = req->async_data; 3490 s = &rw->s; 3491 /* 3492 * We come here from an earlier attempt, restore our state to 3493 * match in case it doesn't. It's cheap enough that we don't 3494 * need to make this conditional. 3495 */ 3496 iov_iter_restore(&s->iter, &s->iter_state); 3497 iovec = NULL; 3498 } 3499 req->result = iov_iter_count(&s->iter); 3500 3501 if (force_nonblock) { 3502 /* If the file doesn't support async, just async punt */ 3503 if (unlikely(!io_file_supports_nowait(req))) { 3504 ret = io_setup_async_rw(req, iovec, s, true); 3505 return ret ?: -EAGAIN; 3506 } 3507 kiocb->ki_flags |= IOCB_NOWAIT; 3508 } else { 3509 /* Ensure we clear previously set non-block flag */ 3510 kiocb->ki_flags &= ~IOCB_NOWAIT; 3511 } 3512 3513 ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result); 3514 if (unlikely(ret)) { 3515 kfree(iovec); 3516 return ret; 3517 } 3518 3519 ret = io_iter_do_read(req, &s->iter); 3520 3521 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { 3522 req->flags &= ~REQ_F_REISSUE; 3523 /* IOPOLL retry should happen for io-wq threads */ 3524 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) 3525 goto done; 3526 /* no retry on NONBLOCK nor RWF_NOWAIT */ 3527 if (req->flags & REQ_F_NOWAIT) 3528 goto done; 3529 ret = 0; 3530 } else if (ret == -EIOCBQUEUED) { 3531 goto out_free; 3532 } else if (ret == req->result || ret <= 0 || !force_nonblock || 3533 (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { 3534 /* read all, failed, already did sync or don't want to retry */ 3535 goto done; 3536 } 3537 3538 /* 3539 * Don't depend on the iter state matching what was consumed, or being 3540 * untouched in case of error. Restore it and we'll advance it 3541 * manually if we need to. 3542 */ 3543 iov_iter_restore(&s->iter, &s->iter_state); 3544 3545 ret2 = io_setup_async_rw(req, iovec, s, true); 3546 if (ret2) 3547 return ret2; 3548 3549 iovec = NULL; 3550 rw = req->async_data; 3551 s = &rw->s; 3552 /* 3553 * Now use our persistent iterator and state, if we aren't already. 3554 * We've restored and mapped the iter to match. 3555 */ 3556 3557 do { 3558 /* 3559 * We end up here because of a partial read, either from 3560 * above or inside this loop. Advance the iter by the bytes 3561 * that were consumed. 3562 */ 3563 iov_iter_advance(&s->iter, ret); 3564 if (!iov_iter_count(&s->iter)) 3565 break; 3566 rw->bytes_done += ret; 3567 iov_iter_save_state(&s->iter, &s->iter_state); 3568 3569 /* if we can retry, do so with the callbacks armed */ 3570 if (!io_rw_should_retry(req)) { 3571 kiocb->ki_flags &= ~IOCB_WAITQ; 3572 return -EAGAIN; 3573 } 3574 3575 /* 3576 * Now retry read with the IOCB_WAITQ parts set in the iocb. If 3577 * we get -EIOCBQUEUED, then we'll get a notification when the 3578 * desired page gets unlocked. We can also get a partial read 3579 * here, and if we do, then just retry at the new offset. 3580 */ 3581 ret = io_iter_do_read(req, &s->iter); 3582 if (ret == -EIOCBQUEUED) 3583 return 0; 3584 /* we got some bytes, but not all. retry. */ 3585 kiocb->ki_flags &= ~IOCB_WAITQ; 3586 iov_iter_restore(&s->iter, &s->iter_state); 3587 } while (ret > 0); 3588done: 3589 kiocb_done(kiocb, ret, issue_flags); 3590out_free: 3591 /* it's faster to check here then delegate to kfree */ 3592 if (iovec) 3593 kfree(iovec); 3594 return 0; 3595} 3596 3597static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3598{ 3599 if (unlikely(!(req->file->f_mode & FMODE_WRITE))) 3600 return -EBADF; 3601 req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file)); 3602 return io_prep_rw(req, sqe); 3603} 3604 3605static int io_write(struct io_kiocb *req, unsigned int issue_flags) 3606{ 3607 struct io_rw_state __s, *s = &__s; 3608 struct iovec *iovec; 3609 struct kiocb *kiocb = &req->rw.kiocb; 3610 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 3611 ssize_t ret, ret2; 3612 3613 if (!req_has_async_data(req)) { 3614 ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); 3615 if (unlikely(ret < 0)) 3616 return ret; 3617 } else { 3618 struct io_async_rw *rw = req->async_data; 3619 3620 s = &rw->s; 3621 iov_iter_restore(&s->iter, &s->iter_state); 3622 iovec = NULL; 3623 } 3624 req->result = iov_iter_count(&s->iter); 3625 3626 if (force_nonblock) { 3627 /* If the file doesn't support async, just async punt */ 3628 if (unlikely(!io_file_supports_nowait(req))) 3629 goto copy_iov; 3630 3631 /* file path doesn't support NOWAIT for non-direct_IO */ 3632 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && 3633 (req->flags & REQ_F_ISREG)) 3634 goto copy_iov; 3635 3636 kiocb->ki_flags |= IOCB_NOWAIT; 3637 } else { 3638 /* Ensure we clear previously set non-block flag */ 3639 kiocb->ki_flags &= ~IOCB_NOWAIT; 3640 } 3641 3642 ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result); 3643 if (unlikely(ret)) 3644 goto out_free; 3645 3646 /* 3647 * Open-code file_start_write here to grab freeze protection, 3648 * which will be released by another thread in 3649 * io_complete_rw(). Fool lockdep by telling it the lock got 3650 * released so that it doesn't complain about the held lock when 3651 * we return to userspace. 3652 */ 3653 if (req->flags & REQ_F_ISREG) { 3654 sb_start_write(file_inode(req->file)->i_sb); 3655 __sb_writers_release(file_inode(req->file)->i_sb, 3656 SB_FREEZE_WRITE); 3657 } 3658 kiocb->ki_flags |= IOCB_WRITE; 3659 3660 if (likely(req->file->f_op->write_iter)) 3661 ret2 = call_write_iter(req->file, kiocb, &s->iter); 3662 else if (req->file->f_op->write) 3663 ret2 = loop_rw_iter(WRITE, req, &s->iter); 3664 else 3665 ret2 = -EINVAL; 3666 3667 if (req->flags & REQ_F_REISSUE) { 3668 req->flags &= ~REQ_F_REISSUE; 3669 ret2 = -EAGAIN; 3670 } 3671 3672 /* 3673 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just 3674 * retry them without IOCB_NOWAIT. 3675 */ 3676 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) 3677 ret2 = -EAGAIN; 3678 /* no retry on NONBLOCK nor RWF_NOWAIT */ 3679 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) 3680 goto done; 3681 if (!force_nonblock || ret2 != -EAGAIN) { 3682 /* IOPOLL retry should happen for io-wq threads */ 3683 if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) 3684 goto copy_iov; 3685done: 3686 kiocb_done(kiocb, ret2, issue_flags); 3687 } else { 3688copy_iov: 3689 iov_iter_restore(&s->iter, &s->iter_state); 3690 ret = io_setup_async_rw(req, iovec, s, false); 3691 return ret ?: -EAGAIN; 3692 } 3693out_free: 3694 /* it's reportedly faster than delegating the null check to kfree() */ 3695 if (iovec) 3696 kfree(iovec); 3697 return ret; 3698} 3699 3700static int io_renameat_prep(struct io_kiocb *req, 3701 const struct io_uring_sqe *sqe) 3702{ 3703 struct io_rename *ren = &req->rename; 3704 const char __user *oldf, *newf; 3705 3706 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3707 return -EINVAL; 3708 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 3709 return -EINVAL; 3710 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3711 return -EBADF; 3712 3713 ren->old_dfd = READ_ONCE(sqe->fd); 3714 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3715 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3716 ren->new_dfd = READ_ONCE(sqe->len); 3717 ren->flags = READ_ONCE(sqe->rename_flags); 3718 3719 ren->oldpath = getname(oldf); 3720 if (IS_ERR(ren->oldpath)) 3721 return PTR_ERR(ren->oldpath); 3722 3723 ren->newpath = getname(newf); 3724 if (IS_ERR(ren->newpath)) { 3725 putname(ren->oldpath); 3726 return PTR_ERR(ren->newpath); 3727 } 3728 3729 req->flags |= REQ_F_NEED_CLEANUP; 3730 return 0; 3731} 3732 3733static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) 3734{ 3735 struct io_rename *ren = &req->rename; 3736 int ret; 3737 3738 if (issue_flags & IO_URING_F_NONBLOCK) 3739 return -EAGAIN; 3740 3741 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, 3742 ren->newpath, ren->flags); 3743 3744 req->flags &= ~REQ_F_NEED_CLEANUP; 3745 if (ret < 0) 3746 req_set_fail(req); 3747 io_req_complete(req, ret); 3748 return 0; 3749} 3750 3751static int io_unlinkat_prep(struct io_kiocb *req, 3752 const struct io_uring_sqe *sqe) 3753{ 3754 struct io_unlink *un = &req->unlink; 3755 const char __user *fname; 3756 3757 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3758 return -EINVAL; 3759 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || 3760 sqe->splice_fd_in) 3761 return -EINVAL; 3762 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3763 return -EBADF; 3764 3765 un->dfd = READ_ONCE(sqe->fd); 3766 3767 un->flags = READ_ONCE(sqe->unlink_flags); 3768 if (un->flags & ~AT_REMOVEDIR) 3769 return -EINVAL; 3770 3771 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3772 un->filename = getname(fname); 3773 if (IS_ERR(un->filename)) 3774 return PTR_ERR(un->filename); 3775 3776 req->flags |= REQ_F_NEED_CLEANUP; 3777 return 0; 3778} 3779 3780static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) 3781{ 3782 struct io_unlink *un = &req->unlink; 3783 int ret; 3784 3785 if (issue_flags & IO_URING_F_NONBLOCK) 3786 return -EAGAIN; 3787 3788 if (un->flags & AT_REMOVEDIR) 3789 ret = do_rmdir(un->dfd, un->filename); 3790 else 3791 ret = do_unlinkat(un->dfd, un->filename); 3792 3793 req->flags &= ~REQ_F_NEED_CLEANUP; 3794 if (ret < 0) 3795 req_set_fail(req); 3796 io_req_complete(req, ret); 3797 return 0; 3798} 3799 3800static int io_mkdirat_prep(struct io_kiocb *req, 3801 const struct io_uring_sqe *sqe) 3802{ 3803 struct io_mkdir *mkd = &req->mkdir; 3804 const char __user *fname; 3805 3806 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3807 return -EINVAL; 3808 if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index || 3809 sqe->splice_fd_in) 3810 return -EINVAL; 3811 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3812 return -EBADF; 3813 3814 mkd->dfd = READ_ONCE(sqe->fd); 3815 mkd->mode = READ_ONCE(sqe->len); 3816 3817 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3818 mkd->filename = getname(fname); 3819 if (IS_ERR(mkd->filename)) 3820 return PTR_ERR(mkd->filename); 3821 3822 req->flags |= REQ_F_NEED_CLEANUP; 3823 return 0; 3824} 3825 3826static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) 3827{ 3828 struct io_mkdir *mkd = &req->mkdir; 3829 int ret; 3830 3831 if (issue_flags & IO_URING_F_NONBLOCK) 3832 return -EAGAIN; 3833 3834 ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); 3835 3836 req->flags &= ~REQ_F_NEED_CLEANUP; 3837 if (ret < 0) 3838 req_set_fail(req); 3839 io_req_complete(req, ret); 3840 return 0; 3841} 3842 3843static int io_symlinkat_prep(struct io_kiocb *req, 3844 const struct io_uring_sqe *sqe) 3845{ 3846 struct io_symlink *sl = &req->symlink; 3847 const char __user *oldpath, *newpath; 3848 3849 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3850 return -EINVAL; 3851 if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index || 3852 sqe->splice_fd_in) 3853 return -EINVAL; 3854 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3855 return -EBADF; 3856 3857 sl->new_dfd = READ_ONCE(sqe->fd); 3858 oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3859 newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3860 3861 sl->oldpath = getname(oldpath); 3862 if (IS_ERR(sl->oldpath)) 3863 return PTR_ERR(sl->oldpath); 3864 3865 sl->newpath = getname(newpath); 3866 if (IS_ERR(sl->newpath)) { 3867 putname(sl->oldpath); 3868 return PTR_ERR(sl->newpath); 3869 } 3870 3871 req->flags |= REQ_F_NEED_CLEANUP; 3872 return 0; 3873} 3874 3875static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) 3876{ 3877 struct io_symlink *sl = &req->symlink; 3878 int ret; 3879 3880 if (issue_flags & IO_URING_F_NONBLOCK) 3881 return -EAGAIN; 3882 3883 ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); 3884 3885 req->flags &= ~REQ_F_NEED_CLEANUP; 3886 if (ret < 0) 3887 req_set_fail(req); 3888 io_req_complete(req, ret); 3889 return 0; 3890} 3891 3892static int io_linkat_prep(struct io_kiocb *req, 3893 const struct io_uring_sqe *sqe) 3894{ 3895 struct io_hardlink *lnk = &req->hardlink; 3896 const char __user *oldf, *newf; 3897 3898 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3899 return -EINVAL; 3900 if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) 3901 return -EINVAL; 3902 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3903 return -EBADF; 3904 3905 lnk->old_dfd = READ_ONCE(sqe->fd); 3906 lnk->new_dfd = READ_ONCE(sqe->len); 3907 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3908 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3909 lnk->flags = READ_ONCE(sqe->hardlink_flags); 3910 3911 lnk->oldpath = getname(oldf); 3912 if (IS_ERR(lnk->oldpath)) 3913 return PTR_ERR(lnk->oldpath); 3914 3915 lnk->newpath = getname(newf); 3916 if (IS_ERR(lnk->newpath)) { 3917 putname(lnk->oldpath); 3918 return PTR_ERR(lnk->newpath); 3919 } 3920 3921 req->flags |= REQ_F_NEED_CLEANUP; 3922 return 0; 3923} 3924 3925static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) 3926{ 3927 struct io_hardlink *lnk = &req->hardlink; 3928 int ret; 3929 3930 if (issue_flags & IO_URING_F_NONBLOCK) 3931 return -EAGAIN; 3932 3933 ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, 3934 lnk->newpath, lnk->flags); 3935 3936 req->flags &= ~REQ_F_NEED_CLEANUP; 3937 if (ret < 0) 3938 req_set_fail(req); 3939 io_req_complete(req, ret); 3940 return 0; 3941} 3942 3943static int io_shutdown_prep(struct io_kiocb *req, 3944 const struct io_uring_sqe *sqe) 3945{ 3946#if defined(CONFIG_NET) 3947 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3948 return -EINVAL; 3949 if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || 3950 sqe->buf_index || sqe->splice_fd_in)) 3951 return -EINVAL; 3952 3953 req->shutdown.how = READ_ONCE(sqe->len); 3954 return 0; 3955#else 3956 return -EOPNOTSUPP; 3957#endif 3958} 3959 3960static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) 3961{ 3962#if defined(CONFIG_NET) 3963 struct socket *sock; 3964 int ret; 3965 3966 if (issue_flags & IO_URING_F_NONBLOCK) 3967 return -EAGAIN; 3968 3969 sock = sock_from_file(req->file); 3970 if (unlikely(!sock)) 3971 return -ENOTSOCK; 3972 3973 ret = __sys_shutdown_sock(sock, req->shutdown.how); 3974 if (ret < 0) 3975 req_set_fail(req); 3976 io_req_complete(req, ret); 3977 return 0; 3978#else 3979 return -EOPNOTSUPP; 3980#endif 3981} 3982 3983static int __io_splice_prep(struct io_kiocb *req, 3984 const struct io_uring_sqe *sqe) 3985{ 3986 struct io_splice *sp = &req->splice; 3987 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; 3988 3989 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3990 return -EINVAL; 3991 3992 sp->file_in = NULL; 3993 sp->len = READ_ONCE(sqe->len); 3994 sp->flags = READ_ONCE(sqe->splice_flags); 3995 3996 if (unlikely(sp->flags & ~valid_flags)) 3997 return -EINVAL; 3998 3999 sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in), 4000 (sp->flags & SPLICE_F_FD_IN_FIXED)); 4001 if (!sp->file_in) 4002 return -EBADF; 4003 req->flags |= REQ_F_NEED_CLEANUP; 4004 return 0; 4005} 4006 4007static int io_tee_prep(struct io_kiocb *req, 4008 const struct io_uring_sqe *sqe) 4009{ 4010 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) 4011 return -EINVAL; 4012 return __io_splice_prep(req, sqe); 4013} 4014 4015static int io_tee(struct io_kiocb *req, unsigned int issue_flags) 4016{ 4017 struct io_splice *sp = &req->splice; 4018 struct file *in = sp->file_in; 4019 struct file *out = sp->file_out; 4020 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 4021 long ret = 0; 4022 4023 if (issue_flags & IO_URING_F_NONBLOCK) 4024 return -EAGAIN; 4025 if (sp->len) 4026 ret = do_tee(in, out, sp->len, flags); 4027 4028 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 4029 io_put_file(in); 4030 req->flags &= ~REQ_F_NEED_CLEANUP; 4031 4032 if (ret != sp->len) 4033 req_set_fail(req); 4034 io_req_complete(req, ret); 4035 return 0; 4036} 4037 4038static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4039{ 4040 struct io_splice *sp = &req->splice; 4041 4042 sp->off_in = READ_ONCE(sqe->splice_off_in); 4043 sp->off_out = READ_ONCE(sqe->off); 4044 return __io_splice_prep(req, sqe); 4045} 4046 4047static int io_splice(struct io_kiocb *req, unsigned int issue_flags) 4048{ 4049 struct io_splice *sp = &req->splice; 4050 struct file *in = sp->file_in; 4051 struct file *out = sp->file_out; 4052 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 4053 loff_t *poff_in, *poff_out; 4054 long ret = 0; 4055 4056 if (issue_flags & IO_URING_F_NONBLOCK) 4057 return -EAGAIN; 4058 4059 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; 4060 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; 4061 4062 if (sp->len) 4063 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); 4064 4065 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 4066 io_put_file(in); 4067 req->flags &= ~REQ_F_NEED_CLEANUP; 4068 4069 if (ret != sp->len) 4070 req_set_fail(req); 4071 io_req_complete(req, ret); 4072 return 0; 4073} 4074 4075/* 4076 * IORING_OP_NOP just posts a completion event, nothing else. 4077 */ 4078static int io_nop(struct io_kiocb *req, unsigned int issue_flags) 4079{ 4080 struct io_ring_ctx *ctx = req->ctx; 4081 4082 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4083 return -EINVAL; 4084 4085 __io_req_complete(req, issue_flags, 0, 0); 4086 return 0; 4087} 4088 4089static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4090{ 4091 struct io_ring_ctx *ctx = req->ctx; 4092 4093 if (!req->file) 4094 return -EBADF; 4095 4096 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4097 return -EINVAL; 4098 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || 4099 sqe->splice_fd_in)) 4100 return -EINVAL; 4101 4102 req->sync.flags = READ_ONCE(sqe->fsync_flags); 4103 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) 4104 return -EINVAL; 4105 4106 req->sync.off = READ_ONCE(sqe->off); 4107 req->sync.len = READ_ONCE(sqe->len); 4108 return 0; 4109} 4110 4111static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) 4112{ 4113 loff_t end = req->sync.off + req->sync.len; 4114 int ret; 4115 4116 /* fsync always requires a blocking context */ 4117 if (issue_flags & IO_URING_F_NONBLOCK) 4118 return -EAGAIN; 4119 4120 ret = vfs_fsync_range(req->file, req->sync.off, 4121 end > 0 ? end : LLONG_MAX, 4122 req->sync.flags & IORING_FSYNC_DATASYNC); 4123 if (ret < 0) 4124 req_set_fail(req); 4125 io_req_complete(req, ret); 4126 return 0; 4127} 4128 4129static int io_fallocate_prep(struct io_kiocb *req, 4130 const struct io_uring_sqe *sqe) 4131{ 4132 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || 4133 sqe->splice_fd_in) 4134 return -EINVAL; 4135 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4136 return -EINVAL; 4137 4138 req->sync.off = READ_ONCE(sqe->off); 4139 req->sync.len = READ_ONCE(sqe->addr); 4140 req->sync.mode = READ_ONCE(sqe->len); 4141 return 0; 4142} 4143 4144static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) 4145{ 4146 int ret; 4147 4148 /* fallocate always requiring blocking context */ 4149 if (issue_flags & IO_URING_F_NONBLOCK) 4150 return -EAGAIN; 4151 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, 4152 req->sync.len); 4153 if (ret < 0) 4154 req_set_fail(req); 4155 io_req_complete(req, ret); 4156 return 0; 4157} 4158 4159static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4160{ 4161 const char __user *fname; 4162 int ret; 4163 4164 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4165 return -EINVAL; 4166 if (unlikely(sqe->ioprio || sqe->buf_index)) 4167 return -EINVAL; 4168 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4169 return -EBADF; 4170 4171 /* open.how should be already initialised */ 4172 if (!(req->open.how.flags & O_PATH) && force_o_largefile()) 4173 req->open.how.flags |= O_LARGEFILE; 4174 4175 req->open.dfd = READ_ONCE(sqe->fd); 4176 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4177 req->open.filename = getname(fname); 4178 if (IS_ERR(req->open.filename)) { 4179 ret = PTR_ERR(req->open.filename); 4180 req->open.filename = NULL; 4181 return ret; 4182 } 4183 4184 req->open.file_slot = READ_ONCE(sqe->file_index); 4185 if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) 4186 return -EINVAL; 4187 4188 req->open.nofile = rlimit(RLIMIT_NOFILE); 4189 req->flags |= REQ_F_NEED_CLEANUP; 4190 return 0; 4191} 4192 4193static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4194{ 4195 u64 mode = READ_ONCE(sqe->len); 4196 u64 flags = READ_ONCE(sqe->open_flags); 4197 4198 req->open.how = build_open_how(flags, mode); 4199 return __io_openat_prep(req, sqe); 4200} 4201 4202static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4203{ 4204 struct open_how __user *how; 4205 size_t len; 4206 int ret; 4207 4208 how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4209 len = READ_ONCE(sqe->len); 4210 if (len < OPEN_HOW_SIZE_VER0) 4211 return -EINVAL; 4212 4213 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, 4214 len); 4215 if (ret) 4216 return ret; 4217 4218 return __io_openat_prep(req, sqe); 4219} 4220 4221static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) 4222{ 4223 struct open_flags op; 4224 struct file *file; 4225 bool resolve_nonblock, nonblock_set; 4226 bool fixed = !!req->open.file_slot; 4227 int ret; 4228 4229 ret = build_open_flags(&req->open.how, &op); 4230 if (ret) 4231 goto err; 4232 nonblock_set = op.open_flag & O_NONBLOCK; 4233 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED; 4234 if (issue_flags & IO_URING_F_NONBLOCK) { 4235 /* 4236 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, 4237 * it'll always -EAGAIN 4238 */ 4239 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) 4240 return -EAGAIN; 4241 op.lookup_flags |= LOOKUP_CACHED; 4242 op.open_flag |= O_NONBLOCK; 4243 } 4244 4245 if (!fixed) { 4246 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); 4247 if (ret < 0) 4248 goto err; 4249 } 4250 4251 file = do_filp_open(req->open.dfd, req->open.filename, &op); 4252 if (IS_ERR(file)) { 4253 /* 4254 * We could hang on to this 'fd' on retrying, but seems like 4255 * marginal gain for something that is now known to be a slower 4256 * path. So just put it, and we'll get a new one when we retry. 4257 */ 4258 if (!fixed) 4259 put_unused_fd(ret); 4260 4261 ret = PTR_ERR(file); 4262 /* only retry if RESOLVE_CACHED wasn't already set by application */ 4263 if (ret == -EAGAIN && 4264 (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) 4265 return -EAGAIN; 4266 goto err; 4267 } 4268 4269 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) 4270 file->f_flags &= ~O_NONBLOCK; 4271 fsnotify_open(file); 4272 4273 if (!fixed) 4274 fd_install(ret, file); 4275 else 4276 ret = io_install_fixed_file(req, file, issue_flags, 4277 req->open.file_slot - 1); 4278err: 4279 putname(req->open.filename); 4280 req->flags &= ~REQ_F_NEED_CLEANUP; 4281 if (ret < 0) 4282 req_set_fail(req); 4283 __io_req_complete(req, issue_flags, ret, 0); 4284 return 0; 4285} 4286 4287static int io_openat(struct io_kiocb *req, unsigned int issue_flags) 4288{ 4289 return io_openat2(req, issue_flags); 4290} 4291 4292static int io_remove_buffers_prep(struct io_kiocb *req, 4293 const struct io_uring_sqe *sqe) 4294{ 4295 struct io_provide_buf *p = &req->pbuf; 4296 u64 tmp; 4297 4298 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || 4299 sqe->splice_fd_in) 4300 return -EINVAL; 4301 4302 tmp = READ_ONCE(sqe->fd); 4303 if (!tmp || tmp > USHRT_MAX) 4304 return -EINVAL; 4305 4306 memset(p, 0, sizeof(*p)); 4307 p->nbufs = tmp; 4308 p->bgid = READ_ONCE(sqe->buf_group); 4309 return 0; 4310} 4311 4312static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, 4313 int bgid, unsigned nbufs) 4314{ 4315 unsigned i = 0; 4316 4317 /* shouldn't happen */ 4318 if (!nbufs) 4319 return 0; 4320 4321 /* the head kbuf is the list itself */ 4322 while (!list_empty(&buf->list)) { 4323 struct io_buffer *nxt; 4324 4325 nxt = list_first_entry(&buf->list, struct io_buffer, list); 4326 list_del(&nxt->list); 4327 kfree(nxt); 4328 if (++i == nbufs) 4329 return i; 4330 } 4331 i++; 4332 kfree(buf); 4333 xa_erase(&ctx->io_buffers, bgid); 4334 4335 return i; 4336} 4337 4338static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) 4339{ 4340 struct io_provide_buf *p = &req->pbuf; 4341 struct io_ring_ctx *ctx = req->ctx; 4342 struct io_buffer *head; 4343 int ret = 0; 4344 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 4345 4346 io_ring_submit_lock(ctx, needs_lock); 4347 4348 lockdep_assert_held(&ctx->uring_lock); 4349 4350 ret = -ENOENT; 4351 head = xa_load(&ctx->io_buffers, p->bgid); 4352 if (head) 4353 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); 4354 if (ret < 0) 4355 req_set_fail(req); 4356 4357 /* complete before unlock, IOPOLL may need the lock */ 4358 __io_req_complete(req, issue_flags, ret, 0); 4359 io_ring_submit_unlock(ctx, needs_lock); 4360 return 0; 4361} 4362 4363static int io_provide_buffers_prep(struct io_kiocb *req, 4364 const struct io_uring_sqe *sqe) 4365{ 4366 unsigned long size, tmp_check; 4367 struct io_provide_buf *p = &req->pbuf; 4368 u64 tmp; 4369 4370 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) 4371 return -EINVAL; 4372 4373 tmp = READ_ONCE(sqe->fd); 4374 if (!tmp || tmp > USHRT_MAX) 4375 return -E2BIG; 4376 p->nbufs = tmp; 4377 p->addr = READ_ONCE(sqe->addr); 4378 p->len = READ_ONCE(sqe->len); 4379 4380 if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, 4381 &size)) 4382 return -EOVERFLOW; 4383 if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) 4384 return -EOVERFLOW; 4385 4386 size = (unsigned long)p->len * p->nbufs; 4387 if (!access_ok(u64_to_user_ptr(p->addr), size)) 4388 return -EFAULT; 4389 4390 p->bgid = READ_ONCE(sqe->buf_group); 4391 tmp = READ_ONCE(sqe->off); 4392 if (tmp > USHRT_MAX) 4393 return -E2BIG; 4394 p->bid = tmp; 4395 return 0; 4396} 4397 4398static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) 4399{ 4400 struct io_buffer *buf; 4401 u64 addr = pbuf->addr; 4402 int i, bid = pbuf->bid; 4403 4404 for (i = 0; i < pbuf->nbufs; i++) { 4405 buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); 4406 if (!buf) 4407 break; 4408 4409 buf->addr = addr; 4410 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); 4411 buf->bid = bid; 4412 addr += pbuf->len; 4413 bid++; 4414 if (!*head) { 4415 INIT_LIST_HEAD(&buf->list); 4416 *head = buf; 4417 } else { 4418 list_add_tail(&buf->list, &(*head)->list); 4419 } 4420 } 4421 4422 return i ? i : -ENOMEM; 4423} 4424 4425static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) 4426{ 4427 struct io_provide_buf *p = &req->pbuf; 4428 struct io_ring_ctx *ctx = req->ctx; 4429 struct io_buffer *head, *list; 4430 int ret = 0; 4431 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 4432 4433 io_ring_submit_lock(ctx, needs_lock); 4434 4435 lockdep_assert_held(&ctx->uring_lock); 4436 4437 list = head = xa_load(&ctx->io_buffers, p->bgid); 4438 4439 ret = io_add_buffers(p, &head); 4440 if (ret >= 0 && !list) { 4441 ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL); 4442 if (ret < 0) 4443 __io_remove_buffers(ctx, head, p->bgid, -1U); 4444 } 4445 if (ret < 0) 4446 req_set_fail(req); 4447 /* complete before unlock, IOPOLL may need the lock */ 4448 __io_req_complete(req, issue_flags, ret, 0); 4449 io_ring_submit_unlock(ctx, needs_lock); 4450 return 0; 4451} 4452 4453static int io_epoll_ctl_prep(struct io_kiocb *req, 4454 const struct io_uring_sqe *sqe) 4455{ 4456#if defined(CONFIG_EPOLL) 4457 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 4458 return -EINVAL; 4459 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4460 return -EINVAL; 4461 4462 req->epoll.epfd = READ_ONCE(sqe->fd); 4463 req->epoll.op = READ_ONCE(sqe->len); 4464 req->epoll.fd = READ_ONCE(sqe->off); 4465 4466 if (ep_op_has_event(req->epoll.op)) { 4467 struct epoll_event __user *ev; 4468 4469 ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4470 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) 4471 return -EFAULT; 4472 } 4473 4474 return 0; 4475#else 4476 return -EOPNOTSUPP; 4477#endif 4478} 4479 4480static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) 4481{ 4482#if defined(CONFIG_EPOLL) 4483 struct io_epoll *ie = &req->epoll; 4484 int ret; 4485 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4486 4487 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); 4488 if (force_nonblock && ret == -EAGAIN) 4489 return -EAGAIN; 4490 4491 if (ret < 0) 4492 req_set_fail(req); 4493 __io_req_complete(req, issue_flags, ret, 0); 4494 return 0; 4495#else 4496 return -EOPNOTSUPP; 4497#endif 4498} 4499 4500static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4501{ 4502#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 4503 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) 4504 return -EINVAL; 4505 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4506 return -EINVAL; 4507 4508 req->madvise.addr = READ_ONCE(sqe->addr); 4509 req->madvise.len = READ_ONCE(sqe->len); 4510 req->madvise.advice = READ_ONCE(sqe->fadvise_advice); 4511 return 0; 4512#else 4513 return -EOPNOTSUPP; 4514#endif 4515} 4516 4517static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) 4518{ 4519#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 4520 struct io_madvise *ma = &req->madvise; 4521 int ret; 4522 4523 if (issue_flags & IO_URING_F_NONBLOCK) 4524 return -EAGAIN; 4525 4526 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); 4527 if (ret < 0) 4528 req_set_fail(req); 4529 io_req_complete(req, ret); 4530 return 0; 4531#else 4532 return -EOPNOTSUPP; 4533#endif 4534} 4535 4536static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4537{ 4538 if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) 4539 return -EINVAL; 4540 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4541 return -EINVAL; 4542 4543 req->fadvise.offset = READ_ONCE(sqe->off); 4544 req->fadvise.len = READ_ONCE(sqe->len); 4545 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); 4546 return 0; 4547} 4548 4549static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) 4550{ 4551 struct io_fadvise *fa = &req->fadvise; 4552 int ret; 4553 4554 if (issue_flags & IO_URING_F_NONBLOCK) { 4555 switch (fa->advice) { 4556 case POSIX_FADV_NORMAL: 4557 case POSIX_FADV_RANDOM: 4558 case POSIX_FADV_SEQUENTIAL: 4559 break; 4560 default: 4561 return -EAGAIN; 4562 } 4563 } 4564 4565 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); 4566 if (ret < 0) 4567 req_set_fail(req); 4568 __io_req_complete(req, issue_flags, ret, 0); 4569 return 0; 4570} 4571 4572static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4573{ 4574 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4575 return -EINVAL; 4576 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 4577 return -EINVAL; 4578 if (req->flags & REQ_F_FIXED_FILE) 4579 return -EBADF; 4580 4581 req->statx.dfd = READ_ONCE(sqe->fd); 4582 req->statx.mask = READ_ONCE(sqe->len); 4583 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4584 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4585 req->statx.flags = READ_ONCE(sqe->statx_flags); 4586 4587 return 0; 4588} 4589 4590static int io_statx(struct io_kiocb *req, unsigned int issue_flags) 4591{ 4592 struct io_statx *ctx = &req->statx; 4593 int ret; 4594 4595 if (issue_flags & IO_URING_F_NONBLOCK) 4596 return -EAGAIN; 4597 4598 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, 4599 ctx->buffer); 4600 4601 if (ret < 0) 4602 req_set_fail(req); 4603 io_req_complete(req, ret); 4604 return 0; 4605} 4606 4607static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4608{ 4609 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4610 return -EINVAL; 4611 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || 4612 sqe->rw_flags || sqe->buf_index) 4613 return -EINVAL; 4614 if (req->flags & REQ_F_FIXED_FILE) 4615 return -EBADF; 4616 4617 req->close.fd = READ_ONCE(sqe->fd); 4618 req->close.file_slot = READ_ONCE(sqe->file_index); 4619 if (req->close.file_slot && req->close.fd) 4620 return -EINVAL; 4621 4622 return 0; 4623} 4624 4625static int io_close(struct io_kiocb *req, unsigned int issue_flags) 4626{ 4627 struct files_struct *files = current->files; 4628 struct io_close *close = &req->close; 4629 struct fdtable *fdt; 4630 struct file *file = NULL; 4631 int ret = -EBADF; 4632 4633 if (req->close.file_slot) { 4634 ret = io_close_fixed(req, issue_flags); 4635 goto err; 4636 } 4637 4638 spin_lock(&files->file_lock); 4639 fdt = files_fdtable(files); 4640 if (close->fd >= fdt->max_fds) { 4641 spin_unlock(&files->file_lock); 4642 goto err; 4643 } 4644 file = fdt->fd[close->fd]; 4645 if (!file || file->f_op == &io_uring_fops) { 4646 spin_unlock(&files->file_lock); 4647 file = NULL; 4648 goto err; 4649 } 4650 4651 /* if the file has a flush method, be safe and punt to async */ 4652 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { 4653 spin_unlock(&files->file_lock); 4654 return -EAGAIN; 4655 } 4656 4657 ret = __close_fd_get_file(close->fd, &file); 4658 spin_unlock(&files->file_lock); 4659 if (ret < 0) { 4660 if (ret == -ENOENT) 4661 ret = -EBADF; 4662 goto err; 4663 } 4664 4665 /* No ->flush() or already async, safely close from here */ 4666 ret = filp_close(file, current->files); 4667err: 4668 if (ret < 0) 4669 req_set_fail(req); 4670 if (file) 4671 fput(file); 4672 __io_req_complete(req, issue_flags, ret, 0); 4673 return 0; 4674} 4675 4676static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4677{ 4678 struct io_ring_ctx *ctx = req->ctx; 4679 4680 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4681 return -EINVAL; 4682 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || 4683 sqe->splice_fd_in)) 4684 return -EINVAL; 4685 4686 req->sync.off = READ_ONCE(sqe->off); 4687 req->sync.len = READ_ONCE(sqe->len); 4688 req->sync.flags = READ_ONCE(sqe->sync_range_flags); 4689 return 0; 4690} 4691 4692static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) 4693{ 4694 int ret; 4695 4696 /* sync_file_range always requires a blocking context */ 4697 if (issue_flags & IO_URING_F_NONBLOCK) 4698 return -EAGAIN; 4699 4700 ret = sync_file_range(req->file, req->sync.off, req->sync.len, 4701 req->sync.flags); 4702 if (ret < 0) 4703 req_set_fail(req); 4704 io_req_complete(req, ret); 4705 return 0; 4706} 4707 4708#if defined(CONFIG_NET) 4709static int io_setup_async_msg(struct io_kiocb *req, 4710 struct io_async_msghdr *kmsg) 4711{ 4712 struct io_async_msghdr *async_msg = req->async_data; 4713 4714 if (async_msg) 4715 return -EAGAIN; 4716 if (io_alloc_async_data(req)) { 4717 kfree(kmsg->free_iov); 4718 return -ENOMEM; 4719 } 4720 async_msg = req->async_data; 4721 req->flags |= REQ_F_NEED_CLEANUP; 4722 memcpy(async_msg, kmsg, sizeof(*kmsg)); 4723 async_msg->msg.msg_name = &async_msg->addr; 4724 /* if were using fast_iov, set it to the new one */ 4725 if (!async_msg->free_iov) 4726 async_msg->msg.msg_iter.iov = async_msg->fast_iov; 4727 4728 return -EAGAIN; 4729} 4730 4731static int io_sendmsg_copy_hdr(struct io_kiocb *req, 4732 struct io_async_msghdr *iomsg) 4733{ 4734 iomsg->msg.msg_name = &iomsg->addr; 4735 iomsg->free_iov = iomsg->fast_iov; 4736 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, 4737 req->sr_msg.msg_flags, &iomsg->free_iov); 4738} 4739 4740static int io_sendmsg_prep_async(struct io_kiocb *req) 4741{ 4742 int ret; 4743 4744 ret = io_sendmsg_copy_hdr(req, req->async_data); 4745 if (!ret) 4746 req->flags |= REQ_F_NEED_CLEANUP; 4747 return ret; 4748} 4749 4750static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4751{ 4752 struct io_sr_msg *sr = &req->sr_msg; 4753 4754 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4755 return -EINVAL; 4756 4757 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4758 sr->len = READ_ONCE(sqe->len); 4759 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 4760 if (sr->msg_flags & MSG_DONTWAIT) 4761 req->flags |= REQ_F_NOWAIT; 4762 4763#ifdef CONFIG_COMPAT 4764 if (req->ctx->compat) 4765 sr->msg_flags |= MSG_CMSG_COMPAT; 4766#endif 4767 return 0; 4768} 4769 4770static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) 4771{ 4772 struct io_async_msghdr iomsg, *kmsg; 4773 struct socket *sock; 4774 unsigned flags; 4775 int min_ret = 0; 4776 int ret; 4777 4778 sock = sock_from_file(req->file); 4779 if (unlikely(!sock)) 4780 return -ENOTSOCK; 4781 4782 if (req_has_async_data(req)) { 4783 kmsg = req->async_data; 4784 } else { 4785 ret = io_sendmsg_copy_hdr(req, &iomsg); 4786 if (ret) 4787 return ret; 4788 kmsg = &iomsg; 4789 } 4790 4791 flags = req->sr_msg.msg_flags; 4792 if (issue_flags & IO_URING_F_NONBLOCK) 4793 flags |= MSG_DONTWAIT; 4794 if (flags & MSG_WAITALL) 4795 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 4796 4797 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 4798 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) 4799 return io_setup_async_msg(req, kmsg); 4800 if (ret == -ERESTARTSYS) 4801 ret = -EINTR; 4802 4803 /* fast path, check for non-NULL to avoid function call */ 4804 if (kmsg->free_iov) 4805 kfree(kmsg->free_iov); 4806 req->flags &= ~REQ_F_NEED_CLEANUP; 4807 if (ret < min_ret) 4808 req_set_fail(req); 4809 __io_req_complete(req, issue_flags, ret, 0); 4810 return 0; 4811} 4812 4813static int io_send(struct io_kiocb *req, unsigned int issue_flags) 4814{ 4815 struct io_sr_msg *sr = &req->sr_msg; 4816 struct msghdr msg; 4817 struct iovec iov; 4818 struct socket *sock; 4819 unsigned flags; 4820 int min_ret = 0; 4821 int ret; 4822 4823 sock = sock_from_file(req->file); 4824 if (unlikely(!sock)) 4825 return -ENOTSOCK; 4826 4827 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); 4828 if (unlikely(ret)) 4829 return ret; 4830 4831 msg.msg_name = NULL; 4832 msg.msg_control = NULL; 4833 msg.msg_controllen = 0; 4834 msg.msg_namelen = 0; 4835 4836 flags = req->sr_msg.msg_flags; 4837 if (issue_flags & IO_URING_F_NONBLOCK) 4838 flags |= MSG_DONTWAIT; 4839 if (flags & MSG_WAITALL) 4840 min_ret = iov_iter_count(&msg.msg_iter); 4841 4842 msg.msg_flags = flags; 4843 ret = sock_sendmsg(sock, &msg); 4844 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) 4845 return -EAGAIN; 4846 if (ret == -ERESTARTSYS) 4847 ret = -EINTR; 4848 4849 if (ret < min_ret) 4850 req_set_fail(req); 4851 __io_req_complete(req, issue_flags, ret, 0); 4852 return 0; 4853} 4854 4855static int __io_recvmsg_copy_hdr(struct io_kiocb *req, 4856 struct io_async_msghdr *iomsg) 4857{ 4858 struct io_sr_msg *sr = &req->sr_msg; 4859 struct iovec __user *uiov; 4860 size_t iov_len; 4861 int ret; 4862 4863 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, 4864 &iomsg->uaddr, &uiov, &iov_len); 4865 if (ret) 4866 return ret; 4867 4868 if (req->flags & REQ_F_BUFFER_SELECT) { 4869 if (iov_len > 1) 4870 return -EINVAL; 4871 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) 4872 return -EFAULT; 4873 sr->len = iomsg->fast_iov[0].iov_len; 4874 iomsg->free_iov = NULL; 4875 } else { 4876 iomsg->free_iov = iomsg->fast_iov; 4877 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, 4878 &iomsg->free_iov, &iomsg->msg.msg_iter, 4879 false); 4880 if (ret > 0) 4881 ret = 0; 4882 } 4883 4884 return ret; 4885} 4886 4887#ifdef CONFIG_COMPAT 4888static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, 4889 struct io_async_msghdr *iomsg) 4890{ 4891 struct io_sr_msg *sr = &req->sr_msg; 4892 struct compat_iovec __user *uiov; 4893 compat_uptr_t ptr; 4894 compat_size_t len; 4895 int ret; 4896 4897 ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, 4898 &ptr, &len); 4899 if (ret) 4900 return ret; 4901 4902 uiov = compat_ptr(ptr); 4903 if (req->flags & REQ_F_BUFFER_SELECT) { 4904 compat_ssize_t clen; 4905 4906 if (len > 1) 4907 return -EINVAL; 4908 if (!access_ok(uiov, sizeof(*uiov))) 4909 return -EFAULT; 4910 if (__get_user(clen, &uiov->iov_len)) 4911 return -EFAULT; 4912 if (clen < 0) 4913 return -EINVAL; 4914 sr->len = clen; 4915 iomsg->free_iov = NULL; 4916 } else { 4917 iomsg->free_iov = iomsg->fast_iov; 4918 ret = __import_iovec(READ, (struct iovec __user *)uiov, len, 4919 UIO_FASTIOV, &iomsg->free_iov, 4920 &iomsg->msg.msg_iter, true); 4921 if (ret < 0) 4922 return ret; 4923 } 4924 4925 return 0; 4926} 4927#endif 4928 4929static int io_recvmsg_copy_hdr(struct io_kiocb *req, 4930 struct io_async_msghdr *iomsg) 4931{ 4932 iomsg->msg.msg_name = &iomsg->addr; 4933 4934#ifdef CONFIG_COMPAT 4935 if (req->ctx->compat) 4936 return __io_compat_recvmsg_copy_hdr(req, iomsg); 4937#endif 4938 4939 return __io_recvmsg_copy_hdr(req, iomsg); 4940} 4941 4942static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, 4943 unsigned int issue_flags) 4944{ 4945 struct io_sr_msg *sr = &req->sr_msg; 4946 4947 return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); 4948} 4949 4950static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) 4951{ 4952 return io_put_kbuf(req, req->kbuf); 4953} 4954 4955static int io_recvmsg_prep_async(struct io_kiocb *req) 4956{ 4957 int ret; 4958 4959 ret = io_recvmsg_copy_hdr(req, req->async_data); 4960 if (!ret) 4961 req->flags |= REQ_F_NEED_CLEANUP; 4962 return ret; 4963} 4964 4965static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4966{ 4967 struct io_sr_msg *sr = &req->sr_msg; 4968 4969 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4970 return -EINVAL; 4971 4972 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4973 sr->len = READ_ONCE(sqe->len); 4974 sr->bgid = READ_ONCE(sqe->buf_group); 4975 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 4976 if (sr->msg_flags & MSG_DONTWAIT) 4977 req->flags |= REQ_F_NOWAIT; 4978 4979#ifdef CONFIG_COMPAT 4980 if (req->ctx->compat) 4981 sr->msg_flags |= MSG_CMSG_COMPAT; 4982#endif 4983 return 0; 4984} 4985 4986static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) 4987{ 4988 struct io_async_msghdr iomsg, *kmsg; 4989 struct socket *sock; 4990 struct io_buffer *kbuf; 4991 unsigned flags; 4992 int min_ret = 0; 4993 int ret, cflags = 0; 4994 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4995 4996 sock = sock_from_file(req->file); 4997 if (unlikely(!sock)) 4998 return -ENOTSOCK; 4999 5000 if (req_has_async_data(req)) { 5001 kmsg = req->async_data; 5002 } else { 5003 ret = io_recvmsg_copy_hdr(req, &iomsg); 5004 if (ret) 5005 return ret; 5006 kmsg = &iomsg; 5007 } 5008 5009 if (req->flags & REQ_F_BUFFER_SELECT) { 5010 kbuf = io_recv_buffer_select(req, issue_flags); 5011 if (IS_ERR(kbuf)) 5012 return PTR_ERR(kbuf); 5013 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 5014 kmsg->fast_iov[0].iov_len = req->sr_msg.len; 5015 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 5016 1, req->sr_msg.len); 5017 } 5018 5019 flags = req->sr_msg.msg_flags; 5020 if (force_nonblock) 5021 flags |= MSG_DONTWAIT; 5022 if (flags & MSG_WAITALL) 5023 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 5024 5025 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, 5026 kmsg->uaddr, flags); 5027 if (force_nonblock && ret == -EAGAIN) 5028 return io_setup_async_msg(req, kmsg); 5029 if (ret == -ERESTARTSYS) 5030 ret = -EINTR; 5031 5032 if (req->flags & REQ_F_BUFFER_SELECTED) 5033 cflags = io_put_recv_kbuf(req); 5034 /* fast path, check for non-NULL to avoid function call */ 5035 if (kmsg->free_iov) 5036 kfree(kmsg->free_iov); 5037 req->flags &= ~REQ_F_NEED_CLEANUP; 5038 if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) 5039 req_set_fail(req); 5040 __io_req_complete(req, issue_flags, ret, cflags); 5041 return 0; 5042} 5043 5044static int io_recv(struct io_kiocb *req, unsigned int issue_flags) 5045{ 5046 struct io_buffer *kbuf; 5047 struct io_sr_msg *sr = &req->sr_msg; 5048 struct msghdr msg; 5049 void __user *buf = sr->buf; 5050 struct socket *sock; 5051 struct iovec iov; 5052 unsigned flags; 5053 int min_ret = 0; 5054 int ret, cflags = 0; 5055 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5056 5057 sock = sock_from_file(req->file); 5058 if (unlikely(!sock)) 5059 return -ENOTSOCK; 5060 5061 if (req->flags & REQ_F_BUFFER_SELECT) { 5062 kbuf = io_recv_buffer_select(req, issue_flags); 5063 if (IS_ERR(kbuf)) 5064 return PTR_ERR(kbuf); 5065 buf = u64_to_user_ptr(kbuf->addr); 5066 } 5067 5068 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); 5069 if (unlikely(ret)) 5070 goto out_free; 5071 5072 msg.msg_name = NULL; 5073 msg.msg_control = NULL; 5074 msg.msg_controllen = 0; 5075 msg.msg_namelen = 0; 5076 msg.msg_iocb = NULL; 5077 msg.msg_flags = 0; 5078 5079 flags = req->sr_msg.msg_flags; 5080 if (force_nonblock) 5081 flags |= MSG_DONTWAIT; 5082 if (flags & MSG_WAITALL) 5083 min_ret = iov_iter_count(&msg.msg_iter); 5084 5085 ret = sock_recvmsg(sock, &msg, flags); 5086 if (force_nonblock && ret == -EAGAIN) 5087 return -EAGAIN; 5088 if (ret == -ERESTARTSYS) 5089 ret = -EINTR; 5090out_free: 5091 if (req->flags & REQ_F_BUFFER_SELECTED) 5092 cflags = io_put_recv_kbuf(req); 5093 if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) 5094 req_set_fail(req); 5095 __io_req_complete(req, issue_flags, ret, cflags); 5096 return 0; 5097} 5098 5099static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5100{ 5101 struct io_accept *accept = &req->accept; 5102 5103 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5104 return -EINVAL; 5105 if (sqe->ioprio || sqe->len || sqe->buf_index) 5106 return -EINVAL; 5107 5108 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5109 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 5110 accept->flags = READ_ONCE(sqe->accept_flags); 5111 accept->nofile = rlimit(RLIMIT_NOFILE); 5112 5113 accept->file_slot = READ_ONCE(sqe->file_index); 5114 if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) || 5115 (accept->flags & SOCK_CLOEXEC))) 5116 return -EINVAL; 5117 if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 5118 return -EINVAL; 5119 if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) 5120 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 5121 return 0; 5122} 5123 5124static int io_accept(struct io_kiocb *req, unsigned int issue_flags) 5125{ 5126 struct io_accept *accept = &req->accept; 5127 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5128 unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; 5129 bool fixed = !!accept->file_slot; 5130 struct file *file; 5131 int ret, fd; 5132 5133 if (req->file->f_flags & O_NONBLOCK) 5134 req->flags |= REQ_F_NOWAIT; 5135 5136 if (!fixed) { 5137 fd = __get_unused_fd_flags(accept->flags, accept->nofile); 5138 if (unlikely(fd < 0)) 5139 return fd; 5140 } 5141 file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, 5142 accept->flags); 5143 if (IS_ERR(file)) { 5144 if (!fixed) 5145 put_unused_fd(fd); 5146 ret = PTR_ERR(file); 5147 if (ret == -EAGAIN && force_nonblock) 5148 return -EAGAIN; 5149 if (ret == -ERESTARTSYS) 5150 ret = -EINTR; 5151 req_set_fail(req); 5152 } else if (!fixed) { 5153 fd_install(fd, file); 5154 ret = fd; 5155 } else { 5156 ret = io_install_fixed_file(req, file, issue_flags, 5157 accept->file_slot - 1); 5158 } 5159 __io_req_complete(req, issue_flags, ret, 0); 5160 return 0; 5161} 5162 5163static int io_connect_prep_async(struct io_kiocb *req) 5164{ 5165 struct io_async_connect *io = req->async_data; 5166 struct io_connect *conn = &req->connect; 5167 5168 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); 5169} 5170 5171static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5172{ 5173 struct io_connect *conn = &req->connect; 5174 5175 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5176 return -EINVAL; 5177 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || 5178 sqe->splice_fd_in) 5179 return -EINVAL; 5180 5181 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5182 conn->addr_len = READ_ONCE(sqe->addr2); 5183 return 0; 5184} 5185 5186static int io_connect(struct io_kiocb *req, unsigned int issue_flags) 5187{ 5188 struct io_async_connect __io, *io; 5189 unsigned file_flags; 5190 int ret; 5191 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5192 5193 if (req_has_async_data(req)) { 5194 io = req->async_data; 5195 } else { 5196 ret = move_addr_to_kernel(req->connect.addr, 5197 req->connect.addr_len, 5198 &__io.address); 5199 if (ret) 5200 goto out; 5201 io = &__io; 5202 } 5203 5204 file_flags = force_nonblock ? O_NONBLOCK : 0; 5205 5206 ret = __sys_connect_file(req->file, &io->address, 5207 req->connect.addr_len, file_flags); 5208 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { 5209 if (req_has_async_data(req)) 5210 return -EAGAIN; 5211 if (io_alloc_async_data(req)) { 5212 ret = -ENOMEM; 5213 goto out; 5214 } 5215 memcpy(req->async_data, &__io, sizeof(__io)); 5216 return -EAGAIN; 5217 } 5218 if (ret == -ERESTARTSYS) 5219 ret = -EINTR; 5220out: 5221 if (ret < 0) 5222 req_set_fail(req); 5223 __io_req_complete(req, issue_flags, ret, 0); 5224 return 0; 5225} 5226#else /* !CONFIG_NET */ 5227#define IO_NETOP_FN(op) \ 5228static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \ 5229{ \ 5230 return -EOPNOTSUPP; \ 5231} 5232 5233#define IO_NETOP_PREP(op) \ 5234IO_NETOP_FN(op) \ 5235static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \ 5236{ \ 5237 return -EOPNOTSUPP; \ 5238} \ 5239 5240#define IO_NETOP_PREP_ASYNC(op) \ 5241IO_NETOP_PREP(op) \ 5242static int io_##op##_prep_async(struct io_kiocb *req) \ 5243{ \ 5244 return -EOPNOTSUPP; \ 5245} 5246 5247IO_NETOP_PREP_ASYNC(sendmsg); 5248IO_NETOP_PREP_ASYNC(recvmsg); 5249IO_NETOP_PREP_ASYNC(connect); 5250IO_NETOP_PREP(accept); 5251IO_NETOP_FN(send); 5252IO_NETOP_FN(recv); 5253#endif /* CONFIG_NET */ 5254 5255struct io_poll_table { 5256 struct poll_table_struct pt; 5257 struct io_kiocb *req; 5258 int nr_entries; 5259 int error; 5260}; 5261 5262static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, 5263 __poll_t mask, io_req_tw_func_t func) 5264{ 5265 /* for instances that support it check for an event match first: */ 5266 if (mask && !(mask & poll->events)) 5267 return 0; 5268 5269 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); 5270 5271 list_del_init(&poll->wait.entry); 5272 5273 req->result = mask; 5274 req->io_task_work.func = func; 5275 5276 /* 5277 * If this fails, then the task is exiting. When a task exits, the 5278 * work gets canceled, so just cancel this request as well instead 5279 * of executing it. We can't safely execute it anyway, as we may not 5280 * have the needed state needed for it anyway. 5281 */ 5282 io_req_task_work_add(req); 5283 return 1; 5284} 5285 5286static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) 5287 __acquires(&req->ctx->completion_lock) 5288{ 5289 struct io_ring_ctx *ctx = req->ctx; 5290 5291 /* req->task == current here, checking PF_EXITING is safe */ 5292 if (unlikely(req->task->flags & PF_EXITING)) 5293 WRITE_ONCE(poll->canceled, true); 5294 5295 if (!req->result && !READ_ONCE(poll->canceled)) { 5296 struct poll_table_struct pt = { ._key = poll->events }; 5297 5298 req->result = vfs_poll(req->file, &pt) & poll->events; 5299 } 5300 5301 spin_lock(&ctx->completion_lock); 5302 if (!req->result && !READ_ONCE(poll->canceled)) { 5303 add_wait_queue(poll->head, &poll->wait); 5304 return true; 5305 } 5306 5307 return false; 5308} 5309 5310static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) 5311{ 5312 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ 5313 if (req->opcode == IORING_OP_POLL_ADD) 5314 return req->async_data; 5315 return req->apoll->double_poll; 5316} 5317 5318static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) 5319{ 5320 if (req->opcode == IORING_OP_POLL_ADD) 5321 return &req->poll; 5322 return &req->apoll->poll; 5323} 5324 5325static void io_poll_remove_double(struct io_kiocb *req) 5326 __must_hold(&req->ctx->completion_lock) 5327{ 5328 struct io_poll_iocb *poll = io_poll_get_double(req); 5329 5330 lockdep_assert_held(&req->ctx->completion_lock); 5331 5332 if (poll && poll->head) { 5333 struct wait_queue_head *head = poll->head; 5334 5335 spin_lock_irq(&head->lock); 5336 list_del_init(&poll->wait.entry); 5337 if (poll->wait.private) 5338 req_ref_put(req); 5339 poll->head = NULL; 5340 spin_unlock_irq(&head->lock); 5341 } 5342} 5343 5344static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) 5345 __must_hold(&req->ctx->completion_lock) 5346{ 5347 struct io_ring_ctx *ctx = req->ctx; 5348 unsigned flags = IORING_CQE_F_MORE; 5349 int error; 5350 5351 if (READ_ONCE(req->poll.canceled)) { 5352 error = -ECANCELED; 5353 req->poll.events |= EPOLLONESHOT; 5354 } else { 5355 error = mangle_poll(mask); 5356 } 5357 if (req->poll.events & EPOLLONESHOT) 5358 flags = 0; 5359 if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { 5360 req->poll.events |= EPOLLONESHOT; 5361 flags = 0; 5362 } 5363 if (flags & IORING_CQE_F_MORE) 5364 ctx->cq_extra++; 5365 5366 return !(flags & IORING_CQE_F_MORE); 5367} 5368 5369static void io_poll_task_func(struct io_kiocb *req, bool *locked) 5370{ 5371 struct io_ring_ctx *ctx = req->ctx; 5372 struct io_kiocb *nxt; 5373 5374 if (io_poll_rewait(req, &req->poll)) { 5375 spin_unlock(&ctx->completion_lock); 5376 } else { 5377 bool done; 5378 5379 if (req->poll.done) { 5380 spin_unlock(&ctx->completion_lock); 5381 return; 5382 } 5383 done = __io_poll_complete(req, req->result); 5384 if (done) { 5385 io_poll_remove_double(req); 5386 hash_del(&req->hash_node); 5387 req->poll.done = true; 5388 } else { 5389 req->result = 0; 5390 add_wait_queue(req->poll.head, &req->poll.wait); 5391 } 5392 io_commit_cqring(ctx); 5393 spin_unlock(&ctx->completion_lock); 5394 io_cqring_ev_posted(ctx); 5395 5396 if (done) { 5397 nxt = io_put_req_find_next(req); 5398 if (nxt) 5399 io_req_task_submit(nxt, locked); 5400 } 5401 } 5402} 5403 5404static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, 5405 int sync, void *key) 5406{ 5407 struct io_kiocb *req = wait->private; 5408 struct io_poll_iocb *poll = io_poll_get_single(req); 5409 __poll_t mask = key_to_poll(key); 5410 unsigned long flags; 5411 5412 /* for instances that support it check for an event match first: */ 5413 if (mask && !(mask & poll->events)) 5414 return 0; 5415 if (!(poll->events & EPOLLONESHOT)) 5416 return poll->wait.func(&poll->wait, mode, sync, key); 5417 5418 list_del_init(&wait->entry); 5419 5420 if (poll->head) { 5421 bool done; 5422 5423 spin_lock_irqsave(&poll->head->lock, flags); 5424 done = list_empty(&poll->wait.entry); 5425 if (!done) 5426 list_del_init(&poll->wait.entry); 5427 /* make sure double remove sees this as being gone */ 5428 wait->private = NULL; 5429 spin_unlock_irqrestore(&poll->head->lock, flags); 5430 if (!done) { 5431 /* use wait func handler, so it matches the rq type */ 5432 poll->wait.func(&poll->wait, mode, sync, key); 5433 } 5434 } 5435 req_ref_put(req); 5436 return 1; 5437} 5438 5439static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, 5440 wait_queue_func_t wake_func) 5441{ 5442 poll->head = NULL; 5443 poll->done = false; 5444 poll->canceled = false; 5445#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) 5446 /* mask in events that we always want/need */ 5447 poll->events = events | IO_POLL_UNMASK; 5448 INIT_LIST_HEAD(&poll->wait.entry); 5449 init_waitqueue_func_entry(&poll->wait, wake_func); 5450} 5451 5452static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, 5453 struct wait_queue_head *head, 5454 struct io_poll_iocb **poll_ptr) 5455{ 5456 struct io_kiocb *req = pt->req; 5457 5458 /* 5459 * The file being polled uses multiple waitqueues for poll handling 5460 * (e.g. one for read, one for write). Setup a separate io_poll_iocb 5461 * if this happens. 5462 */ 5463 if (unlikely(pt->nr_entries)) { 5464 struct io_poll_iocb *poll_one = poll; 5465 5466 /* double add on the same waitqueue head, ignore */ 5467 if (poll_one->head == head) 5468 return; 5469 /* already have a 2nd entry, fail a third attempt */ 5470 if (*poll_ptr) { 5471 if ((*poll_ptr)->head == head) 5472 return; 5473 pt->error = -EINVAL; 5474 return; 5475 } 5476 /* 5477 * Can't handle multishot for double wait for now, turn it 5478 * into one-shot mode. 5479 */ 5480 if (!(poll_one->events & EPOLLONESHOT)) 5481 poll_one->events |= EPOLLONESHOT; 5482 poll = kmalloc(sizeof(*poll), GFP_ATOMIC); 5483 if (!poll) { 5484 pt->error = -ENOMEM; 5485 return; 5486 } 5487 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); 5488 req_ref_get(req); 5489 poll->wait.private = req; 5490 5491 *poll_ptr = poll; 5492 if (req->opcode == IORING_OP_POLL_ADD) 5493 req->flags |= REQ_F_ASYNC_DATA; 5494 } 5495 5496 pt->nr_entries++; 5497 poll->head = head; 5498 5499 if (poll->events & EPOLLEXCLUSIVE) 5500 add_wait_queue_exclusive(head, &poll->wait); 5501 else 5502 add_wait_queue(head, &poll->wait); 5503} 5504 5505static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, 5506 struct poll_table_struct *p) 5507{ 5508 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 5509 struct async_poll *apoll = pt->req->apoll; 5510 5511 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); 5512} 5513 5514static void io_async_task_func(struct io_kiocb *req, bool *locked) 5515{ 5516 struct async_poll *apoll = req->apoll; 5517 struct io_ring_ctx *ctx = req->ctx; 5518 5519 trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data); 5520 5521 if (io_poll_rewait(req, &apoll->poll)) { 5522 spin_unlock(&ctx->completion_lock); 5523 return; 5524 } 5525 5526 hash_del(&req->hash_node); 5527 io_poll_remove_double(req); 5528 apoll->poll.done = true; 5529 spin_unlock(&ctx->completion_lock); 5530 5531 if (!READ_ONCE(apoll->poll.canceled)) 5532 io_req_task_submit(req, locked); 5533 else 5534 io_req_complete_failed(req, -ECANCELED); 5535} 5536 5537static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 5538 void *key) 5539{ 5540 struct io_kiocb *req = wait->private; 5541 struct io_poll_iocb *poll = &req->apoll->poll; 5542 5543 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, 5544 key_to_poll(key)); 5545 5546 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); 5547} 5548 5549static void io_poll_req_insert(struct io_kiocb *req) 5550{ 5551 struct io_ring_ctx *ctx = req->ctx; 5552 struct hlist_head *list; 5553 5554 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; 5555 hlist_add_head(&req->hash_node, list); 5556} 5557 5558static __poll_t __io_arm_poll_handler(struct io_kiocb *req, 5559 struct io_poll_iocb *poll, 5560 struct io_poll_table *ipt, __poll_t mask, 5561 wait_queue_func_t wake_func) 5562 __acquires(&ctx->completion_lock) 5563{ 5564 struct io_ring_ctx *ctx = req->ctx; 5565 bool cancel = false; 5566 5567 INIT_HLIST_NODE(&req->hash_node); 5568 io_init_poll_iocb(poll, mask, wake_func); 5569 poll->file = req->file; 5570 poll->wait.private = req; 5571 5572 ipt->pt._key = mask; 5573 ipt->req = req; 5574 ipt->error = 0; 5575 ipt->nr_entries = 0; 5576 5577 mask = vfs_poll(req->file, &ipt->pt) & poll->events; 5578 if (unlikely(!ipt->nr_entries) && !ipt->error) 5579 ipt->error = -EINVAL; 5580 5581 spin_lock(&ctx->completion_lock); 5582 if (ipt->error || (mask && (poll->events & EPOLLONESHOT))) 5583 io_poll_remove_double(req); 5584 if (likely(poll->head)) { 5585 spin_lock_irq(&poll->head->lock); 5586 if (unlikely(list_empty(&poll->wait.entry))) { 5587 if (ipt->error) 5588 cancel = true; 5589 ipt->error = 0; 5590 mask = 0; 5591 } 5592 if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error) 5593 list_del_init(&poll->wait.entry); 5594 else if (cancel) 5595 WRITE_ONCE(poll->canceled, true); 5596 else if (!poll->done) /* actually waiting for an event */ 5597 io_poll_req_insert(req); 5598 spin_unlock_irq(&poll->head->lock); 5599 } 5600 5601 return mask; 5602} 5603 5604enum { 5605 IO_APOLL_OK, 5606 IO_APOLL_ABORTED, 5607 IO_APOLL_READY 5608}; 5609 5610static int io_arm_poll_handler(struct io_kiocb *req) 5611{ 5612 const struct io_op_def *def = &io_op_defs[req->opcode]; 5613 struct io_ring_ctx *ctx = req->ctx; 5614 struct async_poll *apoll; 5615 struct io_poll_table ipt; 5616 __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI; 5617 5618 if (!def->pollin && !def->pollout) 5619 return IO_APOLL_ABORTED; 5620 if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED)) 5621 return IO_APOLL_ABORTED; 5622 5623 if (def->pollin) { 5624 mask |= POLLIN | POLLRDNORM; 5625 5626 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ 5627 if ((req->opcode == IORING_OP_RECVMSG) && 5628 (req->sr_msg.msg_flags & MSG_ERRQUEUE)) 5629 mask &= ~POLLIN; 5630 } else { 5631 mask |= POLLOUT | POLLWRNORM; 5632 } 5633 5634 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); 5635 if (unlikely(!apoll)) 5636 return IO_APOLL_ABORTED; 5637 apoll->double_poll = NULL; 5638 req->apoll = apoll; 5639 req->flags |= REQ_F_POLLED; 5640 ipt.pt._qproc = io_async_queue_proc; 5641 io_req_set_refcount(req); 5642 5643 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, 5644 io_async_wake); 5645 spin_unlock(&ctx->completion_lock); 5646 if (ret || ipt.error) 5647 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; 5648 5649 trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, 5650 mask, apoll->poll.events); 5651 return IO_APOLL_OK; 5652} 5653 5654static bool __io_poll_remove_one(struct io_kiocb *req, 5655 struct io_poll_iocb *poll, bool do_cancel) 5656 __must_hold(&req->ctx->completion_lock) 5657{ 5658 bool do_complete = false; 5659 5660 if (!poll->head) 5661 return false; 5662 spin_lock_irq(&poll->head->lock); 5663 if (do_cancel) 5664 WRITE_ONCE(poll->canceled, true); 5665 if (!list_empty(&poll->wait.entry)) { 5666 list_del_init(&poll->wait.entry); 5667 do_complete = true; 5668 } 5669 spin_unlock_irq(&poll->head->lock); 5670 hash_del(&req->hash_node); 5671 return do_complete; 5672} 5673 5674static bool io_poll_remove_one(struct io_kiocb *req) 5675 __must_hold(&req->ctx->completion_lock) 5676{ 5677 bool do_complete; 5678 5679 io_poll_remove_double(req); 5680 do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); 5681 5682 if (do_complete) { 5683 io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); 5684 io_commit_cqring(req->ctx); 5685 req_set_fail(req); 5686 io_put_req_deferred(req); 5687 } 5688 return do_complete; 5689} 5690 5691/* 5692 * Returns true if we found and killed one or more poll requests 5693 */ 5694static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, 5695 struct task_struct *tsk, bool cancel_all) 5696{ 5697 struct hlist_node *tmp; 5698 struct io_kiocb *req; 5699 int posted = 0, i; 5700 5701 spin_lock(&ctx->completion_lock); 5702 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 5703 struct hlist_head *list; 5704 5705 list = &ctx->cancel_hash[i]; 5706 hlist_for_each_entry_safe(req, tmp, list, hash_node) { 5707 if (io_match_task(req, tsk, cancel_all)) 5708 posted += io_poll_remove_one(req); 5709 } 5710 } 5711 spin_unlock(&ctx->completion_lock); 5712 5713 if (posted) 5714 io_cqring_ev_posted(ctx); 5715 5716 return posted != 0; 5717} 5718 5719static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, 5720 bool poll_only) 5721 __must_hold(&ctx->completion_lock) 5722{ 5723 struct hlist_head *list; 5724 struct io_kiocb *req; 5725 5726 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; 5727 hlist_for_each_entry(req, list, hash_node) { 5728 if (sqe_addr != req->user_data) 5729 continue; 5730 if (poll_only && req->opcode != IORING_OP_POLL_ADD) 5731 continue; 5732 return req; 5733 } 5734 return NULL; 5735} 5736 5737static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, 5738 bool poll_only) 5739 __must_hold(&ctx->completion_lock) 5740{ 5741 struct io_kiocb *req; 5742 5743 req = io_poll_find(ctx, sqe_addr, poll_only); 5744 if (!req) 5745 return -ENOENT; 5746 if (io_poll_remove_one(req)) 5747 return 0; 5748 5749 return -EALREADY; 5750} 5751 5752static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, 5753 unsigned int flags) 5754{ 5755 u32 events; 5756 5757 events = READ_ONCE(sqe->poll32_events); 5758#ifdef __BIG_ENDIAN 5759 events = swahw32(events); 5760#endif 5761 if (!(flags & IORING_POLL_ADD_MULTI)) 5762 events |= EPOLLONESHOT; 5763 return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); 5764} 5765 5766static int io_poll_update_prep(struct io_kiocb *req, 5767 const struct io_uring_sqe *sqe) 5768{ 5769 struct io_poll_update *upd = &req->poll_update; 5770 u32 flags; 5771 5772 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5773 return -EINVAL; 5774 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 5775 return -EINVAL; 5776 flags = READ_ONCE(sqe->len); 5777 if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | 5778 IORING_POLL_ADD_MULTI)) 5779 return -EINVAL; 5780 /* meaningless without update */ 5781 if (flags == IORING_POLL_ADD_MULTI) 5782 return -EINVAL; 5783 5784 upd->old_user_data = READ_ONCE(sqe->addr); 5785 upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; 5786 upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; 5787 5788 upd->new_user_data = READ_ONCE(sqe->off); 5789 if (!upd->update_user_data && upd->new_user_data) 5790 return -EINVAL; 5791 if (upd->update_events) 5792 upd->events = io_poll_parse_events(sqe, flags); 5793 else if (sqe->poll32_events) 5794 return -EINVAL; 5795 5796 return 0; 5797} 5798 5799static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 5800 void *key) 5801{ 5802 struct io_kiocb *req = wait->private; 5803 struct io_poll_iocb *poll = &req->poll; 5804 5805 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); 5806} 5807 5808static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, 5809 struct poll_table_struct *p) 5810{ 5811 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 5812 5813 __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data); 5814} 5815 5816static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5817{ 5818 struct io_poll_iocb *poll = &req->poll; 5819 u32 flags; 5820 5821 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5822 return -EINVAL; 5823 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) 5824 return -EINVAL; 5825 flags = READ_ONCE(sqe->len); 5826 if (flags & ~IORING_POLL_ADD_MULTI) 5827 return -EINVAL; 5828 5829 io_req_set_refcount(req); 5830 poll->events = io_poll_parse_events(sqe, flags); 5831 return 0; 5832} 5833 5834static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) 5835{ 5836 struct io_poll_iocb *poll = &req->poll; 5837 struct io_ring_ctx *ctx = req->ctx; 5838 struct io_poll_table ipt; 5839 __poll_t mask; 5840 bool done; 5841 5842 ipt.pt._qproc = io_poll_queue_proc; 5843 5844 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, 5845 io_poll_wake); 5846 5847 if (mask) { /* no async, we'd stolen it */ 5848 ipt.error = 0; 5849 done = __io_poll_complete(req, mask); 5850 io_commit_cqring(req->ctx); 5851 } 5852 spin_unlock(&ctx->completion_lock); 5853 5854 if (mask) { 5855 io_cqring_ev_posted(ctx); 5856 if (done) 5857 io_put_req(req); 5858 } 5859 return ipt.error; 5860} 5861 5862static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) 5863{ 5864 struct io_ring_ctx *ctx = req->ctx; 5865 struct io_kiocb *preq; 5866 bool completing; 5867 int ret; 5868 5869 spin_lock(&ctx->completion_lock); 5870 preq = io_poll_find(ctx, req->poll_update.old_user_data, true); 5871 if (!preq) { 5872 ret = -ENOENT; 5873 goto err; 5874 } 5875 5876 if (!req->poll_update.update_events && !req->poll_update.update_user_data) { 5877 completing = true; 5878 ret = io_poll_remove_one(preq) ? 0 : -EALREADY; 5879 goto err; 5880 } 5881 5882 /* 5883 * Don't allow racy completion with singleshot, as we cannot safely 5884 * update those. For multishot, if we're racing with completion, just 5885 * let completion re-add it. 5886 */ 5887 completing = !__io_poll_remove_one(preq, &preq->poll, false); 5888 if (completing && (preq->poll.events & EPOLLONESHOT)) { 5889 ret = -EALREADY; 5890 goto err; 5891 } 5892 /* we now have a detached poll request. reissue. */ 5893 ret = 0; 5894err: 5895 if (ret < 0) { 5896 spin_unlock(&ctx->completion_lock); 5897 req_set_fail(req); 5898 io_req_complete(req, ret); 5899 return 0; 5900 } 5901 /* only mask one event flags, keep behavior flags */ 5902 if (req->poll_update.update_events) { 5903 preq->poll.events &= ~0xffff; 5904 preq->poll.events |= req->poll_update.events & 0xffff; 5905 preq->poll.events |= IO_POLL_UNMASK; 5906 } 5907 if (req->poll_update.update_user_data) 5908 preq->user_data = req->poll_update.new_user_data; 5909 spin_unlock(&ctx->completion_lock); 5910 5911 /* complete update request, we're done with it */ 5912 io_req_complete(req, ret); 5913 5914 if (!completing) { 5915 ret = io_poll_add(preq, issue_flags); 5916 if (ret < 0) { 5917 req_set_fail(preq); 5918 io_req_complete(preq, ret); 5919 } 5920 } 5921 return 0; 5922} 5923 5924static void io_req_task_timeout(struct io_kiocb *req, bool *locked) 5925{ 5926 struct io_timeout_data *data = req->async_data; 5927 5928 if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) 5929 req_set_fail(req); 5930 io_req_complete_post(req, -ETIME, 0); 5931} 5932 5933static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) 5934{ 5935 struct io_timeout_data *data = container_of(timer, 5936 struct io_timeout_data, timer); 5937 struct io_kiocb *req = data->req; 5938 struct io_ring_ctx *ctx = req->ctx; 5939 unsigned long flags; 5940 5941 spin_lock_irqsave(&ctx->timeout_lock, flags); 5942 list_del_init(&req->timeout.list); 5943 atomic_set(&req->ctx->cq_timeouts, 5944 atomic_read(&req->ctx->cq_timeouts) + 1); 5945 spin_unlock_irqrestore(&ctx->timeout_lock, flags); 5946 5947 req->io_task_work.func = io_req_task_timeout; 5948 io_req_task_work_add(req); 5949 return HRTIMER_NORESTART; 5950} 5951 5952static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, 5953 __u64 user_data) 5954 __must_hold(&ctx->timeout_lock) 5955{ 5956 struct io_timeout_data *io; 5957 struct io_kiocb *req; 5958 bool found = false; 5959 5960 list_for_each_entry(req, &ctx->timeout_list, timeout.list) { 5961 found = user_data == req->user_data; 5962 if (found) 5963 break; 5964 } 5965 if (!found) 5966 return ERR_PTR(-ENOENT); 5967 5968 io = req->async_data; 5969 if (hrtimer_try_to_cancel(&io->timer) == -1) 5970 return ERR_PTR(-EALREADY); 5971 list_del_init(&req->timeout.list); 5972 return req; 5973} 5974 5975static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 5976 __must_hold(&ctx->completion_lock) 5977 __must_hold(&ctx->timeout_lock) 5978{ 5979 struct io_kiocb *req = io_timeout_extract(ctx, user_data); 5980 5981 if (IS_ERR(req)) 5982 return PTR_ERR(req); 5983 5984 req_set_fail(req); 5985 io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); 5986 io_put_req_deferred(req); 5987 return 0; 5988} 5989 5990static clockid_t io_timeout_get_clock(struct io_timeout_data *data) 5991{ 5992 switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { 5993 case IORING_TIMEOUT_BOOTTIME: 5994 return CLOCK_BOOTTIME; 5995 case IORING_TIMEOUT_REALTIME: 5996 return CLOCK_REALTIME; 5997 default: 5998 /* can't happen, vetted at prep time */ 5999 WARN_ON_ONCE(1); 6000 fallthrough; 6001 case 0: 6002 return CLOCK_MONOTONIC; 6003 } 6004} 6005 6006static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 6007 struct timespec64 *ts, enum hrtimer_mode mode) 6008 __must_hold(&ctx->timeout_lock) 6009{ 6010 struct io_timeout_data *io; 6011 struct io_kiocb *req; 6012 bool found = false; 6013 6014 list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { 6015 found = user_data == req->user_data; 6016 if (found) 6017 break; 6018 } 6019 if (!found) 6020 return -ENOENT; 6021 6022 io = req->async_data; 6023 if (hrtimer_try_to_cancel(&io->timer) == -1) 6024 return -EALREADY; 6025 hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); 6026 io->timer.function = io_link_timeout_fn; 6027 hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); 6028 return 0; 6029} 6030 6031static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 6032 struct timespec64 *ts, enum hrtimer_mode mode) 6033 __must_hold(&ctx->timeout_lock) 6034{ 6035 struct io_kiocb *req = io_timeout_extract(ctx, user_data); 6036 struct io_timeout_data *data; 6037 6038 if (IS_ERR(req)) 6039 return PTR_ERR(req); 6040 6041 req->timeout.off = 0; /* noseq */ 6042 data = req->async_data; 6043 list_add_tail(&req->timeout.list, &ctx->timeout_list); 6044 hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); 6045 data->timer.function = io_timeout_fn; 6046 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); 6047 return 0; 6048} 6049 6050static int io_timeout_remove_prep(struct io_kiocb *req, 6051 const struct io_uring_sqe *sqe) 6052{ 6053 struct io_timeout_rem *tr = &req->timeout_rem; 6054 6055 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6056 return -EINVAL; 6057 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6058 return -EINVAL; 6059 if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) 6060 return -EINVAL; 6061 6062 tr->ltimeout = false; 6063 tr->addr = READ_ONCE(sqe->addr); 6064 tr->flags = READ_ONCE(sqe->timeout_flags); 6065 if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { 6066 if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) 6067 return -EINVAL; 6068 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) 6069 tr->ltimeout = true; 6070 if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) 6071 return -EINVAL; 6072 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) 6073 return -EFAULT; 6074 } else if (tr->flags) { 6075 /* timeout removal doesn't support flags */ 6076 return -EINVAL; 6077 } 6078 6079 return 0; 6080} 6081 6082static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) 6083{ 6084 return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS 6085 : HRTIMER_MODE_REL; 6086} 6087 6088/* 6089 * Remove or update an existing timeout command 6090 */ 6091static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) 6092{ 6093 struct io_timeout_rem *tr = &req->timeout_rem; 6094 struct io_ring_ctx *ctx = req->ctx; 6095 int ret; 6096 6097 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { 6098 spin_lock(&ctx->completion_lock); 6099 spin_lock_irq(&ctx->timeout_lock); 6100 ret = io_timeout_cancel(ctx, tr->addr); 6101 spin_unlock_irq(&ctx->timeout_lock); 6102 spin_unlock(&ctx->completion_lock); 6103 } else { 6104 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); 6105 6106 spin_lock_irq(&ctx->timeout_lock); 6107 if (tr->ltimeout) 6108 ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); 6109 else 6110 ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); 6111 spin_unlock_irq(&ctx->timeout_lock); 6112 } 6113 6114 if (ret < 0) 6115 req_set_fail(req); 6116 io_req_complete_post(req, ret, 0); 6117 return 0; 6118} 6119 6120static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, 6121 bool is_timeout_link) 6122{ 6123 struct io_timeout_data *data; 6124 unsigned flags; 6125 u32 off = READ_ONCE(sqe->off); 6126 6127 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6128 return -EINVAL; 6129 if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || 6130 sqe->splice_fd_in) 6131 return -EINVAL; 6132 if (off && is_timeout_link) 6133 return -EINVAL; 6134 flags = READ_ONCE(sqe->timeout_flags); 6135 if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | 6136 IORING_TIMEOUT_ETIME_SUCCESS)) 6137 return -EINVAL; 6138 /* more than one clock specified is invalid, obviously */ 6139 if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) 6140 return -EINVAL; 6141 6142 INIT_LIST_HEAD(&req->timeout.list); 6143 req->timeout.off = off; 6144 if (unlikely(off && !req->ctx->off_timeout_used)) 6145 req->ctx->off_timeout_used = true; 6146 6147 if (WARN_ON_ONCE(req_has_async_data(req))) 6148 return -EFAULT; 6149 if (io_alloc_async_data(req)) 6150 return -ENOMEM; 6151 6152 data = req->async_data; 6153 data->req = req; 6154 data->flags = flags; 6155 6156 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) 6157 return -EFAULT; 6158 6159 data->mode = io_translate_timeout_mode(flags); 6160 hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); 6161 6162 if (is_timeout_link) { 6163 struct io_submit_link *link = &req->ctx->submit_state.link; 6164 6165 if (!link->head) 6166 return -EINVAL; 6167 if (link->last->opcode == IORING_OP_LINK_TIMEOUT) 6168 return -EINVAL; 6169 req->timeout.head = link->last; 6170 link->last->flags |= REQ_F_ARM_LTIMEOUT; 6171 } 6172 return 0; 6173} 6174 6175static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) 6176{ 6177 struct io_ring_ctx *ctx = req->ctx; 6178 struct io_timeout_data *data = req->async_data; 6179 struct list_head *entry; 6180 u32 tail, off = req->timeout.off; 6181 6182 spin_lock_irq(&ctx->timeout_lock); 6183 6184 /* 6185 * sqe->off holds how many events that need to occur for this 6186 * timeout event to be satisfied. If it isn't set, then this is 6187 * a pure timeout request, sequence isn't used. 6188 */ 6189 if (io_is_timeout_noseq(req)) { 6190 entry = ctx->timeout_list.prev; 6191 goto add; 6192 } 6193 6194 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 6195 req->timeout.target_seq = tail + off; 6196 6197 /* Update the last seq here in case io_flush_timeouts() hasn't. 6198 * This is safe because ->completion_lock is held, and submissions 6199 * and completions are never mixed in the same ->completion_lock section. 6200 */ 6201 ctx->cq_last_tm_flush = tail; 6202 6203 /* 6204 * Insertion sort, ensuring the first entry in the list is always 6205 * the one we need first. 6206 */ 6207 list_for_each_prev(entry, &ctx->timeout_list) { 6208 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, 6209 timeout.list); 6210 6211 if (io_is_timeout_noseq(nxt)) 6212 continue; 6213 /* nxt.seq is behind @tail, otherwise would've been completed */ 6214 if (off >= nxt->timeout.target_seq - tail) 6215 break; 6216 } 6217add: 6218 list_add(&req->timeout.list, entry); 6219 data->timer.function = io_timeout_fn; 6220 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); 6221 spin_unlock_irq(&ctx->timeout_lock); 6222 return 0; 6223} 6224 6225struct io_cancel_data { 6226 struct io_ring_ctx *ctx; 6227 u64 user_data; 6228}; 6229 6230static bool io_cancel_cb(struct io_wq_work *work, void *data) 6231{ 6232 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6233 struct io_cancel_data *cd = data; 6234 6235 return req->ctx == cd->ctx && req->user_data == cd->user_data; 6236} 6237 6238static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, 6239 struct io_ring_ctx *ctx) 6240{ 6241 struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; 6242 enum io_wq_cancel cancel_ret; 6243 int ret = 0; 6244 6245 if (!tctx || !tctx->io_wq) 6246 return -ENOENT; 6247 6248 cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); 6249 switch (cancel_ret) { 6250 case IO_WQ_CANCEL_OK: 6251 ret = 0; 6252 break; 6253 case IO_WQ_CANCEL_RUNNING: 6254 ret = -EALREADY; 6255 break; 6256 case IO_WQ_CANCEL_NOTFOUND: 6257 ret = -ENOENT; 6258 break; 6259 } 6260 6261 return ret; 6262} 6263 6264static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) 6265{ 6266 struct io_ring_ctx *ctx = req->ctx; 6267 int ret; 6268 6269 WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); 6270 6271 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); 6272 if (ret != -ENOENT) 6273 return ret; 6274 6275 spin_lock(&ctx->completion_lock); 6276 spin_lock_irq(&ctx->timeout_lock); 6277 ret = io_timeout_cancel(ctx, sqe_addr); 6278 spin_unlock_irq(&ctx->timeout_lock); 6279 if (ret != -ENOENT) 6280 goto out; 6281 ret = io_poll_cancel(ctx, sqe_addr, false); 6282out: 6283 spin_unlock(&ctx->completion_lock); 6284 return ret; 6285} 6286 6287static int io_async_cancel_prep(struct io_kiocb *req, 6288 const struct io_uring_sqe *sqe) 6289{ 6290 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6291 return -EINVAL; 6292 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6293 return -EINVAL; 6294 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || 6295 sqe->splice_fd_in) 6296 return -EINVAL; 6297 6298 req->cancel.addr = READ_ONCE(sqe->addr); 6299 return 0; 6300} 6301 6302static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) 6303{ 6304 struct io_ring_ctx *ctx = req->ctx; 6305 u64 sqe_addr = req->cancel.addr; 6306 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 6307 struct io_tctx_node *node; 6308 int ret; 6309 6310 ret = io_try_cancel_userdata(req, sqe_addr); 6311 if (ret != -ENOENT) 6312 goto done; 6313 6314 /* slow path, try all io-wq's */ 6315 io_ring_submit_lock(ctx, needs_lock); 6316 ret = -ENOENT; 6317 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 6318 struct io_uring_task *tctx = node->task->io_uring; 6319 6320 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); 6321 if (ret != -ENOENT) 6322 break; 6323 } 6324 io_ring_submit_unlock(ctx, needs_lock); 6325done: 6326 if (ret < 0) 6327 req_set_fail(req); 6328 io_req_complete_post(req, ret, 0); 6329 return 0; 6330} 6331 6332static int io_rsrc_update_prep(struct io_kiocb *req, 6333 const struct io_uring_sqe *sqe) 6334{ 6335 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6336 return -EINVAL; 6337 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) 6338 return -EINVAL; 6339 6340 req->rsrc_update.offset = READ_ONCE(sqe->off); 6341 req->rsrc_update.nr_args = READ_ONCE(sqe->len); 6342 if (!req->rsrc_update.nr_args) 6343 return -EINVAL; 6344 req->rsrc_update.arg = READ_ONCE(sqe->addr); 6345 return 0; 6346} 6347 6348static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 6349{ 6350 struct io_ring_ctx *ctx = req->ctx; 6351 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 6352 struct io_uring_rsrc_update2 up; 6353 int ret; 6354 6355 up.offset = req->rsrc_update.offset; 6356 up.data = req->rsrc_update.arg; 6357 up.nr = 0; 6358 up.tags = 0; 6359 up.resv = 0; 6360 6361 io_ring_submit_lock(ctx, needs_lock); 6362 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, 6363 &up, req->rsrc_update.nr_args); 6364 io_ring_submit_unlock(ctx, needs_lock); 6365 6366 if (ret < 0) 6367 req_set_fail(req); 6368 __io_req_complete(req, issue_flags, ret, 0); 6369 return 0; 6370} 6371 6372static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 6373{ 6374 switch (req->opcode) { 6375 case IORING_OP_NOP: 6376 return 0; 6377 case IORING_OP_READV: 6378 case IORING_OP_READ_FIXED: 6379 case IORING_OP_READ: 6380 return io_read_prep(req, sqe); 6381 case IORING_OP_WRITEV: 6382 case IORING_OP_WRITE_FIXED: 6383 case IORING_OP_WRITE: 6384 return io_write_prep(req, sqe); 6385 case IORING_OP_POLL_ADD: 6386 return io_poll_add_prep(req, sqe); 6387 case IORING_OP_POLL_REMOVE: 6388 return io_poll_update_prep(req, sqe); 6389 case IORING_OP_FSYNC: 6390 return io_fsync_prep(req, sqe); 6391 case IORING_OP_SYNC_FILE_RANGE: 6392 return io_sfr_prep(req, sqe); 6393 case IORING_OP_SENDMSG: 6394 case IORING_OP_SEND: 6395 return io_sendmsg_prep(req, sqe); 6396 case IORING_OP_RECVMSG: 6397 case IORING_OP_RECV: 6398 return io_recvmsg_prep(req, sqe); 6399 case IORING_OP_CONNECT: 6400 return io_connect_prep(req, sqe); 6401 case IORING_OP_TIMEOUT: 6402 return io_timeout_prep(req, sqe, false); 6403 case IORING_OP_TIMEOUT_REMOVE: 6404 return io_timeout_remove_prep(req, sqe); 6405 case IORING_OP_ASYNC_CANCEL: 6406 return io_async_cancel_prep(req, sqe); 6407 case IORING_OP_LINK_TIMEOUT: 6408 return io_timeout_prep(req, sqe, true); 6409 case IORING_OP_ACCEPT: 6410 return io_accept_prep(req, sqe); 6411 case IORING_OP_FALLOCATE: 6412 return io_fallocate_prep(req, sqe); 6413 case IORING_OP_OPENAT: 6414 return io_openat_prep(req, sqe); 6415 case IORING_OP_CLOSE: 6416 return io_close_prep(req, sqe); 6417 case IORING_OP_FILES_UPDATE: 6418 return io_rsrc_update_prep(req, sqe); 6419 case IORING_OP_STATX: 6420 return io_statx_prep(req, sqe); 6421 case IORING_OP_FADVISE: 6422 return io_fadvise_prep(req, sqe); 6423 case IORING_OP_MADVISE: 6424 return io_madvise_prep(req, sqe); 6425 case IORING_OP_OPENAT2: 6426 return io_openat2_prep(req, sqe); 6427 case IORING_OP_EPOLL_CTL: 6428 return io_epoll_ctl_prep(req, sqe); 6429 case IORING_OP_SPLICE: 6430 return io_splice_prep(req, sqe); 6431 case IORING_OP_PROVIDE_BUFFERS: 6432 return io_provide_buffers_prep(req, sqe); 6433 case IORING_OP_REMOVE_BUFFERS: 6434 return io_remove_buffers_prep(req, sqe); 6435 case IORING_OP_TEE: 6436 return io_tee_prep(req, sqe); 6437 case IORING_OP_SHUTDOWN: 6438 return io_shutdown_prep(req, sqe); 6439 case IORING_OP_RENAMEAT: 6440 return io_renameat_prep(req, sqe); 6441 case IORING_OP_UNLINKAT: 6442 return io_unlinkat_prep(req, sqe); 6443 case IORING_OP_MKDIRAT: 6444 return io_mkdirat_prep(req, sqe); 6445 case IORING_OP_SYMLINKAT: 6446 return io_symlinkat_prep(req, sqe); 6447 case IORING_OP_LINKAT: 6448 return io_linkat_prep(req, sqe); 6449 } 6450 6451 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", 6452 req->opcode); 6453 return -EINVAL; 6454} 6455 6456static int io_req_prep_async(struct io_kiocb *req) 6457{ 6458 if (!io_op_defs[req->opcode].needs_async_setup) 6459 return 0; 6460 if (WARN_ON_ONCE(req_has_async_data(req))) 6461 return -EFAULT; 6462 if (io_alloc_async_data(req)) 6463 return -EAGAIN; 6464 6465 switch (req->opcode) { 6466 case IORING_OP_READV: 6467 return io_rw_prep_async(req, READ); 6468 case IORING_OP_WRITEV: 6469 return io_rw_prep_async(req, WRITE); 6470 case IORING_OP_SENDMSG: 6471 return io_sendmsg_prep_async(req); 6472 case IORING_OP_RECVMSG: 6473 return io_recvmsg_prep_async(req); 6474 case IORING_OP_CONNECT: 6475 return io_connect_prep_async(req); 6476 } 6477 printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", 6478 req->opcode); 6479 return -EFAULT; 6480} 6481 6482static u32 io_get_sequence(struct io_kiocb *req) 6483{ 6484 u32 seq = req->ctx->cached_sq_head; 6485 6486 /* need original cached_sq_head, but it was increased for each req */ 6487 io_for_each_link(req, req) 6488 seq--; 6489 return seq; 6490} 6491 6492static __cold void io_drain_req(struct io_kiocb *req) 6493{ 6494 struct io_ring_ctx *ctx = req->ctx; 6495 struct io_defer_entry *de; 6496 int ret; 6497 u32 seq = io_get_sequence(req); 6498 6499 /* Still need defer if there is pending req in defer list. */ 6500 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { 6501queue: 6502 ctx->drain_active = false; 6503 io_req_task_queue(req); 6504 return; 6505 } 6506 6507 ret = io_req_prep_async(req); 6508 if (ret) { 6509fail: 6510 io_req_complete_failed(req, ret); 6511 return; 6512 } 6513 io_prep_async_link(req); 6514 de = kmalloc(sizeof(*de), GFP_KERNEL); 6515 if (!de) { 6516 ret = -ENOMEM; 6517 goto fail; 6518 } 6519 6520 spin_lock(&ctx->completion_lock); 6521 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { 6522 spin_unlock(&ctx->completion_lock); 6523 kfree(de); 6524 goto queue; 6525 } 6526 6527 trace_io_uring_defer(ctx, req, req->user_data); 6528 de->req = req; 6529 de->seq = seq; 6530 list_add_tail(&de->list, &ctx->defer_list); 6531 spin_unlock(&ctx->completion_lock); 6532} 6533 6534static void io_clean_op(struct io_kiocb *req) 6535{ 6536 if (req->flags & REQ_F_BUFFER_SELECTED) { 6537 kfree(req->kbuf); 6538 req->kbuf = NULL; 6539 } 6540 6541 if (req->flags & REQ_F_NEED_CLEANUP) { 6542 switch (req->opcode) { 6543 case IORING_OP_READV: 6544 case IORING_OP_READ_FIXED: 6545 case IORING_OP_READ: 6546 case IORING_OP_WRITEV: 6547 case IORING_OP_WRITE_FIXED: 6548 case IORING_OP_WRITE: { 6549 struct io_async_rw *io = req->async_data; 6550 6551 kfree(io->free_iovec); 6552 break; 6553 } 6554 case IORING_OP_RECVMSG: 6555 case IORING_OP_SENDMSG: { 6556 struct io_async_msghdr *io = req->async_data; 6557 6558 kfree(io->free_iov); 6559 break; 6560 } 6561 case IORING_OP_SPLICE: 6562 case IORING_OP_TEE: 6563 if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED)) 6564 io_put_file(req->splice.file_in); 6565 break; 6566 case IORING_OP_OPENAT: 6567 case IORING_OP_OPENAT2: 6568 if (req->open.filename) 6569 putname(req->open.filename); 6570 break; 6571 case IORING_OP_RENAMEAT: 6572 putname(req->rename.oldpath); 6573 putname(req->rename.newpath); 6574 break; 6575 case IORING_OP_UNLINKAT: 6576 putname(req->unlink.filename); 6577 break; 6578 case IORING_OP_MKDIRAT: 6579 putname(req->mkdir.filename); 6580 break; 6581 case IORING_OP_SYMLINKAT: 6582 putname(req->symlink.oldpath); 6583 putname(req->symlink.newpath); 6584 break; 6585 case IORING_OP_LINKAT: 6586 putname(req->hardlink.oldpath); 6587 putname(req->hardlink.newpath); 6588 break; 6589 } 6590 } 6591 if ((req->flags & REQ_F_POLLED) && req->apoll) { 6592 kfree(req->apoll->double_poll); 6593 kfree(req->apoll); 6594 req->apoll = NULL; 6595 } 6596 if (req->flags & REQ_F_INFLIGHT) { 6597 struct io_uring_task *tctx = req->task->io_uring; 6598 6599 atomic_dec(&tctx->inflight_tracked); 6600 } 6601 if (req->flags & REQ_F_CREDS) 6602 put_cred(req->creds); 6603 if (req->flags & REQ_F_ASYNC_DATA) { 6604 kfree(req->async_data); 6605 req->async_data = NULL; 6606 } 6607 req->flags &= ~IO_REQ_CLEAN_FLAGS; 6608} 6609 6610static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) 6611{ 6612 const struct cred *creds = NULL; 6613 int ret; 6614 6615 if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) 6616 creds = override_creds(req->creds); 6617 6618 if (!io_op_defs[req->opcode].audit_skip) 6619 audit_uring_entry(req->opcode); 6620 6621 switch (req->opcode) { 6622 case IORING_OP_NOP: 6623 ret = io_nop(req, issue_flags); 6624 break; 6625 case IORING_OP_READV: 6626 case IORING_OP_READ_FIXED: 6627 case IORING_OP_READ: 6628 ret = io_read(req, issue_flags); 6629 break; 6630 case IORING_OP_WRITEV: 6631 case IORING_OP_WRITE_FIXED: 6632 case IORING_OP_WRITE: 6633 ret = io_write(req, issue_flags); 6634 break; 6635 case IORING_OP_FSYNC: 6636 ret = io_fsync(req, issue_flags); 6637 break; 6638 case IORING_OP_POLL_ADD: 6639 ret = io_poll_add(req, issue_flags); 6640 break; 6641 case IORING_OP_POLL_REMOVE: 6642 ret = io_poll_update(req, issue_flags); 6643 break; 6644 case IORING_OP_SYNC_FILE_RANGE: 6645 ret = io_sync_file_range(req, issue_flags); 6646 break; 6647 case IORING_OP_SENDMSG: 6648 ret = io_sendmsg(req, issue_flags); 6649 break; 6650 case IORING_OP_SEND: 6651 ret = io_send(req, issue_flags); 6652 break; 6653 case IORING_OP_RECVMSG: 6654 ret = io_recvmsg(req, issue_flags); 6655 break; 6656 case IORING_OP_RECV: 6657 ret = io_recv(req, issue_flags); 6658 break; 6659 case IORING_OP_TIMEOUT: 6660 ret = io_timeout(req, issue_flags); 6661 break; 6662 case IORING_OP_TIMEOUT_REMOVE: 6663 ret = io_timeout_remove(req, issue_flags); 6664 break; 6665 case IORING_OP_ACCEPT: 6666 ret = io_accept(req, issue_flags); 6667 break; 6668 case IORING_OP_CONNECT: 6669 ret = io_connect(req, issue_flags); 6670 break; 6671 case IORING_OP_ASYNC_CANCEL: 6672 ret = io_async_cancel(req, issue_flags); 6673 break; 6674 case IORING_OP_FALLOCATE: 6675 ret = io_fallocate(req, issue_flags); 6676 break; 6677 case IORING_OP_OPENAT: 6678 ret = io_openat(req, issue_flags); 6679 break; 6680 case IORING_OP_CLOSE: 6681 ret = io_close(req, issue_flags); 6682 break; 6683 case IORING_OP_FILES_UPDATE: 6684 ret = io_files_update(req, issue_flags); 6685 break; 6686 case IORING_OP_STATX: 6687 ret = io_statx(req, issue_flags); 6688 break; 6689 case IORING_OP_FADVISE: 6690 ret = io_fadvise(req, issue_flags); 6691 break; 6692 case IORING_OP_MADVISE: 6693 ret = io_madvise(req, issue_flags); 6694 break; 6695 case IORING_OP_OPENAT2: 6696 ret = io_openat2(req, issue_flags); 6697 break; 6698 case IORING_OP_EPOLL_CTL: 6699 ret = io_epoll_ctl(req, issue_flags); 6700 break; 6701 case IORING_OP_SPLICE: 6702 ret = io_splice(req, issue_flags); 6703 break; 6704 case IORING_OP_PROVIDE_BUFFERS: 6705 ret = io_provide_buffers(req, issue_flags); 6706 break; 6707 case IORING_OP_REMOVE_BUFFERS: 6708 ret = io_remove_buffers(req, issue_flags); 6709 break; 6710 case IORING_OP_TEE: 6711 ret = io_tee(req, issue_flags); 6712 break; 6713 case IORING_OP_SHUTDOWN: 6714 ret = io_shutdown(req, issue_flags); 6715 break; 6716 case IORING_OP_RENAMEAT: 6717 ret = io_renameat(req, issue_flags); 6718 break; 6719 case IORING_OP_UNLINKAT: 6720 ret = io_unlinkat(req, issue_flags); 6721 break; 6722 case IORING_OP_MKDIRAT: 6723 ret = io_mkdirat(req, issue_flags); 6724 break; 6725 case IORING_OP_SYMLINKAT: 6726 ret = io_symlinkat(req, issue_flags); 6727 break; 6728 case IORING_OP_LINKAT: 6729 ret = io_linkat(req, issue_flags); 6730 break; 6731 default: 6732 ret = -EINVAL; 6733 break; 6734 } 6735 6736 if (!io_op_defs[req->opcode].audit_skip) 6737 audit_uring_exit(!ret, ret); 6738 6739 if (creds) 6740 revert_creds(creds); 6741 if (ret) 6742 return ret; 6743 /* If the op doesn't have a file, we're not polling for it */ 6744 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file) 6745 io_iopoll_req_issued(req, issue_flags); 6746 6747 return 0; 6748} 6749 6750static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) 6751{ 6752 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6753 6754 req = io_put_req_find_next(req); 6755 return req ? &req->work : NULL; 6756} 6757 6758static void io_wq_submit_work(struct io_wq_work *work) 6759{ 6760 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6761 unsigned int issue_flags = IO_URING_F_UNLOCKED; 6762 bool needs_poll = false; 6763 struct io_kiocb *timeout; 6764 int ret = 0; 6765 6766 /* one will be dropped by ->io_free_work() after returning to io-wq */ 6767 if (!(req->flags & REQ_F_REFCOUNT)) 6768 __io_req_set_refcount(req, 2); 6769 else 6770 req_ref_get(req); 6771 6772 timeout = io_prep_linked_timeout(req); 6773 if (timeout) 6774 io_queue_linked_timeout(timeout); 6775 6776 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ 6777 if (work->flags & IO_WQ_WORK_CANCEL) { 6778 io_req_task_queue_fail(req, -ECANCELED); 6779 return; 6780 } 6781 6782 if (req->flags & REQ_F_FORCE_ASYNC) { 6783 const struct io_op_def *def = &io_op_defs[req->opcode]; 6784 bool opcode_poll = def->pollin || def->pollout; 6785 6786 if (opcode_poll && file_can_poll(req->file)) { 6787 needs_poll = true; 6788 issue_flags |= IO_URING_F_NONBLOCK; 6789 } 6790 } 6791 6792 do { 6793 ret = io_issue_sqe(req, issue_flags); 6794 if (ret != -EAGAIN) 6795 break; 6796 /* 6797 * We can get EAGAIN for iopolled IO even though we're 6798 * forcing a sync submission from here, since we can't 6799 * wait for request slots on the block side. 6800 */ 6801 if (!needs_poll) { 6802 cond_resched(); 6803 continue; 6804 } 6805 6806 if (io_arm_poll_handler(req) == IO_APOLL_OK) 6807 return; 6808 /* aborted or ready, in either case retry blocking */ 6809 needs_poll = false; 6810 issue_flags &= ~IO_URING_F_NONBLOCK; 6811 } while (1); 6812 6813 /* avoid locking problems by failing it from a clean context */ 6814 if (ret) 6815 io_req_task_queue_fail(req, ret); 6816} 6817 6818static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, 6819 unsigned i) 6820{ 6821 return &table->files[i]; 6822} 6823 6824static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, 6825 int index) 6826{ 6827 struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index); 6828 6829 return (struct file *) (slot->file_ptr & FFS_MASK); 6830} 6831 6832static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) 6833{ 6834 unsigned long file_ptr = (unsigned long) file; 6835 6836 file_ptr |= io_file_get_flags(file); 6837 file_slot->file_ptr = file_ptr; 6838} 6839 6840static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, 6841 struct io_kiocb *req, int fd) 6842{ 6843 struct file *file; 6844 unsigned long file_ptr; 6845 6846 if (unlikely((unsigned int)fd >= ctx->nr_user_files)) 6847 return NULL; 6848 fd = array_index_nospec(fd, ctx->nr_user_files); 6849 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; 6850 file = (struct file *) (file_ptr & FFS_MASK); 6851 file_ptr &= ~FFS_MASK; 6852 /* mask in overlapping REQ_F and FFS bits */ 6853 req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); 6854 io_req_set_rsrc_node(req, ctx); 6855 return file; 6856} 6857 6858static struct file *io_file_get_normal(struct io_ring_ctx *ctx, 6859 struct io_kiocb *req, int fd) 6860{ 6861 struct file *file = fget(fd); 6862 6863 trace_io_uring_file_get(ctx, fd); 6864 6865 /* we don't allow fixed io_uring files */ 6866 if (file && unlikely(file->f_op == &io_uring_fops)) 6867 io_req_track_inflight(req); 6868 return file; 6869} 6870 6871static inline struct file *io_file_get(struct io_ring_ctx *ctx, 6872 struct io_kiocb *req, int fd, bool fixed) 6873{ 6874 if (fixed) 6875 return io_file_get_fixed(ctx, req, fd); 6876 else 6877 return io_file_get_normal(ctx, req, fd); 6878} 6879 6880static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) 6881{ 6882 struct io_kiocb *prev = req->timeout.prev; 6883 int ret; 6884 6885 if (prev) { 6886 ret = io_try_cancel_userdata(req, prev->user_data); 6887 io_req_complete_post(req, ret ?: -ETIME, 0); 6888 io_put_req(prev); 6889 } else { 6890 io_req_complete_post(req, -ETIME, 0); 6891 } 6892} 6893 6894static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) 6895{ 6896 struct io_timeout_data *data = container_of(timer, 6897 struct io_timeout_data, timer); 6898 struct io_kiocb *prev, *req = data->req; 6899 struct io_ring_ctx *ctx = req->ctx; 6900 unsigned long flags; 6901 6902 spin_lock_irqsave(&ctx->timeout_lock, flags); 6903 prev = req->timeout.head; 6904 req->timeout.head = NULL; 6905 6906 /* 6907 * We don't expect the list to be empty, that will only happen if we 6908 * race with the completion of the linked work. 6909 */ 6910 if (prev) { 6911 io_remove_next_linked(prev); 6912 if (!req_ref_inc_not_zero(prev)) 6913 prev = NULL; 6914 } 6915 list_del(&req->timeout.list); 6916 req->timeout.prev = prev; 6917 spin_unlock_irqrestore(&ctx->timeout_lock, flags); 6918 6919 req->io_task_work.func = io_req_task_link_timeout; 6920 io_req_task_work_add(req); 6921 return HRTIMER_NORESTART; 6922} 6923 6924static void io_queue_linked_timeout(struct io_kiocb *req) 6925{ 6926 struct io_ring_ctx *ctx = req->ctx; 6927 6928 spin_lock_irq(&ctx->timeout_lock); 6929 /* 6930 * If the back reference is NULL, then our linked request finished 6931 * before we got a chance to setup the timer 6932 */ 6933 if (req->timeout.head) { 6934 struct io_timeout_data *data = req->async_data; 6935 6936 data->timer.function = io_link_timeout_fn; 6937 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), 6938 data->mode); 6939 list_add_tail(&req->timeout.list, &ctx->ltimeout_list); 6940 } 6941 spin_unlock_irq(&ctx->timeout_lock); 6942 /* drop submission reference */ 6943 io_put_req(req); 6944} 6945 6946static void io_queue_sqe_arm_apoll(struct io_kiocb *req) 6947 __must_hold(&req->ctx->uring_lock) 6948{ 6949 struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); 6950 6951 switch (io_arm_poll_handler(req)) { 6952 case IO_APOLL_READY: 6953 io_req_task_queue(req); 6954 break; 6955 case IO_APOLL_ABORTED: 6956 /* 6957 * Queued up for async execution, worker will release 6958 * submit reference when the iocb is actually submitted. 6959 */ 6960 io_queue_async_work(req, NULL); 6961 break; 6962 } 6963 6964 if (linked_timeout) 6965 io_queue_linked_timeout(linked_timeout); 6966} 6967 6968static inline void __io_queue_sqe(struct io_kiocb *req) 6969 __must_hold(&req->ctx->uring_lock) 6970{ 6971 struct io_kiocb *linked_timeout; 6972 int ret; 6973 6974 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); 6975 6976 if (req->flags & REQ_F_COMPLETE_INLINE) { 6977 io_req_add_compl_list(req); 6978 return; 6979 } 6980 /* 6981 * We async punt it if the file wasn't marked NOWAIT, or if the file 6982 * doesn't support non-blocking read/write attempts 6983 */ 6984 if (likely(!ret)) { 6985 linked_timeout = io_prep_linked_timeout(req); 6986 if (linked_timeout) 6987 io_queue_linked_timeout(linked_timeout); 6988 } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { 6989 io_queue_sqe_arm_apoll(req); 6990 } else { 6991 io_req_complete_failed(req, ret); 6992 } 6993} 6994 6995static void io_queue_sqe_fallback(struct io_kiocb *req) 6996 __must_hold(&req->ctx->uring_lock) 6997{ 6998 if (req->flags & REQ_F_FAIL) { 6999 io_req_complete_fail_submit(req); 7000 } else if (unlikely(req->ctx->drain_active)) { 7001 io_drain_req(req); 7002 } else { 7003 int ret = io_req_prep_async(req); 7004 7005 if (unlikely(ret)) 7006 io_req_complete_failed(req, ret); 7007 else 7008 io_queue_async_work(req, NULL); 7009 } 7010} 7011 7012static inline void io_queue_sqe(struct io_kiocb *req) 7013 __must_hold(&req->ctx->uring_lock) 7014{ 7015 if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) 7016 __io_queue_sqe(req); 7017 else 7018 io_queue_sqe_fallback(req); 7019} 7020 7021/* 7022 * Check SQE restrictions (opcode and flags). 7023 * 7024 * Returns 'true' if SQE is allowed, 'false' otherwise. 7025 */ 7026static inline bool io_check_restriction(struct io_ring_ctx *ctx, 7027 struct io_kiocb *req, 7028 unsigned int sqe_flags) 7029{ 7030 if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) 7031 return false; 7032 7033 if ((sqe_flags & ctx->restrictions.sqe_flags_required) != 7034 ctx->restrictions.sqe_flags_required) 7035 return false; 7036 7037 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | 7038 ctx->restrictions.sqe_flags_required)) 7039 return false; 7040 7041 return true; 7042} 7043 7044static void io_init_req_drain(struct io_kiocb *req) 7045{ 7046 struct io_ring_ctx *ctx = req->ctx; 7047 struct io_kiocb *head = ctx->submit_state.link.head; 7048 7049 ctx->drain_active = true; 7050 if (head) { 7051 /* 7052 * If we need to drain a request in the middle of a link, drain 7053 * the head request and the next request/link after the current 7054 * link. Considering sequential execution of links, 7055 * IOSQE_IO_DRAIN will be maintained for every request of our 7056 * link. 7057 */ 7058 head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; 7059 ctx->drain_next = true; 7060 } 7061} 7062 7063static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, 7064 const struct io_uring_sqe *sqe) 7065 __must_hold(&ctx->uring_lock) 7066{ 7067 unsigned int sqe_flags; 7068 int personality; 7069 u8 opcode; 7070 7071 /* req is partially pre-initialised, see io_preinit_req() */ 7072 req->opcode = opcode = READ_ONCE(sqe->opcode); 7073 /* same numerical values with corresponding REQ_F_*, safe to copy */ 7074 req->flags = sqe_flags = READ_ONCE(sqe->flags); 7075 req->user_data = READ_ONCE(sqe->user_data); 7076 req->file = NULL; 7077 req->fixed_rsrc_refs = NULL; 7078 req->task = current; 7079 7080 if (unlikely(opcode >= IORING_OP_LAST)) { 7081 req->opcode = 0; 7082 return -EINVAL; 7083 } 7084 if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { 7085 /* enforce forwards compatibility on users */ 7086 if (sqe_flags & ~SQE_VALID_FLAGS) 7087 return -EINVAL; 7088 if ((sqe_flags & IOSQE_BUFFER_SELECT) && 7089 !io_op_defs[opcode].buffer_select) 7090 return -EOPNOTSUPP; 7091 if (sqe_flags & IOSQE_IO_DRAIN) 7092 io_init_req_drain(req); 7093 } 7094 if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { 7095 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) 7096 return -EACCES; 7097 /* knock it to the slow queue path, will be drained there */ 7098 if (ctx->drain_active) 7099 req->flags |= REQ_F_FORCE_ASYNC; 7100 /* if there is no link, we're at "next" request and need to drain */ 7101 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { 7102 ctx->drain_next = false; 7103 ctx->drain_active = true; 7104 req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; 7105 } 7106 } 7107 7108 if (io_op_defs[opcode].needs_file) { 7109 struct io_submit_state *state = &ctx->submit_state; 7110 7111 /* 7112 * Plug now if we have more than 2 IO left after this, and the 7113 * target is potentially a read/write to block based storage. 7114 */ 7115 if (state->need_plug && io_op_defs[opcode].plug) { 7116 state->plug_started = true; 7117 state->need_plug = false; 7118 blk_start_plug_nr_ios(&state->plug, state->submit_nr); 7119 } 7120 7121 req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), 7122 (sqe_flags & IOSQE_FIXED_FILE)); 7123 if (unlikely(!req->file)) 7124 return -EBADF; 7125 } 7126 7127 personality = READ_ONCE(sqe->personality); 7128 if (personality) { 7129 int ret; 7130 7131 req->creds = xa_load(&ctx->personalities, personality); 7132 if (!req->creds) 7133 return -EINVAL; 7134 get_cred(req->creds); 7135 ret = security_uring_override_creds(req->creds); 7136 if (ret) { 7137 put_cred(req->creds); 7138 return ret; 7139 } 7140 req->flags |= REQ_F_CREDS; 7141 } 7142 7143 return io_req_prep(req, sqe); 7144} 7145 7146static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 7147 const struct io_uring_sqe *sqe) 7148 __must_hold(&ctx->uring_lock) 7149{ 7150 struct io_submit_link *link = &ctx->submit_state.link; 7151 int ret; 7152 7153 ret = io_init_req(ctx, req, sqe); 7154 if (unlikely(ret)) { 7155 trace_io_uring_req_failed(sqe, ret); 7156 7157 /* fail even hard links since we don't submit */ 7158 if (link->head) { 7159 /* 7160 * we can judge a link req is failed or cancelled by if 7161 * REQ_F_FAIL is set, but the head is an exception since 7162 * it may be set REQ_F_FAIL because of other req's failure 7163 * so let's leverage req->result to distinguish if a head 7164 * is set REQ_F_FAIL because of its failure or other req's 7165 * failure so that we can set the correct ret code for it. 7166 * init result here to avoid affecting the normal path. 7167 */ 7168 if (!(link->head->flags & REQ_F_FAIL)) 7169 req_fail_link_node(link->head, -ECANCELED); 7170 } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { 7171 /* 7172 * the current req is a normal req, we should return 7173 * error and thus break the submittion loop. 7174 */ 7175 io_req_complete_failed(req, ret); 7176 return ret; 7177 } 7178 req_fail_link_node(req, ret); 7179 } 7180 7181 /* don't need @sqe from now on */ 7182 trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, 7183 req->flags, true, 7184 ctx->flags & IORING_SETUP_SQPOLL); 7185 7186 /* 7187 * If we already have a head request, queue this one for async 7188 * submittal once the head completes. If we don't have a head but 7189 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be 7190 * submitted sync once the chain is complete. If none of those 7191 * conditions are true (normal request), then just queue it. 7192 */ 7193 if (link->head) { 7194 struct io_kiocb *head = link->head; 7195 7196 if (!(req->flags & REQ_F_FAIL)) { 7197 ret = io_req_prep_async(req); 7198 if (unlikely(ret)) { 7199 req_fail_link_node(req, ret); 7200 if (!(head->flags & REQ_F_FAIL)) 7201 req_fail_link_node(head, -ECANCELED); 7202 } 7203 } 7204 trace_io_uring_link(ctx, req, head); 7205 link->last->link = req; 7206 link->last = req; 7207 7208 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) 7209 return 0; 7210 /* last request of a link, enqueue the link */ 7211 link->head = NULL; 7212 req = head; 7213 } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 7214 link->head = req; 7215 link->last = req; 7216 return 0; 7217 } 7218 7219 io_queue_sqe(req); 7220 return 0; 7221} 7222 7223/* 7224 * Batched submission is done, ensure local IO is flushed out. 7225 */ 7226static void io_submit_state_end(struct io_ring_ctx *ctx) 7227{ 7228 struct io_submit_state *state = &ctx->submit_state; 7229 7230 if (state->link.head) 7231 io_queue_sqe(state->link.head); 7232 /* flush only after queuing links as they can generate completions */ 7233 io_submit_flush_completions(ctx); 7234 if (state->plug_started) 7235 blk_finish_plug(&state->plug); 7236} 7237 7238/* 7239 * Start submission side cache. 7240 */ 7241static void io_submit_state_start(struct io_submit_state *state, 7242 unsigned int max_ios) 7243{ 7244 state->plug_started = false; 7245 state->need_plug = max_ios > 2; 7246 state->submit_nr = max_ios; 7247 /* set only head, no need to init link_last in advance */ 7248 state->link.head = NULL; 7249} 7250 7251static void io_commit_sqring(struct io_ring_ctx *ctx) 7252{ 7253 struct io_rings *rings = ctx->rings; 7254 7255 /* 7256 * Ensure any loads from the SQEs are done at this point, 7257 * since once we write the new head, the application could 7258 * write new data to them. 7259 */ 7260 smp_store_release(&rings->sq.head, ctx->cached_sq_head); 7261} 7262 7263/* 7264 * Fetch an sqe, if one is available. Note this returns a pointer to memory 7265 * that is mapped by userspace. This means that care needs to be taken to 7266 * ensure that reads are stable, as we cannot rely on userspace always 7267 * being a good citizen. If members of the sqe are validated and then later 7268 * used, it's important that those reads are done through READ_ONCE() to 7269 * prevent a re-load down the line. 7270 */ 7271static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) 7272{ 7273 unsigned head, mask = ctx->sq_entries - 1; 7274 unsigned sq_idx = ctx->cached_sq_head++ & mask; 7275 7276 /* 7277 * The cached sq head (or cq tail) serves two purposes: 7278 * 7279 * 1) allows us to batch the cost of updating the user visible 7280 * head updates. 7281 * 2) allows the kernel side to track the head on its own, even 7282 * though the application is the one updating it. 7283 */ 7284 head = READ_ONCE(ctx->sq_array[sq_idx]); 7285 if (likely(head < ctx->sq_entries)) 7286 return &ctx->sq_sqes[head]; 7287 7288 /* drop invalid entries */ 7289 ctx->cq_extra--; 7290 WRITE_ONCE(ctx->rings->sq_dropped, 7291 READ_ONCE(ctx->rings->sq_dropped) + 1); 7292 return NULL; 7293} 7294 7295static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) 7296 __must_hold(&ctx->uring_lock) 7297{ 7298 unsigned int entries = io_sqring_entries(ctx); 7299 int submitted = 0; 7300 7301 if (unlikely(!entries)) 7302 return 0; 7303 /* make sure SQ entry isn't read before tail */ 7304 nr = min3(nr, ctx->sq_entries, entries); 7305 io_get_task_refs(nr); 7306 7307 io_submit_state_start(&ctx->submit_state, nr); 7308 do { 7309 const struct io_uring_sqe *sqe; 7310 struct io_kiocb *req; 7311 7312 if (unlikely(!io_alloc_req_refill(ctx))) { 7313 if (!submitted) 7314 submitted = -EAGAIN; 7315 break; 7316 } 7317 req = io_alloc_req(ctx); 7318 sqe = io_get_sqe(ctx); 7319 if (unlikely(!sqe)) { 7320 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 7321 break; 7322 } 7323 /* will complete beyond this point, count as submitted */ 7324 submitted++; 7325 if (io_submit_sqe(ctx, req, sqe)) 7326 break; 7327 } while (submitted < nr); 7328 7329 if (unlikely(submitted != nr)) { 7330 int ref_used = (submitted == -EAGAIN) ? 0 : submitted; 7331 int unused = nr - ref_used; 7332 7333 current->io_uring->cached_refs += unused; 7334 } 7335 7336 io_submit_state_end(ctx); 7337 /* Commit SQ ring head once we've consumed and submitted all SQEs */ 7338 io_commit_sqring(ctx); 7339 7340 return submitted; 7341} 7342 7343static inline bool io_sqd_events_pending(struct io_sq_data *sqd) 7344{ 7345 return READ_ONCE(sqd->state); 7346} 7347 7348static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) 7349{ 7350 /* Tell userspace we may need a wakeup call */ 7351 spin_lock(&ctx->completion_lock); 7352 WRITE_ONCE(ctx->rings->sq_flags, 7353 ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); 7354 spin_unlock(&ctx->completion_lock); 7355} 7356 7357static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) 7358{ 7359 spin_lock(&ctx->completion_lock); 7360 WRITE_ONCE(ctx->rings->sq_flags, 7361 ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); 7362 spin_unlock(&ctx->completion_lock); 7363} 7364 7365static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) 7366{ 7367 unsigned int to_submit; 7368 int ret = 0; 7369 7370 to_submit = io_sqring_entries(ctx); 7371 /* if we're handling multiple rings, cap submit size for fairness */ 7372 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) 7373 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; 7374 7375 if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { 7376 const struct cred *creds = NULL; 7377 7378 if (ctx->sq_creds != current_cred()) 7379 creds = override_creds(ctx->sq_creds); 7380 7381 mutex_lock(&ctx->uring_lock); 7382 if (!wq_list_empty(&ctx->iopoll_list)) 7383 io_do_iopoll(ctx, true); 7384 7385 /* 7386 * Don't submit if refs are dying, good for io_uring_register(), 7387 * but also it is relied upon by io_ring_exit_work() 7388 */ 7389 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && 7390 !(ctx->flags & IORING_SETUP_R_DISABLED)) 7391 ret = io_submit_sqes(ctx, to_submit); 7392 mutex_unlock(&ctx->uring_lock); 7393 7394 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) 7395 wake_up(&ctx->sqo_sq_wait); 7396 if (creds) 7397 revert_creds(creds); 7398 } 7399 7400 return ret; 7401} 7402 7403static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) 7404{ 7405 struct io_ring_ctx *ctx; 7406 unsigned sq_thread_idle = 0; 7407 7408 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7409 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); 7410 sqd->sq_thread_idle = sq_thread_idle; 7411} 7412 7413static bool io_sqd_handle_event(struct io_sq_data *sqd) 7414{ 7415 bool did_sig = false; 7416 struct ksignal ksig; 7417 7418 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || 7419 signal_pending(current)) { 7420 mutex_unlock(&sqd->lock); 7421 if (signal_pending(current)) 7422 did_sig = get_signal(&ksig); 7423 cond_resched(); 7424 mutex_lock(&sqd->lock); 7425 } 7426 return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7427} 7428 7429static int io_sq_thread(void *data) 7430{ 7431 struct io_sq_data *sqd = data; 7432 struct io_ring_ctx *ctx; 7433 unsigned long timeout = 0; 7434 char buf[TASK_COMM_LEN]; 7435 DEFINE_WAIT(wait); 7436 7437 snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); 7438 set_task_comm(current, buf); 7439 7440 if (sqd->sq_cpu != -1) 7441 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); 7442 else 7443 set_cpus_allowed_ptr(current, cpu_online_mask); 7444 current->flags |= PF_NO_SETAFFINITY; 7445 7446 audit_alloc_kernel(current); 7447 7448 mutex_lock(&sqd->lock); 7449 while (1) { 7450 bool cap_entries, sqt_spin = false; 7451 7452 if (io_sqd_events_pending(sqd) || signal_pending(current)) { 7453 if (io_sqd_handle_event(sqd)) 7454 break; 7455 timeout = jiffies + sqd->sq_thread_idle; 7456 } 7457 7458 cap_entries = !list_is_singular(&sqd->ctx_list); 7459 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 7460 int ret = __io_sq_thread(ctx, cap_entries); 7461 7462 if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) 7463 sqt_spin = true; 7464 } 7465 if (io_run_task_work()) 7466 sqt_spin = true; 7467 7468 if (sqt_spin || !time_after(jiffies, timeout)) { 7469 cond_resched(); 7470 if (sqt_spin) 7471 timeout = jiffies + sqd->sq_thread_idle; 7472 continue; 7473 } 7474 7475 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); 7476 if (!io_sqd_events_pending(sqd) && !current->task_works) { 7477 bool needs_sched = true; 7478 7479 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 7480 io_ring_set_wakeup_flag(ctx); 7481 7482 if ((ctx->flags & IORING_SETUP_IOPOLL) && 7483 !wq_list_empty(&ctx->iopoll_list)) { 7484 needs_sched = false; 7485 break; 7486 } 7487 if (io_sqring_entries(ctx)) { 7488 needs_sched = false; 7489 break; 7490 } 7491 } 7492 7493 if (needs_sched) { 7494 mutex_unlock(&sqd->lock); 7495 schedule(); 7496 mutex_lock(&sqd->lock); 7497 } 7498 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7499 io_ring_clear_wakeup_flag(ctx); 7500 } 7501 7502 finish_wait(&sqd->wait, &wait); 7503 timeout = jiffies + sqd->sq_thread_idle; 7504 } 7505 7506 io_uring_cancel_generic(true, sqd); 7507 sqd->thread = NULL; 7508 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7509 io_ring_set_wakeup_flag(ctx); 7510 io_run_task_work(); 7511 mutex_unlock(&sqd->lock); 7512 7513 audit_free(current); 7514 7515 complete(&sqd->exited); 7516 do_exit(0); 7517} 7518 7519struct io_wait_queue { 7520 struct wait_queue_entry wq; 7521 struct io_ring_ctx *ctx; 7522 unsigned cq_tail; 7523 unsigned nr_timeouts; 7524}; 7525 7526static inline bool io_should_wake(struct io_wait_queue *iowq) 7527{ 7528 struct io_ring_ctx *ctx = iowq->ctx; 7529 int dist = ctx->cached_cq_tail - (int) iowq->cq_tail; 7530 7531 /* 7532 * Wake up if we have enough events, or if a timeout occurred since we 7533 * started waiting. For timeouts, we always want to return to userspace, 7534 * regardless of event count. 7535 */ 7536 return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; 7537} 7538 7539static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, 7540 int wake_flags, void *key) 7541{ 7542 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, 7543 wq); 7544 7545 /* 7546 * Cannot safely flush overflowed CQEs from here, ensure we wake up 7547 * the task, and the next invocation will do it. 7548 */ 7549 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) 7550 return autoremove_wake_function(curr, mode, wake_flags, key); 7551 return -1; 7552} 7553 7554static int io_run_task_work_sig(void) 7555{ 7556 if (io_run_task_work()) 7557 return 1; 7558 if (!signal_pending(current)) 7559 return 0; 7560 if (test_thread_flag(TIF_NOTIFY_SIGNAL)) 7561 return -ERESTARTSYS; 7562 return -EINTR; 7563} 7564 7565/* when returns >0, the caller should retry */ 7566static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 7567 struct io_wait_queue *iowq, 7568 signed long *timeout) 7569{ 7570 int ret; 7571 7572 /* make sure we run task_work before checking for signals */ 7573 ret = io_run_task_work_sig(); 7574 if (ret || io_should_wake(iowq)) 7575 return ret; 7576 /* let the caller flush overflows, retry */ 7577 if (test_bit(0, &ctx->check_cq_overflow)) 7578 return 1; 7579 7580 *timeout = schedule_timeout(*timeout); 7581 return !*timeout ? -ETIME : 1; 7582} 7583 7584/* 7585 * Wait until events become available, if we don't already have some. The 7586 * application must reap them itself, as they reside on the shared cq ring. 7587 */ 7588static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, 7589 const sigset_t __user *sig, size_t sigsz, 7590 struct __kernel_timespec __user *uts) 7591{ 7592 struct io_wait_queue iowq; 7593 struct io_rings *rings = ctx->rings; 7594 signed long timeout = MAX_SCHEDULE_TIMEOUT; 7595 int ret; 7596 7597 do { 7598 io_cqring_overflow_flush(ctx); 7599 if (io_cqring_events(ctx) >= min_events) 7600 return 0; 7601 if (!io_run_task_work()) 7602 break; 7603 } while (1); 7604 7605 if (uts) { 7606 struct timespec64 ts; 7607 7608 if (get_timespec64(&ts, uts)) 7609 return -EFAULT; 7610 timeout = timespec64_to_jiffies(&ts); 7611 } 7612 7613 if (sig) { 7614#ifdef CONFIG_COMPAT 7615 if (in_compat_syscall()) 7616 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, 7617 sigsz); 7618 else 7619#endif 7620 ret = set_user_sigmask(sig, sigsz); 7621 7622 if (ret) 7623 return ret; 7624 } 7625 7626 init_waitqueue_func_entry(&iowq.wq, io_wake_function); 7627 iowq.wq.private = current; 7628 INIT_LIST_HEAD(&iowq.wq.entry); 7629 iowq.ctx = ctx; 7630 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 7631 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; 7632 7633 trace_io_uring_cqring_wait(ctx, min_events); 7634 do { 7635 /* if we can't even flush overflow, don't wait for more */ 7636 if (!io_cqring_overflow_flush(ctx)) { 7637 ret = -EBUSY; 7638 break; 7639 } 7640 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, 7641 TASK_INTERRUPTIBLE); 7642 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); 7643 finish_wait(&ctx->cq_wait, &iowq.wq); 7644 cond_resched(); 7645 } while (ret > 0); 7646 7647 restore_saved_sigmask_unless(ret == -EINTR); 7648 7649 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; 7650} 7651 7652static void io_free_page_table(void **table, size_t size) 7653{ 7654 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); 7655 7656 for (i = 0; i < nr_tables; i++) 7657 kfree(table[i]); 7658 kfree(table); 7659} 7660 7661static __cold void **io_alloc_page_table(size_t size) 7662{ 7663 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); 7664 size_t init_size = size; 7665 void **table; 7666 7667 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); 7668 if (!table) 7669 return NULL; 7670 7671 for (i = 0; i < nr_tables; i++) { 7672 unsigned int this_size = min_t(size_t, size, PAGE_SIZE); 7673 7674 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); 7675 if (!table[i]) { 7676 io_free_page_table(table, init_size); 7677 return NULL; 7678 } 7679 size -= this_size; 7680 } 7681 return table; 7682} 7683 7684static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) 7685{ 7686 percpu_ref_exit(&ref_node->refs); 7687 kfree(ref_node); 7688} 7689 7690static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) 7691{ 7692 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); 7693 struct io_ring_ctx *ctx = node->rsrc_data->ctx; 7694 unsigned long flags; 7695 bool first_add = false; 7696 7697 spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); 7698 node->done = true; 7699 7700 while (!list_empty(&ctx->rsrc_ref_list)) { 7701 node = list_first_entry(&ctx->rsrc_ref_list, 7702 struct io_rsrc_node, node); 7703 /* recycle ref nodes in order */ 7704 if (!node->done) 7705 break; 7706 list_del(&node->node); 7707 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); 7708 } 7709 spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); 7710 7711 if (first_add) 7712 mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); 7713} 7714 7715static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) 7716{ 7717 struct io_rsrc_node *ref_node; 7718 7719 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); 7720 if (!ref_node) 7721 return NULL; 7722 7723 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, 7724 0, GFP_KERNEL)) { 7725 kfree(ref_node); 7726 return NULL; 7727 } 7728 INIT_LIST_HEAD(&ref_node->node); 7729 INIT_LIST_HEAD(&ref_node->rsrc_list); 7730 ref_node->done = false; 7731 return ref_node; 7732} 7733 7734static void io_rsrc_node_switch(struct io_ring_ctx *ctx, 7735 struct io_rsrc_data *data_to_kill) 7736 __must_hold(&ctx->uring_lock) 7737{ 7738 WARN_ON_ONCE(!ctx->rsrc_backup_node); 7739 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); 7740 7741 io_rsrc_refs_drop(ctx); 7742 7743 if (data_to_kill) { 7744 struct io_rsrc_node *rsrc_node = ctx->rsrc_node; 7745 7746 rsrc_node->rsrc_data = data_to_kill; 7747 spin_lock_irq(&ctx->rsrc_ref_lock); 7748 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); 7749 spin_unlock_irq(&ctx->rsrc_ref_lock); 7750 7751 atomic_inc(&data_to_kill->refs); 7752 percpu_ref_kill(&rsrc_node->refs); 7753 ctx->rsrc_node = NULL; 7754 } 7755 7756 if (!ctx->rsrc_node) { 7757 ctx->rsrc_node = ctx->rsrc_backup_node; 7758 ctx->rsrc_backup_node = NULL; 7759 } 7760} 7761 7762static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) 7763{ 7764 if (ctx->rsrc_backup_node) 7765 return 0; 7766 ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); 7767 return ctx->rsrc_backup_node ? 0 : -ENOMEM; 7768} 7769 7770static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data, 7771 struct io_ring_ctx *ctx) 7772{ 7773 int ret; 7774 7775 /* As we may drop ->uring_lock, other task may have started quiesce */ 7776 if (data->quiesce) 7777 return -ENXIO; 7778 7779 data->quiesce = true; 7780 do { 7781 ret = io_rsrc_node_switch_start(ctx); 7782 if (ret) 7783 break; 7784 io_rsrc_node_switch(ctx, data); 7785 7786 /* kill initial ref, already quiesced if zero */ 7787 if (atomic_dec_and_test(&data->refs)) 7788 break; 7789 mutex_unlock(&ctx->uring_lock); 7790 flush_delayed_work(&ctx->rsrc_put_work); 7791 ret = wait_for_completion_interruptible(&data->done); 7792 if (!ret) { 7793 mutex_lock(&ctx->uring_lock); 7794 break; 7795 } 7796 7797 atomic_inc(&data->refs); 7798 /* wait for all works potentially completing data->done */ 7799 flush_delayed_work(&ctx->rsrc_put_work); 7800 reinit_completion(&data->done); 7801 7802 ret = io_run_task_work_sig(); 7803 mutex_lock(&ctx->uring_lock); 7804 } while (ret >= 0); 7805 data->quiesce = false; 7806 7807 return ret; 7808} 7809 7810static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) 7811{ 7812 unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; 7813 unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; 7814 7815 return &data->tags[table_idx][off]; 7816} 7817 7818static void io_rsrc_data_free(struct io_rsrc_data *data) 7819{ 7820 size_t size = data->nr * sizeof(data->tags[0][0]); 7821 7822 if (data->tags) 7823 io_free_page_table((void **)data->tags, size); 7824 kfree(data); 7825} 7826 7827static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, 7828 u64 __user *utags, unsigned nr, 7829 struct io_rsrc_data **pdata) 7830{ 7831 struct io_rsrc_data *data; 7832 int ret = -ENOMEM; 7833 unsigned i; 7834 7835 data = kzalloc(sizeof(*data), GFP_KERNEL); 7836 if (!data) 7837 return -ENOMEM; 7838 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); 7839 if (!data->tags) { 7840 kfree(data); 7841 return -ENOMEM; 7842 } 7843 7844 data->nr = nr; 7845 data->ctx = ctx; 7846 data->do_put = do_put; 7847 if (utags) { 7848 ret = -EFAULT; 7849 for (i = 0; i < nr; i++) { 7850 u64 *tag_slot = io_get_tag_slot(data, i); 7851 7852 if (copy_from_user(tag_slot, &utags[i], 7853 sizeof(*tag_slot))) 7854 goto fail; 7855 } 7856 } 7857 7858 atomic_set(&data->refs, 1); 7859 init_completion(&data->done); 7860 *pdata = data; 7861 return 0; 7862fail: 7863 io_rsrc_data_free(data); 7864 return ret; 7865} 7866 7867static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) 7868{ 7869 table->files = kvcalloc(nr_files, sizeof(table->files[0]), 7870 GFP_KERNEL_ACCOUNT); 7871 return !!table->files; 7872} 7873 7874static void io_free_file_tables(struct io_file_table *table) 7875{ 7876 kvfree(table->files); 7877 table->files = NULL; 7878} 7879 7880static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) 7881{ 7882#if defined(CONFIG_UNIX) 7883 if (ctx->ring_sock) { 7884 struct sock *sock = ctx->ring_sock->sk; 7885 struct sk_buff *skb; 7886 7887 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) 7888 kfree_skb(skb); 7889 } 7890#else 7891 int i; 7892 7893 for (i = 0; i < ctx->nr_user_files; i++) { 7894 struct file *file; 7895 7896 file = io_file_from_index(ctx, i); 7897 if (file) 7898 fput(file); 7899 } 7900#endif 7901 io_free_file_tables(&ctx->file_table); 7902 io_rsrc_data_free(ctx->file_data); 7903 ctx->file_data = NULL; 7904 ctx->nr_user_files = 0; 7905} 7906 7907static int io_sqe_files_unregister(struct io_ring_ctx *ctx) 7908{ 7909 int ret; 7910 7911 if (!ctx->file_data) 7912 return -ENXIO; 7913 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); 7914 if (!ret) 7915 __io_sqe_files_unregister(ctx); 7916 return ret; 7917} 7918 7919static void io_sq_thread_unpark(struct io_sq_data *sqd) 7920 __releases(&sqd->lock) 7921{ 7922 WARN_ON_ONCE(sqd->thread == current); 7923 7924 /* 7925 * Do the dance but not conditional clear_bit() because it'd race with 7926 * other threads incrementing park_pending and setting the bit. 7927 */ 7928 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7929 if (atomic_dec_return(&sqd->park_pending)) 7930 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7931 mutex_unlock(&sqd->lock); 7932} 7933 7934static void io_sq_thread_park(struct io_sq_data *sqd) 7935 __acquires(&sqd->lock) 7936{ 7937 WARN_ON_ONCE(sqd->thread == current); 7938 7939 atomic_inc(&sqd->park_pending); 7940 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7941 mutex_lock(&sqd->lock); 7942 if (sqd->thread) 7943 wake_up_process(sqd->thread); 7944} 7945 7946static void io_sq_thread_stop(struct io_sq_data *sqd) 7947{ 7948 WARN_ON_ONCE(sqd->thread == current); 7949 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); 7950 7951 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7952 mutex_lock(&sqd->lock); 7953 if (sqd->thread) 7954 wake_up_process(sqd->thread); 7955 mutex_unlock(&sqd->lock); 7956 wait_for_completion(&sqd->exited); 7957} 7958 7959static void io_put_sq_data(struct io_sq_data *sqd) 7960{ 7961 if (refcount_dec_and_test(&sqd->refs)) { 7962 WARN_ON_ONCE(atomic_read(&sqd->park_pending)); 7963 7964 io_sq_thread_stop(sqd); 7965 kfree(sqd); 7966 } 7967} 7968 7969static void io_sq_thread_finish(struct io_ring_ctx *ctx) 7970{ 7971 struct io_sq_data *sqd = ctx->sq_data; 7972 7973 if (sqd) { 7974 io_sq_thread_park(sqd); 7975 list_del_init(&ctx->sqd_list); 7976 io_sqd_update_thread_idle(sqd); 7977 io_sq_thread_unpark(sqd); 7978 7979 io_put_sq_data(sqd); 7980 ctx->sq_data = NULL; 7981 } 7982} 7983 7984static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) 7985{ 7986 struct io_ring_ctx *ctx_attach; 7987 struct io_sq_data *sqd; 7988 struct fd f; 7989 7990 f = fdget(p->wq_fd); 7991 if (!f.file) 7992 return ERR_PTR(-ENXIO); 7993 if (f.file->f_op != &io_uring_fops) { 7994 fdput(f); 7995 return ERR_PTR(-EINVAL); 7996 } 7997 7998 ctx_attach = f.file->private_data; 7999 sqd = ctx_attach->sq_data; 8000 if (!sqd) { 8001 fdput(f); 8002 return ERR_PTR(-EINVAL); 8003 } 8004 if (sqd->task_tgid != current->tgid) { 8005 fdput(f); 8006 return ERR_PTR(-EPERM); 8007 } 8008 8009 refcount_inc(&sqd->refs); 8010 fdput(f); 8011 return sqd; 8012} 8013 8014static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, 8015 bool *attached) 8016{ 8017 struct io_sq_data *sqd; 8018 8019 *attached = false; 8020 if (p->flags & IORING_SETUP_ATTACH_WQ) { 8021 sqd = io_attach_sq_data(p); 8022 if (!IS_ERR(sqd)) { 8023 *attached = true; 8024 return sqd; 8025 } 8026 /* fall through for EPERM case, setup new sqd/task */ 8027 if (PTR_ERR(sqd) != -EPERM) 8028 return sqd; 8029 } 8030 8031 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); 8032 if (!sqd) 8033 return ERR_PTR(-ENOMEM); 8034 8035 atomic_set(&sqd->park_pending, 0); 8036 refcount_set(&sqd->refs, 1); 8037 INIT_LIST_HEAD(&sqd->ctx_list); 8038 mutex_init(&sqd->lock); 8039 init_waitqueue_head(&sqd->wait); 8040 init_completion(&sqd->exited); 8041 return sqd; 8042} 8043 8044#if defined(CONFIG_UNIX) 8045/* 8046 * Ensure the UNIX gc is aware of our file set, so we are certain that 8047 * the io_uring can be safely unregistered on process exit, even if we have 8048 * loops in the file referencing. 8049 */ 8050static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) 8051{ 8052 struct sock *sk = ctx->ring_sock->sk; 8053 struct scm_fp_list *fpl; 8054 struct sk_buff *skb; 8055 int i, nr_files; 8056 8057 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); 8058 if (!fpl) 8059 return -ENOMEM; 8060 8061 skb = alloc_skb(0, GFP_KERNEL); 8062 if (!skb) { 8063 kfree(fpl); 8064 return -ENOMEM; 8065 } 8066 8067 skb->sk = sk; 8068 8069 nr_files = 0; 8070 fpl->user = get_uid(current_user()); 8071 for (i = 0; i < nr; i++) { 8072 struct file *file = io_file_from_index(ctx, i + offset); 8073 8074 if (!file) 8075 continue; 8076 fpl->fp[nr_files] = get_file(file); 8077 unix_inflight(fpl->user, fpl->fp[nr_files]); 8078 nr_files++; 8079 } 8080 8081 if (nr_files) { 8082 fpl->max = SCM_MAX_FD; 8083 fpl->count = nr_files; 8084 UNIXCB(skb).fp = fpl; 8085 skb->destructor = unix_destruct_scm; 8086 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 8087 skb_queue_head(&sk->sk_receive_queue, skb); 8088 8089 for (i = 0; i < nr_files; i++) 8090 fput(fpl->fp[i]); 8091 } else { 8092 kfree_skb(skb); 8093 kfree(fpl); 8094 } 8095 8096 return 0; 8097} 8098 8099/* 8100 * If UNIX sockets are enabled, fd passing can cause a reference cycle which 8101 * causes regular reference counting to break down. We rely on the UNIX 8102 * garbage collection to take care of this problem for us. 8103 */ 8104static int io_sqe_files_scm(struct io_ring_ctx *ctx) 8105{ 8106 unsigned left, total; 8107 int ret = 0; 8108 8109 total = 0; 8110 left = ctx->nr_user_files; 8111 while (left) { 8112 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); 8113 8114 ret = __io_sqe_files_scm(ctx, this_files, total); 8115 if (ret) 8116 break; 8117 left -= this_files; 8118 total += this_files; 8119 } 8120 8121 if (!ret) 8122 return 0; 8123 8124 while (total < ctx->nr_user_files) { 8125 struct file *file = io_file_from_index(ctx, total); 8126 8127 if (file) 8128 fput(file); 8129 total++; 8130 } 8131 8132 return ret; 8133} 8134#else 8135static int io_sqe_files_scm(struct io_ring_ctx *ctx) 8136{ 8137 return 0; 8138} 8139#endif 8140 8141static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 8142{ 8143 struct file *file = prsrc->file; 8144#if defined(CONFIG_UNIX) 8145 struct sock *sock = ctx->ring_sock->sk; 8146 struct sk_buff_head list, *head = &sock->sk_receive_queue; 8147 struct sk_buff *skb; 8148 int i; 8149 8150 __skb_queue_head_init(&list); 8151 8152 /* 8153 * Find the skb that holds this file in its SCM_RIGHTS. When found, 8154 * remove this entry and rearrange the file array. 8155 */ 8156 skb = skb_dequeue(head); 8157 while (skb) { 8158 struct scm_fp_list *fp; 8159 8160 fp = UNIXCB(skb).fp; 8161 for (i = 0; i < fp->count; i++) { 8162 int left; 8163 8164 if (fp->fp[i] != file) 8165 continue; 8166 8167 unix_notinflight(fp->user, fp->fp[i]); 8168 left = fp->count - 1 - i; 8169 if (left) { 8170 memmove(&fp->fp[i], &fp->fp[i + 1], 8171 left * sizeof(struct file *)); 8172 } 8173 fp->count--; 8174 if (!fp->count) { 8175 kfree_skb(skb); 8176 skb = NULL; 8177 } else { 8178 __skb_queue_tail(&list, skb); 8179 } 8180 fput(file); 8181 file = NULL; 8182 break; 8183 } 8184 8185 if (!file) 8186 break; 8187 8188 __skb_queue_tail(&list, skb); 8189 8190 skb = skb_dequeue(head); 8191 } 8192 8193 if (skb_peek(&list)) { 8194 spin_lock_irq(&head->lock); 8195 while ((skb = __skb_dequeue(&list)) != NULL) 8196 __skb_queue_tail(head, skb); 8197 spin_unlock_irq(&head->lock); 8198 } 8199#else 8200 fput(file); 8201#endif 8202} 8203 8204static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) 8205{ 8206 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; 8207 struct io_ring_ctx *ctx = rsrc_data->ctx; 8208 struct io_rsrc_put *prsrc, *tmp; 8209 8210 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { 8211 list_del(&prsrc->list); 8212 8213 if (prsrc->tag) { 8214 bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; 8215 8216 io_ring_submit_lock(ctx, lock_ring); 8217 spin_lock(&ctx->completion_lock); 8218 io_cqring_fill_event(ctx, prsrc->tag, 0, 0); 8219 ctx->cq_extra++; 8220 io_commit_cqring(ctx); 8221 spin_unlock(&ctx->completion_lock); 8222 io_cqring_ev_posted(ctx); 8223 io_ring_submit_unlock(ctx, lock_ring); 8224 } 8225 8226 rsrc_data->do_put(ctx, prsrc); 8227 kfree(prsrc); 8228 } 8229 8230 io_rsrc_node_destroy(ref_node); 8231 if (atomic_dec_and_test(&rsrc_data->refs)) 8232 complete(&rsrc_data->done); 8233} 8234 8235static void io_rsrc_put_work(struct work_struct *work) 8236{ 8237 struct io_ring_ctx *ctx; 8238 struct llist_node *node; 8239 8240 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); 8241 node = llist_del_all(&ctx->rsrc_put_llist); 8242 8243 while (node) { 8244 struct io_rsrc_node *ref_node; 8245 struct llist_node *next = node->next; 8246 8247 ref_node = llist_entry(node, struct io_rsrc_node, llist); 8248 __io_rsrc_put_work(ref_node); 8249 node = next; 8250 } 8251} 8252 8253static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, 8254 unsigned nr_args, u64 __user *tags) 8255{ 8256 __s32 __user *fds = (__s32 __user *) arg; 8257 struct file *file; 8258 int fd, ret; 8259 unsigned i; 8260 8261 if (ctx->file_data) 8262 return -EBUSY; 8263 if (!nr_args) 8264 return -EINVAL; 8265 if (nr_args > IORING_MAX_FIXED_FILES) 8266 return -EMFILE; 8267 if (nr_args > rlimit(RLIMIT_NOFILE)) 8268 return -EMFILE; 8269 ret = io_rsrc_node_switch_start(ctx); 8270 if (ret) 8271 return ret; 8272 ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, 8273 &ctx->file_data); 8274 if (ret) 8275 return ret; 8276 8277 ret = -ENOMEM; 8278 if (!io_alloc_file_tables(&ctx->file_table, nr_args)) 8279 goto out_free; 8280 8281 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { 8282 if (copy_from_user(&fd, &fds[i], sizeof(fd))) { 8283 ret = -EFAULT; 8284 goto out_fput; 8285 } 8286 /* allow sparse sets */ 8287 if (fd == -1) { 8288 ret = -EINVAL; 8289 if (unlikely(*io_get_tag_slot(ctx->file_data, i))) 8290 goto out_fput; 8291 continue; 8292 } 8293 8294 file = fget(fd); 8295 ret = -EBADF; 8296 if (unlikely(!file)) 8297 goto out_fput; 8298 8299 /* 8300 * Don't allow io_uring instances to be registered. If UNIX 8301 * isn't enabled, then this causes a reference cycle and this 8302 * instance can never get freed. If UNIX is enabled we'll 8303 * handle it just fine, but there's still no point in allowing 8304 * a ring fd as it doesn't support regular read/write anyway. 8305 */ 8306 if (file->f_op == &io_uring_fops) { 8307 fput(file); 8308 goto out_fput; 8309 } 8310 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); 8311 } 8312 8313 ret = io_sqe_files_scm(ctx); 8314 if (ret) { 8315 __io_sqe_files_unregister(ctx); 8316 return ret; 8317 } 8318 8319 io_rsrc_node_switch(ctx, NULL); 8320 return ret; 8321out_fput: 8322 for (i = 0; i < ctx->nr_user_files; i++) { 8323 file = io_file_from_index(ctx, i); 8324 if (file) 8325 fput(file); 8326 } 8327 io_free_file_tables(&ctx->file_table); 8328 ctx->nr_user_files = 0; 8329out_free: 8330 io_rsrc_data_free(ctx->file_data); 8331 ctx->file_data = NULL; 8332 return ret; 8333} 8334 8335static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, 8336 int index) 8337{ 8338#if defined(CONFIG_UNIX) 8339 struct sock *sock = ctx->ring_sock->sk; 8340 struct sk_buff_head *head = &sock->sk_receive_queue; 8341 struct sk_buff *skb; 8342 8343 /* 8344 * See if we can merge this file into an existing skb SCM_RIGHTS 8345 * file set. If there's no room, fall back to allocating a new skb 8346 * and filling it in. 8347 */ 8348 spin_lock_irq(&head->lock); 8349 skb = skb_peek(head); 8350 if (skb) { 8351 struct scm_fp_list *fpl = UNIXCB(skb).fp; 8352 8353 if (fpl->count < SCM_MAX_FD) { 8354 __skb_unlink(skb, head); 8355 spin_unlock_irq(&head->lock); 8356 fpl->fp[fpl->count] = get_file(file); 8357 unix_inflight(fpl->user, fpl->fp[fpl->count]); 8358 fpl->count++; 8359 spin_lock_irq(&head->lock); 8360 __skb_queue_head(head, skb); 8361 } else { 8362 skb = NULL; 8363 } 8364 } 8365 spin_unlock_irq(&head->lock); 8366 8367 if (skb) { 8368 fput(file); 8369 return 0; 8370 } 8371 8372 return __io_sqe_files_scm(ctx, 1, index); 8373#else 8374 return 0; 8375#endif 8376} 8377 8378static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, 8379 struct io_rsrc_node *node, void *rsrc) 8380{ 8381 struct io_rsrc_put *prsrc; 8382 8383 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); 8384 if (!prsrc) 8385 return -ENOMEM; 8386 8387 prsrc->tag = *io_get_tag_slot(data, idx); 8388 prsrc->rsrc = rsrc; 8389 list_add(&prsrc->list, &node->rsrc_list); 8390 return 0; 8391} 8392 8393static int io_install_fixed_file(struct io_kiocb *req, struct file *file, 8394 unsigned int issue_flags, u32 slot_index) 8395{ 8396 struct io_ring_ctx *ctx = req->ctx; 8397 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 8398 bool needs_switch = false; 8399 struct io_fixed_file *file_slot; 8400 int ret = -EBADF; 8401 8402 io_ring_submit_lock(ctx, needs_lock); 8403 if (file->f_op == &io_uring_fops) 8404 goto err; 8405 ret = -ENXIO; 8406 if (!ctx->file_data) 8407 goto err; 8408 ret = -EINVAL; 8409 if (slot_index >= ctx->nr_user_files) 8410 goto err; 8411 8412 slot_index = array_index_nospec(slot_index, ctx->nr_user_files); 8413 file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); 8414 8415 if (file_slot->file_ptr) { 8416 struct file *old_file; 8417 8418 ret = io_rsrc_node_switch_start(ctx); 8419 if (ret) 8420 goto err; 8421 8422 old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); 8423 ret = io_queue_rsrc_removal(ctx->file_data, slot_index, 8424 ctx->rsrc_node, old_file); 8425 if (ret) 8426 goto err; 8427 file_slot->file_ptr = 0; 8428 needs_switch = true; 8429 } 8430 8431 *io_get_tag_slot(ctx->file_data, slot_index) = 0; 8432 io_fixed_file_set(file_slot, file); 8433 ret = io_sqe_file_register(ctx, file, slot_index); 8434 if (ret) { 8435 file_slot->file_ptr = 0; 8436 goto err; 8437 } 8438 8439 ret = 0; 8440err: 8441 if (needs_switch) 8442 io_rsrc_node_switch(ctx, ctx->file_data); 8443 io_ring_submit_unlock(ctx, needs_lock); 8444 if (ret) 8445 fput(file); 8446 return ret; 8447} 8448 8449static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) 8450{ 8451 unsigned int offset = req->close.file_slot - 1; 8452 struct io_ring_ctx *ctx = req->ctx; 8453 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 8454 struct io_fixed_file *file_slot; 8455 struct file *file; 8456 int ret, i; 8457 8458 io_ring_submit_lock(ctx, needs_lock); 8459 ret = -ENXIO; 8460 if (unlikely(!ctx->file_data)) 8461 goto out; 8462 ret = -EINVAL; 8463 if (offset >= ctx->nr_user_files) 8464 goto out; 8465 ret = io_rsrc_node_switch_start(ctx); 8466 if (ret) 8467 goto out; 8468 8469 i = array_index_nospec(offset, ctx->nr_user_files); 8470 file_slot = io_fixed_file_slot(&ctx->file_table, i); 8471 ret = -EBADF; 8472 if (!file_slot->file_ptr) 8473 goto out; 8474 8475 file = (struct file *)(file_slot->file_ptr & FFS_MASK); 8476 ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); 8477 if (ret) 8478 goto out; 8479 8480 file_slot->file_ptr = 0; 8481 io_rsrc_node_switch(ctx, ctx->file_data); 8482 ret = 0; 8483out: 8484 io_ring_submit_unlock(ctx, needs_lock); 8485 return ret; 8486} 8487 8488static int __io_sqe_files_update(struct io_ring_ctx *ctx, 8489 struct io_uring_rsrc_update2 *up, 8490 unsigned nr_args) 8491{ 8492 u64 __user *tags = u64_to_user_ptr(up->tags); 8493 __s32 __user *fds = u64_to_user_ptr(up->data); 8494 struct io_rsrc_data *data = ctx->file_data; 8495 struct io_fixed_file *file_slot; 8496 struct file *file; 8497 int fd, i, err = 0; 8498 unsigned int done; 8499 bool needs_switch = false; 8500 8501 if (!ctx->file_data) 8502 return -ENXIO; 8503 if (up->offset + nr_args > ctx->nr_user_files) 8504 return -EINVAL; 8505 8506 for (done = 0; done < nr_args; done++) { 8507 u64 tag = 0; 8508 8509 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || 8510 copy_from_user(&fd, &fds[done], sizeof(fd))) { 8511 err = -EFAULT; 8512 break; 8513 } 8514 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { 8515 err = -EINVAL; 8516 break; 8517 } 8518 if (fd == IORING_REGISTER_FILES_SKIP) 8519 continue; 8520 8521 i = array_index_nospec(up->offset + done, ctx->nr_user_files); 8522 file_slot = io_fixed_file_slot(&ctx->file_table, i); 8523 8524 if (file_slot->file_ptr) { 8525 file = (struct file *)(file_slot->file_ptr & FFS_MASK); 8526 err = io_queue_rsrc_removal(data, up->offset + done, 8527 ctx->rsrc_node, file); 8528 if (err) 8529 break; 8530 file_slot->file_ptr = 0; 8531 needs_switch = true; 8532 } 8533 if (fd != -1) { 8534 file = fget(fd); 8535 if (!file) { 8536 err = -EBADF; 8537 break; 8538 } 8539 /* 8540 * Don't allow io_uring instances to be registered. If 8541 * UNIX isn't enabled, then this causes a reference 8542 * cycle and this instance can never get freed. If UNIX 8543 * is enabled we'll handle it just fine, but there's 8544 * still no point in allowing a ring fd as it doesn't 8545 * support regular read/write anyway. 8546 */ 8547 if (file->f_op == &io_uring_fops) { 8548 fput(file); 8549 err = -EBADF; 8550 break; 8551 } 8552 *io_get_tag_slot(data, up->offset + done) = tag; 8553 io_fixed_file_set(file_slot, file); 8554 err = io_sqe_file_register(ctx, file, i); 8555 if (err) { 8556 file_slot->file_ptr = 0; 8557 fput(file); 8558 break; 8559 } 8560 } 8561 } 8562 8563 if (needs_switch) 8564 io_rsrc_node_switch(ctx, data); 8565 return done ? done : err; 8566} 8567 8568static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, 8569 struct task_struct *task) 8570{ 8571 struct io_wq_hash *hash; 8572 struct io_wq_data data; 8573 unsigned int concurrency; 8574 8575 mutex_lock(&ctx->uring_lock); 8576 hash = ctx->hash_map; 8577 if (!hash) { 8578 hash = kzalloc(sizeof(*hash), GFP_KERNEL); 8579 if (!hash) { 8580 mutex_unlock(&ctx->uring_lock); 8581 return ERR_PTR(-ENOMEM); 8582 } 8583 refcount_set(&hash->refs, 1); 8584 init_waitqueue_head(&hash->wait); 8585 ctx->hash_map = hash; 8586 } 8587 mutex_unlock(&ctx->uring_lock); 8588 8589 data.hash = hash; 8590 data.task = task; 8591 data.free_work = io_wq_free_work; 8592 data.do_work = io_wq_submit_work; 8593 8594 /* Do QD, or 4 * CPUS, whatever is smallest */ 8595 concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); 8596 8597 return io_wq_create(concurrency, &data); 8598} 8599 8600static __cold int io_uring_alloc_task_context(struct task_struct *task, 8601 struct io_ring_ctx *ctx) 8602{ 8603 struct io_uring_task *tctx; 8604 int ret; 8605 8606 tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); 8607 if (unlikely(!tctx)) 8608 return -ENOMEM; 8609 8610 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); 8611 if (unlikely(ret)) { 8612 kfree(tctx); 8613 return ret; 8614 } 8615 8616 tctx->io_wq = io_init_wq_offload(ctx, task); 8617 if (IS_ERR(tctx->io_wq)) { 8618 ret = PTR_ERR(tctx->io_wq); 8619 percpu_counter_destroy(&tctx->inflight); 8620 kfree(tctx); 8621 return ret; 8622 } 8623 8624 xa_init(&tctx->xa); 8625 init_waitqueue_head(&tctx->wait); 8626 atomic_set(&tctx->in_idle, 0); 8627 atomic_set(&tctx->inflight_tracked, 0); 8628 task->io_uring = tctx; 8629 spin_lock_init(&tctx->task_lock); 8630 INIT_WQ_LIST(&tctx->task_list); 8631 init_task_work(&tctx->task_work, tctx_task_work); 8632 return 0; 8633} 8634 8635void __io_uring_free(struct task_struct *tsk) 8636{ 8637 struct io_uring_task *tctx = tsk->io_uring; 8638 8639 WARN_ON_ONCE(!xa_empty(&tctx->xa)); 8640 WARN_ON_ONCE(tctx->io_wq); 8641 WARN_ON_ONCE(tctx->cached_refs); 8642 8643 percpu_counter_destroy(&tctx->inflight); 8644 kfree(tctx); 8645 tsk->io_uring = NULL; 8646} 8647 8648static __cold int io_sq_offload_create(struct io_ring_ctx *ctx, 8649 struct io_uring_params *p) 8650{ 8651 int ret; 8652 8653 /* Retain compatibility with failing for an invalid attach attempt */ 8654 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == 8655 IORING_SETUP_ATTACH_WQ) { 8656 struct fd f; 8657 8658 f = fdget(p->wq_fd); 8659 if (!f.file) 8660 return -ENXIO; 8661 if (f.file->f_op != &io_uring_fops) { 8662 fdput(f); 8663 return -EINVAL; 8664 } 8665 fdput(f); 8666 } 8667 if (ctx->flags & IORING_SETUP_SQPOLL) { 8668 struct task_struct *tsk; 8669 struct io_sq_data *sqd; 8670 bool attached; 8671 8672 ret = security_uring_sqpoll(); 8673 if (ret) 8674 return ret; 8675 8676 sqd = io_get_sq_data(p, &attached); 8677 if (IS_ERR(sqd)) { 8678 ret = PTR_ERR(sqd); 8679 goto err; 8680 } 8681 8682 ctx->sq_creds = get_current_cred(); 8683 ctx->sq_data = sqd; 8684 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); 8685 if (!ctx->sq_thread_idle) 8686 ctx->sq_thread_idle = HZ; 8687 8688 io_sq_thread_park(sqd); 8689 list_add(&ctx->sqd_list, &sqd->ctx_list); 8690 io_sqd_update_thread_idle(sqd); 8691 /* don't attach to a dying SQPOLL thread, would be racy */ 8692 ret = (attached && !sqd->thread) ? -ENXIO : 0; 8693 io_sq_thread_unpark(sqd); 8694 8695 if (ret < 0) 8696 goto err; 8697 if (attached) 8698 return 0; 8699 8700 if (p->flags & IORING_SETUP_SQ_AFF) { 8701 int cpu = p->sq_thread_cpu; 8702 8703 ret = -EINVAL; 8704 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) 8705 goto err_sqpoll; 8706 sqd->sq_cpu = cpu; 8707 } else { 8708 sqd->sq_cpu = -1; 8709 } 8710 8711 sqd->task_pid = current->pid; 8712 sqd->task_tgid = current->tgid; 8713 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); 8714 if (IS_ERR(tsk)) { 8715 ret = PTR_ERR(tsk); 8716 goto err_sqpoll; 8717 } 8718 8719 sqd->thread = tsk; 8720 ret = io_uring_alloc_task_context(tsk, ctx); 8721 wake_up_new_task(tsk); 8722 if (ret) 8723 goto err; 8724 } else if (p->flags & IORING_SETUP_SQ_AFF) { 8725 /* Can't have SQ_AFF without SQPOLL */ 8726 ret = -EINVAL; 8727 goto err; 8728 } 8729 8730 return 0; 8731err_sqpoll: 8732 complete(&ctx->sq_data->exited); 8733err: 8734 io_sq_thread_finish(ctx); 8735 return ret; 8736} 8737 8738static inline void __io_unaccount_mem(struct user_struct *user, 8739 unsigned long nr_pages) 8740{ 8741 atomic_long_sub(nr_pages, &user->locked_vm); 8742} 8743 8744static inline int __io_account_mem(struct user_struct *user, 8745 unsigned long nr_pages) 8746{ 8747 unsigned long page_limit, cur_pages, new_pages; 8748 8749 /* Don't allow more pages than we can safely lock */ 8750 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 8751 8752 do { 8753 cur_pages = atomic_long_read(&user->locked_vm); 8754 new_pages = cur_pages + nr_pages; 8755 if (new_pages > page_limit) 8756 return -ENOMEM; 8757 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, 8758 new_pages) != cur_pages); 8759 8760 return 0; 8761} 8762 8763static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 8764{ 8765 if (ctx->user) 8766 __io_unaccount_mem(ctx->user, nr_pages); 8767 8768 if (ctx->mm_account) 8769 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); 8770} 8771 8772static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 8773{ 8774 int ret; 8775 8776 if (ctx->user) { 8777 ret = __io_account_mem(ctx->user, nr_pages); 8778 if (ret) 8779 return ret; 8780 } 8781 8782 if (ctx->mm_account) 8783 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); 8784 8785 return 0; 8786} 8787 8788static void io_mem_free(void *ptr) 8789{ 8790 struct page *page; 8791 8792 if (!ptr) 8793 return; 8794 8795 page = virt_to_head_page(ptr); 8796 if (put_page_testzero(page)) 8797 free_compound_page(page); 8798} 8799 8800static void *io_mem_alloc(size_t size) 8801{ 8802 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | 8803 __GFP_NORETRY | __GFP_ACCOUNT; 8804 8805 return (void *) __get_free_pages(gfp_flags, get_order(size)); 8806} 8807 8808static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, 8809 size_t *sq_offset) 8810{ 8811 struct io_rings *rings; 8812 size_t off, sq_array_size; 8813 8814 off = struct_size(rings, cqes, cq_entries); 8815 if (off == SIZE_MAX) 8816 return SIZE_MAX; 8817 8818#ifdef CONFIG_SMP 8819 off = ALIGN(off, SMP_CACHE_BYTES); 8820 if (off == 0) 8821 return SIZE_MAX; 8822#endif 8823 8824 if (sq_offset) 8825 *sq_offset = off; 8826 8827 sq_array_size = array_size(sizeof(u32), sq_entries); 8828 if (sq_array_size == SIZE_MAX) 8829 return SIZE_MAX; 8830 8831 if (check_add_overflow(off, sq_array_size, &off)) 8832 return SIZE_MAX; 8833 8834 return off; 8835} 8836 8837static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) 8838{ 8839 struct io_mapped_ubuf *imu = *slot; 8840 unsigned int i; 8841 8842 if (imu != ctx->dummy_ubuf) { 8843 for (i = 0; i < imu->nr_bvecs; i++) 8844 unpin_user_page(imu->bvec[i].bv_page); 8845 if (imu->acct_pages) 8846 io_unaccount_mem(ctx, imu->acct_pages); 8847 kvfree(imu); 8848 } 8849 *slot = NULL; 8850} 8851 8852static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 8853{ 8854 io_buffer_unmap(ctx, &prsrc->buf); 8855 prsrc->buf = NULL; 8856} 8857 8858static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 8859{ 8860 unsigned int i; 8861 8862 for (i = 0; i < ctx->nr_user_bufs; i++) 8863 io_buffer_unmap(ctx, &ctx->user_bufs[i]); 8864 kfree(ctx->user_bufs); 8865 io_rsrc_data_free(ctx->buf_data); 8866 ctx->user_bufs = NULL; 8867 ctx->buf_data = NULL; 8868 ctx->nr_user_bufs = 0; 8869} 8870 8871static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 8872{ 8873 int ret; 8874 8875 if (!ctx->buf_data) 8876 return -ENXIO; 8877 8878 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); 8879 if (!ret) 8880 __io_sqe_buffers_unregister(ctx); 8881 return ret; 8882} 8883 8884static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, 8885 void __user *arg, unsigned index) 8886{ 8887 struct iovec __user *src; 8888 8889#ifdef CONFIG_COMPAT 8890 if (ctx->compat) { 8891 struct compat_iovec __user *ciovs; 8892 struct compat_iovec ciov; 8893 8894 ciovs = (struct compat_iovec __user *) arg; 8895 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) 8896 return -EFAULT; 8897 8898 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); 8899 dst->iov_len = ciov.iov_len; 8900 return 0; 8901 } 8902#endif 8903 src = (struct iovec __user *) arg; 8904 if (copy_from_user(dst, &src[index], sizeof(*dst))) 8905 return -EFAULT; 8906 return 0; 8907} 8908 8909/* 8910 * Not super efficient, but this is just a registration time. And we do cache 8911 * the last compound head, so generally we'll only do a full search if we don't 8912 * match that one. 8913 * 8914 * We check if the given compound head page has already been accounted, to 8915 * avoid double accounting it. This allows us to account the full size of the 8916 * page, not just the constituent pages of a huge page. 8917 */ 8918static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, 8919 int nr_pages, struct page *hpage) 8920{ 8921 int i, j; 8922 8923 /* check current page array */ 8924 for (i = 0; i < nr_pages; i++) { 8925 if (!PageCompound(pages[i])) 8926 continue; 8927 if (compound_head(pages[i]) == hpage) 8928 return true; 8929 } 8930 8931 /* check previously registered pages */ 8932 for (i = 0; i < ctx->nr_user_bufs; i++) { 8933 struct io_mapped_ubuf *imu = ctx->user_bufs[i]; 8934 8935 for (j = 0; j < imu->nr_bvecs; j++) { 8936 if (!PageCompound(imu->bvec[j].bv_page)) 8937 continue; 8938 if (compound_head(imu->bvec[j].bv_page) == hpage) 8939 return true; 8940 } 8941 } 8942 8943 return false; 8944} 8945 8946static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, 8947 int nr_pages, struct io_mapped_ubuf *imu, 8948 struct page **last_hpage) 8949{ 8950 int i, ret; 8951 8952 imu->acct_pages = 0; 8953 for (i = 0; i < nr_pages; i++) { 8954 if (!PageCompound(pages[i])) { 8955 imu->acct_pages++; 8956 } else { 8957 struct page *hpage; 8958 8959 hpage = compound_head(pages[i]); 8960 if (hpage == *last_hpage) 8961 continue; 8962 *last_hpage = hpage; 8963 if (headpage_already_acct(ctx, pages, i, hpage)) 8964 continue; 8965 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; 8966 } 8967 } 8968 8969 if (!imu->acct_pages) 8970 return 0; 8971 8972 ret = io_account_mem(ctx, imu->acct_pages); 8973 if (ret) 8974 imu->acct_pages = 0; 8975 return ret; 8976} 8977 8978static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, 8979 struct io_mapped_ubuf **pimu, 8980 struct page **last_hpage) 8981{ 8982 struct io_mapped_ubuf *imu = NULL; 8983 struct vm_area_struct **vmas = NULL; 8984 struct page **pages = NULL; 8985 unsigned long off, start, end, ubuf; 8986 size_t size; 8987 int ret, pret, nr_pages, i; 8988 8989 if (!iov->iov_base) { 8990 *pimu = ctx->dummy_ubuf; 8991 return 0; 8992 } 8993 8994 ubuf = (unsigned long) iov->iov_base; 8995 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; 8996 start = ubuf >> PAGE_SHIFT; 8997 nr_pages = end - start; 8998 8999 *pimu = NULL; 9000 ret = -ENOMEM; 9001 9002 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); 9003 if (!pages) 9004 goto done; 9005 9006 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), 9007 GFP_KERNEL); 9008 if (!vmas) 9009 goto done; 9010 9011 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); 9012 if (!imu) 9013 goto done; 9014 9015 ret = 0; 9016 mmap_read_lock(current->mm); 9017 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, 9018 pages, vmas); 9019 if (pret == nr_pages) { 9020 /* don't support file backed memory */ 9021 for (i = 0; i < nr_pages; i++) { 9022 struct vm_area_struct *vma = vmas[i]; 9023 9024 if (vma_is_shmem(vma)) 9025 continue; 9026 if (vma->vm_file && 9027 !is_file_hugepages(vma->vm_file)) { 9028 ret = -EOPNOTSUPP; 9029 break; 9030 } 9031 } 9032 } else { 9033 ret = pret < 0 ? pret : -EFAULT; 9034 } 9035 mmap_read_unlock(current->mm); 9036 if (ret) { 9037 /* 9038 * if we did partial map, or found file backed vmas, 9039 * release any pages we did get 9040 */ 9041 if (pret > 0) 9042 unpin_user_pages(pages, pret); 9043 goto done; 9044 } 9045 9046 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); 9047 if (ret) { 9048 unpin_user_pages(pages, pret); 9049 goto done; 9050 } 9051 9052 off = ubuf & ~PAGE_MASK; 9053 size = iov->iov_len; 9054 for (i = 0; i < nr_pages; i++) { 9055 size_t vec_len; 9056 9057 vec_len = min_t(size_t, size, PAGE_SIZE - off); 9058 imu->bvec[i].bv_page = pages[i]; 9059 imu->bvec[i].bv_len = vec_len; 9060 imu->bvec[i].bv_offset = off; 9061 off = 0; 9062 size -= vec_len; 9063 } 9064 /* store original address for later verification */ 9065 imu->ubuf = ubuf; 9066 imu->ubuf_end = ubuf + iov->iov_len; 9067 imu->nr_bvecs = nr_pages; 9068 *pimu = imu; 9069 ret = 0; 9070done: 9071 if (ret) 9072 kvfree(imu); 9073 kvfree(pages); 9074 kvfree(vmas); 9075 return ret; 9076} 9077 9078static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) 9079{ 9080 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); 9081 return ctx->user_bufs ? 0 : -ENOMEM; 9082} 9083 9084static int io_buffer_validate(struct iovec *iov) 9085{ 9086 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); 9087 9088 /* 9089 * Don't impose further limits on the size and buffer 9090 * constraints here, we'll -EINVAL later when IO is 9091 * submitted if they are wrong. 9092 */ 9093 if (!iov->iov_base) 9094 return iov->iov_len ? -EFAULT : 0; 9095 if (!iov->iov_len) 9096 return -EFAULT; 9097 9098 /* arbitrary limit, but we need something */ 9099 if (iov->iov_len > SZ_1G) 9100 return -EFAULT; 9101 9102 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) 9103 return -EOVERFLOW; 9104 9105 return 0; 9106} 9107 9108static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, 9109 unsigned int nr_args, u64 __user *tags) 9110{ 9111 struct page *last_hpage = NULL; 9112 struct io_rsrc_data *data; 9113 int i, ret; 9114 struct iovec iov; 9115 9116 if (ctx->user_bufs) 9117 return -EBUSY; 9118 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) 9119 return -EINVAL; 9120 ret = io_rsrc_node_switch_start(ctx); 9121 if (ret) 9122 return ret; 9123 ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); 9124 if (ret) 9125 return ret; 9126 ret = io_buffers_map_alloc(ctx, nr_args); 9127 if (ret) { 9128 io_rsrc_data_free(data); 9129 return ret; 9130 } 9131 9132 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { 9133 ret = io_copy_iov(ctx, &iov, arg, i); 9134 if (ret) 9135 break; 9136 ret = io_buffer_validate(&iov); 9137 if (ret) 9138 break; 9139 if (!iov.iov_base && *io_get_tag_slot(data, i)) { 9140 ret = -EINVAL; 9141 break; 9142 } 9143 9144 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], 9145 &last_hpage); 9146 if (ret) 9147 break; 9148 } 9149 9150 WARN_ON_ONCE(ctx->buf_data); 9151 9152 ctx->buf_data = data; 9153 if (ret) 9154 __io_sqe_buffers_unregister(ctx); 9155 else 9156 io_rsrc_node_switch(ctx, NULL); 9157 return ret; 9158} 9159 9160static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, 9161 struct io_uring_rsrc_update2 *up, 9162 unsigned int nr_args) 9163{ 9164 u64 __user *tags = u64_to_user_ptr(up->tags); 9165 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); 9166 struct page *last_hpage = NULL; 9167 bool needs_switch = false; 9168 __u32 done; 9169 int i, err; 9170 9171 if (!ctx->buf_data) 9172 return -ENXIO; 9173 if (up->offset + nr_args > ctx->nr_user_bufs) 9174 return -EINVAL; 9175 9176 for (done = 0; done < nr_args; done++) { 9177 struct io_mapped_ubuf *imu; 9178 int offset = up->offset + done; 9179 u64 tag = 0; 9180 9181 err = io_copy_iov(ctx, &iov, iovs, done); 9182 if (err) 9183 break; 9184 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { 9185 err = -EFAULT; 9186 break; 9187 } 9188 err = io_buffer_validate(&iov); 9189 if (err) 9190 break; 9191 if (!iov.iov_base && tag) { 9192 err = -EINVAL; 9193 break; 9194 } 9195 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); 9196 if (err) 9197 break; 9198 9199 i = array_index_nospec(offset, ctx->nr_user_bufs); 9200 if (ctx->user_bufs[i] != ctx->dummy_ubuf) { 9201 err = io_queue_rsrc_removal(ctx->buf_data, offset, 9202 ctx->rsrc_node, ctx->user_bufs[i]); 9203 if (unlikely(err)) { 9204 io_buffer_unmap(ctx, &imu); 9205 break; 9206 } 9207 ctx->user_bufs[i] = NULL; 9208 needs_switch = true; 9209 } 9210 9211 ctx->user_bufs[i] = imu; 9212 *io_get_tag_slot(ctx->buf_data, offset) = tag; 9213 } 9214 9215 if (needs_switch) 9216 io_rsrc_node_switch(ctx, ctx->buf_data); 9217 return done ? done : err; 9218} 9219 9220static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) 9221{ 9222 __s32 __user *fds = arg; 9223 int fd; 9224 9225 if (ctx->cq_ev_fd) 9226 return -EBUSY; 9227 9228 if (copy_from_user(&fd, fds, sizeof(*fds))) 9229 return -EFAULT; 9230 9231 ctx->cq_ev_fd = eventfd_ctx_fdget(fd); 9232 if (IS_ERR(ctx->cq_ev_fd)) { 9233 int ret = PTR_ERR(ctx->cq_ev_fd); 9234 9235 ctx->cq_ev_fd = NULL; 9236 return ret; 9237 } 9238 9239 return 0; 9240} 9241 9242static int io_eventfd_unregister(struct io_ring_ctx *ctx) 9243{ 9244 if (ctx->cq_ev_fd) { 9245 eventfd_ctx_put(ctx->cq_ev_fd); 9246 ctx->cq_ev_fd = NULL; 9247 return 0; 9248 } 9249 9250 return -ENXIO; 9251} 9252 9253static void io_destroy_buffers(struct io_ring_ctx *ctx) 9254{ 9255 struct io_buffer *buf; 9256 unsigned long index; 9257 9258 xa_for_each(&ctx->io_buffers, index, buf) { 9259 __io_remove_buffers(ctx, buf, index, -1U); 9260 cond_resched(); 9261 } 9262} 9263 9264static void io_req_caches_free(struct io_ring_ctx *ctx) 9265{ 9266 struct io_submit_state *state = &ctx->submit_state; 9267 int nr = 0; 9268 9269 mutex_lock(&ctx->uring_lock); 9270 io_flush_cached_locked_reqs(ctx, state); 9271 9272 while (state->free_list.next) { 9273 struct io_wq_work_node *node; 9274 struct io_kiocb *req; 9275 9276 node = wq_stack_extract(&state->free_list); 9277 req = container_of(node, struct io_kiocb, comp_list); 9278 kmem_cache_free(req_cachep, req); 9279 nr++; 9280 } 9281 if (nr) 9282 percpu_ref_put_many(&ctx->refs, nr); 9283 mutex_unlock(&ctx->uring_lock); 9284} 9285 9286static void io_wait_rsrc_data(struct io_rsrc_data *data) 9287{ 9288 if (data && !atomic_dec_and_test(&data->refs)) 9289 wait_for_completion(&data->done); 9290} 9291 9292static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) 9293{ 9294 io_sq_thread_finish(ctx); 9295 9296 if (ctx->mm_account) { 9297 mmdrop(ctx->mm_account); 9298 ctx->mm_account = NULL; 9299 } 9300 9301 io_rsrc_refs_drop(ctx); 9302 /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ 9303 io_wait_rsrc_data(ctx->buf_data); 9304 io_wait_rsrc_data(ctx->file_data); 9305 9306 mutex_lock(&ctx->uring_lock); 9307 if (ctx->buf_data) 9308 __io_sqe_buffers_unregister(ctx); 9309 if (ctx->file_data) 9310 __io_sqe_files_unregister(ctx); 9311 if (ctx->rings) 9312 __io_cqring_overflow_flush(ctx, true); 9313 mutex_unlock(&ctx->uring_lock); 9314 io_eventfd_unregister(ctx); 9315 io_destroy_buffers(ctx); 9316 if (ctx->sq_creds) 9317 put_cred(ctx->sq_creds); 9318 9319 /* there are no registered resources left, nobody uses it */ 9320 if (ctx->rsrc_node) 9321 io_rsrc_node_destroy(ctx->rsrc_node); 9322 if (ctx->rsrc_backup_node) 9323 io_rsrc_node_destroy(ctx->rsrc_backup_node); 9324 flush_delayed_work(&ctx->rsrc_put_work); 9325 flush_delayed_work(&ctx->fallback_work); 9326 9327 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); 9328 WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); 9329 9330#if defined(CONFIG_UNIX) 9331 if (ctx->ring_sock) { 9332 ctx->ring_sock->file = NULL; /* so that iput() is called */ 9333 sock_release(ctx->ring_sock); 9334 } 9335#endif 9336 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); 9337 9338 io_mem_free(ctx->rings); 9339 io_mem_free(ctx->sq_sqes); 9340 9341 percpu_ref_exit(&ctx->refs); 9342 free_uid(ctx->user); 9343 io_req_caches_free(ctx); 9344 if (ctx->hash_map) 9345 io_wq_put_hash(ctx->hash_map); 9346 kfree(ctx->cancel_hash); 9347 kfree(ctx->dummy_ubuf); 9348 kfree(ctx); 9349} 9350 9351static __poll_t io_uring_poll(struct file *file, poll_table *wait) 9352{ 9353 struct io_ring_ctx *ctx = file->private_data; 9354 __poll_t mask = 0; 9355 9356 poll_wait(file, &ctx->cq_wait, wait); 9357 /* 9358 * synchronizes with barrier from wq_has_sleeper call in 9359 * io_commit_cqring 9360 */ 9361 smp_rmb(); 9362 if (!io_sqring_full(ctx)) 9363 mask |= EPOLLOUT | EPOLLWRNORM; 9364 9365 /* 9366 * Don't flush cqring overflow list here, just do a simple check. 9367 * Otherwise there could possible be ABBA deadlock: 9368 * CPU0 CPU1 9369 * ---- ---- 9370 * lock(&ctx->uring_lock); 9371 * lock(&ep->mtx); 9372 * lock(&ctx->uring_lock); 9373 * lock(&ep->mtx); 9374 * 9375 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this 9376 * pushs them to do the flush. 9377 */ 9378 if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) 9379 mask |= EPOLLIN | EPOLLRDNORM; 9380 9381 return mask; 9382} 9383 9384static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) 9385{ 9386 const struct cred *creds; 9387 9388 creds = xa_erase(&ctx->personalities, id); 9389 if (creds) { 9390 put_cred(creds); 9391 return 0; 9392 } 9393 9394 return -EINVAL; 9395} 9396 9397struct io_tctx_exit { 9398 struct callback_head task_work; 9399 struct completion completion; 9400 struct io_ring_ctx *ctx; 9401}; 9402 9403static __cold void io_tctx_exit_cb(struct callback_head *cb) 9404{ 9405 struct io_uring_task *tctx = current->io_uring; 9406 struct io_tctx_exit *work; 9407 9408 work = container_of(cb, struct io_tctx_exit, task_work); 9409 /* 9410 * When @in_idle, we're in cancellation and it's racy to remove the 9411 * node. It'll be removed by the end of cancellation, just ignore it. 9412 */ 9413 if (!atomic_read(&tctx->in_idle)) 9414 io_uring_del_tctx_node((unsigned long)work->ctx); 9415 complete(&work->completion); 9416} 9417 9418static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) 9419{ 9420 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 9421 9422 return req->ctx == data; 9423} 9424 9425static __cold void io_ring_exit_work(struct work_struct *work) 9426{ 9427 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); 9428 unsigned long timeout = jiffies + HZ * 60 * 5; 9429 unsigned long interval = HZ / 20; 9430 struct io_tctx_exit exit; 9431 struct io_tctx_node *node; 9432 int ret; 9433 9434 /* 9435 * If we're doing polled IO and end up having requests being 9436 * submitted async (out-of-line), then completions can come in while 9437 * we're waiting for refs to drop. We need to reap these manually, 9438 * as nobody else will be looking for them. 9439 */ 9440 do { 9441 io_uring_try_cancel_requests(ctx, NULL, true); 9442 if (ctx->sq_data) { 9443 struct io_sq_data *sqd = ctx->sq_data; 9444 struct task_struct *tsk; 9445 9446 io_sq_thread_park(sqd); 9447 tsk = sqd->thread; 9448 if (tsk && tsk->io_uring && tsk->io_uring->io_wq) 9449 io_wq_cancel_cb(tsk->io_uring->io_wq, 9450 io_cancel_ctx_cb, ctx, true); 9451 io_sq_thread_unpark(sqd); 9452 } 9453 9454 io_req_caches_free(ctx); 9455 9456 if (WARN_ON_ONCE(time_after(jiffies, timeout))) { 9457 /* there is little hope left, don't run it too often */ 9458 interval = HZ * 60; 9459 } 9460 } while (!wait_for_completion_timeout(&ctx->ref_comp, interval)); 9461 9462 init_completion(&exit.completion); 9463 init_task_work(&exit.task_work, io_tctx_exit_cb); 9464 exit.ctx = ctx; 9465 /* 9466 * Some may use context even when all refs and requests have been put, 9467 * and they are free to do so while still holding uring_lock or 9468 * completion_lock, see io_req_task_submit(). Apart from other work, 9469 * this lock/unlock section also waits them to finish. 9470 */ 9471 mutex_lock(&ctx->uring_lock); 9472 while (!list_empty(&ctx->tctx_list)) { 9473 WARN_ON_ONCE(time_after(jiffies, timeout)); 9474 9475 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, 9476 ctx_node); 9477 /* don't spin on a single task if cancellation failed */ 9478 list_rotate_left(&ctx->tctx_list); 9479 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); 9480 if (WARN_ON_ONCE(ret)) 9481 continue; 9482 9483 mutex_unlock(&ctx->uring_lock); 9484 wait_for_completion(&exit.completion); 9485 mutex_lock(&ctx->uring_lock); 9486 } 9487 mutex_unlock(&ctx->uring_lock); 9488 spin_lock(&ctx->completion_lock); 9489 spin_unlock(&ctx->completion_lock); 9490 9491 io_ring_ctx_free(ctx); 9492} 9493 9494/* Returns true if we found and killed one or more timeouts */ 9495static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, 9496 struct task_struct *tsk, bool cancel_all) 9497{ 9498 struct io_kiocb *req, *tmp; 9499 int canceled = 0; 9500 9501 spin_lock(&ctx->completion_lock); 9502 spin_lock_irq(&ctx->timeout_lock); 9503 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { 9504 if (io_match_task(req, tsk, cancel_all)) { 9505 io_kill_timeout(req, -ECANCELED); 9506 canceled++; 9507 } 9508 } 9509 spin_unlock_irq(&ctx->timeout_lock); 9510 if (canceled != 0) 9511 io_commit_cqring(ctx); 9512 spin_unlock(&ctx->completion_lock); 9513 if (canceled != 0) 9514 io_cqring_ev_posted(ctx); 9515 return canceled != 0; 9516} 9517 9518static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) 9519{ 9520 unsigned long index; 9521 struct creds *creds; 9522 9523 mutex_lock(&ctx->uring_lock); 9524 percpu_ref_kill(&ctx->refs); 9525 if (ctx->rings) 9526 __io_cqring_overflow_flush(ctx, true); 9527 xa_for_each(&ctx->personalities, index, creds) 9528 io_unregister_personality(ctx, index); 9529 mutex_unlock(&ctx->uring_lock); 9530 9531 io_kill_timeouts(ctx, NULL, true); 9532 io_poll_remove_all(ctx, NULL, true); 9533 9534 /* if we failed setting up the ctx, we might not have any rings */ 9535 io_iopoll_try_reap_events(ctx); 9536 9537 INIT_WORK(&ctx->exit_work, io_ring_exit_work); 9538 /* 9539 * Use system_unbound_wq to avoid spawning tons of event kworkers 9540 * if we're exiting a ton of rings at the same time. It just adds 9541 * noise and overhead, there's no discernable change in runtime 9542 * over using system_wq. 9543 */ 9544 queue_work(system_unbound_wq, &ctx->exit_work); 9545} 9546 9547static int io_uring_release(struct inode *inode, struct file *file) 9548{ 9549 struct io_ring_ctx *ctx = file->private_data; 9550 9551 file->private_data = NULL; 9552 io_ring_ctx_wait_and_kill(ctx); 9553 return 0; 9554} 9555 9556struct io_task_cancel { 9557 struct task_struct *task; 9558 bool all; 9559}; 9560 9561static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 9562{ 9563 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 9564 struct io_task_cancel *cancel = data; 9565 bool ret; 9566 9567 if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) { 9568 struct io_ring_ctx *ctx = req->ctx; 9569 9570 /* protect against races with linked timeouts */ 9571 spin_lock(&ctx->completion_lock); 9572 ret = io_match_task(req, cancel->task, cancel->all); 9573 spin_unlock(&ctx->completion_lock); 9574 } else { 9575 ret = io_match_task(req, cancel->task, cancel->all); 9576 } 9577 return ret; 9578} 9579 9580static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, 9581 struct task_struct *task, 9582 bool cancel_all) 9583{ 9584 struct io_defer_entry *de; 9585 LIST_HEAD(list); 9586 9587 spin_lock(&ctx->completion_lock); 9588 list_for_each_entry_reverse(de, &ctx->defer_list, list) { 9589 if (io_match_task(de->req, task, cancel_all)) { 9590 list_cut_position(&list, &ctx->defer_list, &de->list); 9591 break; 9592 } 9593 } 9594 spin_unlock(&ctx->completion_lock); 9595 if (list_empty(&list)) 9596 return false; 9597 9598 while (!list_empty(&list)) { 9599 de = list_first_entry(&list, struct io_defer_entry, list); 9600 list_del_init(&de->list); 9601 io_req_complete_failed(de->req, -ECANCELED); 9602 kfree(de); 9603 } 9604 return true; 9605} 9606 9607static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) 9608{ 9609 struct io_tctx_node *node; 9610 enum io_wq_cancel cret; 9611 bool ret = false; 9612 9613 mutex_lock(&ctx->uring_lock); 9614 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 9615 struct io_uring_task *tctx = node->task->io_uring; 9616 9617 /* 9618 * io_wq will stay alive while we hold uring_lock, because it's 9619 * killed after ctx nodes, which requires to take the lock. 9620 */ 9621 if (!tctx || !tctx->io_wq) 9622 continue; 9623 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); 9624 ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 9625 } 9626 mutex_unlock(&ctx->uring_lock); 9627 9628 return ret; 9629} 9630 9631static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 9632 struct task_struct *task, 9633 bool cancel_all) 9634{ 9635 struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; 9636 struct io_uring_task *tctx = task ? task->io_uring : NULL; 9637 9638 while (1) { 9639 enum io_wq_cancel cret; 9640 bool ret = false; 9641 9642 if (!task) { 9643 ret |= io_uring_try_cancel_iowq(ctx); 9644 } else if (tctx && tctx->io_wq) { 9645 /* 9646 * Cancels requests of all rings, not only @ctx, but 9647 * it's fine as the task is in exit/exec. 9648 */ 9649 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, 9650 &cancel, true); 9651 ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 9652 } 9653 9654 /* SQPOLL thread does its own polling */ 9655 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || 9656 (ctx->sq_data && ctx->sq_data->thread == current)) { 9657 while (!wq_list_empty(&ctx->iopoll_list)) { 9658 io_iopoll_try_reap_events(ctx); 9659 ret = true; 9660 } 9661 } 9662 9663 ret |= io_cancel_defer_files(ctx, task, cancel_all); 9664 ret |= io_poll_remove_all(ctx, task, cancel_all); 9665 ret |= io_kill_timeouts(ctx, task, cancel_all); 9666 if (task) 9667 ret |= io_run_task_work(); 9668 if (!ret) 9669 break; 9670 cond_resched(); 9671 } 9672} 9673 9674static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) 9675{ 9676 struct io_uring_task *tctx = current->io_uring; 9677 struct io_tctx_node *node; 9678 int ret; 9679 9680 if (unlikely(!tctx)) { 9681 ret = io_uring_alloc_task_context(current, ctx); 9682 if (unlikely(ret)) 9683 return ret; 9684 9685 tctx = current->io_uring; 9686 if (ctx->iowq_limits_set) { 9687 unsigned int limits[2] = { ctx->iowq_limits[0], 9688 ctx->iowq_limits[1], }; 9689 9690 ret = io_wq_max_workers(tctx->io_wq, limits); 9691 if (ret) 9692 return ret; 9693 } 9694 } 9695 if (!xa_load(&tctx->xa, (unsigned long)ctx)) { 9696 node = kmalloc(sizeof(*node), GFP_KERNEL); 9697 if (!node) 9698 return -ENOMEM; 9699 node->ctx = ctx; 9700 node->task = current; 9701 9702 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, 9703 node, GFP_KERNEL)); 9704 if (ret) { 9705 kfree(node); 9706 return ret; 9707 } 9708 9709 mutex_lock(&ctx->uring_lock); 9710 list_add(&node->ctx_node, &ctx->tctx_list); 9711 mutex_unlock(&ctx->uring_lock); 9712 } 9713 tctx->last = ctx; 9714 return 0; 9715} 9716 9717/* 9718 * Note that this task has used io_uring. We use it for cancelation purposes. 9719 */ 9720static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) 9721{ 9722 struct io_uring_task *tctx = current->io_uring; 9723 9724 if (likely(tctx && tctx->last == ctx)) 9725 return 0; 9726 return __io_uring_add_tctx_node(ctx); 9727} 9728 9729/* 9730 * Remove this io_uring_file -> task mapping. 9731 */ 9732static __cold void io_uring_del_tctx_node(unsigned long index) 9733{ 9734 struct io_uring_task *tctx = current->io_uring; 9735 struct io_tctx_node *node; 9736 9737 if (!tctx) 9738 return; 9739 node = xa_erase(&tctx->xa, index); 9740 if (!node) 9741 return; 9742 9743 WARN_ON_ONCE(current != node->task); 9744 WARN_ON_ONCE(list_empty(&node->ctx_node)); 9745 9746 mutex_lock(&node->ctx->uring_lock); 9747 list_del(&node->ctx_node); 9748 mutex_unlock(&node->ctx->uring_lock); 9749 9750 if (tctx->last == node->ctx) 9751 tctx->last = NULL; 9752 kfree(node); 9753} 9754 9755static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) 9756{ 9757 struct io_wq *wq = tctx->io_wq; 9758 struct io_tctx_node *node; 9759 unsigned long index; 9760 9761 xa_for_each(&tctx->xa, index, node) { 9762 io_uring_del_tctx_node(index); 9763 cond_resched(); 9764 } 9765 if (wq) { 9766 /* 9767 * Must be after io_uring_del_task_file() (removes nodes under 9768 * uring_lock) to avoid race with io_uring_try_cancel_iowq(). 9769 */ 9770 io_wq_put_and_exit(wq); 9771 tctx->io_wq = NULL; 9772 } 9773} 9774 9775static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) 9776{ 9777 if (tracked) 9778 return atomic_read(&tctx->inflight_tracked); 9779 return percpu_counter_sum(&tctx->inflight); 9780} 9781 9782static __cold void io_uring_drop_tctx_refs(struct task_struct *task) 9783{ 9784 struct io_uring_task *tctx = task->io_uring; 9785 unsigned int refs = tctx->cached_refs; 9786 9787 if (refs) { 9788 tctx->cached_refs = 0; 9789 percpu_counter_sub(&tctx->inflight, refs); 9790 put_task_struct_many(task, refs); 9791 } 9792} 9793 9794/* 9795 * Find any io_uring ctx that this task has registered or done IO on, and cancel 9796 * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation. 9797 */ 9798static __cold void io_uring_cancel_generic(bool cancel_all, 9799 struct io_sq_data *sqd) 9800{ 9801 struct io_uring_task *tctx = current->io_uring; 9802 struct io_ring_ctx *ctx; 9803 s64 inflight; 9804 DEFINE_WAIT(wait); 9805 9806 WARN_ON_ONCE(sqd && sqd->thread != current); 9807 9808 if (!current->io_uring) 9809 return; 9810 if (tctx->io_wq) 9811 io_wq_exit_start(tctx->io_wq); 9812 9813 atomic_inc(&tctx->in_idle); 9814 do { 9815 io_uring_drop_tctx_refs(current); 9816 /* read completions before cancelations */ 9817 inflight = tctx_inflight(tctx, !cancel_all); 9818 if (!inflight) 9819 break; 9820 9821 if (!sqd) { 9822 struct io_tctx_node *node; 9823 unsigned long index; 9824 9825 xa_for_each(&tctx->xa, index, node) { 9826 /* sqpoll task will cancel all its requests */ 9827 if (node->ctx->sq_data) 9828 continue; 9829 io_uring_try_cancel_requests(node->ctx, current, 9830 cancel_all); 9831 } 9832 } else { 9833 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 9834 io_uring_try_cancel_requests(ctx, current, 9835 cancel_all); 9836 } 9837 9838 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); 9839 io_uring_drop_tctx_refs(current); 9840 /* 9841 * If we've seen completions, retry without waiting. This 9842 * avoids a race where a completion comes in before we did 9843 * prepare_to_wait(). 9844 */ 9845 if (inflight == tctx_inflight(tctx, !cancel_all)) 9846 schedule(); 9847 finish_wait(&tctx->wait, &wait); 9848 } while (1); 9849 atomic_dec(&tctx->in_idle); 9850 9851 io_uring_clean_tctx(tctx); 9852 if (cancel_all) { 9853 /* for exec all current's requests should be gone, kill tctx */ 9854 __io_uring_free(current); 9855 } 9856} 9857 9858void __io_uring_cancel(bool cancel_all) 9859{ 9860 io_uring_cancel_generic(cancel_all, NULL); 9861} 9862 9863static void *io_uring_validate_mmap_request(struct file *file, 9864 loff_t pgoff, size_t sz) 9865{ 9866 struct io_ring_ctx *ctx = file->private_data; 9867 loff_t offset = pgoff << PAGE_SHIFT; 9868 struct page *page; 9869 void *ptr; 9870 9871 switch (offset) { 9872 case IORING_OFF_SQ_RING: 9873 case IORING_OFF_CQ_RING: 9874 ptr = ctx->rings; 9875 break; 9876 case IORING_OFF_SQES: 9877 ptr = ctx->sq_sqes; 9878 break; 9879 default: 9880 return ERR_PTR(-EINVAL); 9881 } 9882 9883 page = virt_to_head_page(ptr); 9884 if (sz > page_size(page)) 9885 return ERR_PTR(-EINVAL); 9886 9887 return ptr; 9888} 9889 9890#ifdef CONFIG_MMU 9891 9892static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 9893{ 9894 size_t sz = vma->vm_end - vma->vm_start; 9895 unsigned long pfn; 9896 void *ptr; 9897 9898 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); 9899 if (IS_ERR(ptr)) 9900 return PTR_ERR(ptr); 9901 9902 pfn = virt_to_phys(ptr) >> PAGE_SHIFT; 9903 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); 9904} 9905 9906#else /* !CONFIG_MMU */ 9907 9908static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 9909{ 9910 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL; 9911} 9912 9913static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) 9914{ 9915 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; 9916} 9917 9918static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, 9919 unsigned long addr, unsigned long len, 9920 unsigned long pgoff, unsigned long flags) 9921{ 9922 void *ptr; 9923 9924 ptr = io_uring_validate_mmap_request(file, pgoff, len); 9925 if (IS_ERR(ptr)) 9926 return PTR_ERR(ptr); 9927 9928 return (unsigned long) ptr; 9929} 9930 9931#endif /* !CONFIG_MMU */ 9932 9933static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) 9934{ 9935 DEFINE_WAIT(wait); 9936 9937 do { 9938 if (!io_sqring_full(ctx)) 9939 break; 9940 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); 9941 9942 if (!io_sqring_full(ctx)) 9943 break; 9944 schedule(); 9945 } while (!signal_pending(current)); 9946 9947 finish_wait(&ctx->sqo_sq_wait, &wait); 9948 return 0; 9949} 9950 9951static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, 9952 struct __kernel_timespec __user **ts, 9953 const sigset_t __user **sig) 9954{ 9955 struct io_uring_getevents_arg arg; 9956 9957 /* 9958 * If EXT_ARG isn't set, then we have no timespec and the argp pointer 9959 * is just a pointer to the sigset_t. 9960 */ 9961 if (!(flags & IORING_ENTER_EXT_ARG)) { 9962 *sig = (const sigset_t __user *) argp; 9963 *ts = NULL; 9964 return 0; 9965 } 9966 9967 /* 9968 * EXT_ARG is set - ensure we agree on the size of it and copy in our 9969 * timespec and sigset_t pointers if good. 9970 */ 9971 if (*argsz != sizeof(arg)) 9972 return -EINVAL; 9973 if (copy_from_user(&arg, argp, sizeof(arg))) 9974 return -EFAULT; 9975 *sig = u64_to_user_ptr(arg.sigmask); 9976 *argsz = arg.sigmask_sz; 9977 *ts = u64_to_user_ptr(arg.ts); 9978 return 0; 9979} 9980 9981SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, 9982 u32, min_complete, u32, flags, const void __user *, argp, 9983 size_t, argsz) 9984{ 9985 struct io_ring_ctx *ctx; 9986 int submitted = 0; 9987 struct fd f; 9988 long ret; 9989 9990 io_run_task_work(); 9991 9992 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 9993 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) 9994 return -EINVAL; 9995 9996 f = fdget(fd); 9997 if (unlikely(!f.file)) 9998 return -EBADF; 9999 10000 ret = -EOPNOTSUPP; 10001 if (unlikely(f.file->f_op != &io_uring_fops)) 10002 goto out_fput; 10003 10004 ret = -ENXIO; 10005 ctx = f.file->private_data; 10006 if (unlikely(!percpu_ref_tryget(&ctx->refs))) 10007 goto out_fput; 10008 10009 ret = -EBADFD; 10010 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) 10011 goto out; 10012 10013 /* 10014 * For SQ polling, the thread will do all submissions and completions. 10015 * Just return the requested submit count, and wake the thread if 10016 * we were asked to. 10017 */ 10018 ret = 0; 10019 if (ctx->flags & IORING_SETUP_SQPOLL) { 10020 io_cqring_overflow_flush(ctx); 10021 10022 if (unlikely(ctx->sq_data->thread == NULL)) { 10023 ret = -EOWNERDEAD; 10024 goto out; 10025 } 10026 if (flags & IORING_ENTER_SQ_WAKEUP) 10027 wake_up(&ctx->sq_data->wait); 10028 if (flags & IORING_ENTER_SQ_WAIT) { 10029 ret = io_sqpoll_wait_sq(ctx); 10030 if (ret) 10031 goto out; 10032 } 10033 submitted = to_submit; 10034 } else if (to_submit) { 10035 ret = io_uring_add_tctx_node(ctx); 10036 if (unlikely(ret)) 10037 goto out; 10038 mutex_lock(&ctx->uring_lock); 10039 submitted = io_submit_sqes(ctx, to_submit); 10040 mutex_unlock(&ctx->uring_lock); 10041 10042 if (submitted != to_submit) 10043 goto out; 10044 } 10045 if (flags & IORING_ENTER_GETEVENTS) { 10046 const sigset_t __user *sig; 10047 struct __kernel_timespec __user *ts; 10048 10049 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 10050 if (unlikely(ret)) 10051 goto out; 10052 10053 min_complete = min(min_complete, ctx->cq_entries); 10054 10055 /* 10056 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user 10057 * space applications don't need to do io completion events 10058 * polling again, they can rely on io_sq_thread to do polling 10059 * work, which can reduce cpu usage and uring_lock contention. 10060 */ 10061 if (ctx->flags & IORING_SETUP_IOPOLL && 10062 !(ctx->flags & IORING_SETUP_SQPOLL)) { 10063 ret = io_iopoll_check(ctx, min_complete); 10064 } else { 10065 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); 10066 } 10067 } 10068 10069out: 10070 percpu_ref_put(&ctx->refs); 10071out_fput: 10072 fdput(f); 10073 return submitted ? submitted : ret; 10074} 10075 10076#ifdef CONFIG_PROC_FS 10077static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, 10078 const struct cred *cred) 10079{ 10080 struct user_namespace *uns = seq_user_ns(m); 10081 struct group_info *gi; 10082 kernel_cap_t cap; 10083 unsigned __capi; 10084 int g; 10085 10086 seq_printf(m, "%5d\n", id); 10087 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); 10088 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); 10089 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); 10090 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); 10091 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); 10092 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); 10093 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); 10094 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); 10095 seq_puts(m, "\n\tGroups:\t"); 10096 gi = cred->group_info; 10097 for (g = 0; g < gi->ngroups; g++) { 10098 seq_put_decimal_ull(m, g ? " " : "", 10099 from_kgid_munged(uns, gi->gid[g])); 10100 } 10101 seq_puts(m, "\n\tCapEff:\t"); 10102 cap = cred->cap_effective; 10103 CAP_FOR_EACH_U32(__capi) 10104 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); 10105 seq_putc(m, '\n'); 10106 return 0; 10107} 10108 10109static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, 10110 struct seq_file *m) 10111{ 10112 struct io_sq_data *sq = NULL; 10113 struct io_overflow_cqe *ocqe; 10114 struct io_rings *r = ctx->rings; 10115 unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; 10116 unsigned int sq_head = READ_ONCE(r->sq.head); 10117 unsigned int sq_tail = READ_ONCE(r->sq.tail); 10118 unsigned int cq_head = READ_ONCE(r->cq.head); 10119 unsigned int cq_tail = READ_ONCE(r->cq.tail); 10120 unsigned int sq_entries, cq_entries; 10121 bool has_lock; 10122 unsigned int i; 10123 10124 /* 10125 * we may get imprecise sqe and cqe info if uring is actively running 10126 * since we get cached_sq_head and cached_cq_tail without uring_lock 10127 * and sq_tail and cq_head are changed by userspace. But it's ok since 10128 * we usually use these info when it is stuck. 10129 */ 10130 seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask); 10131 seq_printf(m, "SqHead:\t%u\n", sq_head); 10132 seq_printf(m, "SqTail:\t%u\n", sq_tail); 10133 seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); 10134 seq_printf(m, "CqMask:\t0x%x\n", cq_mask); 10135 seq_printf(m, "CqHead:\t%u\n", cq_head); 10136 seq_printf(m, "CqTail:\t%u\n", cq_tail); 10137 seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); 10138 seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); 10139 sq_entries = min(sq_tail - sq_head, ctx->sq_entries); 10140 for (i = 0; i < sq_entries; i++) { 10141 unsigned int entry = i + sq_head; 10142 unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); 10143 struct io_uring_sqe *sqe; 10144 10145 if (sq_idx > sq_mask) 10146 continue; 10147 sqe = &ctx->sq_sqes[sq_idx]; 10148 seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", 10149 sq_idx, sqe->opcode, sqe->fd, sqe->flags, 10150 sqe->user_data); 10151 } 10152 seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); 10153 cq_entries = min(cq_tail - cq_head, ctx->cq_entries); 10154 for (i = 0; i < cq_entries; i++) { 10155 unsigned int entry = i + cq_head; 10156 struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask]; 10157 10158 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", 10159 entry & cq_mask, cqe->user_data, cqe->res, 10160 cqe->flags); 10161 } 10162 10163 /* 10164 * Avoid ABBA deadlock between the seq lock and the io_uring mutex, 10165 * since fdinfo case grabs it in the opposite direction of normal use 10166 * cases. If we fail to get the lock, we just don't iterate any 10167 * structures that could be going away outside the io_uring mutex. 10168 */ 10169 has_lock = mutex_trylock(&ctx->uring_lock); 10170 10171 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { 10172 sq = ctx->sq_data; 10173 if (!sq->thread) 10174 sq = NULL; 10175 } 10176 10177 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); 10178 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); 10179 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); 10180 for (i = 0; has_lock && i < ctx->nr_user_files; i++) { 10181 struct file *f = io_file_from_index(ctx, i); 10182 10183 if (f) 10184 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); 10185 else 10186 seq_printf(m, "%5u: <none>\n", i); 10187 } 10188 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); 10189 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { 10190 struct io_mapped_ubuf *buf = ctx->user_bufs[i]; 10191 unsigned int len = buf->ubuf_end - buf->ubuf; 10192 10193 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); 10194 } 10195 if (has_lock && !xa_empty(&ctx->personalities)) { 10196 unsigned long index; 10197 const struct cred *cred; 10198 10199 seq_printf(m, "Personalities:\n"); 10200 xa_for_each(&ctx->personalities, index, cred) 10201 io_uring_show_cred(m, index, cred); 10202 } 10203 if (has_lock) 10204 mutex_unlock(&ctx->uring_lock); 10205 10206 seq_puts(m, "PollList:\n"); 10207 spin_lock(&ctx->completion_lock); 10208 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 10209 struct hlist_head *list = &ctx->cancel_hash[i]; 10210 struct io_kiocb *req; 10211 10212 hlist_for_each_entry(req, list, hash_node) 10213 seq_printf(m, " op=%d, task_works=%d\n", req->opcode, 10214 req->task->task_works != NULL); 10215 } 10216 10217 seq_puts(m, "CqOverflowList:\n"); 10218 list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { 10219 struct io_uring_cqe *cqe = &ocqe->cqe; 10220 10221 seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", 10222 cqe->user_data, cqe->res, cqe->flags); 10223 10224 } 10225 10226 spin_unlock(&ctx->completion_lock); 10227} 10228 10229static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) 10230{ 10231 struct io_ring_ctx *ctx = f->private_data; 10232 10233 if (percpu_ref_tryget(&ctx->refs)) { 10234 __io_uring_show_fdinfo(ctx, m); 10235 percpu_ref_put(&ctx->refs); 10236 } 10237} 10238#endif 10239 10240static const struct file_operations io_uring_fops = { 10241 .release = io_uring_release, 10242 .mmap = io_uring_mmap, 10243#ifndef CONFIG_MMU 10244 .get_unmapped_area = io_uring_nommu_get_unmapped_area, 10245 .mmap_capabilities = io_uring_nommu_mmap_capabilities, 10246#endif 10247 .poll = io_uring_poll, 10248#ifdef CONFIG_PROC_FS 10249 .show_fdinfo = io_uring_show_fdinfo, 10250#endif 10251}; 10252 10253static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, 10254 struct io_uring_params *p) 10255{ 10256 struct io_rings *rings; 10257 size_t size, sq_array_offset; 10258 10259 /* make sure these are sane, as we already accounted them */ 10260 ctx->sq_entries = p->sq_entries; 10261 ctx->cq_entries = p->cq_entries; 10262 10263 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); 10264 if (size == SIZE_MAX) 10265 return -EOVERFLOW; 10266 10267 rings = io_mem_alloc(size); 10268 if (!rings) 10269 return -ENOMEM; 10270 10271 ctx->rings = rings; 10272 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 10273 rings->sq_ring_mask = p->sq_entries - 1; 10274 rings->cq_ring_mask = p->cq_entries - 1; 10275 rings->sq_ring_entries = p->sq_entries; 10276 rings->cq_ring_entries = p->cq_entries; 10277 10278 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); 10279 if (size == SIZE_MAX) { 10280 io_mem_free(ctx->rings); 10281 ctx->rings = NULL; 10282 return -EOVERFLOW; 10283 } 10284 10285 ctx->sq_sqes = io_mem_alloc(size); 10286 if (!ctx->sq_sqes) { 10287 io_mem_free(ctx->rings); 10288 ctx->rings = NULL; 10289 return -ENOMEM; 10290 } 10291 10292 return 0; 10293} 10294 10295static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) 10296{ 10297 int ret, fd; 10298 10299 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); 10300 if (fd < 0) 10301 return fd; 10302 10303 ret = io_uring_add_tctx_node(ctx); 10304 if (ret) { 10305 put_unused_fd(fd); 10306 return ret; 10307 } 10308 fd_install(fd, file); 10309 return fd; 10310} 10311 10312/* 10313 * Allocate an anonymous fd, this is what constitutes the application 10314 * visible backing of an io_uring instance. The application mmaps this 10315 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, 10316 * we have to tie this fd to a socket for file garbage collection purposes. 10317 */ 10318static struct file *io_uring_get_file(struct io_ring_ctx *ctx) 10319{ 10320 struct file *file; 10321#if defined(CONFIG_UNIX) 10322 int ret; 10323 10324 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, 10325 &ctx->ring_sock); 10326 if (ret) 10327 return ERR_PTR(ret); 10328#endif 10329 10330 file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, 10331 O_RDWR | O_CLOEXEC, NULL); 10332#if defined(CONFIG_UNIX) 10333 if (IS_ERR(file)) { 10334 sock_release(ctx->ring_sock); 10335 ctx->ring_sock = NULL; 10336 } else { 10337 ctx->ring_sock->file = file; 10338 } 10339#endif 10340 return file; 10341} 10342 10343static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, 10344 struct io_uring_params __user *params) 10345{ 10346 struct io_ring_ctx *ctx; 10347 struct file *file; 10348 int ret; 10349 10350 if (!entries) 10351 return -EINVAL; 10352 if (entries > IORING_MAX_ENTRIES) { 10353 if (!(p->flags & IORING_SETUP_CLAMP)) 10354 return -EINVAL; 10355 entries = IORING_MAX_ENTRIES; 10356 } 10357 10358 /* 10359 * Use twice as many entries for the CQ ring. It's possible for the 10360 * application to drive a higher depth than the size of the SQ ring, 10361 * since the sqes are only used at submission time. This allows for 10362 * some flexibility in overcommitting a bit. If the application has 10363 * set IORING_SETUP_CQSIZE, it will have passed in the desired number 10364 * of CQ ring entries manually. 10365 */ 10366 p->sq_entries = roundup_pow_of_two(entries); 10367 if (p->flags & IORING_SETUP_CQSIZE) { 10368 /* 10369 * If IORING_SETUP_CQSIZE is set, we do the same roundup 10370 * to a power-of-two, if it isn't already. We do NOT impose 10371 * any cq vs sq ring sizing. 10372 */ 10373 if (!p->cq_entries) 10374 return -EINVAL; 10375 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { 10376 if (!(p->flags & IORING_SETUP_CLAMP)) 10377 return -EINVAL; 10378 p->cq_entries = IORING_MAX_CQ_ENTRIES; 10379 } 10380 p->cq_entries = roundup_pow_of_two(p->cq_entries); 10381 if (p->cq_entries < p->sq_entries) 10382 return -EINVAL; 10383 } else { 10384 p->cq_entries = 2 * p->sq_entries; 10385 } 10386 10387 ctx = io_ring_ctx_alloc(p); 10388 if (!ctx) 10389 return -ENOMEM; 10390 ctx->compat = in_compat_syscall(); 10391 if (!capable(CAP_IPC_LOCK)) 10392 ctx->user = get_uid(current_user()); 10393 10394 /* 10395 * This is just grabbed for accounting purposes. When a process exits, 10396 * the mm is exited and dropped before the files, hence we need to hang 10397 * on to this mm purely for the purposes of being able to unaccount 10398 * memory (locked/pinned vm). It's not used for anything else. 10399 */ 10400 mmgrab(current->mm); 10401 ctx->mm_account = current->mm; 10402 10403 ret = io_allocate_scq_urings(ctx, p); 10404 if (ret) 10405 goto err; 10406 10407 ret = io_sq_offload_create(ctx, p); 10408 if (ret) 10409 goto err; 10410 /* always set a rsrc node */ 10411 ret = io_rsrc_node_switch_start(ctx); 10412 if (ret) 10413 goto err; 10414 io_rsrc_node_switch(ctx, NULL); 10415 10416 memset(&p->sq_off, 0, sizeof(p->sq_off)); 10417 p->sq_off.head = offsetof(struct io_rings, sq.head); 10418 p->sq_off.tail = offsetof(struct io_rings, sq.tail); 10419 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); 10420 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); 10421 p->sq_off.flags = offsetof(struct io_rings, sq_flags); 10422 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); 10423 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; 10424 10425 memset(&p->cq_off, 0, sizeof(p->cq_off)); 10426 p->cq_off.head = offsetof(struct io_rings, cq.head); 10427 p->cq_off.tail = offsetof(struct io_rings, cq.tail); 10428 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); 10429 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); 10430 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); 10431 p->cq_off.cqes = offsetof(struct io_rings, cqes); 10432 p->cq_off.flags = offsetof(struct io_rings, cq_flags); 10433 10434 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | 10435 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | 10436 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | 10437 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | 10438 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | 10439 IORING_FEAT_RSRC_TAGS; 10440 10441 if (copy_to_user(params, p, sizeof(*p))) { 10442 ret = -EFAULT; 10443 goto err; 10444 } 10445 10446 file = io_uring_get_file(ctx); 10447 if (IS_ERR(file)) { 10448 ret = PTR_ERR(file); 10449 goto err; 10450 } 10451 10452 /* 10453 * Install ring fd as the very last thing, so we don't risk someone 10454 * having closed it before we finish setup 10455 */ 10456 ret = io_uring_install_fd(ctx, file); 10457 if (ret < 0) { 10458 /* fput will clean it up */ 10459 fput(file); 10460 return ret; 10461 } 10462 10463 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); 10464 return ret; 10465err: 10466 io_ring_ctx_wait_and_kill(ctx); 10467 return ret; 10468} 10469 10470/* 10471 * Sets up an aio uring context, and returns the fd. Applications asks for a 10472 * ring size, we return the actual sq/cq ring sizes (among other things) in the 10473 * params structure passed in. 10474 */ 10475static long io_uring_setup(u32 entries, struct io_uring_params __user *params) 10476{ 10477 struct io_uring_params p; 10478 int i; 10479 10480 if (copy_from_user(&p, params, sizeof(p))) 10481 return -EFAULT; 10482 for (i = 0; i < ARRAY_SIZE(p.resv); i++) { 10483 if (p.resv[i]) 10484 return -EINVAL; 10485 } 10486 10487 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | 10488 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | 10489 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | 10490 IORING_SETUP_R_DISABLED)) 10491 return -EINVAL; 10492 10493 return io_uring_create(entries, &p, params); 10494} 10495 10496SYSCALL_DEFINE2(io_uring_setup, u32, entries, 10497 struct io_uring_params __user *, params) 10498{ 10499 return io_uring_setup(entries, params); 10500} 10501 10502static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, 10503 unsigned nr_args) 10504{ 10505 struct io_uring_probe *p; 10506 size_t size; 10507 int i, ret; 10508 10509 size = struct_size(p, ops, nr_args); 10510 if (size == SIZE_MAX) 10511 return -EOVERFLOW; 10512 p = kzalloc(size, GFP_KERNEL); 10513 if (!p) 10514 return -ENOMEM; 10515 10516 ret = -EFAULT; 10517 if (copy_from_user(p, arg, size)) 10518 goto out; 10519 ret = -EINVAL; 10520 if (memchr_inv(p, 0, size)) 10521 goto out; 10522 10523 p->last_op = IORING_OP_LAST - 1; 10524 if (nr_args > IORING_OP_LAST) 10525 nr_args = IORING_OP_LAST; 10526 10527 for (i = 0; i < nr_args; i++) { 10528 p->ops[i].op = i; 10529 if (!io_op_defs[i].not_supported) 10530 p->ops[i].flags = IO_URING_OP_SUPPORTED; 10531 } 10532 p->ops_len = i; 10533 10534 ret = 0; 10535 if (copy_to_user(arg, p, size)) 10536 ret = -EFAULT; 10537out: 10538 kfree(p); 10539 return ret; 10540} 10541 10542static int io_register_personality(struct io_ring_ctx *ctx) 10543{ 10544 const struct cred *creds; 10545 u32 id; 10546 int ret; 10547 10548 creds = get_current_cred(); 10549 10550 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, 10551 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); 10552 if (ret < 0) { 10553 put_cred(creds); 10554 return ret; 10555 } 10556 return id; 10557} 10558 10559static __cold int io_register_restrictions(struct io_ring_ctx *ctx, 10560 void __user *arg, unsigned int nr_args) 10561{ 10562 struct io_uring_restriction *res; 10563 size_t size; 10564 int i, ret; 10565 10566 /* Restrictions allowed only if rings started disabled */ 10567 if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 10568 return -EBADFD; 10569 10570 /* We allow only a single restrictions registration */ 10571 if (ctx->restrictions.registered) 10572 return -EBUSY; 10573 10574 if (!arg || nr_args > IORING_MAX_RESTRICTIONS) 10575 return -EINVAL; 10576 10577 size = array_size(nr_args, sizeof(*res)); 10578 if (size == SIZE_MAX) 10579 return -EOVERFLOW; 10580 10581 res = memdup_user(arg, size); 10582 if (IS_ERR(res)) 10583 return PTR_ERR(res); 10584 10585 ret = 0; 10586 10587 for (i = 0; i < nr_args; i++) { 10588 switch (res[i].opcode) { 10589 case IORING_RESTRICTION_REGISTER_OP: 10590 if (res[i].register_op >= IORING_REGISTER_LAST) { 10591 ret = -EINVAL; 10592 goto out; 10593 } 10594 10595 __set_bit(res[i].register_op, 10596 ctx->restrictions.register_op); 10597 break; 10598 case IORING_RESTRICTION_SQE_OP: 10599 if (res[i].sqe_op >= IORING_OP_LAST) { 10600 ret = -EINVAL; 10601 goto out; 10602 } 10603 10604 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); 10605 break; 10606 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: 10607 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; 10608 break; 10609 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: 10610 ctx->restrictions.sqe_flags_required = res[i].sqe_flags; 10611 break; 10612 default: 10613 ret = -EINVAL; 10614 goto out; 10615 } 10616 } 10617 10618out: 10619 /* Reset all restrictions if an error happened */ 10620 if (ret != 0) 10621 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); 10622 else 10623 ctx->restrictions.registered = true; 10624 10625 kfree(res); 10626 return ret; 10627} 10628 10629static int io_register_enable_rings(struct io_ring_ctx *ctx) 10630{ 10631 if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 10632 return -EBADFD; 10633 10634 if (ctx->restrictions.registered) 10635 ctx->restricted = 1; 10636 10637 ctx->flags &= ~IORING_SETUP_R_DISABLED; 10638 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) 10639 wake_up(&ctx->sq_data->wait); 10640 return 0; 10641} 10642 10643static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, 10644 struct io_uring_rsrc_update2 *up, 10645 unsigned nr_args) 10646{ 10647 __u32 tmp; 10648 int err; 10649 10650 if (up->resv) 10651 return -EINVAL; 10652 if (check_add_overflow(up->offset, nr_args, &tmp)) 10653 return -EOVERFLOW; 10654 err = io_rsrc_node_switch_start(ctx); 10655 if (err) 10656 return err; 10657 10658 switch (type) { 10659 case IORING_RSRC_FILE: 10660 return __io_sqe_files_update(ctx, up, nr_args); 10661 case IORING_RSRC_BUFFER: 10662 return __io_sqe_buffers_update(ctx, up, nr_args); 10663 } 10664 return -EINVAL; 10665} 10666 10667static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, 10668 unsigned nr_args) 10669{ 10670 struct io_uring_rsrc_update2 up; 10671 10672 if (!nr_args) 10673 return -EINVAL; 10674 memset(&up, 0, sizeof(up)); 10675 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) 10676 return -EFAULT; 10677 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); 10678} 10679 10680static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, 10681 unsigned size, unsigned type) 10682{ 10683 struct io_uring_rsrc_update2 up; 10684 10685 if (size != sizeof(up)) 10686 return -EINVAL; 10687 if (copy_from_user(&up, arg, sizeof(up))) 10688 return -EFAULT; 10689 if (!up.nr || up.resv) 10690 return -EINVAL; 10691 return __io_register_rsrc_update(ctx, type, &up, up.nr); 10692} 10693 10694static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, 10695 unsigned int size, unsigned int type) 10696{ 10697 struct io_uring_rsrc_register rr; 10698 10699 /* keep it extendible */ 10700 if (size != sizeof(rr)) 10701 return -EINVAL; 10702 10703 memset(&rr, 0, sizeof(rr)); 10704 if (copy_from_user(&rr, arg, size)) 10705 return -EFAULT; 10706 if (!rr.nr || rr.resv || rr.resv2) 10707 return -EINVAL; 10708 10709 switch (type) { 10710 case IORING_RSRC_FILE: 10711 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), 10712 rr.nr, u64_to_user_ptr(rr.tags)); 10713 case IORING_RSRC_BUFFER: 10714 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), 10715 rr.nr, u64_to_user_ptr(rr.tags)); 10716 } 10717 return -EINVAL; 10718} 10719 10720static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, 10721 void __user *arg, unsigned len) 10722{ 10723 struct io_uring_task *tctx = current->io_uring; 10724 cpumask_var_t new_mask; 10725 int ret; 10726 10727 if (!tctx || !tctx->io_wq) 10728 return -EINVAL; 10729 10730 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 10731 return -ENOMEM; 10732 10733 cpumask_clear(new_mask); 10734 if (len > cpumask_size()) 10735 len = cpumask_size(); 10736 10737 if (copy_from_user(new_mask, arg, len)) { 10738 free_cpumask_var(new_mask); 10739 return -EFAULT; 10740 } 10741 10742 ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); 10743 free_cpumask_var(new_mask); 10744 return ret; 10745} 10746 10747static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) 10748{ 10749 struct io_uring_task *tctx = current->io_uring; 10750 10751 if (!tctx || !tctx->io_wq) 10752 return -EINVAL; 10753 10754 return io_wq_cpu_affinity(tctx->io_wq, NULL); 10755} 10756 10757static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, 10758 void __user *arg) 10759 __must_hold(&ctx->uring_lock) 10760{ 10761 struct io_tctx_node *node; 10762 struct io_uring_task *tctx = NULL; 10763 struct io_sq_data *sqd = NULL; 10764 __u32 new_count[2]; 10765 int i, ret; 10766 10767 if (copy_from_user(new_count, arg, sizeof(new_count))) 10768 return -EFAULT; 10769 for (i = 0; i < ARRAY_SIZE(new_count); i++) 10770 if (new_count[i] > INT_MAX) 10771 return -EINVAL; 10772 10773 if (ctx->flags & IORING_SETUP_SQPOLL) { 10774 sqd = ctx->sq_data; 10775 if (sqd) { 10776 /* 10777 * Observe the correct sqd->lock -> ctx->uring_lock 10778 * ordering. Fine to drop uring_lock here, we hold 10779 * a ref to the ctx. 10780 */ 10781 refcount_inc(&sqd->refs); 10782 mutex_unlock(&ctx->uring_lock); 10783 mutex_lock(&sqd->lock); 10784 mutex_lock(&ctx->uring_lock); 10785 if (sqd->thread) 10786 tctx = sqd->thread->io_uring; 10787 } 10788 } else { 10789 tctx = current->io_uring; 10790 } 10791 10792 BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); 10793 10794 for (i = 0; i < ARRAY_SIZE(new_count); i++) 10795 if (new_count[i]) 10796 ctx->iowq_limits[i] = new_count[i]; 10797 ctx->iowq_limits_set = true; 10798 10799 if (tctx && tctx->io_wq) { 10800 ret = io_wq_max_workers(tctx->io_wq, new_count); 10801 if (ret) 10802 goto err; 10803 } else { 10804 memset(new_count, 0, sizeof(new_count)); 10805 } 10806 10807 if (sqd) { 10808 mutex_unlock(&sqd->lock); 10809 io_put_sq_data(sqd); 10810 } 10811 10812 if (copy_to_user(arg, new_count, sizeof(new_count))) 10813 return -EFAULT; 10814 10815 /* that's it for SQPOLL, only the SQPOLL task creates requests */ 10816 if (sqd) 10817 return 0; 10818 10819 /* now propagate the restriction to all registered users */ 10820 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 10821 struct io_uring_task *tctx = node->task->io_uring; 10822 10823 if (WARN_ON_ONCE(!tctx->io_wq)) 10824 continue; 10825 10826 for (i = 0; i < ARRAY_SIZE(new_count); i++) 10827 new_count[i] = ctx->iowq_limits[i]; 10828 /* ignore errors, it always returns zero anyway */ 10829 (void)io_wq_max_workers(tctx->io_wq, new_count); 10830 } 10831 return 0; 10832err: 10833 if (sqd) { 10834 mutex_unlock(&sqd->lock); 10835 io_put_sq_data(sqd); 10836 } 10837 return ret; 10838} 10839 10840static bool io_register_op_must_quiesce(int op) 10841{ 10842 switch (op) { 10843 case IORING_REGISTER_BUFFERS: 10844 case IORING_UNREGISTER_BUFFERS: 10845 case IORING_REGISTER_FILES: 10846 case IORING_UNREGISTER_FILES: 10847 case IORING_REGISTER_FILES_UPDATE: 10848 case IORING_REGISTER_PROBE: 10849 case IORING_REGISTER_PERSONALITY: 10850 case IORING_UNREGISTER_PERSONALITY: 10851 case IORING_REGISTER_FILES2: 10852 case IORING_REGISTER_FILES_UPDATE2: 10853 case IORING_REGISTER_BUFFERS2: 10854 case IORING_REGISTER_BUFFERS_UPDATE: 10855 case IORING_REGISTER_IOWQ_AFF: 10856 case IORING_UNREGISTER_IOWQ_AFF: 10857 case IORING_REGISTER_IOWQ_MAX_WORKERS: 10858 return false; 10859 default: 10860 return true; 10861 } 10862} 10863 10864static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx) 10865{ 10866 long ret; 10867 10868 percpu_ref_kill(&ctx->refs); 10869 10870 /* 10871 * Drop uring mutex before waiting for references to exit. If another 10872 * thread is currently inside io_uring_enter() it might need to grab the 10873 * uring_lock to make progress. If we hold it here across the drain 10874 * wait, then we can deadlock. It's safe to drop the mutex here, since 10875 * no new references will come in after we've killed the percpu ref. 10876 */ 10877 mutex_unlock(&ctx->uring_lock); 10878 do { 10879 ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ); 10880 if (ret) { 10881 ret = min(0L, ret); 10882 break; 10883 } 10884 10885 ret = io_run_task_work_sig(); 10886 io_req_caches_free(ctx); 10887 } while (ret >= 0); 10888 mutex_lock(&ctx->uring_lock); 10889 10890 if (ret) 10891 io_refs_resurrect(&ctx->refs, &ctx->ref_comp); 10892 return ret; 10893} 10894 10895static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, 10896 void __user *arg, unsigned nr_args) 10897 __releases(ctx->uring_lock) 10898 __acquires(ctx->uring_lock) 10899{ 10900 int ret; 10901 10902 /* 10903 * We're inside the ring mutex, if the ref is already dying, then 10904 * someone else killed the ctx or is already going through 10905 * io_uring_register(). 10906 */ 10907 if (percpu_ref_is_dying(&ctx->refs)) 10908 return -ENXIO; 10909 10910 if (ctx->restricted) { 10911 if (opcode >= IORING_REGISTER_LAST) 10912 return -EINVAL; 10913 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); 10914 if (!test_bit(opcode, ctx->restrictions.register_op)) 10915 return -EACCES; 10916 } 10917 10918 if (io_register_op_must_quiesce(opcode)) { 10919 ret = io_ctx_quiesce(ctx); 10920 if (ret) 10921 return ret; 10922 } 10923 10924 switch (opcode) { 10925 case IORING_REGISTER_BUFFERS: 10926 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); 10927 break; 10928 case IORING_UNREGISTER_BUFFERS: 10929 ret = -EINVAL; 10930 if (arg || nr_args) 10931 break; 10932 ret = io_sqe_buffers_unregister(ctx); 10933 break; 10934 case IORING_REGISTER_FILES: 10935 ret = io_sqe_files_register(ctx, arg, nr_args, NULL); 10936 break; 10937 case IORING_UNREGISTER_FILES: 10938 ret = -EINVAL; 10939 if (arg || nr_args) 10940 break; 10941 ret = io_sqe_files_unregister(ctx); 10942 break; 10943 case IORING_REGISTER_FILES_UPDATE: 10944 ret = io_register_files_update(ctx, arg, nr_args); 10945 break; 10946 case IORING_REGISTER_EVENTFD: 10947 case IORING_REGISTER_EVENTFD_ASYNC: 10948 ret = -EINVAL; 10949 if (nr_args != 1) 10950 break; 10951 ret = io_eventfd_register(ctx, arg); 10952 if (ret) 10953 break; 10954 if (opcode == IORING_REGISTER_EVENTFD_ASYNC) 10955 ctx->eventfd_async = 1; 10956 else 10957 ctx->eventfd_async = 0; 10958 break; 10959 case IORING_UNREGISTER_EVENTFD: 10960 ret = -EINVAL; 10961 if (arg || nr_args) 10962 break; 10963 ret = io_eventfd_unregister(ctx); 10964 break; 10965 case IORING_REGISTER_PROBE: 10966 ret = -EINVAL; 10967 if (!arg || nr_args > 256) 10968 break; 10969 ret = io_probe(ctx, arg, nr_args); 10970 break; 10971 case IORING_REGISTER_PERSONALITY: 10972 ret = -EINVAL; 10973 if (arg || nr_args) 10974 break; 10975 ret = io_register_personality(ctx); 10976 break; 10977 case IORING_UNREGISTER_PERSONALITY: 10978 ret = -EINVAL; 10979 if (arg) 10980 break; 10981 ret = io_unregister_personality(ctx, nr_args); 10982 break; 10983 case IORING_REGISTER_ENABLE_RINGS: 10984 ret = -EINVAL; 10985 if (arg || nr_args) 10986 break; 10987 ret = io_register_enable_rings(ctx); 10988 break; 10989 case IORING_REGISTER_RESTRICTIONS: 10990 ret = io_register_restrictions(ctx, arg, nr_args); 10991 break; 10992 case IORING_REGISTER_FILES2: 10993 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); 10994 break; 10995 case IORING_REGISTER_FILES_UPDATE2: 10996 ret = io_register_rsrc_update(ctx, arg, nr_args, 10997 IORING_RSRC_FILE); 10998 break; 10999 case IORING_REGISTER_BUFFERS2: 11000 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); 11001 break; 11002 case IORING_REGISTER_BUFFERS_UPDATE: 11003 ret = io_register_rsrc_update(ctx, arg, nr_args, 11004 IORING_RSRC_BUFFER); 11005 break; 11006 case IORING_REGISTER_IOWQ_AFF: 11007 ret = -EINVAL; 11008 if (!arg || !nr_args) 11009 break; 11010 ret = io_register_iowq_aff(ctx, arg, nr_args); 11011 break; 11012 case IORING_UNREGISTER_IOWQ_AFF: 11013 ret = -EINVAL; 11014 if (arg || nr_args) 11015 break; 11016 ret = io_unregister_iowq_aff(ctx); 11017 break; 11018 case IORING_REGISTER_IOWQ_MAX_WORKERS: 11019 ret = -EINVAL; 11020 if (!arg || nr_args != 2) 11021 break; 11022 ret = io_register_iowq_max_workers(ctx, arg); 11023 break; 11024 default: 11025 ret = -EINVAL; 11026 break; 11027 } 11028 11029 if (io_register_op_must_quiesce(opcode)) { 11030 /* bring the ctx back to life */ 11031 percpu_ref_reinit(&ctx->refs); 11032 reinit_completion(&ctx->ref_comp); 11033 } 11034 return ret; 11035} 11036 11037SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, 11038 void __user *, arg, unsigned int, nr_args) 11039{ 11040 struct io_ring_ctx *ctx; 11041 long ret = -EBADF; 11042 struct fd f; 11043 11044 f = fdget(fd); 11045 if (!f.file) 11046 return -EBADF; 11047 11048 ret = -EOPNOTSUPP; 11049 if (f.file->f_op != &io_uring_fops) 11050 goto out_fput; 11051 11052 ctx = f.file->private_data; 11053 11054 io_run_task_work(); 11055 11056 mutex_lock(&ctx->uring_lock); 11057 ret = __io_uring_register(ctx, opcode, arg, nr_args); 11058 mutex_unlock(&ctx->uring_lock); 11059 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, 11060 ctx->cq_ev_fd != NULL, ret); 11061out_fput: 11062 fdput(f); 11063 return ret; 11064} 11065 11066static int __init io_uring_init(void) 11067{ 11068#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ 11069 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ 11070 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ 11071} while (0) 11072 11073#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ 11074 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename) 11075 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64); 11076 BUILD_BUG_SQE_ELEM(0, __u8, opcode); 11077 BUILD_BUG_SQE_ELEM(1, __u8, flags); 11078 BUILD_BUG_SQE_ELEM(2, __u16, ioprio); 11079 BUILD_BUG_SQE_ELEM(4, __s32, fd); 11080 BUILD_BUG_SQE_ELEM(8, __u64, off); 11081 BUILD_BUG_SQE_ELEM(8, __u64, addr2); 11082 BUILD_BUG_SQE_ELEM(16, __u64, addr); 11083 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); 11084 BUILD_BUG_SQE_ELEM(24, __u32, len); 11085 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); 11086 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); 11087 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); 11088 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); 11089 BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); 11090 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); 11091 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); 11092 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); 11093 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); 11094 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags); 11095 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags); 11096 BUILD_BUG_SQE_ELEM(28, __u32, open_flags); 11097 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); 11098 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); 11099 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); 11100 BUILD_BUG_SQE_ELEM(32, __u64, user_data); 11101 BUILD_BUG_SQE_ELEM(40, __u16, buf_index); 11102 BUILD_BUG_SQE_ELEM(40, __u16, buf_group); 11103 BUILD_BUG_SQE_ELEM(42, __u16, personality); 11104 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); 11105 BUILD_BUG_SQE_ELEM(44, __u32, file_index); 11106 11107 BUILD_BUG_ON(sizeof(struct io_uring_files_update) != 11108 sizeof(struct io_uring_rsrc_update)); 11109 BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > 11110 sizeof(struct io_uring_rsrc_update2)); 11111 11112 /* ->buf_index is u16 */ 11113 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); 11114 11115 /* should fit into one byte */ 11116 BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); 11117 BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); 11118 BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); 11119 11120 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); 11121 BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); 11122 11123 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | 11124 SLAB_ACCOUNT); 11125 return 0; 11126}; 11127__initcall(io_uring_init);