// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/tracehook.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "internal.h"
#include "io-wq.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 15)
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)

#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
			 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
			 IOSQE_BUFFER_SELECT)
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
			    REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)

#define IO_TCTX_REFS_CACHE_NR	(1U << 10)

struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring		sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32			sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
150 * 151 * After a new SQ head value was read by the application this 152 * counter includes all submissions that were dropped reaching 153 * the new SQ head (and possibly more). 154 */ 155 u32 sq_dropped; 156 /* 157 * Runtime SQ flags 158 * 159 * Written by the kernel, shouldn't be modified by the 160 * application. 161 * 162 * The application needs a full memory barrier before checking 163 * for IORING_SQ_NEED_WAKEUP after updating the sq tail. 164 */ 165 u32 sq_flags; 166 /* 167 * Runtime CQ flags 168 * 169 * Written by the application, shouldn't be modified by the 170 * kernel. 171 */ 172 u32 cq_flags; 173 /* 174 * Number of completion events lost because the queue was full; 175 * this should be avoided by the application by making sure 176 * there are not more requests pending than there is space in 177 * the completion queue. 178 * 179 * Written by the kernel, shouldn't be modified by the 180 * application (i.e. get number of "new events" by comparing to 181 * cached value). 182 * 183 * As completion events come in out of order this counter is not 184 * ordered with any other data. 185 */ 186 u32 cq_overflow; 187 /* 188 * Ring buffer of completion events. 189 * 190 * The kernel writes completion events fresh every time they are 191 * produced, so the application is allowed to modify pending 192 * entries. 193 */ 194 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; 195}; 196 197enum io_uring_cmd_flags { 198 IO_URING_F_NONBLOCK = 1, 199 IO_URING_F_COMPLETE_DEFER = 2, 200}; 201 202struct io_mapped_ubuf { 203 u64 ubuf; 204 u64 ubuf_end; 205 unsigned int nr_bvecs; 206 unsigned long acct_pages; 207 struct bio_vec bvec[]; 208}; 209 210struct io_ring_ctx; 211 212struct io_overflow_cqe { 213 struct io_uring_cqe cqe; 214 struct list_head list; 215}; 216 217struct io_fixed_file { 218 /* file * with additional FFS_* flags */ 219 unsigned long file_ptr; 220}; 221 222struct io_rsrc_put { 223 struct list_head list; 224 u64 tag; 225 union { 226 void *rsrc; 227 struct file *file; 228 struct io_mapped_ubuf *buf; 229 }; 230}; 231 232struct io_file_table { 233 struct io_fixed_file *files; 234}; 235 236struct io_rsrc_node { 237 struct percpu_ref refs; 238 struct list_head node; 239 struct list_head rsrc_list; 240 struct io_rsrc_data *rsrc_data; 241 struct llist_node llist; 242 bool done; 243}; 244 245typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); 246 247struct io_rsrc_data { 248 struct io_ring_ctx *ctx; 249 250 u64 **tags; 251 unsigned int nr; 252 rsrc_put_fn *do_put; 253 atomic_t refs; 254 struct completion done; 255 bool quiesce; 256}; 257 258struct io_buffer { 259 struct list_head list; 260 __u64 addr; 261 __u32 len; 262 __u16 bid; 263}; 264 265struct io_restriction { 266 DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); 267 DECLARE_BITMAP(sqe_op, IORING_OP_LAST); 268 u8 sqe_flags_allowed; 269 u8 sqe_flags_required; 270 bool registered; 271}; 272 273enum { 274 IO_SQ_THREAD_SHOULD_STOP = 0, 275 IO_SQ_THREAD_SHOULD_PARK, 276}; 277 278struct io_sq_data { 279 refcount_t refs; 280 atomic_t park_pending; 281 struct mutex lock; 282 283 /* ctx's that are using this sqd */ 284 struct list_head ctx_list; 285 286 struct task_struct *thread; 287 struct wait_queue_head wait; 288 289 unsigned sq_thread_idle; 290 int sq_cpu; 291 pid_t task_pid; 292 pid_t task_tgid; 293 294 unsigned long state; 295 struct completion exited; 296}; 297 298#define IO_COMPL_BATCH 32 299#define IO_REQ_CACHE_SIZE 32 300#define IO_REQ_ALLOC_BATCH 8 301 302struct io_submit_link { 303 struct io_kiocb 
*head; 304 struct io_kiocb *last; 305}; 306 307struct io_submit_state { 308 struct blk_plug plug; 309 struct io_submit_link link; 310 311 /* 312 * io_kiocb alloc cache 313 */ 314 void *reqs[IO_REQ_CACHE_SIZE]; 315 unsigned int free_reqs; 316 317 bool plug_started; 318 319 /* 320 * Batch completion logic 321 */ 322 struct io_kiocb *compl_reqs[IO_COMPL_BATCH]; 323 unsigned int compl_nr; 324 /* inline/task_work completion list, under ->uring_lock */ 325 struct list_head free_list; 326 327 unsigned int ios_left; 328}; 329 330struct io_ring_ctx { 331 /* const or read-mostly hot data */ 332 struct { 333 struct percpu_ref refs; 334 335 struct io_rings *rings; 336 unsigned int flags; 337 unsigned int compat: 1; 338 unsigned int drain_next: 1; 339 unsigned int eventfd_async: 1; 340 unsigned int restricted: 1; 341 unsigned int off_timeout_used: 1; 342 unsigned int drain_active: 1; 343 } ____cacheline_aligned_in_smp; 344 345 /* submission data */ 346 struct { 347 struct mutex uring_lock; 348 349 /* 350 * Ring buffer of indices into array of io_uring_sqe, which is 351 * mmapped by the application using the IORING_OFF_SQES offset. 352 * 353 * This indirection could e.g. be used to assign fixed 354 * io_uring_sqe entries to operations and only submit them to 355 * the queue when needed. 356 * 357 * The kernel modifies neither the indices array nor the entries 358 * array. 359 */ 360 u32 *sq_array; 361 struct io_uring_sqe *sq_sqes; 362 unsigned cached_sq_head; 363 unsigned sq_entries; 364 struct list_head defer_list; 365 366 /* 367 * Fixed resources fast path, should be accessed only under 368 * uring_lock, and updated through io_uring_register(2) 369 */ 370 struct io_rsrc_node *rsrc_node; 371 struct io_file_table file_table; 372 unsigned nr_user_files; 373 unsigned nr_user_bufs; 374 struct io_mapped_ubuf **user_bufs; 375 376 struct io_submit_state submit_state; 377 struct list_head timeout_list; 378 struct list_head ltimeout_list; 379 struct list_head cq_overflow_list; 380 struct xarray io_buffers; 381 struct xarray personalities; 382 u32 pers_next; 383 unsigned sq_thread_idle; 384 } ____cacheline_aligned_in_smp; 385 386 /* IRQ completion list, under ->completion_lock */ 387 struct list_head locked_free_list; 388 unsigned int locked_free_nr; 389 390 const struct cred *sq_creds; /* cred used for __io_sq_thread() */ 391 struct io_sq_data *sq_data; /* if using sq thread polling */ 392 393 struct wait_queue_head sqo_sq_wait; 394 struct list_head sqd_list; 395 396 unsigned long check_cq_overflow; 397 398 struct { 399 unsigned cached_cq_tail; 400 unsigned cq_entries; 401 struct eventfd_ctx *cq_ev_fd; 402 struct wait_queue_head poll_wait; 403 struct wait_queue_head cq_wait; 404 unsigned cq_extra; 405 atomic_t cq_timeouts; 406 struct fasync_struct *cq_fasync; 407 unsigned cq_last_tm_flush; 408 } ____cacheline_aligned_in_smp; 409 410 struct { 411 spinlock_t completion_lock; 412 413 spinlock_t timeout_lock; 414 415 /* 416 * ->iopoll_list is protected by the ctx->uring_lock for 417 * io_uring instances that don't use IORING_SETUP_SQPOLL. 418 * For SQPOLL, only the single threaded io_sq_thread() will 419 * manipulate the list, hence no extra locking is needed there. 
420 */ 421 struct list_head iopoll_list; 422 struct hlist_head *cancel_hash; 423 unsigned cancel_hash_bits; 424 bool poll_multi_queue; 425 } ____cacheline_aligned_in_smp; 426 427 struct io_restriction restrictions; 428 429 /* slow path rsrc auxilary data, used by update/register */ 430 struct { 431 struct io_rsrc_node *rsrc_backup_node; 432 struct io_mapped_ubuf *dummy_ubuf; 433 struct io_rsrc_data *file_data; 434 struct io_rsrc_data *buf_data; 435 436 struct delayed_work rsrc_put_work; 437 struct llist_head rsrc_put_llist; 438 struct list_head rsrc_ref_list; 439 spinlock_t rsrc_ref_lock; 440 }; 441 442 /* Keep this last, we don't need it for the fast path */ 443 struct { 444 #if defined(CONFIG_UNIX) 445 struct socket *ring_sock; 446 #endif 447 /* hashed buffered write serialization */ 448 struct io_wq_hash *hash_map; 449 450 /* Only used for accounting purposes */ 451 struct user_struct *user; 452 struct mm_struct *mm_account; 453 454 /* ctx exit and cancelation */ 455 struct llist_head fallback_llist; 456 struct delayed_work fallback_work; 457 struct work_struct exit_work; 458 struct list_head tctx_list; 459 struct completion ref_comp; 460 }; 461}; 462 463struct io_uring_task { 464 /* submission side */ 465 int cached_refs; 466 struct xarray xa; 467 struct wait_queue_head wait; 468 const struct io_ring_ctx *last; 469 struct io_wq *io_wq; 470 struct percpu_counter inflight; 471 atomic_t inflight_tracked; 472 atomic_t in_idle; 473 474 spinlock_t task_lock; 475 struct io_wq_work_list task_list; 476 struct callback_head task_work; 477 bool task_running; 478}; 479 480/* 481 * First field must be the file pointer in all the 482 * iocb unions! See also 'struct kiocb' in <linux/fs.h> 483 */ 484struct io_poll_iocb { 485 struct file *file; 486 struct wait_queue_head *head; 487 __poll_t events; 488 bool done; 489 bool canceled; 490 struct wait_queue_entry wait; 491}; 492 493struct io_poll_update { 494 struct file *file; 495 u64 old_user_data; 496 u64 new_user_data; 497 __poll_t events; 498 bool update_events; 499 bool update_user_data; 500}; 501 502struct io_close { 503 struct file *file; 504 int fd; 505}; 506 507struct io_timeout_data { 508 struct io_kiocb *req; 509 struct hrtimer timer; 510 struct timespec64 ts; 511 enum hrtimer_mode mode; 512 u32 flags; 513}; 514 515struct io_accept { 516 struct file *file; 517 struct sockaddr __user *addr; 518 int __user *addr_len; 519 int flags; 520 u32 file_slot; 521 unsigned long nofile; 522}; 523 524struct io_sync { 525 struct file *file; 526 loff_t len; 527 loff_t off; 528 int flags; 529 int mode; 530}; 531 532struct io_cancel { 533 struct file *file; 534 u64 addr; 535}; 536 537struct io_timeout { 538 struct file *file; 539 u32 off; 540 u32 target_seq; 541 struct list_head list; 542 /* head of the link, used by linked timeouts only */ 543 struct io_kiocb *head; 544 /* for linked completions */ 545 struct io_kiocb *prev; 546}; 547 548struct io_timeout_rem { 549 struct file *file; 550 u64 addr; 551 552 /* timeout update */ 553 struct timespec64 ts; 554 u32 flags; 555 bool ltimeout; 556}; 557 558struct io_rw { 559 /* NOTE: kiocb has the file as the first member, so don't do it here */ 560 struct kiocb kiocb; 561 u64 addr; 562 u64 len; 563}; 564 565struct io_connect { 566 struct file *file; 567 struct sockaddr __user *addr; 568 int addr_len; 569}; 570 571struct io_sr_msg { 572 struct file *file; 573 union { 574 struct compat_msghdr __user *umsg_compat; 575 struct user_msghdr __user *umsg; 576 void __user *buf; 577 }; 578 int msg_flags; 579 int bgid; 580 size_t 
len; 581 struct io_buffer *kbuf; 582}; 583 584struct io_open { 585 struct file *file; 586 int dfd; 587 u32 file_slot; 588 struct filename *filename; 589 struct open_how how; 590 unsigned long nofile; 591}; 592 593struct io_rsrc_update { 594 struct file *file; 595 u64 arg; 596 u32 nr_args; 597 u32 offset; 598}; 599 600struct io_fadvise { 601 struct file *file; 602 u64 offset; 603 u32 len; 604 u32 advice; 605}; 606 607struct io_madvise { 608 struct file *file; 609 u64 addr; 610 u32 len; 611 u32 advice; 612}; 613 614struct io_epoll { 615 struct file *file; 616 int epfd; 617 int op; 618 int fd; 619 struct epoll_event event; 620}; 621 622struct io_splice { 623 struct file *file_out; 624 struct file *file_in; 625 loff_t off_out; 626 loff_t off_in; 627 u64 len; 628 unsigned int flags; 629}; 630 631struct io_provide_buf { 632 struct file *file; 633 __u64 addr; 634 __u32 len; 635 __u32 bgid; 636 __u16 nbufs; 637 __u16 bid; 638}; 639 640struct io_statx { 641 struct file *file; 642 int dfd; 643 unsigned int mask; 644 unsigned int flags; 645 const char __user *filename; 646 struct statx __user *buffer; 647}; 648 649struct io_shutdown { 650 struct file *file; 651 int how; 652}; 653 654struct io_rename { 655 struct file *file; 656 int old_dfd; 657 int new_dfd; 658 struct filename *oldpath; 659 struct filename *newpath; 660 int flags; 661}; 662 663struct io_unlink { 664 struct file *file; 665 int dfd; 666 int flags; 667 struct filename *filename; 668}; 669 670struct io_mkdir { 671 struct file *file; 672 int dfd; 673 umode_t mode; 674 struct filename *filename; 675}; 676 677struct io_symlink { 678 struct file *file; 679 int new_dfd; 680 struct filename *oldpath; 681 struct filename *newpath; 682}; 683 684struct io_hardlink { 685 struct file *file; 686 int old_dfd; 687 int new_dfd; 688 struct filename *oldpath; 689 struct filename *newpath; 690 int flags; 691}; 692 693struct io_completion { 694 struct file *file; 695 u32 cflags; 696}; 697 698struct io_async_connect { 699 struct sockaddr_storage address; 700}; 701 702struct io_async_msghdr { 703 struct iovec fast_iov[UIO_FASTIOV]; 704 /* points to an allocated iov, if NULL we use fast_iov instead */ 705 struct iovec *free_iov; 706 struct sockaddr __user *uaddr; 707 struct msghdr msg; 708 struct sockaddr_storage addr; 709}; 710 711struct io_async_rw { 712 struct iovec fast_iov[UIO_FASTIOV]; 713 const struct iovec *free_iovec; 714 struct iov_iter iter; 715 size_t bytes_done; 716 struct wait_page_queue wpq; 717}; 718 719enum { 720 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, 721 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, 722 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, 723 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, 724 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, 725 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, 726 727 /* first byte is taken by user flags, shift it to not overlap */ 728 REQ_F_FAIL_BIT = 8, 729 REQ_F_INFLIGHT_BIT, 730 REQ_F_CUR_POS_BIT, 731 REQ_F_NOWAIT_BIT, 732 REQ_F_LINK_TIMEOUT_BIT, 733 REQ_F_NEED_CLEANUP_BIT, 734 REQ_F_POLLED_BIT, 735 REQ_F_BUFFER_SELECTED_BIT, 736 REQ_F_COMPLETE_INLINE_BIT, 737 REQ_F_REISSUE_BIT, 738 REQ_F_DONT_REISSUE_BIT, 739 REQ_F_CREDS_BIT, 740 REQ_F_REFCOUNT_BIT, 741 REQ_F_ARM_LTIMEOUT_BIT, 742 /* keep async read/write and isreg together and in order */ 743 REQ_F_NOWAIT_READ_BIT, 744 REQ_F_NOWAIT_WRITE_BIT, 745 REQ_F_ISREG_BIT, 746 747 /* not a real bit, just to check we're not overflowing the space */ 748 __REQ_F_LAST_BIT, 749}; 750 751enum { 752 /* ctx owns file */ 753 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), 754 /* 
drain existing IO first */ 755 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), 756 /* linked sqes */ 757 REQ_F_LINK = BIT(REQ_F_LINK_BIT), 758 /* doesn't sever on completion < 0 */ 759 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), 760 /* IOSQE_ASYNC */ 761 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), 762 /* IOSQE_BUFFER_SELECT */ 763 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), 764 765 /* fail rest of links */ 766 REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), 767 /* on inflight list, should be cancelled and waited on exit reliably */ 768 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), 769 /* read/write uses file position */ 770 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), 771 /* must not punt to workers */ 772 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), 773 /* has or had linked timeout */ 774 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), 775 /* needs cleanup */ 776 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), 777 /* already went through poll handler */ 778 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), 779 /* buffer already selected */ 780 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), 781 /* completion is deferred through io_comp_state */ 782 REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), 783 /* caller should reissue async */ 784 REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), 785 /* don't attempt request reissue, see io_rw_reissue() */ 786 REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT), 787 /* supports async reads */ 788 REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT), 789 /* supports async writes */ 790 REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT), 791 /* regular file */ 792 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), 793 /* has creds assigned */ 794 REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), 795 /* skip refcounting if not set */ 796 REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), 797 /* there is a linked timeout that has to be armed */ 798 REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), 799}; 800 801struct async_poll { 802 struct io_poll_iocb poll; 803 struct io_poll_iocb *double_poll; 804}; 805 806typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); 807 808struct io_task_work { 809 union { 810 struct io_wq_work_node node; 811 struct llist_node fallback_node; 812 }; 813 io_req_tw_func_t func; 814}; 815 816enum { 817 IORING_RSRC_FILE = 0, 818 IORING_RSRC_BUFFER = 1, 819}; 820 821/* 822 * NOTE! Each of the iocb union members has the file pointer 823 * as the first entry in their struct definition. So you can 824 * access the file pointer through any of the sub-structs, 825 * or directly as just 'ki_filp' in this struct. 
826 */ 827struct io_kiocb { 828 union { 829 struct file *file; 830 struct io_rw rw; 831 struct io_poll_iocb poll; 832 struct io_poll_update poll_update; 833 struct io_accept accept; 834 struct io_sync sync; 835 struct io_cancel cancel; 836 struct io_timeout timeout; 837 struct io_timeout_rem timeout_rem; 838 struct io_connect connect; 839 struct io_sr_msg sr_msg; 840 struct io_open open; 841 struct io_close close; 842 struct io_rsrc_update rsrc_update; 843 struct io_fadvise fadvise; 844 struct io_madvise madvise; 845 struct io_epoll epoll; 846 struct io_splice splice; 847 struct io_provide_buf pbuf; 848 struct io_statx statx; 849 struct io_shutdown shutdown; 850 struct io_rename rename; 851 struct io_unlink unlink; 852 struct io_mkdir mkdir; 853 struct io_symlink symlink; 854 struct io_hardlink hardlink; 855 /* use only after cleaning per-op data, see io_clean_op() */ 856 struct io_completion compl; 857 }; 858 859 /* opcode allocated if it needs to store data for async defer */ 860 void *async_data; 861 u8 opcode; 862 /* polled IO has completed */ 863 u8 iopoll_completed; 864 865 u16 buf_index; 866 u32 result; 867 868 struct io_ring_ctx *ctx; 869 unsigned int flags; 870 atomic_t refs; 871 struct task_struct *task; 872 u64 user_data; 873 874 struct io_kiocb *link; 875 struct percpu_ref *fixed_rsrc_refs; 876 877 /* used with ctx->iopoll_list with reads/writes */ 878 struct list_head inflight_entry; 879 struct io_task_work io_task_work; 880 /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ 881 struct hlist_node hash_node; 882 struct async_poll *apoll; 883 struct io_wq_work work; 884 const struct cred *creds; 885 886 /* store used ubuf, so we can prevent reloading */ 887 struct io_mapped_ubuf *imu; 888}; 889 890struct io_tctx_node { 891 struct list_head ctx_node; 892 struct task_struct *task; 893 struct io_ring_ctx *ctx; 894}; 895 896struct io_defer_entry { 897 struct list_head list; 898 struct io_kiocb *req; 899 u32 seq; 900}; 901 902struct io_op_def { 903 /* needs req->file assigned */ 904 unsigned needs_file : 1; 905 /* hash wq insertion if file is a regular file */ 906 unsigned hash_reg_file : 1; 907 /* unbound wq insertion if file is a non-regular file */ 908 unsigned unbound_nonreg_file : 1; 909 /* opcode is not supported by this kernel */ 910 unsigned not_supported : 1; 911 /* set if opcode supports polled "wait" */ 912 unsigned pollin : 1; 913 unsigned pollout : 1; 914 /* op supports buffer selection */ 915 unsigned buffer_select : 1; 916 /* do prep async if is going to be punted */ 917 unsigned needs_async_setup : 1; 918 /* should block plug */ 919 unsigned plug : 1; 920 /* size of async data needed, if any */ 921 unsigned short async_size; 922}; 923 924static const struct io_op_def io_op_defs[] = { 925 [IORING_OP_NOP] = {}, 926 [IORING_OP_READV] = { 927 .needs_file = 1, 928 .unbound_nonreg_file = 1, 929 .pollin = 1, 930 .buffer_select = 1, 931 .needs_async_setup = 1, 932 .plug = 1, 933 .async_size = sizeof(struct io_async_rw), 934 }, 935 [IORING_OP_WRITEV] = { 936 .needs_file = 1, 937 .hash_reg_file = 1, 938 .unbound_nonreg_file = 1, 939 .pollout = 1, 940 .needs_async_setup = 1, 941 .plug = 1, 942 .async_size = sizeof(struct io_async_rw), 943 }, 944 [IORING_OP_FSYNC] = { 945 .needs_file = 1, 946 }, 947 [IORING_OP_READ_FIXED] = { 948 .needs_file = 1, 949 .unbound_nonreg_file = 1, 950 .pollin = 1, 951 .plug = 1, 952 .async_size = sizeof(struct io_async_rw), 953 }, 954 [IORING_OP_WRITE_FIXED] = { 955 .needs_file = 1, 956 .hash_reg_file = 1, 957 .unbound_nonreg_file 
= 1, 958 .pollout = 1, 959 .plug = 1, 960 .async_size = sizeof(struct io_async_rw), 961 }, 962 [IORING_OP_POLL_ADD] = { 963 .needs_file = 1, 964 .unbound_nonreg_file = 1, 965 }, 966 [IORING_OP_POLL_REMOVE] = {}, 967 [IORING_OP_SYNC_FILE_RANGE] = { 968 .needs_file = 1, 969 }, 970 [IORING_OP_SENDMSG] = { 971 .needs_file = 1, 972 .unbound_nonreg_file = 1, 973 .pollout = 1, 974 .needs_async_setup = 1, 975 .async_size = sizeof(struct io_async_msghdr), 976 }, 977 [IORING_OP_RECVMSG] = { 978 .needs_file = 1, 979 .unbound_nonreg_file = 1, 980 .pollin = 1, 981 .buffer_select = 1, 982 .needs_async_setup = 1, 983 .async_size = sizeof(struct io_async_msghdr), 984 }, 985 [IORING_OP_TIMEOUT] = { 986 .async_size = sizeof(struct io_timeout_data), 987 }, 988 [IORING_OP_TIMEOUT_REMOVE] = { 989 /* used by timeout updates' prep() */ 990 }, 991 [IORING_OP_ACCEPT] = { 992 .needs_file = 1, 993 .unbound_nonreg_file = 1, 994 .pollin = 1, 995 }, 996 [IORING_OP_ASYNC_CANCEL] = {}, 997 [IORING_OP_LINK_TIMEOUT] = { 998 .async_size = sizeof(struct io_timeout_data), 999 }, 1000 [IORING_OP_CONNECT] = { 1001 .needs_file = 1, 1002 .unbound_nonreg_file = 1, 1003 .pollout = 1, 1004 .needs_async_setup = 1, 1005 .async_size = sizeof(struct io_async_connect), 1006 }, 1007 [IORING_OP_FALLOCATE] = { 1008 .needs_file = 1, 1009 }, 1010 [IORING_OP_OPENAT] = {}, 1011 [IORING_OP_CLOSE] = {}, 1012 [IORING_OP_FILES_UPDATE] = {}, 1013 [IORING_OP_STATX] = {}, 1014 [IORING_OP_READ] = { 1015 .needs_file = 1, 1016 .unbound_nonreg_file = 1, 1017 .pollin = 1, 1018 .buffer_select = 1, 1019 .plug = 1, 1020 .async_size = sizeof(struct io_async_rw), 1021 }, 1022 [IORING_OP_WRITE] = { 1023 .needs_file = 1, 1024 .hash_reg_file = 1, 1025 .unbound_nonreg_file = 1, 1026 .pollout = 1, 1027 .plug = 1, 1028 .async_size = sizeof(struct io_async_rw), 1029 }, 1030 [IORING_OP_FADVISE] = { 1031 .needs_file = 1, 1032 }, 1033 [IORING_OP_MADVISE] = {}, 1034 [IORING_OP_SEND] = { 1035 .needs_file = 1, 1036 .unbound_nonreg_file = 1, 1037 .pollout = 1, 1038 }, 1039 [IORING_OP_RECV] = { 1040 .needs_file = 1, 1041 .unbound_nonreg_file = 1, 1042 .pollin = 1, 1043 .buffer_select = 1, 1044 }, 1045 [IORING_OP_OPENAT2] = { 1046 }, 1047 [IORING_OP_EPOLL_CTL] = { 1048 .unbound_nonreg_file = 1, 1049 }, 1050 [IORING_OP_SPLICE] = { 1051 .needs_file = 1, 1052 .hash_reg_file = 1, 1053 .unbound_nonreg_file = 1, 1054 }, 1055 [IORING_OP_PROVIDE_BUFFERS] = {}, 1056 [IORING_OP_REMOVE_BUFFERS] = {}, 1057 [IORING_OP_TEE] = { 1058 .needs_file = 1, 1059 .hash_reg_file = 1, 1060 .unbound_nonreg_file = 1, 1061 }, 1062 [IORING_OP_SHUTDOWN] = { 1063 .needs_file = 1, 1064 }, 1065 [IORING_OP_RENAMEAT] = {}, 1066 [IORING_OP_UNLINKAT] = {}, 1067 [IORING_OP_MKDIRAT] = {}, 1068 [IORING_OP_SYMLINKAT] = {}, 1069 [IORING_OP_LINKAT] = {}, 1070}; 1071 1072/* requests with any of those set should undergo io_disarm_next() */ 1073#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 1074 1075static bool io_disarm_next(struct io_kiocb *req); 1076static void io_uring_del_tctx_node(unsigned long index); 1077static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 1078 struct task_struct *task, 1079 bool cancel_all); 1080static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 1081 1082static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, 1083 long res, unsigned int cflags); 1084static void io_put_req(struct io_kiocb *req); 1085static void io_put_req_deferred(struct io_kiocb *req); 1086static void io_dismantle_req(struct io_kiocb *req); 
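/*
 * Illustrative note (an editorial aid, not upstream text): io_op_defs[]
 * above is the per-opcode property table the submission path consults.
 * Reading the IORING_OP_READV entry as an example: needs_file and
 * unbound_nonreg_file mean the request must carry a file and may be
 * queued to the unbound io-wq list for non-regular files, pollin and
 * buffer_select mean it can wait for read readiness via poll and may use
 * a provided buffer, needs_async_setup means its iovec is imported up
 * front if the request gets punted, plug opts it into block plugging,
 * and async_size reserves sizeof(struct io_async_rw) of async data.
 */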
1087static void io_queue_linked_timeout(struct io_kiocb *req); 1088static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, 1089 struct io_uring_rsrc_update2 *up, 1090 unsigned nr_args); 1091static void io_clean_op(struct io_kiocb *req); 1092static struct file *io_file_get(struct io_ring_ctx *ctx, 1093 struct io_kiocb *req, int fd, bool fixed); 1094static void __io_queue_sqe(struct io_kiocb *req); 1095static void io_rsrc_put_work(struct work_struct *work); 1096 1097static void io_req_task_queue(struct io_kiocb *req); 1098static void io_submit_flush_completions(struct io_ring_ctx *ctx); 1099static int io_req_prep_async(struct io_kiocb *req); 1100 1101static int io_install_fixed_file(struct io_kiocb *req, struct file *file, 1102 unsigned int issue_flags, u32 slot_index); 1103static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); 1104 1105static struct kmem_cache *req_cachep; 1106 1107static const struct file_operations io_uring_fops; 1108 1109struct sock *io_uring_get_socket(struct file *file) 1110{ 1111#if defined(CONFIG_UNIX) 1112 if (file->f_op == &io_uring_fops) { 1113 struct io_ring_ctx *ctx = file->private_data; 1114 1115 return ctx->ring_sock->sk; 1116 } 1117#endif 1118 return NULL; 1119} 1120EXPORT_SYMBOL(io_uring_get_socket); 1121 1122static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) 1123{ 1124 if (!*locked) { 1125 mutex_lock(&ctx->uring_lock); 1126 *locked = true; 1127 } 1128} 1129 1130#define io_for_each_link(pos, head) \ 1131 for (pos = (head); pos; pos = pos->link) 1132 1133/* 1134 * Shamelessly stolen from the mm implementation of page reference checking, 1135 * see commit f958d7b528b1 for details. 1136 */ 1137#define req_ref_zero_or_close_to_overflow(req) \ 1138 ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) 1139 1140static inline bool req_ref_inc_not_zero(struct io_kiocb *req) 1141{ 1142 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 1143 return atomic_inc_not_zero(&req->refs); 1144} 1145 1146static inline bool req_ref_put_and_test(struct io_kiocb *req) 1147{ 1148 if (likely(!(req->flags & REQ_F_REFCOUNT))) 1149 return true; 1150 1151 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 1152 return atomic_dec_and_test(&req->refs); 1153} 1154 1155static inline void req_ref_put(struct io_kiocb *req) 1156{ 1157 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 1158 WARN_ON_ONCE(req_ref_put_and_test(req)); 1159} 1160 1161static inline void req_ref_get(struct io_kiocb *req) 1162{ 1163 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 1164 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 1165 atomic_inc(&req->refs); 1166} 1167 1168static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) 1169{ 1170 if (!(req->flags & REQ_F_REFCOUNT)) { 1171 req->flags |= REQ_F_REFCOUNT; 1172 atomic_set(&req->refs, nr); 1173 } 1174} 1175 1176static inline void io_req_set_refcount(struct io_kiocb *req) 1177{ 1178 __io_req_set_refcount(req, 1); 1179} 1180 1181static inline void io_req_set_rsrc_node(struct io_kiocb *req) 1182{ 1183 struct io_ring_ctx *ctx = req->ctx; 1184 1185 if (!req->fixed_rsrc_refs) { 1186 req->fixed_rsrc_refs = &ctx->rsrc_node->refs; 1187 percpu_ref_get(req->fixed_rsrc_refs); 1188 } 1189} 1190 1191static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) 1192{ 1193 bool got = percpu_ref_tryget(ref); 1194 1195 /* already at zero, wait for ->release() */ 1196 if (!got) 1197 wait_for_completion(compl); 1198 percpu_ref_resurrect(ref); 1199 if (got) 1200 percpu_ref_put(ref); 1201} 1202 
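/*
 * Illustrative sketch (an editorial aid, not upstream code): the
 * REQ_F_REFCOUNT helpers above only turn a request into a refcounted
 * object on demand.  req_ref_zero_or_close_to_overflow() borrows the
 * page-refcount trick from commit f958d7b528b1: reading the count as an
 * unsigned int and adding 127 flags both "already zero" and "about to
 * overflow" values, e.g.
 *
 *	refs == 0        ->  0 + 127 == 127        -> flagged
 *	refs == 1        ->  1 + 127 == 128        -> not flagged
 *	refs == (u32)-5  ->  (u32)-5 + 127 == 122  -> flagged
 *
 * The compiled-out function below is a hypothetical call sequence showing
 * how the helpers compose once a request opts in via io_req_set_refcount().
 */
#if 0
static void example_req_refcount_usage(struct io_kiocb *req)
{
	io_req_set_refcount(req);	/* sets REQ_F_REFCOUNT, refs = 1 */
	req_ref_get(req);		/* refs = 2, e.g. while poll is armed */

	if (req_ref_put_and_test(req))	/* refs = 1, not the last reference */
		WARN_ON_ONCE(1);

	if (req_ref_put_and_test(req))	/* refs = 0, last reference dropped */
		; /* real callers free or complete the request here */
}
#endif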
1203static bool io_match_task(struct io_kiocb *head, struct task_struct *task, 1204 bool cancel_all) 1205{ 1206 struct io_kiocb *req; 1207 1208 if (task && head->task != task) 1209 return false; 1210 if (cancel_all) 1211 return true; 1212 1213 io_for_each_link(req, head) { 1214 if (req->flags & REQ_F_INFLIGHT) 1215 return true; 1216 } 1217 return false; 1218} 1219 1220static inline void req_set_fail(struct io_kiocb *req) 1221{ 1222 req->flags |= REQ_F_FAIL; 1223} 1224 1225static inline void req_fail_link_node(struct io_kiocb *req, int res) 1226{ 1227 req_set_fail(req); 1228 req->result = res; 1229} 1230 1231static void io_ring_ctx_ref_free(struct percpu_ref *ref) 1232{ 1233 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); 1234 1235 complete(&ctx->ref_comp); 1236} 1237 1238static inline bool io_is_timeout_noseq(struct io_kiocb *req) 1239{ 1240 return !req->timeout.off; 1241} 1242 1243static void io_fallback_req_func(struct work_struct *work) 1244{ 1245 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, 1246 fallback_work.work); 1247 struct llist_node *node = llist_del_all(&ctx->fallback_llist); 1248 struct io_kiocb *req, *tmp; 1249 bool locked = false; 1250 1251 percpu_ref_get(&ctx->refs); 1252 llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) 1253 req->io_task_work.func(req, &locked); 1254 1255 if (locked) { 1256 if (ctx->submit_state.compl_nr) 1257 io_submit_flush_completions(ctx); 1258 mutex_unlock(&ctx->uring_lock); 1259 } 1260 percpu_ref_put(&ctx->refs); 1261 1262} 1263 1264static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) 1265{ 1266 struct io_ring_ctx *ctx; 1267 int hash_bits; 1268 1269 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1270 if (!ctx) 1271 return NULL; 1272 1273 /* 1274 * Use 5 bits less than the max cq entries, that should give us around 1275 * 32 entries per hash list if totally full and uniformly spread. 
1276 */ 1277 hash_bits = ilog2(p->cq_entries); 1278 hash_bits -= 5; 1279 if (hash_bits <= 0) 1280 hash_bits = 1; 1281 ctx->cancel_hash_bits = hash_bits; 1282 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), 1283 GFP_KERNEL); 1284 if (!ctx->cancel_hash) 1285 goto err; 1286 __hash_init(ctx->cancel_hash, 1U << hash_bits); 1287 1288 ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); 1289 if (!ctx->dummy_ubuf) 1290 goto err; 1291 /* set invalid range, so io_import_fixed() fails meeting it */ 1292 ctx->dummy_ubuf->ubuf = -1UL; 1293 1294 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 1295 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 1296 goto err; 1297 1298 ctx->flags = p->flags; 1299 init_waitqueue_head(&ctx->sqo_sq_wait); 1300 INIT_LIST_HEAD(&ctx->sqd_list); 1301 init_waitqueue_head(&ctx->poll_wait); 1302 INIT_LIST_HEAD(&ctx->cq_overflow_list); 1303 init_completion(&ctx->ref_comp); 1304 xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); 1305 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); 1306 mutex_init(&ctx->uring_lock); 1307 init_waitqueue_head(&ctx->cq_wait); 1308 spin_lock_init(&ctx->completion_lock); 1309 spin_lock_init(&ctx->timeout_lock); 1310 INIT_LIST_HEAD(&ctx->iopoll_list); 1311 INIT_LIST_HEAD(&ctx->defer_list); 1312 INIT_LIST_HEAD(&ctx->timeout_list); 1313 INIT_LIST_HEAD(&ctx->ltimeout_list); 1314 spin_lock_init(&ctx->rsrc_ref_lock); 1315 INIT_LIST_HEAD(&ctx->rsrc_ref_list); 1316 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); 1317 init_llist_head(&ctx->rsrc_put_llist); 1318 INIT_LIST_HEAD(&ctx->tctx_list); 1319 INIT_LIST_HEAD(&ctx->submit_state.free_list); 1320 INIT_LIST_HEAD(&ctx->locked_free_list); 1321 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); 1322 return ctx; 1323err: 1324 kfree(ctx->dummy_ubuf); 1325 kfree(ctx->cancel_hash); 1326 kfree(ctx); 1327 return NULL; 1328} 1329 1330static void io_account_cq_overflow(struct io_ring_ctx *ctx) 1331{ 1332 struct io_rings *r = ctx->rings; 1333 1334 WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); 1335 ctx->cq_extra--; 1336} 1337 1338static bool req_need_defer(struct io_kiocb *req, u32 seq) 1339{ 1340 if (unlikely(req->flags & REQ_F_IO_DRAIN)) { 1341 struct io_ring_ctx *ctx = req->ctx; 1342 1343 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; 1344 } 1345 1346 return false; 1347} 1348 1349#define FFS_ASYNC_READ 0x1UL 1350#define FFS_ASYNC_WRITE 0x2UL 1351#ifdef CONFIG_64BIT 1352#define FFS_ISREG 0x4UL 1353#else 1354#define FFS_ISREG 0x0UL 1355#endif 1356#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) 1357 1358static inline bool io_req_ffs_set(struct io_kiocb *req) 1359{ 1360 return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE); 1361} 1362 1363static void io_req_track_inflight(struct io_kiocb *req) 1364{ 1365 if (!(req->flags & REQ_F_INFLIGHT)) { 1366 req->flags |= REQ_F_INFLIGHT; 1367 atomic_inc(&current->io_uring->inflight_tracked); 1368 } 1369} 1370 1371static inline void io_unprep_linked_timeout(struct io_kiocb *req) 1372{ 1373 req->flags &= ~REQ_F_LINK_TIMEOUT; 1374} 1375 1376static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) 1377{ 1378 if (WARN_ON_ONCE(!req->link)) 1379 return NULL; 1380 1381 req->flags &= ~REQ_F_ARM_LTIMEOUT; 1382 req->flags |= REQ_F_LINK_TIMEOUT; 1383 1384 /* linked timeouts should have two refs once prep'ed */ 1385 io_req_set_refcount(req); 1386 __io_req_set_refcount(req->link, 2); 1387 return req->link; 1388} 1389 1390static inline struct io_kiocb 
*io_prep_linked_timeout(struct io_kiocb *req) 1391{ 1392 if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) 1393 return NULL; 1394 return __io_prep_linked_timeout(req); 1395} 1396 1397static void io_prep_async_work(struct io_kiocb *req) 1398{ 1399 const struct io_op_def *def = &io_op_defs[req->opcode]; 1400 struct io_ring_ctx *ctx = req->ctx; 1401 1402 if (!(req->flags & REQ_F_CREDS)) { 1403 req->flags |= REQ_F_CREDS; 1404 req->creds = get_current_cred(); 1405 } 1406 1407 req->work.list.next = NULL; 1408 req->work.flags = 0; 1409 if (req->flags & REQ_F_FORCE_ASYNC) 1410 req->work.flags |= IO_WQ_WORK_CONCURRENT; 1411 1412 if (req->flags & REQ_F_ISREG) { 1413 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) 1414 io_wq_hash_work(&req->work, file_inode(req->file)); 1415 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { 1416 if (def->unbound_nonreg_file) 1417 req->work.flags |= IO_WQ_WORK_UNBOUND; 1418 } 1419 1420 switch (req->opcode) { 1421 case IORING_OP_SPLICE: 1422 case IORING_OP_TEE: 1423 if (!S_ISREG(file_inode(req->splice.file_in)->i_mode)) 1424 req->work.flags |= IO_WQ_WORK_UNBOUND; 1425 break; 1426 } 1427} 1428 1429static void io_prep_async_link(struct io_kiocb *req) 1430{ 1431 struct io_kiocb *cur; 1432 1433 if (req->flags & REQ_F_LINK_TIMEOUT) { 1434 struct io_ring_ctx *ctx = req->ctx; 1435 1436 spin_lock(&ctx->completion_lock); 1437 io_for_each_link(cur, req) 1438 io_prep_async_work(cur); 1439 spin_unlock(&ctx->completion_lock); 1440 } else { 1441 io_for_each_link(cur, req) 1442 io_prep_async_work(cur); 1443 } 1444} 1445 1446static void io_queue_async_work(struct io_kiocb *req, bool *locked) 1447{ 1448 struct io_ring_ctx *ctx = req->ctx; 1449 struct io_kiocb *link = io_prep_linked_timeout(req); 1450 struct io_uring_task *tctx = req->task->io_uring; 1451 1452 /* must not take the lock, NULL it as a precaution */ 1453 locked = NULL; 1454 1455 BUG_ON(!tctx); 1456 BUG_ON(!tctx->io_wq); 1457 1458 /* init ->work of the whole link before punting */ 1459 io_prep_async_link(req); 1460 1461 /* 1462 * Not expected to happen, but if we do have a bug where this _can_ 1463 * happen, catch it here and ensure the request is marked as 1464 * canceled. That will make io-wq go through the usual work cancel 1465 * procedure rather than attempt to run this request (or create a new 1466 * worker for it). 
1467 */ 1468 if (WARN_ON_ONCE(!same_thread_group(req->task, current))) 1469 req->work.flags |= IO_WQ_WORK_CANCEL; 1470 1471 trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, 1472 &req->work, req->flags); 1473 io_wq_enqueue(tctx->io_wq, &req->work); 1474 if (link) 1475 io_queue_linked_timeout(link); 1476} 1477 1478static void io_kill_timeout(struct io_kiocb *req, int status) 1479 __must_hold(&req->ctx->completion_lock) 1480 __must_hold(&req->ctx->timeout_lock) 1481{ 1482 struct io_timeout_data *io = req->async_data; 1483 1484 if (hrtimer_try_to_cancel(&io->timer) != -1) { 1485 if (status) 1486 req_set_fail(req); 1487 atomic_set(&req->ctx->cq_timeouts, 1488 atomic_read(&req->ctx->cq_timeouts) + 1); 1489 list_del_init(&req->timeout.list); 1490 io_cqring_fill_event(req->ctx, req->user_data, status, 0); 1491 io_put_req_deferred(req); 1492 } 1493} 1494 1495static void io_queue_deferred(struct io_ring_ctx *ctx) 1496{ 1497 while (!list_empty(&ctx->defer_list)) { 1498 struct io_defer_entry *de = list_first_entry(&ctx->defer_list, 1499 struct io_defer_entry, list); 1500 1501 if (req_need_defer(de->req, de->seq)) 1502 break; 1503 list_del_init(&de->list); 1504 io_req_task_queue(de->req); 1505 kfree(de); 1506 } 1507} 1508 1509static void io_flush_timeouts(struct io_ring_ctx *ctx) 1510 __must_hold(&ctx->completion_lock) 1511{ 1512 u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 1513 1514 spin_lock_irq(&ctx->timeout_lock); 1515 while (!list_empty(&ctx->timeout_list)) { 1516 u32 events_needed, events_got; 1517 struct io_kiocb *req = list_first_entry(&ctx->timeout_list, 1518 struct io_kiocb, timeout.list); 1519 1520 if (io_is_timeout_noseq(req)) 1521 break; 1522 1523 /* 1524 * Since seq can easily wrap around over time, subtract 1525 * the last seq at which timeouts were flushed before comparing. 1526 * Assuming not more than 2^31-1 events have happened since, 1527 * these subtractions won't have wrapped, so we can check if 1528 * target is in [last_seq, current_seq] by comparing the two. 
1529 */ 1530 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush; 1531 events_got = seq - ctx->cq_last_tm_flush; 1532 if (events_got < events_needed) 1533 break; 1534 1535 list_del_init(&req->timeout.list); 1536 io_kill_timeout(req, 0); 1537 } 1538 ctx->cq_last_tm_flush = seq; 1539 spin_unlock_irq(&ctx->timeout_lock); 1540} 1541 1542static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) 1543{ 1544 if (ctx->off_timeout_used) 1545 io_flush_timeouts(ctx); 1546 if (ctx->drain_active) 1547 io_queue_deferred(ctx); 1548} 1549 1550static inline void io_commit_cqring(struct io_ring_ctx *ctx) 1551{ 1552 if (unlikely(ctx->off_timeout_used || ctx->drain_active)) 1553 __io_commit_cqring_flush(ctx); 1554 /* order cqe stores with ring update */ 1555 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); 1556} 1557 1558static inline bool io_sqring_full(struct io_ring_ctx *ctx) 1559{ 1560 struct io_rings *r = ctx->rings; 1561 1562 return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; 1563} 1564 1565static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) 1566{ 1567 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); 1568} 1569 1570static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) 1571{ 1572 struct io_rings *rings = ctx->rings; 1573 unsigned tail, mask = ctx->cq_entries - 1; 1574 1575 /* 1576 * writes to the cq entry need to come after reading head; the 1577 * control dependency is enough as we're using WRITE_ONCE to 1578 * fill the cq entry 1579 */ 1580 if (__io_cqring_events(ctx) == ctx->cq_entries) 1581 return NULL; 1582 1583 tail = ctx->cached_cq_tail++; 1584 return &rings->cqes[tail & mask]; 1585} 1586 1587static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) 1588{ 1589 if (likely(!ctx->cq_ev_fd)) 1590 return false; 1591 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) 1592 return false; 1593 return !ctx->eventfd_async || io_wq_current_is_worker(); 1594} 1595 1596/* 1597 * This should only get called when at least one event has been posted. 1598 * Some applications rely on the eventfd notification count only changing 1599 * IFF a new CQE has been added to the CQ ring. There's no depedency on 1600 * 1:1 relationship between how many times this function is called (and 1601 * hence the eventfd count) and number of CQEs posted to the CQ ring. 1602 */ 1603static void io_cqring_ev_posted(struct io_ring_ctx *ctx) 1604{ 1605 /* 1606 * wake_up_all() may seem excessive, but io_wake_function() and 1607 * io_should_wake() handle the termination of the loop and only 1608 * wake as many waiters as we need to. 
1609 */ 1610 if (wq_has_sleeper(&ctx->cq_wait)) 1611 wake_up_all(&ctx->cq_wait); 1612 if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) 1613 wake_up(&ctx->sq_data->wait); 1614 if (io_should_trigger_evfd(ctx)) 1615 eventfd_signal(ctx->cq_ev_fd, 1); 1616 if (waitqueue_active(&ctx->poll_wait)) { 1617 wake_up_interruptible(&ctx->poll_wait); 1618 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); 1619 } 1620} 1621 1622static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) 1623{ 1624 /* see waitqueue_active() comment */ 1625 smp_mb(); 1626 1627 if (ctx->flags & IORING_SETUP_SQPOLL) { 1628 if (waitqueue_active(&ctx->cq_wait)) 1629 wake_up_all(&ctx->cq_wait); 1630 } 1631 if (io_should_trigger_evfd(ctx)) 1632 eventfd_signal(ctx->cq_ev_fd, 1); 1633 if (waitqueue_active(&ctx->poll_wait)) { 1634 wake_up_interruptible(&ctx->poll_wait); 1635 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); 1636 } 1637} 1638 1639/* Returns true if there are no backlogged entries after the flush */ 1640static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) 1641{ 1642 bool all_flushed, posted; 1643 1644 if (!force && __io_cqring_events(ctx) == ctx->cq_entries) 1645 return false; 1646 1647 posted = false; 1648 spin_lock(&ctx->completion_lock); 1649 while (!list_empty(&ctx->cq_overflow_list)) { 1650 struct io_uring_cqe *cqe = io_get_cqe(ctx); 1651 struct io_overflow_cqe *ocqe; 1652 1653 if (!cqe && !force) 1654 break; 1655 ocqe = list_first_entry(&ctx->cq_overflow_list, 1656 struct io_overflow_cqe, list); 1657 if (cqe) 1658 memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); 1659 else 1660 io_account_cq_overflow(ctx); 1661 1662 posted = true; 1663 list_del(&ocqe->list); 1664 kfree(ocqe); 1665 } 1666 1667 all_flushed = list_empty(&ctx->cq_overflow_list); 1668 if (all_flushed) { 1669 clear_bit(0, &ctx->check_cq_overflow); 1670 WRITE_ONCE(ctx->rings->sq_flags, 1671 ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); 1672 } 1673 1674 if (posted) 1675 io_commit_cqring(ctx); 1676 spin_unlock(&ctx->completion_lock); 1677 if (posted) 1678 io_cqring_ev_posted(ctx); 1679 return all_flushed; 1680} 1681 1682static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) 1683{ 1684 bool ret = true; 1685 1686 if (test_bit(0, &ctx->check_cq_overflow)) { 1687 /* iopoll syncs against uring_lock, not completion_lock */ 1688 if (ctx->flags & IORING_SETUP_IOPOLL) 1689 mutex_lock(&ctx->uring_lock); 1690 ret = __io_cqring_overflow_flush(ctx, false); 1691 if (ctx->flags & IORING_SETUP_IOPOLL) 1692 mutex_unlock(&ctx->uring_lock); 1693 } 1694 1695 return ret; 1696} 1697 1698/* must to be called somewhat shortly after putting a request */ 1699static inline void io_put_task(struct task_struct *task, int nr) 1700{ 1701 struct io_uring_task *tctx = task->io_uring; 1702 1703 if (likely(task == current)) { 1704 tctx->cached_refs += nr; 1705 } else { 1706 percpu_counter_sub(&tctx->inflight, nr); 1707 if (unlikely(atomic_read(&tctx->in_idle))) 1708 wake_up(&tctx->wait); 1709 put_task_struct_many(task, nr); 1710 } 1711} 1712 1713static void io_task_refs_refill(struct io_uring_task *tctx) 1714{ 1715 unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; 1716 1717 percpu_counter_add(&tctx->inflight, refill); 1718 refcount_add(refill, &current->usage); 1719 tctx->cached_refs += refill; 1720} 1721 1722static inline void io_get_task_refs(int nr) 1723{ 1724 struct io_uring_task *tctx = current->io_uring; 1725 1726 tctx->cached_refs -= nr; 1727 if (unlikely(tctx->cached_refs < 0)) 1728 io_task_refs_refill(tctx); 1729} 1730 1731static bool 
io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, 1732 long res, unsigned int cflags) 1733{ 1734 struct io_overflow_cqe *ocqe; 1735 1736 ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); 1737 if (!ocqe) { 1738 /* 1739 * If we're in ring overflow flush mode, or in task cancel mode, 1740 * or cannot allocate an overflow entry, then we need to drop it 1741 * on the floor. 1742 */ 1743 io_account_cq_overflow(ctx); 1744 return false; 1745 } 1746 if (list_empty(&ctx->cq_overflow_list)) { 1747 set_bit(0, &ctx->check_cq_overflow); 1748 WRITE_ONCE(ctx->rings->sq_flags, 1749 ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); 1750 1751 } 1752 ocqe->cqe.user_data = user_data; 1753 ocqe->cqe.res = res; 1754 ocqe->cqe.flags = cflags; 1755 list_add_tail(&ocqe->list, &ctx->cq_overflow_list); 1756 return true; 1757} 1758 1759static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, 1760 long res, unsigned int cflags) 1761{ 1762 struct io_uring_cqe *cqe; 1763 1764 trace_io_uring_complete(ctx, user_data, res, cflags); 1765 1766 /* 1767 * If we can't get a cq entry, userspace overflowed the 1768 * submission (by quite a lot). Increment the overflow count in 1769 * the ring. 1770 */ 1771 cqe = io_get_cqe(ctx); 1772 if (likely(cqe)) { 1773 WRITE_ONCE(cqe->user_data, user_data); 1774 WRITE_ONCE(cqe->res, res); 1775 WRITE_ONCE(cqe->flags, cflags); 1776 return true; 1777 } 1778 return io_cqring_event_overflow(ctx, user_data, res, cflags); 1779} 1780 1781/* not as hot to bloat with inlining */ 1782static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, 1783 long res, unsigned int cflags) 1784{ 1785 return __io_cqring_fill_event(ctx, user_data, res, cflags); 1786} 1787 1788static void io_req_complete_post(struct io_kiocb *req, long res, 1789 unsigned int cflags) 1790{ 1791 struct io_ring_ctx *ctx = req->ctx; 1792 1793 spin_lock(&ctx->completion_lock); 1794 __io_cqring_fill_event(ctx, req->user_data, res, cflags); 1795 /* 1796 * If we're the last reference to this request, add to our locked 1797 * free_list cache. 
1798 */ 1799 if (req_ref_put_and_test(req)) { 1800 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 1801 if (req->flags & IO_DISARM_MASK) 1802 io_disarm_next(req); 1803 if (req->link) { 1804 io_req_task_queue(req->link); 1805 req->link = NULL; 1806 } 1807 } 1808 io_dismantle_req(req); 1809 io_put_task(req->task, 1); 1810 list_add(&req->inflight_entry, &ctx->locked_free_list); 1811 ctx->locked_free_nr++; 1812 } else { 1813 if (!percpu_ref_tryget(&ctx->refs)) 1814 req = NULL; 1815 } 1816 io_commit_cqring(ctx); 1817 spin_unlock(&ctx->completion_lock); 1818 1819 if (req) { 1820 io_cqring_ev_posted(ctx); 1821 percpu_ref_put(&ctx->refs); 1822 } 1823} 1824 1825static inline bool io_req_needs_clean(struct io_kiocb *req) 1826{ 1827 return req->flags & IO_REQ_CLEAN_FLAGS; 1828} 1829 1830static void io_req_complete_state(struct io_kiocb *req, long res, 1831 unsigned int cflags) 1832{ 1833 if (io_req_needs_clean(req)) 1834 io_clean_op(req); 1835 req->result = res; 1836 req->compl.cflags = cflags; 1837 req->flags |= REQ_F_COMPLETE_INLINE; 1838} 1839 1840static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, 1841 long res, unsigned cflags) 1842{ 1843 if (issue_flags & IO_URING_F_COMPLETE_DEFER) 1844 io_req_complete_state(req, res, cflags); 1845 else 1846 io_req_complete_post(req, res, cflags); 1847} 1848 1849static inline void io_req_complete(struct io_kiocb *req, long res) 1850{ 1851 __io_req_complete(req, 0, res, 0); 1852} 1853 1854static void io_req_complete_failed(struct io_kiocb *req, long res) 1855{ 1856 req_set_fail(req); 1857 io_req_complete_post(req, res, 0); 1858} 1859 1860static void io_req_complete_fail_submit(struct io_kiocb *req) 1861{ 1862 /* 1863 * We don't submit, fail them all, for that replace hardlinks with 1864 * normal links. Extra REQ_F_LINK is tolerated. 1865 */ 1866 req->flags &= ~REQ_F_HARDLINK; 1867 req->flags |= REQ_F_LINK; 1868 io_req_complete_failed(req, req->result); 1869} 1870 1871/* 1872 * Don't initialise the fields below on every allocation, but do that in 1873 * advance and keep them valid across allocations. 1874 */ 1875static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) 1876{ 1877 req->ctx = ctx; 1878 req->link = NULL; 1879 req->async_data = NULL; 1880 /* not necessary, but safer to zero */ 1881 req->result = 0; 1882} 1883 1884static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, 1885 struct io_submit_state *state) 1886{ 1887 spin_lock(&ctx->completion_lock); 1888 list_splice_init(&ctx->locked_free_list, &state->free_list); 1889 ctx->locked_free_nr = 0; 1890 spin_unlock(&ctx->completion_lock); 1891} 1892 1893/* Returns true IFF there are requests in the cache */ 1894static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) 1895{ 1896 struct io_submit_state *state = &ctx->submit_state; 1897 int nr; 1898 1899 /* 1900 * If we have more than a batch's worth of requests in our IRQ side 1901 * locked cache, grab the lock and move them over to our submission 1902 * side cache. 
1903 */ 1904 if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) 1905 io_flush_cached_locked_reqs(ctx, state); 1906 1907 nr = state->free_reqs; 1908 while (!list_empty(&state->free_list)) { 1909 struct io_kiocb *req = list_first_entry(&state->free_list, 1910 struct io_kiocb, inflight_entry); 1911 1912 list_del(&req->inflight_entry); 1913 state->reqs[nr++] = req; 1914 if (nr == ARRAY_SIZE(state->reqs)) 1915 break; 1916 } 1917 1918 state->free_reqs = nr; 1919 return nr != 0; 1920} 1921 1922/* 1923 * A request might get retired back into the request caches even before opcode 1924 * handlers and io_issue_sqe() are done with it, e.g. inline completion path. 1925 * Because of that, io_alloc_req() should be called only under ->uring_lock 1926 * and with extra caution to not get a request that is still worked on. 1927 */ 1928static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) 1929 __must_hold(&ctx->uring_lock) 1930{ 1931 struct io_submit_state *state = &ctx->submit_state; 1932 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 1933 int ret, i; 1934 1935 BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH); 1936 1937 if (likely(state->free_reqs || io_flush_cached_reqs(ctx))) 1938 goto got_req; 1939 1940 ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, 1941 state->reqs); 1942 1943 /* 1944 * Bulk alloc is all-or-nothing. If we fail to get a batch, 1945 * retry single alloc to be on the safe side. 1946 */ 1947 if (unlikely(ret <= 0)) { 1948 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); 1949 if (!state->reqs[0]) 1950 return NULL; 1951 ret = 1; 1952 } 1953 1954 for (i = 0; i < ret; i++) 1955 io_preinit_req(state->reqs[i], ctx); 1956 state->free_reqs = ret; 1957got_req: 1958 state->free_reqs--; 1959 return state->reqs[state->free_reqs]; 1960} 1961 1962static inline void io_put_file(struct file *file) 1963{ 1964 if (file) 1965 fput(file); 1966} 1967 1968static void io_dismantle_req(struct io_kiocb *req) 1969{ 1970 unsigned int flags = req->flags; 1971 1972 if (io_req_needs_clean(req)) 1973 io_clean_op(req); 1974 if (!(flags & REQ_F_FIXED_FILE)) 1975 io_put_file(req->file); 1976 if (req->fixed_rsrc_refs) 1977 percpu_ref_put(req->fixed_rsrc_refs); 1978 if (req->async_data) { 1979 kfree(req->async_data); 1980 req->async_data = NULL; 1981 } 1982} 1983 1984static void __io_free_req(struct io_kiocb *req) 1985{ 1986 struct io_ring_ctx *ctx = req->ctx; 1987 1988 io_dismantle_req(req); 1989 io_put_task(req->task, 1); 1990 1991 spin_lock(&ctx->completion_lock); 1992 list_add(&req->inflight_entry, &ctx->locked_free_list); 1993 ctx->locked_free_nr++; 1994 spin_unlock(&ctx->completion_lock); 1995 1996 percpu_ref_put(&ctx->refs); 1997} 1998 1999static inline void io_remove_next_linked(struct io_kiocb *req) 2000{ 2001 struct io_kiocb *nxt = req->link; 2002 2003 req->link = nxt->link; 2004 nxt->link = NULL; 2005} 2006 2007static bool io_kill_linked_timeout(struct io_kiocb *req) 2008 __must_hold(&req->ctx->completion_lock) 2009 __must_hold(&req->ctx->timeout_lock) 2010{ 2011 struct io_kiocb *link = req->link; 2012 2013 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { 2014 struct io_timeout_data *io = link->async_data; 2015 2016 io_remove_next_linked(req); 2017 link->timeout.head = NULL; 2018 if (hrtimer_try_to_cancel(&io->timer) != -1) { 2019 list_del(&link->timeout.list); 2020 io_cqring_fill_event(link->ctx, link->user_data, 2021 -ECANCELED, 0); 2022 io_put_req_deferred(link); 2023 return true; 2024 } 2025 } 2026 return false; 2027} 2028 2029static void io_fail_links(struct io_kiocb *req) 
2030 __must_hold(&req->ctx->completion_lock) 2031{ 2032 struct io_kiocb *nxt, *link = req->link; 2033 2034 req->link = NULL; 2035 while (link) { 2036 long res = -ECANCELED; 2037 2038 if (link->flags & REQ_F_FAIL) 2039 res = link->result; 2040 2041 nxt = link->link; 2042 link->link = NULL; 2043 2044 trace_io_uring_fail_link(req, link); 2045 io_cqring_fill_event(link->ctx, link->user_data, res, 0); 2046 io_put_req_deferred(link); 2047 link = nxt; 2048 } 2049} 2050 2051static bool io_disarm_next(struct io_kiocb *req) 2052 __must_hold(&req->ctx->completion_lock) 2053{ 2054 bool posted = false; 2055 2056 if (req->flags & REQ_F_ARM_LTIMEOUT) { 2057 struct io_kiocb *link = req->link; 2058 2059 req->flags &= ~REQ_F_ARM_LTIMEOUT; 2060 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { 2061 io_remove_next_linked(req); 2062 io_cqring_fill_event(link->ctx, link->user_data, 2063 -ECANCELED, 0); 2064 io_put_req_deferred(link); 2065 posted = true; 2066 } 2067 } else if (req->flags & REQ_F_LINK_TIMEOUT) { 2068 struct io_ring_ctx *ctx = req->ctx; 2069 2070 spin_lock_irq(&ctx->timeout_lock); 2071 posted = io_kill_linked_timeout(req); 2072 spin_unlock_irq(&ctx->timeout_lock); 2073 } 2074 if (unlikely((req->flags & REQ_F_FAIL) && 2075 !(req->flags & REQ_F_HARDLINK))) { 2076 posted |= (req->link != NULL); 2077 io_fail_links(req); 2078 } 2079 return posted; 2080} 2081 2082static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) 2083{ 2084 struct io_kiocb *nxt; 2085 2086 /* 2087 * If LINK is set, we have dependent requests in this chain. If we 2088 * didn't fail this request, queue the first one up, moving any other 2089 * dependencies to the next request. In case of failure, fail the rest 2090 * of the chain. 2091 */ 2092 if (req->flags & IO_DISARM_MASK) { 2093 struct io_ring_ctx *ctx = req->ctx; 2094 bool posted; 2095 2096 spin_lock(&ctx->completion_lock); 2097 posted = io_disarm_next(req); 2098 if (posted) 2099 io_commit_cqring(req->ctx); 2100 spin_unlock(&ctx->completion_lock); 2101 if (posted) 2102 io_cqring_ev_posted(ctx); 2103 } 2104 nxt = req->link; 2105 req->link = NULL; 2106 return nxt; 2107} 2108 2109static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) 2110{ 2111 if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) 2112 return NULL; 2113 return __io_req_find_next(req); 2114} 2115 2116static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) 2117{ 2118 if (!ctx) 2119 return; 2120 if (*locked) { 2121 if (ctx->submit_state.compl_nr) 2122 io_submit_flush_completions(ctx); 2123 mutex_unlock(&ctx->uring_lock); 2124 *locked = false; 2125 } 2126 percpu_ref_put(&ctx->refs); 2127} 2128 2129static void tctx_task_work(struct callback_head *cb) 2130{ 2131 bool locked = false; 2132 struct io_ring_ctx *ctx = NULL; 2133 struct io_uring_task *tctx = container_of(cb, struct io_uring_task, 2134 task_work); 2135 2136 while (1) { 2137 struct io_wq_work_node *node; 2138 2139 if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr) 2140 io_submit_flush_completions(ctx); 2141 2142 spin_lock_irq(&tctx->task_lock); 2143 node = tctx->task_list.first; 2144 INIT_WQ_LIST(&tctx->task_list); 2145 if (!node) 2146 tctx->task_running = false; 2147 spin_unlock_irq(&tctx->task_lock); 2148 if (!node) 2149 break; 2150 2151 do { 2152 struct io_wq_work_node *next = node->next; 2153 struct io_kiocb *req = container_of(node, struct io_kiocb, 2154 io_task_work.node); 2155 2156 if (req->ctx != ctx) { 2157 ctx_flush_and_put(ctx, &locked); 2158 ctx = req->ctx; 2159 /* if not contended, grab and 
improve batching */ 2160 locked = mutex_trylock(&ctx->uring_lock); 2161 percpu_ref_get(&ctx->refs); 2162 } 2163 req->io_task_work.func(req, &locked); 2164 node = next; 2165 } while (node); 2166 2167 cond_resched(); 2168 } 2169 2170 ctx_flush_and_put(ctx, &locked); 2171} 2172 2173static void io_req_task_work_add(struct io_kiocb *req) 2174{ 2175 struct task_struct *tsk = req->task; 2176 struct io_uring_task *tctx = tsk->io_uring; 2177 enum task_work_notify_mode notify; 2178 struct io_wq_work_node *node; 2179 unsigned long flags; 2180 bool running; 2181 2182 WARN_ON_ONCE(!tctx); 2183 2184 spin_lock_irqsave(&tctx->task_lock, flags); 2185 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); 2186 running = tctx->task_running; 2187 if (!running) 2188 tctx->task_running = true; 2189 spin_unlock_irqrestore(&tctx->task_lock, flags); 2190 2191 /* task_work already pending, we're done */ 2192 if (running) 2193 return; 2194 2195 /* 2196 * SQPOLL kernel thread doesn't need notification, just a wakeup. For 2197 * all other cases, use TWA_SIGNAL unconditionally to ensure we're 2198 * processing task_work. There's no reliable way to tell if TWA_RESUME 2199 * will do the job. 2200 */ 2201 notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; 2202 if (!task_work_add(tsk, &tctx->task_work, notify)) { 2203 wake_up_process(tsk); 2204 return; 2205 } 2206 2207 spin_lock_irqsave(&tctx->task_lock, flags); 2208 tctx->task_running = false; 2209 node = tctx->task_list.first; 2210 INIT_WQ_LIST(&tctx->task_list); 2211 spin_unlock_irqrestore(&tctx->task_lock, flags); 2212 2213 while (node) { 2214 req = container_of(node, struct io_kiocb, io_task_work.node); 2215 node = node->next; 2216 if (llist_add(&req->io_task_work.fallback_node, 2217 &req->ctx->fallback_llist)) 2218 schedule_delayed_work(&req->ctx->fallback_work, 1); 2219 } 2220} 2221 2222static void io_req_task_cancel(struct io_kiocb *req, bool *locked) 2223{ 2224 struct io_ring_ctx *ctx = req->ctx; 2225 2226 /* not needed for normal modes, but SQPOLL depends on it */ 2227 io_tw_lock(ctx, locked); 2228 io_req_complete_failed(req, req->result); 2229} 2230 2231static void io_req_task_submit(struct io_kiocb *req, bool *locked) 2232{ 2233 struct io_ring_ctx *ctx = req->ctx; 2234 2235 io_tw_lock(ctx, locked); 2236 /* req->task == current here, checking PF_EXITING is safe */ 2237 if (likely(!(req->task->flags & PF_EXITING))) 2238 __io_queue_sqe(req); 2239 else 2240 io_req_complete_failed(req, -EFAULT); 2241} 2242 2243static void io_req_task_queue_fail(struct io_kiocb *req, int ret) 2244{ 2245 req->result = ret; 2246 req->io_task_work.func = io_req_task_cancel; 2247 io_req_task_work_add(req); 2248} 2249 2250static void io_req_task_queue(struct io_kiocb *req) 2251{ 2252 req->io_task_work.func = io_req_task_submit; 2253 io_req_task_work_add(req); 2254} 2255 2256static void io_req_task_queue_reissue(struct io_kiocb *req) 2257{ 2258 req->io_task_work.func = io_queue_async_work; 2259 io_req_task_work_add(req); 2260} 2261 2262static inline void io_queue_next(struct io_kiocb *req) 2263{ 2264 struct io_kiocb *nxt = io_req_find_next(req); 2265 2266 if (nxt) 2267 io_req_task_queue(nxt); 2268} 2269 2270static void io_free_req(struct io_kiocb *req) 2271{ 2272 io_queue_next(req); 2273 __io_free_req(req); 2274} 2275 2276static void io_free_req_work(struct io_kiocb *req, bool *locked) 2277{ 2278 io_free_req(req); 2279} 2280 2281struct req_batch { 2282 struct task_struct *task; 2283 int task_refs; 2284 int ctx_refs; 2285}; 2286 2287static inline void 
io_init_req_batch(struct req_batch *rb) 2288{ 2289 rb->task_refs = 0; 2290 rb->ctx_refs = 0; 2291 rb->task = NULL; 2292} 2293 2294static void io_req_free_batch_finish(struct io_ring_ctx *ctx, 2295 struct req_batch *rb) 2296{ 2297 if (rb->ctx_refs) 2298 percpu_ref_put_many(&ctx->refs, rb->ctx_refs); 2299 if (rb->task) 2300 io_put_task(rb->task, rb->task_refs); 2301} 2302 2303static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, 2304 struct io_submit_state *state) 2305{ 2306 io_queue_next(req); 2307 io_dismantle_req(req); 2308 2309 if (req->task != rb->task) { 2310 if (rb->task) 2311 io_put_task(rb->task, rb->task_refs); 2312 rb->task = req->task; 2313 rb->task_refs = 0; 2314 } 2315 rb->task_refs++; 2316 rb->ctx_refs++; 2317 2318 if (state->free_reqs != ARRAY_SIZE(state->reqs)) 2319 state->reqs[state->free_reqs++] = req; 2320 else 2321 list_add(&req->inflight_entry, &state->free_list); 2322} 2323 2324static void io_submit_flush_completions(struct io_ring_ctx *ctx) 2325 __must_hold(&ctx->uring_lock) 2326{ 2327 struct io_submit_state *state = &ctx->submit_state; 2328 int i, nr = state->compl_nr; 2329 struct req_batch rb; 2330 2331 spin_lock(&ctx->completion_lock); 2332 for (i = 0; i < nr; i++) { 2333 struct io_kiocb *req = state->compl_reqs[i]; 2334 2335 __io_cqring_fill_event(ctx, req->user_data, req->result, 2336 req->compl.cflags); 2337 } 2338 io_commit_cqring(ctx); 2339 spin_unlock(&ctx->completion_lock); 2340 io_cqring_ev_posted(ctx); 2341 2342 io_init_req_batch(&rb); 2343 for (i = 0; i < nr; i++) { 2344 struct io_kiocb *req = state->compl_reqs[i]; 2345 2346 if (req_ref_put_and_test(req)) 2347 io_req_free_batch(&rb, req, &ctx->submit_state); 2348 } 2349 2350 io_req_free_batch_finish(ctx, &rb); 2351 state->compl_nr = 0; 2352} 2353 2354/* 2355 * Drop reference to request, return next in chain (if there is one) if this 2356 * was the last reference to this request. 
2357 */ 2358static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) 2359{ 2360 struct io_kiocb *nxt = NULL; 2361 2362 if (req_ref_put_and_test(req)) { 2363 nxt = io_req_find_next(req); 2364 __io_free_req(req); 2365 } 2366 return nxt; 2367} 2368 2369static inline void io_put_req(struct io_kiocb *req) 2370{ 2371 if (req_ref_put_and_test(req)) 2372 io_free_req(req); 2373} 2374 2375static inline void io_put_req_deferred(struct io_kiocb *req) 2376{ 2377 if (req_ref_put_and_test(req)) { 2378 req->io_task_work.func = io_free_req_work; 2379 io_req_task_work_add(req); 2380 } 2381} 2382 2383static unsigned io_cqring_events(struct io_ring_ctx *ctx) 2384{ 2385 /* See comment at the top of this file */ 2386 smp_rmb(); 2387 return __io_cqring_events(ctx); 2388} 2389 2390static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) 2391{ 2392 struct io_rings *rings = ctx->rings; 2393 2394 /* make sure SQ entry isn't read before tail */ 2395 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; 2396} 2397 2398static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) 2399{ 2400 unsigned int cflags; 2401 2402 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; 2403 cflags |= IORING_CQE_F_BUFFER; 2404 req->flags &= ~REQ_F_BUFFER_SELECTED; 2405 kfree(kbuf); 2406 return cflags; 2407} 2408 2409static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) 2410{ 2411 struct io_buffer *kbuf; 2412 2413 if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) 2414 return 0; 2415 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 2416 return io_put_kbuf(req, kbuf); 2417} 2418 2419static inline bool io_run_task_work(void) 2420{ 2421 if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) { 2422 __set_current_state(TASK_RUNNING); 2423 tracehook_notify_signal(); 2424 return true; 2425 } 2426 2427 return false; 2428} 2429 2430/* 2431 * Find and free completed poll iocbs 2432 */ 2433static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, 2434 struct list_head *done) 2435{ 2436 struct req_batch rb; 2437 struct io_kiocb *req; 2438 2439 /* order with ->result store in io_complete_rw_iopoll() */ 2440 smp_rmb(); 2441 2442 io_init_req_batch(&rb); 2443 while (!list_empty(done)) { 2444 req = list_first_entry(done, struct io_kiocb, inflight_entry); 2445 list_del(&req->inflight_entry); 2446 2447 if (READ_ONCE(req->result) == -EAGAIN && 2448 !(req->flags & REQ_F_DONT_REISSUE)) { 2449 req->iopoll_completed = 0; 2450 io_req_task_queue_reissue(req); 2451 continue; 2452 } 2453 2454 __io_cqring_fill_event(ctx, req->user_data, req->result, 2455 io_put_rw_kbuf(req)); 2456 (*nr_events)++; 2457 2458 if (req_ref_put_and_test(req)) 2459 io_req_free_batch(&rb, req, &ctx->submit_state); 2460 } 2461 2462 io_commit_cqring(ctx); 2463 io_cqring_ev_posted_iopoll(ctx); 2464 io_req_free_batch_finish(ctx, &rb); 2465} 2466 2467static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, 2468 long min) 2469{ 2470 struct io_kiocb *req, *tmp; 2471 LIST_HEAD(done); 2472 bool spin; 2473 2474 /* 2475 * Only spin for completions if we don't have multiple devices hanging 2476 * off our complete list, and we're under the requested amount. 2477 */ 2478 spin = !ctx->poll_multi_queue && *nr_events < min; 2479 2480 list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { 2481 struct kiocb *kiocb = &req->rw.kiocb; 2482 int ret; 2483 2484 /* 2485 * Move completed and retryable entries to our local lists. 
2486 * If we find a request that requires polling, break out 2487 * and complete those lists first, if we have entries there. 2488 */ 2489 if (READ_ONCE(req->iopoll_completed)) { 2490 list_move_tail(&req->inflight_entry, &done); 2491 continue; 2492 } 2493 if (!list_empty(&done)) 2494 break; 2495 2496 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); 2497 if (unlikely(ret < 0)) 2498 return ret; 2499 else if (ret) 2500 spin = false; 2501 2502 /* iopoll may have completed current req */ 2503 if (READ_ONCE(req->iopoll_completed)) 2504 list_move_tail(&req->inflight_entry, &done); 2505 } 2506 2507 if (!list_empty(&done)) 2508 io_iopoll_complete(ctx, nr_events, &done); 2509 2510 return 0; 2511} 2512 2513/* 2514 * We can't just wait for polled events to come to us, we have to actively 2515 * find and complete them. 2516 */ 2517static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) 2518{ 2519 if (!(ctx->flags & IORING_SETUP_IOPOLL)) 2520 return; 2521 2522 mutex_lock(&ctx->uring_lock); 2523 while (!list_empty(&ctx->iopoll_list)) { 2524 unsigned int nr_events = 0; 2525 2526 io_do_iopoll(ctx, &nr_events, 0); 2527 2528 /* let it sleep and repeat later if can't complete a request */ 2529 if (nr_events == 0) 2530 break; 2531 /* 2532 * Ensure we allow local-to-the-cpu processing to take place, 2533 * in this case we need to ensure that we reap all events. 2534 * Also let task_work, etc. to progress by releasing the mutex 2535 */ 2536 if (need_resched()) { 2537 mutex_unlock(&ctx->uring_lock); 2538 cond_resched(); 2539 mutex_lock(&ctx->uring_lock); 2540 } 2541 } 2542 mutex_unlock(&ctx->uring_lock); 2543} 2544 2545static int io_iopoll_check(struct io_ring_ctx *ctx, long min) 2546{ 2547 unsigned int nr_events = 0; 2548 int ret = 0; 2549 2550 /* 2551 * We disallow the app entering submit/complete with polling, but we 2552 * still need to lock the ring to prevent racing with polled issue 2553 * that got punted to a workqueue. 2554 */ 2555 mutex_lock(&ctx->uring_lock); 2556 /* 2557 * Don't enter poll loop if we already have events pending. 2558 * If we do, we can potentially be spinning for commands that 2559 * already triggered a CQE (eg in error). 2560 */ 2561 if (test_bit(0, &ctx->check_cq_overflow)) 2562 __io_cqring_overflow_flush(ctx, false); 2563 if (io_cqring_events(ctx)) 2564 goto out; 2565 do { 2566 /* 2567 * If a submit got punted to a workqueue, we can have the 2568 * application entering polling for a command before it gets 2569 * issued. That app will hold the uring_lock for the duration 2570 * of the poll right here, so we need to take a breather every 2571 * now and then to ensure that the issue has a chance to add 2572 * the poll to the issued list. Otherwise we can spin here 2573 * forever, while the workqueue is stuck trying to acquire the 2574 * very same mutex. 2575 */ 2576 if (list_empty(&ctx->iopoll_list)) { 2577 u32 tail = ctx->cached_cq_tail; 2578 2579 mutex_unlock(&ctx->uring_lock); 2580 io_run_task_work(); 2581 mutex_lock(&ctx->uring_lock); 2582 2583 /* some requests don't go through iopoll_list */ 2584 if (tail != ctx->cached_cq_tail || 2585 list_empty(&ctx->iopoll_list)) 2586 break; 2587 } 2588 ret = io_do_iopoll(ctx, &nr_events, min); 2589 } while (!ret && nr_events < min && !need_resched()); 2590out: 2591 mutex_unlock(&ctx->uring_lock); 2592 return ret; 2593} 2594 2595static void kiocb_end_write(struct io_kiocb *req) 2596{ 2597 /* 2598 * Tell lockdep we inherited freeze protection from submission 2599 * thread. 
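	 * (Added cross-reference, not part of the original comment: for regular
	 *  files the write path in io_write() takes the protection at submission
	 *  time with sb_start_write() + __sb_writers_release(); this helper is
	 *  the matching half, re-taking the lockdep annotation via
	 *  __sb_writers_acquired() before dropping it with sb_end_write().)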
2600 */ 2601 if (req->flags & REQ_F_ISREG) { 2602 struct super_block *sb = file_inode(req->file)->i_sb; 2603 2604 __sb_writers_acquired(sb, SB_FREEZE_WRITE); 2605 sb_end_write(sb); 2606 } 2607} 2608 2609#ifdef CONFIG_BLOCK 2610static bool io_resubmit_prep(struct io_kiocb *req) 2611{ 2612 struct io_async_rw *rw = req->async_data; 2613 2614 if (!rw) 2615 return !io_req_prep_async(req); 2616 /* may have left rw->iter inconsistent on -EIOCBQUEUED */ 2617 iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter)); 2618 return true; 2619} 2620 2621static bool io_rw_should_reissue(struct io_kiocb *req) 2622{ 2623 umode_t mode = file_inode(req->file)->i_mode; 2624 struct io_ring_ctx *ctx = req->ctx; 2625 2626 if (!S_ISBLK(mode) && !S_ISREG(mode)) 2627 return false; 2628 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && 2629 !(ctx->flags & IORING_SETUP_IOPOLL))) 2630 return false; 2631 /* 2632 * If ref is dying, we might be running poll reap from the exit work. 2633 * Don't attempt to reissue from that path, just let it fail with 2634 * -EAGAIN. 2635 */ 2636 if (percpu_ref_is_dying(&ctx->refs)) 2637 return false; 2638 /* 2639 * Play it safe and assume not safe to re-import and reissue if we're 2640 * not in the original thread group (or in task context). 2641 */ 2642 if (!same_thread_group(req->task, current) || !in_task()) 2643 return false; 2644 return true; 2645} 2646#else 2647static bool io_resubmit_prep(struct io_kiocb *req) 2648{ 2649 return false; 2650} 2651static bool io_rw_should_reissue(struct io_kiocb *req) 2652{ 2653 return false; 2654} 2655#endif 2656 2657static bool __io_complete_rw_common(struct io_kiocb *req, long res) 2658{ 2659 if (req->rw.kiocb.ki_flags & IOCB_WRITE) 2660 kiocb_end_write(req); 2661 if (res != req->result) { 2662 if ((res == -EAGAIN || res == -EOPNOTSUPP) && 2663 io_rw_should_reissue(req)) { 2664 req->flags |= REQ_F_REISSUE; 2665 return true; 2666 } 2667 req_set_fail(req); 2668 req->result = res; 2669 } 2670 return false; 2671} 2672 2673static void io_req_task_complete(struct io_kiocb *req, bool *locked) 2674{ 2675 unsigned int cflags = io_put_rw_kbuf(req); 2676 long res = req->result; 2677 2678 if (*locked) { 2679 struct io_ring_ctx *ctx = req->ctx; 2680 struct io_submit_state *state = &ctx->submit_state; 2681 2682 io_req_complete_state(req, res, cflags); 2683 state->compl_reqs[state->compl_nr++] = req; 2684 if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) 2685 io_submit_flush_completions(ctx); 2686 } else { 2687 io_req_complete_post(req, res, cflags); 2688 } 2689} 2690 2691static void __io_complete_rw(struct io_kiocb *req, long res, long res2, 2692 unsigned int issue_flags) 2693{ 2694 if (__io_complete_rw_common(req, res)) 2695 return; 2696 __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req)); 2697} 2698 2699static void io_complete_rw(struct kiocb *kiocb, long res, long res2) 2700{ 2701 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2702 2703 if (__io_complete_rw_common(req, res)) 2704 return; 2705 req->result = res; 2706 req->io_task_work.func = io_req_task_complete; 2707 io_req_task_work_add(req); 2708} 2709 2710static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) 2711{ 2712 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2713 2714 if (kiocb->ki_flags & IOCB_WRITE) 2715 kiocb_end_write(req); 2716 if (unlikely(res != req->result)) { 2717 if (!(res == -EAGAIN && io_rw_should_reissue(req) && 2718 io_resubmit_prep(req))) { 2719 req_set_fail(req); 
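			/*
			 * (Added note, not in the original: with REQ_F_DONT_REISSUE
			 *  set, io_iopoll_complete() posts the error result instead
			 *  of requeueing the request when it sees -EAGAIN.)
			 */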
			req->flags |= REQ_F_DONT_REISSUE;
		}
	}

	WRITE_ONCE(req->result, res);
	/* order with io_iopoll_complete() checking ->result */
	smp_wmb();
	WRITE_ONCE(req->iopoll_completed, 1);
}

/*
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from a io_do_iopoll() thread before the issuer is done
 * accessing the kiocb cookie.
 */
static void io_iopoll_req_issued(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	const bool in_async = io_wq_current_is_worker();

	/* workqueue context doesn't hold uring_lock, grab it now */
	if (unlikely(in_async))
		mutex_lock(&ctx->uring_lock);

	/*
	 * Track whether we have multiple files in our lists. This will impact
	 * how we do polling eventually, not spinning if we're on potentially
	 * different devices.
	 */
	if (list_empty(&ctx->iopoll_list)) {
		ctx->poll_multi_queue = false;
	} else if (!ctx->poll_multi_queue) {
		struct io_kiocb *list_req;
		unsigned int queue_num0, queue_num1;

		list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
						inflight_entry);

		if (list_req->file != req->file) {
			ctx->poll_multi_queue = true;
		} else {
			queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
			queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
			if (queue_num0 != queue_num1)
				ctx->poll_multi_queue = true;
		}
	}

	/*
	 * For fast devices, IO may have already completed. If it has, add
	 * it to the front so we find it first.
	 */
	if (READ_ONCE(req->iopoll_completed))
		list_add(&req->inflight_entry, &ctx->iopoll_list);
	else
		list_add_tail(&req->inflight_entry, &ctx->iopoll_list);

	if (unlikely(in_async)) {
		/*
		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
		 * in sq thread task context or in io worker task context. If
		 * current task context is sq thread, we don't need to check
		 * whether we should wake up the sq thread.
		 */
		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
		    wq_has_sleeper(&ctx->sq_data->wait))
			wake_up(&ctx->sq_data->wait);

		mutex_unlock(&ctx->uring_lock);
	}
}

static bool io_bdev_nowait(struct block_device *bdev)
{
	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
}

/*
 * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
 * inline.
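 * (Added summary, not part of the original comment: the checks below accept
 *  block devices whose queue supports nowait, sockets, and regular files on
 *  nowait-capable devices (excluding io_uring files); anything else must
 *  either be opened O_NONBLOCK or advertise FMODE_NOWAIT together with
 *  ->read_iter()/->write_iter().)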
2802 */ 2803static bool __io_file_supports_nowait(struct file *file, int rw) 2804{ 2805 umode_t mode = file_inode(file)->i_mode; 2806 2807 if (S_ISBLK(mode)) { 2808 if (IS_ENABLED(CONFIG_BLOCK) && 2809 io_bdev_nowait(I_BDEV(file->f_mapping->host))) 2810 return true; 2811 return false; 2812 } 2813 if (S_ISSOCK(mode)) 2814 return true; 2815 if (S_ISREG(mode)) { 2816 if (IS_ENABLED(CONFIG_BLOCK) && 2817 io_bdev_nowait(file->f_inode->i_sb->s_bdev) && 2818 file->f_op != &io_uring_fops) 2819 return true; 2820 return false; 2821 } 2822 2823 /* any ->read/write should understand O_NONBLOCK */ 2824 if (file->f_flags & O_NONBLOCK) 2825 return true; 2826 2827 if (!(file->f_mode & FMODE_NOWAIT)) 2828 return false; 2829 2830 if (rw == READ) 2831 return file->f_op->read_iter != NULL; 2832 2833 return file->f_op->write_iter != NULL; 2834} 2835 2836static bool io_file_supports_nowait(struct io_kiocb *req, int rw) 2837{ 2838 if (rw == READ && (req->flags & REQ_F_NOWAIT_READ)) 2839 return true; 2840 else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE)) 2841 return true; 2842 2843 return __io_file_supports_nowait(req->file, rw); 2844} 2845 2846static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2847{ 2848 struct io_ring_ctx *ctx = req->ctx; 2849 struct kiocb *kiocb = &req->rw.kiocb; 2850 struct file *file = req->file; 2851 unsigned ioprio; 2852 int ret; 2853 2854 if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode)) 2855 req->flags |= REQ_F_ISREG; 2856 2857 kiocb->ki_pos = READ_ONCE(sqe->off); 2858 if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) { 2859 req->flags |= REQ_F_CUR_POS; 2860 kiocb->ki_pos = file->f_pos; 2861 } 2862 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); 2863 kiocb->ki_flags = iocb_flags(kiocb->ki_filp); 2864 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); 2865 if (unlikely(ret)) 2866 return ret; 2867 2868 /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */ 2869 if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK)) 2870 req->flags |= REQ_F_NOWAIT; 2871 2872 ioprio = READ_ONCE(sqe->ioprio); 2873 if (ioprio) { 2874 ret = ioprio_check_cap(ioprio); 2875 if (ret) 2876 return ret; 2877 2878 kiocb->ki_ioprio = ioprio; 2879 } else 2880 kiocb->ki_ioprio = get_current_ioprio(); 2881 2882 if (ctx->flags & IORING_SETUP_IOPOLL) { 2883 if (!(kiocb->ki_flags & IOCB_DIRECT) || 2884 !kiocb->ki_filp->f_op->iopoll) 2885 return -EOPNOTSUPP; 2886 2887 kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; 2888 kiocb->ki_complete = io_complete_rw_iopoll; 2889 req->iopoll_completed = 0; 2890 } else { 2891 if (kiocb->ki_flags & IOCB_HIPRI) 2892 return -EINVAL; 2893 kiocb->ki_complete = io_complete_rw; 2894 } 2895 2896 if (req->opcode == IORING_OP_READ_FIXED || 2897 req->opcode == IORING_OP_WRITE_FIXED) { 2898 req->imu = NULL; 2899 io_req_set_rsrc_node(req); 2900 } 2901 2902 req->rw.addr = READ_ONCE(sqe->addr); 2903 req->rw.len = READ_ONCE(sqe->len); 2904 req->buf_index = READ_ONCE(sqe->buf_index); 2905 return 0; 2906} 2907 2908static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) 2909{ 2910 switch (ret) { 2911 case -EIOCBQUEUED: 2912 break; 2913 case -ERESTARTSYS: 2914 case -ERESTARTNOINTR: 2915 case -ERESTARTNOHAND: 2916 case -ERESTART_RESTARTBLOCK: 2917 /* 2918 * We can't just restart the syscall, since previously 2919 * submitted sqes may already be in progress. Just fail this 2920 * IO with EINTR. 
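		 * (Illustration, not in the original comment: if the same
		 *  io_uring_enter() call accepted several SQEs, restarting it
		 *  would submit them again even though they are already in
		 *  flight, hence the conversion to -EINTR.)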
		 */
		ret = -EINTR;
		fallthrough;
	default:
		kiocb->ki_complete(kiocb, ret, 0);
	}
}

static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
		       unsigned int issue_flags)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
	struct io_async_rw *io = req->async_data;
	bool check_reissue = kiocb->ki_complete == io_complete_rw;

	/* add previously done IO, if any */
	if (io && io->bytes_done > 0) {
		if (ret < 0)
			ret = io->bytes_done;
		else
			ret += io->bytes_done;
	}

	if (req->flags & REQ_F_CUR_POS)
		req->file->f_pos = kiocb->ki_pos;
	if (ret >= 0 && check_reissue)
		__io_complete_rw(req, ret, 0, issue_flags);
	else
		io_rw_done(kiocb, ret);

	if (check_reissue && (req->flags & REQ_F_REISSUE)) {
		req->flags &= ~REQ_F_REISSUE;
		if (io_resubmit_prep(req)) {
			io_req_task_queue_reissue(req);
		} else {
			req_set_fail(req);
			__io_req_complete(req, issue_flags, ret,
					  io_put_rw_kbuf(req));
		}
	}
}

static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
			     struct io_mapped_ubuf *imu)
{
	size_t len = req->rw.len;
	u64 buf_end, buf_addr = req->rw.addr;
	size_t offset;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
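		 * (Worked example, not part of the original comment, assuming
		 *  PAGE_SIZE == 4096 and a first bvec of 1024 bytes: for
		 *  offset == 10240 we skip the first segment (offset becomes
		 *  9216), seg_skip = 1 + (9216 >> PAGE_SHIFT) = 3, and
		 *  iov_offset = 9216 & ~PAGE_MASK = 1024, i.e. byte 10240 of
		 *  the registered buffer is 1024 bytes into the fourth bvec.)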
2999 */ 3000 const struct bio_vec *bvec = imu->bvec; 3001 3002 if (offset <= bvec->bv_len) { 3003 iov_iter_advance(iter, offset); 3004 } else { 3005 unsigned long seg_skip; 3006 3007 /* skip first vec */ 3008 offset -= bvec->bv_len; 3009 seg_skip = 1 + (offset >> PAGE_SHIFT); 3010 3011 iter->bvec = bvec + seg_skip; 3012 iter->nr_segs -= seg_skip; 3013 iter->count -= bvec->bv_len + offset; 3014 iter->iov_offset = offset & ~PAGE_MASK; 3015 } 3016 } 3017 3018 return 0; 3019} 3020 3021static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) 3022{ 3023 struct io_ring_ctx *ctx = req->ctx; 3024 struct io_mapped_ubuf *imu = req->imu; 3025 u16 index, buf_index = req->buf_index; 3026 3027 if (likely(!imu)) { 3028 if (unlikely(buf_index >= ctx->nr_user_bufs)) 3029 return -EFAULT; 3030 index = array_index_nospec(buf_index, ctx->nr_user_bufs); 3031 imu = READ_ONCE(ctx->user_bufs[index]); 3032 req->imu = imu; 3033 } 3034 return __io_import_fixed(req, rw, iter, imu); 3035} 3036 3037static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) 3038{ 3039 if (needs_lock) 3040 mutex_unlock(&ctx->uring_lock); 3041} 3042 3043static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) 3044{ 3045 /* 3046 * "Normal" inline submissions always hold the uring_lock, since we 3047 * grab it from the system call. Same is true for the SQPOLL offload. 3048 * The only exception is when we've detached the request and issue it 3049 * from an async worker thread, grab the lock for that case. 3050 */ 3051 if (needs_lock) 3052 mutex_lock(&ctx->uring_lock); 3053} 3054 3055static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, 3056 int bgid, struct io_buffer *kbuf, 3057 bool needs_lock) 3058{ 3059 struct io_buffer *head; 3060 3061 if (req->flags & REQ_F_BUFFER_SELECTED) 3062 return kbuf; 3063 3064 io_ring_submit_lock(req->ctx, needs_lock); 3065 3066 lockdep_assert_held(&req->ctx->uring_lock); 3067 3068 head = xa_load(&req->ctx->io_buffers, bgid); 3069 if (head) { 3070 if (!list_empty(&head->list)) { 3071 kbuf = list_last_entry(&head->list, struct io_buffer, 3072 list); 3073 list_del(&kbuf->list); 3074 } else { 3075 kbuf = head; 3076 xa_erase(&req->ctx->io_buffers, bgid); 3077 } 3078 if (*len > kbuf->len) 3079 *len = kbuf->len; 3080 } else { 3081 kbuf = ERR_PTR(-ENOBUFS); 3082 } 3083 3084 io_ring_submit_unlock(req->ctx, needs_lock); 3085 3086 return kbuf; 3087} 3088 3089static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, 3090 bool needs_lock) 3091{ 3092 struct io_buffer *kbuf; 3093 u16 bgid; 3094 3095 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 3096 bgid = req->buf_index; 3097 kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); 3098 if (IS_ERR(kbuf)) 3099 return kbuf; 3100 req->rw.addr = (u64) (unsigned long) kbuf; 3101 req->flags |= REQ_F_BUFFER_SELECTED; 3102 return u64_to_user_ptr(kbuf->addr); 3103} 3104 3105#ifdef CONFIG_COMPAT 3106static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, 3107 bool needs_lock) 3108{ 3109 struct compat_iovec __user *uiov; 3110 compat_ssize_t clen; 3111 void __user *buf; 3112 ssize_t len; 3113 3114 uiov = u64_to_user_ptr(req->rw.addr); 3115 if (!access_ok(uiov, sizeof(*uiov))) 3116 return -EFAULT; 3117 if (__get_user(clen, &uiov->iov_len)) 3118 return -EFAULT; 3119 if (clen < 0) 3120 return -EINVAL; 3121 3122 len = clen; 3123 buf = io_rw_buffer_select(req, &len, needs_lock); 3124 if (IS_ERR(buf)) 3125 return PTR_ERR(buf); 3126 iov[0].iov_base = buf; 3127 
iov[0].iov_len = (compat_size_t) len; 3128 return 0; 3129} 3130#endif 3131 3132static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 3133 bool needs_lock) 3134{ 3135 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); 3136 void __user *buf; 3137 ssize_t len; 3138 3139 if (copy_from_user(iov, uiov, sizeof(*uiov))) 3140 return -EFAULT; 3141 3142 len = iov[0].iov_len; 3143 if (len < 0) 3144 return -EINVAL; 3145 buf = io_rw_buffer_select(req, &len, needs_lock); 3146 if (IS_ERR(buf)) 3147 return PTR_ERR(buf); 3148 iov[0].iov_base = buf; 3149 iov[0].iov_len = len; 3150 return 0; 3151} 3152 3153static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 3154 bool needs_lock) 3155{ 3156 if (req->flags & REQ_F_BUFFER_SELECTED) { 3157 struct io_buffer *kbuf; 3158 3159 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 3160 iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 3161 iov[0].iov_len = kbuf->len; 3162 return 0; 3163 } 3164 if (req->rw.len != 1) 3165 return -EINVAL; 3166 3167#ifdef CONFIG_COMPAT 3168 if (req->ctx->compat) 3169 return io_compat_import(req, iov, needs_lock); 3170#endif 3171 3172 return __io_iov_buffer_select(req, iov, needs_lock); 3173} 3174 3175static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, 3176 struct iov_iter *iter, bool needs_lock) 3177{ 3178 void __user *buf = u64_to_user_ptr(req->rw.addr); 3179 size_t sqe_len = req->rw.len; 3180 u8 opcode = req->opcode; 3181 ssize_t ret; 3182 3183 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { 3184 *iovec = NULL; 3185 return io_import_fixed(req, rw, iter); 3186 } 3187 3188 /* buffer index only valid with fixed read/write, or buffer select */ 3189 if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)) 3190 return -EINVAL; 3191 3192 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { 3193 if (req->flags & REQ_F_BUFFER_SELECT) { 3194 buf = io_rw_buffer_select(req, &sqe_len, needs_lock); 3195 if (IS_ERR(buf)) 3196 return PTR_ERR(buf); 3197 req->rw.len = sqe_len; 3198 } 3199 3200 ret = import_single_range(rw, buf, sqe_len, *iovec, iter); 3201 *iovec = NULL; 3202 return ret; 3203 } 3204 3205 if (req->flags & REQ_F_BUFFER_SELECT) { 3206 ret = io_iov_buffer_select(req, *iovec, needs_lock); 3207 if (!ret) 3208 iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len); 3209 *iovec = NULL; 3210 return ret; 3211 } 3212 3213 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, 3214 req->ctx->compat); 3215} 3216 3217static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) 3218{ 3219 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; 3220} 3221 3222/* 3223 * For files that don't have ->read_iter() and ->write_iter(), handle them 3224 * by looping over ->read() or ->write() manually. 3225 */ 3226static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) 3227{ 3228 struct kiocb *kiocb = &req->rw.kiocb; 3229 struct file *file = req->file; 3230 ssize_t ret = 0; 3231 3232 /* 3233 * Don't support polled IO through this interface, and we can't 3234 * support non-blocking either. For the latter, this just causes 3235 * the kiocb to be handled from an async context. 
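	 * (Illustration, not in the original comment: a character device that
	 *  only implements ->read()/->write() takes this path; the loop below
	 *  issues one ->read() or ->write() call per iovec segment and stops
	 *  on a short or failed transfer.)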
3236 */ 3237 if (kiocb->ki_flags & IOCB_HIPRI) 3238 return -EOPNOTSUPP; 3239 if (kiocb->ki_flags & IOCB_NOWAIT) 3240 return -EAGAIN; 3241 3242 while (iov_iter_count(iter)) { 3243 struct iovec iovec; 3244 ssize_t nr; 3245 3246 if (!iov_iter_is_bvec(iter)) { 3247 iovec = iov_iter_iovec(iter); 3248 } else { 3249 iovec.iov_base = u64_to_user_ptr(req->rw.addr); 3250 iovec.iov_len = req->rw.len; 3251 } 3252 3253 if (rw == READ) { 3254 nr = file->f_op->read(file, iovec.iov_base, 3255 iovec.iov_len, io_kiocb_ppos(kiocb)); 3256 } else { 3257 nr = file->f_op->write(file, iovec.iov_base, 3258 iovec.iov_len, io_kiocb_ppos(kiocb)); 3259 } 3260 3261 if (nr < 0) { 3262 if (!ret) 3263 ret = nr; 3264 break; 3265 } 3266 ret += nr; 3267 if (nr != iovec.iov_len) 3268 break; 3269 req->rw.len -= nr; 3270 req->rw.addr += nr; 3271 iov_iter_advance(iter, nr); 3272 } 3273 3274 return ret; 3275} 3276 3277static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, 3278 const struct iovec *fast_iov, struct iov_iter *iter) 3279{ 3280 struct io_async_rw *rw = req->async_data; 3281 3282 memcpy(&rw->iter, iter, sizeof(*iter)); 3283 rw->free_iovec = iovec; 3284 rw->bytes_done = 0; 3285 /* can only be fixed buffers, no need to do anything */ 3286 if (iov_iter_is_bvec(iter)) 3287 return; 3288 if (!iovec) { 3289 unsigned iov_off = 0; 3290 3291 rw->iter.iov = rw->fast_iov; 3292 if (iter->iov != fast_iov) { 3293 iov_off = iter->iov - fast_iov; 3294 rw->iter.iov += iov_off; 3295 } 3296 if (rw->fast_iov != fast_iov) 3297 memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, 3298 sizeof(struct iovec) * iter->nr_segs); 3299 } else { 3300 req->flags |= REQ_F_NEED_CLEANUP; 3301 } 3302} 3303 3304static inline int io_alloc_async_data(struct io_kiocb *req) 3305{ 3306 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); 3307 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); 3308 return req->async_data == NULL; 3309} 3310 3311static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, 3312 const struct iovec *fast_iov, 3313 struct iov_iter *iter, bool force) 3314{ 3315 if (!force && !io_op_defs[req->opcode].needs_async_setup) 3316 return 0; 3317 if (!req->async_data) { 3318 if (io_alloc_async_data(req)) { 3319 kfree(iovec); 3320 return -ENOMEM; 3321 } 3322 3323 io_req_map_rw(req, iovec, fast_iov, iter); 3324 } 3325 return 0; 3326} 3327 3328static inline int io_rw_prep_async(struct io_kiocb *req, int rw) 3329{ 3330 struct io_async_rw *iorw = req->async_data; 3331 struct iovec *iov = iorw->fast_iov; 3332 int ret; 3333 3334 ret = io_import_iovec(rw, req, &iov, &iorw->iter, false); 3335 if (unlikely(ret < 0)) 3336 return ret; 3337 3338 iorw->bytes_done = 0; 3339 iorw->free_iovec = iov; 3340 if (iov) 3341 req->flags |= REQ_F_NEED_CLEANUP; 3342 return 0; 3343} 3344 3345static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3346{ 3347 if (unlikely(!(req->file->f_mode & FMODE_READ))) 3348 return -EBADF; 3349 return io_prep_rw(req, sqe); 3350} 3351 3352/* 3353 * This is our waitqueue callback handler, registered through lock_page_async() 3354 * when we initially tried to do the IO with the iocb armed our waitqueue. 3355 * This gets called when the page is unlocked, and we generally expect that to 3356 * happen when the page IO is completed and the page is now uptodate. This will 3357 * queue a task_work based retry of the operation, attempting to copy the data 3358 * again. 
If the latter fails because the page was NOT uptodate, then we will 3359 * do a thread based blocking retry of the operation. That's the unexpected 3360 * slow path. 3361 */ 3362static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, 3363 int sync, void *arg) 3364{ 3365 struct wait_page_queue *wpq; 3366 struct io_kiocb *req = wait->private; 3367 struct wait_page_key *key = arg; 3368 3369 wpq = container_of(wait, struct wait_page_queue, wait); 3370 3371 if (!wake_page_match(wpq, key)) 3372 return 0; 3373 3374 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; 3375 list_del_init(&wait->entry); 3376 io_req_task_queue(req); 3377 return 1; 3378} 3379 3380/* 3381 * This controls whether a given IO request should be armed for async page 3382 * based retry. If we return false here, the request is handed to the async 3383 * worker threads for retry. If we're doing buffered reads on a regular file, 3384 * we prepare a private wait_page_queue entry and retry the operation. This 3385 * will either succeed because the page is now uptodate and unlocked, or it 3386 * will register a callback when the page is unlocked at IO completion. Through 3387 * that callback, io_uring uses task_work to setup a retry of the operation. 3388 * That retry will attempt the buffered read again. The retry will generally 3389 * succeed, or in rare cases where it fails, we then fall back to using the 3390 * async worker threads for a blocking retry. 3391 */ 3392static bool io_rw_should_retry(struct io_kiocb *req) 3393{ 3394 struct io_async_rw *rw = req->async_data; 3395 struct wait_page_queue *wait = &rw->wpq; 3396 struct kiocb *kiocb = &req->rw.kiocb; 3397 3398 /* never retry for NOWAIT, we just complete with -EAGAIN */ 3399 if (req->flags & REQ_F_NOWAIT) 3400 return false; 3401 3402 /* Only for buffered IO */ 3403 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) 3404 return false; 3405 3406 /* 3407 * just use poll if we can, and don't attempt if the fs doesn't 3408 * support callback based unlocks 3409 */ 3410 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) 3411 return false; 3412 3413 wait->wait.func = io_async_buf_func; 3414 wait->wait.private = req; 3415 wait->wait.flags = 0; 3416 INIT_LIST_HEAD(&wait->wait.entry); 3417 kiocb->ki_flags |= IOCB_WAITQ; 3418 kiocb->ki_flags &= ~IOCB_NOWAIT; 3419 kiocb->ki_waitq = wait; 3420 return true; 3421} 3422 3423static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) 3424{ 3425 if (req->file->f_op->read_iter) 3426 return call_read_iter(req->file, &req->rw.kiocb, iter); 3427 else if (req->file->f_op->read) 3428 return loop_rw_iter(READ, req, iter); 3429 else 3430 return -EINVAL; 3431} 3432 3433static bool need_read_all(struct io_kiocb *req) 3434{ 3435 return req->flags & REQ_F_ISREG || 3436 S_ISBLK(file_inode(req->file)->i_mode); 3437} 3438 3439static int io_read(struct io_kiocb *req, unsigned int issue_flags) 3440{ 3441 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 3442 struct kiocb *kiocb = &req->rw.kiocb; 3443 struct iov_iter __iter, *iter = &__iter; 3444 struct io_async_rw *rw = req->async_data; 3445 ssize_t io_size, ret, ret2; 3446 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 3447 3448 if (rw) { 3449 iter = &rw->iter; 3450 iovec = NULL; 3451 } else { 3452 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); 3453 if (ret < 0) 3454 return ret; 3455 } 3456 io_size = iov_iter_count(iter); 3457 req->result = io_size; 3458 3459 /* Ensure we clear previously set non-block flag */ 3460 if 
(!force_nonblock) 3461 kiocb->ki_flags &= ~IOCB_NOWAIT; 3462 else 3463 kiocb->ki_flags |= IOCB_NOWAIT; 3464 3465 /* If the file doesn't support async, just async punt */ 3466 if (force_nonblock && !io_file_supports_nowait(req, READ)) { 3467 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); 3468 return ret ?: -EAGAIN; 3469 } 3470 3471 ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size); 3472 if (unlikely(ret)) { 3473 kfree(iovec); 3474 return ret; 3475 } 3476 3477 ret = io_iter_do_read(req, iter); 3478 3479 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { 3480 req->flags &= ~REQ_F_REISSUE; 3481 /* IOPOLL retry should happen for io-wq threads */ 3482 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) 3483 goto done; 3484 /* no retry on NONBLOCK nor RWF_NOWAIT */ 3485 if (req->flags & REQ_F_NOWAIT) 3486 goto done; 3487 /* some cases will consume bytes even on error returns */ 3488 iov_iter_reexpand(iter, iter->count + iter->truncated); 3489 iov_iter_revert(iter, io_size - iov_iter_count(iter)); 3490 ret = 0; 3491 } else if (ret == -EIOCBQUEUED) { 3492 goto out_free; 3493 } else if (ret <= 0 || ret == io_size || !force_nonblock || 3494 (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { 3495 /* read all, failed, already did sync or don't want to retry */ 3496 goto done; 3497 } 3498 3499 ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true); 3500 if (ret2) 3501 return ret2; 3502 3503 iovec = NULL; 3504 rw = req->async_data; 3505 /* now use our persistent iterator, if we aren't already */ 3506 iter = &rw->iter; 3507 3508 do { 3509 io_size -= ret; 3510 rw->bytes_done += ret; 3511 /* if we can retry, do so with the callbacks armed */ 3512 if (!io_rw_should_retry(req)) { 3513 kiocb->ki_flags &= ~IOCB_WAITQ; 3514 return -EAGAIN; 3515 } 3516 3517 /* 3518 * Now retry read with the IOCB_WAITQ parts set in the iocb. If 3519 * we get -EIOCBQUEUED, then we'll get a notification when the 3520 * desired page gets unlocked. We can also get a partial read 3521 * here, and if we do, then just retry at the new offset. 3522 */ 3523 ret = io_iter_do_read(req, iter); 3524 if (ret == -EIOCBQUEUED) 3525 return 0; 3526 /* we got some bytes, but not all. retry. 
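		 * (Added note, not in the original: rw->bytes_done above keeps
		 *  the partial byte count, so the next io_iter_do_read() simply
		 *  continues at the new offset rather than restarting the read.)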
		 */
		kiocb->ki_flags &= ~IOCB_WAITQ;
	} while (ret > 0 && ret < io_size);
done:
	kiocb_done(kiocb, ret, issue_flags);
out_free:
	/* it's faster to check here than delegate to kfree */
	if (iovec)
		kfree(iovec);
	return 0;
}

static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
		return -EBADF;
	return io_prep_rw(req, sqe);
}

static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct kiocb *kiocb = &req->rw.kiocb;
	struct iov_iter __iter, *iter = &__iter;
	struct io_async_rw *rw = req->async_data;
	ssize_t ret, ret2, io_size;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	if (rw) {
		iter = &rw->iter;
		iovec = NULL;
	} else {
		ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
		if (ret < 0)
			return ret;
	}
	io_size = iov_iter_count(iter);
	req->result = io_size;

	/* Ensure we clear previously set non-block flag */
	if (!force_nonblock)
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	else
		kiocb->ki_flags |= IOCB_NOWAIT;

	/* If the file doesn't support async, just async punt */
	if (force_nonblock && !io_file_supports_nowait(req, WRITE))
		goto copy_iov;

	/* file path doesn't support NOWAIT for non-direct_IO */
	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
	    (req->flags & REQ_F_ISREG))
		goto copy_iov;

	ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
	if (unlikely(ret))
		goto out_free;

	/*
	 * Open-code file_start_write here to grab freeze protection,
	 * which will be released by another thread in
	 * io_complete_rw(). Fool lockdep by telling it the lock got
	 * released so that it doesn't complain about the held lock when
	 * we return to userspace.
	 */
	if (req->flags & REQ_F_ISREG) {
		sb_start_write(file_inode(req->file)->i_sb);
		__sb_writers_release(file_inode(req->file)->i_sb,
					SB_FREEZE_WRITE);
	}
	kiocb->ki_flags |= IOCB_WRITE;

	if (req->file->f_op->write_iter)
		ret2 = call_write_iter(req->file, kiocb, iter);
	else if (req->file->f_op->write)
		ret2 = loop_rw_iter(WRITE, req, iter);
	else
		ret2 = -EINVAL;

	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		ret2 = -EAGAIN;
	}

	/*
	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
	 * retry them without IOCB_NOWAIT.
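	 * (Added note, not in the original: the conversion to -EAGAIN below
	 *  sends such writes down the copy_iov path, so they are retried from
	 *  async context where IOCB_NOWAIT is not set.)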
3613 */ 3614 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) 3615 ret2 = -EAGAIN; 3616 /* no retry on NONBLOCK nor RWF_NOWAIT */ 3617 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) 3618 goto done; 3619 if (!force_nonblock || ret2 != -EAGAIN) { 3620 /* IOPOLL retry should happen for io-wq threads */ 3621 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) 3622 goto copy_iov; 3623done: 3624 kiocb_done(kiocb, ret2, issue_flags); 3625 } else { 3626copy_iov: 3627 /* some cases will consume bytes even on error returns */ 3628 iov_iter_reexpand(iter, iter->count + iter->truncated); 3629 iov_iter_revert(iter, io_size - iov_iter_count(iter)); 3630 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); 3631 return ret ?: -EAGAIN; 3632 } 3633out_free: 3634 /* it's reportedly faster than delegating the null check to kfree() */ 3635 if (iovec) 3636 kfree(iovec); 3637 return ret; 3638} 3639 3640static int io_renameat_prep(struct io_kiocb *req, 3641 const struct io_uring_sqe *sqe) 3642{ 3643 struct io_rename *ren = &req->rename; 3644 const char __user *oldf, *newf; 3645 3646 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3647 return -EINVAL; 3648 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 3649 return -EINVAL; 3650 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3651 return -EBADF; 3652 3653 ren->old_dfd = READ_ONCE(sqe->fd); 3654 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3655 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3656 ren->new_dfd = READ_ONCE(sqe->len); 3657 ren->flags = READ_ONCE(sqe->rename_flags); 3658 3659 ren->oldpath = getname(oldf); 3660 if (IS_ERR(ren->oldpath)) 3661 return PTR_ERR(ren->oldpath); 3662 3663 ren->newpath = getname(newf); 3664 if (IS_ERR(ren->newpath)) { 3665 putname(ren->oldpath); 3666 return PTR_ERR(ren->newpath); 3667 } 3668 3669 req->flags |= REQ_F_NEED_CLEANUP; 3670 return 0; 3671} 3672 3673static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) 3674{ 3675 struct io_rename *ren = &req->rename; 3676 int ret; 3677 3678 if (issue_flags & IO_URING_F_NONBLOCK) 3679 return -EAGAIN; 3680 3681 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, 3682 ren->newpath, ren->flags); 3683 3684 req->flags &= ~REQ_F_NEED_CLEANUP; 3685 if (ret < 0) 3686 req_set_fail(req); 3687 io_req_complete(req, ret); 3688 return 0; 3689} 3690 3691static int io_unlinkat_prep(struct io_kiocb *req, 3692 const struct io_uring_sqe *sqe) 3693{ 3694 struct io_unlink *un = &req->unlink; 3695 const char __user *fname; 3696 3697 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3698 return -EINVAL; 3699 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || 3700 sqe->splice_fd_in) 3701 return -EINVAL; 3702 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3703 return -EBADF; 3704 3705 un->dfd = READ_ONCE(sqe->fd); 3706 3707 un->flags = READ_ONCE(sqe->unlink_flags); 3708 if (un->flags & ~AT_REMOVEDIR) 3709 return -EINVAL; 3710 3711 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3712 un->filename = getname(fname); 3713 if (IS_ERR(un->filename)) 3714 return PTR_ERR(un->filename); 3715 3716 req->flags |= REQ_F_NEED_CLEANUP; 3717 return 0; 3718} 3719 3720static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) 3721{ 3722 struct io_unlink *un = &req->unlink; 3723 int ret; 3724 3725 if (issue_flags & IO_URING_F_NONBLOCK) 3726 return -EAGAIN; 3727 3728 if (un->flags & AT_REMOVEDIR) 3729 ret = do_rmdir(un->dfd, un->filename); 3730 else 3731 ret = do_unlinkat(un->dfd, un->filename); 3732 3733 req->flags &= 
~REQ_F_NEED_CLEANUP; 3734 if (ret < 0) 3735 req_set_fail(req); 3736 io_req_complete(req, ret); 3737 return 0; 3738} 3739 3740static int io_mkdirat_prep(struct io_kiocb *req, 3741 const struct io_uring_sqe *sqe) 3742{ 3743 struct io_mkdir *mkd = &req->mkdir; 3744 const char __user *fname; 3745 3746 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3747 return -EINVAL; 3748 if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index || 3749 sqe->splice_fd_in) 3750 return -EINVAL; 3751 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3752 return -EBADF; 3753 3754 mkd->dfd = READ_ONCE(sqe->fd); 3755 mkd->mode = READ_ONCE(sqe->len); 3756 3757 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3758 mkd->filename = getname(fname); 3759 if (IS_ERR(mkd->filename)) 3760 return PTR_ERR(mkd->filename); 3761 3762 req->flags |= REQ_F_NEED_CLEANUP; 3763 return 0; 3764} 3765 3766static int io_mkdirat(struct io_kiocb *req, int issue_flags) 3767{ 3768 struct io_mkdir *mkd = &req->mkdir; 3769 int ret; 3770 3771 if (issue_flags & IO_URING_F_NONBLOCK) 3772 return -EAGAIN; 3773 3774 ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); 3775 3776 req->flags &= ~REQ_F_NEED_CLEANUP; 3777 if (ret < 0) 3778 req_set_fail(req); 3779 io_req_complete(req, ret); 3780 return 0; 3781} 3782 3783static int io_symlinkat_prep(struct io_kiocb *req, 3784 const struct io_uring_sqe *sqe) 3785{ 3786 struct io_symlink *sl = &req->symlink; 3787 const char __user *oldpath, *newpath; 3788 3789 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3790 return -EINVAL; 3791 if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index || 3792 sqe->splice_fd_in) 3793 return -EINVAL; 3794 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3795 return -EBADF; 3796 3797 sl->new_dfd = READ_ONCE(sqe->fd); 3798 oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3799 newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3800 3801 sl->oldpath = getname(oldpath); 3802 if (IS_ERR(sl->oldpath)) 3803 return PTR_ERR(sl->oldpath); 3804 3805 sl->newpath = getname(newpath); 3806 if (IS_ERR(sl->newpath)) { 3807 putname(sl->oldpath); 3808 return PTR_ERR(sl->newpath); 3809 } 3810 3811 req->flags |= REQ_F_NEED_CLEANUP; 3812 return 0; 3813} 3814 3815static int io_symlinkat(struct io_kiocb *req, int issue_flags) 3816{ 3817 struct io_symlink *sl = &req->symlink; 3818 int ret; 3819 3820 if (issue_flags & IO_URING_F_NONBLOCK) 3821 return -EAGAIN; 3822 3823 ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); 3824 3825 req->flags &= ~REQ_F_NEED_CLEANUP; 3826 if (ret < 0) 3827 req_set_fail(req); 3828 io_req_complete(req, ret); 3829 return 0; 3830} 3831 3832static int io_linkat_prep(struct io_kiocb *req, 3833 const struct io_uring_sqe *sqe) 3834{ 3835 struct io_hardlink *lnk = &req->hardlink; 3836 const char __user *oldf, *newf; 3837 3838 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3839 return -EINVAL; 3840 if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) 3841 return -EINVAL; 3842 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3843 return -EBADF; 3844 3845 lnk->old_dfd = READ_ONCE(sqe->fd); 3846 lnk->new_dfd = READ_ONCE(sqe->len); 3847 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3848 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3849 lnk->flags = READ_ONCE(sqe->hardlink_flags); 3850 3851 lnk->oldpath = getname(oldf); 3852 if (IS_ERR(lnk->oldpath)) 3853 return PTR_ERR(lnk->oldpath); 3854 3855 lnk->newpath = getname(newf); 3856 if (IS_ERR(lnk->newpath)) { 3857 putname(lnk->oldpath); 3858 return PTR_ERR(lnk->newpath); 3859 } 3860 3861 
req->flags |= REQ_F_NEED_CLEANUP; 3862 return 0; 3863} 3864 3865static int io_linkat(struct io_kiocb *req, int issue_flags) 3866{ 3867 struct io_hardlink *lnk = &req->hardlink; 3868 int ret; 3869 3870 if (issue_flags & IO_URING_F_NONBLOCK) 3871 return -EAGAIN; 3872 3873 ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, 3874 lnk->newpath, lnk->flags); 3875 3876 req->flags &= ~REQ_F_NEED_CLEANUP; 3877 if (ret < 0) 3878 req_set_fail(req); 3879 io_req_complete(req, ret); 3880 return 0; 3881} 3882 3883static int io_shutdown_prep(struct io_kiocb *req, 3884 const struct io_uring_sqe *sqe) 3885{ 3886#if defined(CONFIG_NET) 3887 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3888 return -EINVAL; 3889 if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || 3890 sqe->buf_index || sqe->splice_fd_in)) 3891 return -EINVAL; 3892 3893 req->shutdown.how = READ_ONCE(sqe->len); 3894 return 0; 3895#else 3896 return -EOPNOTSUPP; 3897#endif 3898} 3899 3900static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) 3901{ 3902#if defined(CONFIG_NET) 3903 struct socket *sock; 3904 int ret; 3905 3906 if (issue_flags & IO_URING_F_NONBLOCK) 3907 return -EAGAIN; 3908 3909 sock = sock_from_file(req->file); 3910 if (unlikely(!sock)) 3911 return -ENOTSOCK; 3912 3913 ret = __sys_shutdown_sock(sock, req->shutdown.how); 3914 if (ret < 0) 3915 req_set_fail(req); 3916 io_req_complete(req, ret); 3917 return 0; 3918#else 3919 return -EOPNOTSUPP; 3920#endif 3921} 3922 3923static int __io_splice_prep(struct io_kiocb *req, 3924 const struct io_uring_sqe *sqe) 3925{ 3926 struct io_splice *sp = &req->splice; 3927 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; 3928 3929 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3930 return -EINVAL; 3931 3932 sp->file_in = NULL; 3933 sp->len = READ_ONCE(sqe->len); 3934 sp->flags = READ_ONCE(sqe->splice_flags); 3935 3936 if (unlikely(sp->flags & ~valid_flags)) 3937 return -EINVAL; 3938 3939 sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in), 3940 (sp->flags & SPLICE_F_FD_IN_FIXED)); 3941 if (!sp->file_in) 3942 return -EBADF; 3943 req->flags |= REQ_F_NEED_CLEANUP; 3944 return 0; 3945} 3946 3947static int io_tee_prep(struct io_kiocb *req, 3948 const struct io_uring_sqe *sqe) 3949{ 3950 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) 3951 return -EINVAL; 3952 return __io_splice_prep(req, sqe); 3953} 3954 3955static int io_tee(struct io_kiocb *req, unsigned int issue_flags) 3956{ 3957 struct io_splice *sp = &req->splice; 3958 struct file *in = sp->file_in; 3959 struct file *out = sp->file_out; 3960 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 3961 long ret = 0; 3962 3963 if (issue_flags & IO_URING_F_NONBLOCK) 3964 return -EAGAIN; 3965 if (sp->len) 3966 ret = do_tee(in, out, sp->len, flags); 3967 3968 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 3969 io_put_file(in); 3970 req->flags &= ~REQ_F_NEED_CLEANUP; 3971 3972 if (ret != sp->len) 3973 req_set_fail(req); 3974 io_req_complete(req, ret); 3975 return 0; 3976} 3977 3978static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3979{ 3980 struct io_splice *sp = &req->splice; 3981 3982 sp->off_in = READ_ONCE(sqe->splice_off_in); 3983 sp->off_out = READ_ONCE(sqe->off); 3984 return __io_splice_prep(req, sqe); 3985} 3986 3987static int io_splice(struct io_kiocb *req, unsigned int issue_flags) 3988{ 3989 struct io_splice *sp = &req->splice; 3990 struct file *in = sp->file_in; 3991 struct file *out = sp->file_out; 3992 unsigned int flags = 
sp->flags & ~SPLICE_F_FD_IN_FIXED; 3993 loff_t *poff_in, *poff_out; 3994 long ret = 0; 3995 3996 if (issue_flags & IO_URING_F_NONBLOCK) 3997 return -EAGAIN; 3998 3999 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; 4000 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; 4001 4002 if (sp->len) 4003 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); 4004 4005 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 4006 io_put_file(in); 4007 req->flags &= ~REQ_F_NEED_CLEANUP; 4008 4009 if (ret != sp->len) 4010 req_set_fail(req); 4011 io_req_complete(req, ret); 4012 return 0; 4013} 4014 4015/* 4016 * IORING_OP_NOP just posts a completion event, nothing else. 4017 */ 4018static int io_nop(struct io_kiocb *req, unsigned int issue_flags) 4019{ 4020 struct io_ring_ctx *ctx = req->ctx; 4021 4022 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4023 return -EINVAL; 4024 4025 __io_req_complete(req, issue_flags, 0, 0); 4026 return 0; 4027} 4028 4029static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4030{ 4031 struct io_ring_ctx *ctx = req->ctx; 4032 4033 if (!req->file) 4034 return -EBADF; 4035 4036 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4037 return -EINVAL; 4038 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || 4039 sqe->splice_fd_in)) 4040 return -EINVAL; 4041 4042 req->sync.flags = READ_ONCE(sqe->fsync_flags); 4043 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) 4044 return -EINVAL; 4045 4046 req->sync.off = READ_ONCE(sqe->off); 4047 req->sync.len = READ_ONCE(sqe->len); 4048 return 0; 4049} 4050 4051static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) 4052{ 4053 loff_t end = req->sync.off + req->sync.len; 4054 int ret; 4055 4056 /* fsync always requires a blocking context */ 4057 if (issue_flags & IO_URING_F_NONBLOCK) 4058 return -EAGAIN; 4059 4060 ret = vfs_fsync_range(req->file, req->sync.off, 4061 end > 0 ? 
end : LLONG_MAX, 4062 req->sync.flags & IORING_FSYNC_DATASYNC); 4063 if (ret < 0) 4064 req_set_fail(req); 4065 io_req_complete(req, ret); 4066 return 0; 4067} 4068 4069static int io_fallocate_prep(struct io_kiocb *req, 4070 const struct io_uring_sqe *sqe) 4071{ 4072 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || 4073 sqe->splice_fd_in) 4074 return -EINVAL; 4075 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4076 return -EINVAL; 4077 4078 req->sync.off = READ_ONCE(sqe->off); 4079 req->sync.len = READ_ONCE(sqe->addr); 4080 req->sync.mode = READ_ONCE(sqe->len); 4081 return 0; 4082} 4083 4084static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) 4085{ 4086 int ret; 4087 4088 /* fallocate always requiring blocking context */ 4089 if (issue_flags & IO_URING_F_NONBLOCK) 4090 return -EAGAIN; 4091 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, 4092 req->sync.len); 4093 if (ret < 0) 4094 req_set_fail(req); 4095 io_req_complete(req, ret); 4096 return 0; 4097} 4098 4099static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4100{ 4101 const char __user *fname; 4102 int ret; 4103 4104 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4105 return -EINVAL; 4106 if (unlikely(sqe->ioprio || sqe->buf_index)) 4107 return -EINVAL; 4108 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4109 return -EBADF; 4110 4111 /* open.how should be already initialised */ 4112 if (!(req->open.how.flags & O_PATH) && force_o_largefile()) 4113 req->open.how.flags |= O_LARGEFILE; 4114 4115 req->open.dfd = READ_ONCE(sqe->fd); 4116 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4117 req->open.filename = getname(fname); 4118 if (IS_ERR(req->open.filename)) { 4119 ret = PTR_ERR(req->open.filename); 4120 req->open.filename = NULL; 4121 return ret; 4122 } 4123 4124 req->open.file_slot = READ_ONCE(sqe->file_index); 4125 if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) 4126 return -EINVAL; 4127 4128 req->open.nofile = rlimit(RLIMIT_NOFILE); 4129 req->flags |= REQ_F_NEED_CLEANUP; 4130 return 0; 4131} 4132 4133static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4134{ 4135 u64 mode = READ_ONCE(sqe->len); 4136 u64 flags = READ_ONCE(sqe->open_flags); 4137 4138 req->open.how = build_open_how(flags, mode); 4139 return __io_openat_prep(req, sqe); 4140} 4141 4142static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4143{ 4144 struct open_how __user *how; 4145 size_t len; 4146 int ret; 4147 4148 how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4149 len = READ_ONCE(sqe->len); 4150 if (len < OPEN_HOW_SIZE_VER0) 4151 return -EINVAL; 4152 4153 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, 4154 len); 4155 if (ret) 4156 return ret; 4157 4158 return __io_openat_prep(req, sqe); 4159} 4160 4161static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) 4162{ 4163 struct open_flags op; 4164 struct file *file; 4165 bool resolve_nonblock, nonblock_set; 4166 bool fixed = !!req->open.file_slot; 4167 int ret; 4168 4169 ret = build_open_flags(&req->open.how, &op); 4170 if (ret) 4171 goto err; 4172 nonblock_set = op.open_flag & O_NONBLOCK; 4173 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED; 4174 if (issue_flags & IO_URING_F_NONBLOCK) { 4175 /* 4176 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, 4177 * it'll always -EAGAIN 4178 */ 4179 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) 4180 return -EAGAIN; 4181 op.lookup_flags |= LOOKUP_CACHED; 4182 
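	/*
	 * LOOKUP_CACHED restricts the path walk to what is already in the
	 * dcache, so the lookup can fail fast (-EAGAIN) instead of blocking;
	 * O_NONBLOCK is forced for this attempt and cleared again below if
	 * the application didn't ask for it (see the !nonblock_set check).
	 */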
op.open_flag |= O_NONBLOCK; 4183 } 4184 4185 if (!fixed) { 4186 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); 4187 if (ret < 0) 4188 goto err; 4189 } 4190 4191 file = do_filp_open(req->open.dfd, req->open.filename, &op); 4192 if (IS_ERR(file)) { 4193 /* 4194 * We could hang on to this 'fd' on retrying, but seems like 4195 * marginal gain for something that is now known to be a slower 4196 * path. So just put it, and we'll get a new one when we retry. 4197 */ 4198 if (!fixed) 4199 put_unused_fd(ret); 4200 4201 ret = PTR_ERR(file); 4202 /* only retry if RESOLVE_CACHED wasn't already set by application */ 4203 if (ret == -EAGAIN && 4204 (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) 4205 return -EAGAIN; 4206 goto err; 4207 } 4208 4209 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) 4210 file->f_flags &= ~O_NONBLOCK; 4211 fsnotify_open(file); 4212 4213 if (!fixed) 4214 fd_install(ret, file); 4215 else 4216 ret = io_install_fixed_file(req, file, issue_flags, 4217 req->open.file_slot - 1); 4218err: 4219 putname(req->open.filename); 4220 req->flags &= ~REQ_F_NEED_CLEANUP; 4221 if (ret < 0) 4222 req_set_fail(req); 4223 __io_req_complete(req, issue_flags, ret, 0); 4224 return 0; 4225} 4226 4227static int io_openat(struct io_kiocb *req, unsigned int issue_flags) 4228{ 4229 return io_openat2(req, issue_flags); 4230} 4231 4232static int io_remove_buffers_prep(struct io_kiocb *req, 4233 const struct io_uring_sqe *sqe) 4234{ 4235 struct io_provide_buf *p = &req->pbuf; 4236 u64 tmp; 4237 4238 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || 4239 sqe->splice_fd_in) 4240 return -EINVAL; 4241 4242 tmp = READ_ONCE(sqe->fd); 4243 if (!tmp || tmp > USHRT_MAX) 4244 return -EINVAL; 4245 4246 memset(p, 0, sizeof(*p)); 4247 p->nbufs = tmp; 4248 p->bgid = READ_ONCE(sqe->buf_group); 4249 return 0; 4250} 4251 4252static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, 4253 int bgid, unsigned nbufs) 4254{ 4255 unsigned i = 0; 4256 4257 /* shouldn't happen */ 4258 if (!nbufs) 4259 return 0; 4260 4261 /* the head kbuf is the list itself */ 4262 while (!list_empty(&buf->list)) { 4263 struct io_buffer *nxt; 4264 4265 nxt = list_first_entry(&buf->list, struct io_buffer, list); 4266 list_del(&nxt->list); 4267 kfree(nxt); 4268 if (++i == nbufs) 4269 return i; 4270 } 4271 i++; 4272 kfree(buf); 4273 xa_erase(&ctx->io_buffers, bgid); 4274 4275 return i; 4276} 4277 4278static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) 4279{ 4280 struct io_provide_buf *p = &req->pbuf; 4281 struct io_ring_ctx *ctx = req->ctx; 4282 struct io_buffer *head; 4283 int ret = 0; 4284 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4285 4286 io_ring_submit_lock(ctx, !force_nonblock); 4287 4288 lockdep_assert_held(&ctx->uring_lock); 4289 4290 ret = -ENOENT; 4291 head = xa_load(&ctx->io_buffers, p->bgid); 4292 if (head) 4293 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); 4294 if (ret < 0) 4295 req_set_fail(req); 4296 4297 /* complete before unlock, IOPOLL may need the lock */ 4298 __io_req_complete(req, issue_flags, ret, 0); 4299 io_ring_submit_unlock(ctx, !force_nonblock); 4300 return 0; 4301} 4302 4303static int io_provide_buffers_prep(struct io_kiocb *req, 4304 const struct io_uring_sqe *sqe) 4305{ 4306 unsigned long size, tmp_check; 4307 struct io_provide_buf *p = &req->pbuf; 4308 u64 tmp; 4309 4310 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) 4311 return -EINVAL; 4312 4313 tmp = READ_ONCE(sqe->fd); 4314 if 
(!tmp || tmp > USHRT_MAX) 4315 return -E2BIG; 4316 p->nbufs = tmp; 4317 p->addr = READ_ONCE(sqe->addr); 4318 p->len = READ_ONCE(sqe->len); 4319 4320 if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, 4321 &size)) 4322 return -EOVERFLOW; 4323 if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) 4324 return -EOVERFLOW; 4325 4326 size = (unsigned long)p->len * p->nbufs; 4327 if (!access_ok(u64_to_user_ptr(p->addr), size)) 4328 return -EFAULT; 4329 4330 p->bgid = READ_ONCE(sqe->buf_group); 4331 tmp = READ_ONCE(sqe->off); 4332 if (tmp > USHRT_MAX) 4333 return -E2BIG; 4334 p->bid = tmp; 4335 return 0; 4336} 4337 4338static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) 4339{ 4340 struct io_buffer *buf; 4341 u64 addr = pbuf->addr; 4342 int i, bid = pbuf->bid; 4343 4344 for (i = 0; i < pbuf->nbufs; i++) { 4345 buf = kmalloc(sizeof(*buf), GFP_KERNEL); 4346 if (!buf) 4347 break; 4348 4349 buf->addr = addr; 4350 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); 4351 buf->bid = bid; 4352 addr += pbuf->len; 4353 bid++; 4354 if (!*head) { 4355 INIT_LIST_HEAD(&buf->list); 4356 *head = buf; 4357 } else { 4358 list_add_tail(&buf->list, &(*head)->list); 4359 } 4360 } 4361 4362 return i ? i : -ENOMEM; 4363} 4364 4365static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) 4366{ 4367 struct io_provide_buf *p = &req->pbuf; 4368 struct io_ring_ctx *ctx = req->ctx; 4369 struct io_buffer *head, *list; 4370 int ret = 0; 4371 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4372 4373 io_ring_submit_lock(ctx, !force_nonblock); 4374 4375 lockdep_assert_held(&ctx->uring_lock); 4376 4377 list = head = xa_load(&ctx->io_buffers, p->bgid); 4378 4379 ret = io_add_buffers(p, &head); 4380 if (ret >= 0 && !list) { 4381 ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL); 4382 if (ret < 0) 4383 __io_remove_buffers(ctx, head, p->bgid, -1U); 4384 } 4385 if (ret < 0) 4386 req_set_fail(req); 4387 /* complete before unlock, IOPOLL may need the lock */ 4388 __io_req_complete(req, issue_flags, ret, 0); 4389 io_ring_submit_unlock(ctx, !force_nonblock); 4390 return 0; 4391} 4392 4393static int io_epoll_ctl_prep(struct io_kiocb *req, 4394 const struct io_uring_sqe *sqe) 4395{ 4396#if defined(CONFIG_EPOLL) 4397 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 4398 return -EINVAL; 4399 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4400 return -EINVAL; 4401 4402 req->epoll.epfd = READ_ONCE(sqe->fd); 4403 req->epoll.op = READ_ONCE(sqe->len); 4404 req->epoll.fd = READ_ONCE(sqe->off); 4405 4406 if (ep_op_has_event(req->epoll.op)) { 4407 struct epoll_event __user *ev; 4408 4409 ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4410 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) 4411 return -EFAULT; 4412 } 4413 4414 return 0; 4415#else 4416 return -EOPNOTSUPP; 4417#endif 4418} 4419 4420static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) 4421{ 4422#if defined(CONFIG_EPOLL) 4423 struct io_epoll *ie = &req->epoll; 4424 int ret; 4425 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4426 4427 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); 4428 if (force_nonblock && ret == -EAGAIN) 4429 return -EAGAIN; 4430 4431 if (ret < 0) 4432 req_set_fail(req); 4433 __io_req_complete(req, issue_flags, ret, 0); 4434 return 0; 4435#else 4436 return -EOPNOTSUPP; 4437#endif 4438} 4439 4440static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4441{ 4442#if 
defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 4443 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) 4444 return -EINVAL; 4445 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4446 return -EINVAL; 4447 4448 req->madvise.addr = READ_ONCE(sqe->addr); 4449 req->madvise.len = READ_ONCE(sqe->len); 4450 req->madvise.advice = READ_ONCE(sqe->fadvise_advice); 4451 return 0; 4452#else 4453 return -EOPNOTSUPP; 4454#endif 4455} 4456 4457static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) 4458{ 4459#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 4460 struct io_madvise *ma = &req->madvise; 4461 int ret; 4462 4463 if (issue_flags & IO_URING_F_NONBLOCK) 4464 return -EAGAIN; 4465 4466 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); 4467 if (ret < 0) 4468 req_set_fail(req); 4469 io_req_complete(req, ret); 4470 return 0; 4471#else 4472 return -EOPNOTSUPP; 4473#endif 4474} 4475 4476static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4477{ 4478 if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) 4479 return -EINVAL; 4480 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4481 return -EINVAL; 4482 4483 req->fadvise.offset = READ_ONCE(sqe->off); 4484 req->fadvise.len = READ_ONCE(sqe->len); 4485 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); 4486 return 0; 4487} 4488 4489static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) 4490{ 4491 struct io_fadvise *fa = &req->fadvise; 4492 int ret; 4493 4494 if (issue_flags & IO_URING_F_NONBLOCK) { 4495 switch (fa->advice) { 4496 case POSIX_FADV_NORMAL: 4497 case POSIX_FADV_RANDOM: 4498 case POSIX_FADV_SEQUENTIAL: 4499 break; 4500 default: 4501 return -EAGAIN; 4502 } 4503 } 4504 4505 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); 4506 if (ret < 0) 4507 req_set_fail(req); 4508 __io_req_complete(req, issue_flags, ret, 0); 4509 return 0; 4510} 4511 4512static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4513{ 4514 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4515 return -EINVAL; 4516 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 4517 return -EINVAL; 4518 if (req->flags & REQ_F_FIXED_FILE) 4519 return -EBADF; 4520 4521 req->statx.dfd = READ_ONCE(sqe->fd); 4522 req->statx.mask = READ_ONCE(sqe->len); 4523 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4524 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4525 req->statx.flags = READ_ONCE(sqe->statx_flags); 4526 4527 return 0; 4528} 4529 4530static int io_statx(struct io_kiocb *req, unsigned int issue_flags) 4531{ 4532 struct io_statx *ctx = &req->statx; 4533 int ret; 4534 4535 if (issue_flags & IO_URING_F_NONBLOCK) 4536 return -EAGAIN; 4537 4538 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, 4539 ctx->buffer); 4540 4541 if (ret < 0) 4542 req_set_fail(req); 4543 io_req_complete(req, ret); 4544 return 0; 4545} 4546 4547static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4548{ 4549 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4550 return -EINVAL; 4551 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || 4552 sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) 4553 return -EINVAL; 4554 if (req->flags & REQ_F_FIXED_FILE) 4555 return -EBADF; 4556 4557 req->close.fd = READ_ONCE(sqe->fd); 4558 return 0; 4559} 4560 4561static int io_close(struct io_kiocb *req, unsigned int issue_flags) 4562{ 4563 struct files_struct *files = current->files; 4564 struct 
io_close *close = &req->close; 4565 struct fdtable *fdt; 4566 struct file *file = NULL; 4567 int ret = -EBADF; 4568 4569 spin_lock(&files->file_lock); 4570 fdt = files_fdtable(files); 4571 if (close->fd >= fdt->max_fds) { 4572 spin_unlock(&files->file_lock); 4573 goto err; 4574 } 4575 file = fdt->fd[close->fd]; 4576 if (!file || file->f_op == &io_uring_fops) { 4577 spin_unlock(&files->file_lock); 4578 file = NULL; 4579 goto err; 4580 } 4581 4582 /* if the file has a flush method, be safe and punt to async */ 4583 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { 4584 spin_unlock(&files->file_lock); 4585 return -EAGAIN; 4586 } 4587 4588 ret = __close_fd_get_file(close->fd, &file); 4589 spin_unlock(&files->file_lock); 4590 if (ret < 0) { 4591 if (ret == -ENOENT) 4592 ret = -EBADF; 4593 goto err; 4594 } 4595 4596 /* No ->flush() or already async, safely close from here */ 4597 ret = filp_close(file, current->files); 4598err: 4599 if (ret < 0) 4600 req_set_fail(req); 4601 if (file) 4602 fput(file); 4603 __io_req_complete(req, issue_flags, ret, 0); 4604 return 0; 4605} 4606 4607static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4608{ 4609 struct io_ring_ctx *ctx = req->ctx; 4610 4611 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4612 return -EINVAL; 4613 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || 4614 sqe->splice_fd_in)) 4615 return -EINVAL; 4616 4617 req->sync.off = READ_ONCE(sqe->off); 4618 req->sync.len = READ_ONCE(sqe->len); 4619 req->sync.flags = READ_ONCE(sqe->sync_range_flags); 4620 return 0; 4621} 4622 4623static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) 4624{ 4625 int ret; 4626 4627 /* sync_file_range always requires a blocking context */ 4628 if (issue_flags & IO_URING_F_NONBLOCK) 4629 return -EAGAIN; 4630 4631 ret = sync_file_range(req->file, req->sync.off, req->sync.len, 4632 req->sync.flags); 4633 if (ret < 0) 4634 req_set_fail(req); 4635 io_req_complete(req, ret); 4636 return 0; 4637} 4638 4639#if defined(CONFIG_NET) 4640static int io_setup_async_msg(struct io_kiocb *req, 4641 struct io_async_msghdr *kmsg) 4642{ 4643 struct io_async_msghdr *async_msg = req->async_data; 4644 4645 if (async_msg) 4646 return -EAGAIN; 4647 if (io_alloc_async_data(req)) { 4648 kfree(kmsg->free_iov); 4649 return -ENOMEM; 4650 } 4651 async_msg = req->async_data; 4652 req->flags |= REQ_F_NEED_CLEANUP; 4653 memcpy(async_msg, kmsg, sizeof(*kmsg)); 4654 async_msg->msg.msg_name = &async_msg->addr; 4655 /* if were using fast_iov, set it to the new one */ 4656 if (!async_msg->free_iov) 4657 async_msg->msg.msg_iter.iov = async_msg->fast_iov; 4658 4659 return -EAGAIN; 4660} 4661 4662static int io_sendmsg_copy_hdr(struct io_kiocb *req, 4663 struct io_async_msghdr *iomsg) 4664{ 4665 iomsg->msg.msg_name = &iomsg->addr; 4666 iomsg->free_iov = iomsg->fast_iov; 4667 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, 4668 req->sr_msg.msg_flags, &iomsg->free_iov); 4669} 4670 4671static int io_sendmsg_prep_async(struct io_kiocb *req) 4672{ 4673 int ret; 4674 4675 ret = io_sendmsg_copy_hdr(req, req->async_data); 4676 if (!ret) 4677 req->flags |= REQ_F_NEED_CLEANUP; 4678 return ret; 4679} 4680 4681static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4682{ 4683 struct io_sr_msg *sr = &req->sr_msg; 4684 4685 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4686 return -EINVAL; 4687 4688 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4689 sr->len = READ_ONCE(sqe->len); 4690 sr->msg_flags = 
READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 4691 if (sr->msg_flags & MSG_DONTWAIT) 4692 req->flags |= REQ_F_NOWAIT; 4693 4694#ifdef CONFIG_COMPAT 4695 if (req->ctx->compat) 4696 sr->msg_flags |= MSG_CMSG_COMPAT; 4697#endif 4698 return 0; 4699} 4700 4701static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) 4702{ 4703 struct io_async_msghdr iomsg, *kmsg; 4704 struct socket *sock; 4705 unsigned flags; 4706 int min_ret = 0; 4707 int ret; 4708 4709 sock = sock_from_file(req->file); 4710 if (unlikely(!sock)) 4711 return -ENOTSOCK; 4712 4713 kmsg = req->async_data; 4714 if (!kmsg) { 4715 ret = io_sendmsg_copy_hdr(req, &iomsg); 4716 if (ret) 4717 return ret; 4718 kmsg = &iomsg; 4719 } 4720 4721 flags = req->sr_msg.msg_flags; 4722 if (issue_flags & IO_URING_F_NONBLOCK) 4723 flags |= MSG_DONTWAIT; 4724 if (flags & MSG_WAITALL) 4725 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 4726 4727 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 4728 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) 4729 return io_setup_async_msg(req, kmsg); 4730 if (ret == -ERESTARTSYS) 4731 ret = -EINTR; 4732 4733 /* fast path, check for non-NULL to avoid function call */ 4734 if (kmsg->free_iov) 4735 kfree(kmsg->free_iov); 4736 req->flags &= ~REQ_F_NEED_CLEANUP; 4737 if (ret < min_ret) 4738 req_set_fail(req); 4739 __io_req_complete(req, issue_flags, ret, 0); 4740 return 0; 4741} 4742 4743static int io_send(struct io_kiocb *req, unsigned int issue_flags) 4744{ 4745 struct io_sr_msg *sr = &req->sr_msg; 4746 struct msghdr msg; 4747 struct iovec iov; 4748 struct socket *sock; 4749 unsigned flags; 4750 int min_ret = 0; 4751 int ret; 4752 4753 sock = sock_from_file(req->file); 4754 if (unlikely(!sock)) 4755 return -ENOTSOCK; 4756 4757 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); 4758 if (unlikely(ret)) 4759 return ret; 4760 4761 msg.msg_name = NULL; 4762 msg.msg_control = NULL; 4763 msg.msg_controllen = 0; 4764 msg.msg_namelen = 0; 4765 4766 flags = req->sr_msg.msg_flags; 4767 if (issue_flags & IO_URING_F_NONBLOCK) 4768 flags |= MSG_DONTWAIT; 4769 if (flags & MSG_WAITALL) 4770 min_ret = iov_iter_count(&msg.msg_iter); 4771 4772 msg.msg_flags = flags; 4773 ret = sock_sendmsg(sock, &msg); 4774 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) 4775 return -EAGAIN; 4776 if (ret == -ERESTARTSYS) 4777 ret = -EINTR; 4778 4779 if (ret < min_ret) 4780 req_set_fail(req); 4781 __io_req_complete(req, issue_flags, ret, 0); 4782 return 0; 4783} 4784 4785static int __io_recvmsg_copy_hdr(struct io_kiocb *req, 4786 struct io_async_msghdr *iomsg) 4787{ 4788 struct io_sr_msg *sr = &req->sr_msg; 4789 struct iovec __user *uiov; 4790 size_t iov_len; 4791 int ret; 4792 4793 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, 4794 &iomsg->uaddr, &uiov, &iov_len); 4795 if (ret) 4796 return ret; 4797 4798 if (req->flags & REQ_F_BUFFER_SELECT) { 4799 if (iov_len > 1) 4800 return -EINVAL; 4801 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) 4802 return -EFAULT; 4803 sr->len = iomsg->fast_iov[0].iov_len; 4804 iomsg->free_iov = NULL; 4805 } else { 4806 iomsg->free_iov = iomsg->fast_iov; 4807 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, 4808 &iomsg->free_iov, &iomsg->msg.msg_iter, 4809 false); 4810 if (ret > 0) 4811 ret = 0; 4812 } 4813 4814 return ret; 4815} 4816 4817#ifdef CONFIG_COMPAT 4818static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, 4819 struct io_async_msghdr *iomsg) 4820{ 4821 struct io_sr_msg *sr = &req->sr_msg; 4822 struct compat_iovec __user *uiov; 4823 
compat_uptr_t ptr; 4824 compat_size_t len; 4825 int ret; 4826 4827 ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, 4828 &ptr, &len); 4829 if (ret) 4830 return ret; 4831 4832 uiov = compat_ptr(ptr); 4833 if (req->flags & REQ_F_BUFFER_SELECT) { 4834 compat_ssize_t clen; 4835 4836 if (len > 1) 4837 return -EINVAL; 4838 if (!access_ok(uiov, sizeof(*uiov))) 4839 return -EFAULT; 4840 if (__get_user(clen, &uiov->iov_len)) 4841 return -EFAULT; 4842 if (clen < 0) 4843 return -EINVAL; 4844 sr->len = clen; 4845 iomsg->free_iov = NULL; 4846 } else { 4847 iomsg->free_iov = iomsg->fast_iov; 4848 ret = __import_iovec(READ, (struct iovec __user *)uiov, len, 4849 UIO_FASTIOV, &iomsg->free_iov, 4850 &iomsg->msg.msg_iter, true); 4851 if (ret < 0) 4852 return ret; 4853 } 4854 4855 return 0; 4856} 4857#endif 4858 4859static int io_recvmsg_copy_hdr(struct io_kiocb *req, 4860 struct io_async_msghdr *iomsg) 4861{ 4862 iomsg->msg.msg_name = &iomsg->addr; 4863 4864#ifdef CONFIG_COMPAT 4865 if (req->ctx->compat) 4866 return __io_compat_recvmsg_copy_hdr(req, iomsg); 4867#endif 4868 4869 return __io_recvmsg_copy_hdr(req, iomsg); 4870} 4871 4872static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, 4873 bool needs_lock) 4874{ 4875 struct io_sr_msg *sr = &req->sr_msg; 4876 struct io_buffer *kbuf; 4877 4878 kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); 4879 if (IS_ERR(kbuf)) 4880 return kbuf; 4881 4882 sr->kbuf = kbuf; 4883 req->flags |= REQ_F_BUFFER_SELECTED; 4884 return kbuf; 4885} 4886 4887static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) 4888{ 4889 return io_put_kbuf(req, req->sr_msg.kbuf); 4890} 4891 4892static int io_recvmsg_prep_async(struct io_kiocb *req) 4893{ 4894 int ret; 4895 4896 ret = io_recvmsg_copy_hdr(req, req->async_data); 4897 if (!ret) 4898 req->flags |= REQ_F_NEED_CLEANUP; 4899 return ret; 4900} 4901 4902static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4903{ 4904 struct io_sr_msg *sr = &req->sr_msg; 4905 4906 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4907 return -EINVAL; 4908 4909 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4910 sr->len = READ_ONCE(sqe->len); 4911 sr->bgid = READ_ONCE(sqe->buf_group); 4912 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 4913 if (sr->msg_flags & MSG_DONTWAIT) 4914 req->flags |= REQ_F_NOWAIT; 4915 4916#ifdef CONFIG_COMPAT 4917 if (req->ctx->compat) 4918 sr->msg_flags |= MSG_CMSG_COMPAT; 4919#endif 4920 return 0; 4921} 4922 4923static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) 4924{ 4925 struct io_async_msghdr iomsg, *kmsg; 4926 struct socket *sock; 4927 struct io_buffer *kbuf; 4928 unsigned flags; 4929 int min_ret = 0; 4930 int ret, cflags = 0; 4931 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4932 4933 sock = sock_from_file(req->file); 4934 if (unlikely(!sock)) 4935 return -ENOTSOCK; 4936 4937 kmsg = req->async_data; 4938 if (!kmsg) { 4939 ret = io_recvmsg_copy_hdr(req, &iomsg); 4940 if (ret) 4941 return ret; 4942 kmsg = &iomsg; 4943 } 4944 4945 if (req->flags & REQ_F_BUFFER_SELECT) { 4946 kbuf = io_recv_buffer_select(req, !force_nonblock); 4947 if (IS_ERR(kbuf)) 4948 return PTR_ERR(kbuf); 4949 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 4950 kmsg->fast_iov[0].iov_len = req->sr_msg.len; 4951 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 4952 1, req->sr_msg.len); 4953 } 4954 4955 flags = req->sr_msg.msg_flags; 4956 if (force_nonblock) 4957 flags |= MSG_DONTWAIT; 4958 if (flags 
& MSG_WAITALL) 4959 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 4960 4961 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, 4962 kmsg->uaddr, flags); 4963 if (force_nonblock && ret == -EAGAIN) 4964 return io_setup_async_msg(req, kmsg); 4965 if (ret == -ERESTARTSYS) 4966 ret = -EINTR; 4967 4968 if (req->flags & REQ_F_BUFFER_SELECTED) 4969 cflags = io_put_recv_kbuf(req); 4970 /* fast path, check for non-NULL to avoid function call */ 4971 if (kmsg->free_iov) 4972 kfree(kmsg->free_iov); 4973 req->flags &= ~REQ_F_NEED_CLEANUP; 4974 if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) 4975 req_set_fail(req); 4976 __io_req_complete(req, issue_flags, ret, cflags); 4977 return 0; 4978} 4979 4980static int io_recv(struct io_kiocb *req, unsigned int issue_flags) 4981{ 4982 struct io_buffer *kbuf; 4983 struct io_sr_msg *sr = &req->sr_msg; 4984 struct msghdr msg; 4985 void __user *buf = sr->buf; 4986 struct socket *sock; 4987 struct iovec iov; 4988 unsigned flags; 4989 int min_ret = 0; 4990 int ret, cflags = 0; 4991 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4992 4993 sock = sock_from_file(req->file); 4994 if (unlikely(!sock)) 4995 return -ENOTSOCK; 4996 4997 if (req->flags & REQ_F_BUFFER_SELECT) { 4998 kbuf = io_recv_buffer_select(req, !force_nonblock); 4999 if (IS_ERR(kbuf)) 5000 return PTR_ERR(kbuf); 5001 buf = u64_to_user_ptr(kbuf->addr); 5002 } 5003 5004 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); 5005 if (unlikely(ret)) 5006 goto out_free; 5007 5008 msg.msg_name = NULL; 5009 msg.msg_control = NULL; 5010 msg.msg_controllen = 0; 5011 msg.msg_namelen = 0; 5012 msg.msg_iocb = NULL; 5013 msg.msg_flags = 0; 5014 5015 flags = req->sr_msg.msg_flags; 5016 if (force_nonblock) 5017 flags |= MSG_DONTWAIT; 5018 if (flags & MSG_WAITALL) 5019 min_ret = iov_iter_count(&msg.msg_iter); 5020 5021 ret = sock_recvmsg(sock, &msg, flags); 5022 if (force_nonblock && ret == -EAGAIN) 5023 return -EAGAIN; 5024 if (ret == -ERESTARTSYS) 5025 ret = -EINTR; 5026out_free: 5027 if (req->flags & REQ_F_BUFFER_SELECTED) 5028 cflags = io_put_recv_kbuf(req); 5029 if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) 5030 req_set_fail(req); 5031 __io_req_complete(req, issue_flags, ret, cflags); 5032 return 0; 5033} 5034 5035static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5036{ 5037 struct io_accept *accept = &req->accept; 5038 5039 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5040 return -EINVAL; 5041 if (sqe->ioprio || sqe->len || sqe->buf_index) 5042 return -EINVAL; 5043 5044 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5045 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 5046 accept->flags = READ_ONCE(sqe->accept_flags); 5047 accept->nofile = rlimit(RLIMIT_NOFILE); 5048 5049 accept->file_slot = READ_ONCE(sqe->file_index); 5050 if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) || 5051 (accept->flags & SOCK_CLOEXEC))) 5052 return -EINVAL; 5053 if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 5054 return -EINVAL; 5055 if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) 5056 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 5057 return 0; 5058} 5059 5060static int io_accept(struct io_kiocb *req, unsigned int issue_flags) 5061{ 5062 struct io_accept *accept = &req->accept; 5063 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5064 unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; 5065 bool fixed = !!accept->file_slot; 5066 struct file *file; 5067 int ret, fd; 5068 5069 if (req->file->f_flags & O_NONBLOCK) 5070 req->flags |= REQ_F_NOWAIT; 5071 5072 if (!fixed) { 5073 fd = __get_unused_fd_flags(accept->flags, accept->nofile); 5074 if (unlikely(fd < 0)) 5075 return fd; 5076 } 5077 file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, 5078 accept->flags); 5079 if (IS_ERR(file)) { 5080 if (!fixed) 5081 put_unused_fd(fd); 5082 ret = PTR_ERR(file); 5083 if (ret == -EAGAIN && force_nonblock) 5084 return -EAGAIN; 5085 if (ret == -ERESTARTSYS) 5086 ret = -EINTR; 5087 req_set_fail(req); 5088 } else if (!fixed) { 5089 fd_install(fd, file); 5090 ret = fd; 5091 } else { 5092 ret = io_install_fixed_file(req, file, issue_flags, 5093 accept->file_slot - 1); 5094 } 5095 __io_req_complete(req, issue_flags, ret, 0); 5096 return 0; 5097} 5098 5099static int io_connect_prep_async(struct io_kiocb *req) 5100{ 5101 struct io_async_connect *io = req->async_data; 5102 struct io_connect *conn = &req->connect; 5103 5104 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); 5105} 5106 5107static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5108{ 5109 struct io_connect *conn = &req->connect; 5110 5111 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5112 return -EINVAL; 5113 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || 5114 sqe->splice_fd_in) 5115 return -EINVAL; 5116 5117 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5118 conn->addr_len = READ_ONCE(sqe->addr2); 5119 return 0; 5120} 5121 5122static int io_connect(struct io_kiocb *req, unsigned int issue_flags) 5123{ 5124 struct io_async_connect __io, *io; 5125 unsigned file_flags; 5126 int ret; 5127 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5128 5129 if (req->async_data) { 5130 io = req->async_data; 5131 } else { 5132 ret = move_addr_to_kernel(req->connect.addr, 5133 req->connect.addr_len, 5134 &__io.address); 5135 if (ret) 5136 goto out; 5137 io = &__io; 5138 } 5139 5140 file_flags = force_nonblock ? 
O_NONBLOCK : 0; 5141 5142 ret = __sys_connect_file(req->file, &io->address, 5143 req->connect.addr_len, file_flags); 5144 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { 5145 if (req->async_data) 5146 return -EAGAIN; 5147 if (io_alloc_async_data(req)) { 5148 ret = -ENOMEM; 5149 goto out; 5150 } 5151 memcpy(req->async_data, &__io, sizeof(__io)); 5152 return -EAGAIN; 5153 } 5154 if (ret == -ERESTARTSYS) 5155 ret = -EINTR; 5156out: 5157 if (ret < 0) 5158 req_set_fail(req); 5159 __io_req_complete(req, issue_flags, ret, 0); 5160 return 0; 5161} 5162#else /* !CONFIG_NET */ 5163#define IO_NETOP_FN(op) \ 5164static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \ 5165{ \ 5166 return -EOPNOTSUPP; \ 5167} 5168 5169#define IO_NETOP_PREP(op) \ 5170IO_NETOP_FN(op) \ 5171static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \ 5172{ \ 5173 return -EOPNOTSUPP; \ 5174} \ 5175 5176#define IO_NETOP_PREP_ASYNC(op) \ 5177IO_NETOP_PREP(op) \ 5178static int io_##op##_prep_async(struct io_kiocb *req) \ 5179{ \ 5180 return -EOPNOTSUPP; \ 5181} 5182 5183IO_NETOP_PREP_ASYNC(sendmsg); 5184IO_NETOP_PREP_ASYNC(recvmsg); 5185IO_NETOP_PREP_ASYNC(connect); 5186IO_NETOP_PREP(accept); 5187IO_NETOP_FN(send); 5188IO_NETOP_FN(recv); 5189#endif /* CONFIG_NET */ 5190 5191struct io_poll_table { 5192 struct poll_table_struct pt; 5193 struct io_kiocb *req; 5194 int nr_entries; 5195 int error; 5196}; 5197 5198static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, 5199 __poll_t mask, io_req_tw_func_t func) 5200{ 5201 /* for instances that support it check for an event match first: */ 5202 if (mask && !(mask & poll->events)) 5203 return 0; 5204 5205 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); 5206 5207 list_del_init(&poll->wait.entry); 5208 5209 req->result = mask; 5210 req->io_task_work.func = func; 5211 5212 /* 5213 * If this fails, then the task is exiting. When a task exits, the 5214 * work gets canceled, so just cancel this request as well instead 5215 * of executing it. We can't safely execute it anyway, as we may not 5216 * have the state needed to do so.
5217 */ 5218 io_req_task_work_add(req); 5219 return 1; 5220} 5221 5222static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) 5223 __acquires(&req->ctx->completion_lock) 5224{ 5225 struct io_ring_ctx *ctx = req->ctx; 5226 5227 /* req->task == current here, checking PF_EXITING is safe */ 5228 if (unlikely(req->task->flags & PF_EXITING)) 5229 WRITE_ONCE(poll->canceled, true); 5230 5231 if (!req->result && !READ_ONCE(poll->canceled)) { 5232 struct poll_table_struct pt = { ._key = poll->events }; 5233 5234 req->result = vfs_poll(req->file, &pt) & poll->events; 5235 } 5236 5237 spin_lock(&ctx->completion_lock); 5238 if (!req->result && !READ_ONCE(poll->canceled)) { 5239 add_wait_queue(poll->head, &poll->wait); 5240 return true; 5241 } 5242 5243 return false; 5244} 5245 5246static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) 5247{ 5248 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ 5249 if (req->opcode == IORING_OP_POLL_ADD) 5250 return req->async_data; 5251 return req->apoll->double_poll; 5252} 5253 5254static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) 5255{ 5256 if (req->opcode == IORING_OP_POLL_ADD) 5257 return &req->poll; 5258 return &req->apoll->poll; 5259} 5260 5261static void io_poll_remove_double(struct io_kiocb *req) 5262 __must_hold(&req->ctx->completion_lock) 5263{ 5264 struct io_poll_iocb *poll = io_poll_get_double(req); 5265 5266 lockdep_assert_held(&req->ctx->completion_lock); 5267 5268 if (poll && poll->head) { 5269 struct wait_queue_head *head = poll->head; 5270 5271 spin_lock_irq(&head->lock); 5272 list_del_init(&poll->wait.entry); 5273 if (poll->wait.private) 5274 req_ref_put(req); 5275 poll->head = NULL; 5276 spin_unlock_irq(&head->lock); 5277 } 5278} 5279 5280static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) 5281 __must_hold(&req->ctx->completion_lock) 5282{ 5283 struct io_ring_ctx *ctx = req->ctx; 5284 unsigned flags = IORING_CQE_F_MORE; 5285 int error; 5286 5287 if (READ_ONCE(req->poll.canceled)) { 5288 error = -ECANCELED; 5289 req->poll.events |= EPOLLONESHOT; 5290 } else { 5291 error = mangle_poll(mask); 5292 } 5293 if (req->poll.events & EPOLLONESHOT) 5294 flags = 0; 5295 if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { 5296 req->poll.done = true; 5297 flags = 0; 5298 } 5299 if (flags & IORING_CQE_F_MORE) 5300 ctx->cq_extra++; 5301 5302 return !(flags & IORING_CQE_F_MORE); 5303} 5304 5305static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask) 5306 __must_hold(&req->ctx->completion_lock) 5307{ 5308 bool done; 5309 5310 done = __io_poll_complete(req, mask); 5311 io_commit_cqring(req->ctx); 5312 return done; 5313} 5314 5315static void io_poll_task_func(struct io_kiocb *req, bool *locked) 5316{ 5317 struct io_ring_ctx *ctx = req->ctx; 5318 struct io_kiocb *nxt; 5319 5320 if (io_poll_rewait(req, &req->poll)) { 5321 spin_unlock(&ctx->completion_lock); 5322 } else { 5323 bool done; 5324 5325 done = __io_poll_complete(req, req->result); 5326 if (done) { 5327 io_poll_remove_double(req); 5328 hash_del(&req->hash_node); 5329 } else { 5330 req->result = 0; 5331 add_wait_queue(req->poll.head, &req->poll.wait); 5332 } 5333 io_commit_cqring(ctx); 5334 spin_unlock(&ctx->completion_lock); 5335 io_cqring_ev_posted(ctx); 5336 5337 if (done) { 5338 nxt = io_put_req_find_next(req); 5339 if (nxt) 5340 io_req_task_submit(nxt, locked); 5341 } 5342 } 5343} 5344 5345static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, 5346 int sync, void 
*key) 5347{ 5348 struct io_kiocb *req = wait->private; 5349 struct io_poll_iocb *poll = io_poll_get_single(req); 5350 __poll_t mask = key_to_poll(key); 5351 unsigned long flags; 5352 5353 /* for instances that support it check for an event match first: */ 5354 if (mask && !(mask & poll->events)) 5355 return 0; 5356 if (!(poll->events & EPOLLONESHOT)) 5357 return poll->wait.func(&poll->wait, mode, sync, key); 5358 5359 list_del_init(&wait->entry); 5360 5361 if (poll->head) { 5362 bool done; 5363 5364 spin_lock_irqsave(&poll->head->lock, flags); 5365 done = list_empty(&poll->wait.entry); 5366 if (!done) 5367 list_del_init(&poll->wait.entry); 5368 /* make sure double remove sees this as being gone */ 5369 wait->private = NULL; 5370 spin_unlock_irqrestore(&poll->head->lock, flags); 5371 if (!done) { 5372 /* use wait func handler, so it matches the rq type */ 5373 poll->wait.func(&poll->wait, mode, sync, key); 5374 } 5375 } 5376 req_ref_put(req); 5377 return 1; 5378} 5379 5380static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, 5381 wait_queue_func_t wake_func) 5382{ 5383 poll->head = NULL; 5384 poll->done = false; 5385 poll->canceled = false; 5386#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) 5387 /* mask in events that we always want/need */ 5388 poll->events = events | IO_POLL_UNMASK; 5389 INIT_LIST_HEAD(&poll->wait.entry); 5390 init_waitqueue_func_entry(&poll->wait, wake_func); 5391} 5392 5393static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, 5394 struct wait_queue_head *head, 5395 struct io_poll_iocb **poll_ptr) 5396{ 5397 struct io_kiocb *req = pt->req; 5398 5399 /* 5400 * The file being polled uses multiple waitqueues for poll handling 5401 * (e.g. one for read, one for write). Setup a separate io_poll_iocb 5402 * if this happens. 5403 */ 5404 if (unlikely(pt->nr_entries)) { 5405 struct io_poll_iocb *poll_one = poll; 5406 5407 /* double add on the same waitqueue head, ignore */ 5408 if (poll_one->head == head) 5409 return; 5410 /* already have a 2nd entry, fail a third attempt */ 5411 if (*poll_ptr) { 5412 if ((*poll_ptr)->head == head) 5413 return; 5414 pt->error = -EINVAL; 5415 return; 5416 } 5417 /* 5418 * Can't handle multishot for double wait for now, turn it 5419 * into one-shot mode. 
5420 */ 5421 if (!(poll_one->events & EPOLLONESHOT)) 5422 poll_one->events |= EPOLLONESHOT; 5423 poll = kmalloc(sizeof(*poll), GFP_ATOMIC); 5424 if (!poll) { 5425 pt->error = -ENOMEM; 5426 return; 5427 } 5428 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); 5429 req_ref_get(req); 5430 poll->wait.private = req; 5431 *poll_ptr = poll; 5432 } 5433 5434 pt->nr_entries++; 5435 poll->head = head; 5436 5437 if (poll->events & EPOLLEXCLUSIVE) 5438 add_wait_queue_exclusive(head, &poll->wait); 5439 else 5440 add_wait_queue(head, &poll->wait); 5441} 5442 5443static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, 5444 struct poll_table_struct *p) 5445{ 5446 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 5447 struct async_poll *apoll = pt->req->apoll; 5448 5449 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); 5450} 5451 5452static void io_async_task_func(struct io_kiocb *req, bool *locked) 5453{ 5454 struct async_poll *apoll = req->apoll; 5455 struct io_ring_ctx *ctx = req->ctx; 5456 5457 trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data); 5458 5459 if (io_poll_rewait(req, &apoll->poll)) { 5460 spin_unlock(&ctx->completion_lock); 5461 return; 5462 } 5463 5464 hash_del(&req->hash_node); 5465 io_poll_remove_double(req); 5466 spin_unlock(&ctx->completion_lock); 5467 5468 if (!READ_ONCE(apoll->poll.canceled)) 5469 io_req_task_submit(req, locked); 5470 else 5471 io_req_complete_failed(req, -ECANCELED); 5472} 5473 5474static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 5475 void *key) 5476{ 5477 struct io_kiocb *req = wait->private; 5478 struct io_poll_iocb *poll = &req->apoll->poll; 5479 5480 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, 5481 key_to_poll(key)); 5482 5483 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); 5484} 5485 5486static void io_poll_req_insert(struct io_kiocb *req) 5487{ 5488 struct io_ring_ctx *ctx = req->ctx; 5489 struct hlist_head *list; 5490 5491 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; 5492 hlist_add_head(&req->hash_node, list); 5493} 5494 5495static __poll_t __io_arm_poll_handler(struct io_kiocb *req, 5496 struct io_poll_iocb *poll, 5497 struct io_poll_table *ipt, __poll_t mask, 5498 wait_queue_func_t wake_func) 5499 __acquires(&ctx->completion_lock) 5500{ 5501 struct io_ring_ctx *ctx = req->ctx; 5502 bool cancel = false; 5503 5504 INIT_HLIST_NODE(&req->hash_node); 5505 io_init_poll_iocb(poll, mask, wake_func); 5506 poll->file = req->file; 5507 poll->wait.private = req; 5508 5509 ipt->pt._key = mask; 5510 ipt->req = req; 5511 ipt->error = 0; 5512 ipt->nr_entries = 0; 5513 5514 mask = vfs_poll(req->file, &ipt->pt) & poll->events; 5515 if (unlikely(!ipt->nr_entries) && !ipt->error) 5516 ipt->error = -EINVAL; 5517 5518 spin_lock(&ctx->completion_lock); 5519 if (ipt->error || (mask && (poll->events & EPOLLONESHOT))) 5520 io_poll_remove_double(req); 5521 if (likely(poll->head)) { 5522 spin_lock_irq(&poll->head->lock); 5523 if (unlikely(list_empty(&poll->wait.entry))) { 5524 if (ipt->error) 5525 cancel = true; 5526 ipt->error = 0; 5527 mask = 0; 5528 } 5529 if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error) 5530 list_del_init(&poll->wait.entry); 5531 else if (cancel) 5532 WRITE_ONCE(poll->canceled, true); 5533 else if (!poll->done) /* actually waiting for an event */ 5534 io_poll_req_insert(req); 5535 spin_unlock_irq(&poll->head->lock); 5536 } 5537 5538 return mask; 5539} 5540 
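/*
 * Illustrative sketch (not part of this file or the kernel build): how the
 * poll-add path above is typically driven from userspace. The liburing
 * helpers used here (io_uring_queue_init(), io_uring_prep_poll_add(), etc.)
 * are assumed purely for illustration.
 *
 *	struct io_uring ring;
 *	struct io_uring_sqe *sqe;
 *	struct io_uring_cqe *cqe;
 *
 *	io_uring_queue_init(8, &ring, 0);
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_poll_add(sqe, fd, POLLIN);
 *	sqe->user_data = 42;
 *	io_uring_submit(&ring);
 *
 *	io_uring_wait_cqe(&ring, &cqe);
 *	// cqe->res carries the signalled poll mask (or -ECANCELED); with
 *	// IORING_POLL_ADD_MULTI set in sqe->len, IORING_CQE_F_MORE in
 *	// cqe->flags indicates further completions will follow.
 *	io_uring_cqe_seen(&ring, cqe);
 */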
5541enum { 5542 IO_APOLL_OK, 5543 IO_APOLL_ABORTED, 5544 IO_APOLL_READY 5545}; 5546 5547static int io_arm_poll_handler(struct io_kiocb *req) 5548{ 5549 const struct io_op_def *def = &io_op_defs[req->opcode]; 5550 struct io_ring_ctx *ctx = req->ctx; 5551 struct async_poll *apoll; 5552 struct io_poll_table ipt; 5553 __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI; 5554 int rw; 5555 5556 if (!req->file || !file_can_poll(req->file)) 5557 return IO_APOLL_ABORTED; 5558 if (req->flags & REQ_F_POLLED) 5559 return IO_APOLL_ABORTED; 5560 if (!def->pollin && !def->pollout) 5561 return IO_APOLL_ABORTED; 5562 5563 if (def->pollin) { 5564 rw = READ; 5565 mask |= POLLIN | POLLRDNORM; 5566 5567 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ 5568 if ((req->opcode == IORING_OP_RECVMSG) && 5569 (req->sr_msg.msg_flags & MSG_ERRQUEUE)) 5570 mask &= ~POLLIN; 5571 } else { 5572 rw = WRITE; 5573 mask |= POLLOUT | POLLWRNORM; 5574 } 5575 5576 /* if we can't nonblock try, then no point in arming a poll handler */ 5577 if (!io_file_supports_nowait(req, rw)) 5578 return IO_APOLL_ABORTED; 5579 5580 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); 5581 if (unlikely(!apoll)) 5582 return IO_APOLL_ABORTED; 5583 apoll->double_poll = NULL; 5584 req->apoll = apoll; 5585 req->flags |= REQ_F_POLLED; 5586 ipt.pt._qproc = io_async_queue_proc; 5587 io_req_set_refcount(req); 5588 5589 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, 5590 io_async_wake); 5591 spin_unlock(&ctx->completion_lock); 5592 if (ret || ipt.error) 5593 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; 5594 5595 trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, 5596 mask, apoll->poll.events); 5597 return IO_APOLL_OK; 5598} 5599 5600static bool __io_poll_remove_one(struct io_kiocb *req, 5601 struct io_poll_iocb *poll, bool do_cancel) 5602 __must_hold(&req->ctx->completion_lock) 5603{ 5604 bool do_complete = false; 5605 5606 if (!poll->head) 5607 return false; 5608 spin_lock_irq(&poll->head->lock); 5609 if (do_cancel) 5610 WRITE_ONCE(poll->canceled, true); 5611 if (!list_empty(&poll->wait.entry)) { 5612 list_del_init(&poll->wait.entry); 5613 do_complete = true; 5614 } 5615 spin_unlock_irq(&poll->head->lock); 5616 hash_del(&req->hash_node); 5617 return do_complete; 5618} 5619 5620static bool io_poll_remove_one(struct io_kiocb *req) 5621 __must_hold(&req->ctx->completion_lock) 5622{ 5623 bool do_complete; 5624 5625 io_poll_remove_double(req); 5626 do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); 5627 5628 if (do_complete) { 5629 io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); 5630 io_commit_cqring(req->ctx); 5631 req_set_fail(req); 5632 io_put_req_deferred(req); 5633 } 5634 return do_complete; 5635} 5636 5637/* 5638 * Returns true if we found and killed one or more poll requests 5639 */ 5640static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, 5641 bool cancel_all) 5642{ 5643 struct hlist_node *tmp; 5644 struct io_kiocb *req; 5645 int posted = 0, i; 5646 5647 spin_lock(&ctx->completion_lock); 5648 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 5649 struct hlist_head *list; 5650 5651 list = &ctx->cancel_hash[i]; 5652 hlist_for_each_entry_safe(req, tmp, list, hash_node) { 5653 if (io_match_task(req, tsk, cancel_all)) 5654 posted += io_poll_remove_one(req); 5655 } 5656 } 5657 spin_unlock(&ctx->completion_lock); 5658 5659 if (posted) 5660 io_cqring_ev_posted(ctx); 5661 5662 return posted != 0; 5663} 5664 5665static struct io_kiocb *io_poll_find(struct 
io_ring_ctx *ctx, __u64 sqe_addr, 5666 bool poll_only) 5667 __must_hold(&ctx->completion_lock) 5668{ 5669 struct hlist_head *list; 5670 struct io_kiocb *req; 5671 5672 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; 5673 hlist_for_each_entry(req, list, hash_node) { 5674 if (sqe_addr != req->user_data) 5675 continue; 5676 if (poll_only && req->opcode != IORING_OP_POLL_ADD) 5677 continue; 5678 return req; 5679 } 5680 return NULL; 5681} 5682 5683static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, 5684 bool poll_only) 5685 __must_hold(&ctx->completion_lock) 5686{ 5687 struct io_kiocb *req; 5688 5689 req = io_poll_find(ctx, sqe_addr, poll_only); 5690 if (!req) 5691 return -ENOENT; 5692 if (io_poll_remove_one(req)) 5693 return 0; 5694 5695 return -EALREADY; 5696} 5697 5698static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, 5699 unsigned int flags) 5700{ 5701 u32 events; 5702 5703 events = READ_ONCE(sqe->poll32_events); 5704#ifdef __BIG_ENDIAN 5705 events = swahw32(events); 5706#endif 5707 if (!(flags & IORING_POLL_ADD_MULTI)) 5708 events |= EPOLLONESHOT; 5709 return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); 5710} 5711 5712static int io_poll_update_prep(struct io_kiocb *req, 5713 const struct io_uring_sqe *sqe) 5714{ 5715 struct io_poll_update *upd = &req->poll_update; 5716 u32 flags; 5717 5718 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5719 return -EINVAL; 5720 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 5721 return -EINVAL; 5722 flags = READ_ONCE(sqe->len); 5723 if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | 5724 IORING_POLL_ADD_MULTI)) 5725 return -EINVAL; 5726 /* meaningless without update */ 5727 if (flags == IORING_POLL_ADD_MULTI) 5728 return -EINVAL; 5729 5730 upd->old_user_data = READ_ONCE(sqe->addr); 5731 upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; 5732 upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; 5733 5734 upd->new_user_data = READ_ONCE(sqe->off); 5735 if (!upd->update_user_data && upd->new_user_data) 5736 return -EINVAL; 5737 if (upd->update_events) 5738 upd->events = io_poll_parse_events(sqe, flags); 5739 else if (sqe->poll32_events) 5740 return -EINVAL; 5741 5742 return 0; 5743} 5744 5745static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 5746 void *key) 5747{ 5748 struct io_kiocb *req = wait->private; 5749 struct io_poll_iocb *poll = &req->poll; 5750 5751 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); 5752} 5753 5754static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, 5755 struct poll_table_struct *p) 5756{ 5757 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 5758 5759 __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data); 5760} 5761 5762static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5763{ 5764 struct io_poll_iocb *poll = &req->poll; 5765 u32 flags; 5766 5767 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5768 return -EINVAL; 5769 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) 5770 return -EINVAL; 5771 flags = READ_ONCE(sqe->len); 5772 if (flags & ~IORING_POLL_ADD_MULTI) 5773 return -EINVAL; 5774 5775 io_req_set_refcount(req); 5776 poll->events = io_poll_parse_events(sqe, flags); 5777 return 0; 5778} 5779 5780static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) 5781{ 5782 struct io_poll_iocb *poll = &req->poll; 5783 struct 
io_ring_ctx *ctx = req->ctx; 5784 struct io_poll_table ipt; 5785 __poll_t mask; 5786 5787 ipt.pt._qproc = io_poll_queue_proc; 5788 5789 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, 5790 io_poll_wake); 5791 5792 if (mask) { /* no async, we'd stolen it */ 5793 ipt.error = 0; 5794 io_poll_complete(req, mask); 5795 } 5796 spin_unlock(&ctx->completion_lock); 5797 5798 if (mask) { 5799 io_cqring_ev_posted(ctx); 5800 if (poll->events & EPOLLONESHOT) 5801 io_put_req(req); 5802 } 5803 return ipt.error; 5804} 5805 5806static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) 5807{ 5808 struct io_ring_ctx *ctx = req->ctx; 5809 struct io_kiocb *preq; 5810 bool completing; 5811 int ret; 5812 5813 spin_lock(&ctx->completion_lock); 5814 preq = io_poll_find(ctx, req->poll_update.old_user_data, true); 5815 if (!preq) { 5816 ret = -ENOENT; 5817 goto err; 5818 } 5819 5820 if (!req->poll_update.update_events && !req->poll_update.update_user_data) { 5821 completing = true; 5822 ret = io_poll_remove_one(preq) ? 0 : -EALREADY; 5823 goto err; 5824 } 5825 5826 /* 5827 * Don't allow racy completion with singleshot, as we cannot safely 5828 * update those. For multishot, if we're racing with completion, just 5829 * let completion re-add it. 5830 */ 5831 completing = !__io_poll_remove_one(preq, &preq->poll, false); 5832 if (completing && (preq->poll.events & EPOLLONESHOT)) { 5833 ret = -EALREADY; 5834 goto err; 5835 } 5836 /* we now have a detached poll request. reissue. */ 5837 ret = 0; 5838err: 5839 if (ret < 0) { 5840 spin_unlock(&ctx->completion_lock); 5841 req_set_fail(req); 5842 io_req_complete(req, ret); 5843 return 0; 5844 } 5845 /* only mask one event flags, keep behavior flags */ 5846 if (req->poll_update.update_events) { 5847 preq->poll.events &= ~0xffff; 5848 preq->poll.events |= req->poll_update.events & 0xffff; 5849 preq->poll.events |= IO_POLL_UNMASK; 5850 } 5851 if (req->poll_update.update_user_data) 5852 preq->user_data = req->poll_update.new_user_data; 5853 spin_unlock(&ctx->completion_lock); 5854 5855 /* complete update request, we're done with it */ 5856 io_req_complete(req, ret); 5857 5858 if (!completing) { 5859 ret = io_poll_add(preq, issue_flags); 5860 if (ret < 0) { 5861 req_set_fail(preq); 5862 io_req_complete(preq, ret); 5863 } 5864 } 5865 return 0; 5866} 5867 5868static void io_req_task_timeout(struct io_kiocb *req, bool *locked) 5869{ 5870 req_set_fail(req); 5871 io_req_complete_post(req, -ETIME, 0); 5872} 5873 5874static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) 5875{ 5876 struct io_timeout_data *data = container_of(timer, 5877 struct io_timeout_data, timer); 5878 struct io_kiocb *req = data->req; 5879 struct io_ring_ctx *ctx = req->ctx; 5880 unsigned long flags; 5881 5882 spin_lock_irqsave(&ctx->timeout_lock, flags); 5883 list_del_init(&req->timeout.list); 5884 atomic_set(&req->ctx->cq_timeouts, 5885 atomic_read(&req->ctx->cq_timeouts) + 1); 5886 spin_unlock_irqrestore(&ctx->timeout_lock, flags); 5887 5888 req->io_task_work.func = io_req_task_timeout; 5889 io_req_task_work_add(req); 5890 return HRTIMER_NORESTART; 5891} 5892 5893static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, 5894 __u64 user_data) 5895 __must_hold(&ctx->timeout_lock) 5896{ 5897 struct io_timeout_data *io; 5898 struct io_kiocb *req; 5899 bool found = false; 5900 5901 list_for_each_entry(req, &ctx->timeout_list, timeout.list) { 5902 found = user_data == req->user_data; 5903 if (found) 5904 break; 5905 } 5906 if (!found) 5907 return ERR_PTR(-ENOENT); 
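	/*
	 * hrtimer_try_to_cancel() returns -1 when the timer callback is
	 * already running; the timeout is then on its way to completing on
	 * its own and can't be extracted, hence the ERR_PTR(-EALREADY).
	 */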
5908 5909 io = req->async_data; 5910 if (hrtimer_try_to_cancel(&io->timer) == -1) 5911 return ERR_PTR(-EALREADY); 5912 list_del_init(&req->timeout.list); 5913 return req; 5914} 5915 5916static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 5917 __must_hold(&ctx->completion_lock) 5918 __must_hold(&ctx->timeout_lock) 5919{ 5920 struct io_kiocb *req = io_timeout_extract(ctx, user_data); 5921 5922 if (IS_ERR(req)) 5923 return PTR_ERR(req); 5924 5925 req_set_fail(req); 5926 io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); 5927 io_put_req_deferred(req); 5928 return 0; 5929} 5930 5931static clockid_t io_timeout_get_clock(struct io_timeout_data *data) 5932{ 5933 switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { 5934 case IORING_TIMEOUT_BOOTTIME: 5935 return CLOCK_BOOTTIME; 5936 case IORING_TIMEOUT_REALTIME: 5937 return CLOCK_REALTIME; 5938 default: 5939 /* can't happen, vetted at prep time */ 5940 WARN_ON_ONCE(1); 5941 fallthrough; 5942 case 0: 5943 return CLOCK_MONOTONIC; 5944 } 5945} 5946 5947static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 5948 struct timespec64 *ts, enum hrtimer_mode mode) 5949 __must_hold(&ctx->timeout_lock) 5950{ 5951 struct io_timeout_data *io; 5952 struct io_kiocb *req; 5953 bool found = false; 5954 5955 list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { 5956 found = user_data == req->user_data; 5957 if (found) 5958 break; 5959 } 5960 if (!found) 5961 return -ENOENT; 5962 5963 io = req->async_data; 5964 if (hrtimer_try_to_cancel(&io->timer) == -1) 5965 return -EALREADY; 5966 hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); 5967 io->timer.function = io_link_timeout_fn; 5968 hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); 5969 return 0; 5970} 5971 5972static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 5973 struct timespec64 *ts, enum hrtimer_mode mode) 5974 __must_hold(&ctx->timeout_lock) 5975{ 5976 struct io_kiocb *req = io_timeout_extract(ctx, user_data); 5977 struct io_timeout_data *data; 5978 5979 if (IS_ERR(req)) 5980 return PTR_ERR(req); 5981 5982 req->timeout.off = 0; /* noseq */ 5983 data = req->async_data; 5984 list_add_tail(&req->timeout.list, &ctx->timeout_list); 5985 hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); 5986 data->timer.function = io_timeout_fn; 5987 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); 5988 return 0; 5989} 5990 5991static int io_timeout_remove_prep(struct io_kiocb *req, 5992 const struct io_uring_sqe *sqe) 5993{ 5994 struct io_timeout_rem *tr = &req->timeout_rem; 5995 5996 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5997 return -EINVAL; 5998 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 5999 return -EINVAL; 6000 if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) 6001 return -EINVAL; 6002 6003 tr->ltimeout = false; 6004 tr->addr = READ_ONCE(sqe->addr); 6005 tr->flags = READ_ONCE(sqe->timeout_flags); 6006 if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { 6007 if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) 6008 return -EINVAL; 6009 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) 6010 tr->ltimeout = true; 6011 if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) 6012 return -EINVAL; 6013 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) 6014 return -EFAULT; 6015 } else if (tr->flags) { 6016 /* timeout removal doesn't support flags */ 6017 return -EINVAL; 6018 } 6019 6020 return 0; 6021} 6022 6023static inline enum hrtimer_mode 
io_translate_timeout_mode(unsigned int flags) 6024{ 6025 return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS 6026 : HRTIMER_MODE_REL; 6027} 6028 6029/* 6030 * Remove or update an existing timeout command 6031 */ 6032static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) 6033{ 6034 struct io_timeout_rem *tr = &req->timeout_rem; 6035 struct io_ring_ctx *ctx = req->ctx; 6036 int ret; 6037 6038 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { 6039 spin_lock(&ctx->completion_lock); 6040 spin_lock_irq(&ctx->timeout_lock); 6041 ret = io_timeout_cancel(ctx, tr->addr); 6042 spin_unlock_irq(&ctx->timeout_lock); 6043 spin_unlock(&ctx->completion_lock); 6044 } else { 6045 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); 6046 6047 spin_lock_irq(&ctx->timeout_lock); 6048 if (tr->ltimeout) 6049 ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); 6050 else 6051 ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); 6052 spin_unlock_irq(&ctx->timeout_lock); 6053 } 6054 6055 if (ret < 0) 6056 req_set_fail(req); 6057 io_req_complete_post(req, ret, 0); 6058 return 0; 6059} 6060 6061static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, 6062 bool is_timeout_link) 6063{ 6064 struct io_timeout_data *data; 6065 unsigned flags; 6066 u32 off = READ_ONCE(sqe->off); 6067 6068 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6069 return -EINVAL; 6070 if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || 6071 sqe->splice_fd_in) 6072 return -EINVAL; 6073 if (off && is_timeout_link) 6074 return -EINVAL; 6075 flags = READ_ONCE(sqe->timeout_flags); 6076 if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK)) 6077 return -EINVAL; 6078 /* more than one clock specified is invalid, obviously */ 6079 if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) 6080 return -EINVAL; 6081 6082 INIT_LIST_HEAD(&req->timeout.list); 6083 req->timeout.off = off; 6084 if (unlikely(off && !req->ctx->off_timeout_used)) 6085 req->ctx->off_timeout_used = true; 6086 6087 if (!req->async_data && io_alloc_async_data(req)) 6088 return -ENOMEM; 6089 6090 data = req->async_data; 6091 data->req = req; 6092 data->flags = flags; 6093 6094 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) 6095 return -EFAULT; 6096 6097 data->mode = io_translate_timeout_mode(flags); 6098 hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); 6099 6100 if (is_timeout_link) { 6101 struct io_submit_link *link = &req->ctx->submit_state.link; 6102 6103 if (!link->head) 6104 return -EINVAL; 6105 if (link->last->opcode == IORING_OP_LINK_TIMEOUT) 6106 return -EINVAL; 6107 req->timeout.head = link->last; 6108 link->last->flags |= REQ_F_ARM_LTIMEOUT; 6109 } 6110 return 0; 6111} 6112 6113static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) 6114{ 6115 struct io_ring_ctx *ctx = req->ctx; 6116 struct io_timeout_data *data = req->async_data; 6117 struct list_head *entry; 6118 u32 tail, off = req->timeout.off; 6119 6120 spin_lock_irq(&ctx->timeout_lock); 6121 6122 /* 6123 * sqe->off holds how many events that need to occur for this 6124 * timeout event to be satisfied. If it isn't set, then this is 6125 * a pure timeout request, sequence isn't used. 6126 */ 6127 if (io_is_timeout_noseq(req)) { 6128 entry = ctx->timeout_list.prev; 6129 goto add; 6130 } 6131 6132 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 6133 req->timeout.target_seq = tail + off; 6134 6135 /* Update the last seq here in case io_flush_timeouts() hasn't. 
6136 * This is safe because ->completion_lock is held, and submissions 6137 * and completions are never mixed in the same ->completion_lock section. 6138 */ 6139 ctx->cq_last_tm_flush = tail; 6140 6141 /* 6142 * Insertion sort, ensuring the first entry in the list is always 6143 * the one we need first. 6144 */ 6145 list_for_each_prev(entry, &ctx->timeout_list) { 6146 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, 6147 timeout.list); 6148 6149 if (io_is_timeout_noseq(nxt)) 6150 continue; 6151 /* nxt.seq is behind @tail, otherwise would've been completed */ 6152 if (off >= nxt->timeout.target_seq - tail) 6153 break; 6154 } 6155add: 6156 list_add(&req->timeout.list, entry); 6157 data->timer.function = io_timeout_fn; 6158 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); 6159 spin_unlock_irq(&ctx->timeout_lock); 6160 return 0; 6161} 6162 6163struct io_cancel_data { 6164 struct io_ring_ctx *ctx; 6165 u64 user_data; 6166}; 6167 6168static bool io_cancel_cb(struct io_wq_work *work, void *data) 6169{ 6170 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6171 struct io_cancel_data *cd = data; 6172 6173 return req->ctx == cd->ctx && req->user_data == cd->user_data; 6174} 6175 6176static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, 6177 struct io_ring_ctx *ctx) 6178{ 6179 struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; 6180 enum io_wq_cancel cancel_ret; 6181 int ret = 0; 6182 6183 if (!tctx || !tctx->io_wq) 6184 return -ENOENT; 6185 6186 cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); 6187 switch (cancel_ret) { 6188 case IO_WQ_CANCEL_OK: 6189 ret = 0; 6190 break; 6191 case IO_WQ_CANCEL_RUNNING: 6192 ret = -EALREADY; 6193 break; 6194 case IO_WQ_CANCEL_NOTFOUND: 6195 ret = -ENOENT; 6196 break; 6197 } 6198 6199 return ret; 6200} 6201 6202static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) 6203{ 6204 struct io_ring_ctx *ctx = req->ctx; 6205 int ret; 6206 6207 WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); 6208 6209 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); 6210 if (ret != -ENOENT) 6211 return ret; 6212 6213 spin_lock(&ctx->completion_lock); 6214 spin_lock_irq(&ctx->timeout_lock); 6215 ret = io_timeout_cancel(ctx, sqe_addr); 6216 spin_unlock_irq(&ctx->timeout_lock); 6217 if (ret != -ENOENT) 6218 goto out; 6219 ret = io_poll_cancel(ctx, sqe_addr, false); 6220out: 6221 spin_unlock(&ctx->completion_lock); 6222 return ret; 6223} 6224 6225static int io_async_cancel_prep(struct io_kiocb *req, 6226 const struct io_uring_sqe *sqe) 6227{ 6228 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6229 return -EINVAL; 6230 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6231 return -EINVAL; 6232 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || 6233 sqe->splice_fd_in) 6234 return -EINVAL; 6235 6236 req->cancel.addr = READ_ONCE(sqe->addr); 6237 return 0; 6238} 6239 6240static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) 6241{ 6242 struct io_ring_ctx *ctx = req->ctx; 6243 u64 sqe_addr = req->cancel.addr; 6244 struct io_tctx_node *node; 6245 int ret; 6246 6247 ret = io_try_cancel_userdata(req, sqe_addr); 6248 if (ret != -ENOENT) 6249 goto done; 6250 6251 /* slow path, try all io-wq's */ 6252 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6253 ret = -ENOENT; 6254 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 6255 struct io_uring_task *tctx = node->task->io_uring; 
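		/*
		 * Ask each registered task context's io-wq to cancel the
		 * request; stop at the first one that returns anything
		 * other than -ENOENT.
		 */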
6256 6257 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); 6258 if (ret != -ENOENT) 6259 break; 6260 } 6261 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6262done: 6263 if (ret < 0) 6264 req_set_fail(req); 6265 io_req_complete_post(req, ret, 0); 6266 return 0; 6267} 6268 6269static int io_rsrc_update_prep(struct io_kiocb *req, 6270 const struct io_uring_sqe *sqe) 6271{ 6272 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6273 return -EINVAL; 6274 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) 6275 return -EINVAL; 6276 6277 req->rsrc_update.offset = READ_ONCE(sqe->off); 6278 req->rsrc_update.nr_args = READ_ONCE(sqe->len); 6279 if (!req->rsrc_update.nr_args) 6280 return -EINVAL; 6281 req->rsrc_update.arg = READ_ONCE(sqe->addr); 6282 return 0; 6283} 6284 6285static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 6286{ 6287 struct io_ring_ctx *ctx = req->ctx; 6288 struct io_uring_rsrc_update2 up; 6289 int ret; 6290 6291 if (issue_flags & IO_URING_F_NONBLOCK) 6292 return -EAGAIN; 6293 6294 up.offset = req->rsrc_update.offset; 6295 up.data = req->rsrc_update.arg; 6296 up.nr = 0; 6297 up.tags = 0; 6298 up.resv = 0; 6299 6300 mutex_lock(&ctx->uring_lock); 6301 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, 6302 &up, req->rsrc_update.nr_args); 6303 mutex_unlock(&ctx->uring_lock); 6304 6305 if (ret < 0) 6306 req_set_fail(req); 6307 __io_req_complete(req, issue_flags, ret, 0); 6308 return 0; 6309} 6310 6311static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 6312{ 6313 switch (req->opcode) { 6314 case IORING_OP_NOP: 6315 return 0; 6316 case IORING_OP_READV: 6317 case IORING_OP_READ_FIXED: 6318 case IORING_OP_READ: 6319 return io_read_prep(req, sqe); 6320 case IORING_OP_WRITEV: 6321 case IORING_OP_WRITE_FIXED: 6322 case IORING_OP_WRITE: 6323 return io_write_prep(req, sqe); 6324 case IORING_OP_POLL_ADD: 6325 return io_poll_add_prep(req, sqe); 6326 case IORING_OP_POLL_REMOVE: 6327 return io_poll_update_prep(req, sqe); 6328 case IORING_OP_FSYNC: 6329 return io_fsync_prep(req, sqe); 6330 case IORING_OP_SYNC_FILE_RANGE: 6331 return io_sfr_prep(req, sqe); 6332 case IORING_OP_SENDMSG: 6333 case IORING_OP_SEND: 6334 return io_sendmsg_prep(req, sqe); 6335 case IORING_OP_RECVMSG: 6336 case IORING_OP_RECV: 6337 return io_recvmsg_prep(req, sqe); 6338 case IORING_OP_CONNECT: 6339 return io_connect_prep(req, sqe); 6340 case IORING_OP_TIMEOUT: 6341 return io_timeout_prep(req, sqe, false); 6342 case IORING_OP_TIMEOUT_REMOVE: 6343 return io_timeout_remove_prep(req, sqe); 6344 case IORING_OP_ASYNC_CANCEL: 6345 return io_async_cancel_prep(req, sqe); 6346 case IORING_OP_LINK_TIMEOUT: 6347 return io_timeout_prep(req, sqe, true); 6348 case IORING_OP_ACCEPT: 6349 return io_accept_prep(req, sqe); 6350 case IORING_OP_FALLOCATE: 6351 return io_fallocate_prep(req, sqe); 6352 case IORING_OP_OPENAT: 6353 return io_openat_prep(req, sqe); 6354 case IORING_OP_CLOSE: 6355 return io_close_prep(req, sqe); 6356 case IORING_OP_FILES_UPDATE: 6357 return io_rsrc_update_prep(req, sqe); 6358 case IORING_OP_STATX: 6359 return io_statx_prep(req, sqe); 6360 case IORING_OP_FADVISE: 6361 return io_fadvise_prep(req, sqe); 6362 case IORING_OP_MADVISE: 6363 return io_madvise_prep(req, sqe); 6364 case IORING_OP_OPENAT2: 6365 return io_openat2_prep(req, sqe); 6366 case IORING_OP_EPOLL_CTL: 6367 return io_epoll_ctl_prep(req, sqe); 6368 case IORING_OP_SPLICE: 6369 return io_splice_prep(req, sqe); 6370 case IORING_OP_PROVIDE_BUFFERS: 6371 return 
io_provide_buffers_prep(req, sqe); 6372 case IORING_OP_REMOVE_BUFFERS: 6373 return io_remove_buffers_prep(req, sqe); 6374 case IORING_OP_TEE: 6375 return io_tee_prep(req, sqe); 6376 case IORING_OP_SHUTDOWN: 6377 return io_shutdown_prep(req, sqe); 6378 case IORING_OP_RENAMEAT: 6379 return io_renameat_prep(req, sqe); 6380 case IORING_OP_UNLINKAT: 6381 return io_unlinkat_prep(req, sqe); 6382 case IORING_OP_MKDIRAT: 6383 return io_mkdirat_prep(req, sqe); 6384 case IORING_OP_SYMLINKAT: 6385 return io_symlinkat_prep(req, sqe); 6386 case IORING_OP_LINKAT: 6387 return io_linkat_prep(req, sqe); 6388 } 6389 6390 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", 6391 req->opcode); 6392 return -EINVAL; 6393} 6394 6395static int io_req_prep_async(struct io_kiocb *req) 6396{ 6397 if (!io_op_defs[req->opcode].needs_async_setup) 6398 return 0; 6399 if (WARN_ON_ONCE(req->async_data)) 6400 return -EFAULT; 6401 if (io_alloc_async_data(req)) 6402 return -EAGAIN; 6403 6404 switch (req->opcode) { 6405 case IORING_OP_READV: 6406 return io_rw_prep_async(req, READ); 6407 case IORING_OP_WRITEV: 6408 return io_rw_prep_async(req, WRITE); 6409 case IORING_OP_SENDMSG: 6410 return io_sendmsg_prep_async(req); 6411 case IORING_OP_RECVMSG: 6412 return io_recvmsg_prep_async(req); 6413 case IORING_OP_CONNECT: 6414 return io_connect_prep_async(req); 6415 } 6416 printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", 6417 req->opcode); 6418 return -EFAULT; 6419} 6420 6421static u32 io_get_sequence(struct io_kiocb *req) 6422{ 6423 u32 seq = req->ctx->cached_sq_head; 6424 6425 /* need original cached_sq_head, but it was increased for each req */ 6426 io_for_each_link(req, req) 6427 seq--; 6428 return seq; 6429} 6430 6431static bool io_drain_req(struct io_kiocb *req) 6432{ 6433 struct io_kiocb *pos; 6434 struct io_ring_ctx *ctx = req->ctx; 6435 struct io_defer_entry *de; 6436 int ret; 6437 u32 seq; 6438 6439 if (req->flags & REQ_F_FAIL) { 6440 io_req_complete_fail_submit(req); 6441 return true; 6442 } 6443 6444 /* 6445 * If we need to drain a request in the middle of a link, drain the 6446 * head request and the next request/link after the current link. 6447 * Considering sequential execution of links, IOSQE_IO_DRAIN will be 6448 * maintained for every request of our link. 6449 */ 6450 if (ctx->drain_next) { 6451 req->flags |= REQ_F_IO_DRAIN; 6452 ctx->drain_next = false; 6453 } 6454 /* not interested in head, start from the first linked */ 6455 io_for_each_link(pos, req->link) { 6456 if (pos->flags & REQ_F_IO_DRAIN) { 6457 ctx->drain_next = true; 6458 req->flags |= REQ_F_IO_DRAIN; 6459 break; 6460 } 6461 } 6462 6463 /* Still need defer if there is pending req in defer list. 
*/ 6464 if (likely(list_empty_careful(&ctx->defer_list) && 6465 !(req->flags & REQ_F_IO_DRAIN))) { 6466 ctx->drain_active = false; 6467 return false; 6468 } 6469 6470 seq = io_get_sequence(req); 6471 /* Still a chance to pass the sequence check */ 6472 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) 6473 return false; 6474 6475 ret = io_req_prep_async(req); 6476 if (ret) 6477 goto fail; 6478 io_prep_async_link(req); 6479 de = kmalloc(sizeof(*de), GFP_KERNEL); 6480 if (!de) { 6481 ret = -ENOMEM; 6482fail: 6483 io_req_complete_failed(req, ret); 6484 return true; 6485 } 6486 6487 spin_lock(&ctx->completion_lock); 6488 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { 6489 spin_unlock(&ctx->completion_lock); 6490 kfree(de); 6491 io_queue_async_work(req, NULL); 6492 return true; 6493 } 6494 6495 trace_io_uring_defer(ctx, req, req->user_data); 6496 de->req = req; 6497 de->seq = seq; 6498 list_add_tail(&de->list, &ctx->defer_list); 6499 spin_unlock(&ctx->completion_lock); 6500 return true; 6501} 6502 6503static void io_clean_op(struct io_kiocb *req) 6504{ 6505 if (req->flags & REQ_F_BUFFER_SELECTED) { 6506 switch (req->opcode) { 6507 case IORING_OP_READV: 6508 case IORING_OP_READ_FIXED: 6509 case IORING_OP_READ: 6510 kfree((void *)(unsigned long)req->rw.addr); 6511 break; 6512 case IORING_OP_RECVMSG: 6513 case IORING_OP_RECV: 6514 kfree(req->sr_msg.kbuf); 6515 break; 6516 } 6517 } 6518 6519 if (req->flags & REQ_F_NEED_CLEANUP) { 6520 switch (req->opcode) { 6521 case IORING_OP_READV: 6522 case IORING_OP_READ_FIXED: 6523 case IORING_OP_READ: 6524 case IORING_OP_WRITEV: 6525 case IORING_OP_WRITE_FIXED: 6526 case IORING_OP_WRITE: { 6527 struct io_async_rw *io = req->async_data; 6528 6529 kfree(io->free_iovec); 6530 break; 6531 } 6532 case IORING_OP_RECVMSG: 6533 case IORING_OP_SENDMSG: { 6534 struct io_async_msghdr *io = req->async_data; 6535 6536 kfree(io->free_iov); 6537 break; 6538 } 6539 case IORING_OP_SPLICE: 6540 case IORING_OP_TEE: 6541 if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED)) 6542 io_put_file(req->splice.file_in); 6543 break; 6544 case IORING_OP_OPENAT: 6545 case IORING_OP_OPENAT2: 6546 if (req->open.filename) 6547 putname(req->open.filename); 6548 break; 6549 case IORING_OP_RENAMEAT: 6550 putname(req->rename.oldpath); 6551 putname(req->rename.newpath); 6552 break; 6553 case IORING_OP_UNLINKAT: 6554 putname(req->unlink.filename); 6555 break; 6556 case IORING_OP_MKDIRAT: 6557 putname(req->mkdir.filename); 6558 break; 6559 case IORING_OP_SYMLINKAT: 6560 putname(req->symlink.oldpath); 6561 putname(req->symlink.newpath); 6562 break; 6563 case IORING_OP_LINKAT: 6564 putname(req->hardlink.oldpath); 6565 putname(req->hardlink.newpath); 6566 break; 6567 } 6568 } 6569 if ((req->flags & REQ_F_POLLED) && req->apoll) { 6570 kfree(req->apoll->double_poll); 6571 kfree(req->apoll); 6572 req->apoll = NULL; 6573 } 6574 if (req->flags & REQ_F_INFLIGHT) { 6575 struct io_uring_task *tctx = req->task->io_uring; 6576 6577 atomic_dec(&tctx->inflight_tracked); 6578 } 6579 if (req->flags & REQ_F_CREDS) 6580 put_cred(req->creds); 6581 6582 req->flags &= ~IO_REQ_CLEAN_FLAGS; 6583} 6584 6585static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) 6586{ 6587 struct io_ring_ctx *ctx = req->ctx; 6588 const struct cred *creds = NULL; 6589 int ret; 6590 6591 if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) 6592 creds = override_creds(req->creds); 6593 6594 switch (req->opcode) { 6595 case IORING_OP_NOP: 6596 ret = io_nop(req, issue_flags); 6597 
break; 6598 case IORING_OP_READV: 6599 case IORING_OP_READ_FIXED: 6600 case IORING_OP_READ: 6601 ret = io_read(req, issue_flags); 6602 break; 6603 case IORING_OP_WRITEV: 6604 case IORING_OP_WRITE_FIXED: 6605 case IORING_OP_WRITE: 6606 ret = io_write(req, issue_flags); 6607 break; 6608 case IORING_OP_FSYNC: 6609 ret = io_fsync(req, issue_flags); 6610 break; 6611 case IORING_OP_POLL_ADD: 6612 ret = io_poll_add(req, issue_flags); 6613 break; 6614 case IORING_OP_POLL_REMOVE: 6615 ret = io_poll_update(req, issue_flags); 6616 break; 6617 case IORING_OP_SYNC_FILE_RANGE: 6618 ret = io_sync_file_range(req, issue_flags); 6619 break; 6620 case IORING_OP_SENDMSG: 6621 ret = io_sendmsg(req, issue_flags); 6622 break; 6623 case IORING_OP_SEND: 6624 ret = io_send(req, issue_flags); 6625 break; 6626 case IORING_OP_RECVMSG: 6627 ret = io_recvmsg(req, issue_flags); 6628 break; 6629 case IORING_OP_RECV: 6630 ret = io_recv(req, issue_flags); 6631 break; 6632 case IORING_OP_TIMEOUT: 6633 ret = io_timeout(req, issue_flags); 6634 break; 6635 case IORING_OP_TIMEOUT_REMOVE: 6636 ret = io_timeout_remove(req, issue_flags); 6637 break; 6638 case IORING_OP_ACCEPT: 6639 ret = io_accept(req, issue_flags); 6640 break; 6641 case IORING_OP_CONNECT: 6642 ret = io_connect(req, issue_flags); 6643 break; 6644 case IORING_OP_ASYNC_CANCEL: 6645 ret = io_async_cancel(req, issue_flags); 6646 break; 6647 case IORING_OP_FALLOCATE: 6648 ret = io_fallocate(req, issue_flags); 6649 break; 6650 case IORING_OP_OPENAT: 6651 ret = io_openat(req, issue_flags); 6652 break; 6653 case IORING_OP_CLOSE: 6654 ret = io_close(req, issue_flags); 6655 break; 6656 case IORING_OP_FILES_UPDATE: 6657 ret = io_files_update(req, issue_flags); 6658 break; 6659 case IORING_OP_STATX: 6660 ret = io_statx(req, issue_flags); 6661 break; 6662 case IORING_OP_FADVISE: 6663 ret = io_fadvise(req, issue_flags); 6664 break; 6665 case IORING_OP_MADVISE: 6666 ret = io_madvise(req, issue_flags); 6667 break; 6668 case IORING_OP_OPENAT2: 6669 ret = io_openat2(req, issue_flags); 6670 break; 6671 case IORING_OP_EPOLL_CTL: 6672 ret = io_epoll_ctl(req, issue_flags); 6673 break; 6674 case IORING_OP_SPLICE: 6675 ret = io_splice(req, issue_flags); 6676 break; 6677 case IORING_OP_PROVIDE_BUFFERS: 6678 ret = io_provide_buffers(req, issue_flags); 6679 break; 6680 case IORING_OP_REMOVE_BUFFERS: 6681 ret = io_remove_buffers(req, issue_flags); 6682 break; 6683 case IORING_OP_TEE: 6684 ret = io_tee(req, issue_flags); 6685 break; 6686 case IORING_OP_SHUTDOWN: 6687 ret = io_shutdown(req, issue_flags); 6688 break; 6689 case IORING_OP_RENAMEAT: 6690 ret = io_renameat(req, issue_flags); 6691 break; 6692 case IORING_OP_UNLINKAT: 6693 ret = io_unlinkat(req, issue_flags); 6694 break; 6695 case IORING_OP_MKDIRAT: 6696 ret = io_mkdirat(req, issue_flags); 6697 break; 6698 case IORING_OP_SYMLINKAT: 6699 ret = io_symlinkat(req, issue_flags); 6700 break; 6701 case IORING_OP_LINKAT: 6702 ret = io_linkat(req, issue_flags); 6703 break; 6704 default: 6705 ret = -EINVAL; 6706 break; 6707 } 6708 6709 if (creds) 6710 revert_creds(creds); 6711 if (ret) 6712 return ret; 6713 /* If the op doesn't have a file, we're not polling for it */ 6714 if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) 6715 io_iopoll_req_issued(req); 6716 6717 return 0; 6718} 6719 6720static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) 6721{ 6722 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6723 6724 req = io_put_req_find_next(req); 6725 return req ? 
&req->work : NULL; 6726} 6727 6728static void io_wq_submit_work(struct io_wq_work *work) 6729{ 6730 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6731 struct io_kiocb *timeout; 6732 int ret = 0; 6733 6734 /* one will be dropped by ->io_free_work() after returning to io-wq */ 6735 if (!(req->flags & REQ_F_REFCOUNT)) 6736 __io_req_set_refcount(req, 2); 6737 else 6738 req_ref_get(req); 6739 6740 timeout = io_prep_linked_timeout(req); 6741 if (timeout) 6742 io_queue_linked_timeout(timeout); 6743 6744 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ 6745 if (work->flags & IO_WQ_WORK_CANCEL) 6746 ret = -ECANCELED; 6747 6748 if (!ret) { 6749 do { 6750 ret = io_issue_sqe(req, 0); 6751 /* 6752 * We can get EAGAIN for polled IO even though we're 6753 * forcing a sync submission from here, since we can't 6754 * wait for request slots on the block side. 6755 */ 6756 if (ret != -EAGAIN) 6757 break; 6758 cond_resched(); 6759 } while (1); 6760 } 6761 6762 /* avoid locking problems by failing it from a clean context */ 6763 if (ret) 6764 io_req_task_queue_fail(req, ret); 6765} 6766 6767static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, 6768 unsigned i) 6769{ 6770 return &table->files[i]; 6771} 6772 6773static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, 6774 int index) 6775{ 6776 struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index); 6777 6778 return (struct file *) (slot->file_ptr & FFS_MASK); 6779} 6780 6781static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) 6782{ 6783 unsigned long file_ptr = (unsigned long) file; 6784 6785 if (__io_file_supports_nowait(file, READ)) 6786 file_ptr |= FFS_ASYNC_READ; 6787 if (__io_file_supports_nowait(file, WRITE)) 6788 file_ptr |= FFS_ASYNC_WRITE; 6789 if (S_ISREG(file_inode(file)->i_mode)) 6790 file_ptr |= FFS_ISREG; 6791 file_slot->file_ptr = file_ptr; 6792} 6793 6794static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, 6795 struct io_kiocb *req, int fd) 6796{ 6797 struct file *file; 6798 unsigned long file_ptr; 6799 6800 if (unlikely((unsigned int)fd >= ctx->nr_user_files)) 6801 return NULL; 6802 fd = array_index_nospec(fd, ctx->nr_user_files); 6803 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; 6804 file = (struct file *) (file_ptr & FFS_MASK); 6805 file_ptr &= ~FFS_MASK; 6806 /* mask in overlapping REQ_F and FFS bits */ 6807 req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT); 6808 io_req_set_rsrc_node(req); 6809 return file; 6810} 6811 6812static struct file *io_file_get_normal(struct io_ring_ctx *ctx, 6813 struct io_kiocb *req, int fd) 6814{ 6815 struct file *file = fget(fd); 6816 6817 trace_io_uring_file_get(ctx, fd); 6818 6819 /* we don't allow fixed io_uring files */ 6820 if (file && unlikely(file->f_op == &io_uring_fops)) 6821 io_req_track_inflight(req); 6822 return file; 6823} 6824 6825static inline struct file *io_file_get(struct io_ring_ctx *ctx, 6826 struct io_kiocb *req, int fd, bool fixed) 6827{ 6828 if (fixed) 6829 return io_file_get_fixed(ctx, req, fd); 6830 else 6831 return io_file_get_normal(ctx, req, fd); 6832} 6833 6834static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) 6835{ 6836 struct io_kiocb *prev = req->timeout.prev; 6837 int ret; 6838 6839 if (prev) { 6840 ret = io_try_cancel_userdata(req, prev->user_data); 6841 io_req_complete_post(req, ret ?: -ETIME, 0); 6842 io_put_req(prev); 6843 } else { 6844 io_req_complete_post(req, -ETIME, 0); 6845 } 6846} 6847 
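/*
 * hrtimer callback for a linked timeout: under ->timeout_lock, detach the
 * request this timeout was armed against from the link list, take a
 * reference to it if it hasn't already completed, and punt the actual
 * cancellation and completion to task_work via io_req_task_link_timeout().
 */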
6848static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) 6849{ 6850 struct io_timeout_data *data = container_of(timer, 6851 struct io_timeout_data, timer); 6852 struct io_kiocb *prev, *req = data->req; 6853 struct io_ring_ctx *ctx = req->ctx; 6854 unsigned long flags; 6855 6856 spin_lock_irqsave(&ctx->timeout_lock, flags); 6857 prev = req->timeout.head; 6858 req->timeout.head = NULL; 6859 6860 /* 6861 * We don't expect the list to be empty, that will only happen if we 6862 * race with the completion of the linked work. 6863 */ 6864 if (prev) { 6865 io_remove_next_linked(prev); 6866 if (!req_ref_inc_not_zero(prev)) 6867 prev = NULL; 6868 } 6869 list_del(&req->timeout.list); 6870 req->timeout.prev = prev; 6871 spin_unlock_irqrestore(&ctx->timeout_lock, flags); 6872 6873 req->io_task_work.func = io_req_task_link_timeout; 6874 io_req_task_work_add(req); 6875 return HRTIMER_NORESTART; 6876} 6877 6878static void io_queue_linked_timeout(struct io_kiocb *req) 6879{ 6880 struct io_ring_ctx *ctx = req->ctx; 6881 6882 spin_lock_irq(&ctx->timeout_lock); 6883 /* 6884 * If the back reference is NULL, then our linked request finished 6885 * before we got a chance to setup the timer 6886 */ 6887 if (req->timeout.head) { 6888 struct io_timeout_data *data = req->async_data; 6889 6890 data->timer.function = io_link_timeout_fn; 6891 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), 6892 data->mode); 6893 list_add_tail(&req->timeout.list, &ctx->ltimeout_list); 6894 } 6895 spin_unlock_irq(&ctx->timeout_lock); 6896 /* drop submission reference */ 6897 io_put_req(req); 6898} 6899 6900static void __io_queue_sqe(struct io_kiocb *req) 6901 __must_hold(&req->ctx->uring_lock) 6902{ 6903 struct io_kiocb *linked_timeout; 6904 int ret; 6905 6906issue_sqe: 6907 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); 6908 6909 /* 6910 * We async punt it if the file wasn't marked NOWAIT, or if the file 6911 * doesn't support non-blocking read/write attempts 6912 */ 6913 if (likely(!ret)) { 6914 if (req->flags & REQ_F_COMPLETE_INLINE) { 6915 struct io_ring_ctx *ctx = req->ctx; 6916 struct io_submit_state *state = &ctx->submit_state; 6917 6918 state->compl_reqs[state->compl_nr++] = req; 6919 if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) 6920 io_submit_flush_completions(ctx); 6921 return; 6922 } 6923 6924 linked_timeout = io_prep_linked_timeout(req); 6925 if (linked_timeout) 6926 io_queue_linked_timeout(linked_timeout); 6927 } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { 6928 linked_timeout = io_prep_linked_timeout(req); 6929 6930 switch (io_arm_poll_handler(req)) { 6931 case IO_APOLL_READY: 6932 if (linked_timeout) 6933 io_unprep_linked_timeout(req); 6934 goto issue_sqe; 6935 case IO_APOLL_ABORTED: 6936 /* 6937 * Queued up for async execution, worker will release 6938 * submit reference when the iocb is actually submitted. 
6939 */ 6940 io_queue_async_work(req, NULL); 6941 break; 6942 } 6943 6944 if (linked_timeout) 6945 io_queue_linked_timeout(linked_timeout); 6946 } else { 6947 io_req_complete_failed(req, ret); 6948 } 6949} 6950 6951static inline void io_queue_sqe(struct io_kiocb *req) 6952 __must_hold(&req->ctx->uring_lock) 6953{ 6954 if (unlikely(req->ctx->drain_active) && io_drain_req(req)) 6955 return; 6956 6957 if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) { 6958 __io_queue_sqe(req); 6959 } else if (req->flags & REQ_F_FAIL) { 6960 io_req_complete_fail_submit(req); 6961 } else { 6962 int ret = io_req_prep_async(req); 6963 6964 if (unlikely(ret)) 6965 io_req_complete_failed(req, ret); 6966 else 6967 io_queue_async_work(req, NULL); 6968 } 6969} 6970 6971/* 6972 * Check SQE restrictions (opcode and flags). 6973 * 6974 * Returns 'true' if SQE is allowed, 'false' otherwise. 6975 */ 6976static inline bool io_check_restriction(struct io_ring_ctx *ctx, 6977 struct io_kiocb *req, 6978 unsigned int sqe_flags) 6979{ 6980 if (likely(!ctx->restricted)) 6981 return true; 6982 6983 if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) 6984 return false; 6985 6986 if ((sqe_flags & ctx->restrictions.sqe_flags_required) != 6987 ctx->restrictions.sqe_flags_required) 6988 return false; 6989 6990 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | 6991 ctx->restrictions.sqe_flags_required)) 6992 return false; 6993 6994 return true; 6995} 6996 6997static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, 6998 const struct io_uring_sqe *sqe) 6999 __must_hold(&ctx->uring_lock) 7000{ 7001 struct io_submit_state *state; 7002 unsigned int sqe_flags; 7003 int personality, ret = 0; 7004 7005 /* req is partially pre-initialised, see io_preinit_req() */ 7006 req->opcode = READ_ONCE(sqe->opcode); 7007 /* same numerical values with corresponding REQ_F_*, safe to copy */ 7008 req->flags = sqe_flags = READ_ONCE(sqe->flags); 7009 req->user_data = READ_ONCE(sqe->user_data); 7010 req->file = NULL; 7011 req->fixed_rsrc_refs = NULL; 7012 req->task = current; 7013 7014 /* enforce forwards compatibility on users */ 7015 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) 7016 return -EINVAL; 7017 if (unlikely(req->opcode >= IORING_OP_LAST)) 7018 return -EINVAL; 7019 if (!io_check_restriction(ctx, req, sqe_flags)) 7020 return -EACCES; 7021 7022 if ((sqe_flags & IOSQE_BUFFER_SELECT) && 7023 !io_op_defs[req->opcode].buffer_select) 7024 return -EOPNOTSUPP; 7025 if (unlikely(sqe_flags & IOSQE_IO_DRAIN)) 7026 ctx->drain_active = true; 7027 7028 personality = READ_ONCE(sqe->personality); 7029 if (personality) { 7030 req->creds = xa_load(&ctx->personalities, personality); 7031 if (!req->creds) 7032 return -EINVAL; 7033 get_cred(req->creds); 7034 req->flags |= REQ_F_CREDS; 7035 } 7036 state = &ctx->submit_state; 7037 7038 /* 7039 * Plug now if we have more than 1 IO left after this, and the target 7040 * is potentially a read/write to block based storage. 
7041 */ 7042 if (!state->plug_started && state->ios_left > 1 && 7043 io_op_defs[req->opcode].plug) { 7044 blk_start_plug(&state->plug); 7045 state->plug_started = true; 7046 } 7047 7048 if (io_op_defs[req->opcode].needs_file) { 7049 req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), 7050 (sqe_flags & IOSQE_FIXED_FILE)); 7051 if (unlikely(!req->file)) 7052 ret = -EBADF; 7053 } 7054 7055 state->ios_left--; 7056 return ret; 7057} 7058 7059static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 7060 const struct io_uring_sqe *sqe) 7061 __must_hold(&ctx->uring_lock) 7062{ 7063 struct io_submit_link *link = &ctx->submit_state.link; 7064 int ret; 7065 7066 ret = io_init_req(ctx, req, sqe); 7067 if (unlikely(ret)) { 7068fail_req: 7069 /* fail even hard links since we don't submit */ 7070 if (link->head) { 7071 /* 7072 * we can judge a link req is failed or cancelled by if 7073 * REQ_F_FAIL is set, but the head is an exception since 7074 * it may be set REQ_F_FAIL because of other req's failure 7075 * so let's leverage req->result to distinguish if a head 7076 * is set REQ_F_FAIL because of its failure or other req's 7077 * failure so that we can set the correct ret code for it. 7078 * init result here to avoid affecting the normal path. 7079 */ 7080 if (!(link->head->flags & REQ_F_FAIL)) 7081 req_fail_link_node(link->head, -ECANCELED); 7082 } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { 7083 /* 7084 * the current req is a normal req, we should return 7085 * error and thus break the submittion loop. 7086 */ 7087 io_req_complete_failed(req, ret); 7088 return ret; 7089 } 7090 req_fail_link_node(req, ret); 7091 } else { 7092 ret = io_req_prep(req, sqe); 7093 if (unlikely(ret)) 7094 goto fail_req; 7095 } 7096 7097 /* don't need @sqe from now on */ 7098 trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, 7099 req->flags, true, 7100 ctx->flags & IORING_SETUP_SQPOLL); 7101 7102 /* 7103 * If we already have a head request, queue this one for async 7104 * submittal once the head completes. If we don't have a head but 7105 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be 7106 * submitted sync once the chain is complete. If none of those 7107 * conditions are true (normal request), then just queue it. 7108 */ 7109 if (link->head) { 7110 struct io_kiocb *head = link->head; 7111 7112 if (!(req->flags & REQ_F_FAIL)) { 7113 ret = io_req_prep_async(req); 7114 if (unlikely(ret)) { 7115 req_fail_link_node(req, ret); 7116 if (!(head->flags & REQ_F_FAIL)) 7117 req_fail_link_node(head, -ECANCELED); 7118 } 7119 } 7120 trace_io_uring_link(ctx, req, head); 7121 link->last->link = req; 7122 link->last = req; 7123 7124 /* last request of a link, enqueue the link */ 7125 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { 7126 link->head = NULL; 7127 io_queue_sqe(head); 7128 } 7129 } else { 7130 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 7131 link->head = req; 7132 link->last = req; 7133 } else { 7134 io_queue_sqe(req); 7135 } 7136 } 7137 7138 return 0; 7139} 7140 7141/* 7142 * Batched submission is done, ensure local IO is flushed out. 7143 */ 7144static void io_submit_state_end(struct io_submit_state *state, 7145 struct io_ring_ctx *ctx) 7146{ 7147 if (state->link.head) 7148 io_queue_sqe(state->link.head); 7149 if (state->compl_nr) 7150 io_submit_flush_completions(ctx); 7151 if (state->plug_started) 7152 blk_finish_plug(&state->plug); 7153} 7154 7155/* 7156 * Start submission side cache. 
7157 */ 7158static void io_submit_state_start(struct io_submit_state *state, 7159 unsigned int max_ios) 7160{ 7161 state->plug_started = false; 7162 state->ios_left = max_ios; 7163 /* set only head, no need to init link_last in advance */ 7164 state->link.head = NULL; 7165} 7166 7167static void io_commit_sqring(struct io_ring_ctx *ctx) 7168{ 7169 struct io_rings *rings = ctx->rings; 7170 7171 /* 7172 * Ensure any loads from the SQEs are done at this point, 7173 * since once we write the new head, the application could 7174 * write new data to them. 7175 */ 7176 smp_store_release(&rings->sq.head, ctx->cached_sq_head); 7177} 7178 7179/* 7180 * Fetch an sqe, if one is available. Note this returns a pointer to memory 7181 * that is mapped by userspace. This means that care needs to be taken to 7182 * ensure that reads are stable, as we cannot rely on userspace always 7183 * being a good citizen. If members of the sqe are validated and then later 7184 * used, it's important that those reads are done through READ_ONCE() to 7185 * prevent a re-load down the line. 7186 */ 7187static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) 7188{ 7189 unsigned head, mask = ctx->sq_entries - 1; 7190 unsigned sq_idx = ctx->cached_sq_head++ & mask; 7191 7192 /* 7193 * The cached sq head (or cq tail) serves two purposes: 7194 * 7195 * 1) allows us to batch the cost of updating the user visible 7196 * head updates. 7197 * 2) allows the kernel side to track the head on its own, even 7198 * though the application is the one updating it. 7199 */ 7200 head = READ_ONCE(ctx->sq_array[sq_idx]); 7201 if (likely(head < ctx->sq_entries)) 7202 return &ctx->sq_sqes[head]; 7203 7204 /* drop invalid entries */ 7205 ctx->cq_extra--; 7206 WRITE_ONCE(ctx->rings->sq_dropped, 7207 READ_ONCE(ctx->rings->sq_dropped) + 1); 7208 return NULL; 7209} 7210 7211static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) 7212 __must_hold(&ctx->uring_lock) 7213{ 7214 int submitted = 0; 7215 7216 /* make sure SQ entry isn't read before tail */ 7217 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); 7218 if (!percpu_ref_tryget_many(&ctx->refs, nr)) 7219 return -EAGAIN; 7220 io_get_task_refs(nr); 7221 7222 io_submit_state_start(&ctx->submit_state, nr); 7223 while (submitted < nr) { 7224 const struct io_uring_sqe *sqe; 7225 struct io_kiocb *req; 7226 7227 req = io_alloc_req(ctx); 7228 if (unlikely(!req)) { 7229 if (!submitted) 7230 submitted = -EAGAIN; 7231 break; 7232 } 7233 sqe = io_get_sqe(ctx); 7234 if (unlikely(!sqe)) { 7235 list_add(&req->inflight_entry, &ctx->submit_state.free_list); 7236 break; 7237 } 7238 /* will complete beyond this point, count as submitted */ 7239 submitted++; 7240 if (io_submit_sqe(ctx, req, sqe)) 7241 break; 7242 } 7243 7244 if (unlikely(submitted != nr)) { 7245 int ref_used = (submitted == -EAGAIN) ? 
0 : submitted; 7246 int unused = nr - ref_used; 7247 7248 current->io_uring->cached_refs += unused; 7249 percpu_ref_put_many(&ctx->refs, unused); 7250 } 7251 7252 io_submit_state_end(&ctx->submit_state, ctx); 7253 /* Commit SQ ring head once we've consumed and submitted all SQEs */ 7254 io_commit_sqring(ctx); 7255 7256 return submitted; 7257} 7258 7259static inline bool io_sqd_events_pending(struct io_sq_data *sqd) 7260{ 7261 return READ_ONCE(sqd->state); 7262} 7263 7264static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) 7265{ 7266 /* Tell userspace we may need a wakeup call */ 7267 spin_lock(&ctx->completion_lock); 7268 WRITE_ONCE(ctx->rings->sq_flags, 7269 ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); 7270 spin_unlock(&ctx->completion_lock); 7271} 7272 7273static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) 7274{ 7275 spin_lock(&ctx->completion_lock); 7276 WRITE_ONCE(ctx->rings->sq_flags, 7277 ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); 7278 spin_unlock(&ctx->completion_lock); 7279} 7280 7281static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) 7282{ 7283 unsigned int to_submit; 7284 int ret = 0; 7285 7286 to_submit = io_sqring_entries(ctx); 7287 /* if we're handling multiple rings, cap submit size for fairness */ 7288 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) 7289 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; 7290 7291 if (!list_empty(&ctx->iopoll_list) || to_submit) { 7292 unsigned nr_events = 0; 7293 const struct cred *creds = NULL; 7294 7295 if (ctx->sq_creds != current_cred()) 7296 creds = override_creds(ctx->sq_creds); 7297 7298 mutex_lock(&ctx->uring_lock); 7299 if (!list_empty(&ctx->iopoll_list)) 7300 io_do_iopoll(ctx, &nr_events, 0); 7301 7302 /* 7303 * Don't submit if refs are dying, good for io_uring_register(), 7304 * but also it is relied upon by io_ring_exit_work() 7305 */ 7306 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && 7307 !(ctx->flags & IORING_SETUP_R_DISABLED)) 7308 ret = io_submit_sqes(ctx, to_submit); 7309 mutex_unlock(&ctx->uring_lock); 7310 7311 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) 7312 wake_up(&ctx->sqo_sq_wait); 7313 if (creds) 7314 revert_creds(creds); 7315 } 7316 7317 return ret; 7318} 7319 7320static void io_sqd_update_thread_idle(struct io_sq_data *sqd) 7321{ 7322 struct io_ring_ctx *ctx; 7323 unsigned sq_thread_idle = 0; 7324 7325 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7326 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); 7327 sqd->sq_thread_idle = sq_thread_idle; 7328} 7329 7330static bool io_sqd_handle_event(struct io_sq_data *sqd) 7331{ 7332 bool did_sig = false; 7333 struct ksignal ksig; 7334 7335 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || 7336 signal_pending(current)) { 7337 mutex_unlock(&sqd->lock); 7338 if (signal_pending(current)) 7339 did_sig = get_signal(&ksig); 7340 cond_resched(); 7341 mutex_lock(&sqd->lock); 7342 } 7343 return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7344} 7345 7346static int io_sq_thread(void *data) 7347{ 7348 struct io_sq_data *sqd = data; 7349 struct io_ring_ctx *ctx; 7350 unsigned long timeout = 0; 7351 char buf[TASK_COMM_LEN]; 7352 DEFINE_WAIT(wait); 7353 7354 snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); 7355 set_task_comm(current, buf); 7356 7357 if (sqd->sq_cpu != -1) 7358 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); 7359 else 7360 set_cpus_allowed_ptr(current, cpu_online_mask); 7361 current->flags |= PF_NO_SETAFFINITY; 7362 7363 
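	/*
	 * Main SQPOLL loop: with sqd->lock held, submit from every attached
	 * ring (capping per-ring submissions for fairness when several rings
	 * share this thread), run pending task_work, and only go to sleep
	 * once the idle period has expired and IORING_SQ_NEED_WAKEUP has been
	 * set on each ring so the application knows it must enter the kernel
	 * to restart submission.
	 */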
mutex_lock(&sqd->lock); 7364 while (1) { 7365 bool cap_entries, sqt_spin = false; 7366 7367 if (io_sqd_events_pending(sqd) || signal_pending(current)) { 7368 if (io_sqd_handle_event(sqd)) 7369 break; 7370 timeout = jiffies + sqd->sq_thread_idle; 7371 } 7372 7373 cap_entries = !list_is_singular(&sqd->ctx_list); 7374 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 7375 int ret = __io_sq_thread(ctx, cap_entries); 7376 7377 if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) 7378 sqt_spin = true; 7379 } 7380 if (io_run_task_work()) 7381 sqt_spin = true; 7382 7383 if (sqt_spin || !time_after(jiffies, timeout)) { 7384 cond_resched(); 7385 if (sqt_spin) 7386 timeout = jiffies + sqd->sq_thread_idle; 7387 continue; 7388 } 7389 7390 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); 7391 if (!io_sqd_events_pending(sqd) && !current->task_works) { 7392 bool needs_sched = true; 7393 7394 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 7395 io_ring_set_wakeup_flag(ctx); 7396 7397 if ((ctx->flags & IORING_SETUP_IOPOLL) && 7398 !list_empty_careful(&ctx->iopoll_list)) { 7399 needs_sched = false; 7400 break; 7401 } 7402 if (io_sqring_entries(ctx)) { 7403 needs_sched = false; 7404 break; 7405 } 7406 } 7407 7408 if (needs_sched) { 7409 mutex_unlock(&sqd->lock); 7410 schedule(); 7411 mutex_lock(&sqd->lock); 7412 } 7413 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7414 io_ring_clear_wakeup_flag(ctx); 7415 } 7416 7417 finish_wait(&sqd->wait, &wait); 7418 timeout = jiffies + sqd->sq_thread_idle; 7419 } 7420 7421 io_uring_cancel_generic(true, sqd); 7422 sqd->thread = NULL; 7423 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7424 io_ring_set_wakeup_flag(ctx); 7425 io_run_task_work(); 7426 mutex_unlock(&sqd->lock); 7427 7428 complete(&sqd->exited); 7429 do_exit(0); 7430} 7431 7432struct io_wait_queue { 7433 struct wait_queue_entry wq; 7434 struct io_ring_ctx *ctx; 7435 unsigned cq_tail; 7436 unsigned nr_timeouts; 7437}; 7438 7439static inline bool io_should_wake(struct io_wait_queue *iowq) 7440{ 7441 struct io_ring_ctx *ctx = iowq->ctx; 7442 int dist = ctx->cached_cq_tail - (int) iowq->cq_tail; 7443 7444 /* 7445 * Wake up if we have enough events, or if a timeout occurred since we 7446 * started waiting. For timeouts, we always want to return to userspace, 7447 * regardless of event count. 7448 */ 7449 return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; 7450} 7451 7452static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, 7453 int wake_flags, void *key) 7454{ 7455 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, 7456 wq); 7457 7458 /* 7459 * Cannot safely flush overflowed CQEs from here, ensure we wake up 7460 * the task, and the next invocation will do it. 
7461 */ 7462 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) 7463 return autoremove_wake_function(curr, mode, wake_flags, key); 7464 return -1; 7465} 7466 7467static int io_run_task_work_sig(void) 7468{ 7469 if (io_run_task_work()) 7470 return 1; 7471 if (!signal_pending(current)) 7472 return 0; 7473 if (test_thread_flag(TIF_NOTIFY_SIGNAL)) 7474 return -ERESTARTSYS; 7475 return -EINTR; 7476} 7477 7478/* when returns >0, the caller should retry */ 7479static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 7480 struct io_wait_queue *iowq, 7481 signed long *timeout) 7482{ 7483 int ret; 7484 7485 /* make sure we run task_work before checking for signals */ 7486 ret = io_run_task_work_sig(); 7487 if (ret || io_should_wake(iowq)) 7488 return ret; 7489 /* let the caller flush overflows, retry */ 7490 if (test_bit(0, &ctx->check_cq_overflow)) 7491 return 1; 7492 7493 *timeout = schedule_timeout(*timeout); 7494 return !*timeout ? -ETIME : 1; 7495} 7496 7497/* 7498 * Wait until events become available, if we don't already have some. The 7499 * application must reap them itself, as they reside on the shared cq ring. 7500 */ 7501static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, 7502 const sigset_t __user *sig, size_t sigsz, 7503 struct __kernel_timespec __user *uts) 7504{ 7505 struct io_wait_queue iowq; 7506 struct io_rings *rings = ctx->rings; 7507 signed long timeout = MAX_SCHEDULE_TIMEOUT; 7508 int ret; 7509 7510 do { 7511 io_cqring_overflow_flush(ctx); 7512 if (io_cqring_events(ctx) >= min_events) 7513 return 0; 7514 if (!io_run_task_work()) 7515 break; 7516 } while (1); 7517 7518 if (sig) { 7519#ifdef CONFIG_COMPAT 7520 if (in_compat_syscall()) 7521 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, 7522 sigsz); 7523 else 7524#endif 7525 ret = set_user_sigmask(sig, sigsz); 7526 7527 if (ret) 7528 return ret; 7529 } 7530 7531 if (uts) { 7532 struct timespec64 ts; 7533 7534 if (get_timespec64(&ts, uts)) 7535 return -EFAULT; 7536 timeout = timespec64_to_jiffies(&ts); 7537 } 7538 7539 init_waitqueue_func_entry(&iowq.wq, io_wake_function); 7540 iowq.wq.private = current; 7541 INIT_LIST_HEAD(&iowq.wq.entry); 7542 iowq.ctx = ctx; 7543 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 7544 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; 7545 7546 trace_io_uring_cqring_wait(ctx, min_events); 7547 do { 7548 /* if we can't even flush overflow, don't wait for more */ 7549 if (!io_cqring_overflow_flush(ctx)) { 7550 ret = -EBUSY; 7551 break; 7552 } 7553 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, 7554 TASK_INTERRUPTIBLE); 7555 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); 7556 finish_wait(&ctx->cq_wait, &iowq.wq); 7557 cond_resched(); 7558 } while (ret > 0); 7559 7560 restore_saved_sigmask_unless(ret == -EINTR); 7561 7562 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; 7563} 7564 7565static void io_free_page_table(void **table, size_t size) 7566{ 7567 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); 7568 7569 for (i = 0; i < nr_tables; i++) 7570 kfree(table[i]); 7571 kfree(table); 7572} 7573 7574static void **io_alloc_page_table(size_t size) 7575{ 7576 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); 7577 size_t init_size = size; 7578 void **table; 7579 7580 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); 7581 if (!table) 7582 return NULL; 7583 7584 for (i = 0; i < nr_tables; i++) { 7585 unsigned int this_size = min_t(size_t, size, PAGE_SIZE); 7586 7587 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); 7588 if (!table[i]) { 7589 io_free_page_table(table, init_size); 7590 return NULL; 7591 } 7592 size -= this_size; 7593 } 7594 return table; 7595} 7596 7597static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) 7598{ 7599 percpu_ref_exit(&ref_node->refs); 7600 kfree(ref_node); 7601} 7602 7603static void io_rsrc_node_ref_zero(struct percpu_ref *ref) 7604{ 7605 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); 7606 struct io_ring_ctx *ctx = node->rsrc_data->ctx; 7607 unsigned long flags; 7608 bool first_add = false; 7609 7610 spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); 7611 node->done = true; 7612 7613 while (!list_empty(&ctx->rsrc_ref_list)) { 7614 node = list_first_entry(&ctx->rsrc_ref_list, 7615 struct io_rsrc_node, node); 7616 /* recycle ref nodes in order */ 7617 if (!node->done) 7618 break; 7619 list_del(&node->node); 7620 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); 7621 } 7622 spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); 7623 7624 if (first_add) 7625 mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); 7626} 7627 7628static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) 7629{ 7630 struct io_rsrc_node *ref_node; 7631 7632 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); 7633 if (!ref_node) 7634 return NULL; 7635 7636 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, 7637 0, GFP_KERNEL)) { 7638 kfree(ref_node); 7639 return NULL; 7640 } 7641 INIT_LIST_HEAD(&ref_node->node); 7642 INIT_LIST_HEAD(&ref_node->rsrc_list); 7643 ref_node->done = false; 7644 return ref_node; 7645} 7646 7647static void io_rsrc_node_switch(struct io_ring_ctx *ctx, 7648 struct io_rsrc_data *data_to_kill) 7649{ 7650 WARN_ON_ONCE(!ctx->rsrc_backup_node); 7651 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); 7652 7653 if (data_to_kill) { 7654 struct io_rsrc_node *rsrc_node = ctx->rsrc_node; 7655 7656 rsrc_node->rsrc_data = data_to_kill; 7657 spin_lock_irq(&ctx->rsrc_ref_lock); 7658 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); 7659 spin_unlock_irq(&ctx->rsrc_ref_lock); 7660 7661 atomic_inc(&data_to_kill->refs); 7662 percpu_ref_kill(&rsrc_node->refs); 7663 ctx->rsrc_node = NULL; 7664 } 7665 7666 if (!ctx->rsrc_node) { 7667 ctx->rsrc_node = ctx->rsrc_backup_node; 7668 ctx->rsrc_backup_node = NULL; 7669 } 7670} 7671 7672static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) 7673{ 7674 if (ctx->rsrc_backup_node) 7675 return 0; 7676 ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); 7677 return ctx->rsrc_backup_node ? 
0 : -ENOMEM; 7678} 7679 7680static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) 7681{ 7682 int ret; 7683 7684 /* As we may drop ->uring_lock, other task may have started quiesce */ 7685 if (data->quiesce) 7686 return -ENXIO; 7687 7688 data->quiesce = true; 7689 do { 7690 ret = io_rsrc_node_switch_start(ctx); 7691 if (ret) 7692 break; 7693 io_rsrc_node_switch(ctx, data); 7694 7695 /* kill initial ref, already quiesced if zero */ 7696 if (atomic_dec_and_test(&data->refs)) 7697 break; 7698 mutex_unlock(&ctx->uring_lock); 7699 flush_delayed_work(&ctx->rsrc_put_work); 7700 ret = wait_for_completion_interruptible(&data->done); 7701 if (!ret) { 7702 mutex_lock(&ctx->uring_lock); 7703 break; 7704 } 7705 7706 atomic_inc(&data->refs); 7707 /* wait for all works potentially completing data->done */ 7708 flush_delayed_work(&ctx->rsrc_put_work); 7709 reinit_completion(&data->done); 7710 7711 ret = io_run_task_work_sig(); 7712 mutex_lock(&ctx->uring_lock); 7713 } while (ret >= 0); 7714 data->quiesce = false; 7715 7716 return ret; 7717} 7718 7719static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) 7720{ 7721 unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; 7722 unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; 7723 7724 return &data->tags[table_idx][off]; 7725} 7726 7727static void io_rsrc_data_free(struct io_rsrc_data *data) 7728{ 7729 size_t size = data->nr * sizeof(data->tags[0][0]); 7730 7731 if (data->tags) 7732 io_free_page_table((void **)data->tags, size); 7733 kfree(data); 7734} 7735 7736static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, 7737 u64 __user *utags, unsigned nr, 7738 struct io_rsrc_data **pdata) 7739{ 7740 struct io_rsrc_data *data; 7741 int ret = -ENOMEM; 7742 unsigned i; 7743 7744 data = kzalloc(sizeof(*data), GFP_KERNEL); 7745 if (!data) 7746 return -ENOMEM; 7747 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); 7748 if (!data->tags) { 7749 kfree(data); 7750 return -ENOMEM; 7751 } 7752 7753 data->nr = nr; 7754 data->ctx = ctx; 7755 data->do_put = do_put; 7756 if (utags) { 7757 ret = -EFAULT; 7758 for (i = 0; i < nr; i++) { 7759 u64 *tag_slot = io_get_tag_slot(data, i); 7760 7761 if (copy_from_user(tag_slot, &utags[i], 7762 sizeof(*tag_slot))) 7763 goto fail; 7764 } 7765 } 7766 7767 atomic_set(&data->refs, 1); 7768 init_completion(&data->done); 7769 *pdata = data; 7770 return 0; 7771fail: 7772 io_rsrc_data_free(data); 7773 return ret; 7774} 7775 7776static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) 7777{ 7778 table->files = kvcalloc(nr_files, sizeof(table->files[0]), 7779 GFP_KERNEL_ACCOUNT); 7780 return !!table->files; 7781} 7782 7783static void io_free_file_tables(struct io_file_table *table) 7784{ 7785 kvfree(table->files); 7786 table->files = NULL; 7787} 7788 7789static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) 7790{ 7791#if defined(CONFIG_UNIX) 7792 if (ctx->ring_sock) { 7793 struct sock *sock = ctx->ring_sock->sk; 7794 struct sk_buff *skb; 7795 7796 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) 7797 kfree_skb(skb); 7798 } 7799#else 7800 int i; 7801 7802 for (i = 0; i < ctx->nr_user_files; i++) { 7803 struct file *file; 7804 7805 file = io_file_from_index(ctx, i); 7806 if (file) 7807 fput(file); 7808 } 7809#endif 7810 io_free_file_tables(&ctx->file_table); 7811 io_rsrc_data_free(ctx->file_data); 7812 ctx->file_data = NULL; 7813 ctx->nr_user_files = 0; 7814} 7815 7816static int io_sqe_files_unregister(struct 
io_ring_ctx *ctx) 7817{ 7818 int ret; 7819 7820 if (!ctx->file_data) 7821 return -ENXIO; 7822 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); 7823 if (!ret) 7824 __io_sqe_files_unregister(ctx); 7825 return ret; 7826} 7827 7828static void io_sq_thread_unpark(struct io_sq_data *sqd) 7829 __releases(&sqd->lock) 7830{ 7831 WARN_ON_ONCE(sqd->thread == current); 7832 7833 /* 7834 * Do the dance but not conditional clear_bit() because it'd race with 7835 * other threads incrementing park_pending and setting the bit. 7836 */ 7837 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7838 if (atomic_dec_return(&sqd->park_pending)) 7839 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7840 mutex_unlock(&sqd->lock); 7841} 7842 7843static void io_sq_thread_park(struct io_sq_data *sqd) 7844 __acquires(&sqd->lock) 7845{ 7846 WARN_ON_ONCE(sqd->thread == current); 7847 7848 atomic_inc(&sqd->park_pending); 7849 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7850 mutex_lock(&sqd->lock); 7851 if (sqd->thread) 7852 wake_up_process(sqd->thread); 7853} 7854 7855static void io_sq_thread_stop(struct io_sq_data *sqd) 7856{ 7857 WARN_ON_ONCE(sqd->thread == current); 7858 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); 7859 7860 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7861 mutex_lock(&sqd->lock); 7862 if (sqd->thread) 7863 wake_up_process(sqd->thread); 7864 mutex_unlock(&sqd->lock); 7865 wait_for_completion(&sqd->exited); 7866} 7867 7868static void io_put_sq_data(struct io_sq_data *sqd) 7869{ 7870 if (refcount_dec_and_test(&sqd->refs)) { 7871 WARN_ON_ONCE(atomic_read(&sqd->park_pending)); 7872 7873 io_sq_thread_stop(sqd); 7874 kfree(sqd); 7875 } 7876} 7877 7878static void io_sq_thread_finish(struct io_ring_ctx *ctx) 7879{ 7880 struct io_sq_data *sqd = ctx->sq_data; 7881 7882 if (sqd) { 7883 io_sq_thread_park(sqd); 7884 list_del_init(&ctx->sqd_list); 7885 io_sqd_update_thread_idle(sqd); 7886 io_sq_thread_unpark(sqd); 7887 7888 io_put_sq_data(sqd); 7889 ctx->sq_data = NULL; 7890 } 7891} 7892 7893static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) 7894{ 7895 struct io_ring_ctx *ctx_attach; 7896 struct io_sq_data *sqd; 7897 struct fd f; 7898 7899 f = fdget(p->wq_fd); 7900 if (!f.file) 7901 return ERR_PTR(-ENXIO); 7902 if (f.file->f_op != &io_uring_fops) { 7903 fdput(f); 7904 return ERR_PTR(-EINVAL); 7905 } 7906 7907 ctx_attach = f.file->private_data; 7908 sqd = ctx_attach->sq_data; 7909 if (!sqd) { 7910 fdput(f); 7911 return ERR_PTR(-EINVAL); 7912 } 7913 if (sqd->task_tgid != current->tgid) { 7914 fdput(f); 7915 return ERR_PTR(-EPERM); 7916 } 7917 7918 refcount_inc(&sqd->refs); 7919 fdput(f); 7920 return sqd; 7921} 7922 7923static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, 7924 bool *attached) 7925{ 7926 struct io_sq_data *sqd; 7927 7928 *attached = false; 7929 if (p->flags & IORING_SETUP_ATTACH_WQ) { 7930 sqd = io_attach_sq_data(p); 7931 if (!IS_ERR(sqd)) { 7932 *attached = true; 7933 return sqd; 7934 } 7935 /* fall through for EPERM case, setup new sqd/task */ 7936 if (PTR_ERR(sqd) != -EPERM) 7937 return sqd; 7938 } 7939 7940 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); 7941 if (!sqd) 7942 return ERR_PTR(-ENOMEM); 7943 7944 atomic_set(&sqd->park_pending, 0); 7945 refcount_set(&sqd->refs, 1); 7946 INIT_LIST_HEAD(&sqd->ctx_list); 7947 mutex_init(&sqd->lock); 7948 init_waitqueue_head(&sqd->wait); 7949 init_completion(&sqd->exited); 7950 return sqd; 7951} 7952 7953#if defined(CONFIG_UNIX) 7954/* 7955 * Ensure the UNIX gc is aware of our file set, so we are certain that 
7956 * the io_uring can be safely unregistered on process exit, even if we have 7957 * loops in the file referencing. 7958 */ 7959static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) 7960{ 7961 struct sock *sk = ctx->ring_sock->sk; 7962 struct scm_fp_list *fpl; 7963 struct sk_buff *skb; 7964 int i, nr_files; 7965 7966 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); 7967 if (!fpl) 7968 return -ENOMEM; 7969 7970 skb = alloc_skb(0, GFP_KERNEL); 7971 if (!skb) { 7972 kfree(fpl); 7973 return -ENOMEM; 7974 } 7975 7976 skb->sk = sk; 7977 7978 nr_files = 0; 7979 fpl->user = get_uid(current_user()); 7980 for (i = 0; i < nr; i++) { 7981 struct file *file = io_file_from_index(ctx, i + offset); 7982 7983 if (!file) 7984 continue; 7985 fpl->fp[nr_files] = get_file(file); 7986 unix_inflight(fpl->user, fpl->fp[nr_files]); 7987 nr_files++; 7988 } 7989 7990 if (nr_files) { 7991 fpl->max = SCM_MAX_FD; 7992 fpl->count = nr_files; 7993 UNIXCB(skb).fp = fpl; 7994 skb->destructor = unix_destruct_scm; 7995 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 7996 skb_queue_head(&sk->sk_receive_queue, skb); 7997 7998 for (i = 0; i < nr_files; i++) 7999 fput(fpl->fp[i]); 8000 } else { 8001 kfree_skb(skb); 8002 kfree(fpl); 8003 } 8004 8005 return 0; 8006} 8007 8008/* 8009 * If UNIX sockets are enabled, fd passing can cause a reference cycle which 8010 * causes regular reference counting to break down. We rely on the UNIX 8011 * garbage collection to take care of this problem for us. 8012 */ 8013static int io_sqe_files_scm(struct io_ring_ctx *ctx) 8014{ 8015 unsigned left, total; 8016 int ret = 0; 8017 8018 total = 0; 8019 left = ctx->nr_user_files; 8020 while (left) { 8021 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); 8022 8023 ret = __io_sqe_files_scm(ctx, this_files, total); 8024 if (ret) 8025 break; 8026 left -= this_files; 8027 total += this_files; 8028 } 8029 8030 if (!ret) 8031 return 0; 8032 8033 while (total < ctx->nr_user_files) { 8034 struct file *file = io_file_from_index(ctx, total); 8035 8036 if (file) 8037 fput(file); 8038 total++; 8039 } 8040 8041 return ret; 8042} 8043#else 8044static int io_sqe_files_scm(struct io_ring_ctx *ctx) 8045{ 8046 return 0; 8047} 8048#endif 8049 8050static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 8051{ 8052 struct file *file = prsrc->file; 8053#if defined(CONFIG_UNIX) 8054 struct sock *sock = ctx->ring_sock->sk; 8055 struct sk_buff_head list, *head = &sock->sk_receive_queue; 8056 struct sk_buff *skb; 8057 int i; 8058 8059 __skb_queue_head_init(&list); 8060 8061 /* 8062 * Find the skb that holds this file in its SCM_RIGHTS. When found, 8063 * remove this entry and rearrange the file array. 
8064 */ 8065 skb = skb_dequeue(head); 8066 while (skb) { 8067 struct scm_fp_list *fp; 8068 8069 fp = UNIXCB(skb).fp; 8070 for (i = 0; i < fp->count; i++) { 8071 int left; 8072 8073 if (fp->fp[i] != file) 8074 continue; 8075 8076 unix_notinflight(fp->user, fp->fp[i]); 8077 left = fp->count - 1 - i; 8078 if (left) { 8079 memmove(&fp->fp[i], &fp->fp[i + 1], 8080 left * sizeof(struct file *)); 8081 } 8082 fp->count--; 8083 if (!fp->count) { 8084 kfree_skb(skb); 8085 skb = NULL; 8086 } else { 8087 __skb_queue_tail(&list, skb); 8088 } 8089 fput(file); 8090 file = NULL; 8091 break; 8092 } 8093 8094 if (!file) 8095 break; 8096 8097 __skb_queue_tail(&list, skb); 8098 8099 skb = skb_dequeue(head); 8100 } 8101 8102 if (skb_peek(&list)) { 8103 spin_lock_irq(&head->lock); 8104 while ((skb = __skb_dequeue(&list)) != NULL) 8105 __skb_queue_tail(head, skb); 8106 spin_unlock_irq(&head->lock); 8107 } 8108#else 8109 fput(file); 8110#endif 8111} 8112 8113static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) 8114{ 8115 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; 8116 struct io_ring_ctx *ctx = rsrc_data->ctx; 8117 struct io_rsrc_put *prsrc, *tmp; 8118 8119 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { 8120 list_del(&prsrc->list); 8121 8122 if (prsrc->tag) { 8123 bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; 8124 8125 io_ring_submit_lock(ctx, lock_ring); 8126 spin_lock(&ctx->completion_lock); 8127 io_cqring_fill_event(ctx, prsrc->tag, 0, 0); 8128 ctx->cq_extra++; 8129 io_commit_cqring(ctx); 8130 spin_unlock(&ctx->completion_lock); 8131 io_cqring_ev_posted(ctx); 8132 io_ring_submit_unlock(ctx, lock_ring); 8133 } 8134 8135 rsrc_data->do_put(ctx, prsrc); 8136 kfree(prsrc); 8137 } 8138 8139 io_rsrc_node_destroy(ref_node); 8140 if (atomic_dec_and_test(&rsrc_data->refs)) 8141 complete(&rsrc_data->done); 8142} 8143 8144static void io_rsrc_put_work(struct work_struct *work) 8145{ 8146 struct io_ring_ctx *ctx; 8147 struct llist_node *node; 8148 8149 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); 8150 node = llist_del_all(&ctx->rsrc_put_llist); 8151 8152 while (node) { 8153 struct io_rsrc_node *ref_node; 8154 struct llist_node *next = node->next; 8155 8156 ref_node = llist_entry(node, struct io_rsrc_node, llist); 8157 __io_rsrc_put_work(ref_node); 8158 node = next; 8159 } 8160} 8161 8162static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, 8163 unsigned nr_args, u64 __user *tags) 8164{ 8165 __s32 __user *fds = (__s32 __user *) arg; 8166 struct file *file; 8167 int fd, ret; 8168 unsigned i; 8169 8170 if (ctx->file_data) 8171 return -EBUSY; 8172 if (!nr_args) 8173 return -EINVAL; 8174 if (nr_args > IORING_MAX_FIXED_FILES) 8175 return -EMFILE; 8176 if (nr_args > rlimit(RLIMIT_NOFILE)) 8177 return -EMFILE; 8178 ret = io_rsrc_node_switch_start(ctx); 8179 if (ret) 8180 return ret; 8181 ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, 8182 &ctx->file_data); 8183 if (ret) 8184 return ret; 8185 8186 ret = -ENOMEM; 8187 if (!io_alloc_file_tables(&ctx->file_table, nr_args)) 8188 goto out_free; 8189 8190 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { 8191 if (copy_from_user(&fd, &fds[i], sizeof(fd))) { 8192 ret = -EFAULT; 8193 goto out_fput; 8194 } 8195 /* allow sparse sets */ 8196 if (fd == -1) { 8197 ret = -EINVAL; 8198 if (unlikely(*io_get_tag_slot(ctx->file_data, i))) 8199 goto out_fput; 8200 continue; 8201 } 8202 8203 file = fget(fd); 8204 ret = -EBADF; 8205 if (unlikely(!file)) 8206 goto out_fput; 8207 8208 /* 8209 * Don't 
allow io_uring instances to be registered. If UNIX 8210 * isn't enabled, then this causes a reference cycle and this 8211 * instance can never get freed. If UNIX is enabled we'll 8212 * handle it just fine, but there's still no point in allowing 8213 * a ring fd as it doesn't support regular read/write anyway. 8214 */ 8215 if (file->f_op == &io_uring_fops) { 8216 fput(file); 8217 goto out_fput; 8218 } 8219 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); 8220 } 8221 8222 ret = io_sqe_files_scm(ctx); 8223 if (ret) { 8224 __io_sqe_files_unregister(ctx); 8225 return ret; 8226 } 8227 8228 io_rsrc_node_switch(ctx, NULL); 8229 return ret; 8230out_fput: 8231 for (i = 0; i < ctx->nr_user_files; i++) { 8232 file = io_file_from_index(ctx, i); 8233 if (file) 8234 fput(file); 8235 } 8236 io_free_file_tables(&ctx->file_table); 8237 ctx->nr_user_files = 0; 8238out_free: 8239 io_rsrc_data_free(ctx->file_data); 8240 ctx->file_data = NULL; 8241 return ret; 8242} 8243 8244static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, 8245 int index) 8246{ 8247#if defined(CONFIG_UNIX) 8248 struct sock *sock = ctx->ring_sock->sk; 8249 struct sk_buff_head *head = &sock->sk_receive_queue; 8250 struct sk_buff *skb; 8251 8252 /* 8253 * See if we can merge this file into an existing skb SCM_RIGHTS 8254 * file set. If there's no room, fall back to allocating a new skb 8255 * and filling it in. 8256 */ 8257 spin_lock_irq(&head->lock); 8258 skb = skb_peek(head); 8259 if (skb) { 8260 struct scm_fp_list *fpl = UNIXCB(skb).fp; 8261 8262 if (fpl->count < SCM_MAX_FD) { 8263 __skb_unlink(skb, head); 8264 spin_unlock_irq(&head->lock); 8265 fpl->fp[fpl->count] = get_file(file); 8266 unix_inflight(fpl->user, fpl->fp[fpl->count]); 8267 fpl->count++; 8268 spin_lock_irq(&head->lock); 8269 __skb_queue_head(head, skb); 8270 } else { 8271 skb = NULL; 8272 } 8273 } 8274 spin_unlock_irq(&head->lock); 8275 8276 if (skb) { 8277 fput(file); 8278 return 0; 8279 } 8280 8281 return __io_sqe_files_scm(ctx, 1, index); 8282#else 8283 return 0; 8284#endif 8285} 8286 8287static int io_install_fixed_file(struct io_kiocb *req, struct file *file, 8288 unsigned int issue_flags, u32 slot_index) 8289{ 8290 struct io_ring_ctx *ctx = req->ctx; 8291 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 8292 struct io_fixed_file *file_slot; 8293 int ret = -EBADF; 8294 8295 io_ring_submit_lock(ctx, !force_nonblock); 8296 if (file->f_op == &io_uring_fops) 8297 goto err; 8298 ret = -ENXIO; 8299 if (!ctx->file_data) 8300 goto err; 8301 ret = -EINVAL; 8302 if (slot_index >= ctx->nr_user_files) 8303 goto err; 8304 8305 slot_index = array_index_nospec(slot_index, ctx->nr_user_files); 8306 file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); 8307 ret = -EBADF; 8308 if (file_slot->file_ptr) 8309 goto err; 8310 8311 *io_get_tag_slot(ctx->file_data, slot_index) = 0; 8312 io_fixed_file_set(file_slot, file); 8313 ret = io_sqe_file_register(ctx, file, slot_index); 8314 if (ret) { 8315 file_slot->file_ptr = 0; 8316 goto err; 8317 } 8318 8319 ret = 0; 8320err: 8321 io_ring_submit_unlock(ctx, !force_nonblock); 8322 if (ret) 8323 fput(file); 8324 return ret; 8325} 8326 8327static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, 8328 struct io_rsrc_node *node, void *rsrc) 8329{ 8330 struct io_rsrc_put *prsrc; 8331 8332 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); 8333 if (!prsrc) 8334 return -ENOMEM; 8335 8336 prsrc->tag = *io_get_tag_slot(data, idx); 8337 prsrc->rsrc = rsrc; 8338 list_add(&prsrc->list, 
&node->rsrc_list); 8339 return 0; 8340} 8341 8342static int __io_sqe_files_update(struct io_ring_ctx *ctx, 8343 struct io_uring_rsrc_update2 *up, 8344 unsigned nr_args) 8345{ 8346 u64 __user *tags = u64_to_user_ptr(up->tags); 8347 __s32 __user *fds = u64_to_user_ptr(up->data); 8348 struct io_rsrc_data *data = ctx->file_data; 8349 struct io_fixed_file *file_slot; 8350 struct file *file; 8351 int fd, i, err = 0; 8352 unsigned int done; 8353 bool needs_switch = false; 8354 8355 if (!ctx->file_data) 8356 return -ENXIO; 8357 if (up->offset + nr_args > ctx->nr_user_files) 8358 return -EINVAL; 8359 8360 for (done = 0; done < nr_args; done++) { 8361 u64 tag = 0; 8362 8363 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || 8364 copy_from_user(&fd, &fds[done], sizeof(fd))) { 8365 err = -EFAULT; 8366 break; 8367 } 8368 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { 8369 err = -EINVAL; 8370 break; 8371 } 8372 if (fd == IORING_REGISTER_FILES_SKIP) 8373 continue; 8374 8375 i = array_index_nospec(up->offset + done, ctx->nr_user_files); 8376 file_slot = io_fixed_file_slot(&ctx->file_table, i); 8377 8378 if (file_slot->file_ptr) { 8379 file = (struct file *)(file_slot->file_ptr & FFS_MASK); 8380 err = io_queue_rsrc_removal(data, up->offset + done, 8381 ctx->rsrc_node, file); 8382 if (err) 8383 break; 8384 file_slot->file_ptr = 0; 8385 needs_switch = true; 8386 } 8387 if (fd != -1) { 8388 file = fget(fd); 8389 if (!file) { 8390 err = -EBADF; 8391 break; 8392 } 8393 /* 8394 * Don't allow io_uring instances to be registered. If 8395 * UNIX isn't enabled, then this causes a reference 8396 * cycle and this instance can never get freed. If UNIX 8397 * is enabled we'll handle it just fine, but there's 8398 * still no point in allowing a ring fd as it doesn't 8399 * support regular read/write anyway. 8400 */ 8401 if (file->f_op == &io_uring_fops) { 8402 fput(file); 8403 err = -EBADF; 8404 break; 8405 } 8406 *io_get_tag_slot(data, up->offset + done) = tag; 8407 io_fixed_file_set(file_slot, file); 8408 err = io_sqe_file_register(ctx, file, i); 8409 if (err) { 8410 file_slot->file_ptr = 0; 8411 fput(file); 8412 break; 8413 } 8414 } 8415 } 8416 8417 if (needs_switch) 8418 io_rsrc_node_switch(ctx, data); 8419 return done ? 
done : err; 8420} 8421 8422static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, 8423 struct task_struct *task) 8424{ 8425 struct io_wq_hash *hash; 8426 struct io_wq_data data; 8427 unsigned int concurrency; 8428 8429 mutex_lock(&ctx->uring_lock); 8430 hash = ctx->hash_map; 8431 if (!hash) { 8432 hash = kzalloc(sizeof(*hash), GFP_KERNEL); 8433 if (!hash) { 8434 mutex_unlock(&ctx->uring_lock); 8435 return ERR_PTR(-ENOMEM); 8436 } 8437 refcount_set(&hash->refs, 1); 8438 init_waitqueue_head(&hash->wait); 8439 ctx->hash_map = hash; 8440 } 8441 mutex_unlock(&ctx->uring_lock); 8442 8443 data.hash = hash; 8444 data.task = task; 8445 data.free_work = io_wq_free_work; 8446 data.do_work = io_wq_submit_work; 8447 8448 /* Do QD, or 4 * CPUS, whatever is smallest */ 8449 concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); 8450 8451 return io_wq_create(concurrency, &data); 8452} 8453 8454static int io_uring_alloc_task_context(struct task_struct *task, 8455 struct io_ring_ctx *ctx) 8456{ 8457 struct io_uring_task *tctx; 8458 int ret; 8459 8460 tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); 8461 if (unlikely(!tctx)) 8462 return -ENOMEM; 8463 8464 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); 8465 if (unlikely(ret)) { 8466 kfree(tctx); 8467 return ret; 8468 } 8469 8470 tctx->io_wq = io_init_wq_offload(ctx, task); 8471 if (IS_ERR(tctx->io_wq)) { 8472 ret = PTR_ERR(tctx->io_wq); 8473 percpu_counter_destroy(&tctx->inflight); 8474 kfree(tctx); 8475 return ret; 8476 } 8477 8478 xa_init(&tctx->xa); 8479 init_waitqueue_head(&tctx->wait); 8480 atomic_set(&tctx->in_idle, 0); 8481 atomic_set(&tctx->inflight_tracked, 0); 8482 task->io_uring = tctx; 8483 spin_lock_init(&tctx->task_lock); 8484 INIT_WQ_LIST(&tctx->task_list); 8485 init_task_work(&tctx->task_work, tctx_task_work); 8486 return 0; 8487} 8488 8489void __io_uring_free(struct task_struct *tsk) 8490{ 8491 struct io_uring_task *tctx = tsk->io_uring; 8492 8493 WARN_ON_ONCE(!xa_empty(&tctx->xa)); 8494 WARN_ON_ONCE(tctx->io_wq); 8495 WARN_ON_ONCE(tctx->cached_refs); 8496 8497 percpu_counter_destroy(&tctx->inflight); 8498 kfree(tctx); 8499 tsk->io_uring = NULL; 8500} 8501 8502static int io_sq_offload_create(struct io_ring_ctx *ctx, 8503 struct io_uring_params *p) 8504{ 8505 int ret; 8506 8507 /* Retain compatibility with failing for an invalid attach attempt */ 8508 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == 8509 IORING_SETUP_ATTACH_WQ) { 8510 struct fd f; 8511 8512 f = fdget(p->wq_fd); 8513 if (!f.file) 8514 return -ENXIO; 8515 if (f.file->f_op != &io_uring_fops) { 8516 fdput(f); 8517 return -EINVAL; 8518 } 8519 fdput(f); 8520 } 8521 if (ctx->flags & IORING_SETUP_SQPOLL) { 8522 struct task_struct *tsk; 8523 struct io_sq_data *sqd; 8524 bool attached; 8525 8526 sqd = io_get_sq_data(p, &attached); 8527 if (IS_ERR(sqd)) { 8528 ret = PTR_ERR(sqd); 8529 goto err; 8530 } 8531 8532 ctx->sq_creds = get_current_cred(); 8533 ctx->sq_data = sqd; 8534 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); 8535 if (!ctx->sq_thread_idle) 8536 ctx->sq_thread_idle = HZ; 8537 8538 io_sq_thread_park(sqd); 8539 list_add(&ctx->sqd_list, &sqd->ctx_list); 8540 io_sqd_update_thread_idle(sqd); 8541 /* don't attach to a dying SQPOLL thread, would be racy */ 8542 ret = (attached && !sqd->thread) ? 
-ENXIO : 0; 8543 io_sq_thread_unpark(sqd); 8544 8545 if (ret < 0) 8546 goto err; 8547 if (attached) 8548 return 0; 8549 8550 if (p->flags & IORING_SETUP_SQ_AFF) { 8551 int cpu = p->sq_thread_cpu; 8552 8553 ret = -EINVAL; 8554 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) 8555 goto err_sqpoll; 8556 sqd->sq_cpu = cpu; 8557 } else { 8558 sqd->sq_cpu = -1; 8559 } 8560 8561 sqd->task_pid = current->pid; 8562 sqd->task_tgid = current->tgid; 8563 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); 8564 if (IS_ERR(tsk)) { 8565 ret = PTR_ERR(tsk); 8566 goto err_sqpoll; 8567 } 8568 8569 sqd->thread = tsk; 8570 ret = io_uring_alloc_task_context(tsk, ctx); 8571 wake_up_new_task(tsk); 8572 if (ret) 8573 goto err; 8574 } else if (p->flags & IORING_SETUP_SQ_AFF) { 8575 /* Can't have SQ_AFF without SQPOLL */ 8576 ret = -EINVAL; 8577 goto err; 8578 } 8579 8580 return 0; 8581err_sqpoll: 8582 complete(&ctx->sq_data->exited); 8583err: 8584 io_sq_thread_finish(ctx); 8585 return ret; 8586} 8587 8588static inline void __io_unaccount_mem(struct user_struct *user, 8589 unsigned long nr_pages) 8590{ 8591 atomic_long_sub(nr_pages, &user->locked_vm); 8592} 8593 8594static inline int __io_account_mem(struct user_struct *user, 8595 unsigned long nr_pages) 8596{ 8597 unsigned long page_limit, cur_pages, new_pages; 8598 8599 /* Don't allow more pages than we can safely lock */ 8600 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 8601 8602 do { 8603 cur_pages = atomic_long_read(&user->locked_vm); 8604 new_pages = cur_pages + nr_pages; 8605 if (new_pages > page_limit) 8606 return -ENOMEM; 8607 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, 8608 new_pages) != cur_pages); 8609 8610 return 0; 8611} 8612 8613static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 8614{ 8615 if (ctx->user) 8616 __io_unaccount_mem(ctx->user, nr_pages); 8617 8618 if (ctx->mm_account) 8619 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); 8620} 8621 8622static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 8623{ 8624 int ret; 8625 8626 if (ctx->user) { 8627 ret = __io_account_mem(ctx->user, nr_pages); 8628 if (ret) 8629 return ret; 8630 } 8631 8632 if (ctx->mm_account) 8633 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); 8634 8635 return 0; 8636} 8637 8638static void io_mem_free(void *ptr) 8639{ 8640 struct page *page; 8641 8642 if (!ptr) 8643 return; 8644 8645 page = virt_to_head_page(ptr); 8646 if (put_page_testzero(page)) 8647 free_compound_page(page); 8648} 8649 8650static void *io_mem_alloc(size_t size) 8651{ 8652 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | 8653 __GFP_NORETRY | __GFP_ACCOUNT; 8654 8655 return (void *) __get_free_pages(gfp_flags, get_order(size)); 8656} 8657 8658static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, 8659 size_t *sq_offset) 8660{ 8661 struct io_rings *rings; 8662 size_t off, sq_array_size; 8663 8664 off = struct_size(rings, cqes, cq_entries); 8665 if (off == SIZE_MAX) 8666 return SIZE_MAX; 8667 8668#ifdef CONFIG_SMP 8669 off = ALIGN(off, SMP_CACHE_BYTES); 8670 if (off == 0) 8671 return SIZE_MAX; 8672#endif 8673 8674 if (sq_offset) 8675 *sq_offset = off; 8676 8677 sq_array_size = array_size(sizeof(u32), sq_entries); 8678 if (sq_array_size == SIZE_MAX) 8679 return SIZE_MAX; 8680 8681 if (check_add_overflow(off, sq_array_size, &off)) 8682 return SIZE_MAX; 8683 8684 return off; 8685} 8686 8687static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) 8688{ 8689 struct 
io_mapped_ubuf *imu = *slot; 8690 unsigned int i; 8691 8692 if (imu != ctx->dummy_ubuf) { 8693 for (i = 0; i < imu->nr_bvecs; i++) 8694 unpin_user_page(imu->bvec[i].bv_page); 8695 if (imu->acct_pages) 8696 io_unaccount_mem(ctx, imu->acct_pages); 8697 kvfree(imu); 8698 } 8699 *slot = NULL; 8700} 8701 8702static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 8703{ 8704 io_buffer_unmap(ctx, &prsrc->buf); 8705 prsrc->buf = NULL; 8706} 8707 8708static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 8709{ 8710 unsigned int i; 8711 8712 for (i = 0; i < ctx->nr_user_bufs; i++) 8713 io_buffer_unmap(ctx, &ctx->user_bufs[i]); 8714 kfree(ctx->user_bufs); 8715 io_rsrc_data_free(ctx->buf_data); 8716 ctx->user_bufs = NULL; 8717 ctx->buf_data = NULL; 8718 ctx->nr_user_bufs = 0; 8719} 8720 8721static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 8722{ 8723 int ret; 8724 8725 if (!ctx->buf_data) 8726 return -ENXIO; 8727 8728 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); 8729 if (!ret) 8730 __io_sqe_buffers_unregister(ctx); 8731 return ret; 8732} 8733 8734static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, 8735 void __user *arg, unsigned index) 8736{ 8737 struct iovec __user *src; 8738 8739#ifdef CONFIG_COMPAT 8740 if (ctx->compat) { 8741 struct compat_iovec __user *ciovs; 8742 struct compat_iovec ciov; 8743 8744 ciovs = (struct compat_iovec __user *) arg; 8745 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) 8746 return -EFAULT; 8747 8748 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); 8749 dst->iov_len = ciov.iov_len; 8750 return 0; 8751 } 8752#endif 8753 src = (struct iovec __user *) arg; 8754 if (copy_from_user(dst, &src[index], sizeof(*dst))) 8755 return -EFAULT; 8756 return 0; 8757} 8758 8759/* 8760 * Not super efficient, but this only happens at registration time. And we do cache 8761 * the last compound head, so generally we'll only do a full search if we don't 8762 * match that one. 8763 * 8764 * We check if the given compound head page has already been accounted, to 8765 * avoid double accounting it. This allows us to account the full size of the 8766 * page, not just the constituent pages of a huge page.
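 * For example, a buffer backed by a single 2MB huge page (with 4K base pages)
 * accounts page_size(hpage) >> PAGE_SHIFT = 512 pages the first time that
 * head page is seen; the remaining constituent pages are skipped via the
 * cached head and the checks below.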
8767 */ 8768static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, 8769 int nr_pages, struct page *hpage) 8770{ 8771 int i, j; 8772 8773 /* check current page array */ 8774 for (i = 0; i < nr_pages; i++) { 8775 if (!PageCompound(pages[i])) 8776 continue; 8777 if (compound_head(pages[i]) == hpage) 8778 return true; 8779 } 8780 8781 /* check previously registered pages */ 8782 for (i = 0; i < ctx->nr_user_bufs; i++) { 8783 struct io_mapped_ubuf *imu = ctx->user_bufs[i]; 8784 8785 for (j = 0; j < imu->nr_bvecs; j++) { 8786 if (!PageCompound(imu->bvec[j].bv_page)) 8787 continue; 8788 if (compound_head(imu->bvec[j].bv_page) == hpage) 8789 return true; 8790 } 8791 } 8792 8793 return false; 8794} 8795 8796static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, 8797 int nr_pages, struct io_mapped_ubuf *imu, 8798 struct page **last_hpage) 8799{ 8800 int i, ret; 8801 8802 imu->acct_pages = 0; 8803 for (i = 0; i < nr_pages; i++) { 8804 if (!PageCompound(pages[i])) { 8805 imu->acct_pages++; 8806 } else { 8807 struct page *hpage; 8808 8809 hpage = compound_head(pages[i]); 8810 if (hpage == *last_hpage) 8811 continue; 8812 *last_hpage = hpage; 8813 if (headpage_already_acct(ctx, pages, i, hpage)) 8814 continue; 8815 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; 8816 } 8817 } 8818 8819 if (!imu->acct_pages) 8820 return 0; 8821 8822 ret = io_account_mem(ctx, imu->acct_pages); 8823 if (ret) 8824 imu->acct_pages = 0; 8825 return ret; 8826} 8827 8828static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, 8829 struct io_mapped_ubuf **pimu, 8830 struct page **last_hpage) 8831{ 8832 struct io_mapped_ubuf *imu = NULL; 8833 struct vm_area_struct **vmas = NULL; 8834 struct page **pages = NULL; 8835 unsigned long off, start, end, ubuf; 8836 size_t size; 8837 int ret, pret, nr_pages, i; 8838 8839 if (!iov->iov_base) { 8840 *pimu = ctx->dummy_ubuf; 8841 return 0; 8842 } 8843 8844 ubuf = (unsigned long) iov->iov_base; 8845 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; 8846 start = ubuf >> PAGE_SHIFT; 8847 nr_pages = end - start; 8848 8849 *pimu = NULL; 8850 ret = -ENOMEM; 8851 8852 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); 8853 if (!pages) 8854 goto done; 8855 8856 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), 8857 GFP_KERNEL); 8858 if (!vmas) 8859 goto done; 8860 8861 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); 8862 if (!imu) 8863 goto done; 8864 8865 ret = 0; 8866 mmap_read_lock(current->mm); 8867 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, 8868 pages, vmas); 8869 if (pret == nr_pages) { 8870 /* don't support file backed memory */ 8871 for (i = 0; i < nr_pages; i++) { 8872 struct vm_area_struct *vma = vmas[i]; 8873 8874 if (vma_is_shmem(vma)) 8875 continue; 8876 if (vma->vm_file && 8877 !is_file_hugepages(vma->vm_file)) { 8878 ret = -EOPNOTSUPP; 8879 break; 8880 } 8881 } 8882 } else { 8883 ret = pret < 0 ? 
pret : -EFAULT; 8884 } 8885 mmap_read_unlock(current->mm); 8886 if (ret) { 8887 /* 8888 * if we did partial map, or found file backed vmas, 8889 * release any pages we did get 8890 */ 8891 if (pret > 0) 8892 unpin_user_pages(pages, pret); 8893 goto done; 8894 } 8895 8896 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); 8897 if (ret) { 8898 unpin_user_pages(pages, pret); 8899 goto done; 8900 } 8901 8902 off = ubuf & ~PAGE_MASK; 8903 size = iov->iov_len; 8904 for (i = 0; i < nr_pages; i++) { 8905 size_t vec_len; 8906 8907 vec_len = min_t(size_t, size, PAGE_SIZE - off); 8908 imu->bvec[i].bv_page = pages[i]; 8909 imu->bvec[i].bv_len = vec_len; 8910 imu->bvec[i].bv_offset = off; 8911 off = 0; 8912 size -= vec_len; 8913 } 8914 /* store original address for later verification */ 8915 imu->ubuf = ubuf; 8916 imu->ubuf_end = ubuf + iov->iov_len; 8917 imu->nr_bvecs = nr_pages; 8918 *pimu = imu; 8919 ret = 0; 8920done: 8921 if (ret) 8922 kvfree(imu); 8923 kvfree(pages); 8924 kvfree(vmas); 8925 return ret; 8926} 8927 8928static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) 8929{ 8930 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); 8931 return ctx->user_bufs ? 0 : -ENOMEM; 8932} 8933 8934static int io_buffer_validate(struct iovec *iov) 8935{ 8936 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); 8937 8938 /* 8939 * Don't impose further limits on the size and buffer 8940 * constraints here, we'll -EINVAL later when IO is 8941 * submitted if they are wrong. 8942 */ 8943 if (!iov->iov_base) 8944 return iov->iov_len ? -EFAULT : 0; 8945 if (!iov->iov_len) 8946 return -EFAULT; 8947 8948 /* arbitrary limit, but we need something */ 8949 if (iov->iov_len > SZ_1G) 8950 return -EFAULT; 8951 8952 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) 8953 return -EOVERFLOW; 8954 8955 return 0; 8956} 8957 8958static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, 8959 unsigned int nr_args, u64 __user *tags) 8960{ 8961 struct page *last_hpage = NULL; 8962 struct io_rsrc_data *data; 8963 int i, ret; 8964 struct iovec iov; 8965 8966 if (ctx->user_bufs) 8967 return -EBUSY; 8968 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) 8969 return -EINVAL; 8970 ret = io_rsrc_node_switch_start(ctx); 8971 if (ret) 8972 return ret; 8973 ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); 8974 if (ret) 8975 return ret; 8976 ret = io_buffers_map_alloc(ctx, nr_args); 8977 if (ret) { 8978 io_rsrc_data_free(data); 8979 return ret; 8980 } 8981 8982 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { 8983 ret = io_copy_iov(ctx, &iov, arg, i); 8984 if (ret) 8985 break; 8986 ret = io_buffer_validate(&iov); 8987 if (ret) 8988 break; 8989 if (!iov.iov_base && *io_get_tag_slot(data, i)) { 8990 ret = -EINVAL; 8991 break; 8992 } 8993 8994 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], 8995 &last_hpage); 8996 if (ret) 8997 break; 8998 } 8999 9000 WARN_ON_ONCE(ctx->buf_data); 9001 9002 ctx->buf_data = data; 9003 if (ret) 9004 __io_sqe_buffers_unregister(ctx); 9005 else 9006 io_rsrc_node_switch(ctx, NULL); 9007 return ret; 9008} 9009 9010static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, 9011 struct io_uring_rsrc_update2 *up, 9012 unsigned int nr_args) 9013{ 9014 u64 __user *tags = u64_to_user_ptr(up->tags); 9015 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); 9016 struct page *last_hpage = NULL; 9017 bool needs_switch = false; 9018 __u32 done; 9019 int i, err; 9020 9021 if 
(!ctx->buf_data) 9022 return -ENXIO; 9023 if (up->offset + nr_args > ctx->nr_user_bufs) 9024 return -EINVAL; 9025 9026 for (done = 0; done < nr_args; done++) { 9027 struct io_mapped_ubuf *imu; 9028 int offset = up->offset + done; 9029 u64 tag = 0; 9030 9031 err = io_copy_iov(ctx, &iov, iovs, done); 9032 if (err) 9033 break; 9034 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { 9035 err = -EFAULT; 9036 break; 9037 } 9038 err = io_buffer_validate(&iov); 9039 if (err) 9040 break; 9041 if (!iov.iov_base && tag) { 9042 err = -EINVAL; 9043 break; 9044 } 9045 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); 9046 if (err) 9047 break; 9048 9049 i = array_index_nospec(offset, ctx->nr_user_bufs); 9050 if (ctx->user_bufs[i] != ctx->dummy_ubuf) { 9051 err = io_queue_rsrc_removal(ctx->buf_data, offset, 9052 ctx->rsrc_node, ctx->user_bufs[i]); 9053 if (unlikely(err)) { 9054 io_buffer_unmap(ctx, &imu); 9055 break; 9056 } 9057 ctx->user_bufs[i] = NULL; 9058 needs_switch = true; 9059 } 9060 9061 ctx->user_bufs[i] = imu; 9062 *io_get_tag_slot(ctx->buf_data, offset) = tag; 9063 } 9064 9065 if (needs_switch) 9066 io_rsrc_node_switch(ctx, ctx->buf_data); 9067 return done ? done : err; 9068} 9069 9070static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) 9071{ 9072 __s32 __user *fds = arg; 9073 int fd; 9074 9075 if (ctx->cq_ev_fd) 9076 return -EBUSY; 9077 9078 if (copy_from_user(&fd, fds, sizeof(*fds))) 9079 return -EFAULT; 9080 9081 ctx->cq_ev_fd = eventfd_ctx_fdget(fd); 9082 if (IS_ERR(ctx->cq_ev_fd)) { 9083 int ret = PTR_ERR(ctx->cq_ev_fd); 9084 9085 ctx->cq_ev_fd = NULL; 9086 return ret; 9087 } 9088 9089 return 0; 9090} 9091 9092static int io_eventfd_unregister(struct io_ring_ctx *ctx) 9093{ 9094 if (ctx->cq_ev_fd) { 9095 eventfd_ctx_put(ctx->cq_ev_fd); 9096 ctx->cq_ev_fd = NULL; 9097 return 0; 9098 } 9099 9100 return -ENXIO; 9101} 9102 9103static void io_destroy_buffers(struct io_ring_ctx *ctx) 9104{ 9105 struct io_buffer *buf; 9106 unsigned long index; 9107 9108 xa_for_each(&ctx->io_buffers, index, buf) 9109 __io_remove_buffers(ctx, buf, index, -1U); 9110} 9111 9112static void io_req_cache_free(struct list_head *list) 9113{ 9114 struct io_kiocb *req, *nxt; 9115 9116 list_for_each_entry_safe(req, nxt, list, inflight_entry) { 9117 list_del(&req->inflight_entry); 9118 kmem_cache_free(req_cachep, req); 9119 } 9120} 9121 9122static void io_req_caches_free(struct io_ring_ctx *ctx) 9123{ 9124 struct io_submit_state *state = &ctx->submit_state; 9125 9126 mutex_lock(&ctx->uring_lock); 9127 9128 if (state->free_reqs) { 9129 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); 9130 state->free_reqs = 0; 9131 } 9132 9133 io_flush_cached_locked_reqs(ctx, state); 9134 io_req_cache_free(&state->free_list); 9135 mutex_unlock(&ctx->uring_lock); 9136} 9137 9138static void io_wait_rsrc_data(struct io_rsrc_data *data) 9139{ 9140 if (data && !atomic_dec_and_test(&data->refs)) 9141 wait_for_completion(&data->done); 9142} 9143 9144static void io_ring_ctx_free(struct io_ring_ctx *ctx) 9145{ 9146 io_sq_thread_finish(ctx); 9147 9148 if (ctx->mm_account) { 9149 mmdrop(ctx->mm_account); 9150 ctx->mm_account = NULL; 9151 } 9152 9153 /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ 9154 io_wait_rsrc_data(ctx->buf_data); 9155 io_wait_rsrc_data(ctx->file_data); 9156 9157 mutex_lock(&ctx->uring_lock); 9158 if (ctx->buf_data) 9159 __io_sqe_buffers_unregister(ctx); 9160 if (ctx->file_data) 9161 __io_sqe_files_unregister(ctx); 9162 if (ctx->rings) 9163 
__io_cqring_overflow_flush(ctx, true); 9164 mutex_unlock(&ctx->uring_lock); 9165 io_eventfd_unregister(ctx); 9166 io_destroy_buffers(ctx); 9167 if (ctx->sq_creds) 9168 put_cred(ctx->sq_creds); 9169 9170 /* there are no registered resources left, nobody uses it */ 9171 if (ctx->rsrc_node) 9172 io_rsrc_node_destroy(ctx->rsrc_node); 9173 if (ctx->rsrc_backup_node) 9174 io_rsrc_node_destroy(ctx->rsrc_backup_node); 9175 flush_delayed_work(&ctx->rsrc_put_work); 9176 9177 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); 9178 WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); 9179 9180#if defined(CONFIG_UNIX) 9181 if (ctx->ring_sock) { 9182 ctx->ring_sock->file = NULL; /* so that iput() is called */ 9183 sock_release(ctx->ring_sock); 9184 } 9185#endif 9186 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); 9187 9188 io_mem_free(ctx->rings); 9189 io_mem_free(ctx->sq_sqes); 9190 9191 percpu_ref_exit(&ctx->refs); 9192 free_uid(ctx->user); 9193 io_req_caches_free(ctx); 9194 if (ctx->hash_map) 9195 io_wq_put_hash(ctx->hash_map); 9196 kfree(ctx->cancel_hash); 9197 kfree(ctx->dummy_ubuf); 9198 kfree(ctx); 9199} 9200 9201static __poll_t io_uring_poll(struct file *file, poll_table *wait) 9202{ 9203 struct io_ring_ctx *ctx = file->private_data; 9204 __poll_t mask = 0; 9205 9206 poll_wait(file, &ctx->poll_wait, wait); 9207 /* 9208 * synchronizes with barrier from wq_has_sleeper call in 9209 * io_commit_cqring 9210 */ 9211 smp_rmb(); 9212 if (!io_sqring_full(ctx)) 9213 mask |= EPOLLOUT | EPOLLWRNORM; 9214 9215 /* 9216 * Don't flush cqring overflow list here, just do a simple check. 9217 * Otherwise there could possibly be an ABBA deadlock: 9218 * CPU0 CPU1 9219 * ---- ---- 9220 * lock(&ctx->uring_lock); 9221 * lock(&ep->mtx); 9222 * lock(&ctx->uring_lock); 9223 * lock(&ep->mtx); 9224 * 9225 * Users may get EPOLLIN while seeing nothing in the cqring; this 9226 * pushes them to do the flush. 9227 */ 9228 if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) 9229 mask |= EPOLLIN | EPOLLRDNORM; 9230 9231 return mask; 9232} 9233 9234static int io_uring_fasync(int fd, struct file *file, int on) 9235{ 9236 struct io_ring_ctx *ctx = file->private_data; 9237 9238 return fasync_helper(fd, file, on, &ctx->cq_fasync); 9239} 9240 9241static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) 9242{ 9243 const struct cred *creds; 9244 9245 creds = xa_erase(&ctx->personalities, id); 9246 if (creds) { 9247 put_cred(creds); 9248 return 0; 9249 } 9250 9251 return -EINVAL; 9252} 9253 9254struct io_tctx_exit { 9255 struct callback_head task_work; 9256 struct completion completion; 9257 struct io_ring_ctx *ctx; 9258}; 9259 9260static void io_tctx_exit_cb(struct callback_head *cb) 9261{ 9262 struct io_uring_task *tctx = current->io_uring; 9263 struct io_tctx_exit *work; 9264 9265 work = container_of(cb, struct io_tctx_exit, task_work); 9266 /* 9267 * When @in_idle, we're in cancellation and it's racy to remove the 9268 * node. It'll be removed by the end of cancellation, just ignore it.
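 * (io_uring_cancel_generic() -> io_uring_clean_tctx() takes care of the
 * removal once that cancellation finishes.)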
9269 */ 9270 if (!atomic_read(&tctx->in_idle)) 9271 io_uring_del_tctx_node((unsigned long)work->ctx); 9272 complete(&work->completion); 9273} 9274 9275static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) 9276{ 9277 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 9278 9279 return req->ctx == data; 9280} 9281 9282static void io_ring_exit_work(struct work_struct *work) 9283{ 9284 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); 9285 unsigned long timeout = jiffies + HZ * 60 * 5; 9286 unsigned long interval = HZ / 20; 9287 struct io_tctx_exit exit; 9288 struct io_tctx_node *node; 9289 int ret; 9290 9291 /* 9292 * If we're doing polled IO and end up having requests being 9293 * submitted async (out-of-line), then completions can come in while 9294 * we're waiting for refs to drop. We need to reap these manually, 9295 * as nobody else will be looking for them. 9296 */ 9297 do { 9298 io_uring_try_cancel_requests(ctx, NULL, true); 9299 if (ctx->sq_data) { 9300 struct io_sq_data *sqd = ctx->sq_data; 9301 struct task_struct *tsk; 9302 9303 io_sq_thread_park(sqd); 9304 tsk = sqd->thread; 9305 if (tsk && tsk->io_uring && tsk->io_uring->io_wq) 9306 io_wq_cancel_cb(tsk->io_uring->io_wq, 9307 io_cancel_ctx_cb, ctx, true); 9308 io_sq_thread_unpark(sqd); 9309 } 9310 9311 if (WARN_ON_ONCE(time_after(jiffies, timeout))) { 9312 /* there is little hope left, don't run it too often */ 9313 interval = HZ * 60; 9314 } 9315 } while (!wait_for_completion_timeout(&ctx->ref_comp, interval)); 9316 9317 init_completion(&exit.completion); 9318 init_task_work(&exit.task_work, io_tctx_exit_cb); 9319 exit.ctx = ctx; 9320 /* 9321 * Some may use the context even when all refs and requests have been put, 9322 * and they are free to do so while still holding uring_lock or 9323 * completion_lock, see io_req_task_submit(). Apart from other work, 9324 * this lock/unlock section also waits for them to finish.
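 * The bare completion_lock lock/unlock pair at the bottom of this function
 * serves the same purpose for completion_lock holders.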
9325 */ 9326 mutex_lock(&ctx->uring_lock); 9327 while (!list_empty(&ctx->tctx_list)) { 9328 WARN_ON_ONCE(time_after(jiffies, timeout)); 9329 9330 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, 9331 ctx_node); 9332 /* don't spin on a single task if cancellation failed */ 9333 list_rotate_left(&ctx->tctx_list); 9334 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); 9335 if (WARN_ON_ONCE(ret)) 9336 continue; 9337 wake_up_process(node->task); 9338 9339 mutex_unlock(&ctx->uring_lock); 9340 wait_for_completion(&exit.completion); 9341 mutex_lock(&ctx->uring_lock); 9342 } 9343 mutex_unlock(&ctx->uring_lock); 9344 spin_lock(&ctx->completion_lock); 9345 spin_unlock(&ctx->completion_lock); 9346 9347 io_ring_ctx_free(ctx); 9348} 9349 9350/* Returns true if we found and killed one or more timeouts */ 9351static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, 9352 bool cancel_all) 9353{ 9354 struct io_kiocb *req, *tmp; 9355 int canceled = 0; 9356 9357 spin_lock(&ctx->completion_lock); 9358 spin_lock_irq(&ctx->timeout_lock); 9359 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { 9360 if (io_match_task(req, tsk, cancel_all)) { 9361 io_kill_timeout(req, -ECANCELED); 9362 canceled++; 9363 } 9364 } 9365 spin_unlock_irq(&ctx->timeout_lock); 9366 if (canceled != 0) 9367 io_commit_cqring(ctx); 9368 spin_unlock(&ctx->completion_lock); 9369 if (canceled != 0) 9370 io_cqring_ev_posted(ctx); 9371 return canceled != 0; 9372} 9373 9374static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) 9375{ 9376 unsigned long index; 9377 struct creds *creds; 9378 9379 mutex_lock(&ctx->uring_lock); 9380 percpu_ref_kill(&ctx->refs); 9381 if (ctx->rings) 9382 __io_cqring_overflow_flush(ctx, true); 9383 xa_for_each(&ctx->personalities, index, creds) 9384 io_unregister_personality(ctx, index); 9385 mutex_unlock(&ctx->uring_lock); 9386 9387 io_kill_timeouts(ctx, NULL, true); 9388 io_poll_remove_all(ctx, NULL, true); 9389 9390 /* if we failed setting up the ctx, we might not have any rings */ 9391 io_iopoll_try_reap_events(ctx); 9392 9393 INIT_WORK(&ctx->exit_work, io_ring_exit_work); 9394 /* 9395 * Use system_unbound_wq to avoid spawning tons of event kworkers 9396 * if we're exiting a ton of rings at the same time. It just adds 9397 * noise and overhead, there's no discernable change in runtime 9398 * over using system_wq. 
9399 */ 9400 queue_work(system_unbound_wq, &ctx->exit_work); 9401} 9402 9403static int io_uring_release(struct inode *inode, struct file *file) 9404{ 9405 struct io_ring_ctx *ctx = file->private_data; 9406 9407 file->private_data = NULL; 9408 io_ring_ctx_wait_and_kill(ctx); 9409 return 0; 9410} 9411 9412struct io_task_cancel { 9413 struct task_struct *task; 9414 bool all; 9415}; 9416 9417static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 9418{ 9419 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 9420 struct io_task_cancel *cancel = data; 9421 bool ret; 9422 9423 if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) { 9424 struct io_ring_ctx *ctx = req->ctx; 9425 9426 /* protect against races with linked timeouts */ 9427 spin_lock(&ctx->completion_lock); 9428 ret = io_match_task(req, cancel->task, cancel->all); 9429 spin_unlock(&ctx->completion_lock); 9430 } else { 9431 ret = io_match_task(req, cancel->task, cancel->all); 9432 } 9433 return ret; 9434} 9435 9436static bool io_cancel_defer_files(struct io_ring_ctx *ctx, 9437 struct task_struct *task, bool cancel_all) 9438{ 9439 struct io_defer_entry *de; 9440 LIST_HEAD(list); 9441 9442 spin_lock(&ctx->completion_lock); 9443 list_for_each_entry_reverse(de, &ctx->defer_list, list) { 9444 if (io_match_task(de->req, task, cancel_all)) { 9445 list_cut_position(&list, &ctx->defer_list, &de->list); 9446 break; 9447 } 9448 } 9449 spin_unlock(&ctx->completion_lock); 9450 if (list_empty(&list)) 9451 return false; 9452 9453 while (!list_empty(&list)) { 9454 de = list_first_entry(&list, struct io_defer_entry, list); 9455 list_del_init(&de->list); 9456 io_req_complete_failed(de->req, -ECANCELED); 9457 kfree(de); 9458 } 9459 return true; 9460} 9461 9462static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) 9463{ 9464 struct io_tctx_node *node; 9465 enum io_wq_cancel cret; 9466 bool ret = false; 9467 9468 mutex_lock(&ctx->uring_lock); 9469 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 9470 struct io_uring_task *tctx = node->task->io_uring; 9471 9472 /* 9473 * io_wq will stay alive while we hold uring_lock, because it's 9474 * killed after ctx nodes, which requires to take the lock. 9475 */ 9476 if (!tctx || !tctx->io_wq) 9477 continue; 9478 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); 9479 ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 9480 } 9481 mutex_unlock(&ctx->uring_lock); 9482 9483 return ret; 9484} 9485 9486static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 9487 struct task_struct *task, 9488 bool cancel_all) 9489{ 9490 struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; 9491 struct io_uring_task *tctx = task ? task->io_uring : NULL; 9492 9493 while (1) { 9494 enum io_wq_cancel cret; 9495 bool ret = false; 9496 9497 if (!task) { 9498 ret |= io_uring_try_cancel_iowq(ctx); 9499 } else if (tctx && tctx->io_wq) { 9500 /* 9501 * Cancels requests of all rings, not only @ctx, but 9502 * it's fine as the task is in exit/exec. 
9503 */ 9504 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, 9505 &cancel, true); 9506 ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 9507 } 9508 9509 /* SQPOLL thread does its own polling */ 9510 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || 9511 (ctx->sq_data && ctx->sq_data->thread == current)) { 9512 while (!list_empty_careful(&ctx->iopoll_list)) { 9513 io_iopoll_try_reap_events(ctx); 9514 ret = true; 9515 } 9516 } 9517 9518 ret |= io_cancel_defer_files(ctx, task, cancel_all); 9519 ret |= io_poll_remove_all(ctx, task, cancel_all); 9520 ret |= io_kill_timeouts(ctx, task, cancel_all); 9521 if (task) 9522 ret |= io_run_task_work(); 9523 if (!ret) 9524 break; 9525 cond_resched(); 9526 } 9527} 9528 9529static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) 9530{ 9531 struct io_uring_task *tctx = current->io_uring; 9532 struct io_tctx_node *node; 9533 int ret; 9534 9535 if (unlikely(!tctx)) { 9536 ret = io_uring_alloc_task_context(current, ctx); 9537 if (unlikely(ret)) 9538 return ret; 9539 tctx = current->io_uring; 9540 } 9541 if (!xa_load(&tctx->xa, (unsigned long)ctx)) { 9542 node = kmalloc(sizeof(*node), GFP_KERNEL); 9543 if (!node) 9544 return -ENOMEM; 9545 node->ctx = ctx; 9546 node->task = current; 9547 9548 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, 9549 node, GFP_KERNEL)); 9550 if (ret) { 9551 kfree(node); 9552 return ret; 9553 } 9554 9555 mutex_lock(&ctx->uring_lock); 9556 list_add(&node->ctx_node, &ctx->tctx_list); 9557 mutex_unlock(&ctx->uring_lock); 9558 } 9559 tctx->last = ctx; 9560 return 0; 9561} 9562 9563/* 9564 * Note that this task has used io_uring. We use it for cancelation purposes. 9565 */ 9566static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) 9567{ 9568 struct io_uring_task *tctx = current->io_uring; 9569 9570 if (likely(tctx && tctx->last == ctx)) 9571 return 0; 9572 return __io_uring_add_tctx_node(ctx); 9573} 9574 9575/* 9576 * Remove this io_uring_file -> task mapping. 9577 */ 9578static void io_uring_del_tctx_node(unsigned long index) 9579{ 9580 struct io_uring_task *tctx = current->io_uring; 9581 struct io_tctx_node *node; 9582 9583 if (!tctx) 9584 return; 9585 node = xa_erase(&tctx->xa, index); 9586 if (!node) 9587 return; 9588 9589 WARN_ON_ONCE(current != node->task); 9590 WARN_ON_ONCE(list_empty(&node->ctx_node)); 9591 9592 mutex_lock(&node->ctx->uring_lock); 9593 list_del(&node->ctx_node); 9594 mutex_unlock(&node->ctx->uring_lock); 9595 9596 if (tctx->last == node->ctx) 9597 tctx->last = NULL; 9598 kfree(node); 9599} 9600 9601static void io_uring_clean_tctx(struct io_uring_task *tctx) 9602{ 9603 struct io_wq *wq = tctx->io_wq; 9604 struct io_tctx_node *node; 9605 unsigned long index; 9606 9607 xa_for_each(&tctx->xa, index, node) 9608 io_uring_del_tctx_node(index); 9609 if (wq) { 9610 /* 9611 * Must be after io_uring_del_task_file() (removes nodes under 9612 * uring_lock) to avoid race with io_uring_try_cancel_iowq(). 
9613 */ 9614 io_wq_put_and_exit(wq); 9615 tctx->io_wq = NULL; 9616 } 9617} 9618 9619static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) 9620{ 9621 if (tracked) 9622 return atomic_read(&tctx->inflight_tracked); 9623 return percpu_counter_sum(&tctx->inflight); 9624} 9625 9626static void io_uring_drop_tctx_refs(struct task_struct *task) 9627{ 9628 struct io_uring_task *tctx = task->io_uring; 9629 unsigned int refs = tctx->cached_refs; 9630 9631 if (refs) { 9632 tctx->cached_refs = 0; 9633 percpu_counter_sub(&tctx->inflight, refs); 9634 put_task_struct_many(task, refs); 9635 } 9636} 9637 9638/* 9639 * Find any io_uring ctx that this task has registered or done IO on, and cancel 9640 * requests. @sqd should be non-NULL iff it's an SQPOLL thread cancellation. 9641 */ 9642static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) 9643{ 9644 struct io_uring_task *tctx = current->io_uring; 9645 struct io_ring_ctx *ctx; 9646 s64 inflight; 9647 DEFINE_WAIT(wait); 9648 9649 WARN_ON_ONCE(sqd && sqd->thread != current); 9650 9651 if (!current->io_uring) 9652 return; 9653 if (tctx->io_wq) 9654 io_wq_exit_start(tctx->io_wq); 9655 9656 atomic_inc(&tctx->in_idle); 9657 do { 9658 io_uring_drop_tctx_refs(current); 9659 /* read completions before cancelations */ 9660 inflight = tctx_inflight(tctx, !cancel_all); 9661 if (!inflight) 9662 break; 9663 9664 if (!sqd) { 9665 struct io_tctx_node *node; 9666 unsigned long index; 9667 9668 xa_for_each(&tctx->xa, index, node) { 9669 /* sqpoll task will cancel all its requests */ 9670 if (node->ctx->sq_data) 9671 continue; 9672 io_uring_try_cancel_requests(node->ctx, current, 9673 cancel_all); 9674 } 9675 } else { 9676 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 9677 io_uring_try_cancel_requests(ctx, current, 9678 cancel_all); 9679 } 9680 9681 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); 9682 io_uring_drop_tctx_refs(current); 9683 /* 9684 * If we've seen completions, retry without waiting. This 9685 * avoids a race where a completion comes in before we did 9686 * prepare_to_wait().
9687 */ 9688 if (inflight == tctx_inflight(tctx, !cancel_all)) 9689 schedule(); 9690 finish_wait(&tctx->wait, &wait); 9691 } while (1); 9692 atomic_dec(&tctx->in_idle); 9693 9694 io_uring_clean_tctx(tctx); 9695 if (cancel_all) { 9696 /* for exec all current's requests should be gone, kill tctx */ 9697 __io_uring_free(current); 9698 } 9699} 9700 9701void __io_uring_cancel(bool cancel_all) 9702{ 9703 io_uring_cancel_generic(cancel_all, NULL); 9704} 9705 9706static void *io_uring_validate_mmap_request(struct file *file, 9707 loff_t pgoff, size_t sz) 9708{ 9709 struct io_ring_ctx *ctx = file->private_data; 9710 loff_t offset = pgoff << PAGE_SHIFT; 9711 struct page *page; 9712 void *ptr; 9713 9714 switch (offset) { 9715 case IORING_OFF_SQ_RING: 9716 case IORING_OFF_CQ_RING: 9717 ptr = ctx->rings; 9718 break; 9719 case IORING_OFF_SQES: 9720 ptr = ctx->sq_sqes; 9721 break; 9722 default: 9723 return ERR_PTR(-EINVAL); 9724 } 9725 9726 page = virt_to_head_page(ptr); 9727 if (sz > page_size(page)) 9728 return ERR_PTR(-EINVAL); 9729 9730 return ptr; 9731} 9732 9733#ifdef CONFIG_MMU 9734 9735static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 9736{ 9737 size_t sz = vma->vm_end - vma->vm_start; 9738 unsigned long pfn; 9739 void *ptr; 9740 9741 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); 9742 if (IS_ERR(ptr)) 9743 return PTR_ERR(ptr); 9744 9745 pfn = virt_to_phys(ptr) >> PAGE_SHIFT; 9746 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); 9747} 9748 9749#else /* !CONFIG_MMU */ 9750 9751static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 9752{ 9753 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL; 9754} 9755 9756static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) 9757{ 9758 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; 9759} 9760 9761static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, 9762 unsigned long addr, unsigned long len, 9763 unsigned long pgoff, unsigned long flags) 9764{ 9765 void *ptr; 9766 9767 ptr = io_uring_validate_mmap_request(file, pgoff, len); 9768 if (IS_ERR(ptr)) 9769 return PTR_ERR(ptr); 9770 9771 return (unsigned long) ptr; 9772} 9773 9774#endif /* !CONFIG_MMU */ 9775 9776static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) 9777{ 9778 DEFINE_WAIT(wait); 9779 9780 do { 9781 if (!io_sqring_full(ctx)) 9782 break; 9783 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); 9784 9785 if (!io_sqring_full(ctx)) 9786 break; 9787 schedule(); 9788 } while (!signal_pending(current)); 9789 9790 finish_wait(&ctx->sqo_sq_wait, &wait); 9791 return 0; 9792} 9793 9794static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, 9795 struct __kernel_timespec __user **ts, 9796 const sigset_t __user **sig) 9797{ 9798 struct io_uring_getevents_arg arg; 9799 9800 /* 9801 * If EXT_ARG isn't set, then we have no timespec and the argp pointer 9802 * is just a pointer to the sigset_t. 9803 */ 9804 if (!(flags & IORING_ENTER_EXT_ARG)) { 9805 *sig = (const sigset_t __user *) argp; 9806 *ts = NULL; 9807 return 0; 9808 } 9809 9810 /* 9811 * EXT_ARG is set - ensure we agree on the size of it and copy in our 9812 * timespec and sigset_t pointers if good. 
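 *
 * As a rough illustration (variable names here are made up; liburing wraps
 * this), userspace passes the extended form roughly like so:
 *
 *	struct io_uring_getevents_arg arg = {
 *		.sigmask	= (__u64)(unsigned long)&mask,
 *		.sigmask_sz	= _NSIG / 8,
 *		.ts		= (__u64)(unsigned long)&ts,
 *	};
 *
 *	syscall(__NR_io_uring_enter, ring_fd, 0, 1,
 *		IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
 *		&arg, sizeof(arg));
 *
 * i.e. argsz is sizeof(arg) rather than the sigset size, which is what the
 * size check below enforces.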
9813 */ 9814 if (*argsz != sizeof(arg)) 9815 return -EINVAL; 9816 if (copy_from_user(&arg, argp, sizeof(arg))) 9817 return -EFAULT; 9818 *sig = u64_to_user_ptr(arg.sigmask); 9819 *argsz = arg.sigmask_sz; 9820 *ts = u64_to_user_ptr(arg.ts); 9821 return 0; 9822} 9823 9824SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, 9825 u32, min_complete, u32, flags, const void __user *, argp, 9826 size_t, argsz) 9827{ 9828 struct io_ring_ctx *ctx; 9829 int submitted = 0; 9830 struct fd f; 9831 long ret; 9832 9833 io_run_task_work(); 9834 9835 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 9836 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) 9837 return -EINVAL; 9838 9839 f = fdget(fd); 9840 if (unlikely(!f.file)) 9841 return -EBADF; 9842 9843 ret = -EOPNOTSUPP; 9844 if (unlikely(f.file->f_op != &io_uring_fops)) 9845 goto out_fput; 9846 9847 ret = -ENXIO; 9848 ctx = f.file->private_data; 9849 if (unlikely(!percpu_ref_tryget(&ctx->refs))) 9850 goto out_fput; 9851 9852 ret = -EBADFD; 9853 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) 9854 goto out; 9855 9856 /* 9857 * For SQ polling, the thread will do all submissions and completions. 9858 * Just return the requested submit count, and wake the thread if 9859 * we were asked to. 9860 */ 9861 ret = 0; 9862 if (ctx->flags & IORING_SETUP_SQPOLL) { 9863 io_cqring_overflow_flush(ctx); 9864 9865 if (unlikely(ctx->sq_data->thread == NULL)) { 9866 ret = -EOWNERDEAD; 9867 goto out; 9868 } 9869 if (flags & IORING_ENTER_SQ_WAKEUP) 9870 wake_up(&ctx->sq_data->wait); 9871 if (flags & IORING_ENTER_SQ_WAIT) { 9872 ret = io_sqpoll_wait_sq(ctx); 9873 if (ret) 9874 goto out; 9875 } 9876 submitted = to_submit; 9877 } else if (to_submit) { 9878 ret = io_uring_add_tctx_node(ctx); 9879 if (unlikely(ret)) 9880 goto out; 9881 mutex_lock(&ctx->uring_lock); 9882 submitted = io_submit_sqes(ctx, to_submit); 9883 mutex_unlock(&ctx->uring_lock); 9884 9885 if (submitted != to_submit) 9886 goto out; 9887 } 9888 if (flags & IORING_ENTER_GETEVENTS) { 9889 const sigset_t __user *sig; 9890 struct __kernel_timespec __user *ts; 9891 9892 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 9893 if (unlikely(ret)) 9894 goto out; 9895 9896 min_complete = min(min_complete, ctx->cq_entries); 9897 9898 /* 9899 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user 9900 * space applications don't need to do io completion events 9901 * polling again, they can rely on io_sq_thread to do polling 9902 * work, which can reduce cpu usage and uring_lock contention. 9903 */ 9904 if (ctx->flags & IORING_SETUP_IOPOLL && 9905 !(ctx->flags & IORING_SETUP_SQPOLL)) { 9906 ret = io_iopoll_check(ctx, min_complete); 9907 } else { 9908 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); 9909 } 9910 } 9911 9912out: 9913 percpu_ref_put(&ctx->refs); 9914out_fput: 9915 fdput(f); 9916 return submitted ? 
submitted : ret; 9917} 9918 9919#ifdef CONFIG_PROC_FS 9920static int io_uring_show_cred(struct seq_file *m, unsigned int id, 9921 const struct cred *cred) 9922{ 9923 struct user_namespace *uns = seq_user_ns(m); 9924 struct group_info *gi; 9925 kernel_cap_t cap; 9926 unsigned __capi; 9927 int g; 9928 9929 seq_printf(m, "%5d\n", id); 9930 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); 9931 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); 9932 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); 9933 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); 9934 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); 9935 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); 9936 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); 9937 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); 9938 seq_puts(m, "\n\tGroups:\t"); 9939 gi = cred->group_info; 9940 for (g = 0; g < gi->ngroups; g++) { 9941 seq_put_decimal_ull(m, g ? " " : "", 9942 from_kgid_munged(uns, gi->gid[g])); 9943 } 9944 seq_puts(m, "\n\tCapEff:\t"); 9945 cap = cred->cap_effective; 9946 CAP_FOR_EACH_U32(__capi) 9947 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); 9948 seq_putc(m, '\n'); 9949 return 0; 9950} 9951 9952static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) 9953{ 9954 struct io_sq_data *sq = NULL; 9955 bool has_lock; 9956 int i; 9957 9958 /* 9959 * Avoid ABBA deadlock between the seq lock and the io_uring mutex, 9960 * since fdinfo case grabs it in the opposite direction of normal use 9961 * cases. If we fail to get the lock, we just don't iterate any 9962 * structures that could be going away outside the io_uring mutex. 9963 */ 9964 has_lock = mutex_trylock(&ctx->uring_lock); 9965 9966 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { 9967 sq = ctx->sq_data; 9968 if (!sq->thread) 9969 sq = NULL; 9970 } 9971 9972 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); 9973 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? 
task_cpu(sq->thread) : -1); 9974 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); 9975 for (i = 0; has_lock && i < ctx->nr_user_files; i++) { 9976 struct file *f = io_file_from_index(ctx, i); 9977 9978 if (f) 9979 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); 9980 else 9981 seq_printf(m, "%5u: <none>\n", i); 9982 } 9983 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); 9984 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { 9985 struct io_mapped_ubuf *buf = ctx->user_bufs[i]; 9986 unsigned int len = buf->ubuf_end - buf->ubuf; 9987 9988 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); 9989 } 9990 if (has_lock && !xa_empty(&ctx->personalities)) { 9991 unsigned long index; 9992 const struct cred *cred; 9993 9994 seq_printf(m, "Personalities:\n"); 9995 xa_for_each(&ctx->personalities, index, cred) 9996 io_uring_show_cred(m, index, cred); 9997 } 9998 seq_printf(m, "PollList:\n"); 9999 spin_lock(&ctx->completion_lock); 10000 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 10001 struct hlist_head *list = &ctx->cancel_hash[i]; 10002 struct io_kiocb *req; 10003 10004 hlist_for_each_entry(req, list, hash_node) 10005 seq_printf(m, " op=%d, task_works=%d\n", req->opcode, 10006 req->task->task_works != NULL); 10007 } 10008 spin_unlock(&ctx->completion_lock); 10009 if (has_lock) 10010 mutex_unlock(&ctx->uring_lock); 10011} 10012 10013static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) 10014{ 10015 struct io_ring_ctx *ctx = f->private_data; 10016 10017 if (percpu_ref_tryget(&ctx->refs)) { 10018 __io_uring_show_fdinfo(ctx, m); 10019 percpu_ref_put(&ctx->refs); 10020 } 10021} 10022#endif 10023 10024static const struct file_operations io_uring_fops = { 10025 .release = io_uring_release, 10026 .mmap = io_uring_mmap, 10027#ifndef CONFIG_MMU 10028 .get_unmapped_area = io_uring_nommu_get_unmapped_area, 10029 .mmap_capabilities = io_uring_nommu_mmap_capabilities, 10030#endif 10031 .poll = io_uring_poll, 10032 .fasync = io_uring_fasync, 10033#ifdef CONFIG_PROC_FS 10034 .show_fdinfo = io_uring_show_fdinfo, 10035#endif 10036}; 10037 10038static int io_allocate_scq_urings(struct io_ring_ctx *ctx, 10039 struct io_uring_params *p) 10040{ 10041 struct io_rings *rings; 10042 size_t size, sq_array_offset; 10043 10044 /* make sure these are sane, as we already accounted them */ 10045 ctx->sq_entries = p->sq_entries; 10046 ctx->cq_entries = p->cq_entries; 10047 10048 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); 10049 if (size == SIZE_MAX) 10050 return -EOVERFLOW; 10051 10052 rings = io_mem_alloc(size); 10053 if (!rings) 10054 return -ENOMEM; 10055 10056 ctx->rings = rings; 10057 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 10058 rings->sq_ring_mask = p->sq_entries - 1; 10059 rings->cq_ring_mask = p->cq_entries - 1; 10060 rings->sq_ring_entries = p->sq_entries; 10061 rings->cq_ring_entries = p->cq_entries; 10062 10063 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); 10064 if (size == SIZE_MAX) { 10065 io_mem_free(ctx->rings); 10066 ctx->rings = NULL; 10067 return -EOVERFLOW; 10068 } 10069 10070 ctx->sq_sqes = io_mem_alloc(size); 10071 if (!ctx->sq_sqes) { 10072 io_mem_free(ctx->rings); 10073 ctx->rings = NULL; 10074 return -ENOMEM; 10075 } 10076 10077 return 0; 10078} 10079 10080static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) 10081{ 10082 int ret, fd; 10083 10084 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); 10085 if (fd < 0) 10086 return fd; 10087 10088 ret = io_uring_add_tctx_node(ctx); 10089 if 
(ret) { 10090 put_unused_fd(fd); 10091 return ret; 10092 } 10093 fd_install(fd, file); 10094 return fd; 10095} 10096 10097/* 10098 * Allocate an anonymous fd, this is what constitutes the application 10099 * visible backing of an io_uring instance. The application mmaps this 10100 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, 10101 * we have to tie this fd to a socket for file garbage collection purposes. 10102 */ 10103static struct file *io_uring_get_file(struct io_ring_ctx *ctx) 10104{ 10105 struct file *file; 10106#if defined(CONFIG_UNIX) 10107 int ret; 10108 10109 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, 10110 &ctx->ring_sock); 10111 if (ret) 10112 return ERR_PTR(ret); 10113#endif 10114 10115 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, 10116 O_RDWR | O_CLOEXEC); 10117#if defined(CONFIG_UNIX) 10118 if (IS_ERR(file)) { 10119 sock_release(ctx->ring_sock); 10120 ctx->ring_sock = NULL; 10121 } else { 10122 ctx->ring_sock->file = file; 10123 } 10124#endif 10125 return file; 10126} 10127 10128static int io_uring_create(unsigned entries, struct io_uring_params *p, 10129 struct io_uring_params __user *params) 10130{ 10131 struct io_ring_ctx *ctx; 10132 struct file *file; 10133 int ret; 10134 10135 if (!entries) 10136 return -EINVAL; 10137 if (entries > IORING_MAX_ENTRIES) { 10138 if (!(p->flags & IORING_SETUP_CLAMP)) 10139 return -EINVAL; 10140 entries = IORING_MAX_ENTRIES; 10141 } 10142 10143 /* 10144 * Use twice as many entries for the CQ ring. It's possible for the 10145 * application to drive a higher depth than the size of the SQ ring, 10146 * since the sqes are only used at submission time. This allows for 10147 * some flexibility in overcommitting a bit. If the application has 10148 * set IORING_SETUP_CQSIZE, it will have passed in the desired number 10149 * of CQ ring entries manually. 10150 */ 10151 p->sq_entries = roundup_pow_of_two(entries); 10152 if (p->flags & IORING_SETUP_CQSIZE) { 10153 /* 10154 * If IORING_SETUP_CQSIZE is set, we do the same roundup 10155 * to a power-of-two, if it isn't already. We do NOT impose 10156 * any cq vs sq ring sizing. 10157 */ 10158 if (!p->cq_entries) 10159 return -EINVAL; 10160 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { 10161 if (!(p->flags & IORING_SETUP_CLAMP)) 10162 return -EINVAL; 10163 p->cq_entries = IORING_MAX_CQ_ENTRIES; 10164 } 10165 p->cq_entries = roundup_pow_of_two(p->cq_entries); 10166 if (p->cq_entries < p->sq_entries) 10167 return -EINVAL; 10168 } else { 10169 p->cq_entries = 2 * p->sq_entries; 10170 } 10171 10172 ctx = io_ring_ctx_alloc(p); 10173 if (!ctx) 10174 return -ENOMEM; 10175 ctx->compat = in_compat_syscall(); 10176 if (!capable(CAP_IPC_LOCK)) 10177 ctx->user = get_uid(current_user()); 10178 10179 /* 10180 * This is just grabbed for accounting purposes. When a process exits, 10181 * the mm is exited and dropped before the files, hence we need to hang 10182 * on to this mm purely for the purposes of being able to unaccount 10183 * memory (locked/pinned vm). It's not used for anything else. 
static int io_uring_create(unsigned entries, struct io_uring_params *p,
			   struct io_uring_params __user *params)
{
	struct io_ring_ctx *ctx;
	struct file *file;
	int ret;

	if (!entries)
		return -EINVAL;
	if (entries > IORING_MAX_ENTRIES) {
		if (!(p->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		entries = IORING_MAX_ENTRIES;
	}

	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
	 * since the sqes are only used at submission time. This allows for
	 * some flexibility in overcommitting a bit. If the application has
	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
	 * of CQ ring entries manually.
	 */
	p->sq_entries = roundup_pow_of_two(entries);
	if (p->flags & IORING_SETUP_CQSIZE) {
		/*
		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
		 * to a power-of-two, if it isn't already. We do NOT impose
		 * any cq vs sq ring sizing.
		 */
		if (!p->cq_entries)
			return -EINVAL;
		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
			if (!(p->flags & IORING_SETUP_CLAMP))
				return -EINVAL;
			p->cq_entries = IORING_MAX_CQ_ENTRIES;
		}
		p->cq_entries = roundup_pow_of_two(p->cq_entries);
		if (p->cq_entries < p->sq_entries)
			return -EINVAL;
	} else {
		p->cq_entries = 2 * p->sq_entries;
	}

	ctx = io_ring_ctx_alloc(p);
	if (!ctx)
		return -ENOMEM;
	ctx->compat = in_compat_syscall();
	if (!capable(CAP_IPC_LOCK))
		ctx->user = get_uid(current_user());

	/*
	 * This is just grabbed for accounting purposes. When a process exits,
	 * the mm is exited and dropped before the files, hence we need to hang
	 * on to this mm purely for the purposes of being able to unaccount
	 * memory (locked/pinned vm). It's not used for anything else.
	 */
	mmgrab(current->mm);
	ctx->mm_account = current->mm;

	ret = io_allocate_scq_urings(ctx, p);
	if (ret)
		goto err;

	ret = io_sq_offload_create(ctx, p);
	if (ret)
		goto err;
	/* always set a rsrc node */
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		goto err;
	io_rsrc_node_switch(ctx, NULL);

	memset(&p->sq_off, 0, sizeof(p->sq_off));
	p->sq_off.head = offsetof(struct io_rings, sq.head);
	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;

	memset(&p->cq_off, 0, sizeof(p->cq_off));
	p->cq_off.head = offsetof(struct io_rings, cq.head);
	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
	p->cq_off.cqes = offsetof(struct io_rings, cqes);
	p->cq_off.flags = offsetof(struct io_rings, cq_flags);

	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
			IORING_FEAT_RSRC_TAGS;

	if (copy_to_user(params, p, sizeof(*p))) {
		ret = -EFAULT;
		goto err;
	}

	file = io_uring_get_file(ctx);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto err;
	}

	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
	ret = io_uring_install_fd(ctx, file);
	if (ret < 0) {
		/* fput will clean it up */
		fput(file);
		return ret;
	}

	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
	return ret;
err:
	io_ring_ctx_wait_and_kill(ctx);
	return ret;
}

/*
 * Sets up an io_uring context, and returns the fd. Applications ask for a
 * ring size; we return the actual sq/cq ring sizes (among other things) in
 * the params structure passed in.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
	struct io_uring_params p;
	int i;

	if (copy_from_user(&p, params, sizeof(p)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
		if (p.resv[i])
			return -EINVAL;
	}

	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
			IORING_SETUP_R_DISABLED))
		return -EINVAL;

	return io_uring_create(entries, &p, params);
}

SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}
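/*
 * Illustrative only (userspace, not kernel code): the setup sequence an
 * application performs against this syscall, roughly what liburing's
 * io_uring_queue_init() does, looks like
 *
 *	struct io_uring_params p = { };
 *	int ring_fd = syscall(__NR_io_uring_setup, 8, &p);
 *
 * followed by mmap()s of the rings and SQE array at IORING_OFF_SQ_RING,
 * IORING_OFF_CQ_RING and IORING_OFF_SQES, using the offsets returned in
 * p.sq_off and p.cq_off.
 */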
static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_op_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}
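/*
 * Register restrictions for a ring created with IORING_SETUP_R_DISABLED:
 * which io_uring_register() opcodes, SQE opcodes and SQE flags remain
 * usable once the ring is enabled. Only a single registration is allowed,
 * and any failure resets the whole set.
 */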
static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
				    unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	if (up->resv)
		return -EINVAL;
	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
				    unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
				   unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv || rr.resv2)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}
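/*
 * Restrict the calling task's io-wq workers to the CPU mask supplied by
 * the application; the mask is capped at cpumask_size() bytes. The
 * matching unregister below simply clears the affinity again.
 */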
static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
				unsigned len)
{
	struct io_uring_task *tctx = current->io_uring;
	cpumask_var_t new_mask;
	int ret;

	if (!tctx || !tctx->io_wq)
		return -EINVAL;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

	if (copy_from_user(new_mask, arg, len)) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	struct io_uring_task *tctx = current->io_uring;

	if (!tctx || !tctx->io_wq)
		return -EINVAL;

	return io_wq_cpu_affinity(tctx->io_wq, NULL);
}

static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					void __user *arg)
{
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	ret = -EINVAL;
	if (!tctx || !tctx->io_wq)
		goto err;

	ret = io_wq_max_workers(tctx->io_wq, new_count);
	if (ret)
		goto err;

	if (sqd)
		mutex_unlock(&sqd->lock);

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	return 0;
err:
	if (sqd)
		mutex_unlock(&sqd->lock);
	return ret;
}

static bool io_register_op_must_quiesce(int op)
{
	switch (op) {
	case IORING_REGISTER_BUFFERS:
	case IORING_UNREGISTER_BUFFERS:
	case IORING_REGISTER_FILES:
	case IORING_UNREGISTER_FILES:
	case IORING_REGISTER_FILES_UPDATE:
	case IORING_REGISTER_PROBE:
	case IORING_REGISTER_PERSONALITY:
	case IORING_UNREGISTER_PERSONALITY:
	case IORING_REGISTER_FILES2:
	case IORING_REGISTER_FILES_UPDATE2:
	case IORING_REGISTER_BUFFERS2:
	case IORING_REGISTER_BUFFERS_UPDATE:
	case IORING_REGISTER_IOWQ_AFF:
	case IORING_UNREGISTER_IOWQ_AFF:
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		return false;
	default:
		return true;
	}
}
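/*
 * Quiesce the ring for register opcodes that need exclusive access: kill
 * the percpu ref and wait for outstanding references to drain, running
 * task_work in the meantime so pending requests can still make progress.
 */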
static int io_ctx_quiesce(struct io_ring_ctx *ctx)
{
	long ret;

	percpu_ref_kill(&ctx->refs);

	/*
	 * Drop uring mutex before waiting for references to exit. If another
	 * thread is currently inside io_uring_enter() it might need to grab the
	 * uring_lock to make progress. If we hold it here across the drain
	 * wait, then we can deadlock. It's safe to drop the mutex here, since
	 * no new references will come in after we've killed the percpu ref.
	 */
	mutex_unlock(&ctx->uring_lock);
	do {
		ret = wait_for_completion_interruptible(&ctx->ref_comp);
		if (!ret)
			break;
		ret = io_run_task_work_sig();
	} while (ret >= 0);
	mutex_lock(&ctx->uring_lock);

	if (ret)
		io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
	return ret;
}
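/*
 * Dispatch a single io_uring_register(2) opcode. The caller holds
 * ctx->uring_lock; opcodes that rework registered resources quiesce the
 * ring first, and restricted rings only accept the opcodes that were
 * allowed when the restrictions were registered.
 */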
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We're inside the ring mutex, if the ref is already dying, then
	 * someone else killed the ctx or is already going through
	 * io_uring_register().
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return -ENXIO;

	if (ctx->restricted) {
		if (opcode >= IORING_REGISTER_LAST)
			return -EINVAL;
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	if (io_register_op_must_quiesce(opcode)) {
		ret = io_ctx_quiesce(ctx);
		if (ret)
			return ret;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg);
		if (ret)
			break;
		if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
			ctx->eventfd_async = 1;
		else
			ctx->eventfd_async = 0;
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (io_register_op_must_quiesce(opcode)) {
		/* bring the ctx back to life */
		percpu_ref_reinit(&ctx->refs);
		reinit_completion(&ctx->ref_comp);
	}
	return ret;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct fd f;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	ret = -EOPNOTSUPP;
	if (f.file->f_op != &io_uring_fops)
		goto out_fput;

	ctx = f.file->private_data;

	io_run_task_work();

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files,
				ctx->nr_user_bufs, ctx->cq_ev_fd != NULL, ret);
out_fput:
	fdput(f);
	return ret;
}

static int __init io_uring_init(void)
{
#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
} while (0)

#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
	BUILD_BUG_SQE_ELEM(0, __u8, opcode);
	BUILD_BUG_SQE_ELEM(1, __u8, flags);
	BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
	BUILD_BUG_SQE_ELEM(4, __s32, fd);
	BUILD_BUG_SQE_ELEM(8, __u64, off);
	BUILD_BUG_SQE_ELEM(8, __u64, addr2);
	BUILD_BUG_SQE_ELEM(16, __u64, addr);
	BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
	BUILD_BUG_SQE_ELEM(24, __u32, len);
	BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
	BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
	BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
	BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
	BUILD_BUG_SQE_ELEM(32, __u64, user_data);
	BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
	BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
	BUILD_BUG_SQE_ELEM(42, __u16, personality);
	BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
	BUILD_BUG_SQE_ELEM(44, __u32, file_index);

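	/* the legacy files_update layout must stay compatible with rsrc_update */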
	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
		     sizeof(struct io_uring_rsrc_update));
	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
		     sizeof(struct io_uring_rsrc_update2));

	/* ->buf_index is u16 */
	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	/* should fit into one byte */
	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));

	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));

	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				SLAB_ACCOUNT);
	return 0;
}
__initcall(io_uring_init);