at v5.15-rc6 10983 lines 276 kB view raw
1// SPDX-License-Identifier: GPL-2.0 2/* 3 * Shared application/kernel submission and completion ring pairs, for 4 * supporting fast/efficient IO. 5 * 6 * A note on the read/write ordering memory barriers that are matched between 7 * the application and kernel side. 8 * 9 * After the application reads the CQ ring tail, it must use an 10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses 11 * before writing the tail (using smp_load_acquire to read the tail will 12 * do). It also needs a smp_mb() before updating CQ head (ordering the 13 * entry load(s) with the head store), pairing with an implicit barrier 14 * through a control-dependency in io_get_cqe (smp_store_release to 15 * store head will do). Failure to do so could lead to reading invalid 16 * CQ entries. 17 * 18 * Likewise, the application must use an appropriate smp_wmb() before 19 * writing the SQ tail (ordering SQ entry stores with the tail store), 20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release 21 * to store the tail will do). And it needs a barrier ordering the SQ 22 * head load before writing new SQ entries (smp_load_acquire to read 23 * head will do). 24 * 25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application 26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* 27 * updating the SQ tail; a full memory barrier smp_mb() is needed 28 * between. 29 * 30 * Also see the examples in the liburing library: 31 * 32 * git://git.kernel.dk/liburing 33 * 34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens 35 * from data shared between the kernel and application. This is done both 36 * for ordering purposes, but also to ensure that once a value is loaded from 37 * data that the application could potentially modify, it remains stable. 
38 * 39 * Copyright (C) 2018-2019 Jens Axboe 40 * Copyright (c) 2018-2019 Christoph Hellwig 41 */ 42#include <linux/kernel.h> 43#include <linux/init.h> 44#include <linux/errno.h> 45#include <linux/syscalls.h> 46#include <linux/compat.h> 47#include <net/compat.h> 48#include <linux/refcount.h> 49#include <linux/uio.h> 50#include <linux/bits.h> 51 52#include <linux/sched/signal.h> 53#include <linux/fs.h> 54#include <linux/file.h> 55#include <linux/fdtable.h> 56#include <linux/mm.h> 57#include <linux/mman.h> 58#include <linux/percpu.h> 59#include <linux/slab.h> 60#include <linux/blkdev.h> 61#include <linux/bvec.h> 62#include <linux/net.h> 63#include <net/sock.h> 64#include <net/af_unix.h> 65#include <net/scm.h> 66#include <linux/anon_inodes.h> 67#include <linux/sched/mm.h> 68#include <linux/uaccess.h> 69#include <linux/nospec.h> 70#include <linux/sizes.h> 71#include <linux/hugetlb.h> 72#include <linux/highmem.h> 73#include <linux/namei.h> 74#include <linux/fsnotify.h> 75#include <linux/fadvise.h> 76#include <linux/eventpoll.h> 77#include <linux/splice.h> 78#include <linux/task_work.h> 79#include <linux/pagemap.h> 80#include <linux/io_uring.h> 81#include <linux/tracehook.h> 82 83#define CREATE_TRACE_POINTS 84#include <trace/events/io_uring.h> 85 86#include <uapi/linux/io_uring.h> 87 88#include "internal.h" 89#include "io-wq.h" 90 91#define IORING_MAX_ENTRIES 32768 92#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) 93#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 94 95/* only define max */ 96#define IORING_MAX_FIXED_FILES (1U << 15) 97#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ 98 IORING_REGISTER_LAST + IORING_OP_LAST) 99 100#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) 101#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) 102#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) 103 104#define IORING_MAX_REG_BUFFERS (1U << 14) 105 106#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ 107 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ 108 IOSQE_BUFFER_SELECT) 109#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ 110 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS) 111 112#define IO_TCTX_REFS_CACHE_NR (1U << 10) 113 114struct io_uring { 115 u32 head ____cacheline_aligned_in_smp; 116 u32 tail ____cacheline_aligned_in_smp; 117}; 118 119/* 120 * This data is shared with the application through the mmap at offsets 121 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. 122 * 123 * The offsets to the member fields are published through struct 124 * io_sqring_offsets when calling io_uring_setup. 125 */ 126struct io_rings { 127 /* 128 * Head and tail offsets into the ring; the offsets need to be 129 * masked to get valid indices. 130 * 131 * The kernel controls head of the sq ring and the tail of the cq ring, 132 * and the application controls tail of the sq ring and the head of the 133 * cq ring. 134 */ 135 struct io_uring sq, cq; 136 /* 137 * Bitmasks to apply to head and tail offsets (constant, equals 138 * ring_entries - 1) 139 */ 140 u32 sq_ring_mask, cq_ring_mask; 141 /* Ring sizes (constant, power of 2) */ 142 u32 sq_ring_entries, cq_ring_entries; 143 /* 144 * Number of invalid entries dropped by the kernel due to 145 * invalid index stored in array 146 * 147 * Written by the kernel, shouldn't be modified by the 148 * application (i.e. get number of "new events" by comparing to 149 * cached value). 
150 * 151 * After a new SQ head value was read by the application this 152 * counter includes all submissions that were dropped reaching 153 * the new SQ head (and possibly more). 154 */ 155 u32 sq_dropped; 156 /* 157 * Runtime SQ flags 158 * 159 * Written by the kernel, shouldn't be modified by the 160 * application. 161 * 162 * The application needs a full memory barrier before checking 163 * for IORING_SQ_NEED_WAKEUP after updating the sq tail. 164 */ 165 u32 sq_flags; 166 /* 167 * Runtime CQ flags 168 * 169 * Written by the application, shouldn't be modified by the 170 * kernel. 171 */ 172 u32 cq_flags; 173 /* 174 * Number of completion events lost because the queue was full; 175 * this should be avoided by the application by making sure 176 * there are not more requests pending than there is space in 177 * the completion queue. 178 * 179 * Written by the kernel, shouldn't be modified by the 180 * application (i.e. get number of "new events" by comparing to 181 * cached value). 182 * 183 * As completion events come in out of order this counter is not 184 * ordered with any other data. 185 */ 186 u32 cq_overflow; 187 /* 188 * Ring buffer of completion events. 189 * 190 * The kernel writes completion events fresh every time they are 191 * produced, so the application is allowed to modify pending 192 * entries. 193 */ 194 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; 195}; 196 197enum io_uring_cmd_flags { 198 IO_URING_F_NONBLOCK = 1, 199 IO_URING_F_COMPLETE_DEFER = 2, 200}; 201 202struct io_mapped_ubuf { 203 u64 ubuf; 204 u64 ubuf_end; 205 unsigned int nr_bvecs; 206 unsigned long acct_pages; 207 struct bio_vec bvec[]; 208}; 209 210struct io_ring_ctx; 211 212struct io_overflow_cqe { 213 struct io_uring_cqe cqe; 214 struct list_head list; 215}; 216 217struct io_fixed_file { 218 /* file * with additional FFS_* flags */ 219 unsigned long file_ptr; 220}; 221 222struct io_rsrc_put { 223 struct list_head list; 224 u64 tag; 225 union { 226 void *rsrc; 227 struct file *file; 228 struct io_mapped_ubuf *buf; 229 }; 230}; 231 232struct io_file_table { 233 struct io_fixed_file *files; 234}; 235 236struct io_rsrc_node { 237 struct percpu_ref refs; 238 struct list_head node; 239 struct list_head rsrc_list; 240 struct io_rsrc_data *rsrc_data; 241 struct llist_node llist; 242 bool done; 243}; 244 245typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); 246 247struct io_rsrc_data { 248 struct io_ring_ctx *ctx; 249 250 u64 **tags; 251 unsigned int nr; 252 rsrc_put_fn *do_put; 253 atomic_t refs; 254 struct completion done; 255 bool quiesce; 256}; 257 258struct io_buffer { 259 struct list_head list; 260 __u64 addr; 261 __u32 len; 262 __u16 bid; 263}; 264 265struct io_restriction { 266 DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); 267 DECLARE_BITMAP(sqe_op, IORING_OP_LAST); 268 u8 sqe_flags_allowed; 269 u8 sqe_flags_required; 270 bool registered; 271}; 272 273enum { 274 IO_SQ_THREAD_SHOULD_STOP = 0, 275 IO_SQ_THREAD_SHOULD_PARK, 276}; 277 278struct io_sq_data { 279 refcount_t refs; 280 atomic_t park_pending; 281 struct mutex lock; 282 283 /* ctx's that are using this sqd */ 284 struct list_head ctx_list; 285 286 struct task_struct *thread; 287 struct wait_queue_head wait; 288 289 unsigned sq_thread_idle; 290 int sq_cpu; 291 pid_t task_pid; 292 pid_t task_tgid; 293 294 unsigned long state; 295 struct completion exited; 296}; 297 298#define IO_COMPL_BATCH 32 299#define IO_REQ_CACHE_SIZE 32 300#define IO_REQ_ALLOC_BATCH 8 301 302struct io_submit_link { 303 struct io_kiocb 
*head; 304 struct io_kiocb *last; 305}; 306 307struct io_submit_state { 308 struct blk_plug plug; 309 struct io_submit_link link; 310 311 /* 312 * io_kiocb alloc cache 313 */ 314 void *reqs[IO_REQ_CACHE_SIZE]; 315 unsigned int free_reqs; 316 317 bool plug_started; 318 319 /* 320 * Batch completion logic 321 */ 322 struct io_kiocb *compl_reqs[IO_COMPL_BATCH]; 323 unsigned int compl_nr; 324 /* inline/task_work completion list, under ->uring_lock */ 325 struct list_head free_list; 326 327 unsigned int ios_left; 328}; 329 330struct io_ring_ctx { 331 /* const or read-mostly hot data */ 332 struct { 333 struct percpu_ref refs; 334 335 struct io_rings *rings; 336 unsigned int flags; 337 unsigned int compat: 1; 338 unsigned int drain_next: 1; 339 unsigned int eventfd_async: 1; 340 unsigned int restricted: 1; 341 unsigned int off_timeout_used: 1; 342 unsigned int drain_active: 1; 343 } ____cacheline_aligned_in_smp; 344 345 /* submission data */ 346 struct { 347 struct mutex uring_lock; 348 349 /* 350 * Ring buffer of indices into array of io_uring_sqe, which is 351 * mmapped by the application using the IORING_OFF_SQES offset. 352 * 353 * This indirection could e.g. be used to assign fixed 354 * io_uring_sqe entries to operations and only submit them to 355 * the queue when needed. 356 * 357 * The kernel modifies neither the indices array nor the entries 358 * array. 359 */ 360 u32 *sq_array; 361 struct io_uring_sqe *sq_sqes; 362 unsigned cached_sq_head; 363 unsigned sq_entries; 364 struct list_head defer_list; 365 366 /* 367 * Fixed resources fast path, should be accessed only under 368 * uring_lock, and updated through io_uring_register(2) 369 */ 370 struct io_rsrc_node *rsrc_node; 371 struct io_file_table file_table; 372 unsigned nr_user_files; 373 unsigned nr_user_bufs; 374 struct io_mapped_ubuf **user_bufs; 375 376 struct io_submit_state submit_state; 377 struct list_head timeout_list; 378 struct list_head ltimeout_list; 379 struct list_head cq_overflow_list; 380 struct xarray io_buffers; 381 struct xarray personalities; 382 u32 pers_next; 383 unsigned sq_thread_idle; 384 } ____cacheline_aligned_in_smp; 385 386 /* IRQ completion list, under ->completion_lock */ 387 struct list_head locked_free_list; 388 unsigned int locked_free_nr; 389 390 const struct cred *sq_creds; /* cred used for __io_sq_thread() */ 391 struct io_sq_data *sq_data; /* if using sq thread polling */ 392 393 struct wait_queue_head sqo_sq_wait; 394 struct list_head sqd_list; 395 396 unsigned long check_cq_overflow; 397 398 struct { 399 unsigned cached_cq_tail; 400 unsigned cq_entries; 401 struct eventfd_ctx *cq_ev_fd; 402 struct wait_queue_head poll_wait; 403 struct wait_queue_head cq_wait; 404 unsigned cq_extra; 405 atomic_t cq_timeouts; 406 unsigned cq_last_tm_flush; 407 } ____cacheline_aligned_in_smp; 408 409 struct { 410 spinlock_t completion_lock; 411 412 spinlock_t timeout_lock; 413 414 /* 415 * ->iopoll_list is protected by the ctx->uring_lock for 416 * io_uring instances that don't use IORING_SETUP_SQPOLL. 417 * For SQPOLL, only the single threaded io_sq_thread() will 418 * manipulate the list, hence no extra locking is needed there. 
419 */ 420 struct list_head iopoll_list; 421 struct hlist_head *cancel_hash; 422 unsigned cancel_hash_bits; 423 bool poll_multi_queue; 424 } ____cacheline_aligned_in_smp; 425 426 struct io_restriction restrictions; 427 428 /* slow path rsrc auxilary data, used by update/register */ 429 struct { 430 struct io_rsrc_node *rsrc_backup_node; 431 struct io_mapped_ubuf *dummy_ubuf; 432 struct io_rsrc_data *file_data; 433 struct io_rsrc_data *buf_data; 434 435 struct delayed_work rsrc_put_work; 436 struct llist_head rsrc_put_llist; 437 struct list_head rsrc_ref_list; 438 spinlock_t rsrc_ref_lock; 439 }; 440 441 /* Keep this last, we don't need it for the fast path */ 442 struct { 443 #if defined(CONFIG_UNIX) 444 struct socket *ring_sock; 445 #endif 446 /* hashed buffered write serialization */ 447 struct io_wq_hash *hash_map; 448 449 /* Only used for accounting purposes */ 450 struct user_struct *user; 451 struct mm_struct *mm_account; 452 453 /* ctx exit and cancelation */ 454 struct llist_head fallback_llist; 455 struct delayed_work fallback_work; 456 struct work_struct exit_work; 457 struct list_head tctx_list; 458 struct completion ref_comp; 459 }; 460}; 461 462struct io_uring_task { 463 /* submission side */ 464 int cached_refs; 465 struct xarray xa; 466 struct wait_queue_head wait; 467 const struct io_ring_ctx *last; 468 struct io_wq *io_wq; 469 struct percpu_counter inflight; 470 atomic_t inflight_tracked; 471 atomic_t in_idle; 472 473 spinlock_t task_lock; 474 struct io_wq_work_list task_list; 475 struct callback_head task_work; 476 bool task_running; 477}; 478 479/* 480 * First field must be the file pointer in all the 481 * iocb unions! See also 'struct kiocb' in <linux/fs.h> 482 */ 483struct io_poll_iocb { 484 struct file *file; 485 struct wait_queue_head *head; 486 __poll_t events; 487 bool done; 488 bool canceled; 489 struct wait_queue_entry wait; 490}; 491 492struct io_poll_update { 493 struct file *file; 494 u64 old_user_data; 495 u64 new_user_data; 496 __poll_t events; 497 bool update_events; 498 bool update_user_data; 499}; 500 501struct io_close { 502 struct file *file; 503 int fd; 504 u32 file_slot; 505}; 506 507struct io_timeout_data { 508 struct io_kiocb *req; 509 struct hrtimer timer; 510 struct timespec64 ts; 511 enum hrtimer_mode mode; 512 u32 flags; 513}; 514 515struct io_accept { 516 struct file *file; 517 struct sockaddr __user *addr; 518 int __user *addr_len; 519 int flags; 520 u32 file_slot; 521 unsigned long nofile; 522}; 523 524struct io_sync { 525 struct file *file; 526 loff_t len; 527 loff_t off; 528 int flags; 529 int mode; 530}; 531 532struct io_cancel { 533 struct file *file; 534 u64 addr; 535}; 536 537struct io_timeout { 538 struct file *file; 539 u32 off; 540 u32 target_seq; 541 struct list_head list; 542 /* head of the link, used by linked timeouts only */ 543 struct io_kiocb *head; 544 /* for linked completions */ 545 struct io_kiocb *prev; 546}; 547 548struct io_timeout_rem { 549 struct file *file; 550 u64 addr; 551 552 /* timeout update */ 553 struct timespec64 ts; 554 u32 flags; 555 bool ltimeout; 556}; 557 558struct io_rw { 559 /* NOTE: kiocb has the file as the first member, so don't do it here */ 560 struct kiocb kiocb; 561 u64 addr; 562 u64 len; 563}; 564 565struct io_connect { 566 struct file *file; 567 struct sockaddr __user *addr; 568 int addr_len; 569}; 570 571struct io_sr_msg { 572 struct file *file; 573 union { 574 struct compat_msghdr __user *umsg_compat; 575 struct user_msghdr __user *umsg; 576 void __user *buf; 577 }; 578 int msg_flags; 579 
int bgid; 580 size_t len; 581 struct io_buffer *kbuf; 582}; 583 584struct io_open { 585 struct file *file; 586 int dfd; 587 u32 file_slot; 588 struct filename *filename; 589 struct open_how how; 590 unsigned long nofile; 591}; 592 593struct io_rsrc_update { 594 struct file *file; 595 u64 arg; 596 u32 nr_args; 597 u32 offset; 598}; 599 600struct io_fadvise { 601 struct file *file; 602 u64 offset; 603 u32 len; 604 u32 advice; 605}; 606 607struct io_madvise { 608 struct file *file; 609 u64 addr; 610 u32 len; 611 u32 advice; 612}; 613 614struct io_epoll { 615 struct file *file; 616 int epfd; 617 int op; 618 int fd; 619 struct epoll_event event; 620}; 621 622struct io_splice { 623 struct file *file_out; 624 struct file *file_in; 625 loff_t off_out; 626 loff_t off_in; 627 u64 len; 628 unsigned int flags; 629}; 630 631struct io_provide_buf { 632 struct file *file; 633 __u64 addr; 634 __u32 len; 635 __u32 bgid; 636 __u16 nbufs; 637 __u16 bid; 638}; 639 640struct io_statx { 641 struct file *file; 642 int dfd; 643 unsigned int mask; 644 unsigned int flags; 645 const char __user *filename; 646 struct statx __user *buffer; 647}; 648 649struct io_shutdown { 650 struct file *file; 651 int how; 652}; 653 654struct io_rename { 655 struct file *file; 656 int old_dfd; 657 int new_dfd; 658 struct filename *oldpath; 659 struct filename *newpath; 660 int flags; 661}; 662 663struct io_unlink { 664 struct file *file; 665 int dfd; 666 int flags; 667 struct filename *filename; 668}; 669 670struct io_mkdir { 671 struct file *file; 672 int dfd; 673 umode_t mode; 674 struct filename *filename; 675}; 676 677struct io_symlink { 678 struct file *file; 679 int new_dfd; 680 struct filename *oldpath; 681 struct filename *newpath; 682}; 683 684struct io_hardlink { 685 struct file *file; 686 int old_dfd; 687 int new_dfd; 688 struct filename *oldpath; 689 struct filename *newpath; 690 int flags; 691}; 692 693struct io_completion { 694 struct file *file; 695 u32 cflags; 696}; 697 698struct io_async_connect { 699 struct sockaddr_storage address; 700}; 701 702struct io_async_msghdr { 703 struct iovec fast_iov[UIO_FASTIOV]; 704 /* points to an allocated iov, if NULL we use fast_iov instead */ 705 struct iovec *free_iov; 706 struct sockaddr __user *uaddr; 707 struct msghdr msg; 708 struct sockaddr_storage addr; 709}; 710 711struct io_async_rw { 712 struct iovec fast_iov[UIO_FASTIOV]; 713 const struct iovec *free_iovec; 714 struct iov_iter iter; 715 struct iov_iter_state iter_state; 716 size_t bytes_done; 717 struct wait_page_queue wpq; 718}; 719 720enum { 721 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, 722 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, 723 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, 724 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, 725 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, 726 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, 727 728 /* first byte is taken by user flags, shift it to not overlap */ 729 REQ_F_FAIL_BIT = 8, 730 REQ_F_INFLIGHT_BIT, 731 REQ_F_CUR_POS_BIT, 732 REQ_F_NOWAIT_BIT, 733 REQ_F_LINK_TIMEOUT_BIT, 734 REQ_F_NEED_CLEANUP_BIT, 735 REQ_F_POLLED_BIT, 736 REQ_F_BUFFER_SELECTED_BIT, 737 REQ_F_COMPLETE_INLINE_BIT, 738 REQ_F_REISSUE_BIT, 739 REQ_F_CREDS_BIT, 740 REQ_F_REFCOUNT_BIT, 741 REQ_F_ARM_LTIMEOUT_BIT, 742 /* keep async read/write and isreg together and in order */ 743 REQ_F_NOWAIT_READ_BIT, 744 REQ_F_NOWAIT_WRITE_BIT, 745 REQ_F_ISREG_BIT, 746 747 /* not a real bit, just to check we're not overflowing the space */ 748 __REQ_F_LAST_BIT, 749}; 750 751enum { 752 /* ctx owns file */ 753 REQ_F_FIXED_FILE = 
BIT(REQ_F_FIXED_FILE_BIT), 754 /* drain existing IO first */ 755 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), 756 /* linked sqes */ 757 REQ_F_LINK = BIT(REQ_F_LINK_BIT), 758 /* doesn't sever on completion < 0 */ 759 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), 760 /* IOSQE_ASYNC */ 761 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), 762 /* IOSQE_BUFFER_SELECT */ 763 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), 764 765 /* fail rest of links */ 766 REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), 767 /* on inflight list, should be cancelled and waited on exit reliably */ 768 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), 769 /* read/write uses file position */ 770 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), 771 /* must not punt to workers */ 772 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), 773 /* has or had linked timeout */ 774 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), 775 /* needs cleanup */ 776 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), 777 /* already went through poll handler */ 778 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), 779 /* buffer already selected */ 780 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), 781 /* completion is deferred through io_comp_state */ 782 REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), 783 /* caller should reissue async */ 784 REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), 785 /* supports async reads */ 786 REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT), 787 /* supports async writes */ 788 REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT), 789 /* regular file */ 790 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), 791 /* has creds assigned */ 792 REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), 793 /* skip refcounting if not set */ 794 REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), 795 /* there is a linked timeout that has to be armed */ 796 REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), 797}; 798 799struct async_poll { 800 struct io_poll_iocb poll; 801 struct io_poll_iocb *double_poll; 802}; 803 804typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); 805 806struct io_task_work { 807 union { 808 struct io_wq_work_node node; 809 struct llist_node fallback_node; 810 }; 811 io_req_tw_func_t func; 812}; 813 814enum { 815 IORING_RSRC_FILE = 0, 816 IORING_RSRC_BUFFER = 1, 817}; 818 819/* 820 * NOTE! Each of the iocb union members has the file pointer 821 * as the first entry in their struct definition. So you can 822 * access the file pointer through any of the sub-structs, 823 * or directly as just 'ki_filp' in this struct. 
824 */ 825struct io_kiocb { 826 union { 827 struct file *file; 828 struct io_rw rw; 829 struct io_poll_iocb poll; 830 struct io_poll_update poll_update; 831 struct io_accept accept; 832 struct io_sync sync; 833 struct io_cancel cancel; 834 struct io_timeout timeout; 835 struct io_timeout_rem timeout_rem; 836 struct io_connect connect; 837 struct io_sr_msg sr_msg; 838 struct io_open open; 839 struct io_close close; 840 struct io_rsrc_update rsrc_update; 841 struct io_fadvise fadvise; 842 struct io_madvise madvise; 843 struct io_epoll epoll; 844 struct io_splice splice; 845 struct io_provide_buf pbuf; 846 struct io_statx statx; 847 struct io_shutdown shutdown; 848 struct io_rename rename; 849 struct io_unlink unlink; 850 struct io_mkdir mkdir; 851 struct io_symlink symlink; 852 struct io_hardlink hardlink; 853 /* use only after cleaning per-op data, see io_clean_op() */ 854 struct io_completion compl; 855 }; 856 857 /* opcode allocated if it needs to store data for async defer */ 858 void *async_data; 859 u8 opcode; 860 /* polled IO has completed */ 861 u8 iopoll_completed; 862 863 u16 buf_index; 864 u32 result; 865 866 struct io_ring_ctx *ctx; 867 unsigned int flags; 868 atomic_t refs; 869 struct task_struct *task; 870 u64 user_data; 871 872 struct io_kiocb *link; 873 struct percpu_ref *fixed_rsrc_refs; 874 875 /* used with ctx->iopoll_list with reads/writes */ 876 struct list_head inflight_entry; 877 struct io_task_work io_task_work; 878 /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ 879 struct hlist_node hash_node; 880 struct async_poll *apoll; 881 struct io_wq_work work; 882 const struct cred *creds; 883 884 /* store used ubuf, so we can prevent reloading */ 885 struct io_mapped_ubuf *imu; 886}; 887 888struct io_tctx_node { 889 struct list_head ctx_node; 890 struct task_struct *task; 891 struct io_ring_ctx *ctx; 892}; 893 894struct io_defer_entry { 895 struct list_head list; 896 struct io_kiocb *req; 897 u32 seq; 898}; 899 900struct io_op_def { 901 /* needs req->file assigned */ 902 unsigned needs_file : 1; 903 /* hash wq insertion if file is a regular file */ 904 unsigned hash_reg_file : 1; 905 /* unbound wq insertion if file is a non-regular file */ 906 unsigned unbound_nonreg_file : 1; 907 /* opcode is not supported by this kernel */ 908 unsigned not_supported : 1; 909 /* set if opcode supports polled "wait" */ 910 unsigned pollin : 1; 911 unsigned pollout : 1; 912 /* op supports buffer selection */ 913 unsigned buffer_select : 1; 914 /* do prep async if is going to be punted */ 915 unsigned needs_async_setup : 1; 916 /* should block plug */ 917 unsigned plug : 1; 918 /* size of async data needed, if any */ 919 unsigned short async_size; 920}; 921 922static const struct io_op_def io_op_defs[] = { 923 [IORING_OP_NOP] = {}, 924 [IORING_OP_READV] = { 925 .needs_file = 1, 926 .unbound_nonreg_file = 1, 927 .pollin = 1, 928 .buffer_select = 1, 929 .needs_async_setup = 1, 930 .plug = 1, 931 .async_size = sizeof(struct io_async_rw), 932 }, 933 [IORING_OP_WRITEV] = { 934 .needs_file = 1, 935 .hash_reg_file = 1, 936 .unbound_nonreg_file = 1, 937 .pollout = 1, 938 .needs_async_setup = 1, 939 .plug = 1, 940 .async_size = sizeof(struct io_async_rw), 941 }, 942 [IORING_OP_FSYNC] = { 943 .needs_file = 1, 944 }, 945 [IORING_OP_READ_FIXED] = { 946 .needs_file = 1, 947 .unbound_nonreg_file = 1, 948 .pollin = 1, 949 .plug = 1, 950 .async_size = sizeof(struct io_async_rw), 951 }, 952 [IORING_OP_WRITE_FIXED] = { 953 .needs_file = 1, 954 .hash_reg_file = 1, 955 .unbound_nonreg_file 
= 1, 956 .pollout = 1, 957 .plug = 1, 958 .async_size = sizeof(struct io_async_rw), 959 }, 960 [IORING_OP_POLL_ADD] = { 961 .needs_file = 1, 962 .unbound_nonreg_file = 1, 963 }, 964 [IORING_OP_POLL_REMOVE] = {}, 965 [IORING_OP_SYNC_FILE_RANGE] = { 966 .needs_file = 1, 967 }, 968 [IORING_OP_SENDMSG] = { 969 .needs_file = 1, 970 .unbound_nonreg_file = 1, 971 .pollout = 1, 972 .needs_async_setup = 1, 973 .async_size = sizeof(struct io_async_msghdr), 974 }, 975 [IORING_OP_RECVMSG] = { 976 .needs_file = 1, 977 .unbound_nonreg_file = 1, 978 .pollin = 1, 979 .buffer_select = 1, 980 .needs_async_setup = 1, 981 .async_size = sizeof(struct io_async_msghdr), 982 }, 983 [IORING_OP_TIMEOUT] = { 984 .async_size = sizeof(struct io_timeout_data), 985 }, 986 [IORING_OP_TIMEOUT_REMOVE] = { 987 /* used by timeout updates' prep() */ 988 }, 989 [IORING_OP_ACCEPT] = { 990 .needs_file = 1, 991 .unbound_nonreg_file = 1, 992 .pollin = 1, 993 }, 994 [IORING_OP_ASYNC_CANCEL] = {}, 995 [IORING_OP_LINK_TIMEOUT] = { 996 .async_size = sizeof(struct io_timeout_data), 997 }, 998 [IORING_OP_CONNECT] = { 999 .needs_file = 1, 1000 .unbound_nonreg_file = 1, 1001 .pollout = 1, 1002 .needs_async_setup = 1, 1003 .async_size = sizeof(struct io_async_connect), 1004 }, 1005 [IORING_OP_FALLOCATE] = { 1006 .needs_file = 1, 1007 }, 1008 [IORING_OP_OPENAT] = {}, 1009 [IORING_OP_CLOSE] = {}, 1010 [IORING_OP_FILES_UPDATE] = {}, 1011 [IORING_OP_STATX] = {}, 1012 [IORING_OP_READ] = { 1013 .needs_file = 1, 1014 .unbound_nonreg_file = 1, 1015 .pollin = 1, 1016 .buffer_select = 1, 1017 .plug = 1, 1018 .async_size = sizeof(struct io_async_rw), 1019 }, 1020 [IORING_OP_WRITE] = { 1021 .needs_file = 1, 1022 .hash_reg_file = 1, 1023 .unbound_nonreg_file = 1, 1024 .pollout = 1, 1025 .plug = 1, 1026 .async_size = sizeof(struct io_async_rw), 1027 }, 1028 [IORING_OP_FADVISE] = { 1029 .needs_file = 1, 1030 }, 1031 [IORING_OP_MADVISE] = {}, 1032 [IORING_OP_SEND] = { 1033 .needs_file = 1, 1034 .unbound_nonreg_file = 1, 1035 .pollout = 1, 1036 }, 1037 [IORING_OP_RECV] = { 1038 .needs_file = 1, 1039 .unbound_nonreg_file = 1, 1040 .pollin = 1, 1041 .buffer_select = 1, 1042 }, 1043 [IORING_OP_OPENAT2] = { 1044 }, 1045 [IORING_OP_EPOLL_CTL] = { 1046 .unbound_nonreg_file = 1, 1047 }, 1048 [IORING_OP_SPLICE] = { 1049 .needs_file = 1, 1050 .hash_reg_file = 1, 1051 .unbound_nonreg_file = 1, 1052 }, 1053 [IORING_OP_PROVIDE_BUFFERS] = {}, 1054 [IORING_OP_REMOVE_BUFFERS] = {}, 1055 [IORING_OP_TEE] = { 1056 .needs_file = 1, 1057 .hash_reg_file = 1, 1058 .unbound_nonreg_file = 1, 1059 }, 1060 [IORING_OP_SHUTDOWN] = { 1061 .needs_file = 1, 1062 }, 1063 [IORING_OP_RENAMEAT] = {}, 1064 [IORING_OP_UNLINKAT] = {}, 1065 [IORING_OP_MKDIRAT] = {}, 1066 [IORING_OP_SYMLINKAT] = {}, 1067 [IORING_OP_LINKAT] = {}, 1068}; 1069 1070/* requests with any of those set should undergo io_disarm_next() */ 1071#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 1072 1073static bool io_disarm_next(struct io_kiocb *req); 1074static void io_uring_del_tctx_node(unsigned long index); 1075static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 1076 struct task_struct *task, 1077 bool cancel_all); 1078static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 1079 1080static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, 1081 long res, unsigned int cflags); 1082static void io_put_req(struct io_kiocb *req); 1083static void io_put_req_deferred(struct io_kiocb *req); 1084static void io_dismantle_req(struct io_kiocb *req); 
1085static void io_queue_linked_timeout(struct io_kiocb *req); 1086static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, 1087 struct io_uring_rsrc_update2 *up, 1088 unsigned nr_args); 1089static void io_clean_op(struct io_kiocb *req); 1090static struct file *io_file_get(struct io_ring_ctx *ctx, 1091 struct io_kiocb *req, int fd, bool fixed); 1092static void __io_queue_sqe(struct io_kiocb *req); 1093static void io_rsrc_put_work(struct work_struct *work); 1094 1095static void io_req_task_queue(struct io_kiocb *req); 1096static void io_submit_flush_completions(struct io_ring_ctx *ctx); 1097static int io_req_prep_async(struct io_kiocb *req); 1098 1099static int io_install_fixed_file(struct io_kiocb *req, struct file *file, 1100 unsigned int issue_flags, u32 slot_index); 1101static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); 1102 1103static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); 1104 1105static struct kmem_cache *req_cachep; 1106 1107static const struct file_operations io_uring_fops; 1108 1109struct sock *io_uring_get_socket(struct file *file) 1110{ 1111#if defined(CONFIG_UNIX) 1112 if (file->f_op == &io_uring_fops) { 1113 struct io_ring_ctx *ctx = file->private_data; 1114 1115 return ctx->ring_sock->sk; 1116 } 1117#endif 1118 return NULL; 1119} 1120EXPORT_SYMBOL(io_uring_get_socket); 1121 1122static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) 1123{ 1124 if (!*locked) { 1125 mutex_lock(&ctx->uring_lock); 1126 *locked = true; 1127 } 1128} 1129 1130#define io_for_each_link(pos, head) \ 1131 for (pos = (head); pos; pos = pos->link) 1132 1133/* 1134 * Shamelessly stolen from the mm implementation of page reference checking, 1135 * see commit f958d7b528b1 for details. 1136 */ 1137#define req_ref_zero_or_close_to_overflow(req) \ 1138 ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) 1139 1140static inline bool req_ref_inc_not_zero(struct io_kiocb *req) 1141{ 1142 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 1143 return atomic_inc_not_zero(&req->refs); 1144} 1145 1146static inline bool req_ref_put_and_test(struct io_kiocb *req) 1147{ 1148 if (likely(!(req->flags & REQ_F_REFCOUNT))) 1149 return true; 1150 1151 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 1152 return atomic_dec_and_test(&req->refs); 1153} 1154 1155static inline void req_ref_put(struct io_kiocb *req) 1156{ 1157 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 1158 WARN_ON_ONCE(req_ref_put_and_test(req)); 1159} 1160 1161static inline void req_ref_get(struct io_kiocb *req) 1162{ 1163 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 1164 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 1165 atomic_inc(&req->refs); 1166} 1167 1168static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) 1169{ 1170 if (!(req->flags & REQ_F_REFCOUNT)) { 1171 req->flags |= REQ_F_REFCOUNT; 1172 atomic_set(&req->refs, nr); 1173 } 1174} 1175 1176static inline void io_req_set_refcount(struct io_kiocb *req) 1177{ 1178 __io_req_set_refcount(req, 1); 1179} 1180 1181static inline void io_req_set_rsrc_node(struct io_kiocb *req) 1182{ 1183 struct io_ring_ctx *ctx = req->ctx; 1184 1185 if (!req->fixed_rsrc_refs) { 1186 req->fixed_rsrc_refs = &ctx->rsrc_node->refs; 1187 percpu_ref_get(req->fixed_rsrc_refs); 1188 } 1189} 1190 1191static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) 1192{ 1193 bool got = percpu_ref_tryget(ref); 1194 1195 /* already at zero, wait for ->release() */ 1196 if (!got) 1197 wait_for_completion(compl); 
1198 percpu_ref_resurrect(ref); 1199 if (got) 1200 percpu_ref_put(ref); 1201} 1202 1203static bool io_match_task(struct io_kiocb *head, struct task_struct *task, 1204 bool cancel_all) 1205{ 1206 struct io_kiocb *req; 1207 1208 if (task && head->task != task) 1209 return false; 1210 if (cancel_all) 1211 return true; 1212 1213 io_for_each_link(req, head) { 1214 if (req->flags & REQ_F_INFLIGHT) 1215 return true; 1216 } 1217 return false; 1218} 1219 1220static inline void req_set_fail(struct io_kiocb *req) 1221{ 1222 req->flags |= REQ_F_FAIL; 1223} 1224 1225static inline void req_fail_link_node(struct io_kiocb *req, int res) 1226{ 1227 req_set_fail(req); 1228 req->result = res; 1229} 1230 1231static void io_ring_ctx_ref_free(struct percpu_ref *ref) 1232{ 1233 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); 1234 1235 complete(&ctx->ref_comp); 1236} 1237 1238static inline bool io_is_timeout_noseq(struct io_kiocb *req) 1239{ 1240 return !req->timeout.off; 1241} 1242 1243static void io_fallback_req_func(struct work_struct *work) 1244{ 1245 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, 1246 fallback_work.work); 1247 struct llist_node *node = llist_del_all(&ctx->fallback_llist); 1248 struct io_kiocb *req, *tmp; 1249 bool locked = false; 1250 1251 percpu_ref_get(&ctx->refs); 1252 llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) 1253 req->io_task_work.func(req, &locked); 1254 1255 if (locked) { 1256 if (ctx->submit_state.compl_nr) 1257 io_submit_flush_completions(ctx); 1258 mutex_unlock(&ctx->uring_lock); 1259 } 1260 percpu_ref_put(&ctx->refs); 1261 1262} 1263 1264static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) 1265{ 1266 struct io_ring_ctx *ctx; 1267 int hash_bits; 1268 1269 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1270 if (!ctx) 1271 return NULL; 1272 1273 /* 1274 * Use 5 bits less than the max cq entries, that should give us around 1275 * 32 entries per hash list if totally full and uniformly spread. 
1276 */ 1277 hash_bits = ilog2(p->cq_entries); 1278 hash_bits -= 5; 1279 if (hash_bits <= 0) 1280 hash_bits = 1; 1281 ctx->cancel_hash_bits = hash_bits; 1282 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), 1283 GFP_KERNEL); 1284 if (!ctx->cancel_hash) 1285 goto err; 1286 __hash_init(ctx->cancel_hash, 1U << hash_bits); 1287 1288 ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); 1289 if (!ctx->dummy_ubuf) 1290 goto err; 1291 /* set invalid range, so io_import_fixed() fails meeting it */ 1292 ctx->dummy_ubuf->ubuf = -1UL; 1293 1294 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 1295 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 1296 goto err; 1297 1298 ctx->flags = p->flags; 1299 init_waitqueue_head(&ctx->sqo_sq_wait); 1300 INIT_LIST_HEAD(&ctx->sqd_list); 1301 init_waitqueue_head(&ctx->poll_wait); 1302 INIT_LIST_HEAD(&ctx->cq_overflow_list); 1303 init_completion(&ctx->ref_comp); 1304 xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); 1305 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); 1306 mutex_init(&ctx->uring_lock); 1307 init_waitqueue_head(&ctx->cq_wait); 1308 spin_lock_init(&ctx->completion_lock); 1309 spin_lock_init(&ctx->timeout_lock); 1310 INIT_LIST_HEAD(&ctx->iopoll_list); 1311 INIT_LIST_HEAD(&ctx->defer_list); 1312 INIT_LIST_HEAD(&ctx->timeout_list); 1313 INIT_LIST_HEAD(&ctx->ltimeout_list); 1314 spin_lock_init(&ctx->rsrc_ref_lock); 1315 INIT_LIST_HEAD(&ctx->rsrc_ref_list); 1316 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); 1317 init_llist_head(&ctx->rsrc_put_llist); 1318 INIT_LIST_HEAD(&ctx->tctx_list); 1319 INIT_LIST_HEAD(&ctx->submit_state.free_list); 1320 INIT_LIST_HEAD(&ctx->locked_free_list); 1321 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); 1322 return ctx; 1323err: 1324 kfree(ctx->dummy_ubuf); 1325 kfree(ctx->cancel_hash); 1326 kfree(ctx); 1327 return NULL; 1328} 1329 1330static void io_account_cq_overflow(struct io_ring_ctx *ctx) 1331{ 1332 struct io_rings *r = ctx->rings; 1333 1334 WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); 1335 ctx->cq_extra--; 1336} 1337 1338static bool req_need_defer(struct io_kiocb *req, u32 seq) 1339{ 1340 if (unlikely(req->flags & REQ_F_IO_DRAIN)) { 1341 struct io_ring_ctx *ctx = req->ctx; 1342 1343 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; 1344 } 1345 1346 return false; 1347} 1348 1349#define FFS_ASYNC_READ 0x1UL 1350#define FFS_ASYNC_WRITE 0x2UL 1351#ifdef CONFIG_64BIT 1352#define FFS_ISREG 0x4UL 1353#else 1354#define FFS_ISREG 0x0UL 1355#endif 1356#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) 1357 1358static inline bool io_req_ffs_set(struct io_kiocb *req) 1359{ 1360 return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE); 1361} 1362 1363static void io_req_track_inflight(struct io_kiocb *req) 1364{ 1365 if (!(req->flags & REQ_F_INFLIGHT)) { 1366 req->flags |= REQ_F_INFLIGHT; 1367 atomic_inc(&current->io_uring->inflight_tracked); 1368 } 1369} 1370 1371static inline void io_unprep_linked_timeout(struct io_kiocb *req) 1372{ 1373 req->flags &= ~REQ_F_LINK_TIMEOUT; 1374} 1375 1376static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) 1377{ 1378 if (WARN_ON_ONCE(!req->link)) 1379 return NULL; 1380 1381 req->flags &= ~REQ_F_ARM_LTIMEOUT; 1382 req->flags |= REQ_F_LINK_TIMEOUT; 1383 1384 /* linked timeouts should have two refs once prep'ed */ 1385 io_req_set_refcount(req); 1386 __io_req_set_refcount(req->link, 2); 1387 return req->link; 1388} 1389 1390static inline struct io_kiocb 
*io_prep_linked_timeout(struct io_kiocb *req) 1391{ 1392 if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) 1393 return NULL; 1394 return __io_prep_linked_timeout(req); 1395} 1396 1397static void io_prep_async_work(struct io_kiocb *req) 1398{ 1399 const struct io_op_def *def = &io_op_defs[req->opcode]; 1400 struct io_ring_ctx *ctx = req->ctx; 1401 1402 if (!(req->flags & REQ_F_CREDS)) { 1403 req->flags |= REQ_F_CREDS; 1404 req->creds = get_current_cred(); 1405 } 1406 1407 req->work.list.next = NULL; 1408 req->work.flags = 0; 1409 if (req->flags & REQ_F_FORCE_ASYNC) 1410 req->work.flags |= IO_WQ_WORK_CONCURRENT; 1411 1412 if (req->flags & REQ_F_ISREG) { 1413 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) 1414 io_wq_hash_work(&req->work, file_inode(req->file)); 1415 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { 1416 if (def->unbound_nonreg_file) 1417 req->work.flags |= IO_WQ_WORK_UNBOUND; 1418 } 1419 1420 switch (req->opcode) { 1421 case IORING_OP_SPLICE: 1422 case IORING_OP_TEE: 1423 if (!S_ISREG(file_inode(req->splice.file_in)->i_mode)) 1424 req->work.flags |= IO_WQ_WORK_UNBOUND; 1425 break; 1426 } 1427} 1428 1429static void io_prep_async_link(struct io_kiocb *req) 1430{ 1431 struct io_kiocb *cur; 1432 1433 if (req->flags & REQ_F_LINK_TIMEOUT) { 1434 struct io_ring_ctx *ctx = req->ctx; 1435 1436 spin_lock(&ctx->completion_lock); 1437 io_for_each_link(cur, req) 1438 io_prep_async_work(cur); 1439 spin_unlock(&ctx->completion_lock); 1440 } else { 1441 io_for_each_link(cur, req) 1442 io_prep_async_work(cur); 1443 } 1444} 1445 1446static void io_queue_async_work(struct io_kiocb *req, bool *locked) 1447{ 1448 struct io_ring_ctx *ctx = req->ctx; 1449 struct io_kiocb *link = io_prep_linked_timeout(req); 1450 struct io_uring_task *tctx = req->task->io_uring; 1451 1452 /* must not take the lock, NULL it as a precaution */ 1453 locked = NULL; 1454 1455 BUG_ON(!tctx); 1456 BUG_ON(!tctx->io_wq); 1457 1458 /* init ->work of the whole link before punting */ 1459 io_prep_async_link(req); 1460 1461 /* 1462 * Not expected to happen, but if we do have a bug where this _can_ 1463 * happen, catch it here and ensure the request is marked as 1464 * canceled. That will make io-wq go through the usual work cancel 1465 * procedure rather than attempt to run this request (or create a new 1466 * worker for it). 
1467 */ 1468 if (WARN_ON_ONCE(!same_thread_group(req->task, current))) 1469 req->work.flags |= IO_WQ_WORK_CANCEL; 1470 1471 trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, 1472 &req->work, req->flags); 1473 io_wq_enqueue(tctx->io_wq, &req->work); 1474 if (link) 1475 io_queue_linked_timeout(link); 1476} 1477 1478static void io_kill_timeout(struct io_kiocb *req, int status) 1479 __must_hold(&req->ctx->completion_lock) 1480 __must_hold(&req->ctx->timeout_lock) 1481{ 1482 struct io_timeout_data *io = req->async_data; 1483 1484 if (hrtimer_try_to_cancel(&io->timer) != -1) { 1485 if (status) 1486 req_set_fail(req); 1487 atomic_set(&req->ctx->cq_timeouts, 1488 atomic_read(&req->ctx->cq_timeouts) + 1); 1489 list_del_init(&req->timeout.list); 1490 io_cqring_fill_event(req->ctx, req->user_data, status, 0); 1491 io_put_req_deferred(req); 1492 } 1493} 1494 1495static void io_queue_deferred(struct io_ring_ctx *ctx) 1496{ 1497 while (!list_empty(&ctx->defer_list)) { 1498 struct io_defer_entry *de = list_first_entry(&ctx->defer_list, 1499 struct io_defer_entry, list); 1500 1501 if (req_need_defer(de->req, de->seq)) 1502 break; 1503 list_del_init(&de->list); 1504 io_req_task_queue(de->req); 1505 kfree(de); 1506 } 1507} 1508 1509static void io_flush_timeouts(struct io_ring_ctx *ctx) 1510 __must_hold(&ctx->completion_lock) 1511{ 1512 u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 1513 1514 spin_lock_irq(&ctx->timeout_lock); 1515 while (!list_empty(&ctx->timeout_list)) { 1516 u32 events_needed, events_got; 1517 struct io_kiocb *req = list_first_entry(&ctx->timeout_list, 1518 struct io_kiocb, timeout.list); 1519 1520 if (io_is_timeout_noseq(req)) 1521 break; 1522 1523 /* 1524 * Since seq can easily wrap around over time, subtract 1525 * the last seq at which timeouts were flushed before comparing. 1526 * Assuming not more than 2^31-1 events have happened since, 1527 * these subtractions won't have wrapped, so we can check if 1528 * target is in [last_seq, current_seq] by comparing the two. 
1529 */ 1530 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush; 1531 events_got = seq - ctx->cq_last_tm_flush; 1532 if (events_got < events_needed) 1533 break; 1534 1535 list_del_init(&req->timeout.list); 1536 io_kill_timeout(req, 0); 1537 } 1538 ctx->cq_last_tm_flush = seq; 1539 spin_unlock_irq(&ctx->timeout_lock); 1540} 1541 1542static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) 1543{ 1544 if (ctx->off_timeout_used) 1545 io_flush_timeouts(ctx); 1546 if (ctx->drain_active) 1547 io_queue_deferred(ctx); 1548} 1549 1550static inline void io_commit_cqring(struct io_ring_ctx *ctx) 1551{ 1552 if (unlikely(ctx->off_timeout_used || ctx->drain_active)) 1553 __io_commit_cqring_flush(ctx); 1554 /* order cqe stores with ring update */ 1555 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); 1556} 1557 1558static inline bool io_sqring_full(struct io_ring_ctx *ctx) 1559{ 1560 struct io_rings *r = ctx->rings; 1561 1562 return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; 1563} 1564 1565static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) 1566{ 1567 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); 1568} 1569 1570static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) 1571{ 1572 struct io_rings *rings = ctx->rings; 1573 unsigned tail, mask = ctx->cq_entries - 1; 1574 1575 /* 1576 * writes to the cq entry need to come after reading head; the 1577 * control dependency is enough as we're using WRITE_ONCE to 1578 * fill the cq entry 1579 */ 1580 if (__io_cqring_events(ctx) == ctx->cq_entries) 1581 return NULL; 1582 1583 tail = ctx->cached_cq_tail++; 1584 return &rings->cqes[tail & mask]; 1585} 1586 1587static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) 1588{ 1589 if (likely(!ctx->cq_ev_fd)) 1590 return false; 1591 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) 1592 return false; 1593 return !ctx->eventfd_async || io_wq_current_is_worker(); 1594} 1595 1596/* 1597 * This should only get called when at least one event has been posted. 1598 * Some applications rely on the eventfd notification count only changing 1599 * IFF a new CQE has been added to the CQ ring. There's no depedency on 1600 * 1:1 relationship between how many times this function is called (and 1601 * hence the eventfd count) and number of CQEs posted to the CQ ring. 1602 */ 1603static void io_cqring_ev_posted(struct io_ring_ctx *ctx) 1604{ 1605 /* 1606 * wake_up_all() may seem excessive, but io_wake_function() and 1607 * io_should_wake() handle the termination of the loop and only 1608 * wake as many waiters as we need to. 
1609 */ 1610 if (wq_has_sleeper(&ctx->cq_wait)) 1611 wake_up_all(&ctx->cq_wait); 1612 if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) 1613 wake_up(&ctx->sq_data->wait); 1614 if (io_should_trigger_evfd(ctx)) 1615 eventfd_signal(ctx->cq_ev_fd, 1); 1616 if (waitqueue_active(&ctx->poll_wait)) 1617 wake_up_interruptible(&ctx->poll_wait); 1618} 1619 1620static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) 1621{ 1622 /* see waitqueue_active() comment */ 1623 smp_mb(); 1624 1625 if (ctx->flags & IORING_SETUP_SQPOLL) { 1626 if (waitqueue_active(&ctx->cq_wait)) 1627 wake_up_all(&ctx->cq_wait); 1628 } 1629 if (io_should_trigger_evfd(ctx)) 1630 eventfd_signal(ctx->cq_ev_fd, 1); 1631 if (waitqueue_active(&ctx->poll_wait)) 1632 wake_up_interruptible(&ctx->poll_wait); 1633} 1634 1635/* Returns true if there are no backlogged entries after the flush */ 1636static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) 1637{ 1638 bool all_flushed, posted; 1639 1640 if (!force && __io_cqring_events(ctx) == ctx->cq_entries) 1641 return false; 1642 1643 posted = false; 1644 spin_lock(&ctx->completion_lock); 1645 while (!list_empty(&ctx->cq_overflow_list)) { 1646 struct io_uring_cqe *cqe = io_get_cqe(ctx); 1647 struct io_overflow_cqe *ocqe; 1648 1649 if (!cqe && !force) 1650 break; 1651 ocqe = list_first_entry(&ctx->cq_overflow_list, 1652 struct io_overflow_cqe, list); 1653 if (cqe) 1654 memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); 1655 else 1656 io_account_cq_overflow(ctx); 1657 1658 posted = true; 1659 list_del(&ocqe->list); 1660 kfree(ocqe); 1661 } 1662 1663 all_flushed = list_empty(&ctx->cq_overflow_list); 1664 if (all_flushed) { 1665 clear_bit(0, &ctx->check_cq_overflow); 1666 WRITE_ONCE(ctx->rings->sq_flags, 1667 ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); 1668 } 1669 1670 if (posted) 1671 io_commit_cqring(ctx); 1672 spin_unlock(&ctx->completion_lock); 1673 if (posted) 1674 io_cqring_ev_posted(ctx); 1675 return all_flushed; 1676} 1677 1678static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) 1679{ 1680 bool ret = true; 1681 1682 if (test_bit(0, &ctx->check_cq_overflow)) { 1683 /* iopoll syncs against uring_lock, not completion_lock */ 1684 if (ctx->flags & IORING_SETUP_IOPOLL) 1685 mutex_lock(&ctx->uring_lock); 1686 ret = __io_cqring_overflow_flush(ctx, false); 1687 if (ctx->flags & IORING_SETUP_IOPOLL) 1688 mutex_unlock(&ctx->uring_lock); 1689 } 1690 1691 return ret; 1692} 1693 1694/* must to be called somewhat shortly after putting a request */ 1695static inline void io_put_task(struct task_struct *task, int nr) 1696{ 1697 struct io_uring_task *tctx = task->io_uring; 1698 1699 if (likely(task == current)) { 1700 tctx->cached_refs += nr; 1701 } else { 1702 percpu_counter_sub(&tctx->inflight, nr); 1703 if (unlikely(atomic_read(&tctx->in_idle))) 1704 wake_up(&tctx->wait); 1705 put_task_struct_many(task, nr); 1706 } 1707} 1708 1709static void io_task_refs_refill(struct io_uring_task *tctx) 1710{ 1711 unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; 1712 1713 percpu_counter_add(&tctx->inflight, refill); 1714 refcount_add(refill, &current->usage); 1715 tctx->cached_refs += refill; 1716} 1717 1718static inline void io_get_task_refs(int nr) 1719{ 1720 struct io_uring_task *tctx = current->io_uring; 1721 1722 tctx->cached_refs -= nr; 1723 if (unlikely(tctx->cached_refs < 0)) 1724 io_task_refs_refill(tctx); 1725} 1726 1727static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, 1728 long res, unsigned int cflags) 1729{ 1730 struct 
io_overflow_cqe *ocqe; 1731 1732 ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); 1733 if (!ocqe) { 1734 /* 1735 * If we're in ring overflow flush mode, or in task cancel mode, 1736 * or cannot allocate an overflow entry, then we need to drop it 1737 * on the floor. 1738 */ 1739 io_account_cq_overflow(ctx); 1740 return false; 1741 } 1742 if (list_empty(&ctx->cq_overflow_list)) { 1743 set_bit(0, &ctx->check_cq_overflow); 1744 WRITE_ONCE(ctx->rings->sq_flags, 1745 ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); 1746 1747 } 1748 ocqe->cqe.user_data = user_data; 1749 ocqe->cqe.res = res; 1750 ocqe->cqe.flags = cflags; 1751 list_add_tail(&ocqe->list, &ctx->cq_overflow_list); 1752 return true; 1753} 1754 1755static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, 1756 long res, unsigned int cflags) 1757{ 1758 struct io_uring_cqe *cqe; 1759 1760 trace_io_uring_complete(ctx, user_data, res, cflags); 1761 1762 /* 1763 * If we can't get a cq entry, userspace overflowed the 1764 * submission (by quite a lot). Increment the overflow count in 1765 * the ring. 1766 */ 1767 cqe = io_get_cqe(ctx); 1768 if (likely(cqe)) { 1769 WRITE_ONCE(cqe->user_data, user_data); 1770 WRITE_ONCE(cqe->res, res); 1771 WRITE_ONCE(cqe->flags, cflags); 1772 return true; 1773 } 1774 return io_cqring_event_overflow(ctx, user_data, res, cflags); 1775} 1776 1777/* not as hot to bloat with inlining */ 1778static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, 1779 long res, unsigned int cflags) 1780{ 1781 return __io_cqring_fill_event(ctx, user_data, res, cflags); 1782} 1783 1784static void io_req_complete_post(struct io_kiocb *req, long res, 1785 unsigned int cflags) 1786{ 1787 struct io_ring_ctx *ctx = req->ctx; 1788 1789 spin_lock(&ctx->completion_lock); 1790 __io_cqring_fill_event(ctx, req->user_data, res, cflags); 1791 /* 1792 * If we're the last reference to this request, add to our locked 1793 * free_list cache. 
1794 */ 1795 if (req_ref_put_and_test(req)) { 1796 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 1797 if (req->flags & IO_DISARM_MASK) 1798 io_disarm_next(req); 1799 if (req->link) { 1800 io_req_task_queue(req->link); 1801 req->link = NULL; 1802 } 1803 } 1804 io_dismantle_req(req); 1805 io_put_task(req->task, 1); 1806 list_add(&req->inflight_entry, &ctx->locked_free_list); 1807 ctx->locked_free_nr++; 1808 } else { 1809 if (!percpu_ref_tryget(&ctx->refs)) 1810 req = NULL; 1811 } 1812 io_commit_cqring(ctx); 1813 spin_unlock(&ctx->completion_lock); 1814 1815 if (req) { 1816 io_cqring_ev_posted(ctx); 1817 percpu_ref_put(&ctx->refs); 1818 } 1819} 1820 1821static inline bool io_req_needs_clean(struct io_kiocb *req) 1822{ 1823 return req->flags & IO_REQ_CLEAN_FLAGS; 1824} 1825 1826static void io_req_complete_state(struct io_kiocb *req, long res, 1827 unsigned int cflags) 1828{ 1829 if (io_req_needs_clean(req)) 1830 io_clean_op(req); 1831 req->result = res; 1832 req->compl.cflags = cflags; 1833 req->flags |= REQ_F_COMPLETE_INLINE; 1834} 1835 1836static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, 1837 long res, unsigned cflags) 1838{ 1839 if (issue_flags & IO_URING_F_COMPLETE_DEFER) 1840 io_req_complete_state(req, res, cflags); 1841 else 1842 io_req_complete_post(req, res, cflags); 1843} 1844 1845static inline void io_req_complete(struct io_kiocb *req, long res) 1846{ 1847 __io_req_complete(req, 0, res, 0); 1848} 1849 1850static void io_req_complete_failed(struct io_kiocb *req, long res) 1851{ 1852 req_set_fail(req); 1853 io_req_complete_post(req, res, 0); 1854} 1855 1856static void io_req_complete_fail_submit(struct io_kiocb *req) 1857{ 1858 /* 1859 * We don't submit, fail them all, for that replace hardlinks with 1860 * normal links. Extra REQ_F_LINK is tolerated. 1861 */ 1862 req->flags &= ~REQ_F_HARDLINK; 1863 req->flags |= REQ_F_LINK; 1864 io_req_complete_failed(req, req->result); 1865} 1866 1867/* 1868 * Don't initialise the fields below on every allocation, but do that in 1869 * advance and keep them valid across allocations. 1870 */ 1871static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) 1872{ 1873 req->ctx = ctx; 1874 req->link = NULL; 1875 req->async_data = NULL; 1876 /* not necessary, but safer to zero */ 1877 req->result = 0; 1878} 1879 1880static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, 1881 struct io_submit_state *state) 1882{ 1883 spin_lock(&ctx->completion_lock); 1884 list_splice_init(&ctx->locked_free_list, &state->free_list); 1885 ctx->locked_free_nr = 0; 1886 spin_unlock(&ctx->completion_lock); 1887} 1888 1889/* Returns true IFF there are requests in the cache */ 1890static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) 1891{ 1892 struct io_submit_state *state = &ctx->submit_state; 1893 int nr; 1894 1895 /* 1896 * If we have more than a batch's worth of requests in our IRQ side 1897 * locked cache, grab the lock and move them over to our submission 1898 * side cache. 
1899 */ 1900 if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) 1901 io_flush_cached_locked_reqs(ctx, state); 1902 1903 nr = state->free_reqs; 1904 while (!list_empty(&state->free_list)) { 1905 struct io_kiocb *req = list_first_entry(&state->free_list, 1906 struct io_kiocb, inflight_entry); 1907 1908 list_del(&req->inflight_entry); 1909 state->reqs[nr++] = req; 1910 if (nr == ARRAY_SIZE(state->reqs)) 1911 break; 1912 } 1913 1914 state->free_reqs = nr; 1915 return nr != 0; 1916} 1917 1918/* 1919 * A request might get retired back into the request caches even before opcode 1920 * handlers and io_issue_sqe() are done with it, e.g. inline completion path. 1921 * Because of that, io_alloc_req() should be called only under ->uring_lock 1922 * and with extra caution to not get a request that is still worked on. 1923 */ 1924static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) 1925 __must_hold(&ctx->uring_lock) 1926{ 1927 struct io_submit_state *state = &ctx->submit_state; 1928 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 1929 int ret, i; 1930 1931 BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH); 1932 1933 if (likely(state->free_reqs || io_flush_cached_reqs(ctx))) 1934 goto got_req; 1935 1936 ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, 1937 state->reqs); 1938 1939 /* 1940 * Bulk alloc is all-or-nothing. If we fail to get a batch, 1941 * retry single alloc to be on the safe side. 1942 */ 1943 if (unlikely(ret <= 0)) { 1944 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); 1945 if (!state->reqs[0]) 1946 return NULL; 1947 ret = 1; 1948 } 1949 1950 for (i = 0; i < ret; i++) 1951 io_preinit_req(state->reqs[i], ctx); 1952 state->free_reqs = ret; 1953got_req: 1954 state->free_reqs--; 1955 return state->reqs[state->free_reqs]; 1956} 1957 1958static inline void io_put_file(struct file *file) 1959{ 1960 if (file) 1961 fput(file); 1962} 1963 1964static void io_dismantle_req(struct io_kiocb *req) 1965{ 1966 unsigned int flags = req->flags; 1967 1968 if (io_req_needs_clean(req)) 1969 io_clean_op(req); 1970 if (!(flags & REQ_F_FIXED_FILE)) 1971 io_put_file(req->file); 1972 if (req->fixed_rsrc_refs) 1973 percpu_ref_put(req->fixed_rsrc_refs); 1974 if (req->async_data) { 1975 kfree(req->async_data); 1976 req->async_data = NULL; 1977 } 1978} 1979 1980static void __io_free_req(struct io_kiocb *req) 1981{ 1982 struct io_ring_ctx *ctx = req->ctx; 1983 1984 io_dismantle_req(req); 1985 io_put_task(req->task, 1); 1986 1987 spin_lock(&ctx->completion_lock); 1988 list_add(&req->inflight_entry, &ctx->locked_free_list); 1989 ctx->locked_free_nr++; 1990 spin_unlock(&ctx->completion_lock); 1991 1992 percpu_ref_put(&ctx->refs); 1993} 1994 1995static inline void io_remove_next_linked(struct io_kiocb *req) 1996{ 1997 struct io_kiocb *nxt = req->link; 1998 1999 req->link = nxt->link; 2000 nxt->link = NULL; 2001} 2002 2003static bool io_kill_linked_timeout(struct io_kiocb *req) 2004 __must_hold(&req->ctx->completion_lock) 2005 __must_hold(&req->ctx->timeout_lock) 2006{ 2007 struct io_kiocb *link = req->link; 2008 2009 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { 2010 struct io_timeout_data *io = link->async_data; 2011 2012 io_remove_next_linked(req); 2013 link->timeout.head = NULL; 2014 if (hrtimer_try_to_cancel(&io->timer) != -1) { 2015 list_del(&link->timeout.list); 2016 io_cqring_fill_event(link->ctx, link->user_data, 2017 -ECANCELED, 0); 2018 io_put_req_deferred(link); 2019 return true; 2020 } 2021 } 2022 return false; 2023} 2024 2025static void io_fail_links(struct io_kiocb *req) 
2026 __must_hold(&req->ctx->completion_lock) 2027{ 2028 struct io_kiocb *nxt, *link = req->link; 2029 2030 req->link = NULL; 2031 while (link) { 2032 long res = -ECANCELED; 2033 2034 if (link->flags & REQ_F_FAIL) 2035 res = link->result; 2036 2037 nxt = link->link; 2038 link->link = NULL; 2039 2040 trace_io_uring_fail_link(req, link); 2041 io_cqring_fill_event(link->ctx, link->user_data, res, 0); 2042 io_put_req_deferred(link); 2043 link = nxt; 2044 } 2045} 2046 2047static bool io_disarm_next(struct io_kiocb *req) 2048 __must_hold(&req->ctx->completion_lock) 2049{ 2050 bool posted = false; 2051 2052 if (req->flags & REQ_F_ARM_LTIMEOUT) { 2053 struct io_kiocb *link = req->link; 2054 2055 req->flags &= ~REQ_F_ARM_LTIMEOUT; 2056 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { 2057 io_remove_next_linked(req); 2058 io_cqring_fill_event(link->ctx, link->user_data, 2059 -ECANCELED, 0); 2060 io_put_req_deferred(link); 2061 posted = true; 2062 } 2063 } else if (req->flags & REQ_F_LINK_TIMEOUT) { 2064 struct io_ring_ctx *ctx = req->ctx; 2065 2066 spin_lock_irq(&ctx->timeout_lock); 2067 posted = io_kill_linked_timeout(req); 2068 spin_unlock_irq(&ctx->timeout_lock); 2069 } 2070 if (unlikely((req->flags & REQ_F_FAIL) && 2071 !(req->flags & REQ_F_HARDLINK))) { 2072 posted |= (req->link != NULL); 2073 io_fail_links(req); 2074 } 2075 return posted; 2076} 2077 2078static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) 2079{ 2080 struct io_kiocb *nxt; 2081 2082 /* 2083 * If LINK is set, we have dependent requests in this chain. If we 2084 * didn't fail this request, queue the first one up, moving any other 2085 * dependencies to the next request. In case of failure, fail the rest 2086 * of the chain. 2087 */ 2088 if (req->flags & IO_DISARM_MASK) { 2089 struct io_ring_ctx *ctx = req->ctx; 2090 bool posted; 2091 2092 spin_lock(&ctx->completion_lock); 2093 posted = io_disarm_next(req); 2094 if (posted) 2095 io_commit_cqring(req->ctx); 2096 spin_unlock(&ctx->completion_lock); 2097 if (posted) 2098 io_cqring_ev_posted(ctx); 2099 } 2100 nxt = req->link; 2101 req->link = NULL; 2102 return nxt; 2103} 2104 2105static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) 2106{ 2107 if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) 2108 return NULL; 2109 return __io_req_find_next(req); 2110} 2111 2112static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) 2113{ 2114 if (!ctx) 2115 return; 2116 if (*locked) { 2117 if (ctx->submit_state.compl_nr) 2118 io_submit_flush_completions(ctx); 2119 mutex_unlock(&ctx->uring_lock); 2120 *locked = false; 2121 } 2122 percpu_ref_put(&ctx->refs); 2123} 2124 2125static void tctx_task_work(struct callback_head *cb) 2126{ 2127 bool locked = false; 2128 struct io_ring_ctx *ctx = NULL; 2129 struct io_uring_task *tctx = container_of(cb, struct io_uring_task, 2130 task_work); 2131 2132 while (1) { 2133 struct io_wq_work_node *node; 2134 2135 if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr) 2136 io_submit_flush_completions(ctx); 2137 2138 spin_lock_irq(&tctx->task_lock); 2139 node = tctx->task_list.first; 2140 INIT_WQ_LIST(&tctx->task_list); 2141 if (!node) 2142 tctx->task_running = false; 2143 spin_unlock_irq(&tctx->task_lock); 2144 if (!node) 2145 break; 2146 2147 do { 2148 struct io_wq_work_node *next = node->next; 2149 struct io_kiocb *req = container_of(node, struct io_kiocb, 2150 io_task_work.node); 2151 2152 if (req->ctx != ctx) { 2153 ctx_flush_and_put(ctx, &locked); 2154 ctx = req->ctx; 2155 /* if not contended, grab and 
improve batching */ 2156 locked = mutex_trylock(&ctx->uring_lock); 2157 percpu_ref_get(&ctx->refs); 2158 } 2159 req->io_task_work.func(req, &locked); 2160 node = next; 2161 } while (node); 2162 2163 cond_resched(); 2164 } 2165 2166 ctx_flush_and_put(ctx, &locked); 2167} 2168 2169static void io_req_task_work_add(struct io_kiocb *req) 2170{ 2171 struct task_struct *tsk = req->task; 2172 struct io_uring_task *tctx = tsk->io_uring; 2173 enum task_work_notify_mode notify; 2174 struct io_wq_work_node *node; 2175 unsigned long flags; 2176 bool running; 2177 2178 WARN_ON_ONCE(!tctx); 2179 2180 spin_lock_irqsave(&tctx->task_lock, flags); 2181 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); 2182 running = tctx->task_running; 2183 if (!running) 2184 tctx->task_running = true; 2185 spin_unlock_irqrestore(&tctx->task_lock, flags); 2186 2187 /* task_work already pending, we're done */ 2188 if (running) 2189 return; 2190 2191 /* 2192 * SQPOLL kernel thread doesn't need notification, just a wakeup. For 2193 * all other cases, use TWA_SIGNAL unconditionally to ensure we're 2194 * processing task_work. There's no reliable way to tell if TWA_RESUME 2195 * will do the job. 2196 */ 2197 notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; 2198 if (!task_work_add(tsk, &tctx->task_work, notify)) { 2199 wake_up_process(tsk); 2200 return; 2201 } 2202 2203 spin_lock_irqsave(&tctx->task_lock, flags); 2204 tctx->task_running = false; 2205 node = tctx->task_list.first; 2206 INIT_WQ_LIST(&tctx->task_list); 2207 spin_unlock_irqrestore(&tctx->task_lock, flags); 2208 2209 while (node) { 2210 req = container_of(node, struct io_kiocb, io_task_work.node); 2211 node = node->next; 2212 if (llist_add(&req->io_task_work.fallback_node, 2213 &req->ctx->fallback_llist)) 2214 schedule_delayed_work(&req->ctx->fallback_work, 1); 2215 } 2216} 2217 2218static void io_req_task_cancel(struct io_kiocb *req, bool *locked) 2219{ 2220 struct io_ring_ctx *ctx = req->ctx; 2221 2222 /* not needed for normal modes, but SQPOLL depends on it */ 2223 io_tw_lock(ctx, locked); 2224 io_req_complete_failed(req, req->result); 2225} 2226 2227static void io_req_task_submit(struct io_kiocb *req, bool *locked) 2228{ 2229 struct io_ring_ctx *ctx = req->ctx; 2230 2231 io_tw_lock(ctx, locked); 2232 /* req->task == current here, checking PF_EXITING is safe */ 2233 if (likely(!(req->task->flags & PF_EXITING))) 2234 __io_queue_sqe(req); 2235 else 2236 io_req_complete_failed(req, -EFAULT); 2237} 2238 2239static void io_req_task_queue_fail(struct io_kiocb *req, int ret) 2240{ 2241 req->result = ret; 2242 req->io_task_work.func = io_req_task_cancel; 2243 io_req_task_work_add(req); 2244} 2245 2246static void io_req_task_queue(struct io_kiocb *req) 2247{ 2248 req->io_task_work.func = io_req_task_submit; 2249 io_req_task_work_add(req); 2250} 2251 2252static void io_req_task_queue_reissue(struct io_kiocb *req) 2253{ 2254 req->io_task_work.func = io_queue_async_work; 2255 io_req_task_work_add(req); 2256} 2257 2258static inline void io_queue_next(struct io_kiocb *req) 2259{ 2260 struct io_kiocb *nxt = io_req_find_next(req); 2261 2262 if (nxt) 2263 io_req_task_queue(nxt); 2264} 2265 2266static void io_free_req(struct io_kiocb *req) 2267{ 2268 io_queue_next(req); 2269 __io_free_req(req); 2270} 2271 2272static void io_free_req_work(struct io_kiocb *req, bool *locked) 2273{ 2274 io_free_req(req); 2275} 2276 2277struct req_batch { 2278 struct task_struct *task; 2279 int task_refs; 2280 int ctx_refs; 2281}; 2282 2283static inline void 
io_init_req_batch(struct req_batch *rb) 2284{ 2285 rb->task_refs = 0; 2286 rb->ctx_refs = 0; 2287 rb->task = NULL; 2288} 2289 2290static void io_req_free_batch_finish(struct io_ring_ctx *ctx, 2291 struct req_batch *rb) 2292{ 2293 if (rb->ctx_refs) 2294 percpu_ref_put_many(&ctx->refs, rb->ctx_refs); 2295 if (rb->task) 2296 io_put_task(rb->task, rb->task_refs); 2297} 2298 2299static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, 2300 struct io_submit_state *state) 2301{ 2302 io_queue_next(req); 2303 io_dismantle_req(req); 2304 2305 if (req->task != rb->task) { 2306 if (rb->task) 2307 io_put_task(rb->task, rb->task_refs); 2308 rb->task = req->task; 2309 rb->task_refs = 0; 2310 } 2311 rb->task_refs++; 2312 rb->ctx_refs++; 2313 2314 if (state->free_reqs != ARRAY_SIZE(state->reqs)) 2315 state->reqs[state->free_reqs++] = req; 2316 else 2317 list_add(&req->inflight_entry, &state->free_list); 2318} 2319 2320static void io_submit_flush_completions(struct io_ring_ctx *ctx) 2321 __must_hold(&ctx->uring_lock) 2322{ 2323 struct io_submit_state *state = &ctx->submit_state; 2324 int i, nr = state->compl_nr; 2325 struct req_batch rb; 2326 2327 spin_lock(&ctx->completion_lock); 2328 for (i = 0; i < nr; i++) { 2329 struct io_kiocb *req = state->compl_reqs[i]; 2330 2331 __io_cqring_fill_event(ctx, req->user_data, req->result, 2332 req->compl.cflags); 2333 } 2334 io_commit_cqring(ctx); 2335 spin_unlock(&ctx->completion_lock); 2336 io_cqring_ev_posted(ctx); 2337 2338 io_init_req_batch(&rb); 2339 for (i = 0; i < nr; i++) { 2340 struct io_kiocb *req = state->compl_reqs[i]; 2341 2342 if (req_ref_put_and_test(req)) 2343 io_req_free_batch(&rb, req, &ctx->submit_state); 2344 } 2345 2346 io_req_free_batch_finish(ctx, &rb); 2347 state->compl_nr = 0; 2348} 2349 2350/* 2351 * Drop reference to request, return next in chain (if there is one) if this 2352 * was the last reference to this request. 
2353 */ 2354static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) 2355{ 2356 struct io_kiocb *nxt = NULL; 2357 2358 if (req_ref_put_and_test(req)) { 2359 nxt = io_req_find_next(req); 2360 __io_free_req(req); 2361 } 2362 return nxt; 2363} 2364 2365static inline void io_put_req(struct io_kiocb *req) 2366{ 2367 if (req_ref_put_and_test(req)) 2368 io_free_req(req); 2369} 2370 2371static inline void io_put_req_deferred(struct io_kiocb *req) 2372{ 2373 if (req_ref_put_and_test(req)) { 2374 req->io_task_work.func = io_free_req_work; 2375 io_req_task_work_add(req); 2376 } 2377} 2378 2379static unsigned io_cqring_events(struct io_ring_ctx *ctx) 2380{ 2381 /* See comment at the top of this file */ 2382 smp_rmb(); 2383 return __io_cqring_events(ctx); 2384} 2385 2386static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) 2387{ 2388 struct io_rings *rings = ctx->rings; 2389 2390 /* make sure SQ entry isn't read before tail */ 2391 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; 2392} 2393 2394static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) 2395{ 2396 unsigned int cflags; 2397 2398 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; 2399 cflags |= IORING_CQE_F_BUFFER; 2400 req->flags &= ~REQ_F_BUFFER_SELECTED; 2401 kfree(kbuf); 2402 return cflags; 2403} 2404 2405static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) 2406{ 2407 struct io_buffer *kbuf; 2408 2409 if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) 2410 return 0; 2411 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 2412 return io_put_kbuf(req, kbuf); 2413} 2414 2415static inline bool io_run_task_work(void) 2416{ 2417 if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) { 2418 __set_current_state(TASK_RUNNING); 2419 tracehook_notify_signal(); 2420 return true; 2421 } 2422 2423 return false; 2424} 2425 2426/* 2427 * Find and free completed poll iocbs 2428 */ 2429static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, 2430 struct list_head *done) 2431{ 2432 struct req_batch rb; 2433 struct io_kiocb *req; 2434 2435 /* order with ->result store in io_complete_rw_iopoll() */ 2436 smp_rmb(); 2437 2438 io_init_req_batch(&rb); 2439 while (!list_empty(done)) { 2440 req = list_first_entry(done, struct io_kiocb, inflight_entry); 2441 list_del(&req->inflight_entry); 2442 2443 __io_cqring_fill_event(ctx, req->user_data, req->result, 2444 io_put_rw_kbuf(req)); 2445 (*nr_events)++; 2446 2447 if (req_ref_put_and_test(req)) 2448 io_req_free_batch(&rb, req, &ctx->submit_state); 2449 } 2450 2451 io_commit_cqring(ctx); 2452 io_cqring_ev_posted_iopoll(ctx); 2453 io_req_free_batch_finish(ctx, &rb); 2454} 2455 2456static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, 2457 long min) 2458{ 2459 struct io_kiocb *req, *tmp; 2460 LIST_HEAD(done); 2461 bool spin; 2462 2463 /* 2464 * Only spin for completions if we don't have multiple devices hanging 2465 * off our complete list, and we're under the requested amount. 2466 */ 2467 spin = !ctx->poll_multi_queue && *nr_events < min; 2468 2469 list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { 2470 struct kiocb *kiocb = &req->rw.kiocb; 2471 int ret; 2472 2473 /* 2474 * Move completed and retryable entries to our local lists. 2475 * If we find a request that requires polling, break out 2476 * and complete those lists first, if we have entries there. 
2477 */ 2478 if (READ_ONCE(req->iopoll_completed)) { 2479 list_move_tail(&req->inflight_entry, &done); 2480 continue; 2481 } 2482 if (!list_empty(&done)) 2483 break; 2484 2485 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); 2486 if (unlikely(ret < 0)) 2487 return ret; 2488 else if (ret) 2489 spin = false; 2490 2491 /* iopoll may have completed current req */ 2492 if (READ_ONCE(req->iopoll_completed)) 2493 list_move_tail(&req->inflight_entry, &done); 2494 } 2495 2496 if (!list_empty(&done)) 2497 io_iopoll_complete(ctx, nr_events, &done); 2498 2499 return 0; 2500} 2501 2502/* 2503 * We can't just wait for polled events to come to us, we have to actively 2504 * find and complete them. 2505 */ 2506static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) 2507{ 2508 if (!(ctx->flags & IORING_SETUP_IOPOLL)) 2509 return; 2510 2511 mutex_lock(&ctx->uring_lock); 2512 while (!list_empty(&ctx->iopoll_list)) { 2513 unsigned int nr_events = 0; 2514 2515 io_do_iopoll(ctx, &nr_events, 0); 2516 2517 /* let it sleep and repeat later if can't complete a request */ 2518 if (nr_events == 0) 2519 break; 2520 /* 2521 * Ensure we allow local-to-the-cpu processing to take place, 2522 * in this case we need to ensure that we reap all events. 2523 * Also let task_work, etc. to progress by releasing the mutex 2524 */ 2525 if (need_resched()) { 2526 mutex_unlock(&ctx->uring_lock); 2527 cond_resched(); 2528 mutex_lock(&ctx->uring_lock); 2529 } 2530 } 2531 mutex_unlock(&ctx->uring_lock); 2532} 2533 2534static int io_iopoll_check(struct io_ring_ctx *ctx, long min) 2535{ 2536 unsigned int nr_events = 0; 2537 int ret = 0; 2538 2539 /* 2540 * We disallow the app entering submit/complete with polling, but we 2541 * still need to lock the ring to prevent racing with polled issue 2542 * that got punted to a workqueue. 2543 */ 2544 mutex_lock(&ctx->uring_lock); 2545 /* 2546 * Don't enter poll loop if we already have events pending. 2547 * If we do, we can potentially be spinning for commands that 2548 * already triggered a CQE (eg in error). 2549 */ 2550 if (test_bit(0, &ctx->check_cq_overflow)) 2551 __io_cqring_overflow_flush(ctx, false); 2552 if (io_cqring_events(ctx)) 2553 goto out; 2554 do { 2555 /* 2556 * If a submit got punted to a workqueue, we can have the 2557 * application entering polling for a command before it gets 2558 * issued. That app will hold the uring_lock for the duration 2559 * of the poll right here, so we need to take a breather every 2560 * now and then to ensure that the issue has a chance to add 2561 * the poll to the issued list. Otherwise we can spin here 2562 * forever, while the workqueue is stuck trying to acquire the 2563 * very same mutex. 2564 */ 2565 if (list_empty(&ctx->iopoll_list)) { 2566 u32 tail = ctx->cached_cq_tail; 2567 2568 mutex_unlock(&ctx->uring_lock); 2569 io_run_task_work(); 2570 mutex_lock(&ctx->uring_lock); 2571 2572 /* some requests don't go through iopoll_list */ 2573 if (tail != ctx->cached_cq_tail || 2574 list_empty(&ctx->iopoll_list)) 2575 break; 2576 } 2577 ret = io_do_iopoll(ctx, &nr_events, min); 2578 } while (!ret && nr_events < min && !need_resched()); 2579out: 2580 mutex_unlock(&ctx->uring_lock); 2581 return ret; 2582} 2583 2584static void kiocb_end_write(struct io_kiocb *req) 2585{ 2586 /* 2587 * Tell lockdep we inherited freeze protection from submission 2588 * thread. 
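	 * This pairs with the sb_start_write() + __sb_writers_release()
	 * sequence that io_write() performs for regular files before it
	 * issues the actual write.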
2589 */ 2590 if (req->flags & REQ_F_ISREG) { 2591 struct super_block *sb = file_inode(req->file)->i_sb; 2592 2593 __sb_writers_acquired(sb, SB_FREEZE_WRITE); 2594 sb_end_write(sb); 2595 } 2596} 2597 2598#ifdef CONFIG_BLOCK 2599static bool io_resubmit_prep(struct io_kiocb *req) 2600{ 2601 struct io_async_rw *rw = req->async_data; 2602 2603 if (!rw) 2604 return !io_req_prep_async(req); 2605 iov_iter_restore(&rw->iter, &rw->iter_state); 2606 return true; 2607} 2608 2609static bool io_rw_should_reissue(struct io_kiocb *req) 2610{ 2611 umode_t mode = file_inode(req->file)->i_mode; 2612 struct io_ring_ctx *ctx = req->ctx; 2613 2614 if (!S_ISBLK(mode) && !S_ISREG(mode)) 2615 return false; 2616 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && 2617 !(ctx->flags & IORING_SETUP_IOPOLL))) 2618 return false; 2619 /* 2620 * If ref is dying, we might be running poll reap from the exit work. 2621 * Don't attempt to reissue from that path, just let it fail with 2622 * -EAGAIN. 2623 */ 2624 if (percpu_ref_is_dying(&ctx->refs)) 2625 return false; 2626 /* 2627 * Play it safe and assume not safe to re-import and reissue if we're 2628 * not in the original thread group (or in task context). 2629 */ 2630 if (!same_thread_group(req->task, current) || !in_task()) 2631 return false; 2632 return true; 2633} 2634#else 2635static bool io_resubmit_prep(struct io_kiocb *req) 2636{ 2637 return false; 2638} 2639static bool io_rw_should_reissue(struct io_kiocb *req) 2640{ 2641 return false; 2642} 2643#endif 2644 2645static bool __io_complete_rw_common(struct io_kiocb *req, long res) 2646{ 2647 if (req->rw.kiocb.ki_flags & IOCB_WRITE) 2648 kiocb_end_write(req); 2649 if (res != req->result) { 2650 if ((res == -EAGAIN || res == -EOPNOTSUPP) && 2651 io_rw_should_reissue(req)) { 2652 req->flags |= REQ_F_REISSUE; 2653 return true; 2654 } 2655 req_set_fail(req); 2656 req->result = res; 2657 } 2658 return false; 2659} 2660 2661static void io_req_task_complete(struct io_kiocb *req, bool *locked) 2662{ 2663 unsigned int cflags = io_put_rw_kbuf(req); 2664 long res = req->result; 2665 2666 if (*locked) { 2667 struct io_ring_ctx *ctx = req->ctx; 2668 struct io_submit_state *state = &ctx->submit_state; 2669 2670 io_req_complete_state(req, res, cflags); 2671 state->compl_reqs[state->compl_nr++] = req; 2672 if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) 2673 io_submit_flush_completions(ctx); 2674 } else { 2675 io_req_complete_post(req, res, cflags); 2676 } 2677} 2678 2679static void __io_complete_rw(struct io_kiocb *req, long res, long res2, 2680 unsigned int issue_flags) 2681{ 2682 if (__io_complete_rw_common(req, res)) 2683 return; 2684 __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req)); 2685} 2686 2687static void io_complete_rw(struct kiocb *kiocb, long res, long res2) 2688{ 2689 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2690 2691 if (__io_complete_rw_common(req, res)) 2692 return; 2693 req->result = res; 2694 req->io_task_work.func = io_req_task_complete; 2695 io_req_task_work_add(req); 2696} 2697 2698static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) 2699{ 2700 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2701 2702 if (kiocb->ki_flags & IOCB_WRITE) 2703 kiocb_end_write(req); 2704 if (unlikely(res != req->result)) { 2705 if (res == -EAGAIN && io_rw_should_reissue(req)) { 2706 req->flags |= REQ_F_REISSUE; 2707 return; 2708 } 2709 } 2710 2711 WRITE_ONCE(req->result, res); 2712 /* order with io_iopoll_complete() 
checking ->result */ 2713 smp_wmb(); 2714 WRITE_ONCE(req->iopoll_completed, 1); 2715} 2716 2717/* 2718 * After the iocb has been issued, it's safe to be found on the poll list. 2719 * Adding the kiocb to the list AFTER submission ensures that we don't 2720 * find it from a io_do_iopoll() thread before the issuer is done 2721 * accessing the kiocb cookie. 2722 */ 2723static void io_iopoll_req_issued(struct io_kiocb *req) 2724{ 2725 struct io_ring_ctx *ctx = req->ctx; 2726 const bool in_async = io_wq_current_is_worker(); 2727 2728 /* workqueue context doesn't hold uring_lock, grab it now */ 2729 if (unlikely(in_async)) 2730 mutex_lock(&ctx->uring_lock); 2731 2732 /* 2733 * Track whether we have multiple files in our lists. This will impact 2734 * how we do polling eventually, not spinning if we're on potentially 2735 * different devices. 2736 */ 2737 if (list_empty(&ctx->iopoll_list)) { 2738 ctx->poll_multi_queue = false; 2739 } else if (!ctx->poll_multi_queue) { 2740 struct io_kiocb *list_req; 2741 unsigned int queue_num0, queue_num1; 2742 2743 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, 2744 inflight_entry); 2745 2746 if (list_req->file != req->file) { 2747 ctx->poll_multi_queue = true; 2748 } else { 2749 queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie); 2750 queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie); 2751 if (queue_num0 != queue_num1) 2752 ctx->poll_multi_queue = true; 2753 } 2754 } 2755 2756 /* 2757 * For fast devices, IO may have already completed. If it has, add 2758 * it to the front so we find it first. 2759 */ 2760 if (READ_ONCE(req->iopoll_completed)) 2761 list_add(&req->inflight_entry, &ctx->iopoll_list); 2762 else 2763 list_add_tail(&req->inflight_entry, &ctx->iopoll_list); 2764 2765 if (unlikely(in_async)) { 2766 /* 2767 * If IORING_SETUP_SQPOLL is enabled, sqes are either handle 2768 * in sq thread task context or in io worker task context. If 2769 * current task context is sq thread, we don't need to check 2770 * whether should wake up sq thread. 2771 */ 2772 if ((ctx->flags & IORING_SETUP_SQPOLL) && 2773 wq_has_sleeper(&ctx->sq_data->wait)) 2774 wake_up(&ctx->sq_data->wait); 2775 2776 mutex_unlock(&ctx->uring_lock); 2777 } 2778} 2779 2780static bool io_bdev_nowait(struct block_device *bdev) 2781{ 2782 return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); 2783} 2784 2785/* 2786 * If we tracked the file through the SCM inflight mechanism, we could support 2787 * any file. For now, just ensure that anything potentially problematic is done 2788 * inline. 
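 *
 * Concretely, "supports nowait" below means: block devices with a
 * nowait-capable queue, sockets, regular files on such devices, and for
 * other file types anything opened O_NONBLOCK or providing FMODE_NOWAIT
 * together with the matching ->read_iter or ->write_iter method.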
2789 */ 2790static bool __io_file_supports_nowait(struct file *file, int rw) 2791{ 2792 umode_t mode = file_inode(file)->i_mode; 2793 2794 if (S_ISBLK(mode)) { 2795 if (IS_ENABLED(CONFIG_BLOCK) && 2796 io_bdev_nowait(I_BDEV(file->f_mapping->host))) 2797 return true; 2798 return false; 2799 } 2800 if (S_ISSOCK(mode)) 2801 return true; 2802 if (S_ISREG(mode)) { 2803 if (IS_ENABLED(CONFIG_BLOCK) && 2804 io_bdev_nowait(file->f_inode->i_sb->s_bdev) && 2805 file->f_op != &io_uring_fops) 2806 return true; 2807 return false; 2808 } 2809 2810 /* any ->read/write should understand O_NONBLOCK */ 2811 if (file->f_flags & O_NONBLOCK) 2812 return true; 2813 2814 if (!(file->f_mode & FMODE_NOWAIT)) 2815 return false; 2816 2817 if (rw == READ) 2818 return file->f_op->read_iter != NULL; 2819 2820 return file->f_op->write_iter != NULL; 2821} 2822 2823static bool io_file_supports_nowait(struct io_kiocb *req, int rw) 2824{ 2825 if (rw == READ && (req->flags & REQ_F_NOWAIT_READ)) 2826 return true; 2827 else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE)) 2828 return true; 2829 2830 return __io_file_supports_nowait(req->file, rw); 2831} 2832 2833static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, 2834 int rw) 2835{ 2836 struct io_ring_ctx *ctx = req->ctx; 2837 struct kiocb *kiocb = &req->rw.kiocb; 2838 struct file *file = req->file; 2839 unsigned ioprio; 2840 int ret; 2841 2842 if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode)) 2843 req->flags |= REQ_F_ISREG; 2844 2845 kiocb->ki_pos = READ_ONCE(sqe->off); 2846 if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) { 2847 req->flags |= REQ_F_CUR_POS; 2848 kiocb->ki_pos = file->f_pos; 2849 } 2850 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); 2851 kiocb->ki_flags = iocb_flags(kiocb->ki_filp); 2852 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); 2853 if (unlikely(ret)) 2854 return ret; 2855 2856 /* 2857 * If the file is marked O_NONBLOCK, still allow retry for it if it 2858 * supports async. Otherwise it's impossible to use O_NONBLOCK files 2859 * reliably. If not, or it IOCB_NOWAIT is set, don't retry. 
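	 *
	 * IOCB_NOWAIT itself normally comes from the application passing
	 * RWF_NOWAIT in sqe->rw_flags, translated by kiocb_set_rw_flags()
	 * above. Rough userspace sketch, assuming liburing's helpers (the
	 * helper names are not part of this file):
	 *
	 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
	 *
	 *	io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);
	 *	sqe->rw_flags = RWF_NOWAIT;
	 *	io_uring_submit(&ring);
	 *
	 * With REQ_F_NOWAIT set, a read that would block completes with
	 * -EAGAIN in the CQE rather than being punted to io-wq.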
2860 */ 2861 if ((kiocb->ki_flags & IOCB_NOWAIT) || 2862 ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw))) 2863 req->flags |= REQ_F_NOWAIT; 2864 2865 ioprio = READ_ONCE(sqe->ioprio); 2866 if (ioprio) { 2867 ret = ioprio_check_cap(ioprio); 2868 if (ret) 2869 return ret; 2870 2871 kiocb->ki_ioprio = ioprio; 2872 } else 2873 kiocb->ki_ioprio = get_current_ioprio(); 2874 2875 if (ctx->flags & IORING_SETUP_IOPOLL) { 2876 if (!(kiocb->ki_flags & IOCB_DIRECT) || 2877 !kiocb->ki_filp->f_op->iopoll) 2878 return -EOPNOTSUPP; 2879 2880 kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; 2881 kiocb->ki_complete = io_complete_rw_iopoll; 2882 req->iopoll_completed = 0; 2883 } else { 2884 if (kiocb->ki_flags & IOCB_HIPRI) 2885 return -EINVAL; 2886 kiocb->ki_complete = io_complete_rw; 2887 } 2888 2889 if (req->opcode == IORING_OP_READ_FIXED || 2890 req->opcode == IORING_OP_WRITE_FIXED) { 2891 req->imu = NULL; 2892 io_req_set_rsrc_node(req); 2893 } 2894 2895 req->rw.addr = READ_ONCE(sqe->addr); 2896 req->rw.len = READ_ONCE(sqe->len); 2897 req->buf_index = READ_ONCE(sqe->buf_index); 2898 return 0; 2899} 2900 2901static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) 2902{ 2903 switch (ret) { 2904 case -EIOCBQUEUED: 2905 break; 2906 case -ERESTARTSYS: 2907 case -ERESTARTNOINTR: 2908 case -ERESTARTNOHAND: 2909 case -ERESTART_RESTARTBLOCK: 2910 /* 2911 * We can't just restart the syscall, since previously 2912 * submitted sqes may already be in progress. Just fail this 2913 * IO with EINTR. 2914 */ 2915 ret = -EINTR; 2916 fallthrough; 2917 default: 2918 kiocb->ki_complete(kiocb, ret, 0); 2919 } 2920} 2921 2922static void kiocb_done(struct kiocb *kiocb, ssize_t ret, 2923 unsigned int issue_flags) 2924{ 2925 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2926 struct io_async_rw *io = req->async_data; 2927 2928 /* add previously done IO, if any */ 2929 if (io && io->bytes_done > 0) { 2930 if (ret < 0) 2931 ret = io->bytes_done; 2932 else 2933 ret += io->bytes_done; 2934 } 2935 2936 if (req->flags & REQ_F_CUR_POS) 2937 req->file->f_pos = kiocb->ki_pos; 2938 if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) 2939 __io_complete_rw(req, ret, 0, issue_flags); 2940 else 2941 io_rw_done(kiocb, ret); 2942 2943 if (req->flags & REQ_F_REISSUE) { 2944 req->flags &= ~REQ_F_REISSUE; 2945 if (io_resubmit_prep(req)) { 2946 io_req_task_queue_reissue(req); 2947 } else { 2948 unsigned int cflags = io_put_rw_kbuf(req); 2949 struct io_ring_ctx *ctx = req->ctx; 2950 2951 req_set_fail(req); 2952 if (!(issue_flags & IO_URING_F_NONBLOCK)) { 2953 mutex_lock(&ctx->uring_lock); 2954 __io_req_complete(req, issue_flags, ret, cflags); 2955 mutex_unlock(&ctx->uring_lock); 2956 } else { 2957 __io_req_complete(req, issue_flags, ret, cflags); 2958 } 2959 } 2960 } 2961} 2962 2963static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, 2964 struct io_mapped_ubuf *imu) 2965{ 2966 size_t len = req->rw.len; 2967 u64 buf_end, buf_addr = req->rw.addr; 2968 size_t offset; 2969 2970 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) 2971 return -EFAULT; 2972 /* not inside the mapped region */ 2973 if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) 2974 return -EFAULT; 2975 2976 /* 2977 * May not be a start of buffer, set size appropriately 2978 * and advance us to the beginning. 
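	 *
	 * Worked example with illustrative numbers and 4K pages: for a
	 * request at imu->ubuf + 5000, offset below is 5000. If the first
	 * bvec covers 4096 bytes, the skip logic subtracts it (offset
	 * becomes 904), skips seg_skip = 1 segment and leaves
	 * iov_offset = 904 within the following bvec.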
2979 */ 2980 offset = buf_addr - imu->ubuf; 2981 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); 2982 2983 if (offset) { 2984 /* 2985 * Don't use iov_iter_advance() here, as it's really slow for 2986 * using the latter parts of a big fixed buffer - it iterates 2987 * over each segment manually. We can cheat a bit here, because 2988 * we know that: 2989 * 2990 * 1) it's a BVEC iter, we set it up 2991 * 2) all bvecs are PAGE_SIZE in size, except potentially the 2992 * first and last bvec 2993 * 2994 * So just find our index, and adjust the iterator afterwards. 2995 * If the offset is within the first bvec (or the whole first 2996 * bvec, just use iov_iter_advance(). This makes it easier 2997 * since we can just skip the first segment, which may not 2998 * be PAGE_SIZE aligned. 2999 */ 3000 const struct bio_vec *bvec = imu->bvec; 3001 3002 if (offset <= bvec->bv_len) { 3003 iov_iter_advance(iter, offset); 3004 } else { 3005 unsigned long seg_skip; 3006 3007 /* skip first vec */ 3008 offset -= bvec->bv_len; 3009 seg_skip = 1 + (offset >> PAGE_SHIFT); 3010 3011 iter->bvec = bvec + seg_skip; 3012 iter->nr_segs -= seg_skip; 3013 iter->count -= bvec->bv_len + offset; 3014 iter->iov_offset = offset & ~PAGE_MASK; 3015 } 3016 } 3017 3018 return 0; 3019} 3020 3021static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) 3022{ 3023 struct io_ring_ctx *ctx = req->ctx; 3024 struct io_mapped_ubuf *imu = req->imu; 3025 u16 index, buf_index = req->buf_index; 3026 3027 if (likely(!imu)) { 3028 if (unlikely(buf_index >= ctx->nr_user_bufs)) 3029 return -EFAULT; 3030 index = array_index_nospec(buf_index, ctx->nr_user_bufs); 3031 imu = READ_ONCE(ctx->user_bufs[index]); 3032 req->imu = imu; 3033 } 3034 return __io_import_fixed(req, rw, iter, imu); 3035} 3036 3037static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) 3038{ 3039 if (needs_lock) 3040 mutex_unlock(&ctx->uring_lock); 3041} 3042 3043static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) 3044{ 3045 /* 3046 * "Normal" inline submissions always hold the uring_lock, since we 3047 * grab it from the system call. Same is true for the SQPOLL offload. 3048 * The only exception is when we've detached the request and issue it 3049 * from an async worker thread, grab the lock for that case. 
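	 * In this file needs_lock ends up true only when the request is
	 * issued without IO_URING_F_NONBLOCK, i.e. from an io-wq worker.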
3050 */ 3051 if (needs_lock) 3052 mutex_lock(&ctx->uring_lock); 3053} 3054 3055static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, 3056 int bgid, struct io_buffer *kbuf, 3057 bool needs_lock) 3058{ 3059 struct io_buffer *head; 3060 3061 if (req->flags & REQ_F_BUFFER_SELECTED) 3062 return kbuf; 3063 3064 io_ring_submit_lock(req->ctx, needs_lock); 3065 3066 lockdep_assert_held(&req->ctx->uring_lock); 3067 3068 head = xa_load(&req->ctx->io_buffers, bgid); 3069 if (head) { 3070 if (!list_empty(&head->list)) { 3071 kbuf = list_last_entry(&head->list, struct io_buffer, 3072 list); 3073 list_del(&kbuf->list); 3074 } else { 3075 kbuf = head; 3076 xa_erase(&req->ctx->io_buffers, bgid); 3077 } 3078 if (*len > kbuf->len) 3079 *len = kbuf->len; 3080 } else { 3081 kbuf = ERR_PTR(-ENOBUFS); 3082 } 3083 3084 io_ring_submit_unlock(req->ctx, needs_lock); 3085 3086 return kbuf; 3087} 3088 3089static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, 3090 bool needs_lock) 3091{ 3092 struct io_buffer *kbuf; 3093 u16 bgid; 3094 3095 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 3096 bgid = req->buf_index; 3097 kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); 3098 if (IS_ERR(kbuf)) 3099 return kbuf; 3100 req->rw.addr = (u64) (unsigned long) kbuf; 3101 req->flags |= REQ_F_BUFFER_SELECTED; 3102 return u64_to_user_ptr(kbuf->addr); 3103} 3104 3105#ifdef CONFIG_COMPAT 3106static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, 3107 bool needs_lock) 3108{ 3109 struct compat_iovec __user *uiov; 3110 compat_ssize_t clen; 3111 void __user *buf; 3112 ssize_t len; 3113 3114 uiov = u64_to_user_ptr(req->rw.addr); 3115 if (!access_ok(uiov, sizeof(*uiov))) 3116 return -EFAULT; 3117 if (__get_user(clen, &uiov->iov_len)) 3118 return -EFAULT; 3119 if (clen < 0) 3120 return -EINVAL; 3121 3122 len = clen; 3123 buf = io_rw_buffer_select(req, &len, needs_lock); 3124 if (IS_ERR(buf)) 3125 return PTR_ERR(buf); 3126 iov[0].iov_base = buf; 3127 iov[0].iov_len = (compat_size_t) len; 3128 return 0; 3129} 3130#endif 3131 3132static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 3133 bool needs_lock) 3134{ 3135 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); 3136 void __user *buf; 3137 ssize_t len; 3138 3139 if (copy_from_user(iov, uiov, sizeof(*uiov))) 3140 return -EFAULT; 3141 3142 len = iov[0].iov_len; 3143 if (len < 0) 3144 return -EINVAL; 3145 buf = io_rw_buffer_select(req, &len, needs_lock); 3146 if (IS_ERR(buf)) 3147 return PTR_ERR(buf); 3148 iov[0].iov_base = buf; 3149 iov[0].iov_len = len; 3150 return 0; 3151} 3152 3153static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 3154 bool needs_lock) 3155{ 3156 if (req->flags & REQ_F_BUFFER_SELECTED) { 3157 struct io_buffer *kbuf; 3158 3159 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 3160 iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 3161 iov[0].iov_len = kbuf->len; 3162 return 0; 3163 } 3164 if (req->rw.len != 1) 3165 return -EINVAL; 3166 3167#ifdef CONFIG_COMPAT 3168 if (req->ctx->compat) 3169 return io_compat_import(req, iov, needs_lock); 3170#endif 3171 3172 return __io_iov_buffer_select(req, iov, needs_lock); 3173} 3174 3175static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, 3176 struct iov_iter *iter, bool needs_lock) 3177{ 3178 void __user *buf = u64_to_user_ptr(req->rw.addr); 3179 size_t sqe_len = req->rw.len; 3180 u8 opcode = req->opcode; 3181 ssize_t ret; 3182 3183 if (opcode == IORING_OP_READ_FIXED 
|| opcode == IORING_OP_WRITE_FIXED) { 3184 *iovec = NULL; 3185 return io_import_fixed(req, rw, iter); 3186 } 3187 3188 /* buffer index only valid with fixed read/write, or buffer select */ 3189 if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)) 3190 return -EINVAL; 3191 3192 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { 3193 if (req->flags & REQ_F_BUFFER_SELECT) { 3194 buf = io_rw_buffer_select(req, &sqe_len, needs_lock); 3195 if (IS_ERR(buf)) 3196 return PTR_ERR(buf); 3197 req->rw.len = sqe_len; 3198 } 3199 3200 ret = import_single_range(rw, buf, sqe_len, *iovec, iter); 3201 *iovec = NULL; 3202 return ret; 3203 } 3204 3205 if (req->flags & REQ_F_BUFFER_SELECT) { 3206 ret = io_iov_buffer_select(req, *iovec, needs_lock); 3207 if (!ret) 3208 iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len); 3209 *iovec = NULL; 3210 return ret; 3211 } 3212 3213 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, 3214 req->ctx->compat); 3215} 3216 3217static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) 3218{ 3219 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; 3220} 3221 3222/* 3223 * For files that don't have ->read_iter() and ->write_iter(), handle them 3224 * by looping over ->read() or ->write() manually. 3225 */ 3226static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) 3227{ 3228 struct kiocb *kiocb = &req->rw.kiocb; 3229 struct file *file = req->file; 3230 ssize_t ret = 0; 3231 3232 /* 3233 * Don't support polled IO through this interface, and we can't 3234 * support non-blocking either. For the latter, this just causes 3235 * the kiocb to be handled from an async context. 3236 */ 3237 if (kiocb->ki_flags & IOCB_HIPRI) 3238 return -EOPNOTSUPP; 3239 if (kiocb->ki_flags & IOCB_NOWAIT) 3240 return -EAGAIN; 3241 3242 while (iov_iter_count(iter)) { 3243 struct iovec iovec; 3244 ssize_t nr; 3245 3246 if (!iov_iter_is_bvec(iter)) { 3247 iovec = iov_iter_iovec(iter); 3248 } else { 3249 iovec.iov_base = u64_to_user_ptr(req->rw.addr); 3250 iovec.iov_len = req->rw.len; 3251 } 3252 3253 if (rw == READ) { 3254 nr = file->f_op->read(file, iovec.iov_base, 3255 iovec.iov_len, io_kiocb_ppos(kiocb)); 3256 } else { 3257 nr = file->f_op->write(file, iovec.iov_base, 3258 iovec.iov_len, io_kiocb_ppos(kiocb)); 3259 } 3260 3261 if (nr < 0) { 3262 if (!ret) 3263 ret = nr; 3264 break; 3265 } 3266 if (!iov_iter_is_bvec(iter)) { 3267 iov_iter_advance(iter, nr); 3268 } else { 3269 req->rw.len -= nr; 3270 req->rw.addr += nr; 3271 } 3272 ret += nr; 3273 if (nr != iovec.iov_len) 3274 break; 3275 } 3276 3277 return ret; 3278} 3279 3280static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, 3281 const struct iovec *fast_iov, struct iov_iter *iter) 3282{ 3283 struct io_async_rw *rw = req->async_data; 3284 3285 memcpy(&rw->iter, iter, sizeof(*iter)); 3286 rw->free_iovec = iovec; 3287 rw->bytes_done = 0; 3288 /* can only be fixed buffers, no need to do anything */ 3289 if (iov_iter_is_bvec(iter)) 3290 return; 3291 if (!iovec) { 3292 unsigned iov_off = 0; 3293 3294 rw->iter.iov = rw->fast_iov; 3295 if (iter->iov != fast_iov) { 3296 iov_off = iter->iov - fast_iov; 3297 rw->iter.iov += iov_off; 3298 } 3299 if (rw->fast_iov != fast_iov) 3300 memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, 3301 sizeof(struct iovec) * iter->nr_segs); 3302 } else { 3303 req->flags |= REQ_F_NEED_CLEANUP; 3304 } 3305} 3306 3307static inline int io_alloc_async_data(struct io_kiocb *req) 3308{ 3309 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); 
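	/*
	 * async_size is the per-opcode side data needed if the request has
	 * to be finished out of line, e.g. struct io_async_rw for
	 * read/write, struct io_async_msghdr for sendmsg/recvmsg or
	 * struct io_timeout_data for timeouts. Opcodes with a zero
	 * async_size should never get here, hence the WARN above.
	 */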
3310 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); 3311 return req->async_data == NULL; 3312} 3313 3314static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, 3315 const struct iovec *fast_iov, 3316 struct iov_iter *iter, bool force) 3317{ 3318 if (!force && !io_op_defs[req->opcode].needs_async_setup) 3319 return 0; 3320 if (!req->async_data) { 3321 struct io_async_rw *iorw; 3322 3323 if (io_alloc_async_data(req)) { 3324 kfree(iovec); 3325 return -ENOMEM; 3326 } 3327 3328 io_req_map_rw(req, iovec, fast_iov, iter); 3329 iorw = req->async_data; 3330 /* we've copied and mapped the iter, ensure state is saved */ 3331 iov_iter_save_state(&iorw->iter, &iorw->iter_state); 3332 } 3333 return 0; 3334} 3335 3336static inline int io_rw_prep_async(struct io_kiocb *req, int rw) 3337{ 3338 struct io_async_rw *iorw = req->async_data; 3339 struct iovec *iov = iorw->fast_iov; 3340 int ret; 3341 3342 ret = io_import_iovec(rw, req, &iov, &iorw->iter, false); 3343 if (unlikely(ret < 0)) 3344 return ret; 3345 3346 iorw->bytes_done = 0; 3347 iorw->free_iovec = iov; 3348 if (iov) 3349 req->flags |= REQ_F_NEED_CLEANUP; 3350 iov_iter_save_state(&iorw->iter, &iorw->iter_state); 3351 return 0; 3352} 3353 3354static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3355{ 3356 if (unlikely(!(req->file->f_mode & FMODE_READ))) 3357 return -EBADF; 3358 return io_prep_rw(req, sqe, READ); 3359} 3360 3361/* 3362 * This is our waitqueue callback handler, registered through lock_page_async() 3363 * when we initially tried to do the IO with the iocb armed our waitqueue. 3364 * This gets called when the page is unlocked, and we generally expect that to 3365 * happen when the page IO is completed and the page is now uptodate. This will 3366 * queue a task_work based retry of the operation, attempting to copy the data 3367 * again. If the latter fails because the page was NOT uptodate, then we will 3368 * do a thread based blocking retry of the operation. That's the unexpected 3369 * slow path. 3370 */ 3371static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, 3372 int sync, void *arg) 3373{ 3374 struct wait_page_queue *wpq; 3375 struct io_kiocb *req = wait->private; 3376 struct wait_page_key *key = arg; 3377 3378 wpq = container_of(wait, struct wait_page_queue, wait); 3379 3380 if (!wake_page_match(wpq, key)) 3381 return 0; 3382 3383 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; 3384 list_del_init(&wait->entry); 3385 io_req_task_queue(req); 3386 return 1; 3387} 3388 3389/* 3390 * This controls whether a given IO request should be armed for async page 3391 * based retry. If we return false here, the request is handed to the async 3392 * worker threads for retry. If we're doing buffered reads on a regular file, 3393 * we prepare a private wait_page_queue entry and retry the operation. This 3394 * will either succeed because the page is now uptodate and unlocked, or it 3395 * will register a callback when the page is unlocked at IO completion. Through 3396 * that callback, io_uring uses task_work to setup a retry of the operation. 3397 * That retry will attempt the buffered read again. The retry will generally 3398 * succeed, or in rare cases where it fails, we then fall back to using the 3399 * async worker threads for a blocking retry. 
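 *
 * From the application's point of view this is transparent: a plain
 * buffered read completes through this retry machinery instead of an io-wq
 * punt when the page cache is cold and the file has FMODE_BUF_RASYNC set.
 * Minimal sketch of such a read, assuming liburing's helpers (names not
 * taken from this file; setup and error handling omitted):
 *
 *	struct io_uring ring;
 *	struct io_uring_cqe *cqe;
 *	char buf[4096];
 *
 *	io_uring_queue_init(8, &ring, 0);
 *	io_uring_prep_read(io_uring_get_sqe(&ring), fd, buf, sizeof(buf), 0);
 *	io_uring_submit(&ring);
 *	io_uring_wait_cqe(&ring, &cqe);
 *	io_uring_cqe_seen(&ring, cqe);
 *
 * cqe->res then holds the byte count or a negative errno.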
3400 */ 3401static bool io_rw_should_retry(struct io_kiocb *req) 3402{ 3403 struct io_async_rw *rw = req->async_data; 3404 struct wait_page_queue *wait = &rw->wpq; 3405 struct kiocb *kiocb = &req->rw.kiocb; 3406 3407 /* never retry for NOWAIT, we just complete with -EAGAIN */ 3408 if (req->flags & REQ_F_NOWAIT) 3409 return false; 3410 3411 /* Only for buffered IO */ 3412 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) 3413 return false; 3414 3415 /* 3416 * just use poll if we can, and don't attempt if the fs doesn't 3417 * support callback based unlocks 3418 */ 3419 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) 3420 return false; 3421 3422 wait->wait.func = io_async_buf_func; 3423 wait->wait.private = req; 3424 wait->wait.flags = 0; 3425 INIT_LIST_HEAD(&wait->wait.entry); 3426 kiocb->ki_flags |= IOCB_WAITQ; 3427 kiocb->ki_flags &= ~IOCB_NOWAIT; 3428 kiocb->ki_waitq = wait; 3429 return true; 3430} 3431 3432static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) 3433{ 3434 if (req->file->f_op->read_iter) 3435 return call_read_iter(req->file, &req->rw.kiocb, iter); 3436 else if (req->file->f_op->read) 3437 return loop_rw_iter(READ, req, iter); 3438 else 3439 return -EINVAL; 3440} 3441 3442static bool need_read_all(struct io_kiocb *req) 3443{ 3444 return req->flags & REQ_F_ISREG || 3445 S_ISBLK(file_inode(req->file)->i_mode); 3446} 3447 3448static int io_read(struct io_kiocb *req, unsigned int issue_flags) 3449{ 3450 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 3451 struct kiocb *kiocb = &req->rw.kiocb; 3452 struct iov_iter __iter, *iter = &__iter; 3453 struct io_async_rw *rw = req->async_data; 3454 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 3455 struct iov_iter_state __state, *state; 3456 ssize_t ret, ret2; 3457 3458 if (rw) { 3459 iter = &rw->iter; 3460 state = &rw->iter_state; 3461 /* 3462 * We come here from an earlier attempt, restore our state to 3463 * match in case it doesn't. It's cheap enough that we don't 3464 * need to make this conditional. 
3465 */ 3466 iov_iter_restore(iter, state); 3467 iovec = NULL; 3468 } else { 3469 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); 3470 if (ret < 0) 3471 return ret; 3472 state = &__state; 3473 iov_iter_save_state(iter, state); 3474 } 3475 req->result = iov_iter_count(iter); 3476 3477 /* Ensure we clear previously set non-block flag */ 3478 if (!force_nonblock) 3479 kiocb->ki_flags &= ~IOCB_NOWAIT; 3480 else 3481 kiocb->ki_flags |= IOCB_NOWAIT; 3482 3483 /* If the file doesn't support async, just async punt */ 3484 if (force_nonblock && !io_file_supports_nowait(req, READ)) { 3485 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); 3486 return ret ?: -EAGAIN; 3487 } 3488 3489 ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result); 3490 if (unlikely(ret)) { 3491 kfree(iovec); 3492 return ret; 3493 } 3494 3495 ret = io_iter_do_read(req, iter); 3496 3497 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { 3498 req->flags &= ~REQ_F_REISSUE; 3499 /* IOPOLL retry should happen for io-wq threads */ 3500 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) 3501 goto done; 3502 /* no retry on NONBLOCK nor RWF_NOWAIT */ 3503 if (req->flags & REQ_F_NOWAIT) 3504 goto done; 3505 ret = 0; 3506 } else if (ret == -EIOCBQUEUED) { 3507 goto out_free; 3508 } else if (ret <= 0 || ret == req->result || !force_nonblock || 3509 (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { 3510 /* read all, failed, already did sync or don't want to retry */ 3511 goto done; 3512 } 3513 3514 /* 3515 * Don't depend on the iter state matching what was consumed, or being 3516 * untouched in case of error. Restore it and we'll advance it 3517 * manually if we need to. 3518 */ 3519 iov_iter_restore(iter, state); 3520 3521 ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true); 3522 if (ret2) 3523 return ret2; 3524 3525 iovec = NULL; 3526 rw = req->async_data; 3527 /* 3528 * Now use our persistent iterator and state, if we aren't already. 3529 * We've restored and mapped the iter to match. 3530 */ 3531 if (iter != &rw->iter) { 3532 iter = &rw->iter; 3533 state = &rw->iter_state; 3534 } 3535 3536 do { 3537 /* 3538 * We end up here because of a partial read, either from 3539 * above or inside this loop. Advance the iter by the bytes 3540 * that were consumed. 3541 */ 3542 iov_iter_advance(iter, ret); 3543 if (!iov_iter_count(iter)) 3544 break; 3545 rw->bytes_done += ret; 3546 iov_iter_save_state(iter, state); 3547 3548 /* if we can retry, do so with the callbacks armed */ 3549 if (!io_rw_should_retry(req)) { 3550 kiocb->ki_flags &= ~IOCB_WAITQ; 3551 return -EAGAIN; 3552 } 3553 3554 /* 3555 * Now retry read with the IOCB_WAITQ parts set in the iocb. If 3556 * we get -EIOCBQUEUED, then we'll get a notification when the 3557 * desired page gets unlocked. We can also get a partial read 3558 * here, and if we do, then just retry at the new offset. 3559 */ 3560 ret = io_iter_do_read(req, iter); 3561 if (ret == -EIOCBQUEUED) 3562 return 0; 3563 /* we got some bytes, but not all. retry. 
*/ 3564 kiocb->ki_flags &= ~IOCB_WAITQ; 3565 iov_iter_restore(iter, state); 3566 } while (ret > 0); 3567done: 3568 kiocb_done(kiocb, ret, issue_flags); 3569out_free: 3570 /* it's faster to check here then delegate to kfree */ 3571 if (iovec) 3572 kfree(iovec); 3573 return 0; 3574} 3575 3576static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3577{ 3578 if (unlikely(!(req->file->f_mode & FMODE_WRITE))) 3579 return -EBADF; 3580 return io_prep_rw(req, sqe, WRITE); 3581} 3582 3583static int io_write(struct io_kiocb *req, unsigned int issue_flags) 3584{ 3585 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 3586 struct kiocb *kiocb = &req->rw.kiocb; 3587 struct iov_iter __iter, *iter = &__iter; 3588 struct io_async_rw *rw = req->async_data; 3589 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 3590 struct iov_iter_state __state, *state; 3591 ssize_t ret, ret2; 3592 3593 if (rw) { 3594 iter = &rw->iter; 3595 state = &rw->iter_state; 3596 iov_iter_restore(iter, state); 3597 iovec = NULL; 3598 } else { 3599 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); 3600 if (ret < 0) 3601 return ret; 3602 state = &__state; 3603 iov_iter_save_state(iter, state); 3604 } 3605 req->result = iov_iter_count(iter); 3606 3607 /* Ensure we clear previously set non-block flag */ 3608 if (!force_nonblock) 3609 kiocb->ki_flags &= ~IOCB_NOWAIT; 3610 else 3611 kiocb->ki_flags |= IOCB_NOWAIT; 3612 3613 /* If the file doesn't support async, just async punt */ 3614 if (force_nonblock && !io_file_supports_nowait(req, WRITE)) 3615 goto copy_iov; 3616 3617 /* file path doesn't support NOWAIT for non-direct_IO */ 3618 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && 3619 (req->flags & REQ_F_ISREG)) 3620 goto copy_iov; 3621 3622 ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result); 3623 if (unlikely(ret)) 3624 goto out_free; 3625 3626 /* 3627 * Open-code file_start_write here to grab freeze protection, 3628 * which will be released by another thread in 3629 * io_complete_rw(). Fool lockdep by telling it the lock got 3630 * released so that it doesn't complain about the held lock when 3631 * we return to userspace. 3632 */ 3633 if (req->flags & REQ_F_ISREG) { 3634 sb_start_write(file_inode(req->file)->i_sb); 3635 __sb_writers_release(file_inode(req->file)->i_sb, 3636 SB_FREEZE_WRITE); 3637 } 3638 kiocb->ki_flags |= IOCB_WRITE; 3639 3640 if (req->file->f_op->write_iter) 3641 ret2 = call_write_iter(req->file, kiocb, iter); 3642 else if (req->file->f_op->write) 3643 ret2 = loop_rw_iter(WRITE, req, iter); 3644 else 3645 ret2 = -EINVAL; 3646 3647 if (req->flags & REQ_F_REISSUE) { 3648 req->flags &= ~REQ_F_REISSUE; 3649 ret2 = -EAGAIN; 3650 } 3651 3652 /* 3653 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just 3654 * retry them without IOCB_NOWAIT. 
3655 */ 3656 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) 3657 ret2 = -EAGAIN; 3658 /* no retry on NONBLOCK nor RWF_NOWAIT */ 3659 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) 3660 goto done; 3661 if (!force_nonblock || ret2 != -EAGAIN) { 3662 /* IOPOLL retry should happen for io-wq threads */ 3663 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) 3664 goto copy_iov; 3665done: 3666 kiocb_done(kiocb, ret2, issue_flags); 3667 } else { 3668copy_iov: 3669 iov_iter_restore(iter, state); 3670 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); 3671 return ret ?: -EAGAIN; 3672 } 3673out_free: 3674 /* it's reportedly faster than delegating the null check to kfree() */ 3675 if (iovec) 3676 kfree(iovec); 3677 return ret; 3678} 3679 3680static int io_renameat_prep(struct io_kiocb *req, 3681 const struct io_uring_sqe *sqe) 3682{ 3683 struct io_rename *ren = &req->rename; 3684 const char __user *oldf, *newf; 3685 3686 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3687 return -EINVAL; 3688 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 3689 return -EINVAL; 3690 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3691 return -EBADF; 3692 3693 ren->old_dfd = READ_ONCE(sqe->fd); 3694 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3695 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3696 ren->new_dfd = READ_ONCE(sqe->len); 3697 ren->flags = READ_ONCE(sqe->rename_flags); 3698 3699 ren->oldpath = getname(oldf); 3700 if (IS_ERR(ren->oldpath)) 3701 return PTR_ERR(ren->oldpath); 3702 3703 ren->newpath = getname(newf); 3704 if (IS_ERR(ren->newpath)) { 3705 putname(ren->oldpath); 3706 return PTR_ERR(ren->newpath); 3707 } 3708 3709 req->flags |= REQ_F_NEED_CLEANUP; 3710 return 0; 3711} 3712 3713static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) 3714{ 3715 struct io_rename *ren = &req->rename; 3716 int ret; 3717 3718 if (issue_flags & IO_URING_F_NONBLOCK) 3719 return -EAGAIN; 3720 3721 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, 3722 ren->newpath, ren->flags); 3723 3724 req->flags &= ~REQ_F_NEED_CLEANUP; 3725 if (ret < 0) 3726 req_set_fail(req); 3727 io_req_complete(req, ret); 3728 return 0; 3729} 3730 3731static int io_unlinkat_prep(struct io_kiocb *req, 3732 const struct io_uring_sqe *sqe) 3733{ 3734 struct io_unlink *un = &req->unlink; 3735 const char __user *fname; 3736 3737 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3738 return -EINVAL; 3739 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || 3740 sqe->splice_fd_in) 3741 return -EINVAL; 3742 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3743 return -EBADF; 3744 3745 un->dfd = READ_ONCE(sqe->fd); 3746 3747 un->flags = READ_ONCE(sqe->unlink_flags); 3748 if (un->flags & ~AT_REMOVEDIR) 3749 return -EINVAL; 3750 3751 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3752 un->filename = getname(fname); 3753 if (IS_ERR(un->filename)) 3754 return PTR_ERR(un->filename); 3755 3756 req->flags |= REQ_F_NEED_CLEANUP; 3757 return 0; 3758} 3759 3760static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) 3761{ 3762 struct io_unlink *un = &req->unlink; 3763 int ret; 3764 3765 if (issue_flags & IO_URING_F_NONBLOCK) 3766 return -EAGAIN; 3767 3768 if (un->flags & AT_REMOVEDIR) 3769 ret = do_rmdir(un->dfd, un->filename); 3770 else 3771 ret = do_unlinkat(un->dfd, un->filename); 3772 3773 req->flags &= ~REQ_F_NEED_CLEANUP; 3774 if (ret < 0) 3775 req_set_fail(req); 3776 io_req_complete(req, ret); 3777 return 0; 3778} 3779 3780static int io_mkdirat_prep(struct 
io_kiocb *req, 3781 const struct io_uring_sqe *sqe) 3782{ 3783 struct io_mkdir *mkd = &req->mkdir; 3784 const char __user *fname; 3785 3786 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3787 return -EINVAL; 3788 if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index || 3789 sqe->splice_fd_in) 3790 return -EINVAL; 3791 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3792 return -EBADF; 3793 3794 mkd->dfd = READ_ONCE(sqe->fd); 3795 mkd->mode = READ_ONCE(sqe->len); 3796 3797 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3798 mkd->filename = getname(fname); 3799 if (IS_ERR(mkd->filename)) 3800 return PTR_ERR(mkd->filename); 3801 3802 req->flags |= REQ_F_NEED_CLEANUP; 3803 return 0; 3804} 3805 3806static int io_mkdirat(struct io_kiocb *req, int issue_flags) 3807{ 3808 struct io_mkdir *mkd = &req->mkdir; 3809 int ret; 3810 3811 if (issue_flags & IO_URING_F_NONBLOCK) 3812 return -EAGAIN; 3813 3814 ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); 3815 3816 req->flags &= ~REQ_F_NEED_CLEANUP; 3817 if (ret < 0) 3818 req_set_fail(req); 3819 io_req_complete(req, ret); 3820 return 0; 3821} 3822 3823static int io_symlinkat_prep(struct io_kiocb *req, 3824 const struct io_uring_sqe *sqe) 3825{ 3826 struct io_symlink *sl = &req->symlink; 3827 const char __user *oldpath, *newpath; 3828 3829 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3830 return -EINVAL; 3831 if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index || 3832 sqe->splice_fd_in) 3833 return -EINVAL; 3834 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3835 return -EBADF; 3836 3837 sl->new_dfd = READ_ONCE(sqe->fd); 3838 oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3839 newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3840 3841 sl->oldpath = getname(oldpath); 3842 if (IS_ERR(sl->oldpath)) 3843 return PTR_ERR(sl->oldpath); 3844 3845 sl->newpath = getname(newpath); 3846 if (IS_ERR(sl->newpath)) { 3847 putname(sl->oldpath); 3848 return PTR_ERR(sl->newpath); 3849 } 3850 3851 req->flags |= REQ_F_NEED_CLEANUP; 3852 return 0; 3853} 3854 3855static int io_symlinkat(struct io_kiocb *req, int issue_flags) 3856{ 3857 struct io_symlink *sl = &req->symlink; 3858 int ret; 3859 3860 if (issue_flags & IO_URING_F_NONBLOCK) 3861 return -EAGAIN; 3862 3863 ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); 3864 3865 req->flags &= ~REQ_F_NEED_CLEANUP; 3866 if (ret < 0) 3867 req_set_fail(req); 3868 io_req_complete(req, ret); 3869 return 0; 3870} 3871 3872static int io_linkat_prep(struct io_kiocb *req, 3873 const struct io_uring_sqe *sqe) 3874{ 3875 struct io_hardlink *lnk = &req->hardlink; 3876 const char __user *oldf, *newf; 3877 3878 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3879 return -EINVAL; 3880 if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) 3881 return -EINVAL; 3882 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3883 return -EBADF; 3884 3885 lnk->old_dfd = READ_ONCE(sqe->fd); 3886 lnk->new_dfd = READ_ONCE(sqe->len); 3887 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3888 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3889 lnk->flags = READ_ONCE(sqe->hardlink_flags); 3890 3891 lnk->oldpath = getname(oldf); 3892 if (IS_ERR(lnk->oldpath)) 3893 return PTR_ERR(lnk->oldpath); 3894 3895 lnk->newpath = getname(newf); 3896 if (IS_ERR(lnk->newpath)) { 3897 putname(lnk->oldpath); 3898 return PTR_ERR(lnk->newpath); 3899 } 3900 3901 req->flags |= REQ_F_NEED_CLEANUP; 3902 return 0; 3903} 3904 3905static int io_linkat(struct io_kiocb *req, int issue_flags) 3906{ 3907 struct io_hardlink *lnk = 
&req->hardlink; 3908 int ret; 3909 3910 if (issue_flags & IO_URING_F_NONBLOCK) 3911 return -EAGAIN; 3912 3913 ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, 3914 lnk->newpath, lnk->flags); 3915 3916 req->flags &= ~REQ_F_NEED_CLEANUP; 3917 if (ret < 0) 3918 req_set_fail(req); 3919 io_req_complete(req, ret); 3920 return 0; 3921} 3922 3923static int io_shutdown_prep(struct io_kiocb *req, 3924 const struct io_uring_sqe *sqe) 3925{ 3926#if defined(CONFIG_NET) 3927 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3928 return -EINVAL; 3929 if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || 3930 sqe->buf_index || sqe->splice_fd_in)) 3931 return -EINVAL; 3932 3933 req->shutdown.how = READ_ONCE(sqe->len); 3934 return 0; 3935#else 3936 return -EOPNOTSUPP; 3937#endif 3938} 3939 3940static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) 3941{ 3942#if defined(CONFIG_NET) 3943 struct socket *sock; 3944 int ret; 3945 3946 if (issue_flags & IO_URING_F_NONBLOCK) 3947 return -EAGAIN; 3948 3949 sock = sock_from_file(req->file); 3950 if (unlikely(!sock)) 3951 return -ENOTSOCK; 3952 3953 ret = __sys_shutdown_sock(sock, req->shutdown.how); 3954 if (ret < 0) 3955 req_set_fail(req); 3956 io_req_complete(req, ret); 3957 return 0; 3958#else 3959 return -EOPNOTSUPP; 3960#endif 3961} 3962 3963static int __io_splice_prep(struct io_kiocb *req, 3964 const struct io_uring_sqe *sqe) 3965{ 3966 struct io_splice *sp = &req->splice; 3967 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; 3968 3969 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3970 return -EINVAL; 3971 3972 sp->file_in = NULL; 3973 sp->len = READ_ONCE(sqe->len); 3974 sp->flags = READ_ONCE(sqe->splice_flags); 3975 3976 if (unlikely(sp->flags & ~valid_flags)) 3977 return -EINVAL; 3978 3979 sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in), 3980 (sp->flags & SPLICE_F_FD_IN_FIXED)); 3981 if (!sp->file_in) 3982 return -EBADF; 3983 req->flags |= REQ_F_NEED_CLEANUP; 3984 return 0; 3985} 3986 3987static int io_tee_prep(struct io_kiocb *req, 3988 const struct io_uring_sqe *sqe) 3989{ 3990 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) 3991 return -EINVAL; 3992 return __io_splice_prep(req, sqe); 3993} 3994 3995static int io_tee(struct io_kiocb *req, unsigned int issue_flags) 3996{ 3997 struct io_splice *sp = &req->splice; 3998 struct file *in = sp->file_in; 3999 struct file *out = sp->file_out; 4000 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 4001 long ret = 0; 4002 4003 if (issue_flags & IO_URING_F_NONBLOCK) 4004 return -EAGAIN; 4005 if (sp->len) 4006 ret = do_tee(in, out, sp->len, flags); 4007 4008 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 4009 io_put_file(in); 4010 req->flags &= ~REQ_F_NEED_CLEANUP; 4011 4012 if (ret != sp->len) 4013 req_set_fail(req); 4014 io_req_complete(req, ret); 4015 return 0; 4016} 4017 4018static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4019{ 4020 struct io_splice *sp = &req->splice; 4021 4022 sp->off_in = READ_ONCE(sqe->splice_off_in); 4023 sp->off_out = READ_ONCE(sqe->off); 4024 return __io_splice_prep(req, sqe); 4025} 4026 4027static int io_splice(struct io_kiocb *req, unsigned int issue_flags) 4028{ 4029 struct io_splice *sp = &req->splice; 4030 struct file *in = sp->file_in; 4031 struct file *out = sp->file_out; 4032 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 4033 loff_t *poff_in, *poff_out; 4034 long ret = 0; 4035 4036 if (issue_flags & IO_URING_F_NONBLOCK) 4037 return -EAGAIN; 4038 
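	/*
	 * An offset of -1 from userspace means "use the file's current
	 * position", which do_splice() expects as a NULL offset pointer;
	 * pipes must always use -1. Illustrative submission, assuming
	 * liburing's helper (not part of this file):
	 *
	 *	io_uring_prep_splice(sqe, pipe_fd, -1, file_fd, 0, 4096,
	 *			     SPLICE_F_MOVE);
	 *
	 * Note that splice always runs from io-wq, as the -EAGAIN return
	 * above forces the nonblocking issue attempt to be punted.
	 */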
4039 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; 4040 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; 4041 4042 if (sp->len) 4043 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); 4044 4045 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 4046 io_put_file(in); 4047 req->flags &= ~REQ_F_NEED_CLEANUP; 4048 4049 if (ret != sp->len) 4050 req_set_fail(req); 4051 io_req_complete(req, ret); 4052 return 0; 4053} 4054 4055/* 4056 * IORING_OP_NOP just posts a completion event, nothing else. 4057 */ 4058static int io_nop(struct io_kiocb *req, unsigned int issue_flags) 4059{ 4060 struct io_ring_ctx *ctx = req->ctx; 4061 4062 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4063 return -EINVAL; 4064 4065 __io_req_complete(req, issue_flags, 0, 0); 4066 return 0; 4067} 4068 4069static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4070{ 4071 struct io_ring_ctx *ctx = req->ctx; 4072 4073 if (!req->file) 4074 return -EBADF; 4075 4076 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4077 return -EINVAL; 4078 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || 4079 sqe->splice_fd_in)) 4080 return -EINVAL; 4081 4082 req->sync.flags = READ_ONCE(sqe->fsync_flags); 4083 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) 4084 return -EINVAL; 4085 4086 req->sync.off = READ_ONCE(sqe->off); 4087 req->sync.len = READ_ONCE(sqe->len); 4088 return 0; 4089} 4090 4091static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) 4092{ 4093 loff_t end = req->sync.off + req->sync.len; 4094 int ret; 4095 4096 /* fsync always requires a blocking context */ 4097 if (issue_flags & IO_URING_F_NONBLOCK) 4098 return -EAGAIN; 4099 4100 ret = vfs_fsync_range(req->file, req->sync.off, 4101 end > 0 ? end : LLONG_MAX, 4102 req->sync.flags & IORING_FSYNC_DATASYNC); 4103 if (ret < 0) 4104 req_set_fail(req); 4105 io_req_complete(req, ret); 4106 return 0; 4107} 4108 4109static int io_fallocate_prep(struct io_kiocb *req, 4110 const struct io_uring_sqe *sqe) 4111{ 4112 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || 4113 sqe->splice_fd_in) 4114 return -EINVAL; 4115 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4116 return -EINVAL; 4117 4118 req->sync.off = READ_ONCE(sqe->off); 4119 req->sync.len = READ_ONCE(sqe->addr); 4120 req->sync.mode = READ_ONCE(sqe->len); 4121 return 0; 4122} 4123 4124static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) 4125{ 4126 int ret; 4127 4128 /* fallocate always requires a blocking context */ 4129 if (issue_flags & IO_URING_F_NONBLOCK) 4130 return -EAGAIN; 4131 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, 4132 req->sync.len); 4133 if (ret < 0) 4134 req_set_fail(req); 4135 io_req_complete(req, ret); 4136 return 0; 4137} 4138 4139static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4140{ 4141 const char __user *fname; 4142 int ret; 4143 4144 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4145 return -EINVAL; 4146 if (unlikely(sqe->ioprio || sqe->buf_index)) 4147 return -EINVAL; 4148 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4149 return -EBADF; 4150 4151 /* open.how should be already initialised */ 4152 if (!(req->open.how.flags & O_PATH) && force_o_largefile()) 4153 req->open.how.flags |= O_LARGEFILE; 4154 4155 req->open.dfd = READ_ONCE(sqe->fd); 4156 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4157 req->open.filename = getname(fname); 4158 if (IS_ERR(req->open.filename)) { 4159 ret = PTR_ERR(req->open.filename); 4160 req->open.filename = NULL; 4161 return ret; 4162 }
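	/*
	 * A non-zero sqe->file_index requests a "direct" open: the resulting
	 * file is installed into the fixed-file table (slot file_index - 1,
	 * see io_openat2()) rather than the task's fd table, so O_CLOEXEC has
	 * no regular descriptor to act on and is rejected below. Hedged
	 * liburing sketch (helper name and "slot" assumed):
	 *
	 *	io_uring_prep_openat_direct(sqe, AT_FDCWD, "somefile",
	 *				    O_RDONLY, 0, slot);
	 */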
4163 4164 req->open.file_slot = READ_ONCE(sqe->file_index); 4165 if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) 4166 return -EINVAL; 4167 4168 req->open.nofile = rlimit(RLIMIT_NOFILE); 4169 req->flags |= REQ_F_NEED_CLEANUP; 4170 return 0; 4171} 4172 4173static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4174{ 4175 u64 mode = READ_ONCE(sqe->len); 4176 u64 flags = READ_ONCE(sqe->open_flags); 4177 4178 req->open.how = build_open_how(flags, mode); 4179 return __io_openat_prep(req, sqe); 4180} 4181 4182static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4183{ 4184 struct open_how __user *how; 4185 size_t len; 4186 int ret; 4187 4188 how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4189 len = READ_ONCE(sqe->len); 4190 if (len < OPEN_HOW_SIZE_VER0) 4191 return -EINVAL; 4192 4193 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, 4194 len); 4195 if (ret) 4196 return ret; 4197 4198 return __io_openat_prep(req, sqe); 4199} 4200 4201static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) 4202{ 4203 struct open_flags op; 4204 struct file *file; 4205 bool resolve_nonblock, nonblock_set; 4206 bool fixed = !!req->open.file_slot; 4207 int ret; 4208 4209 ret = build_open_flags(&req->open.how, &op); 4210 if (ret) 4211 goto err; 4212 nonblock_set = op.open_flag & O_NONBLOCK; 4213 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED; 4214 if (issue_flags & IO_URING_F_NONBLOCK) { 4215 /* 4216 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, 4217 * it'll always -EAGAIN 4218 */ 4219 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) 4220 return -EAGAIN; 4221 op.lookup_flags |= LOOKUP_CACHED; 4222 op.open_flag |= O_NONBLOCK; 4223 } 4224 4225 if (!fixed) { 4226 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); 4227 if (ret < 0) 4228 goto err; 4229 } 4230 4231 file = do_filp_open(req->open.dfd, req->open.filename, &op); 4232 if (IS_ERR(file)) { 4233 /* 4234 * We could hang on to this 'fd' on retrying, but seems like 4235 * marginal gain for something that is now known to be a slower 4236 * path. So just put it, and we'll get a new one when we retry. 
4237 */ 4238 if (!fixed) 4239 put_unused_fd(ret); 4240 4241 ret = PTR_ERR(file); 4242 /* only retry if RESOLVE_CACHED wasn't already set by application */ 4243 if (ret == -EAGAIN && 4244 (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) 4245 return -EAGAIN; 4246 goto err; 4247 } 4248 4249 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) 4250 file->f_flags &= ~O_NONBLOCK; 4251 fsnotify_open(file); 4252 4253 if (!fixed) 4254 fd_install(ret, file); 4255 else 4256 ret = io_install_fixed_file(req, file, issue_flags, 4257 req->open.file_slot - 1); 4258err: 4259 putname(req->open.filename); 4260 req->flags &= ~REQ_F_NEED_CLEANUP; 4261 if (ret < 0) 4262 req_set_fail(req); 4263 __io_req_complete(req, issue_flags, ret, 0); 4264 return 0; 4265} 4266 4267static int io_openat(struct io_kiocb *req, unsigned int issue_flags) 4268{ 4269 return io_openat2(req, issue_flags); 4270} 4271 4272static int io_remove_buffers_prep(struct io_kiocb *req, 4273 const struct io_uring_sqe *sqe) 4274{ 4275 struct io_provide_buf *p = &req->pbuf; 4276 u64 tmp; 4277 4278 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || 4279 sqe->splice_fd_in) 4280 return -EINVAL; 4281 4282 tmp = READ_ONCE(sqe->fd); 4283 if (!tmp || tmp > USHRT_MAX) 4284 return -EINVAL; 4285 4286 memset(p, 0, sizeof(*p)); 4287 p->nbufs = tmp; 4288 p->bgid = READ_ONCE(sqe->buf_group); 4289 return 0; 4290} 4291 4292static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, 4293 int bgid, unsigned nbufs) 4294{ 4295 unsigned i = 0; 4296 4297 /* shouldn't happen */ 4298 if (!nbufs) 4299 return 0; 4300 4301 /* the head kbuf is the list itself */ 4302 while (!list_empty(&buf->list)) { 4303 struct io_buffer *nxt; 4304 4305 nxt = list_first_entry(&buf->list, struct io_buffer, list); 4306 list_del(&nxt->list); 4307 kfree(nxt); 4308 if (++i == nbufs) 4309 return i; 4310 } 4311 i++; 4312 kfree(buf); 4313 xa_erase(&ctx->io_buffers, bgid); 4314 4315 return i; 4316} 4317 4318static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) 4319{ 4320 struct io_provide_buf *p = &req->pbuf; 4321 struct io_ring_ctx *ctx = req->ctx; 4322 struct io_buffer *head; 4323 int ret = 0; 4324 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4325 4326 io_ring_submit_lock(ctx, !force_nonblock); 4327 4328 lockdep_assert_held(&ctx->uring_lock); 4329 4330 ret = -ENOENT; 4331 head = xa_load(&ctx->io_buffers, p->bgid); 4332 if (head) 4333 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); 4334 if (ret < 0) 4335 req_set_fail(req); 4336 4337 /* complete before unlock, IOPOLL may need the lock */ 4338 __io_req_complete(req, issue_flags, ret, 0); 4339 io_ring_submit_unlock(ctx, !force_nonblock); 4340 return 0; 4341} 4342 4343static int io_provide_buffers_prep(struct io_kiocb *req, 4344 const struct io_uring_sqe *sqe) 4345{ 4346 unsigned long size, tmp_check; 4347 struct io_provide_buf *p = &req->pbuf; 4348 u64 tmp; 4349 4350 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) 4351 return -EINVAL; 4352 4353 tmp = READ_ONCE(sqe->fd); 4354 if (!tmp || tmp > USHRT_MAX) 4355 return -E2BIG; 4356 p->nbufs = tmp; 4357 p->addr = READ_ONCE(sqe->addr); 4358 p->len = READ_ONCE(sqe->len); 4359 4360 if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, 4361 &size)) 4362 return -EOVERFLOW; 4363 if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) 4364 return -EOVERFLOW; 4365 4366 size = (unsigned long)p->len * p->nbufs; 4367 if (!access_ok(u64_to_user_ptr(p->addr), size)) 4368 return -EFAULT; 
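	/*
	 * The remaining fields place the buffers: buf_group is the group ID
	 * that IOSQE_BUFFER_SELECT requests later pick from, and off is the
	 * ID of the first buffer (capped at USHRT_MAX, like the count above).
	 * Minimal liburing sketch, assuming the usual helper name and a
	 * user-allocated "base" region of 8 * 4096 bytes, group 1, first
	 * buffer ID 0:
	 *
	 *	io_uring_prep_provide_buffers(sqe, base, 4096, 8, 1, 0);
	 */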
4369 4370 p->bgid = READ_ONCE(sqe->buf_group); 4371 tmp = READ_ONCE(sqe->off); 4372 if (tmp > USHRT_MAX) 4373 return -E2BIG; 4374 p->bid = tmp; 4375 return 0; 4376} 4377 4378static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) 4379{ 4380 struct io_buffer *buf; 4381 u64 addr = pbuf->addr; 4382 int i, bid = pbuf->bid; 4383 4384 for (i = 0; i < pbuf->nbufs; i++) { 4385 buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); 4386 if (!buf) 4387 break; 4388 4389 buf->addr = addr; 4390 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); 4391 buf->bid = bid; 4392 addr += pbuf->len; 4393 bid++; 4394 if (!*head) { 4395 INIT_LIST_HEAD(&buf->list); 4396 *head = buf; 4397 } else { 4398 list_add_tail(&buf->list, &(*head)->list); 4399 } 4400 } 4401 4402 return i ? i : -ENOMEM; 4403} 4404 4405static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) 4406{ 4407 struct io_provide_buf *p = &req->pbuf; 4408 struct io_ring_ctx *ctx = req->ctx; 4409 struct io_buffer *head, *list; 4410 int ret = 0; 4411 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4412 4413 io_ring_submit_lock(ctx, !force_nonblock); 4414 4415 lockdep_assert_held(&ctx->uring_lock); 4416 4417 list = head = xa_load(&ctx->io_buffers, p->bgid); 4418 4419 ret = io_add_buffers(p, &head); 4420 if (ret >= 0 && !list) { 4421 ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL); 4422 if (ret < 0) 4423 __io_remove_buffers(ctx, head, p->bgid, -1U); 4424 } 4425 if (ret < 0) 4426 req_set_fail(req); 4427 /* complete before unlock, IOPOLL may need the lock */ 4428 __io_req_complete(req, issue_flags, ret, 0); 4429 io_ring_submit_unlock(ctx, !force_nonblock); 4430 return 0; 4431} 4432 4433static int io_epoll_ctl_prep(struct io_kiocb *req, 4434 const struct io_uring_sqe *sqe) 4435{ 4436#if defined(CONFIG_EPOLL) 4437 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 4438 return -EINVAL; 4439 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4440 return -EINVAL; 4441 4442 req->epoll.epfd = READ_ONCE(sqe->fd); 4443 req->epoll.op = READ_ONCE(sqe->len); 4444 req->epoll.fd = READ_ONCE(sqe->off); 4445 4446 if (ep_op_has_event(req->epoll.op)) { 4447 struct epoll_event __user *ev; 4448 4449 ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4450 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) 4451 return -EFAULT; 4452 } 4453 4454 return 0; 4455#else 4456 return -EOPNOTSUPP; 4457#endif 4458} 4459 4460static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) 4461{ 4462#if defined(CONFIG_EPOLL) 4463 struct io_epoll *ie = &req->epoll; 4464 int ret; 4465 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4466 4467 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); 4468 if (force_nonblock && ret == -EAGAIN) 4469 return -EAGAIN; 4470 4471 if (ret < 0) 4472 req_set_fail(req); 4473 __io_req_complete(req, issue_flags, ret, 0); 4474 return 0; 4475#else 4476 return -EOPNOTSUPP; 4477#endif 4478} 4479 4480static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4481{ 4482#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 4483 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) 4484 return -EINVAL; 4485 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4486 return -EINVAL; 4487 4488 req->madvise.addr = READ_ONCE(sqe->addr); 4489 req->madvise.len = READ_ONCE(sqe->len); 4490 req->madvise.advice = READ_ONCE(sqe->fadvise_advice); 4491 return 0; 4492#else 4493 return -EOPNOTSUPP; 4494#endif 4495} 4496 4497static int io_madvise(struct 
io_kiocb *req, unsigned int issue_flags) 4498{ 4499#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 4500 struct io_madvise *ma = &req->madvise; 4501 int ret; 4502 4503 if (issue_flags & IO_URING_F_NONBLOCK) 4504 return -EAGAIN; 4505 4506 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); 4507 if (ret < 0) 4508 req_set_fail(req); 4509 io_req_complete(req, ret); 4510 return 0; 4511#else 4512 return -EOPNOTSUPP; 4513#endif 4514} 4515 4516static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4517{ 4518 if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) 4519 return -EINVAL; 4520 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4521 return -EINVAL; 4522 4523 req->fadvise.offset = READ_ONCE(sqe->off); 4524 req->fadvise.len = READ_ONCE(sqe->len); 4525 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); 4526 return 0; 4527} 4528 4529static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) 4530{ 4531 struct io_fadvise *fa = &req->fadvise; 4532 int ret; 4533 4534 if (issue_flags & IO_URING_F_NONBLOCK) { 4535 switch (fa->advice) { 4536 case POSIX_FADV_NORMAL: 4537 case POSIX_FADV_RANDOM: 4538 case POSIX_FADV_SEQUENTIAL: 4539 break; 4540 default: 4541 return -EAGAIN; 4542 } 4543 } 4544 4545 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); 4546 if (ret < 0) 4547 req_set_fail(req); 4548 __io_req_complete(req, issue_flags, ret, 0); 4549 return 0; 4550} 4551 4552static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4553{ 4554 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4555 return -EINVAL; 4556 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 4557 return -EINVAL; 4558 if (req->flags & REQ_F_FIXED_FILE) 4559 return -EBADF; 4560 4561 req->statx.dfd = READ_ONCE(sqe->fd); 4562 req->statx.mask = READ_ONCE(sqe->len); 4563 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4564 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4565 req->statx.flags = READ_ONCE(sqe->statx_flags); 4566 4567 return 0; 4568} 4569 4570static int io_statx(struct io_kiocb *req, unsigned int issue_flags) 4571{ 4572 struct io_statx *ctx = &req->statx; 4573 int ret; 4574 4575 if (issue_flags & IO_URING_F_NONBLOCK) 4576 return -EAGAIN; 4577 4578 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, 4579 ctx->buffer); 4580 4581 if (ret < 0) 4582 req_set_fail(req); 4583 io_req_complete(req, ret); 4584 return 0; 4585} 4586 4587static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4588{ 4589 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4590 return -EINVAL; 4591 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || 4592 sqe->rw_flags || sqe->buf_index) 4593 return -EINVAL; 4594 if (req->flags & REQ_F_FIXED_FILE) 4595 return -EBADF; 4596 4597 req->close.fd = READ_ONCE(sqe->fd); 4598 req->close.file_slot = READ_ONCE(sqe->file_index); 4599 if (req->close.file_slot && req->close.fd) 4600 return -EINVAL; 4601 4602 return 0; 4603} 4604 4605static int io_close(struct io_kiocb *req, unsigned int issue_flags) 4606{ 4607 struct files_struct *files = current->files; 4608 struct io_close *close = &req->close; 4609 struct fdtable *fdt; 4610 struct file *file = NULL; 4611 int ret = -EBADF; 4612 4613 if (req->close.file_slot) { 4614 ret = io_close_fixed(req, issue_flags); 4615 goto err; 4616 } 4617 4618 spin_lock(&files->file_lock); 4619 fdt = files_fdtable(files); 4620 if (close->fd >= fdt->max_fds) { 4621 spin_unlock(&files->file_lock); 4622 goto err; 4623 } 
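	/*
	 * ->file_lock is held and close->fd was bounds-checked against
	 * max_fds above, so the raw fd table lookup below is safe. Files
	 * backed by io_uring itself are refused and complete with -EBADF
	 * rather than being closed through the ring. (Userspace side, as a
	 * hedged liburing sketch: io_uring_prep_close(sqe, fd).)
	 */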
4624 file = fdt->fd[close->fd]; 4625 if (!file || file->f_op == &io_uring_fops) { 4626 spin_unlock(&files->file_lock); 4627 file = NULL; 4628 goto err; 4629 } 4630 4631 /* if the file has a flush method, be safe and punt to async */ 4632 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { 4633 spin_unlock(&files->file_lock); 4634 return -EAGAIN; 4635 } 4636 4637 ret = __close_fd_get_file(close->fd, &file); 4638 spin_unlock(&files->file_lock); 4639 if (ret < 0) { 4640 if (ret == -ENOENT) 4641 ret = -EBADF; 4642 goto err; 4643 } 4644 4645 /* No ->flush() or already async, safely close from here */ 4646 ret = filp_close(file, current->files); 4647err: 4648 if (ret < 0) 4649 req_set_fail(req); 4650 if (file) 4651 fput(file); 4652 __io_req_complete(req, issue_flags, ret, 0); 4653 return 0; 4654} 4655 4656static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4657{ 4658 struct io_ring_ctx *ctx = req->ctx; 4659 4660 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4661 return -EINVAL; 4662 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || 4663 sqe->splice_fd_in)) 4664 return -EINVAL; 4665 4666 req->sync.off = READ_ONCE(sqe->off); 4667 req->sync.len = READ_ONCE(sqe->len); 4668 req->sync.flags = READ_ONCE(sqe->sync_range_flags); 4669 return 0; 4670} 4671 4672static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) 4673{ 4674 int ret; 4675 4676 /* sync_file_range always requires a blocking context */ 4677 if (issue_flags & IO_URING_F_NONBLOCK) 4678 return -EAGAIN; 4679 4680 ret = sync_file_range(req->file, req->sync.off, req->sync.len, 4681 req->sync.flags); 4682 if (ret < 0) 4683 req_set_fail(req); 4684 io_req_complete(req, ret); 4685 return 0; 4686} 4687 4688#if defined(CONFIG_NET) 4689static int io_setup_async_msg(struct io_kiocb *req, 4690 struct io_async_msghdr *kmsg) 4691{ 4692 struct io_async_msghdr *async_msg = req->async_data; 4693 4694 if (async_msg) 4695 return -EAGAIN; 4696 if (io_alloc_async_data(req)) { 4697 kfree(kmsg->free_iov); 4698 return -ENOMEM; 4699 } 4700 async_msg = req->async_data; 4701 req->flags |= REQ_F_NEED_CLEANUP; 4702 memcpy(async_msg, kmsg, sizeof(*kmsg)); 4703 async_msg->msg.msg_name = &async_msg->addr; 4704 /* if we're using fast_iov, set it to the new one */ 4705 if (!async_msg->free_iov) 4706 async_msg->msg.msg_iter.iov = async_msg->fast_iov; 4707 4708 return -EAGAIN; 4709} 4710 4711static int io_sendmsg_copy_hdr(struct io_kiocb *req, 4712 struct io_async_msghdr *iomsg) 4713{ 4714 iomsg->msg.msg_name = &iomsg->addr; 4715 iomsg->free_iov = iomsg->fast_iov; 4716 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, 4717 req->sr_msg.msg_flags, &iomsg->free_iov); 4718} 4719 4720static int io_sendmsg_prep_async(struct io_kiocb *req) 4721{ 4722 int ret; 4723 4724 ret = io_sendmsg_copy_hdr(req, req->async_data); 4725 if (!ret) 4726 req->flags |= REQ_F_NEED_CLEANUP; 4727 return ret; 4728} 4729 4730static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4731{ 4732 struct io_sr_msg *sr = &req->sr_msg; 4733 4734 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4735 return -EINVAL; 4736 4737 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4738 sr->len = READ_ONCE(sqe->len); 4739 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 4740 if (sr->msg_flags & MSG_DONTWAIT) 4741 req->flags |= REQ_F_NOWAIT; 4742 4743#ifdef CONFIG_COMPAT 4744 if (req->ctx->compat) 4745 sr->msg_flags |= MSG_CMSG_COMPAT; 4746#endif 4747 return 0; 4748} 4749 4750static int io_sendmsg(struct io_kiocb
*req, unsigned int issue_flags) 4751{ 4752 struct io_async_msghdr iomsg, *kmsg; 4753 struct socket *sock; 4754 unsigned flags; 4755 int min_ret = 0; 4756 int ret; 4757 4758 sock = sock_from_file(req->file); 4759 if (unlikely(!sock)) 4760 return -ENOTSOCK; 4761 4762 kmsg = req->async_data; 4763 if (!kmsg) { 4764 ret = io_sendmsg_copy_hdr(req, &iomsg); 4765 if (ret) 4766 return ret; 4767 kmsg = &iomsg; 4768 } 4769 4770 flags = req->sr_msg.msg_flags; 4771 if (issue_flags & IO_URING_F_NONBLOCK) 4772 flags |= MSG_DONTWAIT; 4773 if (flags & MSG_WAITALL) 4774 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 4775 4776 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 4777 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) 4778 return io_setup_async_msg(req, kmsg); 4779 if (ret == -ERESTARTSYS) 4780 ret = -EINTR; 4781 4782 /* fast path, check for non-NULL to avoid function call */ 4783 if (kmsg->free_iov) 4784 kfree(kmsg->free_iov); 4785 req->flags &= ~REQ_F_NEED_CLEANUP; 4786 if (ret < min_ret) 4787 req_set_fail(req); 4788 __io_req_complete(req, issue_flags, ret, 0); 4789 return 0; 4790} 4791 4792static int io_send(struct io_kiocb *req, unsigned int issue_flags) 4793{ 4794 struct io_sr_msg *sr = &req->sr_msg; 4795 struct msghdr msg; 4796 struct iovec iov; 4797 struct socket *sock; 4798 unsigned flags; 4799 int min_ret = 0; 4800 int ret; 4801 4802 sock = sock_from_file(req->file); 4803 if (unlikely(!sock)) 4804 return -ENOTSOCK; 4805 4806 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); 4807 if (unlikely(ret)) 4808 return ret; 4809 4810 msg.msg_name = NULL; 4811 msg.msg_control = NULL; 4812 msg.msg_controllen = 0; 4813 msg.msg_namelen = 0; 4814 4815 flags = req->sr_msg.msg_flags; 4816 if (issue_flags & IO_URING_F_NONBLOCK) 4817 flags |= MSG_DONTWAIT; 4818 if (flags & MSG_WAITALL) 4819 min_ret = iov_iter_count(&msg.msg_iter); 4820 4821 msg.msg_flags = flags; 4822 ret = sock_sendmsg(sock, &msg); 4823 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) 4824 return -EAGAIN; 4825 if (ret == -ERESTARTSYS) 4826 ret = -EINTR; 4827 4828 if (ret < min_ret) 4829 req_set_fail(req); 4830 __io_req_complete(req, issue_flags, ret, 0); 4831 return 0; 4832} 4833 4834static int __io_recvmsg_copy_hdr(struct io_kiocb *req, 4835 struct io_async_msghdr *iomsg) 4836{ 4837 struct io_sr_msg *sr = &req->sr_msg; 4838 struct iovec __user *uiov; 4839 size_t iov_len; 4840 int ret; 4841 4842 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, 4843 &iomsg->uaddr, &uiov, &iov_len); 4844 if (ret) 4845 return ret; 4846 4847 if (req->flags & REQ_F_BUFFER_SELECT) { 4848 if (iov_len > 1) 4849 return -EINVAL; 4850 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) 4851 return -EFAULT; 4852 sr->len = iomsg->fast_iov[0].iov_len; 4853 iomsg->free_iov = NULL; 4854 } else { 4855 iomsg->free_iov = iomsg->fast_iov; 4856 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, 4857 &iomsg->free_iov, &iomsg->msg.msg_iter, 4858 false); 4859 if (ret > 0) 4860 ret = 0; 4861 } 4862 4863 return ret; 4864} 4865 4866#ifdef CONFIG_COMPAT 4867static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, 4868 struct io_async_msghdr *iomsg) 4869{ 4870 struct io_sr_msg *sr = &req->sr_msg; 4871 struct compat_iovec __user *uiov; 4872 compat_uptr_t ptr; 4873 compat_size_t len; 4874 int ret; 4875 4876 ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, 4877 &ptr, &len); 4878 if (ret) 4879 return ret; 4880 4881 uiov = compat_ptr(ptr); 4882 if (req->flags & REQ_F_BUFFER_SELECT) { 4883 compat_ssize_t 
clen; 4884 4885 if (len > 1) 4886 return -EINVAL; 4887 if (!access_ok(uiov, sizeof(*uiov))) 4888 return -EFAULT; 4889 if (__get_user(clen, &uiov->iov_len)) 4890 return -EFAULT; 4891 if (clen < 0) 4892 return -EINVAL; 4893 sr->len = clen; 4894 iomsg->free_iov = NULL; 4895 } else { 4896 iomsg->free_iov = iomsg->fast_iov; 4897 ret = __import_iovec(READ, (struct iovec __user *)uiov, len, 4898 UIO_FASTIOV, &iomsg->free_iov, 4899 &iomsg->msg.msg_iter, true); 4900 if (ret < 0) 4901 return ret; 4902 } 4903 4904 return 0; 4905} 4906#endif 4907 4908static int io_recvmsg_copy_hdr(struct io_kiocb *req, 4909 struct io_async_msghdr *iomsg) 4910{ 4911 iomsg->msg.msg_name = &iomsg->addr; 4912 4913#ifdef CONFIG_COMPAT 4914 if (req->ctx->compat) 4915 return __io_compat_recvmsg_copy_hdr(req, iomsg); 4916#endif 4917 4918 return __io_recvmsg_copy_hdr(req, iomsg); 4919} 4920 4921static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, 4922 bool needs_lock) 4923{ 4924 struct io_sr_msg *sr = &req->sr_msg; 4925 struct io_buffer *kbuf; 4926 4927 kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); 4928 if (IS_ERR(kbuf)) 4929 return kbuf; 4930 4931 sr->kbuf = kbuf; 4932 req->flags |= REQ_F_BUFFER_SELECTED; 4933 return kbuf; 4934} 4935 4936static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) 4937{ 4938 return io_put_kbuf(req, req->sr_msg.kbuf); 4939} 4940 4941static int io_recvmsg_prep_async(struct io_kiocb *req) 4942{ 4943 int ret; 4944 4945 ret = io_recvmsg_copy_hdr(req, req->async_data); 4946 if (!ret) 4947 req->flags |= REQ_F_NEED_CLEANUP; 4948 return ret; 4949} 4950 4951static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4952{ 4953 struct io_sr_msg *sr = &req->sr_msg; 4954 4955 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4956 return -EINVAL; 4957 4958 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4959 sr->len = READ_ONCE(sqe->len); 4960 sr->bgid = READ_ONCE(sqe->buf_group); 4961 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 4962 if (sr->msg_flags & MSG_DONTWAIT) 4963 req->flags |= REQ_F_NOWAIT; 4964 4965#ifdef CONFIG_COMPAT 4966 if (req->ctx->compat) 4967 sr->msg_flags |= MSG_CMSG_COMPAT; 4968#endif 4969 return 0; 4970} 4971 4972static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) 4973{ 4974 struct io_async_msghdr iomsg, *kmsg; 4975 struct socket *sock; 4976 struct io_buffer *kbuf; 4977 unsigned flags; 4978 int min_ret = 0; 4979 int ret, cflags = 0; 4980 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4981 4982 sock = sock_from_file(req->file); 4983 if (unlikely(!sock)) 4984 return -ENOTSOCK; 4985 4986 kmsg = req->async_data; 4987 if (!kmsg) { 4988 ret = io_recvmsg_copy_hdr(req, &iomsg); 4989 if (ret) 4990 return ret; 4991 kmsg = &iomsg; 4992 } 4993 4994 if (req->flags & REQ_F_BUFFER_SELECT) { 4995 kbuf = io_recv_buffer_select(req, !force_nonblock); 4996 if (IS_ERR(kbuf)) 4997 return PTR_ERR(kbuf); 4998 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 4999 kmsg->fast_iov[0].iov_len = req->sr_msg.len; 5000 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 5001 1, req->sr_msg.len); 5002 } 5003 5004 flags = req->sr_msg.msg_flags; 5005 if (force_nonblock) 5006 flags |= MSG_DONTWAIT; 5007 if (flags & MSG_WAITALL) 5008 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 5009 5010 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, 5011 kmsg->uaddr, flags); 5012 if (force_nonblock && ret == -EAGAIN) 5013 return io_setup_async_msg(req, kmsg); 5014 if (ret == -ERESTARTSYS) 5015 
ret = -EINTR; 5016 5017 if (req->flags & REQ_F_BUFFER_SELECTED) 5018 cflags = io_put_recv_kbuf(req); 5019 /* fast path, check for non-NULL to avoid function call */ 5020 if (kmsg->free_iov) 5021 kfree(kmsg->free_iov); 5022 req->flags &= ~REQ_F_NEED_CLEANUP; 5023 if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) 5024 req_set_fail(req); 5025 __io_req_complete(req, issue_flags, ret, cflags); 5026 return 0; 5027} 5028 5029static int io_recv(struct io_kiocb *req, unsigned int issue_flags) 5030{ 5031 struct io_buffer *kbuf; 5032 struct io_sr_msg *sr = &req->sr_msg; 5033 struct msghdr msg; 5034 void __user *buf = sr->buf; 5035 struct socket *sock; 5036 struct iovec iov; 5037 unsigned flags; 5038 int min_ret = 0; 5039 int ret, cflags = 0; 5040 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5041 5042 sock = sock_from_file(req->file); 5043 if (unlikely(!sock)) 5044 return -ENOTSOCK; 5045 5046 if (req->flags & REQ_F_BUFFER_SELECT) { 5047 kbuf = io_recv_buffer_select(req, !force_nonblock); 5048 if (IS_ERR(kbuf)) 5049 return PTR_ERR(kbuf); 5050 buf = u64_to_user_ptr(kbuf->addr); 5051 } 5052 5053 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); 5054 if (unlikely(ret)) 5055 goto out_free; 5056 5057 msg.msg_name = NULL; 5058 msg.msg_control = NULL; 5059 msg.msg_controllen = 0; 5060 msg.msg_namelen = 0; 5061 msg.msg_iocb = NULL; 5062 msg.msg_flags = 0; 5063 5064 flags = req->sr_msg.msg_flags; 5065 if (force_nonblock) 5066 flags |= MSG_DONTWAIT; 5067 if (flags & MSG_WAITALL) 5068 min_ret = iov_iter_count(&msg.msg_iter); 5069 5070 ret = sock_recvmsg(sock, &msg, flags); 5071 if (force_nonblock && ret == -EAGAIN) 5072 return -EAGAIN; 5073 if (ret == -ERESTARTSYS) 5074 ret = -EINTR; 5075out_free: 5076 if (req->flags & REQ_F_BUFFER_SELECTED) 5077 cflags = io_put_recv_kbuf(req); 5078 if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) 5079 req_set_fail(req); 5080 __io_req_complete(req, issue_flags, ret, cflags); 5081 return 0; 5082} 5083 5084static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5085{ 5086 struct io_accept *accept = &req->accept; 5087 5088 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5089 return -EINVAL; 5090 if (sqe->ioprio || sqe->len || sqe->buf_index) 5091 return -EINVAL; 5092 5093 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5094 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 5095 accept->flags = READ_ONCE(sqe->accept_flags); 5096 accept->nofile = rlimit(RLIMIT_NOFILE); 5097 5098 accept->file_slot = READ_ONCE(sqe->file_index); 5099 if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) || 5100 (accept->flags & SOCK_CLOEXEC))) 5101 return -EINVAL; 5102 if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 5103 return -EINVAL; 5104 if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) 5105 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 5106 return 0; 5107} 5108 5109static int io_accept(struct io_kiocb *req, unsigned int issue_flags) 5110{ 5111 struct io_accept *accept = &req->accept; 5112 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5113 unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; 5114 bool fixed = !!accept->file_slot; 5115 struct file *file; 5116 int ret, fd; 5117 5118 if (req->file->f_flags & O_NONBLOCK) 5119 req->flags |= REQ_F_NOWAIT; 5120 5121 if (!fixed) { 5122 fd = __get_unused_fd_flags(accept->flags, accept->nofile); 5123 if (unlikely(fd < 0)) 5124 return fd; 5125 } 5126 file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, 5127 accept->flags); 5128 if (IS_ERR(file)) { 5129 if (!fixed) 5130 put_unused_fd(fd); 5131 ret = PTR_ERR(file); 5132 if (ret == -EAGAIN && force_nonblock) 5133 return -EAGAIN; 5134 if (ret == -ERESTARTSYS) 5135 ret = -EINTR; 5136 req_set_fail(req); 5137 } else if (!fixed) { 5138 fd_install(fd, file); 5139 ret = fd; 5140 } else { 5141 ret = io_install_fixed_file(req, file, issue_flags, 5142 accept->file_slot - 1); 5143 } 5144 __io_req_complete(req, issue_flags, ret, 0); 5145 return 0; 5146} 5147 5148static int io_connect_prep_async(struct io_kiocb *req) 5149{ 5150 struct io_async_connect *io = req->async_data; 5151 struct io_connect *conn = &req->connect; 5152 5153 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); 5154} 5155 5156static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5157{ 5158 struct io_connect *conn = &req->connect; 5159 5160 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5161 return -EINVAL; 5162 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || 5163 sqe->splice_fd_in) 5164 return -EINVAL; 5165 5166 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5167 conn->addr_len = READ_ONCE(sqe->addr2); 5168 return 0; 5169} 5170 5171static int io_connect(struct io_kiocb *req, unsigned int issue_flags) 5172{ 5173 struct io_async_connect __io, *io; 5174 unsigned file_flags; 5175 int ret; 5176 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5177 5178 if (req->async_data) { 5179 io = req->async_data; 5180 } else { 5181 ret = move_addr_to_kernel(req->connect.addr, 5182 req->connect.addr_len, 5183 &__io.address); 5184 if (ret) 5185 goto out; 5186 io = &__io; 5187 } 5188 5189 file_flags = force_nonblock ? 
O_NONBLOCK : 0; 5190 5191 ret = __sys_connect_file(req->file, &io->address, 5192 req->connect.addr_len, file_flags); 5193 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { 5194 if (req->async_data) 5195 return -EAGAIN; 5196 if (io_alloc_async_data(req)) { 5197 ret = -ENOMEM; 5198 goto out; 5199 } 5200 memcpy(req->async_data, &__io, sizeof(__io)); 5201 return -EAGAIN; 5202 } 5203 if (ret == -ERESTARTSYS) 5204 ret = -EINTR; 5205out: 5206 if (ret < 0) 5207 req_set_fail(req); 5208 __io_req_complete(req, issue_flags, ret, 0); 5209 return 0; 5210} 5211#else /* !CONFIG_NET */ 5212#define IO_NETOP_FN(op) \ 5213static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \ 5214{ \ 5215 return -EOPNOTSUPP; \ 5216} 5217 5218#define IO_NETOP_PREP(op) \ 5219IO_NETOP_FN(op) \ 5220static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \ 5221{ \ 5222 return -EOPNOTSUPP; \ 5223} \ 5224 5225#define IO_NETOP_PREP_ASYNC(op) \ 5226IO_NETOP_PREP(op) \ 5227static int io_##op##_prep_async(struct io_kiocb *req) \ 5228{ \ 5229 return -EOPNOTSUPP; \ 5230} 5231 5232IO_NETOP_PREP_ASYNC(sendmsg); 5233IO_NETOP_PREP_ASYNC(recvmsg); 5234IO_NETOP_PREP_ASYNC(connect); 5235IO_NETOP_PREP(accept); 5236IO_NETOP_FN(send); 5237IO_NETOP_FN(recv); 5238#endif /* CONFIG_NET */ 5239 5240struct io_poll_table { 5241 struct poll_table_struct pt; 5242 struct io_kiocb *req; 5243 int nr_entries; 5244 int error; 5245}; 5246 5247static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, 5248 __poll_t mask, io_req_tw_func_t func) 5249{ 5250 /* for instances that support it check for an event match first: */ 5251 if (mask && !(mask & poll->events)) 5252 return 0; 5253 5254 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); 5255 5256 list_del_init(&poll->wait.entry); 5257 5258 req->result = mask; 5259 req->io_task_work.func = func; 5260 5261 /* 5262 * If this fails, then the task is exiting. When a task exits, the 5263 * work gets canceled, so just cancel this request as well instead 5264 * of executing it. We can't safely execute it anyway, as we may not 5265 * have the state needed for it.
5266 */ 5267 io_req_task_work_add(req); 5268 return 1; 5269} 5270 5271static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) 5272 __acquires(&req->ctx->completion_lock) 5273{ 5274 struct io_ring_ctx *ctx = req->ctx; 5275 5276 /* req->task == current here, checking PF_EXITING is safe */ 5277 if (unlikely(req->task->flags & PF_EXITING)) 5278 WRITE_ONCE(poll->canceled, true); 5279 5280 if (!req->result && !READ_ONCE(poll->canceled)) { 5281 struct poll_table_struct pt = { ._key = poll->events }; 5282 5283 req->result = vfs_poll(req->file, &pt) & poll->events; 5284 } 5285 5286 spin_lock(&ctx->completion_lock); 5287 if (!req->result && !READ_ONCE(poll->canceled)) { 5288 add_wait_queue(poll->head, &poll->wait); 5289 return true; 5290 } 5291 5292 return false; 5293} 5294 5295static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) 5296{ 5297 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ 5298 if (req->opcode == IORING_OP_POLL_ADD) 5299 return req->async_data; 5300 return req->apoll->double_poll; 5301} 5302 5303static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) 5304{ 5305 if (req->opcode == IORING_OP_POLL_ADD) 5306 return &req->poll; 5307 return &req->apoll->poll; 5308} 5309 5310static void io_poll_remove_double(struct io_kiocb *req) 5311 __must_hold(&req->ctx->completion_lock) 5312{ 5313 struct io_poll_iocb *poll = io_poll_get_double(req); 5314 5315 lockdep_assert_held(&req->ctx->completion_lock); 5316 5317 if (poll && poll->head) { 5318 struct wait_queue_head *head = poll->head; 5319 5320 spin_lock_irq(&head->lock); 5321 list_del_init(&poll->wait.entry); 5322 if (poll->wait.private) 5323 req_ref_put(req); 5324 poll->head = NULL; 5325 spin_unlock_irq(&head->lock); 5326 } 5327} 5328 5329static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) 5330 __must_hold(&req->ctx->completion_lock) 5331{ 5332 struct io_ring_ctx *ctx = req->ctx; 5333 unsigned flags = IORING_CQE_F_MORE; 5334 int error; 5335 5336 if (READ_ONCE(req->poll.canceled)) { 5337 error = -ECANCELED; 5338 req->poll.events |= EPOLLONESHOT; 5339 } else { 5340 error = mangle_poll(mask); 5341 } 5342 if (req->poll.events & EPOLLONESHOT) 5343 flags = 0; 5344 if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { 5345 req->poll.events |= EPOLLONESHOT; 5346 flags = 0; 5347 } 5348 if (flags & IORING_CQE_F_MORE) 5349 ctx->cq_extra++; 5350 5351 return !(flags & IORING_CQE_F_MORE); 5352} 5353 5354static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask) 5355 __must_hold(&req->ctx->completion_lock) 5356{ 5357 bool done; 5358 5359 done = __io_poll_complete(req, mask); 5360 io_commit_cqring(req->ctx); 5361 return done; 5362} 5363 5364static void io_poll_task_func(struct io_kiocb *req, bool *locked) 5365{ 5366 struct io_ring_ctx *ctx = req->ctx; 5367 struct io_kiocb *nxt; 5368 5369 if (io_poll_rewait(req, &req->poll)) { 5370 spin_unlock(&ctx->completion_lock); 5371 } else { 5372 bool done; 5373 5374 if (req->poll.done) { 5375 spin_unlock(&ctx->completion_lock); 5376 return; 5377 } 5378 done = __io_poll_complete(req, req->result); 5379 if (done) { 5380 io_poll_remove_double(req); 5381 hash_del(&req->hash_node); 5382 req->poll.done = true; 5383 } else { 5384 req->result = 0; 5385 add_wait_queue(req->poll.head, &req->poll.wait); 5386 } 5387 io_commit_cqring(ctx); 5388 spin_unlock(&ctx->completion_lock); 5389 io_cqring_ev_posted(ctx); 5390 5391 if (done) { 5392 nxt = io_put_req_find_next(req); 5393 if (nxt) 5394 io_req_task_submit(nxt, locked); 
5395 } 5396 } 5397} 5398 5399static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, 5400 int sync, void *key) 5401{ 5402 struct io_kiocb *req = wait->private; 5403 struct io_poll_iocb *poll = io_poll_get_single(req); 5404 __poll_t mask = key_to_poll(key); 5405 unsigned long flags; 5406 5407 /* for instances that support it check for an event match first: */ 5408 if (mask && !(mask & poll->events)) 5409 return 0; 5410 if (!(poll->events & EPOLLONESHOT)) 5411 return poll->wait.func(&poll->wait, mode, sync, key); 5412 5413 list_del_init(&wait->entry); 5414 5415 if (poll->head) { 5416 bool done; 5417 5418 spin_lock_irqsave(&poll->head->lock, flags); 5419 done = list_empty(&poll->wait.entry); 5420 if (!done) 5421 list_del_init(&poll->wait.entry); 5422 /* make sure double remove sees this as being gone */ 5423 wait->private = NULL; 5424 spin_unlock_irqrestore(&poll->head->lock, flags); 5425 if (!done) { 5426 /* use wait func handler, so it matches the rq type */ 5427 poll->wait.func(&poll->wait, mode, sync, key); 5428 } 5429 } 5430 req_ref_put(req); 5431 return 1; 5432} 5433 5434static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, 5435 wait_queue_func_t wake_func) 5436{ 5437 poll->head = NULL; 5438 poll->done = false; 5439 poll->canceled = false; 5440#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) 5441 /* mask in events that we always want/need */ 5442 poll->events = events | IO_POLL_UNMASK; 5443 INIT_LIST_HEAD(&poll->wait.entry); 5444 init_waitqueue_func_entry(&poll->wait, wake_func); 5445} 5446 5447static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, 5448 struct wait_queue_head *head, 5449 struct io_poll_iocb **poll_ptr) 5450{ 5451 struct io_kiocb *req = pt->req; 5452 5453 /* 5454 * The file being polled uses multiple waitqueues for poll handling 5455 * (e.g. one for read, one for write). Setup a separate io_poll_iocb 5456 * if this happens. 5457 */ 5458 if (unlikely(pt->nr_entries)) { 5459 struct io_poll_iocb *poll_one = poll; 5460 5461 /* double add on the same waitqueue head, ignore */ 5462 if (poll_one->head == head) 5463 return; 5464 /* already have a 2nd entry, fail a third attempt */ 5465 if (*poll_ptr) { 5466 if ((*poll_ptr)->head == head) 5467 return; 5468 pt->error = -EINVAL; 5469 return; 5470 } 5471 /* 5472 * Can't handle multishot for double wait for now, turn it 5473 * into one-shot mode. 
5474 */ 5475 if (!(poll_one->events & EPOLLONESHOT)) 5476 poll_one->events |= EPOLLONESHOT; 5477 poll = kmalloc(sizeof(*poll), GFP_ATOMIC); 5478 if (!poll) { 5479 pt->error = -ENOMEM; 5480 return; 5481 } 5482 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); 5483 req_ref_get(req); 5484 poll->wait.private = req; 5485 *poll_ptr = poll; 5486 } 5487 5488 pt->nr_entries++; 5489 poll->head = head; 5490 5491 if (poll->events & EPOLLEXCLUSIVE) 5492 add_wait_queue_exclusive(head, &poll->wait); 5493 else 5494 add_wait_queue(head, &poll->wait); 5495} 5496 5497static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, 5498 struct poll_table_struct *p) 5499{ 5500 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 5501 struct async_poll *apoll = pt->req->apoll; 5502 5503 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); 5504} 5505 5506static void io_async_task_func(struct io_kiocb *req, bool *locked) 5507{ 5508 struct async_poll *apoll = req->apoll; 5509 struct io_ring_ctx *ctx = req->ctx; 5510 5511 trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data); 5512 5513 if (io_poll_rewait(req, &apoll->poll)) { 5514 spin_unlock(&ctx->completion_lock); 5515 return; 5516 } 5517 5518 hash_del(&req->hash_node); 5519 io_poll_remove_double(req); 5520 apoll->poll.done = true; 5521 spin_unlock(&ctx->completion_lock); 5522 5523 if (!READ_ONCE(apoll->poll.canceled)) 5524 io_req_task_submit(req, locked); 5525 else 5526 io_req_complete_failed(req, -ECANCELED); 5527} 5528 5529static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 5530 void *key) 5531{ 5532 struct io_kiocb *req = wait->private; 5533 struct io_poll_iocb *poll = &req->apoll->poll; 5534 5535 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, 5536 key_to_poll(key)); 5537 5538 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); 5539} 5540 5541static void io_poll_req_insert(struct io_kiocb *req) 5542{ 5543 struct io_ring_ctx *ctx = req->ctx; 5544 struct hlist_head *list; 5545 5546 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; 5547 hlist_add_head(&req->hash_node, list); 5548} 5549 5550static __poll_t __io_arm_poll_handler(struct io_kiocb *req, 5551 struct io_poll_iocb *poll, 5552 struct io_poll_table *ipt, __poll_t mask, 5553 wait_queue_func_t wake_func) 5554 __acquires(&ctx->completion_lock) 5555{ 5556 struct io_ring_ctx *ctx = req->ctx; 5557 bool cancel = false; 5558 5559 INIT_HLIST_NODE(&req->hash_node); 5560 io_init_poll_iocb(poll, mask, wake_func); 5561 poll->file = req->file; 5562 poll->wait.private = req; 5563 5564 ipt->pt._key = mask; 5565 ipt->req = req; 5566 ipt->error = 0; 5567 ipt->nr_entries = 0; 5568 5569 mask = vfs_poll(req->file, &ipt->pt) & poll->events; 5570 if (unlikely(!ipt->nr_entries) && !ipt->error) 5571 ipt->error = -EINVAL; 5572 5573 spin_lock(&ctx->completion_lock); 5574 if (ipt->error || (mask && (poll->events & EPOLLONESHOT))) 5575 io_poll_remove_double(req); 5576 if (likely(poll->head)) { 5577 spin_lock_irq(&poll->head->lock); 5578 if (unlikely(list_empty(&poll->wait.entry))) { 5579 if (ipt->error) 5580 cancel = true; 5581 ipt->error = 0; 5582 mask = 0; 5583 } 5584 if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error) 5585 list_del_init(&poll->wait.entry); 5586 else if (cancel) 5587 WRITE_ONCE(poll->canceled, true); 5588 else if (!poll->done) /* actually waiting for an event */ 5589 io_poll_req_insert(req); 5590 spin_unlock_irq(&poll->head->lock); 5591 } 5592 5593 
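	/*
	 * Returns with ->completion_lock held (see the __acquires annotation
	 * on this function). A non-zero mask means the requested events were
	 * already pending while arming, so the caller can complete or issue
	 * the request right away instead of waiting for a wakeup.
	 */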
return mask; 5594} 5595 5596enum { 5597 IO_APOLL_OK, 5598 IO_APOLL_ABORTED, 5599 IO_APOLL_READY 5600}; 5601 5602static int io_arm_poll_handler(struct io_kiocb *req) 5603{ 5604 const struct io_op_def *def = &io_op_defs[req->opcode]; 5605 struct io_ring_ctx *ctx = req->ctx; 5606 struct async_poll *apoll; 5607 struct io_poll_table ipt; 5608 __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI; 5609 int rw; 5610 5611 if (!req->file || !file_can_poll(req->file)) 5612 return IO_APOLL_ABORTED; 5613 if (req->flags & REQ_F_POLLED) 5614 return IO_APOLL_ABORTED; 5615 if (!def->pollin && !def->pollout) 5616 return IO_APOLL_ABORTED; 5617 5618 if (def->pollin) { 5619 rw = READ; 5620 mask |= POLLIN | POLLRDNORM; 5621 5622 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ 5623 if ((req->opcode == IORING_OP_RECVMSG) && 5624 (req->sr_msg.msg_flags & MSG_ERRQUEUE)) 5625 mask &= ~POLLIN; 5626 } else { 5627 rw = WRITE; 5628 mask |= POLLOUT | POLLWRNORM; 5629 } 5630 5631 /* if we can't nonblock try, then no point in arming a poll handler */ 5632 if (!io_file_supports_nowait(req, rw)) 5633 return IO_APOLL_ABORTED; 5634 5635 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); 5636 if (unlikely(!apoll)) 5637 return IO_APOLL_ABORTED; 5638 apoll->double_poll = NULL; 5639 req->apoll = apoll; 5640 req->flags |= REQ_F_POLLED; 5641 ipt.pt._qproc = io_async_queue_proc; 5642 io_req_set_refcount(req); 5643 5644 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, 5645 io_async_wake); 5646 spin_unlock(&ctx->completion_lock); 5647 if (ret || ipt.error) 5648 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; 5649 5650 trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, 5651 mask, apoll->poll.events); 5652 return IO_APOLL_OK; 5653} 5654 5655static bool __io_poll_remove_one(struct io_kiocb *req, 5656 struct io_poll_iocb *poll, bool do_cancel) 5657 __must_hold(&req->ctx->completion_lock) 5658{ 5659 bool do_complete = false; 5660 5661 if (!poll->head) 5662 return false; 5663 spin_lock_irq(&poll->head->lock); 5664 if (do_cancel) 5665 WRITE_ONCE(poll->canceled, true); 5666 if (!list_empty(&poll->wait.entry)) { 5667 list_del_init(&poll->wait.entry); 5668 do_complete = true; 5669 } 5670 spin_unlock_irq(&poll->head->lock); 5671 hash_del(&req->hash_node); 5672 return do_complete; 5673} 5674 5675static bool io_poll_remove_one(struct io_kiocb *req) 5676 __must_hold(&req->ctx->completion_lock) 5677{ 5678 bool do_complete; 5679 5680 io_poll_remove_double(req); 5681 do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); 5682 5683 if (do_complete) { 5684 io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); 5685 io_commit_cqring(req->ctx); 5686 req_set_fail(req); 5687 io_put_req_deferred(req); 5688 } 5689 return do_complete; 5690} 5691 5692/* 5693 * Returns true if we found and killed one or more poll requests 5694 */ 5695static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, 5696 bool cancel_all) 5697{ 5698 struct hlist_node *tmp; 5699 struct io_kiocb *req; 5700 int posted = 0, i; 5701 5702 spin_lock(&ctx->completion_lock); 5703 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 5704 struct hlist_head *list; 5705 5706 list = &ctx->cancel_hash[i]; 5707 hlist_for_each_entry_safe(req, tmp, list, hash_node) { 5708 if (io_match_task(req, tsk, cancel_all)) 5709 posted += io_poll_remove_one(req); 5710 } 5711 } 5712 spin_unlock(&ctx->completion_lock); 5713 5714 if (posted) 5715 io_cqring_ev_posted(ctx); 5716 5717 return posted != 0; 5718} 5719 5720static struct io_kiocb 
*io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, 5721 bool poll_only) 5722 __must_hold(&ctx->completion_lock) 5723{ 5724 struct hlist_head *list; 5725 struct io_kiocb *req; 5726 5727 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; 5728 hlist_for_each_entry(req, list, hash_node) { 5729 if (sqe_addr != req->user_data) 5730 continue; 5731 if (poll_only && req->opcode != IORING_OP_POLL_ADD) 5732 continue; 5733 return req; 5734 } 5735 return NULL; 5736} 5737 5738static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, 5739 bool poll_only) 5740 __must_hold(&ctx->completion_lock) 5741{ 5742 struct io_kiocb *req; 5743 5744 req = io_poll_find(ctx, sqe_addr, poll_only); 5745 if (!req) 5746 return -ENOENT; 5747 if (io_poll_remove_one(req)) 5748 return 0; 5749 5750 return -EALREADY; 5751} 5752 5753static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, 5754 unsigned int flags) 5755{ 5756 u32 events; 5757 5758 events = READ_ONCE(sqe->poll32_events); 5759#ifdef __BIG_ENDIAN 5760 events = swahw32(events); 5761#endif 5762 if (!(flags & IORING_POLL_ADD_MULTI)) 5763 events |= EPOLLONESHOT; 5764 return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); 5765} 5766 5767static int io_poll_update_prep(struct io_kiocb *req, 5768 const struct io_uring_sqe *sqe) 5769{ 5770 struct io_poll_update *upd = &req->poll_update; 5771 u32 flags; 5772 5773 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5774 return -EINVAL; 5775 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 5776 return -EINVAL; 5777 flags = READ_ONCE(sqe->len); 5778 if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | 5779 IORING_POLL_ADD_MULTI)) 5780 return -EINVAL; 5781 /* meaningless without update */ 5782 if (flags == IORING_POLL_ADD_MULTI) 5783 return -EINVAL; 5784 5785 upd->old_user_data = READ_ONCE(sqe->addr); 5786 upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; 5787 upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; 5788 5789 upd->new_user_data = READ_ONCE(sqe->off); 5790 if (!upd->update_user_data && upd->new_user_data) 5791 return -EINVAL; 5792 if (upd->update_events) 5793 upd->events = io_poll_parse_events(sqe, flags); 5794 else if (sqe->poll32_events) 5795 return -EINVAL; 5796 5797 return 0; 5798} 5799 5800static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 5801 void *key) 5802{ 5803 struct io_kiocb *req = wait->private; 5804 struct io_poll_iocb *poll = &req->poll; 5805 5806 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); 5807} 5808 5809static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, 5810 struct poll_table_struct *p) 5811{ 5812 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 5813 5814 __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data); 5815} 5816 5817static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5818{ 5819 struct io_poll_iocb *poll = &req->poll; 5820 u32 flags; 5821 5822 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5823 return -EINVAL; 5824 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) 5825 return -EINVAL; 5826 flags = READ_ONCE(sqe->len); 5827 if (flags & ~IORING_POLL_ADD_MULTI) 5828 return -EINVAL; 5829 5830 io_req_set_refcount(req); 5831 poll->events = io_poll_parse_events(sqe, flags); 5832 return 0; 5833} 5834 5835static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) 5836{ 5837 struct io_poll_iocb *poll = 
&req->poll; 5838 struct io_ring_ctx *ctx = req->ctx; 5839 struct io_poll_table ipt; 5840 __poll_t mask; 5841 bool done; 5842 5843 ipt.pt._qproc = io_poll_queue_proc; 5844 5845 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, 5846 io_poll_wake); 5847 5848 if (mask) { /* no async, we'd stolen it */ 5849 ipt.error = 0; 5850 done = io_poll_complete(req, mask); 5851 } 5852 spin_unlock(&ctx->completion_lock); 5853 5854 if (mask) { 5855 io_cqring_ev_posted(ctx); 5856 if (done) 5857 io_put_req(req); 5858 } 5859 return ipt.error; 5860} 5861 5862static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) 5863{ 5864 struct io_ring_ctx *ctx = req->ctx; 5865 struct io_kiocb *preq; 5866 bool completing; 5867 int ret; 5868 5869 spin_lock(&ctx->completion_lock); 5870 preq = io_poll_find(ctx, req->poll_update.old_user_data, true); 5871 if (!preq) { 5872 ret = -ENOENT; 5873 goto err; 5874 } 5875 5876 if (!req->poll_update.update_events && !req->poll_update.update_user_data) { 5877 completing = true; 5878 ret = io_poll_remove_one(preq) ? 0 : -EALREADY; 5879 goto err; 5880 } 5881 5882 /* 5883 * Don't allow racy completion with singleshot, as we cannot safely 5884 * update those. For multishot, if we're racing with completion, just 5885 * let completion re-add it. 5886 */ 5887 completing = !__io_poll_remove_one(preq, &preq->poll, false); 5888 if (completing && (preq->poll.events & EPOLLONESHOT)) { 5889 ret = -EALREADY; 5890 goto err; 5891 } 5892 /* we now have a detached poll request. reissue. */ 5893 ret = 0; 5894err: 5895 if (ret < 0) { 5896 spin_unlock(&ctx->completion_lock); 5897 req_set_fail(req); 5898 io_req_complete(req, ret); 5899 return 0; 5900 } 5901 /* only mask one event flags, keep behavior flags */ 5902 if (req->poll_update.update_events) { 5903 preq->poll.events &= ~0xffff; 5904 preq->poll.events |= req->poll_update.events & 0xffff; 5905 preq->poll.events |= IO_POLL_UNMASK; 5906 } 5907 if (req->poll_update.update_user_data) 5908 preq->user_data = req->poll_update.new_user_data; 5909 spin_unlock(&ctx->completion_lock); 5910 5911 /* complete update request, we're done with it */ 5912 io_req_complete(req, ret); 5913 5914 if (!completing) { 5915 ret = io_poll_add(preq, issue_flags); 5916 if (ret < 0) { 5917 req_set_fail(preq); 5918 io_req_complete(preq, ret); 5919 } 5920 } 5921 return 0; 5922} 5923 5924static void io_req_task_timeout(struct io_kiocb *req, bool *locked) 5925{ 5926 req_set_fail(req); 5927 io_req_complete_post(req, -ETIME, 0); 5928} 5929 5930static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) 5931{ 5932 struct io_timeout_data *data = container_of(timer, 5933 struct io_timeout_data, timer); 5934 struct io_kiocb *req = data->req; 5935 struct io_ring_ctx *ctx = req->ctx; 5936 unsigned long flags; 5937 5938 spin_lock_irqsave(&ctx->timeout_lock, flags); 5939 list_del_init(&req->timeout.list); 5940 atomic_set(&req->ctx->cq_timeouts, 5941 atomic_read(&req->ctx->cq_timeouts) + 1); 5942 spin_unlock_irqrestore(&ctx->timeout_lock, flags); 5943 5944 req->io_task_work.func = io_req_task_timeout; 5945 io_req_task_work_add(req); 5946 return HRTIMER_NORESTART; 5947} 5948 5949static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, 5950 __u64 user_data) 5951 __must_hold(&ctx->timeout_lock) 5952{ 5953 struct io_timeout_data *io; 5954 struct io_kiocb *req; 5955 bool found = false; 5956 5957 list_for_each_entry(req, &ctx->timeout_list, timeout.list) { 5958 found = user_data == req->user_data; 5959 if (found) 5960 break; 5961 } 5962 if (!found) 5963 
return ERR_PTR(-ENOENT); 5964 5965 io = req->async_data; 5966 if (hrtimer_try_to_cancel(&io->timer) == -1) 5967 return ERR_PTR(-EALREADY); 5968 list_del_init(&req->timeout.list); 5969 return req; 5970} 5971 5972static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 5973 __must_hold(&ctx->completion_lock) 5974 __must_hold(&ctx->timeout_lock) 5975{ 5976 struct io_kiocb *req = io_timeout_extract(ctx, user_data); 5977 5978 if (IS_ERR(req)) 5979 return PTR_ERR(req); 5980 5981 req_set_fail(req); 5982 io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); 5983 io_put_req_deferred(req); 5984 return 0; 5985} 5986 5987static clockid_t io_timeout_get_clock(struct io_timeout_data *data) 5988{ 5989 switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { 5990 case IORING_TIMEOUT_BOOTTIME: 5991 return CLOCK_BOOTTIME; 5992 case IORING_TIMEOUT_REALTIME: 5993 return CLOCK_REALTIME; 5994 default: 5995 /* can't happen, vetted at prep time */ 5996 WARN_ON_ONCE(1); 5997 fallthrough; 5998 case 0: 5999 return CLOCK_MONOTONIC; 6000 } 6001} 6002 6003static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 6004 struct timespec64 *ts, enum hrtimer_mode mode) 6005 __must_hold(&ctx->timeout_lock) 6006{ 6007 struct io_timeout_data *io; 6008 struct io_kiocb *req; 6009 bool found = false; 6010 6011 list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { 6012 found = user_data == req->user_data; 6013 if (found) 6014 break; 6015 } 6016 if (!found) 6017 return -ENOENT; 6018 6019 io = req->async_data; 6020 if (hrtimer_try_to_cancel(&io->timer) == -1) 6021 return -EALREADY; 6022 hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); 6023 io->timer.function = io_link_timeout_fn; 6024 hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); 6025 return 0; 6026} 6027 6028static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 6029 struct timespec64 *ts, enum hrtimer_mode mode) 6030 __must_hold(&ctx->timeout_lock) 6031{ 6032 struct io_kiocb *req = io_timeout_extract(ctx, user_data); 6033 struct io_timeout_data *data; 6034 6035 if (IS_ERR(req)) 6036 return PTR_ERR(req); 6037 6038 req->timeout.off = 0; /* noseq */ 6039 data = req->async_data; 6040 list_add_tail(&req->timeout.list, &ctx->timeout_list); 6041 hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); 6042 data->timer.function = io_timeout_fn; 6043 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); 6044 return 0; 6045} 6046 6047static int io_timeout_remove_prep(struct io_kiocb *req, 6048 const struct io_uring_sqe *sqe) 6049{ 6050 struct io_timeout_rem *tr = &req->timeout_rem; 6051 6052 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6053 return -EINVAL; 6054 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6055 return -EINVAL; 6056 if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) 6057 return -EINVAL; 6058 6059 tr->ltimeout = false; 6060 tr->addr = READ_ONCE(sqe->addr); 6061 tr->flags = READ_ONCE(sqe->timeout_flags); 6062 if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { 6063 if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) 6064 return -EINVAL; 6065 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) 6066 tr->ltimeout = true; 6067 if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) 6068 return -EINVAL; 6069 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) 6070 return -EFAULT; 6071 } else if (tr->flags) { 6072 /* timeout removal doesn't support flags */ 6073 return -EINVAL; 6074 } 6075 6076 return 0; 6077} 6078 6079static inline enum 
hrtimer_mode io_translate_timeout_mode(unsigned int flags) 6080{ 6081 return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS 6082 : HRTIMER_MODE_REL; 6083} 6084 6085/* 6086 * Remove or update an existing timeout command 6087 */ 6088static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) 6089{ 6090 struct io_timeout_rem *tr = &req->timeout_rem; 6091 struct io_ring_ctx *ctx = req->ctx; 6092 int ret; 6093 6094 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { 6095 spin_lock(&ctx->completion_lock); 6096 spin_lock_irq(&ctx->timeout_lock); 6097 ret = io_timeout_cancel(ctx, tr->addr); 6098 spin_unlock_irq(&ctx->timeout_lock); 6099 spin_unlock(&ctx->completion_lock); 6100 } else { 6101 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); 6102 6103 spin_lock_irq(&ctx->timeout_lock); 6104 if (tr->ltimeout) 6105 ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); 6106 else 6107 ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); 6108 spin_unlock_irq(&ctx->timeout_lock); 6109 } 6110 6111 if (ret < 0) 6112 req_set_fail(req); 6113 io_req_complete_post(req, ret, 0); 6114 return 0; 6115} 6116 6117static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, 6118 bool is_timeout_link) 6119{ 6120 struct io_timeout_data *data; 6121 unsigned flags; 6122 u32 off = READ_ONCE(sqe->off); 6123 6124 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6125 return -EINVAL; 6126 if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || 6127 sqe->splice_fd_in) 6128 return -EINVAL; 6129 if (off && is_timeout_link) 6130 return -EINVAL; 6131 flags = READ_ONCE(sqe->timeout_flags); 6132 if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK)) 6133 return -EINVAL; 6134 /* more than one clock specified is invalid, obviously */ 6135 if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) 6136 return -EINVAL; 6137 6138 INIT_LIST_HEAD(&req->timeout.list); 6139 req->timeout.off = off; 6140 if (unlikely(off && !req->ctx->off_timeout_used)) 6141 req->ctx->off_timeout_used = true; 6142 6143 if (!req->async_data && io_alloc_async_data(req)) 6144 return -ENOMEM; 6145 6146 data = req->async_data; 6147 data->req = req; 6148 data->flags = flags; 6149 6150 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) 6151 return -EFAULT; 6152 6153 data->mode = io_translate_timeout_mode(flags); 6154 hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); 6155 6156 if (is_timeout_link) { 6157 struct io_submit_link *link = &req->ctx->submit_state.link; 6158 6159 if (!link->head) 6160 return -EINVAL; 6161 if (link->last->opcode == IORING_OP_LINK_TIMEOUT) 6162 return -EINVAL; 6163 req->timeout.head = link->last; 6164 link->last->flags |= REQ_F_ARM_LTIMEOUT; 6165 } 6166 return 0; 6167} 6168 6169static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) 6170{ 6171 struct io_ring_ctx *ctx = req->ctx; 6172 struct io_timeout_data *data = req->async_data; 6173 struct list_head *entry; 6174 u32 tail, off = req->timeout.off; 6175 6176 spin_lock_irq(&ctx->timeout_lock); 6177 6178 /* 6179 * sqe->off holds how many events that need to occur for this 6180 * timeout event to be satisfied. If it isn't set, then this is 6181 * a pure timeout request, sequence isn't used. 6182 */ 6183 if (io_is_timeout_noseq(req)) { 6184 entry = ctx->timeout_list.prev; 6185 goto add; 6186 } 6187 6188 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 6189 req->timeout.target_seq = tail + off; 6190 6191 /* Update the last seq here in case io_flush_timeouts() hasn't. 
6192 * This is safe because ->completion_lock is held, and submissions 6193 * and completions are never mixed in the same ->completion_lock section. 6194 */ 6195 ctx->cq_last_tm_flush = tail; 6196 6197 /* 6198 * Insertion sort, ensuring the first entry in the list is always 6199 * the one we need first. 6200 */ 6201 list_for_each_prev(entry, &ctx->timeout_list) { 6202 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, 6203 timeout.list); 6204 6205 if (io_is_timeout_noseq(nxt)) 6206 continue; 6207 /* nxt.seq is behind @tail, otherwise would've been completed */ 6208 if (off >= nxt->timeout.target_seq - tail) 6209 break; 6210 } 6211add: 6212 list_add(&req->timeout.list, entry); 6213 data->timer.function = io_timeout_fn; 6214 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); 6215 spin_unlock_irq(&ctx->timeout_lock); 6216 return 0; 6217} 6218 6219struct io_cancel_data { 6220 struct io_ring_ctx *ctx; 6221 u64 user_data; 6222}; 6223 6224static bool io_cancel_cb(struct io_wq_work *work, void *data) 6225{ 6226 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6227 struct io_cancel_data *cd = data; 6228 6229 return req->ctx == cd->ctx && req->user_data == cd->user_data; 6230} 6231 6232static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, 6233 struct io_ring_ctx *ctx) 6234{ 6235 struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; 6236 enum io_wq_cancel cancel_ret; 6237 int ret = 0; 6238 6239 if (!tctx || !tctx->io_wq) 6240 return -ENOENT; 6241 6242 cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); 6243 switch (cancel_ret) { 6244 case IO_WQ_CANCEL_OK: 6245 ret = 0; 6246 break; 6247 case IO_WQ_CANCEL_RUNNING: 6248 ret = -EALREADY; 6249 break; 6250 case IO_WQ_CANCEL_NOTFOUND: 6251 ret = -ENOENT; 6252 break; 6253 } 6254 6255 return ret; 6256} 6257 6258static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) 6259{ 6260 struct io_ring_ctx *ctx = req->ctx; 6261 int ret; 6262 6263 WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); 6264 6265 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); 6266 if (ret != -ENOENT) 6267 return ret; 6268 6269 spin_lock(&ctx->completion_lock); 6270 spin_lock_irq(&ctx->timeout_lock); 6271 ret = io_timeout_cancel(ctx, sqe_addr); 6272 spin_unlock_irq(&ctx->timeout_lock); 6273 if (ret != -ENOENT) 6274 goto out; 6275 ret = io_poll_cancel(ctx, sqe_addr, false); 6276out: 6277 spin_unlock(&ctx->completion_lock); 6278 return ret; 6279} 6280 6281static int io_async_cancel_prep(struct io_kiocb *req, 6282 const struct io_uring_sqe *sqe) 6283{ 6284 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6285 return -EINVAL; 6286 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6287 return -EINVAL; 6288 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || 6289 sqe->splice_fd_in) 6290 return -EINVAL; 6291 6292 req->cancel.addr = READ_ONCE(sqe->addr); 6293 return 0; 6294} 6295 6296static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) 6297{ 6298 struct io_ring_ctx *ctx = req->ctx; 6299 u64 sqe_addr = req->cancel.addr; 6300 struct io_tctx_node *node; 6301 int ret; 6302 6303 ret = io_try_cancel_userdata(req, sqe_addr); 6304 if (ret != -ENOENT) 6305 goto done; 6306 6307 /* slow path, try all io-wq's */ 6308 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6309 ret = -ENOENT; 6310 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 6311 struct io_uring_task *tctx = node->task->io_uring; 
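		/*
		 * Probe this task's io-wq for a request on this ring matching
		 * the given user_data: -ENOENT means this io-wq doesn't know
		 * about it, so keep walking the list; 0 (cancelled) or
		 * -EALREADY (already running) ends the search.
		 */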
6312 6313 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); 6314 if (ret != -ENOENT) 6315 break; 6316 } 6317 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6318done: 6319 if (ret < 0) 6320 req_set_fail(req); 6321 io_req_complete_post(req, ret, 0); 6322 return 0; 6323} 6324 6325static int io_rsrc_update_prep(struct io_kiocb *req, 6326 const struct io_uring_sqe *sqe) 6327{ 6328 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6329 return -EINVAL; 6330 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) 6331 return -EINVAL; 6332 6333 req->rsrc_update.offset = READ_ONCE(sqe->off); 6334 req->rsrc_update.nr_args = READ_ONCE(sqe->len); 6335 if (!req->rsrc_update.nr_args) 6336 return -EINVAL; 6337 req->rsrc_update.arg = READ_ONCE(sqe->addr); 6338 return 0; 6339} 6340 6341static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 6342{ 6343 struct io_ring_ctx *ctx = req->ctx; 6344 struct io_uring_rsrc_update2 up; 6345 int ret; 6346 6347 up.offset = req->rsrc_update.offset; 6348 up.data = req->rsrc_update.arg; 6349 up.nr = 0; 6350 up.tags = 0; 6351 up.resv = 0; 6352 6353 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6354 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, 6355 &up, req->rsrc_update.nr_args); 6356 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6357 6358 if (ret < 0) 6359 req_set_fail(req); 6360 __io_req_complete(req, issue_flags, ret, 0); 6361 return 0; 6362} 6363 6364static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 6365{ 6366 switch (req->opcode) { 6367 case IORING_OP_NOP: 6368 return 0; 6369 case IORING_OP_READV: 6370 case IORING_OP_READ_FIXED: 6371 case IORING_OP_READ: 6372 return io_read_prep(req, sqe); 6373 case IORING_OP_WRITEV: 6374 case IORING_OP_WRITE_FIXED: 6375 case IORING_OP_WRITE: 6376 return io_write_prep(req, sqe); 6377 case IORING_OP_POLL_ADD: 6378 return io_poll_add_prep(req, sqe); 6379 case IORING_OP_POLL_REMOVE: 6380 return io_poll_update_prep(req, sqe); 6381 case IORING_OP_FSYNC: 6382 return io_fsync_prep(req, sqe); 6383 case IORING_OP_SYNC_FILE_RANGE: 6384 return io_sfr_prep(req, sqe); 6385 case IORING_OP_SENDMSG: 6386 case IORING_OP_SEND: 6387 return io_sendmsg_prep(req, sqe); 6388 case IORING_OP_RECVMSG: 6389 case IORING_OP_RECV: 6390 return io_recvmsg_prep(req, sqe); 6391 case IORING_OP_CONNECT: 6392 return io_connect_prep(req, sqe); 6393 case IORING_OP_TIMEOUT: 6394 return io_timeout_prep(req, sqe, false); 6395 case IORING_OP_TIMEOUT_REMOVE: 6396 return io_timeout_remove_prep(req, sqe); 6397 case IORING_OP_ASYNC_CANCEL: 6398 return io_async_cancel_prep(req, sqe); 6399 case IORING_OP_LINK_TIMEOUT: 6400 return io_timeout_prep(req, sqe, true); 6401 case IORING_OP_ACCEPT: 6402 return io_accept_prep(req, sqe); 6403 case IORING_OP_FALLOCATE: 6404 return io_fallocate_prep(req, sqe); 6405 case IORING_OP_OPENAT: 6406 return io_openat_prep(req, sqe); 6407 case IORING_OP_CLOSE: 6408 return io_close_prep(req, sqe); 6409 case IORING_OP_FILES_UPDATE: 6410 return io_rsrc_update_prep(req, sqe); 6411 case IORING_OP_STATX: 6412 return io_statx_prep(req, sqe); 6413 case IORING_OP_FADVISE: 6414 return io_fadvise_prep(req, sqe); 6415 case IORING_OP_MADVISE: 6416 return io_madvise_prep(req, sqe); 6417 case IORING_OP_OPENAT2: 6418 return io_openat2_prep(req, sqe); 6419 case IORING_OP_EPOLL_CTL: 6420 return io_epoll_ctl_prep(req, sqe); 6421 case IORING_OP_SPLICE: 6422 return io_splice_prep(req, sqe); 6423 case IORING_OP_PROVIDE_BUFFERS: 6424 return 
io_provide_buffers_prep(req, sqe); 6425 case IORING_OP_REMOVE_BUFFERS: 6426 return io_remove_buffers_prep(req, sqe); 6427 case IORING_OP_TEE: 6428 return io_tee_prep(req, sqe); 6429 case IORING_OP_SHUTDOWN: 6430 return io_shutdown_prep(req, sqe); 6431 case IORING_OP_RENAMEAT: 6432 return io_renameat_prep(req, sqe); 6433 case IORING_OP_UNLINKAT: 6434 return io_unlinkat_prep(req, sqe); 6435 case IORING_OP_MKDIRAT: 6436 return io_mkdirat_prep(req, sqe); 6437 case IORING_OP_SYMLINKAT: 6438 return io_symlinkat_prep(req, sqe); 6439 case IORING_OP_LINKAT: 6440 return io_linkat_prep(req, sqe); 6441 } 6442 6443 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", 6444 req->opcode); 6445 return -EINVAL; 6446} 6447 6448static int io_req_prep_async(struct io_kiocb *req) 6449{ 6450 if (!io_op_defs[req->opcode].needs_async_setup) 6451 return 0; 6452 if (WARN_ON_ONCE(req->async_data)) 6453 return -EFAULT; 6454 if (io_alloc_async_data(req)) 6455 return -EAGAIN; 6456 6457 switch (req->opcode) { 6458 case IORING_OP_READV: 6459 return io_rw_prep_async(req, READ); 6460 case IORING_OP_WRITEV: 6461 return io_rw_prep_async(req, WRITE); 6462 case IORING_OP_SENDMSG: 6463 return io_sendmsg_prep_async(req); 6464 case IORING_OP_RECVMSG: 6465 return io_recvmsg_prep_async(req); 6466 case IORING_OP_CONNECT: 6467 return io_connect_prep_async(req); 6468 } 6469 printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", 6470 req->opcode); 6471 return -EFAULT; 6472} 6473 6474static u32 io_get_sequence(struct io_kiocb *req) 6475{ 6476 u32 seq = req->ctx->cached_sq_head; 6477 6478 /* need original cached_sq_head, but it was increased for each req */ 6479 io_for_each_link(req, req) 6480 seq--; 6481 return seq; 6482} 6483 6484static bool io_drain_req(struct io_kiocb *req) 6485{ 6486 struct io_kiocb *pos; 6487 struct io_ring_ctx *ctx = req->ctx; 6488 struct io_defer_entry *de; 6489 int ret; 6490 u32 seq; 6491 6492 if (req->flags & REQ_F_FAIL) { 6493 io_req_complete_fail_submit(req); 6494 return true; 6495 } 6496 6497 /* 6498 * If we need to drain a request in the middle of a link, drain the 6499 * head request and the next request/link after the current link. 6500 * Considering sequential execution of links, IOSQE_IO_DRAIN will be 6501 * maintained for every request of our link. 6502 */ 6503 if (ctx->drain_next) { 6504 req->flags |= REQ_F_IO_DRAIN; 6505 ctx->drain_next = false; 6506 } 6507 /* not interested in head, start from the first linked */ 6508 io_for_each_link(pos, req->link) { 6509 if (pos->flags & REQ_F_IO_DRAIN) { 6510 ctx->drain_next = true; 6511 req->flags |= REQ_F_IO_DRAIN; 6512 break; 6513 } 6514 } 6515 6516 /* Still need defer if there is pending req in defer list. 
*/ 6517 if (likely(list_empty_careful(&ctx->defer_list) && 6518 !(req->flags & REQ_F_IO_DRAIN))) { 6519 ctx->drain_active = false; 6520 return false; 6521 } 6522 6523 seq = io_get_sequence(req); 6524 /* Still a chance to pass the sequence check */ 6525 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) 6526 return false; 6527 6528 ret = io_req_prep_async(req); 6529 if (ret) 6530 goto fail; 6531 io_prep_async_link(req); 6532 de = kmalloc(sizeof(*de), GFP_KERNEL); 6533 if (!de) { 6534 ret = -ENOMEM; 6535fail: 6536 io_req_complete_failed(req, ret); 6537 return true; 6538 } 6539 6540 spin_lock(&ctx->completion_lock); 6541 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { 6542 spin_unlock(&ctx->completion_lock); 6543 kfree(de); 6544 io_queue_async_work(req, NULL); 6545 return true; 6546 } 6547 6548 trace_io_uring_defer(ctx, req, req->user_data); 6549 de->req = req; 6550 de->seq = seq; 6551 list_add_tail(&de->list, &ctx->defer_list); 6552 spin_unlock(&ctx->completion_lock); 6553 return true; 6554} 6555 6556static void io_clean_op(struct io_kiocb *req) 6557{ 6558 if (req->flags & REQ_F_BUFFER_SELECTED) { 6559 switch (req->opcode) { 6560 case IORING_OP_READV: 6561 case IORING_OP_READ_FIXED: 6562 case IORING_OP_READ: 6563 kfree((void *)(unsigned long)req->rw.addr); 6564 break; 6565 case IORING_OP_RECVMSG: 6566 case IORING_OP_RECV: 6567 kfree(req->sr_msg.kbuf); 6568 break; 6569 } 6570 } 6571 6572 if (req->flags & REQ_F_NEED_CLEANUP) { 6573 switch (req->opcode) { 6574 case IORING_OP_READV: 6575 case IORING_OP_READ_FIXED: 6576 case IORING_OP_READ: 6577 case IORING_OP_WRITEV: 6578 case IORING_OP_WRITE_FIXED: 6579 case IORING_OP_WRITE: { 6580 struct io_async_rw *io = req->async_data; 6581 6582 kfree(io->free_iovec); 6583 break; 6584 } 6585 case IORING_OP_RECVMSG: 6586 case IORING_OP_SENDMSG: { 6587 struct io_async_msghdr *io = req->async_data; 6588 6589 kfree(io->free_iov); 6590 break; 6591 } 6592 case IORING_OP_SPLICE: 6593 case IORING_OP_TEE: 6594 if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED)) 6595 io_put_file(req->splice.file_in); 6596 break; 6597 case IORING_OP_OPENAT: 6598 case IORING_OP_OPENAT2: 6599 if (req->open.filename) 6600 putname(req->open.filename); 6601 break; 6602 case IORING_OP_RENAMEAT: 6603 putname(req->rename.oldpath); 6604 putname(req->rename.newpath); 6605 break; 6606 case IORING_OP_UNLINKAT: 6607 putname(req->unlink.filename); 6608 break; 6609 case IORING_OP_MKDIRAT: 6610 putname(req->mkdir.filename); 6611 break; 6612 case IORING_OP_SYMLINKAT: 6613 putname(req->symlink.oldpath); 6614 putname(req->symlink.newpath); 6615 break; 6616 case IORING_OP_LINKAT: 6617 putname(req->hardlink.oldpath); 6618 putname(req->hardlink.newpath); 6619 break; 6620 } 6621 } 6622 if ((req->flags & REQ_F_POLLED) && req->apoll) { 6623 kfree(req->apoll->double_poll); 6624 kfree(req->apoll); 6625 req->apoll = NULL; 6626 } 6627 if (req->flags & REQ_F_INFLIGHT) { 6628 struct io_uring_task *tctx = req->task->io_uring; 6629 6630 atomic_dec(&tctx->inflight_tracked); 6631 } 6632 if (req->flags & REQ_F_CREDS) 6633 put_cred(req->creds); 6634 6635 req->flags &= ~IO_REQ_CLEAN_FLAGS; 6636} 6637 6638static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) 6639{ 6640 struct io_ring_ctx *ctx = req->ctx; 6641 const struct cred *creds = NULL; 6642 int ret; 6643 6644 if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) 6645 creds = override_creds(req->creds); 6646 6647 switch (req->opcode) { 6648 case IORING_OP_NOP: 6649 ret = io_nop(req, issue_flags); 6650 
break; 6651 case IORING_OP_READV: 6652 case IORING_OP_READ_FIXED: 6653 case IORING_OP_READ: 6654 ret = io_read(req, issue_flags); 6655 break; 6656 case IORING_OP_WRITEV: 6657 case IORING_OP_WRITE_FIXED: 6658 case IORING_OP_WRITE: 6659 ret = io_write(req, issue_flags); 6660 break; 6661 case IORING_OP_FSYNC: 6662 ret = io_fsync(req, issue_flags); 6663 break; 6664 case IORING_OP_POLL_ADD: 6665 ret = io_poll_add(req, issue_flags); 6666 break; 6667 case IORING_OP_POLL_REMOVE: 6668 ret = io_poll_update(req, issue_flags); 6669 break; 6670 case IORING_OP_SYNC_FILE_RANGE: 6671 ret = io_sync_file_range(req, issue_flags); 6672 break; 6673 case IORING_OP_SENDMSG: 6674 ret = io_sendmsg(req, issue_flags); 6675 break; 6676 case IORING_OP_SEND: 6677 ret = io_send(req, issue_flags); 6678 break; 6679 case IORING_OP_RECVMSG: 6680 ret = io_recvmsg(req, issue_flags); 6681 break; 6682 case IORING_OP_RECV: 6683 ret = io_recv(req, issue_flags); 6684 break; 6685 case IORING_OP_TIMEOUT: 6686 ret = io_timeout(req, issue_flags); 6687 break; 6688 case IORING_OP_TIMEOUT_REMOVE: 6689 ret = io_timeout_remove(req, issue_flags); 6690 break; 6691 case IORING_OP_ACCEPT: 6692 ret = io_accept(req, issue_flags); 6693 break; 6694 case IORING_OP_CONNECT: 6695 ret = io_connect(req, issue_flags); 6696 break; 6697 case IORING_OP_ASYNC_CANCEL: 6698 ret = io_async_cancel(req, issue_flags); 6699 break; 6700 case IORING_OP_FALLOCATE: 6701 ret = io_fallocate(req, issue_flags); 6702 break; 6703 case IORING_OP_OPENAT: 6704 ret = io_openat(req, issue_flags); 6705 break; 6706 case IORING_OP_CLOSE: 6707 ret = io_close(req, issue_flags); 6708 break; 6709 case IORING_OP_FILES_UPDATE: 6710 ret = io_files_update(req, issue_flags); 6711 break; 6712 case IORING_OP_STATX: 6713 ret = io_statx(req, issue_flags); 6714 break; 6715 case IORING_OP_FADVISE: 6716 ret = io_fadvise(req, issue_flags); 6717 break; 6718 case IORING_OP_MADVISE: 6719 ret = io_madvise(req, issue_flags); 6720 break; 6721 case IORING_OP_OPENAT2: 6722 ret = io_openat2(req, issue_flags); 6723 break; 6724 case IORING_OP_EPOLL_CTL: 6725 ret = io_epoll_ctl(req, issue_flags); 6726 break; 6727 case IORING_OP_SPLICE: 6728 ret = io_splice(req, issue_flags); 6729 break; 6730 case IORING_OP_PROVIDE_BUFFERS: 6731 ret = io_provide_buffers(req, issue_flags); 6732 break; 6733 case IORING_OP_REMOVE_BUFFERS: 6734 ret = io_remove_buffers(req, issue_flags); 6735 break; 6736 case IORING_OP_TEE: 6737 ret = io_tee(req, issue_flags); 6738 break; 6739 case IORING_OP_SHUTDOWN: 6740 ret = io_shutdown(req, issue_flags); 6741 break; 6742 case IORING_OP_RENAMEAT: 6743 ret = io_renameat(req, issue_flags); 6744 break; 6745 case IORING_OP_UNLINKAT: 6746 ret = io_unlinkat(req, issue_flags); 6747 break; 6748 case IORING_OP_MKDIRAT: 6749 ret = io_mkdirat(req, issue_flags); 6750 break; 6751 case IORING_OP_SYMLINKAT: 6752 ret = io_symlinkat(req, issue_flags); 6753 break; 6754 case IORING_OP_LINKAT: 6755 ret = io_linkat(req, issue_flags); 6756 break; 6757 default: 6758 ret = -EINVAL; 6759 break; 6760 } 6761 6762 if (creds) 6763 revert_creds(creds); 6764 if (ret) 6765 return ret; 6766 /* If the op doesn't have a file, we're not polling for it */ 6767 if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) 6768 io_iopoll_req_issued(req); 6769 6770 return 0; 6771} 6772 6773static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) 6774{ 6775 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6776 6777 req = io_put_req_find_next(req); 6778 return req ? 
&req->work : NULL;
}

static void io_wq_submit_work(struct io_wq_work *work)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
	struct io_kiocb *timeout;
	int ret = 0;

	/* one will be dropped by ->io_free_work() after returning to io-wq */
	if (!(req->flags & REQ_F_REFCOUNT))
		__io_req_set_refcount(req, 2);
	else
		req_ref_get(req);

	timeout = io_prep_linked_timeout(req);
	if (timeout)
		io_queue_linked_timeout(timeout);

	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
	if (work->flags & IO_WQ_WORK_CANCEL)
		ret = -ECANCELED;

	if (!ret) {
		do {
			ret = io_issue_sqe(req, 0);
			/*
			 * We can get EAGAIN for polled IO even though we're
			 * forcing a sync submission from here, since we can't
			 * wait for request slots on the block side.
			 */
			if (ret != -EAGAIN)
				break;
			cond_resched();
		} while (1);
	}

	/* avoid locking problems by failing it from a clean context */
	if (ret)
		io_req_task_queue_fail(req, ret);
}

static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
						       unsigned i)
{
	return &table->files[i];
}

static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
					      int index)
{
	struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);

	return (struct file *) (slot->file_ptr & FFS_MASK);
}

static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
{
	unsigned long file_ptr = (unsigned long) file;

	if (__io_file_supports_nowait(file, READ))
		file_ptr |= FFS_ASYNC_READ;
	if (__io_file_supports_nowait(file, WRITE))
		file_ptr |= FFS_ASYNC_WRITE;
	if (S_ISREG(file_inode(file)->i_mode))
		file_ptr |= FFS_ISREG;
	file_slot->file_ptr = file_ptr;
}

static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
					     struct io_kiocb *req, int fd)
{
	struct file *file;
	unsigned long file_ptr;

	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
		return NULL;
	fd = array_index_nospec(fd, ctx->nr_user_files);
	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
	file = (struct file *) (file_ptr & FFS_MASK);
	file_ptr &= ~FFS_MASK;
	/* mask in overlapping REQ_F and FFS bits */
	req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
	io_req_set_rsrc_node(req);
	return file;
}

static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
				       struct io_kiocb *req, int fd)
{
	struct file *file = fget(fd);

	trace_io_uring_file_get(ctx, fd);

	/* we don't allow fixed io_uring files */
	if (file && unlikely(file->f_op == &io_uring_fops))
		io_req_track_inflight(req);
	return file;
}

static inline struct file *io_file_get(struct io_ring_ctx *ctx,
				       struct io_kiocb *req, int fd, bool fixed)
{
	if (fixed)
		return io_file_get_fixed(ctx, req, fd);
	else
		return io_file_get_normal(ctx, req, fd);
}

static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
{
	struct io_kiocb *prev = req->timeout.prev;
	int ret;

	if (prev) {
		ret = io_try_cancel_userdata(req, prev->user_data);
		io_req_complete_post(req, ret ?: -ETIME, 0);
		io_put_req(prev);
	} else {
		io_req_complete_post(req, -ETIME, 0);
	}
}

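/*
 * A linked timeout is requested by userspace by queueing an
 * IORING_OP_LINK_TIMEOUT SQE directly behind an SQE that has IOSQE_IO_LINK
 * set, roughly as in this illustrative liburing-style sketch (helper names
 * per liburing, not taken from this file):
 *
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_read(sqe, fd, buf, len, 0);
 *	sqe->flags |= IOSQE_IO_LINK;
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_link_timeout(sqe, &ts, 0);
 *
 * io_timeout_prep() with is_timeout_link == true records the preceding
 * request in req->timeout.head and marks it REQ_F_ARM_LTIMEOUT. If the
 * hrtimer below fires before that request completes, the request is detached
 * from the link and cancelled from task context via
 * io_req_task_link_timeout() above.
 */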
6901static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) 6902{ 6903 struct io_timeout_data *data = container_of(timer, 6904 struct io_timeout_data, timer); 6905 struct io_kiocb *prev, *req = data->req; 6906 struct io_ring_ctx *ctx = req->ctx; 6907 unsigned long flags; 6908 6909 spin_lock_irqsave(&ctx->timeout_lock, flags); 6910 prev = req->timeout.head; 6911 req->timeout.head = NULL; 6912 6913 /* 6914 * We don't expect the list to be empty, that will only happen if we 6915 * race with the completion of the linked work. 6916 */ 6917 if (prev) { 6918 io_remove_next_linked(prev); 6919 if (!req_ref_inc_not_zero(prev)) 6920 prev = NULL; 6921 } 6922 list_del(&req->timeout.list); 6923 req->timeout.prev = prev; 6924 spin_unlock_irqrestore(&ctx->timeout_lock, flags); 6925 6926 req->io_task_work.func = io_req_task_link_timeout; 6927 io_req_task_work_add(req); 6928 return HRTIMER_NORESTART; 6929} 6930 6931static void io_queue_linked_timeout(struct io_kiocb *req) 6932{ 6933 struct io_ring_ctx *ctx = req->ctx; 6934 6935 spin_lock_irq(&ctx->timeout_lock); 6936 /* 6937 * If the back reference is NULL, then our linked request finished 6938 * before we got a chance to setup the timer 6939 */ 6940 if (req->timeout.head) { 6941 struct io_timeout_data *data = req->async_data; 6942 6943 data->timer.function = io_link_timeout_fn; 6944 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), 6945 data->mode); 6946 list_add_tail(&req->timeout.list, &ctx->ltimeout_list); 6947 } 6948 spin_unlock_irq(&ctx->timeout_lock); 6949 /* drop submission reference */ 6950 io_put_req(req); 6951} 6952 6953static void __io_queue_sqe(struct io_kiocb *req) 6954 __must_hold(&req->ctx->uring_lock) 6955{ 6956 struct io_kiocb *linked_timeout; 6957 int ret; 6958 6959issue_sqe: 6960 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); 6961 6962 /* 6963 * We async punt it if the file wasn't marked NOWAIT, or if the file 6964 * doesn't support non-blocking read/write attempts 6965 */ 6966 if (likely(!ret)) { 6967 if (req->flags & REQ_F_COMPLETE_INLINE) { 6968 struct io_ring_ctx *ctx = req->ctx; 6969 struct io_submit_state *state = &ctx->submit_state; 6970 6971 state->compl_reqs[state->compl_nr++] = req; 6972 if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) 6973 io_submit_flush_completions(ctx); 6974 return; 6975 } 6976 6977 linked_timeout = io_prep_linked_timeout(req); 6978 if (linked_timeout) 6979 io_queue_linked_timeout(linked_timeout); 6980 } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { 6981 linked_timeout = io_prep_linked_timeout(req); 6982 6983 switch (io_arm_poll_handler(req)) { 6984 case IO_APOLL_READY: 6985 if (linked_timeout) 6986 io_unprep_linked_timeout(req); 6987 goto issue_sqe; 6988 case IO_APOLL_ABORTED: 6989 /* 6990 * Queued up for async execution, worker will release 6991 * submit reference when the iocb is actually submitted. 
6992 */ 6993 io_queue_async_work(req, NULL); 6994 break; 6995 } 6996 6997 if (linked_timeout) 6998 io_queue_linked_timeout(linked_timeout); 6999 } else { 7000 io_req_complete_failed(req, ret); 7001 } 7002} 7003 7004static inline void io_queue_sqe(struct io_kiocb *req) 7005 __must_hold(&req->ctx->uring_lock) 7006{ 7007 if (unlikely(req->ctx->drain_active) && io_drain_req(req)) 7008 return; 7009 7010 if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) { 7011 __io_queue_sqe(req); 7012 } else if (req->flags & REQ_F_FAIL) { 7013 io_req_complete_fail_submit(req); 7014 } else { 7015 int ret = io_req_prep_async(req); 7016 7017 if (unlikely(ret)) 7018 io_req_complete_failed(req, ret); 7019 else 7020 io_queue_async_work(req, NULL); 7021 } 7022} 7023 7024/* 7025 * Check SQE restrictions (opcode and flags). 7026 * 7027 * Returns 'true' if SQE is allowed, 'false' otherwise. 7028 */ 7029static inline bool io_check_restriction(struct io_ring_ctx *ctx, 7030 struct io_kiocb *req, 7031 unsigned int sqe_flags) 7032{ 7033 if (likely(!ctx->restricted)) 7034 return true; 7035 7036 if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) 7037 return false; 7038 7039 if ((sqe_flags & ctx->restrictions.sqe_flags_required) != 7040 ctx->restrictions.sqe_flags_required) 7041 return false; 7042 7043 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | 7044 ctx->restrictions.sqe_flags_required)) 7045 return false; 7046 7047 return true; 7048} 7049 7050static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, 7051 const struct io_uring_sqe *sqe) 7052 __must_hold(&ctx->uring_lock) 7053{ 7054 struct io_submit_state *state; 7055 unsigned int sqe_flags; 7056 int personality, ret = 0; 7057 7058 /* req is partially pre-initialised, see io_preinit_req() */ 7059 req->opcode = READ_ONCE(sqe->opcode); 7060 /* same numerical values with corresponding REQ_F_*, safe to copy */ 7061 req->flags = sqe_flags = READ_ONCE(sqe->flags); 7062 req->user_data = READ_ONCE(sqe->user_data); 7063 req->file = NULL; 7064 req->fixed_rsrc_refs = NULL; 7065 req->task = current; 7066 7067 /* enforce forwards compatibility on users */ 7068 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) 7069 return -EINVAL; 7070 if (unlikely(req->opcode >= IORING_OP_LAST)) 7071 return -EINVAL; 7072 if (!io_check_restriction(ctx, req, sqe_flags)) 7073 return -EACCES; 7074 7075 if ((sqe_flags & IOSQE_BUFFER_SELECT) && 7076 !io_op_defs[req->opcode].buffer_select) 7077 return -EOPNOTSUPP; 7078 if (unlikely(sqe_flags & IOSQE_IO_DRAIN)) 7079 ctx->drain_active = true; 7080 7081 personality = READ_ONCE(sqe->personality); 7082 if (personality) { 7083 req->creds = xa_load(&ctx->personalities, personality); 7084 if (!req->creds) 7085 return -EINVAL; 7086 get_cred(req->creds); 7087 req->flags |= REQ_F_CREDS; 7088 } 7089 state = &ctx->submit_state; 7090 7091 /* 7092 * Plug now if we have more than 1 IO left after this, and the target 7093 * is potentially a read/write to block based storage. 
 */
	if (!state->plug_started && state->ios_left > 1 &&
	    io_op_defs[req->opcode].plug) {
		blk_start_plug(&state->plug);
		state->plug_started = true;
	}

	if (io_op_defs[req->opcode].needs_file) {
		req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
					(sqe_flags & IOSQE_FIXED_FILE));
		if (unlikely(!req->file))
			ret = -EBADF;
	}

	state->ios_left--;
	return ret;
}

static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
			 const struct io_uring_sqe *sqe)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_link *link = &ctx->submit_state.link;
	int ret;

	ret = io_init_req(ctx, req, sqe);
	if (unlikely(ret)) {
fail_req:
		/* fail even hard links since we don't submit */
		if (link->head) {
			/*
			 * Whether a link req failed or was cancelled can be
			 * judged by REQ_F_FAIL, but the head is an exception:
			 * it may carry REQ_F_FAIL only because another req in
			 * the chain failed. Leverage req->result to
			 * distinguish a head that failed itself from one that
			 * merely inherited the failure, so the correct ret
			 * code can be set for it. Init the result here to
			 * avoid affecting the normal path.
			 */
			if (!(link->head->flags & REQ_F_FAIL))
				req_fail_link_node(link->head, -ECANCELED);
		} else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
			/*
			 * The current req is a normal req; return the error
			 * and thus break the submission loop.
			 */
			io_req_complete_failed(req, ret);
			return ret;
		}
		req_fail_link_node(req, ret);
	} else {
		ret = io_req_prep(req, sqe);
		if (unlikely(ret))
			goto fail_req;
	}

	/* don't need @sqe from now on */
	trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
				  req->flags, true,
				  ctx->flags & IORING_SETUP_SQPOLL);

	/*
	 * If we already have a head request, queue this one for async
	 * submittal once the head completes. If we don't have a head but
	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
	 * submitted sync once the chain is complete. If none of those
	 * conditions are true (normal request), then just queue it.
	 */
	if (link->head) {
		struct io_kiocb *head = link->head;

		if (!(req->flags & REQ_F_FAIL)) {
			ret = io_req_prep_async(req);
			if (unlikely(ret)) {
				req_fail_link_node(req, ret);
				if (!(head->flags & REQ_F_FAIL))
					req_fail_link_node(head, -ECANCELED);
			}
		}
		trace_io_uring_link(ctx, req, head);
		link->last->link = req;
		link->last = req;

		/* last request of a link, enqueue the link */
		if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
			link->head = NULL;
			io_queue_sqe(head);
		}
	} else {
		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
			link->head = req;
			link->last = req;
		} else {
			io_queue_sqe(req);
		}
	}

	return 0;
}

/*
 * Batched submission is done, ensure local IO is flushed out.
 */
static void io_submit_state_end(struct io_submit_state *state,
				struct io_ring_ctx *ctx)
{
	if (state->link.head)
		io_queue_sqe(state->link.head);
	if (state->compl_nr)
		io_submit_flush_completions(ctx);
	if (state->plug_started)
		blk_finish_plug(&state->plug);
}

/*
 * Start submission side cache.
 */
static void io_submit_state_start(struct io_submit_state *state,
				  unsigned int max_ios)
{
	state->plug_started = false;
	state->ios_left = max_ios;
	/* set only head, no need to init link_last in advance */
	state->link.head = NULL;
}

static void io_commit_sqring(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	/*
	 * Ensure any loads from the SQEs are done at this point,
	 * since once we write the new head, the application could
	 * write new data to them.
	 */
	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}

/*
 * Fetch an sqe, if one is available. Note this returns a pointer to memory
 * that is mapped by userspace. This means that care needs to be taken to
 * ensure that reads are stable, as we cannot rely on userspace always
 * being a good citizen. If members of the sqe are validated and then later
 * used, it's important that those reads are done through READ_ONCE() to
 * prevent a re-load down the line.
 */
static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
	unsigned head, mask = ctx->sq_entries - 1;
	unsigned sq_idx = ctx->cached_sq_head++ & mask;

	/*
	 * The cached sq head (or cq tail) serves two purposes:
	 *
	 * 1) allows us to batch the cost of updating the user visible
	 *    head updates.
	 * 2) allows the kernel side to track the head on its own, even
	 *    though the application is the one updating it.
	 */
	head = READ_ONCE(ctx->sq_array[sq_idx]);
	if (likely(head < ctx->sq_entries))
		return &ctx->sq_sqes[head];

	/* drop invalid entries */
	ctx->cq_extra--;
	WRITE_ONCE(ctx->rings->sq_dropped,
		   READ_ONCE(ctx->rings->sq_dropped) + 1);
	return NULL;
}

static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
	__must_hold(&ctx->uring_lock)
{
	int submitted = 0;

	/* make sure SQ entry isn't read before tail */
	nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
	if (!percpu_ref_tryget_many(&ctx->refs, nr))
		return -EAGAIN;
	io_get_task_refs(nr);

	io_submit_state_start(&ctx->submit_state, nr);
	while (submitted < nr) {
		const struct io_uring_sqe *sqe;
		struct io_kiocb *req;

		req = io_alloc_req(ctx);
		if (unlikely(!req)) {
			if (!submitted)
				submitted = -EAGAIN;
			break;
		}
		sqe = io_get_sqe(ctx);
		if (unlikely(!sqe)) {
			list_add(&req->inflight_entry, &ctx->submit_state.free_list);
			break;
		}
		/* will complete beyond this point, count as submitted */
		submitted++;
		if (io_submit_sqe(ctx, req, sqe))
			break;
	}

	if (unlikely(submitted != nr)) {
		int ref_used = (submitted == -EAGAIN) ?
0 : submitted; 7299 int unused = nr - ref_used; 7300 7301 current->io_uring->cached_refs += unused; 7302 percpu_ref_put_many(&ctx->refs, unused); 7303 } 7304 7305 io_submit_state_end(&ctx->submit_state, ctx); 7306 /* Commit SQ ring head once we've consumed and submitted all SQEs */ 7307 io_commit_sqring(ctx); 7308 7309 return submitted; 7310} 7311 7312static inline bool io_sqd_events_pending(struct io_sq_data *sqd) 7313{ 7314 return READ_ONCE(sqd->state); 7315} 7316 7317static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) 7318{ 7319 /* Tell userspace we may need a wakeup call */ 7320 spin_lock(&ctx->completion_lock); 7321 WRITE_ONCE(ctx->rings->sq_flags, 7322 ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); 7323 spin_unlock(&ctx->completion_lock); 7324} 7325 7326static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) 7327{ 7328 spin_lock(&ctx->completion_lock); 7329 WRITE_ONCE(ctx->rings->sq_flags, 7330 ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); 7331 spin_unlock(&ctx->completion_lock); 7332} 7333 7334static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) 7335{ 7336 unsigned int to_submit; 7337 int ret = 0; 7338 7339 to_submit = io_sqring_entries(ctx); 7340 /* if we're handling multiple rings, cap submit size for fairness */ 7341 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) 7342 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; 7343 7344 if (!list_empty(&ctx->iopoll_list) || to_submit) { 7345 unsigned nr_events = 0; 7346 const struct cred *creds = NULL; 7347 7348 if (ctx->sq_creds != current_cred()) 7349 creds = override_creds(ctx->sq_creds); 7350 7351 mutex_lock(&ctx->uring_lock); 7352 if (!list_empty(&ctx->iopoll_list)) 7353 io_do_iopoll(ctx, &nr_events, 0); 7354 7355 /* 7356 * Don't submit if refs are dying, good for io_uring_register(), 7357 * but also it is relied upon by io_ring_exit_work() 7358 */ 7359 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && 7360 !(ctx->flags & IORING_SETUP_R_DISABLED)) 7361 ret = io_submit_sqes(ctx, to_submit); 7362 mutex_unlock(&ctx->uring_lock); 7363 7364 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) 7365 wake_up(&ctx->sqo_sq_wait); 7366 if (creds) 7367 revert_creds(creds); 7368 } 7369 7370 return ret; 7371} 7372 7373static void io_sqd_update_thread_idle(struct io_sq_data *sqd) 7374{ 7375 struct io_ring_ctx *ctx; 7376 unsigned sq_thread_idle = 0; 7377 7378 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7379 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); 7380 sqd->sq_thread_idle = sq_thread_idle; 7381} 7382 7383static bool io_sqd_handle_event(struct io_sq_data *sqd) 7384{ 7385 bool did_sig = false; 7386 struct ksignal ksig; 7387 7388 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || 7389 signal_pending(current)) { 7390 mutex_unlock(&sqd->lock); 7391 if (signal_pending(current)) 7392 did_sig = get_signal(&ksig); 7393 cond_resched(); 7394 mutex_lock(&sqd->lock); 7395 } 7396 return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7397} 7398 7399static int io_sq_thread(void *data) 7400{ 7401 struct io_sq_data *sqd = data; 7402 struct io_ring_ctx *ctx; 7403 unsigned long timeout = 0; 7404 char buf[TASK_COMM_LEN]; 7405 DEFINE_WAIT(wait); 7406 7407 snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); 7408 set_task_comm(current, buf); 7409 7410 if (sqd->sq_cpu != -1) 7411 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); 7412 else 7413 set_cpus_allowed_ptr(current, cpu_online_mask); 7414 current->flags |= PF_NO_SETAFFINITY; 7415 7416 
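	/*
	 * Main SQPOLL loop: with sqd->lock held, submit pending SQEs and run
	 * iopoll for every ring attached to this sq_data, plus any task_work.
	 * While there is activity, keep spinning and push the idle deadline
	 * out by sq_thread_idle; once the deadline passes with nothing to do,
	 * set IORING_SQ_NEED_WAKEUP on each ring and sleep until woken,
	 * typically by userspace calling io_uring_enter() with
	 * IORING_ENTER_SQ_WAKEUP.
	 */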
mutex_lock(&sqd->lock); 7417 while (1) { 7418 bool cap_entries, sqt_spin = false; 7419 7420 if (io_sqd_events_pending(sqd) || signal_pending(current)) { 7421 if (io_sqd_handle_event(sqd)) 7422 break; 7423 timeout = jiffies + sqd->sq_thread_idle; 7424 } 7425 7426 cap_entries = !list_is_singular(&sqd->ctx_list); 7427 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 7428 int ret = __io_sq_thread(ctx, cap_entries); 7429 7430 if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) 7431 sqt_spin = true; 7432 } 7433 if (io_run_task_work()) 7434 sqt_spin = true; 7435 7436 if (sqt_spin || !time_after(jiffies, timeout)) { 7437 cond_resched(); 7438 if (sqt_spin) 7439 timeout = jiffies + sqd->sq_thread_idle; 7440 continue; 7441 } 7442 7443 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); 7444 if (!io_sqd_events_pending(sqd) && !current->task_works) { 7445 bool needs_sched = true; 7446 7447 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 7448 io_ring_set_wakeup_flag(ctx); 7449 7450 if ((ctx->flags & IORING_SETUP_IOPOLL) && 7451 !list_empty_careful(&ctx->iopoll_list)) { 7452 needs_sched = false; 7453 break; 7454 } 7455 if (io_sqring_entries(ctx)) { 7456 needs_sched = false; 7457 break; 7458 } 7459 } 7460 7461 if (needs_sched) { 7462 mutex_unlock(&sqd->lock); 7463 schedule(); 7464 mutex_lock(&sqd->lock); 7465 } 7466 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7467 io_ring_clear_wakeup_flag(ctx); 7468 } 7469 7470 finish_wait(&sqd->wait, &wait); 7471 timeout = jiffies + sqd->sq_thread_idle; 7472 } 7473 7474 io_uring_cancel_generic(true, sqd); 7475 sqd->thread = NULL; 7476 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7477 io_ring_set_wakeup_flag(ctx); 7478 io_run_task_work(); 7479 mutex_unlock(&sqd->lock); 7480 7481 complete(&sqd->exited); 7482 do_exit(0); 7483} 7484 7485struct io_wait_queue { 7486 struct wait_queue_entry wq; 7487 struct io_ring_ctx *ctx; 7488 unsigned cq_tail; 7489 unsigned nr_timeouts; 7490}; 7491 7492static inline bool io_should_wake(struct io_wait_queue *iowq) 7493{ 7494 struct io_ring_ctx *ctx = iowq->ctx; 7495 int dist = ctx->cached_cq_tail - (int) iowq->cq_tail; 7496 7497 /* 7498 * Wake up if we have enough events, or if a timeout occurred since we 7499 * started waiting. For timeouts, we always want to return to userspace, 7500 * regardless of event count. 7501 */ 7502 return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; 7503} 7504 7505static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, 7506 int wake_flags, void *key) 7507{ 7508 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, 7509 wq); 7510 7511 /* 7512 * Cannot safely flush overflowed CQEs from here, ensure we wake up 7513 * the task, and the next invocation will do it. 
7514 */ 7515 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) 7516 return autoremove_wake_function(curr, mode, wake_flags, key); 7517 return -1; 7518} 7519 7520static int io_run_task_work_sig(void) 7521{ 7522 if (io_run_task_work()) 7523 return 1; 7524 if (!signal_pending(current)) 7525 return 0; 7526 if (test_thread_flag(TIF_NOTIFY_SIGNAL)) 7527 return -ERESTARTSYS; 7528 return -EINTR; 7529} 7530 7531/* when returns >0, the caller should retry */ 7532static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 7533 struct io_wait_queue *iowq, 7534 signed long *timeout) 7535{ 7536 int ret; 7537 7538 /* make sure we run task_work before checking for signals */ 7539 ret = io_run_task_work_sig(); 7540 if (ret || io_should_wake(iowq)) 7541 return ret; 7542 /* let the caller flush overflows, retry */ 7543 if (test_bit(0, &ctx->check_cq_overflow)) 7544 return 1; 7545 7546 *timeout = schedule_timeout(*timeout); 7547 return !*timeout ? -ETIME : 1; 7548} 7549 7550/* 7551 * Wait until events become available, if we don't already have some. The 7552 * application must reap them itself, as they reside on the shared cq ring. 7553 */ 7554static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, 7555 const sigset_t __user *sig, size_t sigsz, 7556 struct __kernel_timespec __user *uts) 7557{ 7558 struct io_wait_queue iowq; 7559 struct io_rings *rings = ctx->rings; 7560 signed long timeout = MAX_SCHEDULE_TIMEOUT; 7561 int ret; 7562 7563 do { 7564 io_cqring_overflow_flush(ctx); 7565 if (io_cqring_events(ctx) >= min_events) 7566 return 0; 7567 if (!io_run_task_work()) 7568 break; 7569 } while (1); 7570 7571 if (uts) { 7572 struct timespec64 ts; 7573 7574 if (get_timespec64(&ts, uts)) 7575 return -EFAULT; 7576 timeout = timespec64_to_jiffies(&ts); 7577 } 7578 7579 if (sig) { 7580#ifdef CONFIG_COMPAT 7581 if (in_compat_syscall()) 7582 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, 7583 sigsz); 7584 else 7585#endif 7586 ret = set_user_sigmask(sig, sigsz); 7587 7588 if (ret) 7589 return ret; 7590 } 7591 7592 init_waitqueue_func_entry(&iowq.wq, io_wake_function); 7593 iowq.wq.private = current; 7594 INIT_LIST_HEAD(&iowq.wq.entry); 7595 iowq.ctx = ctx; 7596 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 7597 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; 7598 7599 trace_io_uring_cqring_wait(ctx, min_events); 7600 do { 7601 /* if we can't even flush overflow, don't wait for more */ 7602 if (!io_cqring_overflow_flush(ctx)) { 7603 ret = -EBUSY; 7604 break; 7605 } 7606 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, 7607 TASK_INTERRUPTIBLE); 7608 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); 7609 finish_wait(&ctx->cq_wait, &iowq.wq); 7610 cond_resched(); 7611 } while (ret > 0); 7612 7613 restore_saved_sigmask_unless(ret == -EINTR); 7614 7615 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; 7616} 7617 7618static void io_free_page_table(void **table, size_t size) 7619{ 7620 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); 7621 7622 for (i = 0; i < nr_tables; i++) 7623 kfree(table[i]); 7624 kfree(table); 7625} 7626 7627static void **io_alloc_page_table(size_t size) 7628{ 7629 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); 7630 size_t init_size = size; 7631 void **table; 7632 7633 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); 7634 if (!table) 7635 return NULL; 7636 7637 for (i = 0; i < nr_tables; i++) { 7638 unsigned int this_size = min_t(size_t, size, PAGE_SIZE); 7639 7640 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); 7641 if (!table[i]) { 7642 io_free_page_table(table, init_size); 7643 return NULL; 7644 } 7645 size -= this_size; 7646 } 7647 return table; 7648} 7649 7650static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) 7651{ 7652 percpu_ref_exit(&ref_node->refs); 7653 kfree(ref_node); 7654} 7655 7656static void io_rsrc_node_ref_zero(struct percpu_ref *ref) 7657{ 7658 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); 7659 struct io_ring_ctx *ctx = node->rsrc_data->ctx; 7660 unsigned long flags; 7661 bool first_add = false; 7662 7663 spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); 7664 node->done = true; 7665 7666 while (!list_empty(&ctx->rsrc_ref_list)) { 7667 node = list_first_entry(&ctx->rsrc_ref_list, 7668 struct io_rsrc_node, node); 7669 /* recycle ref nodes in order */ 7670 if (!node->done) 7671 break; 7672 list_del(&node->node); 7673 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); 7674 } 7675 spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); 7676 7677 if (first_add) 7678 mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); 7679} 7680 7681static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) 7682{ 7683 struct io_rsrc_node *ref_node; 7684 7685 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); 7686 if (!ref_node) 7687 return NULL; 7688 7689 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, 7690 0, GFP_KERNEL)) { 7691 kfree(ref_node); 7692 return NULL; 7693 } 7694 INIT_LIST_HEAD(&ref_node->node); 7695 INIT_LIST_HEAD(&ref_node->rsrc_list); 7696 ref_node->done = false; 7697 return ref_node; 7698} 7699 7700static void io_rsrc_node_switch(struct io_ring_ctx *ctx, 7701 struct io_rsrc_data *data_to_kill) 7702{ 7703 WARN_ON_ONCE(!ctx->rsrc_backup_node); 7704 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); 7705 7706 if (data_to_kill) { 7707 struct io_rsrc_node *rsrc_node = ctx->rsrc_node; 7708 7709 rsrc_node->rsrc_data = data_to_kill; 7710 spin_lock_irq(&ctx->rsrc_ref_lock); 7711 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); 7712 spin_unlock_irq(&ctx->rsrc_ref_lock); 7713 7714 atomic_inc(&data_to_kill->refs); 7715 percpu_ref_kill(&rsrc_node->refs); 7716 ctx->rsrc_node = NULL; 7717 } 7718 7719 if (!ctx->rsrc_node) { 7720 ctx->rsrc_node = ctx->rsrc_backup_node; 7721 ctx->rsrc_backup_node = NULL; 7722 } 7723} 7724 7725static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) 7726{ 7727 if (ctx->rsrc_backup_node) 7728 return 0; 7729 ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); 7730 return ctx->rsrc_backup_node ? 
0 : -ENOMEM; 7731} 7732 7733static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) 7734{ 7735 int ret; 7736 7737 /* As we may drop ->uring_lock, other task may have started quiesce */ 7738 if (data->quiesce) 7739 return -ENXIO; 7740 7741 data->quiesce = true; 7742 do { 7743 ret = io_rsrc_node_switch_start(ctx); 7744 if (ret) 7745 break; 7746 io_rsrc_node_switch(ctx, data); 7747 7748 /* kill initial ref, already quiesced if zero */ 7749 if (atomic_dec_and_test(&data->refs)) 7750 break; 7751 mutex_unlock(&ctx->uring_lock); 7752 flush_delayed_work(&ctx->rsrc_put_work); 7753 ret = wait_for_completion_interruptible(&data->done); 7754 if (!ret) { 7755 mutex_lock(&ctx->uring_lock); 7756 break; 7757 } 7758 7759 atomic_inc(&data->refs); 7760 /* wait for all works potentially completing data->done */ 7761 flush_delayed_work(&ctx->rsrc_put_work); 7762 reinit_completion(&data->done); 7763 7764 ret = io_run_task_work_sig(); 7765 mutex_lock(&ctx->uring_lock); 7766 } while (ret >= 0); 7767 data->quiesce = false; 7768 7769 return ret; 7770} 7771 7772static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) 7773{ 7774 unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; 7775 unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; 7776 7777 return &data->tags[table_idx][off]; 7778} 7779 7780static void io_rsrc_data_free(struct io_rsrc_data *data) 7781{ 7782 size_t size = data->nr * sizeof(data->tags[0][0]); 7783 7784 if (data->tags) 7785 io_free_page_table((void **)data->tags, size); 7786 kfree(data); 7787} 7788 7789static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, 7790 u64 __user *utags, unsigned nr, 7791 struct io_rsrc_data **pdata) 7792{ 7793 struct io_rsrc_data *data; 7794 int ret = -ENOMEM; 7795 unsigned i; 7796 7797 data = kzalloc(sizeof(*data), GFP_KERNEL); 7798 if (!data) 7799 return -ENOMEM; 7800 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); 7801 if (!data->tags) { 7802 kfree(data); 7803 return -ENOMEM; 7804 } 7805 7806 data->nr = nr; 7807 data->ctx = ctx; 7808 data->do_put = do_put; 7809 if (utags) { 7810 ret = -EFAULT; 7811 for (i = 0; i < nr; i++) { 7812 u64 *tag_slot = io_get_tag_slot(data, i); 7813 7814 if (copy_from_user(tag_slot, &utags[i], 7815 sizeof(*tag_slot))) 7816 goto fail; 7817 } 7818 } 7819 7820 atomic_set(&data->refs, 1); 7821 init_completion(&data->done); 7822 *pdata = data; 7823 return 0; 7824fail: 7825 io_rsrc_data_free(data); 7826 return ret; 7827} 7828 7829static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) 7830{ 7831 table->files = kvcalloc(nr_files, sizeof(table->files[0]), 7832 GFP_KERNEL_ACCOUNT); 7833 return !!table->files; 7834} 7835 7836static void io_free_file_tables(struct io_file_table *table) 7837{ 7838 kvfree(table->files); 7839 table->files = NULL; 7840} 7841 7842static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) 7843{ 7844#if defined(CONFIG_UNIX) 7845 if (ctx->ring_sock) { 7846 struct sock *sock = ctx->ring_sock->sk; 7847 struct sk_buff *skb; 7848 7849 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) 7850 kfree_skb(skb); 7851 } 7852#else 7853 int i; 7854 7855 for (i = 0; i < ctx->nr_user_files; i++) { 7856 struct file *file; 7857 7858 file = io_file_from_index(ctx, i); 7859 if (file) 7860 fput(file); 7861 } 7862#endif 7863 io_free_file_tables(&ctx->file_table); 7864 io_rsrc_data_free(ctx->file_data); 7865 ctx->file_data = NULL; 7866 ctx->nr_user_files = 0; 7867} 7868 7869static int io_sqe_files_unregister(struct 
io_ring_ctx *ctx) 7870{ 7871 int ret; 7872 7873 if (!ctx->file_data) 7874 return -ENXIO; 7875 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); 7876 if (!ret) 7877 __io_sqe_files_unregister(ctx); 7878 return ret; 7879} 7880 7881static void io_sq_thread_unpark(struct io_sq_data *sqd) 7882 __releases(&sqd->lock) 7883{ 7884 WARN_ON_ONCE(sqd->thread == current); 7885 7886 /* 7887 * Do the dance but not conditional clear_bit() because it'd race with 7888 * other threads incrementing park_pending and setting the bit. 7889 */ 7890 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7891 if (atomic_dec_return(&sqd->park_pending)) 7892 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7893 mutex_unlock(&sqd->lock); 7894} 7895 7896static void io_sq_thread_park(struct io_sq_data *sqd) 7897 __acquires(&sqd->lock) 7898{ 7899 WARN_ON_ONCE(sqd->thread == current); 7900 7901 atomic_inc(&sqd->park_pending); 7902 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7903 mutex_lock(&sqd->lock); 7904 if (sqd->thread) 7905 wake_up_process(sqd->thread); 7906} 7907 7908static void io_sq_thread_stop(struct io_sq_data *sqd) 7909{ 7910 WARN_ON_ONCE(sqd->thread == current); 7911 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); 7912 7913 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7914 mutex_lock(&sqd->lock); 7915 if (sqd->thread) 7916 wake_up_process(sqd->thread); 7917 mutex_unlock(&sqd->lock); 7918 wait_for_completion(&sqd->exited); 7919} 7920 7921static void io_put_sq_data(struct io_sq_data *sqd) 7922{ 7923 if (refcount_dec_and_test(&sqd->refs)) { 7924 WARN_ON_ONCE(atomic_read(&sqd->park_pending)); 7925 7926 io_sq_thread_stop(sqd); 7927 kfree(sqd); 7928 } 7929} 7930 7931static void io_sq_thread_finish(struct io_ring_ctx *ctx) 7932{ 7933 struct io_sq_data *sqd = ctx->sq_data; 7934 7935 if (sqd) { 7936 io_sq_thread_park(sqd); 7937 list_del_init(&ctx->sqd_list); 7938 io_sqd_update_thread_idle(sqd); 7939 io_sq_thread_unpark(sqd); 7940 7941 io_put_sq_data(sqd); 7942 ctx->sq_data = NULL; 7943 } 7944} 7945 7946static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) 7947{ 7948 struct io_ring_ctx *ctx_attach; 7949 struct io_sq_data *sqd; 7950 struct fd f; 7951 7952 f = fdget(p->wq_fd); 7953 if (!f.file) 7954 return ERR_PTR(-ENXIO); 7955 if (f.file->f_op != &io_uring_fops) { 7956 fdput(f); 7957 return ERR_PTR(-EINVAL); 7958 } 7959 7960 ctx_attach = f.file->private_data; 7961 sqd = ctx_attach->sq_data; 7962 if (!sqd) { 7963 fdput(f); 7964 return ERR_PTR(-EINVAL); 7965 } 7966 if (sqd->task_tgid != current->tgid) { 7967 fdput(f); 7968 return ERR_PTR(-EPERM); 7969 } 7970 7971 refcount_inc(&sqd->refs); 7972 fdput(f); 7973 return sqd; 7974} 7975 7976static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, 7977 bool *attached) 7978{ 7979 struct io_sq_data *sqd; 7980 7981 *attached = false; 7982 if (p->flags & IORING_SETUP_ATTACH_WQ) { 7983 sqd = io_attach_sq_data(p); 7984 if (!IS_ERR(sqd)) { 7985 *attached = true; 7986 return sqd; 7987 } 7988 /* fall through for EPERM case, setup new sqd/task */ 7989 if (PTR_ERR(sqd) != -EPERM) 7990 return sqd; 7991 } 7992 7993 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); 7994 if (!sqd) 7995 return ERR_PTR(-ENOMEM); 7996 7997 atomic_set(&sqd->park_pending, 0); 7998 refcount_set(&sqd->refs, 1); 7999 INIT_LIST_HEAD(&sqd->ctx_list); 8000 mutex_init(&sqd->lock); 8001 init_waitqueue_head(&sqd->wait); 8002 init_completion(&sqd->exited); 8003 return sqd; 8004} 8005 8006#if defined(CONFIG_UNIX) 8007/* 8008 * Ensure the UNIX gc is aware of our file set, so we are certain that 
8009 * the io_uring can be safely unregistered on process exit, even if we have 8010 * loops in the file referencing. 8011 */ 8012static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) 8013{ 8014 struct sock *sk = ctx->ring_sock->sk; 8015 struct scm_fp_list *fpl; 8016 struct sk_buff *skb; 8017 int i, nr_files; 8018 8019 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); 8020 if (!fpl) 8021 return -ENOMEM; 8022 8023 skb = alloc_skb(0, GFP_KERNEL); 8024 if (!skb) { 8025 kfree(fpl); 8026 return -ENOMEM; 8027 } 8028 8029 skb->sk = sk; 8030 8031 nr_files = 0; 8032 fpl->user = get_uid(current_user()); 8033 for (i = 0; i < nr; i++) { 8034 struct file *file = io_file_from_index(ctx, i + offset); 8035 8036 if (!file) 8037 continue; 8038 fpl->fp[nr_files] = get_file(file); 8039 unix_inflight(fpl->user, fpl->fp[nr_files]); 8040 nr_files++; 8041 } 8042 8043 if (nr_files) { 8044 fpl->max = SCM_MAX_FD; 8045 fpl->count = nr_files; 8046 UNIXCB(skb).fp = fpl; 8047 skb->destructor = unix_destruct_scm; 8048 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 8049 skb_queue_head(&sk->sk_receive_queue, skb); 8050 8051 for (i = 0; i < nr_files; i++) 8052 fput(fpl->fp[i]); 8053 } else { 8054 kfree_skb(skb); 8055 kfree(fpl); 8056 } 8057 8058 return 0; 8059} 8060 8061/* 8062 * If UNIX sockets are enabled, fd passing can cause a reference cycle which 8063 * causes regular reference counting to break down. We rely on the UNIX 8064 * garbage collection to take care of this problem for us. 8065 */ 8066static int io_sqe_files_scm(struct io_ring_ctx *ctx) 8067{ 8068 unsigned left, total; 8069 int ret = 0; 8070 8071 total = 0; 8072 left = ctx->nr_user_files; 8073 while (left) { 8074 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); 8075 8076 ret = __io_sqe_files_scm(ctx, this_files, total); 8077 if (ret) 8078 break; 8079 left -= this_files; 8080 total += this_files; 8081 } 8082 8083 if (!ret) 8084 return 0; 8085 8086 while (total < ctx->nr_user_files) { 8087 struct file *file = io_file_from_index(ctx, total); 8088 8089 if (file) 8090 fput(file); 8091 total++; 8092 } 8093 8094 return ret; 8095} 8096#else 8097static int io_sqe_files_scm(struct io_ring_ctx *ctx) 8098{ 8099 return 0; 8100} 8101#endif 8102 8103static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 8104{ 8105 struct file *file = prsrc->file; 8106#if defined(CONFIG_UNIX) 8107 struct sock *sock = ctx->ring_sock->sk; 8108 struct sk_buff_head list, *head = &sock->sk_receive_queue; 8109 struct sk_buff *skb; 8110 int i; 8111 8112 __skb_queue_head_init(&list); 8113 8114 /* 8115 * Find the skb that holds this file in its SCM_RIGHTS. When found, 8116 * remove this entry and rearrange the file array. 
8117 */ 8118 skb = skb_dequeue(head); 8119 while (skb) { 8120 struct scm_fp_list *fp; 8121 8122 fp = UNIXCB(skb).fp; 8123 for (i = 0; i < fp->count; i++) { 8124 int left; 8125 8126 if (fp->fp[i] != file) 8127 continue; 8128 8129 unix_notinflight(fp->user, fp->fp[i]); 8130 left = fp->count - 1 - i; 8131 if (left) { 8132 memmove(&fp->fp[i], &fp->fp[i + 1], 8133 left * sizeof(struct file *)); 8134 } 8135 fp->count--; 8136 if (!fp->count) { 8137 kfree_skb(skb); 8138 skb = NULL; 8139 } else { 8140 __skb_queue_tail(&list, skb); 8141 } 8142 fput(file); 8143 file = NULL; 8144 break; 8145 } 8146 8147 if (!file) 8148 break; 8149 8150 __skb_queue_tail(&list, skb); 8151 8152 skb = skb_dequeue(head); 8153 } 8154 8155 if (skb_peek(&list)) { 8156 spin_lock_irq(&head->lock); 8157 while ((skb = __skb_dequeue(&list)) != NULL) 8158 __skb_queue_tail(head, skb); 8159 spin_unlock_irq(&head->lock); 8160 } 8161#else 8162 fput(file); 8163#endif 8164} 8165 8166static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) 8167{ 8168 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; 8169 struct io_ring_ctx *ctx = rsrc_data->ctx; 8170 struct io_rsrc_put *prsrc, *tmp; 8171 8172 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { 8173 list_del(&prsrc->list); 8174 8175 if (prsrc->tag) { 8176 bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; 8177 8178 io_ring_submit_lock(ctx, lock_ring); 8179 spin_lock(&ctx->completion_lock); 8180 io_cqring_fill_event(ctx, prsrc->tag, 0, 0); 8181 ctx->cq_extra++; 8182 io_commit_cqring(ctx); 8183 spin_unlock(&ctx->completion_lock); 8184 io_cqring_ev_posted(ctx); 8185 io_ring_submit_unlock(ctx, lock_ring); 8186 } 8187 8188 rsrc_data->do_put(ctx, prsrc); 8189 kfree(prsrc); 8190 } 8191 8192 io_rsrc_node_destroy(ref_node); 8193 if (atomic_dec_and_test(&rsrc_data->refs)) 8194 complete(&rsrc_data->done); 8195} 8196 8197static void io_rsrc_put_work(struct work_struct *work) 8198{ 8199 struct io_ring_ctx *ctx; 8200 struct llist_node *node; 8201 8202 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); 8203 node = llist_del_all(&ctx->rsrc_put_llist); 8204 8205 while (node) { 8206 struct io_rsrc_node *ref_node; 8207 struct llist_node *next = node->next; 8208 8209 ref_node = llist_entry(node, struct io_rsrc_node, llist); 8210 __io_rsrc_put_work(ref_node); 8211 node = next; 8212 } 8213} 8214 8215static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, 8216 unsigned nr_args, u64 __user *tags) 8217{ 8218 __s32 __user *fds = (__s32 __user *) arg; 8219 struct file *file; 8220 int fd, ret; 8221 unsigned i; 8222 8223 if (ctx->file_data) 8224 return -EBUSY; 8225 if (!nr_args) 8226 return -EINVAL; 8227 if (nr_args > IORING_MAX_FIXED_FILES) 8228 return -EMFILE; 8229 if (nr_args > rlimit(RLIMIT_NOFILE)) 8230 return -EMFILE; 8231 ret = io_rsrc_node_switch_start(ctx); 8232 if (ret) 8233 return ret; 8234 ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, 8235 &ctx->file_data); 8236 if (ret) 8237 return ret; 8238 8239 ret = -ENOMEM; 8240 if (!io_alloc_file_tables(&ctx->file_table, nr_args)) 8241 goto out_free; 8242 8243 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { 8244 if (copy_from_user(&fd, &fds[i], sizeof(fd))) { 8245 ret = -EFAULT; 8246 goto out_fput; 8247 } 8248 /* allow sparse sets */ 8249 if (fd == -1) { 8250 ret = -EINVAL; 8251 if (unlikely(*io_get_tag_slot(ctx->file_data, i))) 8252 goto out_fput; 8253 continue; 8254 } 8255 8256 file = fget(fd); 8257 ret = -EBADF; 8258 if (unlikely(!file)) 8259 goto out_fput; 8260 8261 /* 8262 * Don't 
allow io_uring instances to be registered. If UNIX 8263 * isn't enabled, then this causes a reference cycle and this 8264 * instance can never get freed. If UNIX is enabled we'll 8265 * handle it just fine, but there's still no point in allowing 8266 * a ring fd as it doesn't support regular read/write anyway. 8267 */ 8268 if (file->f_op == &io_uring_fops) { 8269 fput(file); 8270 goto out_fput; 8271 } 8272 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); 8273 } 8274 8275 ret = io_sqe_files_scm(ctx); 8276 if (ret) { 8277 __io_sqe_files_unregister(ctx); 8278 return ret; 8279 } 8280 8281 io_rsrc_node_switch(ctx, NULL); 8282 return ret; 8283out_fput: 8284 for (i = 0; i < ctx->nr_user_files; i++) { 8285 file = io_file_from_index(ctx, i); 8286 if (file) 8287 fput(file); 8288 } 8289 io_free_file_tables(&ctx->file_table); 8290 ctx->nr_user_files = 0; 8291out_free: 8292 io_rsrc_data_free(ctx->file_data); 8293 ctx->file_data = NULL; 8294 return ret; 8295} 8296 8297static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, 8298 int index) 8299{ 8300#if defined(CONFIG_UNIX) 8301 struct sock *sock = ctx->ring_sock->sk; 8302 struct sk_buff_head *head = &sock->sk_receive_queue; 8303 struct sk_buff *skb; 8304 8305 /* 8306 * See if we can merge this file into an existing skb SCM_RIGHTS 8307 * file set. If there's no room, fall back to allocating a new skb 8308 * and filling it in. 8309 */ 8310 spin_lock_irq(&head->lock); 8311 skb = skb_peek(head); 8312 if (skb) { 8313 struct scm_fp_list *fpl = UNIXCB(skb).fp; 8314 8315 if (fpl->count < SCM_MAX_FD) { 8316 __skb_unlink(skb, head); 8317 spin_unlock_irq(&head->lock); 8318 fpl->fp[fpl->count] = get_file(file); 8319 unix_inflight(fpl->user, fpl->fp[fpl->count]); 8320 fpl->count++; 8321 spin_lock_irq(&head->lock); 8322 __skb_queue_head(head, skb); 8323 } else { 8324 skb = NULL; 8325 } 8326 } 8327 spin_unlock_irq(&head->lock); 8328 8329 if (skb) { 8330 fput(file); 8331 return 0; 8332 } 8333 8334 return __io_sqe_files_scm(ctx, 1, index); 8335#else 8336 return 0; 8337#endif 8338} 8339 8340static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, 8341 struct io_rsrc_node *node, void *rsrc) 8342{ 8343 struct io_rsrc_put *prsrc; 8344 8345 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); 8346 if (!prsrc) 8347 return -ENOMEM; 8348 8349 prsrc->tag = *io_get_tag_slot(data, idx); 8350 prsrc->rsrc = rsrc; 8351 list_add(&prsrc->list, &node->rsrc_list); 8352 return 0; 8353} 8354 8355static int io_install_fixed_file(struct io_kiocb *req, struct file *file, 8356 unsigned int issue_flags, u32 slot_index) 8357{ 8358 struct io_ring_ctx *ctx = req->ctx; 8359 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 8360 bool needs_switch = false; 8361 struct io_fixed_file *file_slot; 8362 int ret = -EBADF; 8363 8364 io_ring_submit_lock(ctx, !force_nonblock); 8365 if (file->f_op == &io_uring_fops) 8366 goto err; 8367 ret = -ENXIO; 8368 if (!ctx->file_data) 8369 goto err; 8370 ret = -EINVAL; 8371 if (slot_index >= ctx->nr_user_files) 8372 goto err; 8373 8374 slot_index = array_index_nospec(slot_index, ctx->nr_user_files); 8375 file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); 8376 8377 if (file_slot->file_ptr) { 8378 struct file *old_file; 8379 8380 ret = io_rsrc_node_switch_start(ctx); 8381 if (ret) 8382 goto err; 8383 8384 old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); 8385 ret = io_queue_rsrc_removal(ctx->file_data, slot_index, 8386 ctx->rsrc_node, old_file); 8387 if (ret) 8388 goto err; 8389 file_slot->file_ptr 
= 0; 8390 needs_switch = true; 8391 } 8392 8393 *io_get_tag_slot(ctx->file_data, slot_index) = 0; 8394 io_fixed_file_set(file_slot, file); 8395 ret = io_sqe_file_register(ctx, file, slot_index); 8396 if (ret) { 8397 file_slot->file_ptr = 0; 8398 goto err; 8399 } 8400 8401 ret = 0; 8402err: 8403 if (needs_switch) 8404 io_rsrc_node_switch(ctx, ctx->file_data); 8405 io_ring_submit_unlock(ctx, !force_nonblock); 8406 if (ret) 8407 fput(file); 8408 return ret; 8409} 8410 8411static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) 8412{ 8413 unsigned int offset = req->close.file_slot - 1; 8414 struct io_ring_ctx *ctx = req->ctx; 8415 struct io_fixed_file *file_slot; 8416 struct file *file; 8417 int ret, i; 8418 8419 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 8420 ret = -ENXIO; 8421 if (unlikely(!ctx->file_data)) 8422 goto out; 8423 ret = -EINVAL; 8424 if (offset >= ctx->nr_user_files) 8425 goto out; 8426 ret = io_rsrc_node_switch_start(ctx); 8427 if (ret) 8428 goto out; 8429 8430 i = array_index_nospec(offset, ctx->nr_user_files); 8431 file_slot = io_fixed_file_slot(&ctx->file_table, i); 8432 ret = -EBADF; 8433 if (!file_slot->file_ptr) 8434 goto out; 8435 8436 file = (struct file *)(file_slot->file_ptr & FFS_MASK); 8437 ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); 8438 if (ret) 8439 goto out; 8440 8441 file_slot->file_ptr = 0; 8442 io_rsrc_node_switch(ctx, ctx->file_data); 8443 ret = 0; 8444out: 8445 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 8446 return ret; 8447} 8448 8449static int __io_sqe_files_update(struct io_ring_ctx *ctx, 8450 struct io_uring_rsrc_update2 *up, 8451 unsigned nr_args) 8452{ 8453 u64 __user *tags = u64_to_user_ptr(up->tags); 8454 __s32 __user *fds = u64_to_user_ptr(up->data); 8455 struct io_rsrc_data *data = ctx->file_data; 8456 struct io_fixed_file *file_slot; 8457 struct file *file; 8458 int fd, i, err = 0; 8459 unsigned int done; 8460 bool needs_switch = false; 8461 8462 if (!ctx->file_data) 8463 return -ENXIO; 8464 if (up->offset + nr_args > ctx->nr_user_files) 8465 return -EINVAL; 8466 8467 for (done = 0; done < nr_args; done++) { 8468 u64 tag = 0; 8469 8470 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || 8471 copy_from_user(&fd, &fds[done], sizeof(fd))) { 8472 err = -EFAULT; 8473 break; 8474 } 8475 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { 8476 err = -EINVAL; 8477 break; 8478 } 8479 if (fd == IORING_REGISTER_FILES_SKIP) 8480 continue; 8481 8482 i = array_index_nospec(up->offset + done, ctx->nr_user_files); 8483 file_slot = io_fixed_file_slot(&ctx->file_table, i); 8484 8485 if (file_slot->file_ptr) { 8486 file = (struct file *)(file_slot->file_ptr & FFS_MASK); 8487 err = io_queue_rsrc_removal(data, up->offset + done, 8488 ctx->rsrc_node, file); 8489 if (err) 8490 break; 8491 file_slot->file_ptr = 0; 8492 needs_switch = true; 8493 } 8494 if (fd != -1) { 8495 file = fget(fd); 8496 if (!file) { 8497 err = -EBADF; 8498 break; 8499 } 8500 /* 8501 * Don't allow io_uring instances to be registered. If 8502 * UNIX isn't enabled, then this causes a reference 8503 * cycle and this instance can never get freed. If UNIX 8504 * is enabled we'll handle it just fine, but there's 8505 * still no point in allowing a ring fd as it doesn't 8506 * support regular read/write anyway. 
8507 */ 8508 if (file->f_op == &io_uring_fops) { 8509 fput(file); 8510 err = -EBADF; 8511 break; 8512 } 8513 *io_get_tag_slot(data, up->offset + done) = tag; 8514 io_fixed_file_set(file_slot, file); 8515 err = io_sqe_file_register(ctx, file, i); 8516 if (err) { 8517 file_slot->file_ptr = 0; 8518 fput(file); 8519 break; 8520 } 8521 } 8522 } 8523 8524 if (needs_switch) 8525 io_rsrc_node_switch(ctx, data); 8526 return done ? done : err; 8527} 8528 8529static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, 8530 struct task_struct *task) 8531{ 8532 struct io_wq_hash *hash; 8533 struct io_wq_data data; 8534 unsigned int concurrency; 8535 8536 mutex_lock(&ctx->uring_lock); 8537 hash = ctx->hash_map; 8538 if (!hash) { 8539 hash = kzalloc(sizeof(*hash), GFP_KERNEL); 8540 if (!hash) { 8541 mutex_unlock(&ctx->uring_lock); 8542 return ERR_PTR(-ENOMEM); 8543 } 8544 refcount_set(&hash->refs, 1); 8545 init_waitqueue_head(&hash->wait); 8546 ctx->hash_map = hash; 8547 } 8548 mutex_unlock(&ctx->uring_lock); 8549 8550 data.hash = hash; 8551 data.task = task; 8552 data.free_work = io_wq_free_work; 8553 data.do_work = io_wq_submit_work; 8554 8555 /* Do QD, or 4 * CPUS, whatever is smallest */ 8556 concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); 8557 8558 return io_wq_create(concurrency, &data); 8559} 8560 8561static int io_uring_alloc_task_context(struct task_struct *task, 8562 struct io_ring_ctx *ctx) 8563{ 8564 struct io_uring_task *tctx; 8565 int ret; 8566 8567 tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); 8568 if (unlikely(!tctx)) 8569 return -ENOMEM; 8570 8571 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); 8572 if (unlikely(ret)) { 8573 kfree(tctx); 8574 return ret; 8575 } 8576 8577 tctx->io_wq = io_init_wq_offload(ctx, task); 8578 if (IS_ERR(tctx->io_wq)) { 8579 ret = PTR_ERR(tctx->io_wq); 8580 percpu_counter_destroy(&tctx->inflight); 8581 kfree(tctx); 8582 return ret; 8583 } 8584 8585 xa_init(&tctx->xa); 8586 init_waitqueue_head(&tctx->wait); 8587 atomic_set(&tctx->in_idle, 0); 8588 atomic_set(&tctx->inflight_tracked, 0); 8589 task->io_uring = tctx; 8590 spin_lock_init(&tctx->task_lock); 8591 INIT_WQ_LIST(&tctx->task_list); 8592 init_task_work(&tctx->task_work, tctx_task_work); 8593 return 0; 8594} 8595 8596void __io_uring_free(struct task_struct *tsk) 8597{ 8598 struct io_uring_task *tctx = tsk->io_uring; 8599 8600 WARN_ON_ONCE(!xa_empty(&tctx->xa)); 8601 WARN_ON_ONCE(tctx->io_wq); 8602 WARN_ON_ONCE(tctx->cached_refs); 8603 8604 percpu_counter_destroy(&tctx->inflight); 8605 kfree(tctx); 8606 tsk->io_uring = NULL; 8607} 8608 8609static int io_sq_offload_create(struct io_ring_ctx *ctx, 8610 struct io_uring_params *p) 8611{ 8612 int ret; 8613 8614 /* Retain compatibility with failing for an invalid attach attempt */ 8615 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == 8616 IORING_SETUP_ATTACH_WQ) { 8617 struct fd f; 8618 8619 f = fdget(p->wq_fd); 8620 if (!f.file) 8621 return -ENXIO; 8622 if (f.file->f_op != &io_uring_fops) { 8623 fdput(f); 8624 return -EINVAL; 8625 } 8626 fdput(f); 8627 } 8628 if (ctx->flags & IORING_SETUP_SQPOLL) { 8629 struct task_struct *tsk; 8630 struct io_sq_data *sqd; 8631 bool attached; 8632 8633 sqd = io_get_sq_data(p, &attached); 8634 if (IS_ERR(sqd)) { 8635 ret = PTR_ERR(sqd); 8636 goto err; 8637 } 8638 8639 ctx->sq_creds = get_current_cred(); 8640 ctx->sq_data = sqd; 8641 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); 8642 if (!ctx->sq_thread_idle) 8643 ctx->sq_thread_idle = HZ; 8644 8645 io_sq_thread_park(sqd); 
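		/* link this ctx into the sqd's ctx_list while the thread is parked */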
8646 list_add(&ctx->sqd_list, &sqd->ctx_list); 8647 io_sqd_update_thread_idle(sqd); 8648 /* don't attach to a dying SQPOLL thread, would be racy */ 8649 ret = (attached && !sqd->thread) ? -ENXIO : 0; 8650 io_sq_thread_unpark(sqd); 8651 8652 if (ret < 0) 8653 goto err; 8654 if (attached) 8655 return 0; 8656 8657 if (p->flags & IORING_SETUP_SQ_AFF) { 8658 int cpu = p->sq_thread_cpu; 8659 8660 ret = -EINVAL; 8661 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) 8662 goto err_sqpoll; 8663 sqd->sq_cpu = cpu; 8664 } else { 8665 sqd->sq_cpu = -1; 8666 } 8667 8668 sqd->task_pid = current->pid; 8669 sqd->task_tgid = current->tgid; 8670 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); 8671 if (IS_ERR(tsk)) { 8672 ret = PTR_ERR(tsk); 8673 goto err_sqpoll; 8674 } 8675 8676 sqd->thread = tsk; 8677 ret = io_uring_alloc_task_context(tsk, ctx); 8678 wake_up_new_task(tsk); 8679 if (ret) 8680 goto err; 8681 } else if (p->flags & IORING_SETUP_SQ_AFF) { 8682 /* Can't have SQ_AFF without SQPOLL */ 8683 ret = -EINVAL; 8684 goto err; 8685 } 8686 8687 return 0; 8688err_sqpoll: 8689 complete(&ctx->sq_data->exited); 8690err: 8691 io_sq_thread_finish(ctx); 8692 return ret; 8693} 8694 8695static inline void __io_unaccount_mem(struct user_struct *user, 8696 unsigned long nr_pages) 8697{ 8698 atomic_long_sub(nr_pages, &user->locked_vm); 8699} 8700 8701static inline int __io_account_mem(struct user_struct *user, 8702 unsigned long nr_pages) 8703{ 8704 unsigned long page_limit, cur_pages, new_pages; 8705 8706 /* Don't allow more pages than we can safely lock */ 8707 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 8708 8709 do { 8710 cur_pages = atomic_long_read(&user->locked_vm); 8711 new_pages = cur_pages + nr_pages; 8712 if (new_pages > page_limit) 8713 return -ENOMEM; 8714 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, 8715 new_pages) != cur_pages); 8716 8717 return 0; 8718} 8719 8720static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 8721{ 8722 if (ctx->user) 8723 __io_unaccount_mem(ctx->user, nr_pages); 8724 8725 if (ctx->mm_account) 8726 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); 8727} 8728 8729static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 8730{ 8731 int ret; 8732 8733 if (ctx->user) { 8734 ret = __io_account_mem(ctx->user, nr_pages); 8735 if (ret) 8736 return ret; 8737 } 8738 8739 if (ctx->mm_account) 8740 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); 8741 8742 return 0; 8743} 8744 8745static void io_mem_free(void *ptr) 8746{ 8747 struct page *page; 8748 8749 if (!ptr) 8750 return; 8751 8752 page = virt_to_head_page(ptr); 8753 if (put_page_testzero(page)) 8754 free_compound_page(page); 8755} 8756 8757static void *io_mem_alloc(size_t size) 8758{ 8759 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | 8760 __GFP_NORETRY | __GFP_ACCOUNT; 8761 8762 return (void *) __get_free_pages(gfp_flags, get_order(size)); 8763} 8764 8765static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, 8766 size_t *sq_offset) 8767{ 8768 struct io_rings *rings; 8769 size_t off, sq_array_size; 8770 8771 off = struct_size(rings, cqes, cq_entries); 8772 if (off == SIZE_MAX) 8773 return SIZE_MAX; 8774 8775#ifdef CONFIG_SMP 8776 off = ALIGN(off, SMP_CACHE_BYTES); 8777 if (off == 0) 8778 return SIZE_MAX; 8779#endif 8780 8781 if (sq_offset) 8782 *sq_offset = off; 8783 8784 sq_array_size = array_size(sizeof(u32), sq_entries); 8785 if (sq_array_size == SIZE_MAX) 8786 return SIZE_MAX; 8787 8788 if (check_add_overflow(off, 
sq_array_size, &off)) 8789 return SIZE_MAX; 8790 8791 return off; 8792} 8793 8794static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) 8795{ 8796 struct io_mapped_ubuf *imu = *slot; 8797 unsigned int i; 8798 8799 if (imu != ctx->dummy_ubuf) { 8800 for (i = 0; i < imu->nr_bvecs; i++) 8801 unpin_user_page(imu->bvec[i].bv_page); 8802 if (imu->acct_pages) 8803 io_unaccount_mem(ctx, imu->acct_pages); 8804 kvfree(imu); 8805 } 8806 *slot = NULL; 8807} 8808 8809static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 8810{ 8811 io_buffer_unmap(ctx, &prsrc->buf); 8812 prsrc->buf = NULL; 8813} 8814 8815static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 8816{ 8817 unsigned int i; 8818 8819 for (i = 0; i < ctx->nr_user_bufs; i++) 8820 io_buffer_unmap(ctx, &ctx->user_bufs[i]); 8821 kfree(ctx->user_bufs); 8822 io_rsrc_data_free(ctx->buf_data); 8823 ctx->user_bufs = NULL; 8824 ctx->buf_data = NULL; 8825 ctx->nr_user_bufs = 0; 8826} 8827 8828static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 8829{ 8830 int ret; 8831 8832 if (!ctx->buf_data) 8833 return -ENXIO; 8834 8835 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); 8836 if (!ret) 8837 __io_sqe_buffers_unregister(ctx); 8838 return ret; 8839} 8840 8841static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, 8842 void __user *arg, unsigned index) 8843{ 8844 struct iovec __user *src; 8845 8846#ifdef CONFIG_COMPAT 8847 if (ctx->compat) { 8848 struct compat_iovec __user *ciovs; 8849 struct compat_iovec ciov; 8850 8851 ciovs = (struct compat_iovec __user *) arg; 8852 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) 8853 return -EFAULT; 8854 8855 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); 8856 dst->iov_len = ciov.iov_len; 8857 return 0; 8858 } 8859#endif 8860 src = (struct iovec __user *) arg; 8861 if (copy_from_user(dst, &src[index], sizeof(*dst))) 8862 return -EFAULT; 8863 return 0; 8864} 8865 8866/* 8867 * Not super efficient, but this is just a registration time. And we do cache 8868 * the last compound head, so generally we'll only do a full search if we don't 8869 * match that one. 8870 * 8871 * We check if the given compound head page has already been accounted, to 8872 * avoid double accounting it. This allows us to account the full size of the 8873 * page, not just the constituent pages of a huge page. 
8874 */ 8875static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, 8876 int nr_pages, struct page *hpage) 8877{ 8878 int i, j; 8879 8880 /* check current page array */ 8881 for (i = 0; i < nr_pages; i++) { 8882 if (!PageCompound(pages[i])) 8883 continue; 8884 if (compound_head(pages[i]) == hpage) 8885 return true; 8886 } 8887 8888 /* check previously registered pages */ 8889 for (i = 0; i < ctx->nr_user_bufs; i++) { 8890 struct io_mapped_ubuf *imu = ctx->user_bufs[i]; 8891 8892 for (j = 0; j < imu->nr_bvecs; j++) { 8893 if (!PageCompound(imu->bvec[j].bv_page)) 8894 continue; 8895 if (compound_head(imu->bvec[j].bv_page) == hpage) 8896 return true; 8897 } 8898 } 8899 8900 return false; 8901} 8902 8903static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, 8904 int nr_pages, struct io_mapped_ubuf *imu, 8905 struct page **last_hpage) 8906{ 8907 int i, ret; 8908 8909 imu->acct_pages = 0; 8910 for (i = 0; i < nr_pages; i++) { 8911 if (!PageCompound(pages[i])) { 8912 imu->acct_pages++; 8913 } else { 8914 struct page *hpage; 8915 8916 hpage = compound_head(pages[i]); 8917 if (hpage == *last_hpage) 8918 continue; 8919 *last_hpage = hpage; 8920 if (headpage_already_acct(ctx, pages, i, hpage)) 8921 continue; 8922 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; 8923 } 8924 } 8925 8926 if (!imu->acct_pages) 8927 return 0; 8928 8929 ret = io_account_mem(ctx, imu->acct_pages); 8930 if (ret) 8931 imu->acct_pages = 0; 8932 return ret; 8933} 8934 8935static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, 8936 struct io_mapped_ubuf **pimu, 8937 struct page **last_hpage) 8938{ 8939 struct io_mapped_ubuf *imu = NULL; 8940 struct vm_area_struct **vmas = NULL; 8941 struct page **pages = NULL; 8942 unsigned long off, start, end, ubuf; 8943 size_t size; 8944 int ret, pret, nr_pages, i; 8945 8946 if (!iov->iov_base) { 8947 *pimu = ctx->dummy_ubuf; 8948 return 0; 8949 } 8950 8951 ubuf = (unsigned long) iov->iov_base; 8952 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; 8953 start = ubuf >> PAGE_SHIFT; 8954 nr_pages = end - start; 8955 8956 *pimu = NULL; 8957 ret = -ENOMEM; 8958 8959 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); 8960 if (!pages) 8961 goto done; 8962 8963 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), 8964 GFP_KERNEL); 8965 if (!vmas) 8966 goto done; 8967 8968 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); 8969 if (!imu) 8970 goto done; 8971 8972 ret = 0; 8973 mmap_read_lock(current->mm); 8974 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, 8975 pages, vmas); 8976 if (pret == nr_pages) { 8977 /* don't support file backed memory */ 8978 for (i = 0; i < nr_pages; i++) { 8979 struct vm_area_struct *vma = vmas[i]; 8980 8981 if (vma_is_shmem(vma)) 8982 continue; 8983 if (vma->vm_file && 8984 !is_file_hugepages(vma->vm_file)) { 8985 ret = -EOPNOTSUPP; 8986 break; 8987 } 8988 } 8989 } else { 8990 ret = pret < 0 ? 
pret : -EFAULT; 8991 } 8992 mmap_read_unlock(current->mm); 8993 if (ret) { 8994 /* 8995 * if we did partial map, or found file backed vmas, 8996 * release any pages we did get 8997 */ 8998 if (pret > 0) 8999 unpin_user_pages(pages, pret); 9000 goto done; 9001 } 9002 9003 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); 9004 if (ret) { 9005 unpin_user_pages(pages, pret); 9006 goto done; 9007 } 9008 9009 off = ubuf & ~PAGE_MASK; 9010 size = iov->iov_len; 9011 for (i = 0; i < nr_pages; i++) { 9012 size_t vec_len; 9013 9014 vec_len = min_t(size_t, size, PAGE_SIZE - off); 9015 imu->bvec[i].bv_page = pages[i]; 9016 imu->bvec[i].bv_len = vec_len; 9017 imu->bvec[i].bv_offset = off; 9018 off = 0; 9019 size -= vec_len; 9020 } 9021 /* store original address for later verification */ 9022 imu->ubuf = ubuf; 9023 imu->ubuf_end = ubuf + iov->iov_len; 9024 imu->nr_bvecs = nr_pages; 9025 *pimu = imu; 9026 ret = 0; 9027done: 9028 if (ret) 9029 kvfree(imu); 9030 kvfree(pages); 9031 kvfree(vmas); 9032 return ret; 9033} 9034 9035static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) 9036{ 9037 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); 9038 return ctx->user_bufs ? 0 : -ENOMEM; 9039} 9040 9041static int io_buffer_validate(struct iovec *iov) 9042{ 9043 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); 9044 9045 /* 9046 * Don't impose further limits on the size and buffer 9047 * constraints here, we'll -EINVAL later when IO is 9048 * submitted if they are wrong. 9049 */ 9050 if (!iov->iov_base) 9051 return iov->iov_len ? -EFAULT : 0; 9052 if (!iov->iov_len) 9053 return -EFAULT; 9054 9055 /* arbitrary limit, but we need something */ 9056 if (iov->iov_len > SZ_1G) 9057 return -EFAULT; 9058 9059 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) 9060 return -EOVERFLOW; 9061 9062 return 0; 9063} 9064 9065static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, 9066 unsigned int nr_args, u64 __user *tags) 9067{ 9068 struct page *last_hpage = NULL; 9069 struct io_rsrc_data *data; 9070 int i, ret; 9071 struct iovec iov; 9072 9073 if (ctx->user_bufs) 9074 return -EBUSY; 9075 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) 9076 return -EINVAL; 9077 ret = io_rsrc_node_switch_start(ctx); 9078 if (ret) 9079 return ret; 9080 ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); 9081 if (ret) 9082 return ret; 9083 ret = io_buffers_map_alloc(ctx, nr_args); 9084 if (ret) { 9085 io_rsrc_data_free(data); 9086 return ret; 9087 } 9088 9089 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { 9090 ret = io_copy_iov(ctx, &iov, arg, i); 9091 if (ret) 9092 break; 9093 ret = io_buffer_validate(&iov); 9094 if (ret) 9095 break; 9096 if (!iov.iov_base && *io_get_tag_slot(data, i)) { 9097 ret = -EINVAL; 9098 break; 9099 } 9100 9101 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], 9102 &last_hpage); 9103 if (ret) 9104 break; 9105 } 9106 9107 WARN_ON_ONCE(ctx->buf_data); 9108 9109 ctx->buf_data = data; 9110 if (ret) 9111 __io_sqe_buffers_unregister(ctx); 9112 else 9113 io_rsrc_node_switch(ctx, NULL); 9114 return ret; 9115} 9116 9117static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, 9118 struct io_uring_rsrc_update2 *up, 9119 unsigned int nr_args) 9120{ 9121 u64 __user *tags = u64_to_user_ptr(up->tags); 9122 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); 9123 struct page *last_hpage = NULL; 9124 bool needs_switch = false; 9125 __u32 done; 9126 int i, err; 9127 9128 if 
(!ctx->buf_data) 9129 return -ENXIO; 9130 if (up->offset + nr_args > ctx->nr_user_bufs) 9131 return -EINVAL; 9132 9133 for (done = 0; done < nr_args; done++) { 9134 struct io_mapped_ubuf *imu; 9135 int offset = up->offset + done; 9136 u64 tag = 0; 9137 9138 err = io_copy_iov(ctx, &iov, iovs, done); 9139 if (err) 9140 break; 9141 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { 9142 err = -EFAULT; 9143 break; 9144 } 9145 err = io_buffer_validate(&iov); 9146 if (err) 9147 break; 9148 if (!iov.iov_base && tag) { 9149 err = -EINVAL; 9150 break; 9151 } 9152 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); 9153 if (err) 9154 break; 9155 9156 i = array_index_nospec(offset, ctx->nr_user_bufs); 9157 if (ctx->user_bufs[i] != ctx->dummy_ubuf) { 9158 err = io_queue_rsrc_removal(ctx->buf_data, offset, 9159 ctx->rsrc_node, ctx->user_bufs[i]); 9160 if (unlikely(err)) { 9161 io_buffer_unmap(ctx, &imu); 9162 break; 9163 } 9164 ctx->user_bufs[i] = NULL; 9165 needs_switch = true; 9166 } 9167 9168 ctx->user_bufs[i] = imu; 9169 *io_get_tag_slot(ctx->buf_data, offset) = tag; 9170 } 9171 9172 if (needs_switch) 9173 io_rsrc_node_switch(ctx, ctx->buf_data); 9174 return done ? done : err; 9175} 9176 9177static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) 9178{ 9179 __s32 __user *fds = arg; 9180 int fd; 9181 9182 if (ctx->cq_ev_fd) 9183 return -EBUSY; 9184 9185 if (copy_from_user(&fd, fds, sizeof(*fds))) 9186 return -EFAULT; 9187 9188 ctx->cq_ev_fd = eventfd_ctx_fdget(fd); 9189 if (IS_ERR(ctx->cq_ev_fd)) { 9190 int ret = PTR_ERR(ctx->cq_ev_fd); 9191 9192 ctx->cq_ev_fd = NULL; 9193 return ret; 9194 } 9195 9196 return 0; 9197} 9198 9199static int io_eventfd_unregister(struct io_ring_ctx *ctx) 9200{ 9201 if (ctx->cq_ev_fd) { 9202 eventfd_ctx_put(ctx->cq_ev_fd); 9203 ctx->cq_ev_fd = NULL; 9204 return 0; 9205 } 9206 9207 return -ENXIO; 9208} 9209 9210static void io_destroy_buffers(struct io_ring_ctx *ctx) 9211{ 9212 struct io_buffer *buf; 9213 unsigned long index; 9214 9215 xa_for_each(&ctx->io_buffers, index, buf) { 9216 __io_remove_buffers(ctx, buf, index, -1U); 9217 cond_resched(); 9218 } 9219} 9220 9221static void io_req_cache_free(struct list_head *list) 9222{ 9223 struct io_kiocb *req, *nxt; 9224 9225 list_for_each_entry_safe(req, nxt, list, inflight_entry) { 9226 list_del(&req->inflight_entry); 9227 kmem_cache_free(req_cachep, req); 9228 } 9229} 9230 9231static void io_req_caches_free(struct io_ring_ctx *ctx) 9232{ 9233 struct io_submit_state *state = &ctx->submit_state; 9234 9235 mutex_lock(&ctx->uring_lock); 9236 9237 if (state->free_reqs) { 9238 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); 9239 state->free_reqs = 0; 9240 } 9241 9242 io_flush_cached_locked_reqs(ctx, state); 9243 io_req_cache_free(&state->free_list); 9244 mutex_unlock(&ctx->uring_lock); 9245} 9246 9247static void io_wait_rsrc_data(struct io_rsrc_data *data) 9248{ 9249 if (data && !atomic_dec_and_test(&data->refs)) 9250 wait_for_completion(&data->done); 9251} 9252 9253static void io_ring_ctx_free(struct io_ring_ctx *ctx) 9254{ 9255 io_sq_thread_finish(ctx); 9256 9257 if (ctx->mm_account) { 9258 mmdrop(ctx->mm_account); 9259 ctx->mm_account = NULL; 9260 } 9261 9262 /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ 9263 io_wait_rsrc_data(ctx->buf_data); 9264 io_wait_rsrc_data(ctx->file_data); 9265 9266 mutex_lock(&ctx->uring_lock); 9267 if (ctx->buf_data) 9268 __io_sqe_buffers_unregister(ctx); 9269 if (ctx->file_data) 9270 __io_sqe_files_unregister(ctx); 9271 if 
(ctx->rings) 9272 __io_cqring_overflow_flush(ctx, true); 9273 mutex_unlock(&ctx->uring_lock); 9274 io_eventfd_unregister(ctx); 9275 io_destroy_buffers(ctx); 9276 if (ctx->sq_creds) 9277 put_cred(ctx->sq_creds); 9278 9279 /* there are no registered resources left, nobody uses it */ 9280 if (ctx->rsrc_node) 9281 io_rsrc_node_destroy(ctx->rsrc_node); 9282 if (ctx->rsrc_backup_node) 9283 io_rsrc_node_destroy(ctx->rsrc_backup_node); 9284 flush_delayed_work(&ctx->rsrc_put_work); 9285 9286 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); 9287 WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); 9288 9289#if defined(CONFIG_UNIX) 9290 if (ctx->ring_sock) { 9291 ctx->ring_sock->file = NULL; /* so that iput() is called */ 9292 sock_release(ctx->ring_sock); 9293 } 9294#endif 9295 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); 9296 9297 io_mem_free(ctx->rings); 9298 io_mem_free(ctx->sq_sqes); 9299 9300 percpu_ref_exit(&ctx->refs); 9301 free_uid(ctx->user); 9302 io_req_caches_free(ctx); 9303 if (ctx->hash_map) 9304 io_wq_put_hash(ctx->hash_map); 9305 kfree(ctx->cancel_hash); 9306 kfree(ctx->dummy_ubuf); 9307 kfree(ctx); 9308} 9309 9310static __poll_t io_uring_poll(struct file *file, poll_table *wait) 9311{ 9312 struct io_ring_ctx *ctx = file->private_data; 9313 __poll_t mask = 0; 9314 9315 poll_wait(file, &ctx->poll_wait, wait); 9316 /* 9317 * synchronizes with barrier from wq_has_sleeper call in 9318 * io_commit_cqring 9319 */ 9320 smp_rmb(); 9321 if (!io_sqring_full(ctx)) 9322 mask |= EPOLLOUT | EPOLLWRNORM; 9323 9324 /* 9325 * Don't flush cqring overflow list here, just do a simple check. 9326 * Otherwise there could possible be ABBA deadlock: 9327 * CPU0 CPU1 9328 * ---- ---- 9329 * lock(&ctx->uring_lock); 9330 * lock(&ep->mtx); 9331 * lock(&ctx->uring_lock); 9332 * lock(&ep->mtx); 9333 * 9334 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this 9335 * pushs them to do the flush. 9336 */ 9337 if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) 9338 mask |= EPOLLIN | EPOLLRDNORM; 9339 9340 return mask; 9341} 9342 9343static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) 9344{ 9345 const struct cred *creds; 9346 9347 creds = xa_erase(&ctx->personalities, id); 9348 if (creds) { 9349 put_cred(creds); 9350 return 0; 9351 } 9352 9353 return -EINVAL; 9354} 9355 9356struct io_tctx_exit { 9357 struct callback_head task_work; 9358 struct completion completion; 9359 struct io_ring_ctx *ctx; 9360}; 9361 9362static void io_tctx_exit_cb(struct callback_head *cb) 9363{ 9364 struct io_uring_task *tctx = current->io_uring; 9365 struct io_tctx_exit *work; 9366 9367 work = container_of(cb, struct io_tctx_exit, task_work); 9368 /* 9369 * When @in_idle, we're in cancellation and it's racy to remove the 9370 * node. It'll be removed by the end of cancellation, just ignore it. 
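	 * Either way, signal the completion that io_ring_exit_work() waits on.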
9371 */ 9372 if (!atomic_read(&tctx->in_idle)) 9373 io_uring_del_tctx_node((unsigned long)work->ctx); 9374 complete(&work->completion); 9375} 9376 9377static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) 9378{ 9379 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 9380 9381 return req->ctx == data; 9382} 9383 9384static void io_ring_exit_work(struct work_struct *work) 9385{ 9386 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); 9387 unsigned long timeout = jiffies + HZ * 60 * 5; 9388 unsigned long interval = HZ / 20; 9389 struct io_tctx_exit exit; 9390 struct io_tctx_node *node; 9391 int ret; 9392 9393 /* 9394 * If we're doing polled IO and end up having requests being 9395 * submitted async (out-of-line), then completions can come in while 9396 * we're waiting for refs to drop. We need to reap these manually, 9397 * as nobody else will be looking for them. 9398 */ 9399 do { 9400 io_uring_try_cancel_requests(ctx, NULL, true); 9401 if (ctx->sq_data) { 9402 struct io_sq_data *sqd = ctx->sq_data; 9403 struct task_struct *tsk; 9404 9405 io_sq_thread_park(sqd); 9406 tsk = sqd->thread; 9407 if (tsk && tsk->io_uring && tsk->io_uring->io_wq) 9408 io_wq_cancel_cb(tsk->io_uring->io_wq, 9409 io_cancel_ctx_cb, ctx, true); 9410 io_sq_thread_unpark(sqd); 9411 } 9412 9413 if (WARN_ON_ONCE(time_after(jiffies, timeout))) { 9414 /* there is little hope left, don't run it too often */ 9415 interval = HZ * 60; 9416 } 9417 } while (!wait_for_completion_timeout(&ctx->ref_comp, interval)); 9418 9419 init_completion(&exit.completion); 9420 init_task_work(&exit.task_work, io_tctx_exit_cb); 9421 exit.ctx = ctx; 9422 /* 9423 * Some may use context even when all refs and requests have been put, 9424 * and they are free to do so while still holding uring_lock or 9425 * completion_lock, see io_req_task_submit(). Apart from other work, 9426 * this lock/unlock section also waits them to finish. 
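	 * The loop below pokes each registered task via task_work and waits on exit.completion.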
9427 */ 9428 mutex_lock(&ctx->uring_lock); 9429 while (!list_empty(&ctx->tctx_list)) { 9430 WARN_ON_ONCE(time_after(jiffies, timeout)); 9431 9432 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, 9433 ctx_node); 9434 /* don't spin on a single task if cancellation failed */ 9435 list_rotate_left(&ctx->tctx_list); 9436 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); 9437 if (WARN_ON_ONCE(ret)) 9438 continue; 9439 wake_up_process(node->task); 9440 9441 mutex_unlock(&ctx->uring_lock); 9442 wait_for_completion(&exit.completion); 9443 mutex_lock(&ctx->uring_lock); 9444 } 9445 mutex_unlock(&ctx->uring_lock); 9446 spin_lock(&ctx->completion_lock); 9447 spin_unlock(&ctx->completion_lock); 9448 9449 io_ring_ctx_free(ctx); 9450} 9451 9452/* Returns true if we found and killed one or more timeouts */ 9453static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, 9454 bool cancel_all) 9455{ 9456 struct io_kiocb *req, *tmp; 9457 int canceled = 0; 9458 9459 spin_lock(&ctx->completion_lock); 9460 spin_lock_irq(&ctx->timeout_lock); 9461 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { 9462 if (io_match_task(req, tsk, cancel_all)) { 9463 io_kill_timeout(req, -ECANCELED); 9464 canceled++; 9465 } 9466 } 9467 spin_unlock_irq(&ctx->timeout_lock); 9468 if (canceled != 0) 9469 io_commit_cqring(ctx); 9470 spin_unlock(&ctx->completion_lock); 9471 if (canceled != 0) 9472 io_cqring_ev_posted(ctx); 9473 return canceled != 0; 9474} 9475 9476static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) 9477{ 9478 unsigned long index; 9479 struct creds *creds; 9480 9481 mutex_lock(&ctx->uring_lock); 9482 percpu_ref_kill(&ctx->refs); 9483 if (ctx->rings) 9484 __io_cqring_overflow_flush(ctx, true); 9485 xa_for_each(&ctx->personalities, index, creds) 9486 io_unregister_personality(ctx, index); 9487 mutex_unlock(&ctx->uring_lock); 9488 9489 io_kill_timeouts(ctx, NULL, true); 9490 io_poll_remove_all(ctx, NULL, true); 9491 9492 /* if we failed setting up the ctx, we might not have any rings */ 9493 io_iopoll_try_reap_events(ctx); 9494 9495 INIT_WORK(&ctx->exit_work, io_ring_exit_work); 9496 /* 9497 * Use system_unbound_wq to avoid spawning tons of event kworkers 9498 * if we're exiting a ton of rings at the same time. It just adds 9499 * noise and overhead, there's no discernable change in runtime 9500 * over using system_wq. 
9501 */ 9502 queue_work(system_unbound_wq, &ctx->exit_work); 9503} 9504 9505static int io_uring_release(struct inode *inode, struct file *file) 9506{ 9507 struct io_ring_ctx *ctx = file->private_data; 9508 9509 file->private_data = NULL; 9510 io_ring_ctx_wait_and_kill(ctx); 9511 return 0; 9512} 9513 9514struct io_task_cancel { 9515 struct task_struct *task; 9516 bool all; 9517}; 9518 9519static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 9520{ 9521 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 9522 struct io_task_cancel *cancel = data; 9523 bool ret; 9524 9525 if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) { 9526 struct io_ring_ctx *ctx = req->ctx; 9527 9528 /* protect against races with linked timeouts */ 9529 spin_lock(&ctx->completion_lock); 9530 ret = io_match_task(req, cancel->task, cancel->all); 9531 spin_unlock(&ctx->completion_lock); 9532 } else { 9533 ret = io_match_task(req, cancel->task, cancel->all); 9534 } 9535 return ret; 9536} 9537 9538static bool io_cancel_defer_files(struct io_ring_ctx *ctx, 9539 struct task_struct *task, bool cancel_all) 9540{ 9541 struct io_defer_entry *de; 9542 LIST_HEAD(list); 9543 9544 spin_lock(&ctx->completion_lock); 9545 list_for_each_entry_reverse(de, &ctx->defer_list, list) { 9546 if (io_match_task(de->req, task, cancel_all)) { 9547 list_cut_position(&list, &ctx->defer_list, &de->list); 9548 break; 9549 } 9550 } 9551 spin_unlock(&ctx->completion_lock); 9552 if (list_empty(&list)) 9553 return false; 9554 9555 while (!list_empty(&list)) { 9556 de = list_first_entry(&list, struct io_defer_entry, list); 9557 list_del_init(&de->list); 9558 io_req_complete_failed(de->req, -ECANCELED); 9559 kfree(de); 9560 } 9561 return true; 9562} 9563 9564static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) 9565{ 9566 struct io_tctx_node *node; 9567 enum io_wq_cancel cret; 9568 bool ret = false; 9569 9570 mutex_lock(&ctx->uring_lock); 9571 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 9572 struct io_uring_task *tctx = node->task->io_uring; 9573 9574 /* 9575 * io_wq will stay alive while we hold uring_lock, because it's 9576 * killed after ctx nodes, which requires to take the lock. 9577 */ 9578 if (!tctx || !tctx->io_wq) 9579 continue; 9580 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); 9581 ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 9582 } 9583 mutex_unlock(&ctx->uring_lock); 9584 9585 return ret; 9586} 9587 9588static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 9589 struct task_struct *task, 9590 bool cancel_all) 9591{ 9592 struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; 9593 struct io_uring_task *tctx = task ? task->io_uring : NULL; 9594 9595 while (1) { 9596 enum io_wq_cancel cret; 9597 bool ret = false; 9598 9599 if (!task) { 9600 ret |= io_uring_try_cancel_iowq(ctx); 9601 } else if (tctx && tctx->io_wq) { 9602 /* 9603 * Cancels requests of all rings, not only @ctx, but 9604 * it's fine as the task is in exit/exec. 
9605 */ 9606 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, 9607 &cancel, true); 9608 ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 9609 } 9610 9611 /* SQPOLL thread does its own polling */ 9612 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || 9613 (ctx->sq_data && ctx->sq_data->thread == current)) { 9614 while (!list_empty_careful(&ctx->iopoll_list)) { 9615 io_iopoll_try_reap_events(ctx); 9616 ret = true; 9617 } 9618 } 9619 9620 ret |= io_cancel_defer_files(ctx, task, cancel_all); 9621 ret |= io_poll_remove_all(ctx, task, cancel_all); 9622 ret |= io_kill_timeouts(ctx, task, cancel_all); 9623 if (task) 9624 ret |= io_run_task_work(); 9625 if (!ret) 9626 break; 9627 cond_resched(); 9628 } 9629} 9630 9631static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) 9632{ 9633 struct io_uring_task *tctx = current->io_uring; 9634 struct io_tctx_node *node; 9635 int ret; 9636 9637 if (unlikely(!tctx)) { 9638 ret = io_uring_alloc_task_context(current, ctx); 9639 if (unlikely(ret)) 9640 return ret; 9641 tctx = current->io_uring; 9642 } 9643 if (!xa_load(&tctx->xa, (unsigned long)ctx)) { 9644 node = kmalloc(sizeof(*node), GFP_KERNEL); 9645 if (!node) 9646 return -ENOMEM; 9647 node->ctx = ctx; 9648 node->task = current; 9649 9650 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, 9651 node, GFP_KERNEL)); 9652 if (ret) { 9653 kfree(node); 9654 return ret; 9655 } 9656 9657 mutex_lock(&ctx->uring_lock); 9658 list_add(&node->ctx_node, &ctx->tctx_list); 9659 mutex_unlock(&ctx->uring_lock); 9660 } 9661 tctx->last = ctx; 9662 return 0; 9663} 9664 9665/* 9666 * Note that this task has used io_uring. We use it for cancelation purposes. 9667 */ 9668static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) 9669{ 9670 struct io_uring_task *tctx = current->io_uring; 9671 9672 if (likely(tctx && tctx->last == ctx)) 9673 return 0; 9674 return __io_uring_add_tctx_node(ctx); 9675} 9676 9677/* 9678 * Remove this io_uring_file -> task mapping. 9679 */ 9680static void io_uring_del_tctx_node(unsigned long index) 9681{ 9682 struct io_uring_task *tctx = current->io_uring; 9683 struct io_tctx_node *node; 9684 9685 if (!tctx) 9686 return; 9687 node = xa_erase(&tctx->xa, index); 9688 if (!node) 9689 return; 9690 9691 WARN_ON_ONCE(current != node->task); 9692 WARN_ON_ONCE(list_empty(&node->ctx_node)); 9693 9694 mutex_lock(&node->ctx->uring_lock); 9695 list_del(&node->ctx_node); 9696 mutex_unlock(&node->ctx->uring_lock); 9697 9698 if (tctx->last == node->ctx) 9699 tctx->last = NULL; 9700 kfree(node); 9701} 9702 9703static void io_uring_clean_tctx(struct io_uring_task *tctx) 9704{ 9705 struct io_wq *wq = tctx->io_wq; 9706 struct io_tctx_node *node; 9707 unsigned long index; 9708 9709 xa_for_each(&tctx->xa, index, node) { 9710 io_uring_del_tctx_node(index); 9711 cond_resched(); 9712 } 9713 if (wq) { 9714 /* 9715 * Must be after io_uring_del_task_file() (removes nodes under 9716 * uring_lock) to avoid race with io_uring_try_cancel_iowq(). 
9717 */ 9718 io_wq_put_and_exit(wq); 9719 tctx->io_wq = NULL; 9720 } 9721} 9722 9723static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) 9724{ 9725 if (tracked) 9726 return atomic_read(&tctx->inflight_tracked); 9727 return percpu_counter_sum(&tctx->inflight); 9728} 9729 9730static void io_uring_drop_tctx_refs(struct task_struct *task) 9731{ 9732 struct io_uring_task *tctx = task->io_uring; 9733 unsigned int refs = tctx->cached_refs; 9734 9735 if (refs) { 9736 tctx->cached_refs = 0; 9737 percpu_counter_sub(&tctx->inflight, refs); 9738 put_task_struct_many(task, refs); 9739 } 9740} 9741 9742/* 9743 * Find any io_uring ctx that this task has registered or done IO on, and cancel 9744 * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation. 9745 */ 9746static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) 9747{ 9748 struct io_uring_task *tctx = current->io_uring; 9749 struct io_ring_ctx *ctx; 9750 s64 inflight; 9751 DEFINE_WAIT(wait); 9752 9753 WARN_ON_ONCE(sqd && sqd->thread != current); 9754 9755 if (!current->io_uring) 9756 return; 9757 if (tctx->io_wq) 9758 io_wq_exit_start(tctx->io_wq); 9759 9760 atomic_inc(&tctx->in_idle); 9761 do { 9762 io_uring_drop_tctx_refs(current); 9763 /* read completions before cancelations */ 9764 inflight = tctx_inflight(tctx, !cancel_all); 9765 if (!inflight) 9766 break; 9767 9768 if (!sqd) { 9769 struct io_tctx_node *node; 9770 unsigned long index; 9771 9772 xa_for_each(&tctx->xa, index, node) { 9773 /* sqpoll task will cancel all its requests */ 9774 if (node->ctx->sq_data) 9775 continue; 9776 io_uring_try_cancel_requests(node->ctx, current, 9777 cancel_all); 9778 } 9779 } else { 9780 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 9781 io_uring_try_cancel_requests(ctx, current, 9782 cancel_all); 9783 } 9784 9785 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); 9786 io_uring_drop_tctx_refs(current); 9787 /* 9788 * If we've seen completions, retry without waiting. This 9789 * avoids a race where a completion comes in before we did 9790 * prepare_to_wait(). 
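		 * Only sleep when the inflight count is unchanged after prepare_to_wait().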
9791 */ 9792 if (inflight == tctx_inflight(tctx, !cancel_all)) 9793 schedule(); 9794 finish_wait(&tctx->wait, &wait); 9795 } while (1); 9796 atomic_dec(&tctx->in_idle); 9797 9798 io_uring_clean_tctx(tctx); 9799 if (cancel_all) { 9800 /* for exec all current's requests should be gone, kill tctx */ 9801 __io_uring_free(current); 9802 } 9803} 9804 9805void __io_uring_cancel(bool cancel_all) 9806{ 9807 io_uring_cancel_generic(cancel_all, NULL); 9808} 9809 9810static void *io_uring_validate_mmap_request(struct file *file, 9811 loff_t pgoff, size_t sz) 9812{ 9813 struct io_ring_ctx *ctx = file->private_data; 9814 loff_t offset = pgoff << PAGE_SHIFT; 9815 struct page *page; 9816 void *ptr; 9817 9818 switch (offset) { 9819 case IORING_OFF_SQ_RING: 9820 case IORING_OFF_CQ_RING: 9821 ptr = ctx->rings; 9822 break; 9823 case IORING_OFF_SQES: 9824 ptr = ctx->sq_sqes; 9825 break; 9826 default: 9827 return ERR_PTR(-EINVAL); 9828 } 9829 9830 page = virt_to_head_page(ptr); 9831 if (sz > page_size(page)) 9832 return ERR_PTR(-EINVAL); 9833 9834 return ptr; 9835} 9836 9837#ifdef CONFIG_MMU 9838 9839static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 9840{ 9841 size_t sz = vma->vm_end - vma->vm_start; 9842 unsigned long pfn; 9843 void *ptr; 9844 9845 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); 9846 if (IS_ERR(ptr)) 9847 return PTR_ERR(ptr); 9848 9849 pfn = virt_to_phys(ptr) >> PAGE_SHIFT; 9850 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); 9851} 9852 9853#else /* !CONFIG_MMU */ 9854 9855static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 9856{ 9857 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL; 9858} 9859 9860static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) 9861{ 9862 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; 9863} 9864 9865static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, 9866 unsigned long addr, unsigned long len, 9867 unsigned long pgoff, unsigned long flags) 9868{ 9869 void *ptr; 9870 9871 ptr = io_uring_validate_mmap_request(file, pgoff, len); 9872 if (IS_ERR(ptr)) 9873 return PTR_ERR(ptr); 9874 9875 return (unsigned long) ptr; 9876} 9877 9878#endif /* !CONFIG_MMU */ 9879 9880static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) 9881{ 9882 DEFINE_WAIT(wait); 9883 9884 do { 9885 if (!io_sqring_full(ctx)) 9886 break; 9887 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); 9888 9889 if (!io_sqring_full(ctx)) 9890 break; 9891 schedule(); 9892 } while (!signal_pending(current)); 9893 9894 finish_wait(&ctx->sqo_sq_wait, &wait); 9895 return 0; 9896} 9897 9898static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, 9899 struct __kernel_timespec __user **ts, 9900 const sigset_t __user **sig) 9901{ 9902 struct io_uring_getevents_arg arg; 9903 9904 /* 9905 * If EXT_ARG isn't set, then we have no timespec and the argp pointer 9906 * is just a pointer to the sigset_t. 9907 */ 9908 if (!(flags & IORING_ENTER_EXT_ARG)) { 9909 *sig = (const sigset_t __user *) argp; 9910 *ts = NULL; 9911 return 0; 9912 } 9913 9914 /* 9915 * EXT_ARG is set - ensure we agree on the size of it and copy in our 9916 * timespec and sigset_t pointers if good. 
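	 * arg.sigmask_sz is passed back through *argsz alongside the sigmask pointer.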
9917 */ 9918 if (*argsz != sizeof(arg)) 9919 return -EINVAL; 9920 if (copy_from_user(&arg, argp, sizeof(arg))) 9921 return -EFAULT; 9922 *sig = u64_to_user_ptr(arg.sigmask); 9923 *argsz = arg.sigmask_sz; 9924 *ts = u64_to_user_ptr(arg.ts); 9925 return 0; 9926} 9927 9928SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, 9929 u32, min_complete, u32, flags, const void __user *, argp, 9930 size_t, argsz) 9931{ 9932 struct io_ring_ctx *ctx; 9933 int submitted = 0; 9934 struct fd f; 9935 long ret; 9936 9937 io_run_task_work(); 9938 9939 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 9940 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) 9941 return -EINVAL; 9942 9943 f = fdget(fd); 9944 if (unlikely(!f.file)) 9945 return -EBADF; 9946 9947 ret = -EOPNOTSUPP; 9948 if (unlikely(f.file->f_op != &io_uring_fops)) 9949 goto out_fput; 9950 9951 ret = -ENXIO; 9952 ctx = f.file->private_data; 9953 if (unlikely(!percpu_ref_tryget(&ctx->refs))) 9954 goto out_fput; 9955 9956 ret = -EBADFD; 9957 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) 9958 goto out; 9959 9960 /* 9961 * For SQ polling, the thread will do all submissions and completions. 9962 * Just return the requested submit count, and wake the thread if 9963 * we were asked to. 9964 */ 9965 ret = 0; 9966 if (ctx->flags & IORING_SETUP_SQPOLL) { 9967 io_cqring_overflow_flush(ctx); 9968 9969 if (unlikely(ctx->sq_data->thread == NULL)) { 9970 ret = -EOWNERDEAD; 9971 goto out; 9972 } 9973 if (flags & IORING_ENTER_SQ_WAKEUP) 9974 wake_up(&ctx->sq_data->wait); 9975 if (flags & IORING_ENTER_SQ_WAIT) { 9976 ret = io_sqpoll_wait_sq(ctx); 9977 if (ret) 9978 goto out; 9979 } 9980 submitted = to_submit; 9981 } else if (to_submit) { 9982 ret = io_uring_add_tctx_node(ctx); 9983 if (unlikely(ret)) 9984 goto out; 9985 mutex_lock(&ctx->uring_lock); 9986 submitted = io_submit_sqes(ctx, to_submit); 9987 mutex_unlock(&ctx->uring_lock); 9988 9989 if (submitted != to_submit) 9990 goto out; 9991 } 9992 if (flags & IORING_ENTER_GETEVENTS) { 9993 const sigset_t __user *sig; 9994 struct __kernel_timespec __user *ts; 9995 9996 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 9997 if (unlikely(ret)) 9998 goto out; 9999 10000 min_complete = min(min_complete, ctx->cq_entries); 10001 10002 /* 10003 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user 10004 * space applications don't need to do io completion events 10005 * polling again, they can rely on io_sq_thread to do polling 10006 * work, which can reduce cpu usage and uring_lock contention. 10007 */ 10008 if (ctx->flags & IORING_SETUP_IOPOLL && 10009 !(ctx->flags & IORING_SETUP_SQPOLL)) { 10010 ret = io_iopoll_check(ctx, min_complete); 10011 } else { 10012 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); 10013 } 10014 } 10015 10016out: 10017 percpu_ref_put(&ctx->refs); 10018out_fput: 10019 fdput(f); 10020 return submitted ? 
submitted : ret; 10021} 10022 10023#ifdef CONFIG_PROC_FS 10024static int io_uring_show_cred(struct seq_file *m, unsigned int id, 10025 const struct cred *cred) 10026{ 10027 struct user_namespace *uns = seq_user_ns(m); 10028 struct group_info *gi; 10029 kernel_cap_t cap; 10030 unsigned __capi; 10031 int g; 10032 10033 seq_printf(m, "%5d\n", id); 10034 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); 10035 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); 10036 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); 10037 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); 10038 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); 10039 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); 10040 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); 10041 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); 10042 seq_puts(m, "\n\tGroups:\t"); 10043 gi = cred->group_info; 10044 for (g = 0; g < gi->ngroups; g++) { 10045 seq_put_decimal_ull(m, g ? " " : "", 10046 from_kgid_munged(uns, gi->gid[g])); 10047 } 10048 seq_puts(m, "\n\tCapEff:\t"); 10049 cap = cred->cap_effective; 10050 CAP_FOR_EACH_U32(__capi) 10051 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); 10052 seq_putc(m, '\n'); 10053 return 0; 10054} 10055 10056static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) 10057{ 10058 struct io_sq_data *sq = NULL; 10059 bool has_lock; 10060 int i; 10061 10062 /* 10063 * Avoid ABBA deadlock between the seq lock and the io_uring mutex, 10064 * since fdinfo case grabs it in the opposite direction of normal use 10065 * cases. If we fail to get the lock, we just don't iterate any 10066 * structures that could be going away outside the io_uring mutex. 10067 */ 10068 has_lock = mutex_trylock(&ctx->uring_lock); 10069 10070 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { 10071 sq = ctx->sq_data; 10072 if (!sq->thread) 10073 sq = NULL; 10074 } 10075 10076 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); 10077 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? 
task_cpu(sq->thread) : -1); 10078 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); 10079 for (i = 0; has_lock && i < ctx->nr_user_files; i++) { 10080 struct file *f = io_file_from_index(ctx, i); 10081 10082 if (f) 10083 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); 10084 else 10085 seq_printf(m, "%5u: <none>\n", i); 10086 } 10087 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); 10088 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { 10089 struct io_mapped_ubuf *buf = ctx->user_bufs[i]; 10090 unsigned int len = buf->ubuf_end - buf->ubuf; 10091 10092 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); 10093 } 10094 if (has_lock && !xa_empty(&ctx->personalities)) { 10095 unsigned long index; 10096 const struct cred *cred; 10097 10098 seq_printf(m, "Personalities:\n"); 10099 xa_for_each(&ctx->personalities, index, cred) 10100 io_uring_show_cred(m, index, cred); 10101 } 10102 seq_printf(m, "PollList:\n"); 10103 spin_lock(&ctx->completion_lock); 10104 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 10105 struct hlist_head *list = &ctx->cancel_hash[i]; 10106 struct io_kiocb *req; 10107 10108 hlist_for_each_entry(req, list, hash_node) 10109 seq_printf(m, " op=%d, task_works=%d\n", req->opcode, 10110 req->task->task_works != NULL); 10111 } 10112 spin_unlock(&ctx->completion_lock); 10113 if (has_lock) 10114 mutex_unlock(&ctx->uring_lock); 10115} 10116 10117static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) 10118{ 10119 struct io_ring_ctx *ctx = f->private_data; 10120 10121 if (percpu_ref_tryget(&ctx->refs)) { 10122 __io_uring_show_fdinfo(ctx, m); 10123 percpu_ref_put(&ctx->refs); 10124 } 10125} 10126#endif 10127 10128static const struct file_operations io_uring_fops = { 10129 .release = io_uring_release, 10130 .mmap = io_uring_mmap, 10131#ifndef CONFIG_MMU 10132 .get_unmapped_area = io_uring_nommu_get_unmapped_area, 10133 .mmap_capabilities = io_uring_nommu_mmap_capabilities, 10134#endif 10135 .poll = io_uring_poll, 10136#ifdef CONFIG_PROC_FS 10137 .show_fdinfo = io_uring_show_fdinfo, 10138#endif 10139}; 10140 10141static int io_allocate_scq_urings(struct io_ring_ctx *ctx, 10142 struct io_uring_params *p) 10143{ 10144 struct io_rings *rings; 10145 size_t size, sq_array_offset; 10146 10147 /* make sure these are sane, as we already accounted them */ 10148 ctx->sq_entries = p->sq_entries; 10149 ctx->cq_entries = p->cq_entries; 10150 10151 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); 10152 if (size == SIZE_MAX) 10153 return -EOVERFLOW; 10154 10155 rings = io_mem_alloc(size); 10156 if (!rings) 10157 return -ENOMEM; 10158 10159 ctx->rings = rings; 10160 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 10161 rings->sq_ring_mask = p->sq_entries - 1; 10162 rings->cq_ring_mask = p->cq_entries - 1; 10163 rings->sq_ring_entries = p->sq_entries; 10164 rings->cq_ring_entries = p->cq_entries; 10165 10166 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); 10167 if (size == SIZE_MAX) { 10168 io_mem_free(ctx->rings); 10169 ctx->rings = NULL; 10170 return -EOVERFLOW; 10171 } 10172 10173 ctx->sq_sqes = io_mem_alloc(size); 10174 if (!ctx->sq_sqes) { 10175 io_mem_free(ctx->rings); 10176 ctx->rings = NULL; 10177 return -ENOMEM; 10178 } 10179 10180 return 0; 10181} 10182 10183static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) 10184{ 10185 int ret, fd; 10186 10187 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); 10188 if (fd < 0) 10189 return fd; 10190 10191 ret = io_uring_add_tctx_node(ctx); 10192 if (ret) { 
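		/* tctx node setup failed, put back the unused fd */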
		put_unused_fd(fd);
		return ret;
	}
	fd_install(fd, file);
	return fd;
}

/*
 * Allocate an anonymous fd, this is what constitutes the application
 * visible backing of an io_uring instance. The application mmaps this
 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
 * we have to tie this fd to a socket for file garbage collection purposes.
 */
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{
	struct file *file;
#if defined(CONFIG_UNIX)
	int ret;

	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
				&ctx->ring_sock);
	if (ret)
		return ERR_PTR(ret);
#endif

	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
					O_RDWR | O_CLOEXEC);
#if defined(CONFIG_UNIX)
	if (IS_ERR(file)) {
		sock_release(ctx->ring_sock);
		ctx->ring_sock = NULL;
	} else {
		ctx->ring_sock->file = file;
	}
#endif
	return file;
}

static int io_uring_create(unsigned entries, struct io_uring_params *p,
			   struct io_uring_params __user *params)
{
	struct io_ring_ctx *ctx;
	struct file *file;
	int ret;

	if (!entries)
		return -EINVAL;
	if (entries > IORING_MAX_ENTRIES) {
		if (!(p->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		entries = IORING_MAX_ENTRIES;
	}

	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
	 * since the sqes are only used at submission time. This allows for
	 * some flexibility in overcommitting a bit. If the application has
	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
	 * of CQ ring entries manually.
	 */
	p->sq_entries = roundup_pow_of_two(entries);
	if (p->flags & IORING_SETUP_CQSIZE) {
		/*
		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
		 * to a power-of-two, if it isn't already. We do NOT impose
		 * any cq vs sq ring sizing.
		 */
		if (!p->cq_entries)
			return -EINVAL;
		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
			if (!(p->flags & IORING_SETUP_CLAMP))
				return -EINVAL;
			p->cq_entries = IORING_MAX_CQ_ENTRIES;
		}
		p->cq_entries = roundup_pow_of_two(p->cq_entries);
		if (p->cq_entries < p->sq_entries)
			return -EINVAL;
	} else {
		p->cq_entries = 2 * p->sq_entries;
	}

	ctx = io_ring_ctx_alloc(p);
	if (!ctx)
		return -ENOMEM;
	ctx->compat = in_compat_syscall();
	if (!capable(CAP_IPC_LOCK))
		ctx->user = get_uid(current_user());

	/*
	 * This is just grabbed for accounting purposes. When a process exits,
	 * the mm is exited and dropped before the files, hence we need to hang
	 * on to this mm purely for the purposes of being able to unaccount
	 * memory (locked/pinned vm). It's not used for anything else.
	 */
	mmgrab(current->mm);
	ctx->mm_account = current->mm;

	ret = io_allocate_scq_urings(ctx, p);
	if (ret)
		goto err;

	ret = io_sq_offload_create(ctx, p);
	if (ret)
		goto err;
	/* always set a rsrc node */
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		goto err;
	io_rsrc_node_switch(ctx, NULL);

	memset(&p->sq_off, 0, sizeof(p->sq_off));
	p->sq_off.head = offsetof(struct io_rings, sq.head);
	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;

	memset(&p->cq_off, 0, sizeof(p->cq_off));
	p->cq_off.head = offsetof(struct io_rings, cq.head);
	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
	p->cq_off.cqes = offsetof(struct io_rings, cqes);
	p->cq_off.flags = offsetof(struct io_rings, cq_flags);

	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
			IORING_FEAT_RSRC_TAGS;

	if (copy_to_user(params, p, sizeof(*p))) {
		ret = -EFAULT;
		goto err;
	}

	file = io_uring_get_file(ctx);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto err;
	}

	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
	ret = io_uring_install_fd(ctx, file);
	if (ret < 0) {
		/* fput will clean it up */
		fput(file);
		return ret;
	}

	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
	return ret;
err:
	io_ring_ctx_wait_and_kill(ctx);
	return ret;
}

/*
 * Sets up an io_uring context, and returns the fd. Applications ask for a
 * ring size; we return the actual sq/cq ring sizes (among other things) in
 * the params structure passed in.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
	struct io_uring_params p;
	int i;

	if (copy_from_user(&p, params, sizeof(p)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
		if (p.resv[i])
			return -EINVAL;
	}

	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
			IORING_SETUP_R_DISABLED))
		return -EINVAL;

	return io_uring_create(entries, &p, params);
}

SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}
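
/*
 * Illustrative userspace sketch (not part of the kernel build, error handling
 * mostly elided) of how an application might call the io_uring_setup syscall
 * above and then mmap the rings using the offsets returned in struct
 * io_uring_params. It assumes the uapi <linux/io_uring.h>, <sys/mman.h> and
 * <sys/syscall.h> headers; most applications use liburing instead of raw
 * syscalls. Note how the sizing logic in io_uring_create() rounds a request
 * of, say, 100 entries up to sq_entries == 128 and (absent
 * IORING_SETUP_CQSIZE) cq_entries == 256.
 *
 *	#include <linux/io_uring.h>
 *	#include <string.h>
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int setup_rings(unsigned entries, struct io_uring_params *p,
 *			       void **sq_ring, void **cq_ring, void **sqes)
 *	{
 *		int fd;
 *
 *		memset(p, 0, sizeof(*p));
 *		fd = syscall(__NR_io_uring_setup, entries, p);
 *		if (fd < 0)
 *			return -1;
 *
 *		// SQ ring: head/tail, flags and the sqe index array
 *		*sq_ring = mmap(NULL, p->sq_off.array + p->sq_entries * sizeof(__u32),
 *				PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *				fd, IORING_OFF_SQ_RING);
 *		// CQ ring: head/tail plus the CQE array itself
 *		*cq_ring = mmap(NULL, p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe),
 *				PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *				fd, IORING_OFF_CQ_RING);
 *		// the SQE array is a separate mapping
 *		*sqes = mmap(NULL, p->sq_entries * sizeof(struct io_uring_sqe),
 *			     PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			     fd, IORING_OFF_SQES);
 *		return fd;
 *	}
 */
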
static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_op_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}
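
/*
 * Illustrative userspace sketch (not part of the kernel build) of driving
 * io_probe() above through io_uring_register(IORING_REGISTER_PROBE): the
 * caller passes a zeroed struct io_uring_probe with room for nr_args ops and
 * the kernel reports which opcodes are supported. A raw syscall() is assumed;
 * liburing wraps this as io_uring_get_probe().
 *
 *	#include <linux/io_uring.h>
 *	#include <stdlib.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int opcode_supported(int ring_fd, int opcode)
 *	{
 *		size_t len = sizeof(struct io_uring_probe) +
 *			     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
 *		struct io_uring_probe *probe = calloc(1, len);	// must be zeroed
 *		int ret, supported = 0;
 *
 *		if (!probe)
 *			return 0;
 *		ret = syscall(__NR_io_uring_register, ring_fd,
 *			      IORING_REGISTER_PROBE, probe, IORING_OP_LAST);
 *		if (!ret && opcode <= probe->last_op &&
 *		    (probe->ops[opcode].flags & IO_URING_OP_SUPPORTED))
 *			supported = 1;
 *		free(probe);
 *		return supported;
 *	}
 */
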
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
				    unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
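
/*
 * Illustrative userspace sketch (not part of the kernel build) of the
 * restriction flow implemented above: create the ring with
 * IORING_SETUP_R_DISABLED, register the allowed restrictions while it is
 * still disabled, then enable it with IORING_REGISTER_ENABLE_RINGS. Raw
 * syscalls are assumed, and the particular restrictions chosen here are
 * only an example.
 *
 *	#include <linux/io_uring.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int restrict_and_enable(int ring_fd)
 *	{
 *		struct io_uring_restriction res[2] = {
 *			{ .opcode = IORING_RESTRICTION_SQE_OP,
 *			  .sqe_op = IORING_OP_READV },
 *			{ .opcode = IORING_RESTRICTION_SQE_FLAGS_ALLOWED,
 *			  .sqe_flags = IOSQE_FIXED_FILE },
 *		};
 *		int ret;
 *
 *		ret = syscall(__NR_io_uring_register, ring_fd,
 *			      IORING_REGISTER_RESTRICTIONS, res, 2);
 *		if (ret)
 *			return ret;
 *		// restrictions only take effect once the ring is enabled
 *		return syscall(__NR_io_uring_register, ring_fd,
 *			       IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 *	}
 */
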
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;
	int err;

	if (up->resv)
		return -EINVAL;
	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;
	err = io_rsrc_node_switch_start(ctx);
	if (err)
		return err;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
				    unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
				   unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv || rr.resv2)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
				unsigned len)
{
	struct io_uring_task *tctx = current->io_uring;
	cpumask_var_t new_mask;
	int ret;

	if (!tctx || !tctx->io_wq)
		return -EINVAL;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

	if (copy_from_user(new_mask, arg, len)) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	struct io_uring_task *tctx = current->io_uring;

	if (!tctx || !tctx->io_wq)
		return -EINVAL;

	return io_wq_cpu_affinity(tctx->io_wq, NULL);
}

static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					void __user *arg)
{
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	ret = -EINVAL;
	if (!tctx || !tctx->io_wq)
		goto err;

	ret = io_wq_max_workers(tctx->io_wq, new_count);
	if (ret)
		goto err;

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}
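
/*
 * Illustrative userspace sketch (not part of the kernel build) of
 * IORING_REGISTER_IOWQ_MAX_WORKERS as handled above: userspace passes two
 * __u32 values (bounded and unbounded io-wq worker limits). A value of 0 is
 * taken as "leave this limit unchanged", and the previous limits are copied
 * back to the same array on return. A raw syscall is assumed.
 *
 *	#include <linux/io_uring.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int cap_iowq_workers(int ring_fd, unsigned int bounded,
 *				    unsigned int unbounded)
 *	{
 *		__u32 limits[2] = { bounded, unbounded };
 *
 *		// on success, limits[] now holds the previous values
 *		return syscall(__NR_io_uring_register, ring_fd,
 *			       IORING_REGISTER_IOWQ_MAX_WORKERS, limits, 2);
 *	}
 */
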
static bool io_register_op_must_quiesce(int op)
{
	switch (op) {
	case IORING_REGISTER_BUFFERS:
	case IORING_UNREGISTER_BUFFERS:
	case IORING_REGISTER_FILES:
	case IORING_UNREGISTER_FILES:
	case IORING_REGISTER_FILES_UPDATE:
	case IORING_REGISTER_PROBE:
	case IORING_REGISTER_PERSONALITY:
	case IORING_UNREGISTER_PERSONALITY:
	case IORING_REGISTER_FILES2:
	case IORING_REGISTER_FILES_UPDATE2:
	case IORING_REGISTER_BUFFERS2:
	case IORING_REGISTER_BUFFERS_UPDATE:
	case IORING_REGISTER_IOWQ_AFF:
	case IORING_UNREGISTER_IOWQ_AFF:
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		return false;
	default:
		return true;
	}
}

static int io_ctx_quiesce(struct io_ring_ctx *ctx)
{
	long ret;

	percpu_ref_kill(&ctx->refs);

	/*
	 * Drop uring mutex before waiting for references to exit. If another
	 * thread is currently inside io_uring_enter() it might need to grab the
	 * uring_lock to make progress. If we hold it here across the drain
	 * wait, then we can deadlock. It's safe to drop the mutex here, since
	 * no new references will come in after we've killed the percpu ref.
	 */
	mutex_unlock(&ctx->uring_lock);
	do {
		ret = wait_for_completion_interruptible(&ctx->ref_comp);
		if (!ret)
			break;
		ret = io_run_task_work_sig();
	} while (ret >= 0);
	mutex_lock(&ctx->uring_lock);

	if (ret)
		io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
	return ret;
}

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We're inside the ring mutex, if the ref is already dying, then
	 * someone else killed the ctx or is already going through
	 * io_uring_register().
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return -ENXIO;

	if (ctx->restricted) {
		if (opcode >= IORING_REGISTER_LAST)
			return -EINVAL;
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	if (io_register_op_must_quiesce(opcode)) {
		ret = io_ctx_quiesce(ctx);
		if (ret)
			return ret;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg);
		if (ret)
			break;
		if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
			ctx->eventfd_async = 1;
		else
			ctx->eventfd_async = 0;
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (io_register_op_must_quiesce(opcode)) {
		/* bring the ctx back to life */
		percpu_ref_reinit(&ctx->refs);
		reinit_completion(&ctx->ref_comp);
	}
	return ret;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct fd f;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	ret = -EOPNOTSUPP;
	if (f.file->f_op != &io_uring_fops)
		goto out_fput;

	ctx = f.file->private_data;

	io_run_task_work();

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
				ctx->cq_ev_fd != NULL, ret);
out_fput:
	fdput(f);
	return ret;
}

static int __init io_uring_init(void)
{
#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
} while (0)

#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
	BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
	BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
	BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
	BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
	BUILD_BUG_SQE_ELEM(8,  __u64,  off);
	BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
	BUILD_BUG_SQE_ELEM(16, __u64,  addr);
	BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
	BUILD_BUG_SQE_ELEM(24, __u32,  len);
	BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
	BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
	BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
	BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
	BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);

	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
		     sizeof(struct io_uring_rsrc_update));
	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
		     sizeof(struct io_uring_rsrc_update2));

	/* ->buf_index is u16 */
	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	/* should fit into one byte */
	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));

	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));

	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				SLAB_ACCOUNT);
	return 0;
};
__initcall(io_uring_init);