Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v5.17-rc7 (11250 lines, 285 kB)
// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
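 *
 * As a rough illustration of the SQ-side pairing described above (the
 * helper names and ring pointers are placeholders, not part of this
 * file; real applications should rely on liburing instead):
 *
 *	head = load_acquire(sq_ring_head);	// e.g. a C11 acquire load
 *	if (local_tail - head < sq_ring_entries) {
 *		idx = local_tail & sq_ring_mask;
 *		fill_sqe(&sqes[idx]);		// plain stores into the SQE
 *		sq_array[idx] = idx;
 *		store_release(sq_ring_tail, ++local_tail);
 *	}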
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/blk-mq.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/security.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "internal.h"
#include "io-wq.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
#define IORING_SQPOLL_CAP_ENTRIES_VALUE	8

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 15)
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)

#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)

#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
			 IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)

#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
			    REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
			    REQ_F_ASYNC_DATA)

#define IO_TCTX_REFS_CACHE_NR	(1U << 10)

struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring		sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32			sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e.
get number of "new events" by comparing to 155 * cached value). 156 * 157 * After a new SQ head value was read by the application this 158 * counter includes all submissions that were dropped reaching 159 * the new SQ head (and possibly more). 160 */ 161 u32 sq_dropped; 162 /* 163 * Runtime SQ flags 164 * 165 * Written by the kernel, shouldn't be modified by the 166 * application. 167 * 168 * The application needs a full memory barrier before checking 169 * for IORING_SQ_NEED_WAKEUP after updating the sq tail. 170 */ 171 u32 sq_flags; 172 /* 173 * Runtime CQ flags 174 * 175 * Written by the application, shouldn't be modified by the 176 * kernel. 177 */ 178 u32 cq_flags; 179 /* 180 * Number of completion events lost because the queue was full; 181 * this should be avoided by the application by making sure 182 * there are not more requests pending than there is space in 183 * the completion queue. 184 * 185 * Written by the kernel, shouldn't be modified by the 186 * application (i.e. get number of "new events" by comparing to 187 * cached value). 188 * 189 * As completion events come in out of order this counter is not 190 * ordered with any other data. 191 */ 192 u32 cq_overflow; 193 /* 194 * Ring buffer of completion events. 195 * 196 * The kernel writes completion events fresh every time they are 197 * produced, so the application is allowed to modify pending 198 * entries. 199 */ 200 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; 201}; 202 203enum io_uring_cmd_flags { 204 IO_URING_F_COMPLETE_DEFER = 1, 205 IO_URING_F_UNLOCKED = 2, 206 /* int's last bit, sign checks are usually faster than a bit test */ 207 IO_URING_F_NONBLOCK = INT_MIN, 208}; 209 210struct io_mapped_ubuf { 211 u64 ubuf; 212 u64 ubuf_end; 213 unsigned int nr_bvecs; 214 unsigned long acct_pages; 215 struct bio_vec bvec[]; 216}; 217 218struct io_ring_ctx; 219 220struct io_overflow_cqe { 221 struct io_uring_cqe cqe; 222 struct list_head list; 223}; 224 225struct io_fixed_file { 226 /* file * with additional FFS_* flags */ 227 unsigned long file_ptr; 228}; 229 230struct io_rsrc_put { 231 struct list_head list; 232 u64 tag; 233 union { 234 void *rsrc; 235 struct file *file; 236 struct io_mapped_ubuf *buf; 237 }; 238}; 239 240struct io_file_table { 241 struct io_fixed_file *files; 242}; 243 244struct io_rsrc_node { 245 struct percpu_ref refs; 246 struct list_head node; 247 struct list_head rsrc_list; 248 struct io_rsrc_data *rsrc_data; 249 struct llist_node llist; 250 bool done; 251}; 252 253typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); 254 255struct io_rsrc_data { 256 struct io_ring_ctx *ctx; 257 258 u64 **tags; 259 unsigned int nr; 260 rsrc_put_fn *do_put; 261 atomic_t refs; 262 struct completion done; 263 bool quiesce; 264}; 265 266struct io_buffer { 267 struct list_head list; 268 __u64 addr; 269 __u32 len; 270 __u16 bid; 271}; 272 273struct io_restriction { 274 DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); 275 DECLARE_BITMAP(sqe_op, IORING_OP_LAST); 276 u8 sqe_flags_allowed; 277 u8 sqe_flags_required; 278 bool registered; 279}; 280 281enum { 282 IO_SQ_THREAD_SHOULD_STOP = 0, 283 IO_SQ_THREAD_SHOULD_PARK, 284}; 285 286struct io_sq_data { 287 refcount_t refs; 288 atomic_t park_pending; 289 struct mutex lock; 290 291 /* ctx's that are using this sqd */ 292 struct list_head ctx_list; 293 294 struct task_struct *thread; 295 struct wait_queue_head wait; 296 297 unsigned sq_thread_idle; 298 int sq_cpu; 299 pid_t task_pid; 300 pid_t task_tgid; 301 302 unsigned long state; 303 struct 
completion exited; 304}; 305 306#define IO_COMPL_BATCH 32 307#define IO_REQ_CACHE_SIZE 32 308#define IO_REQ_ALLOC_BATCH 8 309 310struct io_submit_link { 311 struct io_kiocb *head; 312 struct io_kiocb *last; 313}; 314 315struct io_submit_state { 316 /* inline/task_work completion list, under ->uring_lock */ 317 struct io_wq_work_node free_list; 318 /* batch completion logic */ 319 struct io_wq_work_list compl_reqs; 320 struct io_submit_link link; 321 322 bool plug_started; 323 bool need_plug; 324 bool flush_cqes; 325 unsigned short submit_nr; 326 struct blk_plug plug; 327}; 328 329struct io_ring_ctx { 330 /* const or read-mostly hot data */ 331 struct { 332 struct percpu_ref refs; 333 334 struct io_rings *rings; 335 unsigned int flags; 336 unsigned int compat: 1; 337 unsigned int drain_next: 1; 338 unsigned int eventfd_async: 1; 339 unsigned int restricted: 1; 340 unsigned int off_timeout_used: 1; 341 unsigned int drain_active: 1; 342 unsigned int drain_disabled: 1; 343 } ____cacheline_aligned_in_smp; 344 345 /* submission data */ 346 struct { 347 struct mutex uring_lock; 348 349 /* 350 * Ring buffer of indices into array of io_uring_sqe, which is 351 * mmapped by the application using the IORING_OFF_SQES offset. 352 * 353 * This indirection could e.g. be used to assign fixed 354 * io_uring_sqe entries to operations and only submit them to 355 * the queue when needed. 356 * 357 * The kernel modifies neither the indices array nor the entries 358 * array. 359 */ 360 u32 *sq_array; 361 struct io_uring_sqe *sq_sqes; 362 unsigned cached_sq_head; 363 unsigned sq_entries; 364 struct list_head defer_list; 365 366 /* 367 * Fixed resources fast path, should be accessed only under 368 * uring_lock, and updated through io_uring_register(2) 369 */ 370 struct io_rsrc_node *rsrc_node; 371 int rsrc_cached_refs; 372 struct io_file_table file_table; 373 unsigned nr_user_files; 374 unsigned nr_user_bufs; 375 struct io_mapped_ubuf **user_bufs; 376 377 struct io_submit_state submit_state; 378 struct list_head timeout_list; 379 struct list_head ltimeout_list; 380 struct list_head cq_overflow_list; 381 struct xarray io_buffers; 382 struct xarray personalities; 383 u32 pers_next; 384 unsigned sq_thread_idle; 385 } ____cacheline_aligned_in_smp; 386 387 /* IRQ completion list, under ->completion_lock */ 388 struct io_wq_work_list locked_free_list; 389 unsigned int locked_free_nr; 390 391 const struct cred *sq_creds; /* cred used for __io_sq_thread() */ 392 struct io_sq_data *sq_data; /* if using sq thread polling */ 393 394 struct wait_queue_head sqo_sq_wait; 395 struct list_head sqd_list; 396 397 unsigned long check_cq_overflow; 398 399 struct { 400 unsigned cached_cq_tail; 401 unsigned cq_entries; 402 struct eventfd_ctx *cq_ev_fd; 403 struct wait_queue_head cq_wait; 404 unsigned cq_extra; 405 atomic_t cq_timeouts; 406 unsigned cq_last_tm_flush; 407 } ____cacheline_aligned_in_smp; 408 409 struct { 410 spinlock_t completion_lock; 411 412 spinlock_t timeout_lock; 413 414 /* 415 * ->iopoll_list is protected by the ctx->uring_lock for 416 * io_uring instances that don't use IORING_SETUP_SQPOLL. 417 * For SQPOLL, only the single threaded io_sq_thread() will 418 * manipulate the list, hence no extra locking is needed there. 
419 */ 420 struct io_wq_work_list iopoll_list; 421 struct hlist_head *cancel_hash; 422 unsigned cancel_hash_bits; 423 bool poll_multi_queue; 424 } ____cacheline_aligned_in_smp; 425 426 struct io_restriction restrictions; 427 428 /* slow path rsrc auxilary data, used by update/register */ 429 struct { 430 struct io_rsrc_node *rsrc_backup_node; 431 struct io_mapped_ubuf *dummy_ubuf; 432 struct io_rsrc_data *file_data; 433 struct io_rsrc_data *buf_data; 434 435 struct delayed_work rsrc_put_work; 436 struct llist_head rsrc_put_llist; 437 struct list_head rsrc_ref_list; 438 spinlock_t rsrc_ref_lock; 439 }; 440 441 /* Keep this last, we don't need it for the fast path */ 442 struct { 443 #if defined(CONFIG_UNIX) 444 struct socket *ring_sock; 445 #endif 446 /* hashed buffered write serialization */ 447 struct io_wq_hash *hash_map; 448 449 /* Only used for accounting purposes */ 450 struct user_struct *user; 451 struct mm_struct *mm_account; 452 453 /* ctx exit and cancelation */ 454 struct llist_head fallback_llist; 455 struct delayed_work fallback_work; 456 struct work_struct exit_work; 457 struct list_head tctx_list; 458 struct completion ref_comp; 459 u32 iowq_limits[2]; 460 bool iowq_limits_set; 461 }; 462}; 463 464struct io_uring_task { 465 /* submission side */ 466 int cached_refs; 467 struct xarray xa; 468 struct wait_queue_head wait; 469 const struct io_ring_ctx *last; 470 struct io_wq *io_wq; 471 struct percpu_counter inflight; 472 atomic_t inflight_tracked; 473 atomic_t in_idle; 474 475 spinlock_t task_lock; 476 struct io_wq_work_list task_list; 477 struct io_wq_work_list prior_task_list; 478 struct callback_head task_work; 479 bool task_running; 480}; 481 482/* 483 * First field must be the file pointer in all the 484 * iocb unions! See also 'struct kiocb' in <linux/fs.h> 485 */ 486struct io_poll_iocb { 487 struct file *file; 488 struct wait_queue_head *head; 489 __poll_t events; 490 struct wait_queue_entry wait; 491}; 492 493struct io_poll_update { 494 struct file *file; 495 u64 old_user_data; 496 u64 new_user_data; 497 __poll_t events; 498 bool update_events; 499 bool update_user_data; 500}; 501 502struct io_close { 503 struct file *file; 504 int fd; 505 u32 file_slot; 506}; 507 508struct io_timeout_data { 509 struct io_kiocb *req; 510 struct hrtimer timer; 511 struct timespec64 ts; 512 enum hrtimer_mode mode; 513 u32 flags; 514}; 515 516struct io_accept { 517 struct file *file; 518 struct sockaddr __user *addr; 519 int __user *addr_len; 520 int flags; 521 u32 file_slot; 522 unsigned long nofile; 523}; 524 525struct io_sync { 526 struct file *file; 527 loff_t len; 528 loff_t off; 529 int flags; 530 int mode; 531}; 532 533struct io_cancel { 534 struct file *file; 535 u64 addr; 536}; 537 538struct io_timeout { 539 struct file *file; 540 u32 off; 541 u32 target_seq; 542 struct list_head list; 543 /* head of the link, used by linked timeouts only */ 544 struct io_kiocb *head; 545 /* for linked completions */ 546 struct io_kiocb *prev; 547}; 548 549struct io_timeout_rem { 550 struct file *file; 551 u64 addr; 552 553 /* timeout update */ 554 struct timespec64 ts; 555 u32 flags; 556 bool ltimeout; 557}; 558 559struct io_rw { 560 /* NOTE: kiocb has the file as the first member, so don't do it here */ 561 struct kiocb kiocb; 562 u64 addr; 563 u64 len; 564}; 565 566struct io_connect { 567 struct file *file; 568 struct sockaddr __user *addr; 569 int addr_len; 570}; 571 572struct io_sr_msg { 573 struct file *file; 574 union { 575 struct compat_msghdr __user *umsg_compat; 576 struct user_msghdr 
__user *umsg; 577 void __user *buf; 578 }; 579 int msg_flags; 580 int bgid; 581 size_t len; 582}; 583 584struct io_open { 585 struct file *file; 586 int dfd; 587 u32 file_slot; 588 struct filename *filename; 589 struct open_how how; 590 unsigned long nofile; 591}; 592 593struct io_rsrc_update { 594 struct file *file; 595 u64 arg; 596 u32 nr_args; 597 u32 offset; 598}; 599 600struct io_fadvise { 601 struct file *file; 602 u64 offset; 603 u32 len; 604 u32 advice; 605}; 606 607struct io_madvise { 608 struct file *file; 609 u64 addr; 610 u32 len; 611 u32 advice; 612}; 613 614struct io_epoll { 615 struct file *file; 616 int epfd; 617 int op; 618 int fd; 619 struct epoll_event event; 620}; 621 622struct io_splice { 623 struct file *file_out; 624 struct file *file_in; 625 loff_t off_out; 626 loff_t off_in; 627 u64 len; 628 unsigned int flags; 629}; 630 631struct io_provide_buf { 632 struct file *file; 633 __u64 addr; 634 __u32 len; 635 __u32 bgid; 636 __u16 nbufs; 637 __u16 bid; 638}; 639 640struct io_statx { 641 struct file *file; 642 int dfd; 643 unsigned int mask; 644 unsigned int flags; 645 const char __user *filename; 646 struct statx __user *buffer; 647}; 648 649struct io_shutdown { 650 struct file *file; 651 int how; 652}; 653 654struct io_rename { 655 struct file *file; 656 int old_dfd; 657 int new_dfd; 658 struct filename *oldpath; 659 struct filename *newpath; 660 int flags; 661}; 662 663struct io_unlink { 664 struct file *file; 665 int dfd; 666 int flags; 667 struct filename *filename; 668}; 669 670struct io_mkdir { 671 struct file *file; 672 int dfd; 673 umode_t mode; 674 struct filename *filename; 675}; 676 677struct io_symlink { 678 struct file *file; 679 int new_dfd; 680 struct filename *oldpath; 681 struct filename *newpath; 682}; 683 684struct io_hardlink { 685 struct file *file; 686 int old_dfd; 687 int new_dfd; 688 struct filename *oldpath; 689 struct filename *newpath; 690 int flags; 691}; 692 693struct io_async_connect { 694 struct sockaddr_storage address; 695}; 696 697struct io_async_msghdr { 698 struct iovec fast_iov[UIO_FASTIOV]; 699 /* points to an allocated iov, if NULL we use fast_iov instead */ 700 struct iovec *free_iov; 701 struct sockaddr __user *uaddr; 702 struct msghdr msg; 703 struct sockaddr_storage addr; 704}; 705 706struct io_rw_state { 707 struct iov_iter iter; 708 struct iov_iter_state iter_state; 709 struct iovec fast_iov[UIO_FASTIOV]; 710}; 711 712struct io_async_rw { 713 struct io_rw_state s; 714 const struct iovec *free_iovec; 715 size_t bytes_done; 716 struct wait_page_queue wpq; 717}; 718 719enum { 720 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, 721 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, 722 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, 723 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, 724 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, 725 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, 726 REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, 727 728 /* first byte is taken by user flags, shift it to not overlap */ 729 REQ_F_FAIL_BIT = 8, 730 REQ_F_INFLIGHT_BIT, 731 REQ_F_CUR_POS_BIT, 732 REQ_F_NOWAIT_BIT, 733 REQ_F_LINK_TIMEOUT_BIT, 734 REQ_F_NEED_CLEANUP_BIT, 735 REQ_F_POLLED_BIT, 736 REQ_F_BUFFER_SELECTED_BIT, 737 REQ_F_COMPLETE_INLINE_BIT, 738 REQ_F_REISSUE_BIT, 739 REQ_F_CREDS_BIT, 740 REQ_F_REFCOUNT_BIT, 741 REQ_F_ARM_LTIMEOUT_BIT, 742 REQ_F_ASYNC_DATA_BIT, 743 REQ_F_SKIP_LINK_CQES_BIT, 744 /* keep async read/write and isreg together and in order */ 745 REQ_F_SUPPORT_NOWAIT_BIT, 746 REQ_F_ISREG_BIT, 747 748 /* not a real bit, just to check we're not overflowing 
the space */ 749 __REQ_F_LAST_BIT, 750}; 751 752enum { 753 /* ctx owns file */ 754 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), 755 /* drain existing IO first */ 756 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), 757 /* linked sqes */ 758 REQ_F_LINK = BIT(REQ_F_LINK_BIT), 759 /* doesn't sever on completion < 0 */ 760 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), 761 /* IOSQE_ASYNC */ 762 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), 763 /* IOSQE_BUFFER_SELECT */ 764 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), 765 /* IOSQE_CQE_SKIP_SUCCESS */ 766 REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), 767 768 /* fail rest of links */ 769 REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), 770 /* on inflight list, should be cancelled and waited on exit reliably */ 771 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), 772 /* read/write uses file position */ 773 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), 774 /* must not punt to workers */ 775 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), 776 /* has or had linked timeout */ 777 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), 778 /* needs cleanup */ 779 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), 780 /* already went through poll handler */ 781 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), 782 /* buffer already selected */ 783 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), 784 /* completion is deferred through io_comp_state */ 785 REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), 786 /* caller should reissue async */ 787 REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), 788 /* supports async reads/writes */ 789 REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), 790 /* regular file */ 791 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), 792 /* has creds assigned */ 793 REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), 794 /* skip refcounting if not set */ 795 REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), 796 /* there is a linked timeout that has to be armed */ 797 REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), 798 /* ->async_data allocated */ 799 REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), 800 /* don't post CQEs while failing linked requests */ 801 REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), 802}; 803 804struct async_poll { 805 struct io_poll_iocb poll; 806 struct io_poll_iocb *double_poll; 807}; 808 809typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); 810 811struct io_task_work { 812 union { 813 struct io_wq_work_node node; 814 struct llist_node fallback_node; 815 }; 816 io_req_tw_func_t func; 817}; 818 819enum { 820 IORING_RSRC_FILE = 0, 821 IORING_RSRC_BUFFER = 1, 822}; 823 824/* 825 * NOTE! Each of the iocb union members has the file pointer 826 * as the first entry in their struct definition. So you can 827 * access the file pointer through any of the sub-structs, 828 * or directly as just 'ki_filp' in this struct. 
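 *
 * For instance (purely illustrative), these all refer to the same
 * storage, since struct kiocb likewise keeps the file as its first
 * member:
 *
 *	req->file
 *	req->poll.file
 *	req->rw.kiocb.ki_filp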
829 */ 830struct io_kiocb { 831 union { 832 struct file *file; 833 struct io_rw rw; 834 struct io_poll_iocb poll; 835 struct io_poll_update poll_update; 836 struct io_accept accept; 837 struct io_sync sync; 838 struct io_cancel cancel; 839 struct io_timeout timeout; 840 struct io_timeout_rem timeout_rem; 841 struct io_connect connect; 842 struct io_sr_msg sr_msg; 843 struct io_open open; 844 struct io_close close; 845 struct io_rsrc_update rsrc_update; 846 struct io_fadvise fadvise; 847 struct io_madvise madvise; 848 struct io_epoll epoll; 849 struct io_splice splice; 850 struct io_provide_buf pbuf; 851 struct io_statx statx; 852 struct io_shutdown shutdown; 853 struct io_rename rename; 854 struct io_unlink unlink; 855 struct io_mkdir mkdir; 856 struct io_symlink symlink; 857 struct io_hardlink hardlink; 858 }; 859 860 u8 opcode; 861 /* polled IO has completed */ 862 u8 iopoll_completed; 863 u16 buf_index; 864 unsigned int flags; 865 866 u64 user_data; 867 u32 result; 868 u32 cflags; 869 870 struct io_ring_ctx *ctx; 871 struct task_struct *task; 872 873 struct percpu_ref *fixed_rsrc_refs; 874 /* store used ubuf, so we can prevent reloading */ 875 struct io_mapped_ubuf *imu; 876 877 /* used by request caches, completion batching and iopoll */ 878 struct io_wq_work_node comp_list; 879 atomic_t refs; 880 struct io_kiocb *link; 881 struct io_task_work io_task_work; 882 /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ 883 struct hlist_node hash_node; 884 /* internal polling, see IORING_FEAT_FAST_POLL */ 885 struct async_poll *apoll; 886 /* opcode allocated if it needs to store data for async defer */ 887 void *async_data; 888 struct io_wq_work work; 889 /* custom credentials, valid IFF REQ_F_CREDS is set */ 890 const struct cred *creds; 891 /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ 892 struct io_buffer *kbuf; 893 atomic_t poll_refs; 894}; 895 896struct io_tctx_node { 897 struct list_head ctx_node; 898 struct task_struct *task; 899 struct io_ring_ctx *ctx; 900}; 901 902struct io_defer_entry { 903 struct list_head list; 904 struct io_kiocb *req; 905 u32 seq; 906}; 907 908struct io_op_def { 909 /* needs req->file assigned */ 910 unsigned needs_file : 1; 911 /* should block plug */ 912 unsigned plug : 1; 913 /* hash wq insertion if file is a regular file */ 914 unsigned hash_reg_file : 1; 915 /* unbound wq insertion if file is a non-regular file */ 916 unsigned unbound_nonreg_file : 1; 917 /* set if opcode supports polled "wait" */ 918 unsigned pollin : 1; 919 unsigned pollout : 1; 920 /* op supports buffer selection */ 921 unsigned buffer_select : 1; 922 /* do prep async if is going to be punted */ 923 unsigned needs_async_setup : 1; 924 /* opcode is not supported by this kernel */ 925 unsigned not_supported : 1; 926 /* skip auditing */ 927 unsigned audit_skip : 1; 928 /* size of async data needed, if any */ 929 unsigned short async_size; 930}; 931 932static const struct io_op_def io_op_defs[] = { 933 [IORING_OP_NOP] = {}, 934 [IORING_OP_READV] = { 935 .needs_file = 1, 936 .unbound_nonreg_file = 1, 937 .pollin = 1, 938 .buffer_select = 1, 939 .needs_async_setup = 1, 940 .plug = 1, 941 .audit_skip = 1, 942 .async_size = sizeof(struct io_async_rw), 943 }, 944 [IORING_OP_WRITEV] = { 945 .needs_file = 1, 946 .hash_reg_file = 1, 947 .unbound_nonreg_file = 1, 948 .pollout = 1, 949 .needs_async_setup = 1, 950 .plug = 1, 951 .audit_skip = 1, 952 .async_size = sizeof(struct io_async_rw), 953 }, 954 [IORING_OP_FSYNC] = { 955 .needs_file = 1, 956 .audit_skip = 
1, 957 }, 958 [IORING_OP_READ_FIXED] = { 959 .needs_file = 1, 960 .unbound_nonreg_file = 1, 961 .pollin = 1, 962 .plug = 1, 963 .audit_skip = 1, 964 .async_size = sizeof(struct io_async_rw), 965 }, 966 [IORING_OP_WRITE_FIXED] = { 967 .needs_file = 1, 968 .hash_reg_file = 1, 969 .unbound_nonreg_file = 1, 970 .pollout = 1, 971 .plug = 1, 972 .audit_skip = 1, 973 .async_size = sizeof(struct io_async_rw), 974 }, 975 [IORING_OP_POLL_ADD] = { 976 .needs_file = 1, 977 .unbound_nonreg_file = 1, 978 .audit_skip = 1, 979 }, 980 [IORING_OP_POLL_REMOVE] = { 981 .audit_skip = 1, 982 }, 983 [IORING_OP_SYNC_FILE_RANGE] = { 984 .needs_file = 1, 985 .audit_skip = 1, 986 }, 987 [IORING_OP_SENDMSG] = { 988 .needs_file = 1, 989 .unbound_nonreg_file = 1, 990 .pollout = 1, 991 .needs_async_setup = 1, 992 .async_size = sizeof(struct io_async_msghdr), 993 }, 994 [IORING_OP_RECVMSG] = { 995 .needs_file = 1, 996 .unbound_nonreg_file = 1, 997 .pollin = 1, 998 .buffer_select = 1, 999 .needs_async_setup = 1, 1000 .async_size = sizeof(struct io_async_msghdr), 1001 }, 1002 [IORING_OP_TIMEOUT] = { 1003 .audit_skip = 1, 1004 .async_size = sizeof(struct io_timeout_data), 1005 }, 1006 [IORING_OP_TIMEOUT_REMOVE] = { 1007 /* used by timeout updates' prep() */ 1008 .audit_skip = 1, 1009 }, 1010 [IORING_OP_ACCEPT] = { 1011 .needs_file = 1, 1012 .unbound_nonreg_file = 1, 1013 .pollin = 1, 1014 }, 1015 [IORING_OP_ASYNC_CANCEL] = { 1016 .audit_skip = 1, 1017 }, 1018 [IORING_OP_LINK_TIMEOUT] = { 1019 .audit_skip = 1, 1020 .async_size = sizeof(struct io_timeout_data), 1021 }, 1022 [IORING_OP_CONNECT] = { 1023 .needs_file = 1, 1024 .unbound_nonreg_file = 1, 1025 .pollout = 1, 1026 .needs_async_setup = 1, 1027 .async_size = sizeof(struct io_async_connect), 1028 }, 1029 [IORING_OP_FALLOCATE] = { 1030 .needs_file = 1, 1031 }, 1032 [IORING_OP_OPENAT] = {}, 1033 [IORING_OP_CLOSE] = {}, 1034 [IORING_OP_FILES_UPDATE] = { 1035 .audit_skip = 1, 1036 }, 1037 [IORING_OP_STATX] = { 1038 .audit_skip = 1, 1039 }, 1040 [IORING_OP_READ] = { 1041 .needs_file = 1, 1042 .unbound_nonreg_file = 1, 1043 .pollin = 1, 1044 .buffer_select = 1, 1045 .plug = 1, 1046 .audit_skip = 1, 1047 .async_size = sizeof(struct io_async_rw), 1048 }, 1049 [IORING_OP_WRITE] = { 1050 .needs_file = 1, 1051 .hash_reg_file = 1, 1052 .unbound_nonreg_file = 1, 1053 .pollout = 1, 1054 .plug = 1, 1055 .audit_skip = 1, 1056 .async_size = sizeof(struct io_async_rw), 1057 }, 1058 [IORING_OP_FADVISE] = { 1059 .needs_file = 1, 1060 .audit_skip = 1, 1061 }, 1062 [IORING_OP_MADVISE] = {}, 1063 [IORING_OP_SEND] = { 1064 .needs_file = 1, 1065 .unbound_nonreg_file = 1, 1066 .pollout = 1, 1067 .audit_skip = 1, 1068 }, 1069 [IORING_OP_RECV] = { 1070 .needs_file = 1, 1071 .unbound_nonreg_file = 1, 1072 .pollin = 1, 1073 .buffer_select = 1, 1074 .audit_skip = 1, 1075 }, 1076 [IORING_OP_OPENAT2] = { 1077 }, 1078 [IORING_OP_EPOLL_CTL] = { 1079 .unbound_nonreg_file = 1, 1080 .audit_skip = 1, 1081 }, 1082 [IORING_OP_SPLICE] = { 1083 .needs_file = 1, 1084 .hash_reg_file = 1, 1085 .unbound_nonreg_file = 1, 1086 .audit_skip = 1, 1087 }, 1088 [IORING_OP_PROVIDE_BUFFERS] = { 1089 .audit_skip = 1, 1090 }, 1091 [IORING_OP_REMOVE_BUFFERS] = { 1092 .audit_skip = 1, 1093 }, 1094 [IORING_OP_TEE] = { 1095 .needs_file = 1, 1096 .hash_reg_file = 1, 1097 .unbound_nonreg_file = 1, 1098 .audit_skip = 1, 1099 }, 1100 [IORING_OP_SHUTDOWN] = { 1101 .needs_file = 1, 1102 }, 1103 [IORING_OP_RENAMEAT] = {}, 1104 [IORING_OP_UNLINKAT] = {}, 1105 [IORING_OP_MKDIRAT] = {}, 1106 [IORING_OP_SYMLINKAT] = {}, 1107 
[IORING_OP_LINKAT] = {}, 1108}; 1109 1110/* requests with any of those set should undergo io_disarm_next() */ 1111#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 1112 1113static bool io_disarm_next(struct io_kiocb *req); 1114static void io_uring_del_tctx_node(unsigned long index); 1115static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 1116 struct task_struct *task, 1117 bool cancel_all); 1118static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 1119 1120static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags); 1121 1122static void io_put_req(struct io_kiocb *req); 1123static void io_put_req_deferred(struct io_kiocb *req); 1124static void io_dismantle_req(struct io_kiocb *req); 1125static void io_queue_linked_timeout(struct io_kiocb *req); 1126static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, 1127 struct io_uring_rsrc_update2 *up, 1128 unsigned nr_args); 1129static void io_clean_op(struct io_kiocb *req); 1130static struct file *io_file_get(struct io_ring_ctx *ctx, 1131 struct io_kiocb *req, int fd, bool fixed); 1132static void __io_queue_sqe(struct io_kiocb *req); 1133static void io_rsrc_put_work(struct work_struct *work); 1134 1135static void io_req_task_queue(struct io_kiocb *req); 1136static void __io_submit_flush_completions(struct io_ring_ctx *ctx); 1137static int io_req_prep_async(struct io_kiocb *req); 1138 1139static int io_install_fixed_file(struct io_kiocb *req, struct file *file, 1140 unsigned int issue_flags, u32 slot_index); 1141static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); 1142 1143static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); 1144 1145static struct kmem_cache *req_cachep; 1146 1147static const struct file_operations io_uring_fops; 1148 1149struct sock *io_uring_get_socket(struct file *file) 1150{ 1151#if defined(CONFIG_UNIX) 1152 if (file->f_op == &io_uring_fops) { 1153 struct io_ring_ctx *ctx = file->private_data; 1154 1155 return ctx->ring_sock->sk; 1156 } 1157#endif 1158 return NULL; 1159} 1160EXPORT_SYMBOL(io_uring_get_socket); 1161 1162static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) 1163{ 1164 if (!*locked) { 1165 mutex_lock(&ctx->uring_lock); 1166 *locked = true; 1167 } 1168} 1169 1170#define io_for_each_link(pos, head) \ 1171 for (pos = (head); pos; pos = pos->link) 1172 1173/* 1174 * Shamelessly stolen from the mm implementation of page reference checking, 1175 * see commit f958d7b528b1 for details. 
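 *
 * As a worked example of the check below: refs == 0 gives an unsigned
 * sum of 127 and the test is true; a wrapped/negative count such as
 * (unsigned int)-3 wraps to 124 and is also caught; a healthy refcount
 * of 1 gives 128 and passes.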
1176 */ 1177#define req_ref_zero_or_close_to_overflow(req) \ 1178 ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) 1179 1180static inline bool req_ref_inc_not_zero(struct io_kiocb *req) 1181{ 1182 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 1183 return atomic_inc_not_zero(&req->refs); 1184} 1185 1186static inline bool req_ref_put_and_test(struct io_kiocb *req) 1187{ 1188 if (likely(!(req->flags & REQ_F_REFCOUNT))) 1189 return true; 1190 1191 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 1192 return atomic_dec_and_test(&req->refs); 1193} 1194 1195static inline void req_ref_get(struct io_kiocb *req) 1196{ 1197 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 1198 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 1199 atomic_inc(&req->refs); 1200} 1201 1202static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) 1203{ 1204 if (!wq_list_empty(&ctx->submit_state.compl_reqs)) 1205 __io_submit_flush_completions(ctx); 1206} 1207 1208static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) 1209{ 1210 if (!(req->flags & REQ_F_REFCOUNT)) { 1211 req->flags |= REQ_F_REFCOUNT; 1212 atomic_set(&req->refs, nr); 1213 } 1214} 1215 1216static inline void io_req_set_refcount(struct io_kiocb *req) 1217{ 1218 __io_req_set_refcount(req, 1); 1219} 1220 1221#define IO_RSRC_REF_BATCH 100 1222 1223static inline void io_req_put_rsrc_locked(struct io_kiocb *req, 1224 struct io_ring_ctx *ctx) 1225 __must_hold(&ctx->uring_lock) 1226{ 1227 struct percpu_ref *ref = req->fixed_rsrc_refs; 1228 1229 if (ref) { 1230 if (ref == &ctx->rsrc_node->refs) 1231 ctx->rsrc_cached_refs++; 1232 else 1233 percpu_ref_put(ref); 1234 } 1235} 1236 1237static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) 1238{ 1239 if (req->fixed_rsrc_refs) 1240 percpu_ref_put(req->fixed_rsrc_refs); 1241} 1242 1243static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) 1244 __must_hold(&ctx->uring_lock) 1245{ 1246 if (ctx->rsrc_cached_refs) { 1247 percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs); 1248 ctx->rsrc_cached_refs = 0; 1249 } 1250} 1251 1252static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) 1253 __must_hold(&ctx->uring_lock) 1254{ 1255 ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; 1256 percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); 1257} 1258 1259static inline void io_req_set_rsrc_node(struct io_kiocb *req, 1260 struct io_ring_ctx *ctx) 1261{ 1262 if (!req->fixed_rsrc_refs) { 1263 req->fixed_rsrc_refs = &ctx->rsrc_node->refs; 1264 ctx->rsrc_cached_refs--; 1265 if (unlikely(ctx->rsrc_cached_refs < 0)) 1266 io_rsrc_refs_refill(ctx); 1267 } 1268} 1269 1270static unsigned int __io_put_kbuf(struct io_kiocb *req) 1271{ 1272 struct io_buffer *kbuf = req->kbuf; 1273 unsigned int cflags; 1274 1275 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; 1276 cflags |= IORING_CQE_F_BUFFER; 1277 req->flags &= ~REQ_F_BUFFER_SELECTED; 1278 kfree(kbuf); 1279 req->kbuf = NULL; 1280 return cflags; 1281} 1282 1283static inline unsigned int io_put_kbuf(struct io_kiocb *req) 1284{ 1285 if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) 1286 return 0; 1287 return __io_put_kbuf(req); 1288} 1289 1290static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) 1291{ 1292 bool got = percpu_ref_tryget(ref); 1293 1294 /* already at zero, wait for ->release() */ 1295 if (!got) 1296 wait_for_completion(compl); 1297 percpu_ref_resurrect(ref); 1298 if (got) 1299 percpu_ref_put(ref); 1300} 1301 1302static bool io_match_task(struct io_kiocb *head, struct 
task_struct *task, 1303 bool cancel_all) 1304 __must_hold(&req->ctx->timeout_lock) 1305{ 1306 struct io_kiocb *req; 1307 1308 if (task && head->task != task) 1309 return false; 1310 if (cancel_all) 1311 return true; 1312 1313 io_for_each_link(req, head) { 1314 if (req->flags & REQ_F_INFLIGHT) 1315 return true; 1316 } 1317 return false; 1318} 1319 1320static bool io_match_linked(struct io_kiocb *head) 1321{ 1322 struct io_kiocb *req; 1323 1324 io_for_each_link(req, head) { 1325 if (req->flags & REQ_F_INFLIGHT) 1326 return true; 1327 } 1328 return false; 1329} 1330 1331/* 1332 * As io_match_task() but protected against racing with linked timeouts. 1333 * User must not hold timeout_lock. 1334 */ 1335static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, 1336 bool cancel_all) 1337{ 1338 bool matched; 1339 1340 if (task && head->task != task) 1341 return false; 1342 if (cancel_all) 1343 return true; 1344 1345 if (head->flags & REQ_F_LINK_TIMEOUT) { 1346 struct io_ring_ctx *ctx = head->ctx; 1347 1348 /* protect against races with linked timeouts */ 1349 spin_lock_irq(&ctx->timeout_lock); 1350 matched = io_match_linked(head); 1351 spin_unlock_irq(&ctx->timeout_lock); 1352 } else { 1353 matched = io_match_linked(head); 1354 } 1355 return matched; 1356} 1357 1358static inline bool req_has_async_data(struct io_kiocb *req) 1359{ 1360 return req->flags & REQ_F_ASYNC_DATA; 1361} 1362 1363static inline void req_set_fail(struct io_kiocb *req) 1364{ 1365 req->flags |= REQ_F_FAIL; 1366 if (req->flags & REQ_F_CQE_SKIP) { 1367 req->flags &= ~REQ_F_CQE_SKIP; 1368 req->flags |= REQ_F_SKIP_LINK_CQES; 1369 } 1370} 1371 1372static inline void req_fail_link_node(struct io_kiocb *req, int res) 1373{ 1374 req_set_fail(req); 1375 req->result = res; 1376} 1377 1378static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) 1379{ 1380 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); 1381 1382 complete(&ctx->ref_comp); 1383} 1384 1385static inline bool io_is_timeout_noseq(struct io_kiocb *req) 1386{ 1387 return !req->timeout.off; 1388} 1389 1390static __cold void io_fallback_req_func(struct work_struct *work) 1391{ 1392 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, 1393 fallback_work.work); 1394 struct llist_node *node = llist_del_all(&ctx->fallback_llist); 1395 struct io_kiocb *req, *tmp; 1396 bool locked = false; 1397 1398 percpu_ref_get(&ctx->refs); 1399 llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) 1400 req->io_task_work.func(req, &locked); 1401 1402 if (locked) { 1403 io_submit_flush_completions(ctx); 1404 mutex_unlock(&ctx->uring_lock); 1405 } 1406 percpu_ref_put(&ctx->refs); 1407} 1408 1409static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) 1410{ 1411 struct io_ring_ctx *ctx; 1412 int hash_bits; 1413 1414 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1415 if (!ctx) 1416 return NULL; 1417 1418 /* 1419 * Use 5 bits less than the max cq entries, that should give us around 1420 * 32 entries per hash list if totally full and uniformly spread. 
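 *
 * Worked example: with p->cq_entries == 4096, ilog2() returns 12, so
 * hash_bits ends up as 7, i.e. 128 hash buckets and 4096 / 128 == 32
 * entries per bucket when the table is completely full.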
1421 */ 1422 hash_bits = ilog2(p->cq_entries); 1423 hash_bits -= 5; 1424 if (hash_bits <= 0) 1425 hash_bits = 1; 1426 ctx->cancel_hash_bits = hash_bits; 1427 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), 1428 GFP_KERNEL); 1429 if (!ctx->cancel_hash) 1430 goto err; 1431 __hash_init(ctx->cancel_hash, 1U << hash_bits); 1432 1433 ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); 1434 if (!ctx->dummy_ubuf) 1435 goto err; 1436 /* set invalid range, so io_import_fixed() fails meeting it */ 1437 ctx->dummy_ubuf->ubuf = -1UL; 1438 1439 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 1440 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 1441 goto err; 1442 1443 ctx->flags = p->flags; 1444 init_waitqueue_head(&ctx->sqo_sq_wait); 1445 INIT_LIST_HEAD(&ctx->sqd_list); 1446 INIT_LIST_HEAD(&ctx->cq_overflow_list); 1447 init_completion(&ctx->ref_comp); 1448 xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); 1449 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); 1450 mutex_init(&ctx->uring_lock); 1451 init_waitqueue_head(&ctx->cq_wait); 1452 spin_lock_init(&ctx->completion_lock); 1453 spin_lock_init(&ctx->timeout_lock); 1454 INIT_WQ_LIST(&ctx->iopoll_list); 1455 INIT_LIST_HEAD(&ctx->defer_list); 1456 INIT_LIST_HEAD(&ctx->timeout_list); 1457 INIT_LIST_HEAD(&ctx->ltimeout_list); 1458 spin_lock_init(&ctx->rsrc_ref_lock); 1459 INIT_LIST_HEAD(&ctx->rsrc_ref_list); 1460 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); 1461 init_llist_head(&ctx->rsrc_put_llist); 1462 INIT_LIST_HEAD(&ctx->tctx_list); 1463 ctx->submit_state.free_list.next = NULL; 1464 INIT_WQ_LIST(&ctx->locked_free_list); 1465 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); 1466 INIT_WQ_LIST(&ctx->submit_state.compl_reqs); 1467 return ctx; 1468err: 1469 kfree(ctx->dummy_ubuf); 1470 kfree(ctx->cancel_hash); 1471 kfree(ctx); 1472 return NULL; 1473} 1474 1475static void io_account_cq_overflow(struct io_ring_ctx *ctx) 1476{ 1477 struct io_rings *r = ctx->rings; 1478 1479 WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); 1480 ctx->cq_extra--; 1481} 1482 1483static bool req_need_defer(struct io_kiocb *req, u32 seq) 1484{ 1485 if (unlikely(req->flags & REQ_F_IO_DRAIN)) { 1486 struct io_ring_ctx *ctx = req->ctx; 1487 1488 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; 1489 } 1490 1491 return false; 1492} 1493 1494#define FFS_NOWAIT 0x1UL 1495#define FFS_ISREG 0x2UL 1496#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) 1497 1498static inline bool io_req_ffs_set(struct io_kiocb *req) 1499{ 1500 return req->flags & REQ_F_FIXED_FILE; 1501} 1502 1503static inline void io_req_track_inflight(struct io_kiocb *req) 1504{ 1505 if (!(req->flags & REQ_F_INFLIGHT)) { 1506 req->flags |= REQ_F_INFLIGHT; 1507 atomic_inc(&current->io_uring->inflight_tracked); 1508 } 1509} 1510 1511static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) 1512{ 1513 if (WARN_ON_ONCE(!req->link)) 1514 return NULL; 1515 1516 req->flags &= ~REQ_F_ARM_LTIMEOUT; 1517 req->flags |= REQ_F_LINK_TIMEOUT; 1518 1519 /* linked timeouts should have two refs once prep'ed */ 1520 io_req_set_refcount(req); 1521 __io_req_set_refcount(req->link, 2); 1522 return req->link; 1523} 1524 1525static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) 1526{ 1527 if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) 1528 return NULL; 1529 return __io_prep_linked_timeout(req); 1530} 1531 1532static void io_prep_async_work(struct io_kiocb *req) 1533{ 1534 const struct io_op_def *def = &io_op_defs[req->opcode]; 1535 
struct io_ring_ctx *ctx = req->ctx; 1536 1537 if (!(req->flags & REQ_F_CREDS)) { 1538 req->flags |= REQ_F_CREDS; 1539 req->creds = get_current_cred(); 1540 } 1541 1542 req->work.list.next = NULL; 1543 req->work.flags = 0; 1544 if (req->flags & REQ_F_FORCE_ASYNC) 1545 req->work.flags |= IO_WQ_WORK_CONCURRENT; 1546 1547 if (req->flags & REQ_F_ISREG) { 1548 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) 1549 io_wq_hash_work(&req->work, file_inode(req->file)); 1550 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { 1551 if (def->unbound_nonreg_file) 1552 req->work.flags |= IO_WQ_WORK_UNBOUND; 1553 } 1554 1555 switch (req->opcode) { 1556 case IORING_OP_SPLICE: 1557 case IORING_OP_TEE: 1558 if (!S_ISREG(file_inode(req->splice.file_in)->i_mode)) 1559 req->work.flags |= IO_WQ_WORK_UNBOUND; 1560 break; 1561 } 1562} 1563 1564static void io_prep_async_link(struct io_kiocb *req) 1565{ 1566 struct io_kiocb *cur; 1567 1568 if (req->flags & REQ_F_LINK_TIMEOUT) { 1569 struct io_ring_ctx *ctx = req->ctx; 1570 1571 spin_lock_irq(&ctx->timeout_lock); 1572 io_for_each_link(cur, req) 1573 io_prep_async_work(cur); 1574 spin_unlock_irq(&ctx->timeout_lock); 1575 } else { 1576 io_for_each_link(cur, req) 1577 io_prep_async_work(cur); 1578 } 1579} 1580 1581static inline void io_req_add_compl_list(struct io_kiocb *req) 1582{ 1583 struct io_ring_ctx *ctx = req->ctx; 1584 struct io_submit_state *state = &ctx->submit_state; 1585 1586 if (!(req->flags & REQ_F_CQE_SKIP)) 1587 ctx->submit_state.flush_cqes = true; 1588 wq_list_add_tail(&req->comp_list, &state->compl_reqs); 1589} 1590 1591static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) 1592{ 1593 struct io_ring_ctx *ctx = req->ctx; 1594 struct io_kiocb *link = io_prep_linked_timeout(req); 1595 struct io_uring_task *tctx = req->task->io_uring; 1596 1597 BUG_ON(!tctx); 1598 BUG_ON(!tctx->io_wq); 1599 1600 /* init ->work of the whole link before punting */ 1601 io_prep_async_link(req); 1602 1603 /* 1604 * Not expected to happen, but if we do have a bug where this _can_ 1605 * happen, catch it here and ensure the request is marked as 1606 * canceled. That will make io-wq go through the usual work cancel 1607 * procedure rather than attempt to run this request (or create a new 1608 * worker for it). 
1609 */ 1610 if (WARN_ON_ONCE(!same_thread_group(req->task, current))) 1611 req->work.flags |= IO_WQ_WORK_CANCEL; 1612 1613 trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, 1614 &req->work, req->flags); 1615 io_wq_enqueue(tctx->io_wq, &req->work); 1616 if (link) 1617 io_queue_linked_timeout(link); 1618} 1619 1620static void io_kill_timeout(struct io_kiocb *req, int status) 1621 __must_hold(&req->ctx->completion_lock) 1622 __must_hold(&req->ctx->timeout_lock) 1623{ 1624 struct io_timeout_data *io = req->async_data; 1625 1626 if (hrtimer_try_to_cancel(&io->timer) != -1) { 1627 if (status) 1628 req_set_fail(req); 1629 atomic_set(&req->ctx->cq_timeouts, 1630 atomic_read(&req->ctx->cq_timeouts) + 1); 1631 list_del_init(&req->timeout.list); 1632 io_fill_cqe_req(req, status, 0); 1633 io_put_req_deferred(req); 1634 } 1635} 1636 1637static __cold void io_queue_deferred(struct io_ring_ctx *ctx) 1638{ 1639 while (!list_empty(&ctx->defer_list)) { 1640 struct io_defer_entry *de = list_first_entry(&ctx->defer_list, 1641 struct io_defer_entry, list); 1642 1643 if (req_need_defer(de->req, de->seq)) 1644 break; 1645 list_del_init(&de->list); 1646 io_req_task_queue(de->req); 1647 kfree(de); 1648 } 1649} 1650 1651static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) 1652 __must_hold(&ctx->completion_lock) 1653{ 1654 u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 1655 1656 spin_lock_irq(&ctx->timeout_lock); 1657 while (!list_empty(&ctx->timeout_list)) { 1658 u32 events_needed, events_got; 1659 struct io_kiocb *req = list_first_entry(&ctx->timeout_list, 1660 struct io_kiocb, timeout.list); 1661 1662 if (io_is_timeout_noseq(req)) 1663 break; 1664 1665 /* 1666 * Since seq can easily wrap around over time, subtract 1667 * the last seq at which timeouts were flushed before comparing. 1668 * Assuming not more than 2^31-1 events have happened since, 1669 * these subtractions won't have wrapped, so we can check if 1670 * target is in [last_seq, current_seq] by comparing the two. 
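 *
 * Worked example with wrapped values: if cq_last_tm_flush == 0xfffffff0,
 * seq == 0x10 and target_seq == 0x5, then events_got == 0x20 and
 * events_needed == 0x15, so the timeout is correctly treated as expired
 * even though both raw sequence numbers have wrapped past zero.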
1671 */ 1672 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush; 1673 events_got = seq - ctx->cq_last_tm_flush; 1674 if (events_got < events_needed) 1675 break; 1676 1677 list_del_init(&req->timeout.list); 1678 io_kill_timeout(req, 0); 1679 } 1680 ctx->cq_last_tm_flush = seq; 1681 spin_unlock_irq(&ctx->timeout_lock); 1682} 1683 1684static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx) 1685{ 1686 if (ctx->off_timeout_used) 1687 io_flush_timeouts(ctx); 1688 if (ctx->drain_active) 1689 io_queue_deferred(ctx); 1690} 1691 1692static inline void io_commit_cqring(struct io_ring_ctx *ctx) 1693{ 1694 if (unlikely(ctx->off_timeout_used || ctx->drain_active)) 1695 __io_commit_cqring_flush(ctx); 1696 /* order cqe stores with ring update */ 1697 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); 1698} 1699 1700static inline bool io_sqring_full(struct io_ring_ctx *ctx) 1701{ 1702 struct io_rings *r = ctx->rings; 1703 1704 return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; 1705} 1706 1707static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) 1708{ 1709 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); 1710} 1711 1712static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) 1713{ 1714 struct io_rings *rings = ctx->rings; 1715 unsigned tail, mask = ctx->cq_entries - 1; 1716 1717 /* 1718 * writes to the cq entry need to come after reading head; the 1719 * control dependency is enough as we're using WRITE_ONCE to 1720 * fill the cq entry 1721 */ 1722 if (__io_cqring_events(ctx) == ctx->cq_entries) 1723 return NULL; 1724 1725 tail = ctx->cached_cq_tail++; 1726 return &rings->cqes[tail & mask]; 1727} 1728 1729static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) 1730{ 1731 if (likely(!ctx->cq_ev_fd)) 1732 return false; 1733 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) 1734 return false; 1735 return !ctx->eventfd_async || io_wq_current_is_worker(); 1736} 1737 1738/* 1739 * This should only get called when at least one event has been posted. 1740 * Some applications rely on the eventfd notification count only changing 1741 * IFF a new CQE has been added to the CQ ring. There's no depedency on 1742 * 1:1 relationship between how many times this function is called (and 1743 * hence the eventfd count) and number of CQEs posted to the CQ ring. 1744 */ 1745static void io_cqring_ev_posted(struct io_ring_ctx *ctx) 1746{ 1747 /* 1748 * wake_up_all() may seem excessive, but io_wake_function() and 1749 * io_should_wake() handle the termination of the loop and only 1750 * wake as many waiters as we need to. 
1751 */ 1752 if (wq_has_sleeper(&ctx->cq_wait)) 1753 wake_up_all(&ctx->cq_wait); 1754 if (io_should_trigger_evfd(ctx)) 1755 eventfd_signal(ctx->cq_ev_fd, 1); 1756} 1757 1758static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) 1759{ 1760 /* see waitqueue_active() comment */ 1761 smp_mb(); 1762 1763 if (ctx->flags & IORING_SETUP_SQPOLL) { 1764 if (waitqueue_active(&ctx->cq_wait)) 1765 wake_up_all(&ctx->cq_wait); 1766 } 1767 if (io_should_trigger_evfd(ctx)) 1768 eventfd_signal(ctx->cq_ev_fd, 1); 1769} 1770 1771/* Returns true if there are no backlogged entries after the flush */ 1772static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) 1773{ 1774 bool all_flushed, posted; 1775 1776 if (!force && __io_cqring_events(ctx) == ctx->cq_entries) 1777 return false; 1778 1779 posted = false; 1780 spin_lock(&ctx->completion_lock); 1781 while (!list_empty(&ctx->cq_overflow_list)) { 1782 struct io_uring_cqe *cqe = io_get_cqe(ctx); 1783 struct io_overflow_cqe *ocqe; 1784 1785 if (!cqe && !force) 1786 break; 1787 ocqe = list_first_entry(&ctx->cq_overflow_list, 1788 struct io_overflow_cqe, list); 1789 if (cqe) 1790 memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); 1791 else 1792 io_account_cq_overflow(ctx); 1793 1794 posted = true; 1795 list_del(&ocqe->list); 1796 kfree(ocqe); 1797 } 1798 1799 all_flushed = list_empty(&ctx->cq_overflow_list); 1800 if (all_flushed) { 1801 clear_bit(0, &ctx->check_cq_overflow); 1802 WRITE_ONCE(ctx->rings->sq_flags, 1803 ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); 1804 } 1805 1806 if (posted) 1807 io_commit_cqring(ctx); 1808 spin_unlock(&ctx->completion_lock); 1809 if (posted) 1810 io_cqring_ev_posted(ctx); 1811 return all_flushed; 1812} 1813 1814static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) 1815{ 1816 bool ret = true; 1817 1818 if (test_bit(0, &ctx->check_cq_overflow)) { 1819 /* iopoll syncs against uring_lock, not completion_lock */ 1820 if (ctx->flags & IORING_SETUP_IOPOLL) 1821 mutex_lock(&ctx->uring_lock); 1822 ret = __io_cqring_overflow_flush(ctx, false); 1823 if (ctx->flags & IORING_SETUP_IOPOLL) 1824 mutex_unlock(&ctx->uring_lock); 1825 } 1826 1827 return ret; 1828} 1829 1830/* must to be called somewhat shortly after putting a request */ 1831static inline void io_put_task(struct task_struct *task, int nr) 1832{ 1833 struct io_uring_task *tctx = task->io_uring; 1834 1835 if (likely(task == current)) { 1836 tctx->cached_refs += nr; 1837 } else { 1838 percpu_counter_sub(&tctx->inflight, nr); 1839 if (unlikely(atomic_read(&tctx->in_idle))) 1840 wake_up(&tctx->wait); 1841 put_task_struct_many(task, nr); 1842 } 1843} 1844 1845static void io_task_refs_refill(struct io_uring_task *tctx) 1846{ 1847 unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; 1848 1849 percpu_counter_add(&tctx->inflight, refill); 1850 refcount_add(refill, &current->usage); 1851 tctx->cached_refs += refill; 1852} 1853 1854static inline void io_get_task_refs(int nr) 1855{ 1856 struct io_uring_task *tctx = current->io_uring; 1857 1858 tctx->cached_refs -= nr; 1859 if (unlikely(tctx->cached_refs < 0)) 1860 io_task_refs_refill(tctx); 1861} 1862 1863static __cold void io_uring_drop_tctx_refs(struct task_struct *task) 1864{ 1865 struct io_uring_task *tctx = task->io_uring; 1866 unsigned int refs = tctx->cached_refs; 1867 1868 if (refs) { 1869 tctx->cached_refs = 0; 1870 percpu_counter_sub(&tctx->inflight, refs); 1871 put_task_struct_many(task, refs); 1872 } 1873} 1874 1875static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, 1876 
s32 res, u32 cflags) 1877{ 1878 struct io_overflow_cqe *ocqe; 1879 1880 ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); 1881 if (!ocqe) { 1882 /* 1883 * If we're in ring overflow flush mode, or in task cancel mode, 1884 * or cannot allocate an overflow entry, then we need to drop it 1885 * on the floor. 1886 */ 1887 io_account_cq_overflow(ctx); 1888 return false; 1889 } 1890 if (list_empty(&ctx->cq_overflow_list)) { 1891 set_bit(0, &ctx->check_cq_overflow); 1892 WRITE_ONCE(ctx->rings->sq_flags, 1893 ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); 1894 1895 } 1896 ocqe->cqe.user_data = user_data; 1897 ocqe->cqe.res = res; 1898 ocqe->cqe.flags = cflags; 1899 list_add_tail(&ocqe->list, &ctx->cq_overflow_list); 1900 return true; 1901} 1902 1903static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, 1904 s32 res, u32 cflags) 1905{ 1906 struct io_uring_cqe *cqe; 1907 1908 trace_io_uring_complete(ctx, user_data, res, cflags); 1909 1910 /* 1911 * If we can't get a cq entry, userspace overflowed the 1912 * submission (by quite a lot). Increment the overflow count in 1913 * the ring. 1914 */ 1915 cqe = io_get_cqe(ctx); 1916 if (likely(cqe)) { 1917 WRITE_ONCE(cqe->user_data, user_data); 1918 WRITE_ONCE(cqe->res, res); 1919 WRITE_ONCE(cqe->flags, cflags); 1920 return true; 1921 } 1922 return io_cqring_event_overflow(ctx, user_data, res, cflags); 1923} 1924 1925static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) 1926{ 1927 if (!(req->flags & REQ_F_CQE_SKIP)) 1928 __io_fill_cqe(req->ctx, req->user_data, res, cflags); 1929} 1930 1931static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, 1932 s32 res, u32 cflags) 1933{ 1934 ctx->cq_extra++; 1935 return __io_fill_cqe(ctx, user_data, res, cflags); 1936} 1937 1938static void __io_req_complete_post(struct io_kiocb *req, s32 res, 1939 u32 cflags) 1940{ 1941 struct io_ring_ctx *ctx = req->ctx; 1942 1943 if (!(req->flags & REQ_F_CQE_SKIP)) 1944 __io_fill_cqe(ctx, req->user_data, res, cflags); 1945 /* 1946 * If we're the last reference to this request, add to our locked 1947 * free_list cache. 
1948 */ 1949 if (req_ref_put_and_test(req)) { 1950 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 1951 if (req->flags & IO_DISARM_MASK) 1952 io_disarm_next(req); 1953 if (req->link) { 1954 io_req_task_queue(req->link); 1955 req->link = NULL; 1956 } 1957 } 1958 io_req_put_rsrc(req, ctx); 1959 io_dismantle_req(req); 1960 io_put_task(req->task, 1); 1961 wq_list_add_head(&req->comp_list, &ctx->locked_free_list); 1962 ctx->locked_free_nr++; 1963 } 1964} 1965 1966static void io_req_complete_post(struct io_kiocb *req, s32 res, 1967 u32 cflags) 1968{ 1969 struct io_ring_ctx *ctx = req->ctx; 1970 1971 spin_lock(&ctx->completion_lock); 1972 __io_req_complete_post(req, res, cflags); 1973 io_commit_cqring(ctx); 1974 spin_unlock(&ctx->completion_lock); 1975 io_cqring_ev_posted(ctx); 1976} 1977 1978static inline void io_req_complete_state(struct io_kiocb *req, s32 res, 1979 u32 cflags) 1980{ 1981 req->result = res; 1982 req->cflags = cflags; 1983 req->flags |= REQ_F_COMPLETE_INLINE; 1984} 1985 1986static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, 1987 s32 res, u32 cflags) 1988{ 1989 if (issue_flags & IO_URING_F_COMPLETE_DEFER) 1990 io_req_complete_state(req, res, cflags); 1991 else 1992 io_req_complete_post(req, res, cflags); 1993} 1994 1995static inline void io_req_complete(struct io_kiocb *req, s32 res) 1996{ 1997 __io_req_complete(req, 0, res, 0); 1998} 1999 2000static void io_req_complete_failed(struct io_kiocb *req, s32 res) 2001{ 2002 req_set_fail(req); 2003 io_req_complete_post(req, res, 0); 2004} 2005 2006static void io_req_complete_fail_submit(struct io_kiocb *req) 2007{ 2008 /* 2009 * We don't submit, fail them all, for that replace hardlinks with 2010 * normal links. Extra REQ_F_LINK is tolerated. 2011 */ 2012 req->flags &= ~REQ_F_HARDLINK; 2013 req->flags |= REQ_F_LINK; 2014 io_req_complete_failed(req, req->result); 2015} 2016 2017/* 2018 * Don't initialise the fields below on every allocation, but do that in 2019 * advance and keep them valid across allocations. 2020 */ 2021static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) 2022{ 2023 req->ctx = ctx; 2024 req->link = NULL; 2025 req->async_data = NULL; 2026 /* not necessary, but safer to zero */ 2027 req->result = 0; 2028} 2029 2030static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, 2031 struct io_submit_state *state) 2032{ 2033 spin_lock(&ctx->completion_lock); 2034 wq_list_splice(&ctx->locked_free_list, &state->free_list); 2035 ctx->locked_free_nr = 0; 2036 spin_unlock(&ctx->completion_lock); 2037} 2038 2039/* Returns true IFF there are requests in the cache */ 2040static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) 2041{ 2042 struct io_submit_state *state = &ctx->submit_state; 2043 2044 /* 2045 * If we have more than a batch's worth of requests in our IRQ side 2046 * locked cache, grab the lock and move them over to our submission 2047 * side cache. 2048 */ 2049 if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) 2050 io_flush_cached_locked_reqs(ctx, state); 2051 return !!state->free_list.next; 2052} 2053 2054/* 2055 * A request might get retired back into the request caches even before opcode 2056 * handlers and io_issue_sqe() are done with it, e.g. inline completion path. 2057 * Because of that, io_alloc_req() should be called only under ->uring_lock 2058 * and with extra caution to not get a request that is still worked on. 
2059 */ 2060static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) 2061 __must_hold(&ctx->uring_lock) 2062{ 2063 struct io_submit_state *state = &ctx->submit_state; 2064 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 2065 void *reqs[IO_REQ_ALLOC_BATCH]; 2066 struct io_kiocb *req; 2067 int ret, i; 2068 2069 if (likely(state->free_list.next || io_flush_cached_reqs(ctx))) 2070 return true; 2071 2072 ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); 2073 2074 /* 2075 * Bulk alloc is all-or-nothing. If we fail to get a batch, 2076 * retry single alloc to be on the safe side. 2077 */ 2078 if (unlikely(ret <= 0)) { 2079 reqs[0] = kmem_cache_alloc(req_cachep, gfp); 2080 if (!reqs[0]) 2081 return false; 2082 ret = 1; 2083 } 2084 2085 percpu_ref_get_many(&ctx->refs, ret); 2086 for (i = 0; i < ret; i++) { 2087 req = reqs[i]; 2088 2089 io_preinit_req(req, ctx); 2090 wq_stack_add_head(&req->comp_list, &state->free_list); 2091 } 2092 return true; 2093} 2094 2095static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) 2096{ 2097 if (unlikely(!ctx->submit_state.free_list.next)) 2098 return __io_alloc_req_refill(ctx); 2099 return true; 2100} 2101 2102static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) 2103{ 2104 struct io_wq_work_node *node; 2105 2106 node = wq_stack_extract(&ctx->submit_state.free_list); 2107 return container_of(node, struct io_kiocb, comp_list); 2108} 2109 2110static inline void io_put_file(struct file *file) 2111{ 2112 if (file) 2113 fput(file); 2114} 2115 2116static inline void io_dismantle_req(struct io_kiocb *req) 2117{ 2118 unsigned int flags = req->flags; 2119 2120 if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) 2121 io_clean_op(req); 2122 if (!(flags & REQ_F_FIXED_FILE)) 2123 io_put_file(req->file); 2124} 2125 2126static __cold void __io_free_req(struct io_kiocb *req) 2127{ 2128 struct io_ring_ctx *ctx = req->ctx; 2129 2130 io_req_put_rsrc(req, ctx); 2131 io_dismantle_req(req); 2132 io_put_task(req->task, 1); 2133 2134 spin_lock(&ctx->completion_lock); 2135 wq_list_add_head(&req->comp_list, &ctx->locked_free_list); 2136 ctx->locked_free_nr++; 2137 spin_unlock(&ctx->completion_lock); 2138} 2139 2140static inline void io_remove_next_linked(struct io_kiocb *req) 2141{ 2142 struct io_kiocb *nxt = req->link; 2143 2144 req->link = nxt->link; 2145 nxt->link = NULL; 2146} 2147 2148static bool io_kill_linked_timeout(struct io_kiocb *req) 2149 __must_hold(&req->ctx->completion_lock) 2150 __must_hold(&req->ctx->timeout_lock) 2151{ 2152 struct io_kiocb *link = req->link; 2153 2154 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { 2155 struct io_timeout_data *io = link->async_data; 2156 2157 io_remove_next_linked(req); 2158 link->timeout.head = NULL; 2159 if (hrtimer_try_to_cancel(&io->timer) != -1) { 2160 list_del(&link->timeout.list); 2161 /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ 2162 io_fill_cqe_req(link, -ECANCELED, 0); 2163 io_put_req_deferred(link); 2164 return true; 2165 } 2166 } 2167 return false; 2168} 2169 2170static void io_fail_links(struct io_kiocb *req) 2171 __must_hold(&req->ctx->completion_lock) 2172{ 2173 struct io_kiocb *nxt, *link = req->link; 2174 bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; 2175 2176 req->link = NULL; 2177 while (link) { 2178 long res = -ECANCELED; 2179 2180 if (link->flags & REQ_F_FAIL) 2181 res = link->result; 2182 2183 nxt = link->link; 2184 link->link = NULL; 2185 2186 trace_io_uring_fail_link(req, link); 2187 if (!ignore_cqes) { 2188 link->flags &= ~REQ_F_CQE_SKIP; 2189 io_fill_cqe_req(link, 
res, 0); 2190 } 2191 io_put_req_deferred(link); 2192 link = nxt; 2193 } 2194} 2195 2196static bool io_disarm_next(struct io_kiocb *req) 2197 __must_hold(&req->ctx->completion_lock) 2198{ 2199 bool posted = false; 2200 2201 if (req->flags & REQ_F_ARM_LTIMEOUT) { 2202 struct io_kiocb *link = req->link; 2203 2204 req->flags &= ~REQ_F_ARM_LTIMEOUT; 2205 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { 2206 io_remove_next_linked(req); 2207 /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ 2208 io_fill_cqe_req(link, -ECANCELED, 0); 2209 io_put_req_deferred(link); 2210 posted = true; 2211 } 2212 } else if (req->flags & REQ_F_LINK_TIMEOUT) { 2213 struct io_ring_ctx *ctx = req->ctx; 2214 2215 spin_lock_irq(&ctx->timeout_lock); 2216 posted = io_kill_linked_timeout(req); 2217 spin_unlock_irq(&ctx->timeout_lock); 2218 } 2219 if (unlikely((req->flags & REQ_F_FAIL) && 2220 !(req->flags & REQ_F_HARDLINK))) { 2221 posted |= (req->link != NULL); 2222 io_fail_links(req); 2223 } 2224 return posted; 2225} 2226 2227static void __io_req_find_next_prep(struct io_kiocb *req) 2228{ 2229 struct io_ring_ctx *ctx = req->ctx; 2230 bool posted; 2231 2232 spin_lock(&ctx->completion_lock); 2233 posted = io_disarm_next(req); 2234 if (posted) 2235 io_commit_cqring(ctx); 2236 spin_unlock(&ctx->completion_lock); 2237 if (posted) 2238 io_cqring_ev_posted(ctx); 2239} 2240 2241static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) 2242{ 2243 struct io_kiocb *nxt; 2244 2245 if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) 2246 return NULL; 2247 /* 2248 * If LINK is set, we have dependent requests in this chain. If we 2249 * didn't fail this request, queue the first one up, moving any other 2250 * dependencies to the next request. In case of failure, fail the rest 2251 * of the chain. 
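 * Illustrative, liburing-style userspace sketch (not part of this file) of
 * what "fail the rest of the chain" means for an IOSQE_IO_LINK pair: if the
 * first linked SQE fails, its dependent completes with -ECANCELED, which is
 * what io_fail_links() above posts. This assumes a ring already set up with
 * io_uring_queue_init(); bad_fd, out_fd and buf are placeholders.
 *
 *	struct io_uring_sqe *sqe;
 *	struct io_uring_cqe *cqe;
 *
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_read(sqe, bad_fd, buf, sizeof(buf), 0);
 *	sqe->flags |= IOSQE_IO_LINK;	(the next SQE depends on this one)
 *	sqe->user_data = 1;
 *
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_write(sqe, out_fd, buf, sizeof(buf), 0);
 *	sqe->user_data = 2;
 *
 *	io_uring_submit(&ring);
 *	for (int i = 0; i < 2; i++) {
 *		io_uring_wait_cqe(&ring, &cqe);
 *		(user_data 1 completes with -EBADF, user_data 2 with -ECANCELED)
 *		io_uring_cqe_seen(&ring, cqe);
 *	}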
2252 */ 2253 if (unlikely(req->flags & IO_DISARM_MASK)) 2254 __io_req_find_next_prep(req); 2255 nxt = req->link; 2256 req->link = NULL; 2257 return nxt; 2258} 2259 2260static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) 2261{ 2262 if (!ctx) 2263 return; 2264 if (*locked) { 2265 io_submit_flush_completions(ctx); 2266 mutex_unlock(&ctx->uring_lock); 2267 *locked = false; 2268 } 2269 percpu_ref_put(&ctx->refs); 2270} 2271 2272static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx) 2273{ 2274 io_commit_cqring(ctx); 2275 spin_unlock(&ctx->completion_lock); 2276 io_cqring_ev_posted(ctx); 2277} 2278 2279static void handle_prev_tw_list(struct io_wq_work_node *node, 2280 struct io_ring_ctx **ctx, bool *uring_locked) 2281{ 2282 if (*ctx && !*uring_locked) 2283 spin_lock(&(*ctx)->completion_lock); 2284 2285 do { 2286 struct io_wq_work_node *next = node->next; 2287 struct io_kiocb *req = container_of(node, struct io_kiocb, 2288 io_task_work.node); 2289 2290 if (req->ctx != *ctx) { 2291 if (unlikely(!*uring_locked && *ctx)) 2292 ctx_commit_and_unlock(*ctx); 2293 2294 ctx_flush_and_put(*ctx, uring_locked); 2295 *ctx = req->ctx; 2296 /* if not contended, grab and improve batching */ 2297 *uring_locked = mutex_trylock(&(*ctx)->uring_lock); 2298 percpu_ref_get(&(*ctx)->refs); 2299 if (unlikely(!*uring_locked)) 2300 spin_lock(&(*ctx)->completion_lock); 2301 } 2302 if (likely(*uring_locked)) 2303 req->io_task_work.func(req, uring_locked); 2304 else 2305 __io_req_complete_post(req, req->result, io_put_kbuf(req)); 2306 node = next; 2307 } while (node); 2308 2309 if (unlikely(!*uring_locked)) 2310 ctx_commit_and_unlock(*ctx); 2311} 2312 2313static void handle_tw_list(struct io_wq_work_node *node, 2314 struct io_ring_ctx **ctx, bool *locked) 2315{ 2316 do { 2317 struct io_wq_work_node *next = node->next; 2318 struct io_kiocb *req = container_of(node, struct io_kiocb, 2319 io_task_work.node); 2320 2321 if (req->ctx != *ctx) { 2322 ctx_flush_and_put(*ctx, locked); 2323 *ctx = req->ctx; 2324 /* if not contended, grab and improve batching */ 2325 *locked = mutex_trylock(&(*ctx)->uring_lock); 2326 percpu_ref_get(&(*ctx)->refs); 2327 } 2328 req->io_task_work.func(req, locked); 2329 node = next; 2330 } while (node); 2331} 2332 2333static void tctx_task_work(struct callback_head *cb) 2334{ 2335 bool uring_locked = false; 2336 struct io_ring_ctx *ctx = NULL; 2337 struct io_uring_task *tctx = container_of(cb, struct io_uring_task, 2338 task_work); 2339 2340 while (1) { 2341 struct io_wq_work_node *node1, *node2; 2342 2343 if (!tctx->task_list.first && 2344 !tctx->prior_task_list.first && uring_locked) 2345 io_submit_flush_completions(ctx); 2346 2347 spin_lock_irq(&tctx->task_lock); 2348 node1 = tctx->prior_task_list.first; 2349 node2 = tctx->task_list.first; 2350 INIT_WQ_LIST(&tctx->task_list); 2351 INIT_WQ_LIST(&tctx->prior_task_list); 2352 if (!node2 && !node1) 2353 tctx->task_running = false; 2354 spin_unlock_irq(&tctx->task_lock); 2355 if (!node2 && !node1) 2356 break; 2357 2358 if (node1) 2359 handle_prev_tw_list(node1, &ctx, &uring_locked); 2360 2361 if (node2) 2362 handle_tw_list(node2, &ctx, &uring_locked); 2363 cond_resched(); 2364 } 2365 2366 ctx_flush_and_put(ctx, &uring_locked); 2367 2368 /* relaxed read is enough as only the task itself sets ->in_idle */ 2369 if (unlikely(atomic_read(&tctx->in_idle))) 2370 io_uring_drop_tctx_refs(current); 2371} 2372 2373static void io_req_task_work_add(struct io_kiocb *req, bool priority) 2374{ 2375 struct task_struct *tsk = req->task; 2376 struct 
io_uring_task *tctx = tsk->io_uring; 2377 enum task_work_notify_mode notify; 2378 struct io_wq_work_node *node; 2379 unsigned long flags; 2380 bool running; 2381 2382 WARN_ON_ONCE(!tctx); 2383 2384 spin_lock_irqsave(&tctx->task_lock, flags); 2385 if (priority) 2386 wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list); 2387 else 2388 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); 2389 running = tctx->task_running; 2390 if (!running) 2391 tctx->task_running = true; 2392 spin_unlock_irqrestore(&tctx->task_lock, flags); 2393 2394 /* task_work already pending, we're done */ 2395 if (running) 2396 return; 2397 2398 /* 2399 * SQPOLL kernel thread doesn't need notification, just a wakeup. For 2400 * all other cases, use TWA_SIGNAL unconditionally to ensure we're 2401 * processing task_work. There's no reliable way to tell if TWA_RESUME 2402 * will do the job. 2403 */ 2404 notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; 2405 if (likely(!task_work_add(tsk, &tctx->task_work, notify))) { 2406 if (notify == TWA_NONE) 2407 wake_up_process(tsk); 2408 return; 2409 } 2410 2411 spin_lock_irqsave(&tctx->task_lock, flags); 2412 tctx->task_running = false; 2413 node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list); 2414 spin_unlock_irqrestore(&tctx->task_lock, flags); 2415 2416 while (node) { 2417 req = container_of(node, struct io_kiocb, io_task_work.node); 2418 node = node->next; 2419 if (llist_add(&req->io_task_work.fallback_node, 2420 &req->ctx->fallback_llist)) 2421 schedule_delayed_work(&req->ctx->fallback_work, 1); 2422 } 2423} 2424 2425static void io_req_task_cancel(struct io_kiocb *req, bool *locked) 2426{ 2427 struct io_ring_ctx *ctx = req->ctx; 2428 2429 /* not needed for normal modes, but SQPOLL depends on it */ 2430 io_tw_lock(ctx, locked); 2431 io_req_complete_failed(req, req->result); 2432} 2433 2434static void io_req_task_submit(struct io_kiocb *req, bool *locked) 2435{ 2436 struct io_ring_ctx *ctx = req->ctx; 2437 2438 io_tw_lock(ctx, locked); 2439 /* req->task == current here, checking PF_EXITING is safe */ 2440 if (likely(!(req->task->flags & PF_EXITING))) 2441 __io_queue_sqe(req); 2442 else 2443 io_req_complete_failed(req, -EFAULT); 2444} 2445 2446static void io_req_task_queue_fail(struct io_kiocb *req, int ret) 2447{ 2448 req->result = ret; 2449 req->io_task_work.func = io_req_task_cancel; 2450 io_req_task_work_add(req, false); 2451} 2452 2453static void io_req_task_queue(struct io_kiocb *req) 2454{ 2455 req->io_task_work.func = io_req_task_submit; 2456 io_req_task_work_add(req, false); 2457} 2458 2459static void io_req_task_queue_reissue(struct io_kiocb *req) 2460{ 2461 req->io_task_work.func = io_queue_async_work; 2462 io_req_task_work_add(req, false); 2463} 2464 2465static inline void io_queue_next(struct io_kiocb *req) 2466{ 2467 struct io_kiocb *nxt = io_req_find_next(req); 2468 2469 if (nxt) 2470 io_req_task_queue(nxt); 2471} 2472 2473static void io_free_req(struct io_kiocb *req) 2474{ 2475 io_queue_next(req); 2476 __io_free_req(req); 2477} 2478 2479static void io_free_req_work(struct io_kiocb *req, bool *locked) 2480{ 2481 io_free_req(req); 2482} 2483 2484static void io_free_batch_list(struct io_ring_ctx *ctx, 2485 struct io_wq_work_node *node) 2486 __must_hold(&ctx->uring_lock) 2487{ 2488 struct task_struct *task = NULL; 2489 int task_refs = 0; 2490 2491 do { 2492 struct io_kiocb *req = container_of(node, struct io_kiocb, 2493 comp_list); 2494 2495 if (unlikely(req->flags & REQ_F_REFCOUNT)) { 2496 node = 
req->comp_list.next; 2497 if (!req_ref_put_and_test(req)) 2498 continue; 2499 } 2500 2501 io_req_put_rsrc_locked(req, ctx); 2502 io_queue_next(req); 2503 io_dismantle_req(req); 2504 2505 if (req->task != task) { 2506 if (task) 2507 io_put_task(task, task_refs); 2508 task = req->task; 2509 task_refs = 0; 2510 } 2511 task_refs++; 2512 node = req->comp_list.next; 2513 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 2514 } while (node); 2515 2516 if (task) 2517 io_put_task(task, task_refs); 2518} 2519 2520static void __io_submit_flush_completions(struct io_ring_ctx *ctx) 2521 __must_hold(&ctx->uring_lock) 2522{ 2523 struct io_wq_work_node *node, *prev; 2524 struct io_submit_state *state = &ctx->submit_state; 2525 2526 if (state->flush_cqes) { 2527 spin_lock(&ctx->completion_lock); 2528 wq_list_for_each(node, prev, &state->compl_reqs) { 2529 struct io_kiocb *req = container_of(node, struct io_kiocb, 2530 comp_list); 2531 2532 if (!(req->flags & REQ_F_CQE_SKIP)) 2533 __io_fill_cqe(ctx, req->user_data, req->result, 2534 req->cflags); 2535 } 2536 2537 io_commit_cqring(ctx); 2538 spin_unlock(&ctx->completion_lock); 2539 io_cqring_ev_posted(ctx); 2540 state->flush_cqes = false; 2541 } 2542 2543 io_free_batch_list(ctx, state->compl_reqs.first); 2544 INIT_WQ_LIST(&state->compl_reqs); 2545} 2546 2547/* 2548 * Drop reference to request, return next in chain (if there is one) if this 2549 * was the last reference to this request. 2550 */ 2551static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) 2552{ 2553 struct io_kiocb *nxt = NULL; 2554 2555 if (req_ref_put_and_test(req)) { 2556 nxt = io_req_find_next(req); 2557 __io_free_req(req); 2558 } 2559 return nxt; 2560} 2561 2562static inline void io_put_req(struct io_kiocb *req) 2563{ 2564 if (req_ref_put_and_test(req)) 2565 io_free_req(req); 2566} 2567 2568static inline void io_put_req_deferred(struct io_kiocb *req) 2569{ 2570 if (req_ref_put_and_test(req)) { 2571 req->io_task_work.func = io_free_req_work; 2572 io_req_task_work_add(req, false); 2573 } 2574} 2575 2576static unsigned io_cqring_events(struct io_ring_ctx *ctx) 2577{ 2578 /* See comment at the top of this file */ 2579 smp_rmb(); 2580 return __io_cqring_events(ctx); 2581} 2582 2583static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) 2584{ 2585 struct io_rings *rings = ctx->rings; 2586 2587 /* make sure SQ entry isn't read before tail */ 2588 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; 2589} 2590 2591static inline bool io_run_task_work(void) 2592{ 2593 if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) { 2594 __set_current_state(TASK_RUNNING); 2595 tracehook_notify_signal(); 2596 return true; 2597 } 2598 2599 return false; 2600} 2601 2602static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) 2603{ 2604 struct io_wq_work_node *pos, *start, *prev; 2605 unsigned int poll_flags = BLK_POLL_NOSLEEP; 2606 DEFINE_IO_COMP_BATCH(iob); 2607 int nr_events = 0; 2608 2609 /* 2610 * Only spin for completions if we don't have multiple devices hanging 2611 * off our complete list. 2612 */ 2613 if (ctx->poll_multi_queue || force_nonspin) 2614 poll_flags |= BLK_POLL_ONESHOT; 2615 2616 wq_list_for_each(pos, start, &ctx->iopoll_list) { 2617 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); 2618 struct kiocb *kiocb = &req->rw.kiocb; 2619 int ret; 2620 2621 /* 2622 * Move completed and retryable entries to our local lists. 
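 * (A request counts as completed here once io_complete_rw_iopoll() has set
 * ->iopoll_completed; see that helper further down.)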
2623 * If we find a request that requires polling, break out 2624 * and complete those lists first, if we have entries there. 2625 */ 2626 if (READ_ONCE(req->iopoll_completed)) 2627 break; 2628 2629 ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags); 2630 if (unlikely(ret < 0)) 2631 return ret; 2632 else if (ret) 2633 poll_flags |= BLK_POLL_ONESHOT; 2634 2635 /* iopoll may have completed current req */ 2636 if (!rq_list_empty(iob.req_list) || 2637 READ_ONCE(req->iopoll_completed)) 2638 break; 2639 } 2640 2641 if (!rq_list_empty(iob.req_list)) 2642 iob.complete(&iob); 2643 else if (!pos) 2644 return 0; 2645 2646 prev = start; 2647 wq_list_for_each_resume(pos, prev) { 2648 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); 2649 2650 /* order with io_complete_rw_iopoll(), e.g. ->result updates */ 2651 if (!smp_load_acquire(&req->iopoll_completed)) 2652 break; 2653 if (unlikely(req->flags & REQ_F_CQE_SKIP)) 2654 continue; 2655 2656 __io_fill_cqe(ctx, req->user_data, req->result, io_put_kbuf(req)); 2657 nr_events++; 2658 } 2659 2660 if (unlikely(!nr_events)) 2661 return 0; 2662 2663 io_commit_cqring(ctx); 2664 io_cqring_ev_posted_iopoll(ctx); 2665 pos = start ? start->next : ctx->iopoll_list.first; 2666 wq_list_cut(&ctx->iopoll_list, prev, start); 2667 io_free_batch_list(ctx, pos); 2668 return nr_events; 2669} 2670 2671/* 2672 * We can't just wait for polled events to come to us, we have to actively 2673 * find and complete them. 2674 */ 2675static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) 2676{ 2677 if (!(ctx->flags & IORING_SETUP_IOPOLL)) 2678 return; 2679 2680 mutex_lock(&ctx->uring_lock); 2681 while (!wq_list_empty(&ctx->iopoll_list)) { 2682 /* let it sleep and repeat later if can't complete a request */ 2683 if (io_do_iopoll(ctx, true) == 0) 2684 break; 2685 /* 2686 * Ensure we allow local-to-the-cpu processing to take place, 2687 * in this case we need to ensure that we reap all events. 2688 * Also let task_work, etc. to progress by releasing the mutex 2689 */ 2690 if (need_resched()) { 2691 mutex_unlock(&ctx->uring_lock); 2692 cond_resched(); 2693 mutex_lock(&ctx->uring_lock); 2694 } 2695 } 2696 mutex_unlock(&ctx->uring_lock); 2697} 2698 2699static int io_iopoll_check(struct io_ring_ctx *ctx, long min) 2700{ 2701 unsigned int nr_events = 0; 2702 int ret = 0; 2703 2704 /* 2705 * We disallow the app entering submit/complete with polling, but we 2706 * still need to lock the ring to prevent racing with polled issue 2707 * that got punted to a workqueue. 2708 */ 2709 mutex_lock(&ctx->uring_lock); 2710 /* 2711 * Don't enter poll loop if we already have events pending. 2712 * If we do, we can potentially be spinning for commands that 2713 * already triggered a CQE (eg in error). 2714 */ 2715 if (test_bit(0, &ctx->check_cq_overflow)) 2716 __io_cqring_overflow_flush(ctx, false); 2717 if (io_cqring_events(ctx)) 2718 goto out; 2719 do { 2720 /* 2721 * If a submit got punted to a workqueue, we can have the 2722 * application entering polling for a command before it gets 2723 * issued. That app will hold the uring_lock for the duration 2724 * of the poll right here, so we need to take a breather every 2725 * now and then to ensure that the issue has a chance to add 2726 * the poll to the issued list. Otherwise we can spin here 2727 * forever, while the workqueue is stuck trying to acquire the 2728 * very same mutex. 
2729 */ 2730 if (wq_list_empty(&ctx->iopoll_list)) { 2731 u32 tail = ctx->cached_cq_tail; 2732 2733 mutex_unlock(&ctx->uring_lock); 2734 io_run_task_work(); 2735 mutex_lock(&ctx->uring_lock); 2736 2737 /* some requests don't go through iopoll_list */ 2738 if (tail != ctx->cached_cq_tail || 2739 wq_list_empty(&ctx->iopoll_list)) 2740 break; 2741 } 2742 ret = io_do_iopoll(ctx, !min); 2743 if (ret < 0) 2744 break; 2745 nr_events += ret; 2746 ret = 0; 2747 } while (nr_events < min && !need_resched()); 2748out: 2749 mutex_unlock(&ctx->uring_lock); 2750 return ret; 2751} 2752 2753static void kiocb_end_write(struct io_kiocb *req) 2754{ 2755 /* 2756 * Tell lockdep we inherited freeze protection from submission 2757 * thread. 2758 */ 2759 if (req->flags & REQ_F_ISREG) { 2760 struct super_block *sb = file_inode(req->file)->i_sb; 2761 2762 __sb_writers_acquired(sb, SB_FREEZE_WRITE); 2763 sb_end_write(sb); 2764 } 2765} 2766 2767#ifdef CONFIG_BLOCK 2768static bool io_resubmit_prep(struct io_kiocb *req) 2769{ 2770 struct io_async_rw *rw = req->async_data; 2771 2772 if (!req_has_async_data(req)) 2773 return !io_req_prep_async(req); 2774 iov_iter_restore(&rw->s.iter, &rw->s.iter_state); 2775 return true; 2776} 2777 2778static bool io_rw_should_reissue(struct io_kiocb *req) 2779{ 2780 umode_t mode = file_inode(req->file)->i_mode; 2781 struct io_ring_ctx *ctx = req->ctx; 2782 2783 if (!S_ISBLK(mode) && !S_ISREG(mode)) 2784 return false; 2785 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && 2786 !(ctx->flags & IORING_SETUP_IOPOLL))) 2787 return false; 2788 /* 2789 * If ref is dying, we might be running poll reap from the exit work. 2790 * Don't attempt to reissue from that path, just let it fail with 2791 * -EAGAIN. 2792 */ 2793 if (percpu_ref_is_dying(&ctx->refs)) 2794 return false; 2795 /* 2796 * Play it safe and assume not safe to re-import and reissue if we're 2797 * not in the original thread group (or in task context). 
2798 */ 2799 if (!same_thread_group(req->task, current) || !in_task()) 2800 return false; 2801 return true; 2802} 2803#else 2804static bool io_resubmit_prep(struct io_kiocb *req) 2805{ 2806 return false; 2807} 2808static bool io_rw_should_reissue(struct io_kiocb *req) 2809{ 2810 return false; 2811} 2812#endif 2813 2814static bool __io_complete_rw_common(struct io_kiocb *req, long res) 2815{ 2816 if (req->rw.kiocb.ki_flags & IOCB_WRITE) 2817 kiocb_end_write(req); 2818 if (unlikely(res != req->result)) { 2819 if ((res == -EAGAIN || res == -EOPNOTSUPP) && 2820 io_rw_should_reissue(req)) { 2821 req->flags |= REQ_F_REISSUE; 2822 return true; 2823 } 2824 req_set_fail(req); 2825 req->result = res; 2826 } 2827 return false; 2828} 2829 2830static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) 2831{ 2832 unsigned int cflags = io_put_kbuf(req); 2833 int res = req->result; 2834 2835 if (*locked) { 2836 io_req_complete_state(req, res, cflags); 2837 io_req_add_compl_list(req); 2838 } else { 2839 io_req_complete_post(req, res, cflags); 2840 } 2841} 2842 2843static void __io_complete_rw(struct io_kiocb *req, long res, 2844 unsigned int issue_flags) 2845{ 2846 if (__io_complete_rw_common(req, res)) 2847 return; 2848 __io_req_complete(req, issue_flags, req->result, io_put_kbuf(req)); 2849} 2850 2851static void io_complete_rw(struct kiocb *kiocb, long res) 2852{ 2853 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2854 2855 if (__io_complete_rw_common(req, res)) 2856 return; 2857 req->result = res; 2858 req->io_task_work.func = io_req_task_complete; 2859 io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL)); 2860} 2861 2862static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) 2863{ 2864 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2865 2866 if (kiocb->ki_flags & IOCB_WRITE) 2867 kiocb_end_write(req); 2868 if (unlikely(res != req->result)) { 2869 if (res == -EAGAIN && io_rw_should_reissue(req)) { 2870 req->flags |= REQ_F_REISSUE; 2871 return; 2872 } 2873 req->result = res; 2874 } 2875 2876 /* order with io_iopoll_complete() checking ->iopoll_completed */ 2877 smp_store_release(&req->iopoll_completed, 1); 2878} 2879 2880/* 2881 * After the iocb has been issued, it's safe to be found on the poll list. 2882 * Adding the kiocb to the list AFTER submission ensures that we don't 2883 * find it from a io_do_iopoll() thread before the issuer is done 2884 * accessing the kiocb cookie. 2885 */ 2886static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) 2887{ 2888 struct io_ring_ctx *ctx = req->ctx; 2889 const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 2890 2891 /* workqueue context doesn't hold uring_lock, grab it now */ 2892 if (unlikely(needs_lock)) 2893 mutex_lock(&ctx->uring_lock); 2894 2895 /* 2896 * Track whether we have multiple files in our lists. This will impact 2897 * how we do polling eventually, not spinning if we're on potentially 2898 * different devices. 2899 */ 2900 if (wq_list_empty(&ctx->iopoll_list)) { 2901 ctx->poll_multi_queue = false; 2902 } else if (!ctx->poll_multi_queue) { 2903 struct io_kiocb *list_req; 2904 2905 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, 2906 comp_list); 2907 if (list_req->file != req->file) 2908 ctx->poll_multi_queue = true; 2909 } 2910 2911 /* 2912 * For fast devices, IO may have already completed. If it has, add 2913 * it to the front so we find it first. 
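 * (io_do_iopoll() walks ->iopoll_list from the head and stops issuing
 * ->iopoll() calls at the first entry whose ->iopoll_completed is already
 * set, so completed requests placed at the front get reaped straight away.)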
2914 */ 2915 if (READ_ONCE(req->iopoll_completed)) 2916 wq_list_add_head(&req->comp_list, &ctx->iopoll_list); 2917 else 2918 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); 2919 2920 if (unlikely(needs_lock)) { 2921 /* 2922 * If IORING_SETUP_SQPOLL is enabled, sqes are either handle 2923 * in sq thread task context or in io worker task context. If 2924 * current task context is sq thread, we don't need to check 2925 * whether should wake up sq thread. 2926 */ 2927 if ((ctx->flags & IORING_SETUP_SQPOLL) && 2928 wq_has_sleeper(&ctx->sq_data->wait)) 2929 wake_up(&ctx->sq_data->wait); 2930 2931 mutex_unlock(&ctx->uring_lock); 2932 } 2933} 2934 2935static bool io_bdev_nowait(struct block_device *bdev) 2936{ 2937 return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); 2938} 2939 2940/* 2941 * If we tracked the file through the SCM inflight mechanism, we could support 2942 * any file. For now, just ensure that anything potentially problematic is done 2943 * inline. 2944 */ 2945static bool __io_file_supports_nowait(struct file *file, umode_t mode) 2946{ 2947 if (S_ISBLK(mode)) { 2948 if (IS_ENABLED(CONFIG_BLOCK) && 2949 io_bdev_nowait(I_BDEV(file->f_mapping->host))) 2950 return true; 2951 return false; 2952 } 2953 if (S_ISSOCK(mode)) 2954 return true; 2955 if (S_ISREG(mode)) { 2956 if (IS_ENABLED(CONFIG_BLOCK) && 2957 io_bdev_nowait(file->f_inode->i_sb->s_bdev) && 2958 file->f_op != &io_uring_fops) 2959 return true; 2960 return false; 2961 } 2962 2963 /* any ->read/write should understand O_NONBLOCK */ 2964 if (file->f_flags & O_NONBLOCK) 2965 return true; 2966 return file->f_mode & FMODE_NOWAIT; 2967} 2968 2969/* 2970 * If we tracked the file through the SCM inflight mechanism, we could support 2971 * any file. For now, just ensure that anything potentially problematic is done 2972 * inline. 2973 */ 2974static unsigned int io_file_get_flags(struct file *file) 2975{ 2976 umode_t mode = file_inode(file)->i_mode; 2977 unsigned int res = 0; 2978 2979 if (S_ISREG(mode)) 2980 res |= FFS_ISREG; 2981 if (__io_file_supports_nowait(file, mode)) 2982 res |= FFS_NOWAIT; 2983 return res; 2984} 2985 2986static inline bool io_file_supports_nowait(struct io_kiocb *req) 2987{ 2988 return req->flags & REQ_F_SUPPORT_NOWAIT; 2989} 2990 2991static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2992{ 2993 struct io_ring_ctx *ctx = req->ctx; 2994 struct kiocb *kiocb = &req->rw.kiocb; 2995 struct file *file = req->file; 2996 unsigned ioprio; 2997 int ret; 2998 2999 if (!io_req_ffs_set(req)) 3000 req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; 3001 3002 kiocb->ki_pos = READ_ONCE(sqe->off); 3003 if (kiocb->ki_pos == -1) { 3004 if (!(file->f_mode & FMODE_STREAM)) { 3005 req->flags |= REQ_F_CUR_POS; 3006 kiocb->ki_pos = file->f_pos; 3007 } else { 3008 kiocb->ki_pos = 0; 3009 } 3010 } 3011 kiocb->ki_flags = iocb_flags(file); 3012 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); 3013 if (unlikely(ret)) 3014 return ret; 3015 3016 /* 3017 * If the file is marked O_NONBLOCK, still allow retry for it if it 3018 * supports async. Otherwise it's impossible to use O_NONBLOCK files 3019 * reliably. If not, or it IOCB_NOWAIT is set, don't retry. 
3020 */ 3021 if ((kiocb->ki_flags & IOCB_NOWAIT) || 3022 ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) 3023 req->flags |= REQ_F_NOWAIT; 3024 3025 if (ctx->flags & IORING_SETUP_IOPOLL) { 3026 if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) 3027 return -EOPNOTSUPP; 3028 3029 kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; 3030 kiocb->ki_complete = io_complete_rw_iopoll; 3031 req->iopoll_completed = 0; 3032 } else { 3033 if (kiocb->ki_flags & IOCB_HIPRI) 3034 return -EINVAL; 3035 kiocb->ki_complete = io_complete_rw; 3036 } 3037 3038 ioprio = READ_ONCE(sqe->ioprio); 3039 if (ioprio) { 3040 ret = ioprio_check_cap(ioprio); 3041 if (ret) 3042 return ret; 3043 3044 kiocb->ki_ioprio = ioprio; 3045 } else { 3046 kiocb->ki_ioprio = get_current_ioprio(); 3047 } 3048 3049 req->imu = NULL; 3050 req->rw.addr = READ_ONCE(sqe->addr); 3051 req->rw.len = READ_ONCE(sqe->len); 3052 req->buf_index = READ_ONCE(sqe->buf_index); 3053 return 0; 3054} 3055 3056static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) 3057{ 3058 switch (ret) { 3059 case -EIOCBQUEUED: 3060 break; 3061 case -ERESTARTSYS: 3062 case -ERESTARTNOINTR: 3063 case -ERESTARTNOHAND: 3064 case -ERESTART_RESTARTBLOCK: 3065 /* 3066 * We can't just restart the syscall, since previously 3067 * submitted sqes may already be in progress. Just fail this 3068 * IO with EINTR. 3069 */ 3070 ret = -EINTR; 3071 fallthrough; 3072 default: 3073 kiocb->ki_complete(kiocb, ret); 3074 } 3075} 3076 3077static void kiocb_done(struct io_kiocb *req, ssize_t ret, 3078 unsigned int issue_flags) 3079{ 3080 struct io_async_rw *io = req->async_data; 3081 3082 /* add previously done IO, if any */ 3083 if (req_has_async_data(req) && io->bytes_done > 0) { 3084 if (ret < 0) 3085 ret = io->bytes_done; 3086 else 3087 ret += io->bytes_done; 3088 } 3089 3090 if (req->flags & REQ_F_CUR_POS) 3091 req->file->f_pos = req->rw.kiocb.ki_pos; 3092 if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw)) 3093 __io_complete_rw(req, ret, issue_flags); 3094 else 3095 io_rw_done(&req->rw.kiocb, ret); 3096 3097 if (req->flags & REQ_F_REISSUE) { 3098 req->flags &= ~REQ_F_REISSUE; 3099 if (io_resubmit_prep(req)) { 3100 io_req_task_queue_reissue(req); 3101 } else { 3102 req_set_fail(req); 3103 req->result = ret; 3104 req->io_task_work.func = io_req_task_complete; 3105 io_req_task_work_add(req, false); 3106 } 3107 } 3108} 3109 3110static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, 3111 struct io_mapped_ubuf *imu) 3112{ 3113 size_t len = req->rw.len; 3114 u64 buf_end, buf_addr = req->rw.addr; 3115 size_t offset; 3116 3117 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) 3118 return -EFAULT; 3119 /* not inside the mapped region */ 3120 if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) 3121 return -EFAULT; 3122 3123 /* 3124 * May not be a start of buffer, set size appropriately 3125 * and advance us to the beginning. 3126 */ 3127 offset = buf_addr - imu->ubuf; 3128 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); 3129 3130 if (offset) { 3131 /* 3132 * Don't use iov_iter_advance() here, as it's really slow for 3133 * using the latter parts of a big fixed buffer - it iterates 3134 * over each segment manually. We can cheat a bit here, because 3135 * we know that: 3136 * 3137 * 1) it's a BVEC iter, we set it up 3138 * 2) all bvecs are PAGE_SIZE in size, except potentially the 3139 * first and last bvec 3140 * 3141 * So just find our index, and adjust the iterator afterwards. 
3142 * If the offset is within the first bvec (or the whole first 3143 * bvec, just use iov_iter_advance(). This makes it easier 3144 * since we can just skip the first segment, which may not 3145 * be PAGE_SIZE aligned. 3146 */ 3147 const struct bio_vec *bvec = imu->bvec; 3148 3149 if (offset <= bvec->bv_len) { 3150 iov_iter_advance(iter, offset); 3151 } else { 3152 unsigned long seg_skip; 3153 3154 /* skip first vec */ 3155 offset -= bvec->bv_len; 3156 seg_skip = 1 + (offset >> PAGE_SHIFT); 3157 3158 iter->bvec = bvec + seg_skip; 3159 iter->nr_segs -= seg_skip; 3160 iter->count -= bvec->bv_len + offset; 3161 iter->iov_offset = offset & ~PAGE_MASK; 3162 } 3163 } 3164 3165 return 0; 3166} 3167 3168static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) 3169{ 3170 struct io_mapped_ubuf *imu = req->imu; 3171 u16 index, buf_index = req->buf_index; 3172 3173 if (likely(!imu)) { 3174 struct io_ring_ctx *ctx = req->ctx; 3175 3176 if (unlikely(buf_index >= ctx->nr_user_bufs)) 3177 return -EFAULT; 3178 io_req_set_rsrc_node(req, ctx); 3179 index = array_index_nospec(buf_index, ctx->nr_user_bufs); 3180 imu = READ_ONCE(ctx->user_bufs[index]); 3181 req->imu = imu; 3182 } 3183 return __io_import_fixed(req, rw, iter, imu); 3184} 3185 3186static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) 3187{ 3188 if (needs_lock) 3189 mutex_unlock(&ctx->uring_lock); 3190} 3191 3192static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) 3193{ 3194 /* 3195 * "Normal" inline submissions always hold the uring_lock, since we 3196 * grab it from the system call. Same is true for the SQPOLL offload. 3197 * The only exception is when we've detached the request and issue it 3198 * from an async worker thread, grab the lock for that case. 
3199 */ 3200 if (needs_lock) 3201 mutex_lock(&ctx->uring_lock); 3202} 3203 3204static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, 3205 int bgid, unsigned int issue_flags) 3206{ 3207 struct io_buffer *kbuf = req->kbuf; 3208 struct io_buffer *head; 3209 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 3210 3211 if (req->flags & REQ_F_BUFFER_SELECTED) 3212 return kbuf; 3213 3214 io_ring_submit_lock(req->ctx, needs_lock); 3215 3216 lockdep_assert_held(&req->ctx->uring_lock); 3217 3218 head = xa_load(&req->ctx->io_buffers, bgid); 3219 if (head) { 3220 if (!list_empty(&head->list)) { 3221 kbuf = list_last_entry(&head->list, struct io_buffer, 3222 list); 3223 list_del(&kbuf->list); 3224 } else { 3225 kbuf = head; 3226 xa_erase(&req->ctx->io_buffers, bgid); 3227 } 3228 if (*len > kbuf->len) 3229 *len = kbuf->len; 3230 req->flags |= REQ_F_BUFFER_SELECTED; 3231 req->kbuf = kbuf; 3232 } else { 3233 kbuf = ERR_PTR(-ENOBUFS); 3234 } 3235 3236 io_ring_submit_unlock(req->ctx, needs_lock); 3237 return kbuf; 3238} 3239 3240static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, 3241 unsigned int issue_flags) 3242{ 3243 struct io_buffer *kbuf; 3244 u16 bgid; 3245 3246 bgid = req->buf_index; 3247 kbuf = io_buffer_select(req, len, bgid, issue_flags); 3248 if (IS_ERR(kbuf)) 3249 return kbuf; 3250 return u64_to_user_ptr(kbuf->addr); 3251} 3252 3253#ifdef CONFIG_COMPAT 3254static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, 3255 unsigned int issue_flags) 3256{ 3257 struct compat_iovec __user *uiov; 3258 compat_ssize_t clen; 3259 void __user *buf; 3260 ssize_t len; 3261 3262 uiov = u64_to_user_ptr(req->rw.addr); 3263 if (!access_ok(uiov, sizeof(*uiov))) 3264 return -EFAULT; 3265 if (__get_user(clen, &uiov->iov_len)) 3266 return -EFAULT; 3267 if (clen < 0) 3268 return -EINVAL; 3269 3270 len = clen; 3271 buf = io_rw_buffer_select(req, &len, issue_flags); 3272 if (IS_ERR(buf)) 3273 return PTR_ERR(buf); 3274 iov[0].iov_base = buf; 3275 iov[0].iov_len = (compat_size_t) len; 3276 return 0; 3277} 3278#endif 3279 3280static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 3281 unsigned int issue_flags) 3282{ 3283 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); 3284 void __user *buf; 3285 ssize_t len; 3286 3287 if (copy_from_user(iov, uiov, sizeof(*uiov))) 3288 return -EFAULT; 3289 3290 len = iov[0].iov_len; 3291 if (len < 0) 3292 return -EINVAL; 3293 buf = io_rw_buffer_select(req, &len, issue_flags); 3294 if (IS_ERR(buf)) 3295 return PTR_ERR(buf); 3296 iov[0].iov_base = buf; 3297 iov[0].iov_len = len; 3298 return 0; 3299} 3300 3301static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 3302 unsigned int issue_flags) 3303{ 3304 if (req->flags & REQ_F_BUFFER_SELECTED) { 3305 struct io_buffer *kbuf = req->kbuf; 3306 3307 iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 3308 iov[0].iov_len = kbuf->len; 3309 return 0; 3310 } 3311 if (req->rw.len != 1) 3312 return -EINVAL; 3313 3314#ifdef CONFIG_COMPAT 3315 if (req->ctx->compat) 3316 return io_compat_import(req, iov, issue_flags); 3317#endif 3318 3319 return __io_iov_buffer_select(req, iov, issue_flags); 3320} 3321 3322static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, 3323 struct io_rw_state *s, 3324 unsigned int issue_flags) 3325{ 3326 struct iov_iter *iter = &s->iter; 3327 u8 opcode = req->opcode; 3328 struct iovec *iovec; 3329 void __user *buf; 3330 size_t sqe_len; 3331 ssize_t ret; 3332 3333 if (opcode == IORING_OP_READ_FIXED || 
opcode == IORING_OP_WRITE_FIXED) { 3334 ret = io_import_fixed(req, rw, iter); 3335 if (ret) 3336 return ERR_PTR(ret); 3337 return NULL; 3338 } 3339 3340 /* buffer index only valid with fixed read/write, or buffer select */ 3341 if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) 3342 return ERR_PTR(-EINVAL); 3343 3344 buf = u64_to_user_ptr(req->rw.addr); 3345 sqe_len = req->rw.len; 3346 3347 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { 3348 if (req->flags & REQ_F_BUFFER_SELECT) { 3349 buf = io_rw_buffer_select(req, &sqe_len, issue_flags); 3350 if (IS_ERR(buf)) 3351 return ERR_CAST(buf); 3352 req->rw.len = sqe_len; 3353 } 3354 3355 ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter); 3356 if (ret) 3357 return ERR_PTR(ret); 3358 return NULL; 3359 } 3360 3361 iovec = s->fast_iov; 3362 if (req->flags & REQ_F_BUFFER_SELECT) { 3363 ret = io_iov_buffer_select(req, iovec, issue_flags); 3364 if (ret) 3365 return ERR_PTR(ret); 3366 iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); 3367 return NULL; 3368 } 3369 3370 ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter, 3371 req->ctx->compat); 3372 if (unlikely(ret < 0)) 3373 return ERR_PTR(ret); 3374 return iovec; 3375} 3376 3377static inline int io_import_iovec(int rw, struct io_kiocb *req, 3378 struct iovec **iovec, struct io_rw_state *s, 3379 unsigned int issue_flags) 3380{ 3381 *iovec = __io_import_iovec(rw, req, s, issue_flags); 3382 if (unlikely(IS_ERR(*iovec))) 3383 return PTR_ERR(*iovec); 3384 3385 iov_iter_save_state(&s->iter, &s->iter_state); 3386 return 0; 3387} 3388 3389static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) 3390{ 3391 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; 3392} 3393 3394/* 3395 * For files that don't have ->read_iter() and ->write_iter(), handle them 3396 * by looping over ->read() or ->write() manually. 3397 */ 3398static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) 3399{ 3400 struct kiocb *kiocb = &req->rw.kiocb; 3401 struct file *file = req->file; 3402 ssize_t ret = 0; 3403 3404 /* 3405 * Don't support polled IO through this interface, and we can't 3406 * support non-blocking either. For the latter, this just causes 3407 * the kiocb to be handled from an async context. 
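 * (Note the bvec special case in the loop below: for registered (fixed)
 * buffers the iterator is a bvec iter, so the loop takes the user address
 * from req->rw.addr and advances addr/len by hand rather than via
 * iov_iter_advance().)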
3408 */ 3409 if (kiocb->ki_flags & IOCB_HIPRI) 3410 return -EOPNOTSUPP; 3411 if ((kiocb->ki_flags & IOCB_NOWAIT) && 3412 !(kiocb->ki_filp->f_flags & O_NONBLOCK)) 3413 return -EAGAIN; 3414 3415 while (iov_iter_count(iter)) { 3416 struct iovec iovec; 3417 ssize_t nr; 3418 3419 if (!iov_iter_is_bvec(iter)) { 3420 iovec = iov_iter_iovec(iter); 3421 } else { 3422 iovec.iov_base = u64_to_user_ptr(req->rw.addr); 3423 iovec.iov_len = req->rw.len; 3424 } 3425 3426 if (rw == READ) { 3427 nr = file->f_op->read(file, iovec.iov_base, 3428 iovec.iov_len, io_kiocb_ppos(kiocb)); 3429 } else { 3430 nr = file->f_op->write(file, iovec.iov_base, 3431 iovec.iov_len, io_kiocb_ppos(kiocb)); 3432 } 3433 3434 if (nr < 0) { 3435 if (!ret) 3436 ret = nr; 3437 break; 3438 } 3439 if (!iov_iter_is_bvec(iter)) { 3440 iov_iter_advance(iter, nr); 3441 } else { 3442 req->rw.len -= nr; 3443 req->rw.addr += nr; 3444 } 3445 ret += nr; 3446 if (nr != iovec.iov_len) 3447 break; 3448 } 3449 3450 return ret; 3451} 3452 3453static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, 3454 const struct iovec *fast_iov, struct iov_iter *iter) 3455{ 3456 struct io_async_rw *rw = req->async_data; 3457 3458 memcpy(&rw->s.iter, iter, sizeof(*iter)); 3459 rw->free_iovec = iovec; 3460 rw->bytes_done = 0; 3461 /* can only be fixed buffers, no need to do anything */ 3462 if (iov_iter_is_bvec(iter)) 3463 return; 3464 if (!iovec) { 3465 unsigned iov_off = 0; 3466 3467 rw->s.iter.iov = rw->s.fast_iov; 3468 if (iter->iov != fast_iov) { 3469 iov_off = iter->iov - fast_iov; 3470 rw->s.iter.iov += iov_off; 3471 } 3472 if (rw->s.fast_iov != fast_iov) 3473 memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off, 3474 sizeof(struct iovec) * iter->nr_segs); 3475 } else { 3476 req->flags |= REQ_F_NEED_CLEANUP; 3477 } 3478} 3479 3480static inline bool io_alloc_async_data(struct io_kiocb *req) 3481{ 3482 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); 3483 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); 3484 if (req->async_data) { 3485 req->flags |= REQ_F_ASYNC_DATA; 3486 return false; 3487 } 3488 return true; 3489} 3490 3491static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, 3492 struct io_rw_state *s, bool force) 3493{ 3494 if (!force && !io_op_defs[req->opcode].needs_async_setup) 3495 return 0; 3496 if (!req_has_async_data(req)) { 3497 struct io_async_rw *iorw; 3498 3499 if (io_alloc_async_data(req)) { 3500 kfree(iovec); 3501 return -ENOMEM; 3502 } 3503 3504 io_req_map_rw(req, iovec, s->fast_iov, &s->iter); 3505 iorw = req->async_data; 3506 /* we've copied and mapped the iter, ensure state is saved */ 3507 iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); 3508 } 3509 return 0; 3510} 3511 3512static inline int io_rw_prep_async(struct io_kiocb *req, int rw) 3513{ 3514 struct io_async_rw *iorw = req->async_data; 3515 struct iovec *iov; 3516 int ret; 3517 3518 /* submission path, ->uring_lock should already be taken */ 3519 ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); 3520 if (unlikely(ret < 0)) 3521 return ret; 3522 3523 iorw->bytes_done = 0; 3524 iorw->free_iovec = iov; 3525 if (iov) 3526 req->flags |= REQ_F_NEED_CLEANUP; 3527 return 0; 3528} 3529 3530static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3531{ 3532 if (unlikely(!(req->file->f_mode & FMODE_READ))) 3533 return -EBADF; 3534 return io_prep_rw(req, sqe); 3535} 3536 3537/* 3538 * This is our waitqueue callback handler, registered through __folio_lock_async() 3539 * when we initially tried to 
do the IO with the iocb armed our waitqueue. 3540 * This gets called when the page is unlocked, and we generally expect that to 3541 * happen when the page IO is completed and the page is now uptodate. This will 3542 * queue a task_work based retry of the operation, attempting to copy the data 3543 * again. If the latter fails because the page was NOT uptodate, then we will 3544 * do a thread based blocking retry of the operation. That's the unexpected 3545 * slow path. 3546 */ 3547static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, 3548 int sync, void *arg) 3549{ 3550 struct wait_page_queue *wpq; 3551 struct io_kiocb *req = wait->private; 3552 struct wait_page_key *key = arg; 3553 3554 wpq = container_of(wait, struct wait_page_queue, wait); 3555 3556 if (!wake_page_match(wpq, key)) 3557 return 0; 3558 3559 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; 3560 list_del_init(&wait->entry); 3561 io_req_task_queue(req); 3562 return 1; 3563} 3564 3565/* 3566 * This controls whether a given IO request should be armed for async page 3567 * based retry. If we return false here, the request is handed to the async 3568 * worker threads for retry. If we're doing buffered reads on a regular file, 3569 * we prepare a private wait_page_queue entry and retry the operation. This 3570 * will either succeed because the page is now uptodate and unlocked, or it 3571 * will register a callback when the page is unlocked at IO completion. Through 3572 * that callback, io_uring uses task_work to setup a retry of the operation. 3573 * That retry will attempt the buffered read again. The retry will generally 3574 * succeed, or in rare cases where it fails, we then fall back to using the 3575 * async worker threads for a blocking retry. 3576 */ 3577static bool io_rw_should_retry(struct io_kiocb *req) 3578{ 3579 struct io_async_rw *rw = req->async_data; 3580 struct wait_page_queue *wait = &rw->wpq; 3581 struct kiocb *kiocb = &req->rw.kiocb; 3582 3583 /* never retry for NOWAIT, we just complete with -EAGAIN */ 3584 if (req->flags & REQ_F_NOWAIT) 3585 return false; 3586 3587 /* Only for buffered IO */ 3588 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) 3589 return false; 3590 3591 /* 3592 * just use poll if we can, and don't attempt if the fs doesn't 3593 * support callback based unlocks 3594 */ 3595 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) 3596 return false; 3597 3598 wait->wait.func = io_async_buf_func; 3599 wait->wait.private = req; 3600 wait->wait.flags = 0; 3601 INIT_LIST_HEAD(&wait->wait.entry); 3602 kiocb->ki_flags |= IOCB_WAITQ; 3603 kiocb->ki_flags &= ~IOCB_NOWAIT; 3604 kiocb->ki_waitq = wait; 3605 return true; 3606} 3607 3608static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) 3609{ 3610 if (likely(req->file->f_op->read_iter)) 3611 return call_read_iter(req->file, &req->rw.kiocb, iter); 3612 else if (req->file->f_op->read) 3613 return loop_rw_iter(READ, req, iter); 3614 else 3615 return -EINVAL; 3616} 3617 3618static bool need_read_all(struct io_kiocb *req) 3619{ 3620 return req->flags & REQ_F_ISREG || 3621 S_ISBLK(file_inode(req->file)->i_mode); 3622} 3623 3624static int io_read(struct io_kiocb *req, unsigned int issue_flags) 3625{ 3626 struct io_rw_state __s, *s = &__s; 3627 struct iovec *iovec; 3628 struct kiocb *kiocb = &req->rw.kiocb; 3629 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 3630 struct io_async_rw *rw; 3631 ssize_t ret, ret2; 3632 3633 if (!req_has_async_data(req)) { 3634 ret = io_import_iovec(READ, req, 
&iovec, s, issue_flags); 3635 if (unlikely(ret < 0)) 3636 return ret; 3637 } else { 3638 rw = req->async_data; 3639 s = &rw->s; 3640 /* 3641 * We come here from an earlier attempt, restore our state to 3642 * match in case it doesn't. It's cheap enough that we don't 3643 * need to make this conditional. 3644 */ 3645 iov_iter_restore(&s->iter, &s->iter_state); 3646 iovec = NULL; 3647 } 3648 req->result = iov_iter_count(&s->iter); 3649 3650 if (force_nonblock) { 3651 /* If the file doesn't support async, just async punt */ 3652 if (unlikely(!io_file_supports_nowait(req))) { 3653 ret = io_setup_async_rw(req, iovec, s, true); 3654 return ret ?: -EAGAIN; 3655 } 3656 kiocb->ki_flags |= IOCB_NOWAIT; 3657 } else { 3658 /* Ensure we clear previously set non-block flag */ 3659 kiocb->ki_flags &= ~IOCB_NOWAIT; 3660 } 3661 3662 ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result); 3663 if (unlikely(ret)) { 3664 kfree(iovec); 3665 return ret; 3666 } 3667 3668 ret = io_iter_do_read(req, &s->iter); 3669 3670 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { 3671 req->flags &= ~REQ_F_REISSUE; 3672 /* IOPOLL retry should happen for io-wq threads */ 3673 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) 3674 goto done; 3675 /* no retry on NONBLOCK nor RWF_NOWAIT */ 3676 if (req->flags & REQ_F_NOWAIT) 3677 goto done; 3678 ret = 0; 3679 } else if (ret == -EIOCBQUEUED) { 3680 goto out_free; 3681 } else if (ret == req->result || ret <= 0 || !force_nonblock || 3682 (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { 3683 /* read all, failed, already did sync or don't want to retry */ 3684 goto done; 3685 } 3686 3687 /* 3688 * Don't depend on the iter state matching what was consumed, or being 3689 * untouched in case of error. Restore it and we'll advance it 3690 * manually if we need to. 3691 */ 3692 iov_iter_restore(&s->iter, &s->iter_state); 3693 3694 ret2 = io_setup_async_rw(req, iovec, s, true); 3695 if (ret2) 3696 return ret2; 3697 3698 iovec = NULL; 3699 rw = req->async_data; 3700 s = &rw->s; 3701 /* 3702 * Now use our persistent iterator and state, if we aren't already. 3703 * We've restored and mapped the iter to match. 3704 */ 3705 3706 do { 3707 /* 3708 * We end up here because of a partial read, either from 3709 * above or inside this loop. Advance the iter by the bytes 3710 * that were consumed. 3711 */ 3712 iov_iter_advance(&s->iter, ret); 3713 if (!iov_iter_count(&s->iter)) 3714 break; 3715 rw->bytes_done += ret; 3716 iov_iter_save_state(&s->iter, &s->iter_state); 3717 3718 /* if we can retry, do so with the callbacks armed */ 3719 if (!io_rw_should_retry(req)) { 3720 kiocb->ki_flags &= ~IOCB_WAITQ; 3721 return -EAGAIN; 3722 } 3723 3724 /* 3725 * Now retry read with the IOCB_WAITQ parts set in the iocb. If 3726 * we get -EIOCBQUEUED, then we'll get a notification when the 3727 * desired page gets unlocked. We can also get a partial read 3728 * here, and if we do, then just retry at the new offset. 3729 */ 3730 ret = io_iter_do_read(req, &s->iter); 3731 if (ret == -EIOCBQUEUED) 3732 return 0; 3733 /* we got some bytes, but not all. retry. 
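 (the iter is restored below, and the top of the loop advances it by the bytes already read before trying again)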
*/ 3734 kiocb->ki_flags &= ~IOCB_WAITQ; 3735 iov_iter_restore(&s->iter, &s->iter_state); 3736 } while (ret > 0); 3737done: 3738 kiocb_done(req, ret, issue_flags); 3739out_free: 3740 /* it's faster to check here then delegate to kfree */ 3741 if (iovec) 3742 kfree(iovec); 3743 return 0; 3744} 3745 3746static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3747{ 3748 if (unlikely(!(req->file->f_mode & FMODE_WRITE))) 3749 return -EBADF; 3750 req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file)); 3751 return io_prep_rw(req, sqe); 3752} 3753 3754static int io_write(struct io_kiocb *req, unsigned int issue_flags) 3755{ 3756 struct io_rw_state __s, *s = &__s; 3757 struct iovec *iovec; 3758 struct kiocb *kiocb = &req->rw.kiocb; 3759 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 3760 ssize_t ret, ret2; 3761 3762 if (!req_has_async_data(req)) { 3763 ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); 3764 if (unlikely(ret < 0)) 3765 return ret; 3766 } else { 3767 struct io_async_rw *rw = req->async_data; 3768 3769 s = &rw->s; 3770 iov_iter_restore(&s->iter, &s->iter_state); 3771 iovec = NULL; 3772 } 3773 req->result = iov_iter_count(&s->iter); 3774 3775 if (force_nonblock) { 3776 /* If the file doesn't support async, just async punt */ 3777 if (unlikely(!io_file_supports_nowait(req))) 3778 goto copy_iov; 3779 3780 /* file path doesn't support NOWAIT for non-direct_IO */ 3781 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && 3782 (req->flags & REQ_F_ISREG)) 3783 goto copy_iov; 3784 3785 kiocb->ki_flags |= IOCB_NOWAIT; 3786 } else { 3787 /* Ensure we clear previously set non-block flag */ 3788 kiocb->ki_flags &= ~IOCB_NOWAIT; 3789 } 3790 3791 ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result); 3792 if (unlikely(ret)) 3793 goto out_free; 3794 3795 /* 3796 * Open-code file_start_write here to grab freeze protection, 3797 * which will be released by another thread in 3798 * io_complete_rw(). Fool lockdep by telling it the lock got 3799 * released so that it doesn't complain about the held lock when 3800 * we return to userspace. 3801 */ 3802 if (req->flags & REQ_F_ISREG) { 3803 sb_start_write(file_inode(req->file)->i_sb); 3804 __sb_writers_release(file_inode(req->file)->i_sb, 3805 SB_FREEZE_WRITE); 3806 } 3807 kiocb->ki_flags |= IOCB_WRITE; 3808 3809 if (likely(req->file->f_op->write_iter)) 3810 ret2 = call_write_iter(req->file, kiocb, &s->iter); 3811 else if (req->file->f_op->write) 3812 ret2 = loop_rw_iter(WRITE, req, &s->iter); 3813 else 3814 ret2 = -EINVAL; 3815 3816 if (req->flags & REQ_F_REISSUE) { 3817 req->flags &= ~REQ_F_REISSUE; 3818 ret2 = -EAGAIN; 3819 } 3820 3821 /* 3822 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just 3823 * retry them without IOCB_NOWAIT. 
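 * (Turning -EOPNOTSUPP into -EAGAIN routes the request through the copy_iov
 * path below: the iovec is saved into async data and -EAGAIN is returned so
 * the write can be retried from a blocking context, where IOCB_NOWAIT is
 * cleared again (see the force_nonblock handling above).)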
3824 */ 3825 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) 3826 ret2 = -EAGAIN; 3827 /* no retry on NONBLOCK nor RWF_NOWAIT */ 3828 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) 3829 goto done; 3830 if (!force_nonblock || ret2 != -EAGAIN) { 3831 /* IOPOLL retry should happen for io-wq threads */ 3832 if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) 3833 goto copy_iov; 3834done: 3835 kiocb_done(req, ret2, issue_flags); 3836 } else { 3837copy_iov: 3838 iov_iter_restore(&s->iter, &s->iter_state); 3839 ret = io_setup_async_rw(req, iovec, s, false); 3840 return ret ?: -EAGAIN; 3841 } 3842out_free: 3843 /* it's reportedly faster than delegating the null check to kfree() */ 3844 if (iovec) 3845 kfree(iovec); 3846 return ret; 3847} 3848 3849static int io_renameat_prep(struct io_kiocb *req, 3850 const struct io_uring_sqe *sqe) 3851{ 3852 struct io_rename *ren = &req->rename; 3853 const char __user *oldf, *newf; 3854 3855 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3856 return -EINVAL; 3857 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 3858 return -EINVAL; 3859 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3860 return -EBADF; 3861 3862 ren->old_dfd = READ_ONCE(sqe->fd); 3863 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3864 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3865 ren->new_dfd = READ_ONCE(sqe->len); 3866 ren->flags = READ_ONCE(sqe->rename_flags); 3867 3868 ren->oldpath = getname(oldf); 3869 if (IS_ERR(ren->oldpath)) 3870 return PTR_ERR(ren->oldpath); 3871 3872 ren->newpath = getname(newf); 3873 if (IS_ERR(ren->newpath)) { 3874 putname(ren->oldpath); 3875 return PTR_ERR(ren->newpath); 3876 } 3877 3878 req->flags |= REQ_F_NEED_CLEANUP; 3879 return 0; 3880} 3881 3882static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) 3883{ 3884 struct io_rename *ren = &req->rename; 3885 int ret; 3886 3887 if (issue_flags & IO_URING_F_NONBLOCK) 3888 return -EAGAIN; 3889 3890 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, 3891 ren->newpath, ren->flags); 3892 3893 req->flags &= ~REQ_F_NEED_CLEANUP; 3894 if (ret < 0) 3895 req_set_fail(req); 3896 io_req_complete(req, ret); 3897 return 0; 3898} 3899 3900static int io_unlinkat_prep(struct io_kiocb *req, 3901 const struct io_uring_sqe *sqe) 3902{ 3903 struct io_unlink *un = &req->unlink; 3904 const char __user *fname; 3905 3906 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3907 return -EINVAL; 3908 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || 3909 sqe->splice_fd_in) 3910 return -EINVAL; 3911 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3912 return -EBADF; 3913 3914 un->dfd = READ_ONCE(sqe->fd); 3915 3916 un->flags = READ_ONCE(sqe->unlink_flags); 3917 if (un->flags & ~AT_REMOVEDIR) 3918 return -EINVAL; 3919 3920 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3921 un->filename = getname(fname); 3922 if (IS_ERR(un->filename)) 3923 return PTR_ERR(un->filename); 3924 3925 req->flags |= REQ_F_NEED_CLEANUP; 3926 return 0; 3927} 3928 3929static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) 3930{ 3931 struct io_unlink *un = &req->unlink; 3932 int ret; 3933 3934 if (issue_flags & IO_URING_F_NONBLOCK) 3935 return -EAGAIN; 3936 3937 if (un->flags & AT_REMOVEDIR) 3938 ret = do_rmdir(un->dfd, un->filename); 3939 else 3940 ret = do_unlinkat(un->dfd, un->filename); 3941 3942 req->flags &= ~REQ_F_NEED_CLEANUP; 3943 if (ret < 0) 3944 req_set_fail(req); 3945 io_req_complete(req, ret); 3946 return 0; 3947} 3948 3949static int io_mkdirat_prep(struct io_kiocb 
*req, 3950 const struct io_uring_sqe *sqe) 3951{ 3952 struct io_mkdir *mkd = &req->mkdir; 3953 const char __user *fname; 3954 3955 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3956 return -EINVAL; 3957 if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index || 3958 sqe->splice_fd_in) 3959 return -EINVAL; 3960 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3961 return -EBADF; 3962 3963 mkd->dfd = READ_ONCE(sqe->fd); 3964 mkd->mode = READ_ONCE(sqe->len); 3965 3966 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3967 mkd->filename = getname(fname); 3968 if (IS_ERR(mkd->filename)) 3969 return PTR_ERR(mkd->filename); 3970 3971 req->flags |= REQ_F_NEED_CLEANUP; 3972 return 0; 3973} 3974 3975static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) 3976{ 3977 struct io_mkdir *mkd = &req->mkdir; 3978 int ret; 3979 3980 if (issue_flags & IO_URING_F_NONBLOCK) 3981 return -EAGAIN; 3982 3983 ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); 3984 3985 req->flags &= ~REQ_F_NEED_CLEANUP; 3986 if (ret < 0) 3987 req_set_fail(req); 3988 io_req_complete(req, ret); 3989 return 0; 3990} 3991 3992static int io_symlinkat_prep(struct io_kiocb *req, 3993 const struct io_uring_sqe *sqe) 3994{ 3995 struct io_symlink *sl = &req->symlink; 3996 const char __user *oldpath, *newpath; 3997 3998 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3999 return -EINVAL; 4000 if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index || 4001 sqe->splice_fd_in) 4002 return -EINVAL; 4003 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4004 return -EBADF; 4005 4006 sl->new_dfd = READ_ONCE(sqe->fd); 4007 oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4008 newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4009 4010 sl->oldpath = getname(oldpath); 4011 if (IS_ERR(sl->oldpath)) 4012 return PTR_ERR(sl->oldpath); 4013 4014 sl->newpath = getname(newpath); 4015 if (IS_ERR(sl->newpath)) { 4016 putname(sl->oldpath); 4017 return PTR_ERR(sl->newpath); 4018 } 4019 4020 req->flags |= REQ_F_NEED_CLEANUP; 4021 return 0; 4022} 4023 4024static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) 4025{ 4026 struct io_symlink *sl = &req->symlink; 4027 int ret; 4028 4029 if (issue_flags & IO_URING_F_NONBLOCK) 4030 return -EAGAIN; 4031 4032 ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); 4033 4034 req->flags &= ~REQ_F_NEED_CLEANUP; 4035 if (ret < 0) 4036 req_set_fail(req); 4037 io_req_complete(req, ret); 4038 return 0; 4039} 4040 4041static int io_linkat_prep(struct io_kiocb *req, 4042 const struct io_uring_sqe *sqe) 4043{ 4044 struct io_hardlink *lnk = &req->hardlink; 4045 const char __user *oldf, *newf; 4046 4047 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4048 return -EINVAL; 4049 if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) 4050 return -EINVAL; 4051 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4052 return -EBADF; 4053 4054 lnk->old_dfd = READ_ONCE(sqe->fd); 4055 lnk->new_dfd = READ_ONCE(sqe->len); 4056 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4057 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4058 lnk->flags = READ_ONCE(sqe->hardlink_flags); 4059 4060 lnk->oldpath = getname(oldf); 4061 if (IS_ERR(lnk->oldpath)) 4062 return PTR_ERR(lnk->oldpath); 4063 4064 lnk->newpath = getname(newf); 4065 if (IS_ERR(lnk->newpath)) { 4066 putname(lnk->oldpath); 4067 return PTR_ERR(lnk->newpath); 4068 } 4069 4070 req->flags |= REQ_F_NEED_CLEANUP; 4071 return 0; 4072} 4073 4074static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) 4075{ 4076 struct 
io_hardlink *lnk = &req->hardlink; 4077 int ret; 4078 4079 if (issue_flags & IO_URING_F_NONBLOCK) 4080 return -EAGAIN; 4081 4082 ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, 4083 lnk->newpath, lnk->flags); 4084 4085 req->flags &= ~REQ_F_NEED_CLEANUP; 4086 if (ret < 0) 4087 req_set_fail(req); 4088 io_req_complete(req, ret); 4089 return 0; 4090} 4091 4092static int io_shutdown_prep(struct io_kiocb *req, 4093 const struct io_uring_sqe *sqe) 4094{ 4095#if defined(CONFIG_NET) 4096 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4097 return -EINVAL; 4098 if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || 4099 sqe->buf_index || sqe->splice_fd_in)) 4100 return -EINVAL; 4101 4102 req->shutdown.how = READ_ONCE(sqe->len); 4103 return 0; 4104#else 4105 return -EOPNOTSUPP; 4106#endif 4107} 4108 4109static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) 4110{ 4111#if defined(CONFIG_NET) 4112 struct socket *sock; 4113 int ret; 4114 4115 if (issue_flags & IO_URING_F_NONBLOCK) 4116 return -EAGAIN; 4117 4118 sock = sock_from_file(req->file); 4119 if (unlikely(!sock)) 4120 return -ENOTSOCK; 4121 4122 ret = __sys_shutdown_sock(sock, req->shutdown.how); 4123 if (ret < 0) 4124 req_set_fail(req); 4125 io_req_complete(req, ret); 4126 return 0; 4127#else 4128 return -EOPNOTSUPP; 4129#endif 4130} 4131 4132static int __io_splice_prep(struct io_kiocb *req, 4133 const struct io_uring_sqe *sqe) 4134{ 4135 struct io_splice *sp = &req->splice; 4136 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; 4137 4138 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4139 return -EINVAL; 4140 4141 sp->file_in = NULL; 4142 sp->len = READ_ONCE(sqe->len); 4143 sp->flags = READ_ONCE(sqe->splice_flags); 4144 4145 if (unlikely(sp->flags & ~valid_flags)) 4146 return -EINVAL; 4147 4148 sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in), 4149 (sp->flags & SPLICE_F_FD_IN_FIXED)); 4150 if (!sp->file_in) 4151 return -EBADF; 4152 req->flags |= REQ_F_NEED_CLEANUP; 4153 return 0; 4154} 4155 4156static int io_tee_prep(struct io_kiocb *req, 4157 const struct io_uring_sqe *sqe) 4158{ 4159 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) 4160 return -EINVAL; 4161 return __io_splice_prep(req, sqe); 4162} 4163 4164static int io_tee(struct io_kiocb *req, unsigned int issue_flags) 4165{ 4166 struct io_splice *sp = &req->splice; 4167 struct file *in = sp->file_in; 4168 struct file *out = sp->file_out; 4169 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 4170 long ret = 0; 4171 4172 if (issue_flags & IO_URING_F_NONBLOCK) 4173 return -EAGAIN; 4174 if (sp->len) 4175 ret = do_tee(in, out, sp->len, flags); 4176 4177 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 4178 io_put_file(in); 4179 req->flags &= ~REQ_F_NEED_CLEANUP; 4180 4181 if (ret != sp->len) 4182 req_set_fail(req); 4183 io_req_complete(req, ret); 4184 return 0; 4185} 4186 4187static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4188{ 4189 struct io_splice *sp = &req->splice; 4190 4191 sp->off_in = READ_ONCE(sqe->splice_off_in); 4192 sp->off_out = READ_ONCE(sqe->off); 4193 return __io_splice_prep(req, sqe); 4194} 4195 4196static int io_splice(struct io_kiocb *req, unsigned int issue_flags) 4197{ 4198 struct io_splice *sp = &req->splice; 4199 struct file *in = sp->file_in; 4200 struct file *out = sp->file_out; 4201 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 4202 loff_t *poff_in, *poff_out; 4203 long ret = 0; 4204 4205 if (issue_flags & IO_URING_F_NONBLOCK) 4206 
return -EAGAIN; 4207 4208 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; 4209 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; 4210 4211 if (sp->len) 4212 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); 4213 4214 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 4215 io_put_file(in); 4216 req->flags &= ~REQ_F_NEED_CLEANUP; 4217 4218 if (ret != sp->len) 4219 req_set_fail(req); 4220 io_req_complete(req, ret); 4221 return 0; 4222} 4223 4224/* 4225 * IORING_OP_NOP just posts a completion event, nothing else. 4226 */ 4227static int io_nop(struct io_kiocb *req, unsigned int issue_flags) 4228{ 4229 struct io_ring_ctx *ctx = req->ctx; 4230 4231 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4232 return -EINVAL; 4233 4234 __io_req_complete(req, issue_flags, 0, 0); 4235 return 0; 4236} 4237 4238static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4239{ 4240 struct io_ring_ctx *ctx = req->ctx; 4241 4242 if (!req->file) 4243 return -EBADF; 4244 4245 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4246 return -EINVAL; 4247 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || 4248 sqe->splice_fd_in)) 4249 return -EINVAL; 4250 4251 req->sync.flags = READ_ONCE(sqe->fsync_flags); 4252 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) 4253 return -EINVAL; 4254 4255 req->sync.off = READ_ONCE(sqe->off); 4256 req->sync.len = READ_ONCE(sqe->len); 4257 return 0; 4258} 4259 4260static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) 4261{ 4262 loff_t end = req->sync.off + req->sync.len; 4263 int ret; 4264 4265 /* fsync always requires a blocking context */ 4266 if (issue_flags & IO_URING_F_NONBLOCK) 4267 return -EAGAIN; 4268 4269 ret = vfs_fsync_range(req->file, req->sync.off, 4270 end > 0 ? end : LLONG_MAX, 4271 req->sync.flags & IORING_FSYNC_DATASYNC); 4272 if (ret < 0) 4273 req_set_fail(req); 4274 io_req_complete(req, ret); 4275 return 0; 4276} 4277 4278static int io_fallocate_prep(struct io_kiocb *req, 4279 const struct io_uring_sqe *sqe) 4280{ 4281 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || 4282 sqe->splice_fd_in) 4283 return -EINVAL; 4284 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4285 return -EINVAL; 4286 4287 req->sync.off = READ_ONCE(sqe->off); 4288 req->sync.len = READ_ONCE(sqe->addr); 4289 req->sync.mode = READ_ONCE(sqe->len); 4290 return 0; 4291} 4292 4293static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) 4294{ 4295 int ret; 4296 4297 /* fallocate always requiring blocking context */ 4298 if (issue_flags & IO_URING_F_NONBLOCK) 4299 return -EAGAIN; 4300 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, 4301 req->sync.len); 4302 if (ret < 0) 4303 req_set_fail(req); 4304 io_req_complete(req, ret); 4305 return 0; 4306} 4307 4308static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4309{ 4310 const char __user *fname; 4311 int ret; 4312 4313 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4314 return -EINVAL; 4315 if (unlikely(sqe->ioprio || sqe->buf_index)) 4316 return -EINVAL; 4317 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4318 return -EBADF; 4319 4320 /* open.how should be already initialised */ 4321 if (!(req->open.how.flags & O_PATH) && force_o_largefile()) 4322 req->open.how.flags |= O_LARGEFILE; 4323 4324 req->open.dfd = READ_ONCE(sqe->fd); 4325 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4326 req->open.filename = getname(fname); 4327 if (IS_ERR(req->open.filename)) { 4328 ret = PTR_ERR(req->open.filename); 4329 req->open.filename = NULL; 4330 
return ret; 4331 } 4332 4333 req->open.file_slot = READ_ONCE(sqe->file_index); 4334 if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) 4335 return -EINVAL; 4336 4337 req->open.nofile = rlimit(RLIMIT_NOFILE); 4338 req->flags |= REQ_F_NEED_CLEANUP; 4339 return 0; 4340} 4341 4342static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4343{ 4344 u64 mode = READ_ONCE(sqe->len); 4345 u64 flags = READ_ONCE(sqe->open_flags); 4346 4347 req->open.how = build_open_how(flags, mode); 4348 return __io_openat_prep(req, sqe); 4349} 4350 4351static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4352{ 4353 struct open_how __user *how; 4354 size_t len; 4355 int ret; 4356 4357 how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4358 len = READ_ONCE(sqe->len); 4359 if (len < OPEN_HOW_SIZE_VER0) 4360 return -EINVAL; 4361 4362 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, 4363 len); 4364 if (ret) 4365 return ret; 4366 4367 return __io_openat_prep(req, sqe); 4368} 4369 4370static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) 4371{ 4372 struct open_flags op; 4373 struct file *file; 4374 bool resolve_nonblock, nonblock_set; 4375 bool fixed = !!req->open.file_slot; 4376 int ret; 4377 4378 ret = build_open_flags(&req->open.how, &op); 4379 if (ret) 4380 goto err; 4381 nonblock_set = op.open_flag & O_NONBLOCK; 4382 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED; 4383 if (issue_flags & IO_URING_F_NONBLOCK) { 4384 /* 4385 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, 4386 * it'll always -EAGAIN 4387 */ 4388 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) 4389 return -EAGAIN; 4390 op.lookup_flags |= LOOKUP_CACHED; 4391 op.open_flag |= O_NONBLOCK; 4392 } 4393 4394 if (!fixed) { 4395 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); 4396 if (ret < 0) 4397 goto err; 4398 } 4399 4400 file = do_filp_open(req->open.dfd, req->open.filename, &op); 4401 if (IS_ERR(file)) { 4402 /* 4403 * We could hang on to this 'fd' on retrying, but seems like 4404 * marginal gain for something that is now known to be a slower 4405 * path. So just put it, and we'll get a new one when we retry. 
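 *
 * Note that when the request is retried it runs from io-wq without
 * IO_URING_F_NONBLOCK set, so the lookup is then allowed to block and a
 * fresh unused fd is picked up above before do_filp_open() runs again.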
4406 */ 4407 if (!fixed) 4408 put_unused_fd(ret); 4409 4410 ret = PTR_ERR(file); 4411 /* only retry if RESOLVE_CACHED wasn't already set by application */ 4412 if (ret == -EAGAIN && 4413 (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) 4414 return -EAGAIN; 4415 goto err; 4416 } 4417 4418 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) 4419 file->f_flags &= ~O_NONBLOCK; 4420 fsnotify_open(file); 4421 4422 if (!fixed) 4423 fd_install(ret, file); 4424 else 4425 ret = io_install_fixed_file(req, file, issue_flags, 4426 req->open.file_slot - 1); 4427err: 4428 putname(req->open.filename); 4429 req->flags &= ~REQ_F_NEED_CLEANUP; 4430 if (ret < 0) 4431 req_set_fail(req); 4432 __io_req_complete(req, issue_flags, ret, 0); 4433 return 0; 4434} 4435 4436static int io_openat(struct io_kiocb *req, unsigned int issue_flags) 4437{ 4438 return io_openat2(req, issue_flags); 4439} 4440 4441static int io_remove_buffers_prep(struct io_kiocb *req, 4442 const struct io_uring_sqe *sqe) 4443{ 4444 struct io_provide_buf *p = &req->pbuf; 4445 u64 tmp; 4446 4447 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || 4448 sqe->splice_fd_in) 4449 return -EINVAL; 4450 4451 tmp = READ_ONCE(sqe->fd); 4452 if (!tmp || tmp > USHRT_MAX) 4453 return -EINVAL; 4454 4455 memset(p, 0, sizeof(*p)); 4456 p->nbufs = tmp; 4457 p->bgid = READ_ONCE(sqe->buf_group); 4458 return 0; 4459} 4460 4461static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, 4462 int bgid, unsigned nbufs) 4463{ 4464 unsigned i = 0; 4465 4466 /* shouldn't happen */ 4467 if (!nbufs) 4468 return 0; 4469 4470 /* the head kbuf is the list itself */ 4471 while (!list_empty(&buf->list)) { 4472 struct io_buffer *nxt; 4473 4474 nxt = list_first_entry(&buf->list, struct io_buffer, list); 4475 list_del(&nxt->list); 4476 kfree(nxt); 4477 if (++i == nbufs) 4478 return i; 4479 cond_resched(); 4480 } 4481 i++; 4482 kfree(buf); 4483 xa_erase(&ctx->io_buffers, bgid); 4484 4485 return i; 4486} 4487 4488static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) 4489{ 4490 struct io_provide_buf *p = &req->pbuf; 4491 struct io_ring_ctx *ctx = req->ctx; 4492 struct io_buffer *head; 4493 int ret = 0; 4494 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 4495 4496 io_ring_submit_lock(ctx, needs_lock); 4497 4498 lockdep_assert_held(&ctx->uring_lock); 4499 4500 ret = -ENOENT; 4501 head = xa_load(&ctx->io_buffers, p->bgid); 4502 if (head) 4503 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); 4504 if (ret < 0) 4505 req_set_fail(req); 4506 4507 /* complete before unlock, IOPOLL may need the lock */ 4508 __io_req_complete(req, issue_flags, ret, 0); 4509 io_ring_submit_unlock(ctx, needs_lock); 4510 return 0; 4511} 4512 4513static int io_provide_buffers_prep(struct io_kiocb *req, 4514 const struct io_uring_sqe *sqe) 4515{ 4516 unsigned long size, tmp_check; 4517 struct io_provide_buf *p = &req->pbuf; 4518 u64 tmp; 4519 4520 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) 4521 return -EINVAL; 4522 4523 tmp = READ_ONCE(sqe->fd); 4524 if (!tmp || tmp > USHRT_MAX) 4525 return -E2BIG; 4526 p->nbufs = tmp; 4527 p->addr = READ_ONCE(sqe->addr); 4528 p->len = READ_ONCE(sqe->len); 4529 4530 if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, 4531 &size)) 4532 return -EOVERFLOW; 4533 if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) 4534 return -EOVERFLOW; 4535 4536 size = (unsigned long)p->len * p->nbufs; 4537 if (!access_ok(u64_to_user_ptr(p->addr), size)) 4538 return 
-EFAULT; 4539 4540 p->bgid = READ_ONCE(sqe->buf_group); 4541 tmp = READ_ONCE(sqe->off); 4542 if (tmp > USHRT_MAX) 4543 return -E2BIG; 4544 p->bid = tmp; 4545 return 0; 4546} 4547 4548static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) 4549{ 4550 struct io_buffer *buf; 4551 u64 addr = pbuf->addr; 4552 int i, bid = pbuf->bid; 4553 4554 for (i = 0; i < pbuf->nbufs; i++) { 4555 buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); 4556 if (!buf) 4557 break; 4558 4559 buf->addr = addr; 4560 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); 4561 buf->bid = bid; 4562 addr += pbuf->len; 4563 bid++; 4564 if (!*head) { 4565 INIT_LIST_HEAD(&buf->list); 4566 *head = buf; 4567 } else { 4568 list_add_tail(&buf->list, &(*head)->list); 4569 } 4570 cond_resched(); 4571 } 4572 4573 return i ? i : -ENOMEM; 4574} 4575 4576static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) 4577{ 4578 struct io_provide_buf *p = &req->pbuf; 4579 struct io_ring_ctx *ctx = req->ctx; 4580 struct io_buffer *head, *list; 4581 int ret = 0; 4582 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 4583 4584 io_ring_submit_lock(ctx, needs_lock); 4585 4586 lockdep_assert_held(&ctx->uring_lock); 4587 4588 list = head = xa_load(&ctx->io_buffers, p->bgid); 4589 4590 ret = io_add_buffers(p, &head); 4591 if (ret >= 0 && !list) { 4592 ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL); 4593 if (ret < 0) 4594 __io_remove_buffers(ctx, head, p->bgid, -1U); 4595 } 4596 if (ret < 0) 4597 req_set_fail(req); 4598 /* complete before unlock, IOPOLL may need the lock */ 4599 __io_req_complete(req, issue_flags, ret, 0); 4600 io_ring_submit_unlock(ctx, needs_lock); 4601 return 0; 4602} 4603 4604static int io_epoll_ctl_prep(struct io_kiocb *req, 4605 const struct io_uring_sqe *sqe) 4606{ 4607#if defined(CONFIG_EPOLL) 4608 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 4609 return -EINVAL; 4610 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4611 return -EINVAL; 4612 4613 req->epoll.epfd = READ_ONCE(sqe->fd); 4614 req->epoll.op = READ_ONCE(sqe->len); 4615 req->epoll.fd = READ_ONCE(sqe->off); 4616 4617 if (ep_op_has_event(req->epoll.op)) { 4618 struct epoll_event __user *ev; 4619 4620 ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4621 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) 4622 return -EFAULT; 4623 } 4624 4625 return 0; 4626#else 4627 return -EOPNOTSUPP; 4628#endif 4629} 4630 4631static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) 4632{ 4633#if defined(CONFIG_EPOLL) 4634 struct io_epoll *ie = &req->epoll; 4635 int ret; 4636 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4637 4638 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); 4639 if (force_nonblock && ret == -EAGAIN) 4640 return -EAGAIN; 4641 4642 if (ret < 0) 4643 req_set_fail(req); 4644 __io_req_complete(req, issue_flags, ret, 0); 4645 return 0; 4646#else 4647 return -EOPNOTSUPP; 4648#endif 4649} 4650 4651static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4652{ 4653#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 4654 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) 4655 return -EINVAL; 4656 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4657 return -EINVAL; 4658 4659 req->madvise.addr = READ_ONCE(sqe->addr); 4660 req->madvise.len = READ_ONCE(sqe->len); 4661 req->madvise.advice = READ_ONCE(sqe->fadvise_advice); 4662 return 0; 4663#else 4664 return -EOPNOTSUPP; 4665#endif 4666} 4667 4668static int 
io_madvise(struct io_kiocb *req, unsigned int issue_flags) 4669{ 4670#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 4671 struct io_madvise *ma = &req->madvise; 4672 int ret; 4673 4674 if (issue_flags & IO_URING_F_NONBLOCK) 4675 return -EAGAIN; 4676 4677 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); 4678 if (ret < 0) 4679 req_set_fail(req); 4680 io_req_complete(req, ret); 4681 return 0; 4682#else 4683 return -EOPNOTSUPP; 4684#endif 4685} 4686 4687static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4688{ 4689 if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) 4690 return -EINVAL; 4691 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4692 return -EINVAL; 4693 4694 req->fadvise.offset = READ_ONCE(sqe->off); 4695 req->fadvise.len = READ_ONCE(sqe->len); 4696 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); 4697 return 0; 4698} 4699 4700static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) 4701{ 4702 struct io_fadvise *fa = &req->fadvise; 4703 int ret; 4704 4705 if (issue_flags & IO_URING_F_NONBLOCK) { 4706 switch (fa->advice) { 4707 case POSIX_FADV_NORMAL: 4708 case POSIX_FADV_RANDOM: 4709 case POSIX_FADV_SEQUENTIAL: 4710 break; 4711 default: 4712 return -EAGAIN; 4713 } 4714 } 4715 4716 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); 4717 if (ret < 0) 4718 req_set_fail(req); 4719 __io_req_complete(req, issue_flags, ret, 0); 4720 return 0; 4721} 4722 4723static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4724{ 4725 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4726 return -EINVAL; 4727 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 4728 return -EINVAL; 4729 if (req->flags & REQ_F_FIXED_FILE) 4730 return -EBADF; 4731 4732 req->statx.dfd = READ_ONCE(sqe->fd); 4733 req->statx.mask = READ_ONCE(sqe->len); 4734 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4735 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4736 req->statx.flags = READ_ONCE(sqe->statx_flags); 4737 4738 return 0; 4739} 4740 4741static int io_statx(struct io_kiocb *req, unsigned int issue_flags) 4742{ 4743 struct io_statx *ctx = &req->statx; 4744 int ret; 4745 4746 if (issue_flags & IO_URING_F_NONBLOCK) 4747 return -EAGAIN; 4748 4749 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, 4750 ctx->buffer); 4751 4752 if (ret < 0) 4753 req_set_fail(req); 4754 io_req_complete(req, ret); 4755 return 0; 4756} 4757 4758static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4759{ 4760 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4761 return -EINVAL; 4762 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || 4763 sqe->rw_flags || sqe->buf_index) 4764 return -EINVAL; 4765 if (req->flags & REQ_F_FIXED_FILE) 4766 return -EBADF; 4767 4768 req->close.fd = READ_ONCE(sqe->fd); 4769 req->close.file_slot = READ_ONCE(sqe->file_index); 4770 if (req->close.file_slot && req->close.fd) 4771 return -EINVAL; 4772 4773 return 0; 4774} 4775 4776static int io_close(struct io_kiocb *req, unsigned int issue_flags) 4777{ 4778 struct files_struct *files = current->files; 4779 struct io_close *close = &req->close; 4780 struct fdtable *fdt; 4781 struct file *file = NULL; 4782 int ret = -EBADF; 4783 4784 if (req->close.file_slot) { 4785 ret = io_close_fixed(req, issue_flags); 4786 goto err; 4787 } 4788 4789 spin_lock(&files->file_lock); 4790 fdt = files_fdtable(files); 4791 if (close->fd >= fdt->max_fds) { 4792 spin_unlock(&files->file_lock); 
4793 goto err; 4794 } 4795 file = fdt->fd[close->fd]; 4796 if (!file || file->f_op == &io_uring_fops) { 4797 spin_unlock(&files->file_lock); 4798 file = NULL; 4799 goto err; 4800 } 4801 4802 /* if the file has a flush method, be safe and punt to async */ 4803 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { 4804 spin_unlock(&files->file_lock); 4805 return -EAGAIN; 4806 } 4807 4808 ret = __close_fd_get_file(close->fd, &file); 4809 spin_unlock(&files->file_lock); 4810 if (ret < 0) { 4811 if (ret == -ENOENT) 4812 ret = -EBADF; 4813 goto err; 4814 } 4815 4816 /* No ->flush() or already async, safely close from here */ 4817 ret = filp_close(file, current->files); 4818err: 4819 if (ret < 0) 4820 req_set_fail(req); 4821 if (file) 4822 fput(file); 4823 __io_req_complete(req, issue_flags, ret, 0); 4824 return 0; 4825} 4826 4827static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4828{ 4829 struct io_ring_ctx *ctx = req->ctx; 4830 4831 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4832 return -EINVAL; 4833 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || 4834 sqe->splice_fd_in)) 4835 return -EINVAL; 4836 4837 req->sync.off = READ_ONCE(sqe->off); 4838 req->sync.len = READ_ONCE(sqe->len); 4839 req->sync.flags = READ_ONCE(sqe->sync_range_flags); 4840 return 0; 4841} 4842 4843static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) 4844{ 4845 int ret; 4846 4847 /* sync_file_range always requires a blocking context */ 4848 if (issue_flags & IO_URING_F_NONBLOCK) 4849 return -EAGAIN; 4850 4851 ret = sync_file_range(req->file, req->sync.off, req->sync.len, 4852 req->sync.flags); 4853 if (ret < 0) 4854 req_set_fail(req); 4855 io_req_complete(req, ret); 4856 return 0; 4857} 4858 4859#if defined(CONFIG_NET) 4860static int io_setup_async_msg(struct io_kiocb *req, 4861 struct io_async_msghdr *kmsg) 4862{ 4863 struct io_async_msghdr *async_msg = req->async_data; 4864 4865 if (async_msg) 4866 return -EAGAIN; 4867 if (io_alloc_async_data(req)) { 4868 kfree(kmsg->free_iov); 4869 return -ENOMEM; 4870 } 4871 async_msg = req->async_data; 4872 req->flags |= REQ_F_NEED_CLEANUP; 4873 memcpy(async_msg, kmsg, sizeof(*kmsg)); 4874 async_msg->msg.msg_name = &async_msg->addr; 4875 /* if were using fast_iov, set it to the new one */ 4876 if (!async_msg->free_iov) 4877 async_msg->msg.msg_iter.iov = async_msg->fast_iov; 4878 4879 return -EAGAIN; 4880} 4881 4882static int io_sendmsg_copy_hdr(struct io_kiocb *req, 4883 struct io_async_msghdr *iomsg) 4884{ 4885 iomsg->msg.msg_name = &iomsg->addr; 4886 iomsg->free_iov = iomsg->fast_iov; 4887 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, 4888 req->sr_msg.msg_flags, &iomsg->free_iov); 4889} 4890 4891static int io_sendmsg_prep_async(struct io_kiocb *req) 4892{ 4893 int ret; 4894 4895 ret = io_sendmsg_copy_hdr(req, req->async_data); 4896 if (!ret) 4897 req->flags |= REQ_F_NEED_CLEANUP; 4898 return ret; 4899} 4900 4901static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4902{ 4903 struct io_sr_msg *sr = &req->sr_msg; 4904 4905 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4906 return -EINVAL; 4907 4908 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4909 sr->len = READ_ONCE(sqe->len); 4910 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 4911 if (sr->msg_flags & MSG_DONTWAIT) 4912 req->flags |= REQ_F_NOWAIT; 4913 4914#ifdef CONFIG_COMPAT 4915 if (req->ctx->compat) 4916 sr->msg_flags |= MSG_CMSG_COMPAT; 4917#endif 4918 return 0; 4919} 4920 4921static int 
io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) 4922{ 4923 struct io_async_msghdr iomsg, *kmsg; 4924 struct socket *sock; 4925 unsigned flags; 4926 int min_ret = 0; 4927 int ret; 4928 4929 sock = sock_from_file(req->file); 4930 if (unlikely(!sock)) 4931 return -ENOTSOCK; 4932 4933 if (req_has_async_data(req)) { 4934 kmsg = req->async_data; 4935 } else { 4936 ret = io_sendmsg_copy_hdr(req, &iomsg); 4937 if (ret) 4938 return ret; 4939 kmsg = &iomsg; 4940 } 4941 4942 flags = req->sr_msg.msg_flags; 4943 if (issue_flags & IO_URING_F_NONBLOCK) 4944 flags |= MSG_DONTWAIT; 4945 if (flags & MSG_WAITALL) 4946 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 4947 4948 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 4949 4950 if (ret < min_ret) { 4951 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 4952 return io_setup_async_msg(req, kmsg); 4953 if (ret == -ERESTARTSYS) 4954 ret = -EINTR; 4955 req_set_fail(req); 4956 } 4957 /* fast path, check for non-NULL to avoid function call */ 4958 if (kmsg->free_iov) 4959 kfree(kmsg->free_iov); 4960 req->flags &= ~REQ_F_NEED_CLEANUP; 4961 __io_req_complete(req, issue_flags, ret, 0); 4962 return 0; 4963} 4964 4965static int io_send(struct io_kiocb *req, unsigned int issue_flags) 4966{ 4967 struct io_sr_msg *sr = &req->sr_msg; 4968 struct msghdr msg; 4969 struct iovec iov; 4970 struct socket *sock; 4971 unsigned flags; 4972 int min_ret = 0; 4973 int ret; 4974 4975 sock = sock_from_file(req->file); 4976 if (unlikely(!sock)) 4977 return -ENOTSOCK; 4978 4979 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); 4980 if (unlikely(ret)) 4981 return ret; 4982 4983 msg.msg_name = NULL; 4984 msg.msg_control = NULL; 4985 msg.msg_controllen = 0; 4986 msg.msg_namelen = 0; 4987 4988 flags = req->sr_msg.msg_flags; 4989 if (issue_flags & IO_URING_F_NONBLOCK) 4990 flags |= MSG_DONTWAIT; 4991 if (flags & MSG_WAITALL) 4992 min_ret = iov_iter_count(&msg.msg_iter); 4993 4994 msg.msg_flags = flags; 4995 ret = sock_sendmsg(sock, &msg); 4996 if (ret < min_ret) { 4997 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 4998 return -EAGAIN; 4999 if (ret == -ERESTARTSYS) 5000 ret = -EINTR; 5001 req_set_fail(req); 5002 } 5003 __io_req_complete(req, issue_flags, ret, 0); 5004 return 0; 5005} 5006 5007static int __io_recvmsg_copy_hdr(struct io_kiocb *req, 5008 struct io_async_msghdr *iomsg) 5009{ 5010 struct io_sr_msg *sr = &req->sr_msg; 5011 struct iovec __user *uiov; 5012 size_t iov_len; 5013 int ret; 5014 5015 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, 5016 &iomsg->uaddr, &uiov, &iov_len); 5017 if (ret) 5018 return ret; 5019 5020 if (req->flags & REQ_F_BUFFER_SELECT) { 5021 if (iov_len > 1) 5022 return -EINVAL; 5023 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) 5024 return -EFAULT; 5025 sr->len = iomsg->fast_iov[0].iov_len; 5026 iomsg->free_iov = NULL; 5027 } else { 5028 iomsg->free_iov = iomsg->fast_iov; 5029 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, 5030 &iomsg->free_iov, &iomsg->msg.msg_iter, 5031 false); 5032 if (ret > 0) 5033 ret = 0; 5034 } 5035 5036 return ret; 5037} 5038 5039#ifdef CONFIG_COMPAT 5040static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, 5041 struct io_async_msghdr *iomsg) 5042{ 5043 struct io_sr_msg *sr = &req->sr_msg; 5044 struct compat_iovec __user *uiov; 5045 compat_uptr_t ptr; 5046 compat_size_t len; 5047 int ret; 5048 5049 ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, 5050 &ptr, &len); 5051 if (ret) 5052 return ret; 5053 5054 uiov = 
compat_ptr(ptr); 5055 if (req->flags & REQ_F_BUFFER_SELECT) { 5056 compat_ssize_t clen; 5057 5058 if (len > 1) 5059 return -EINVAL; 5060 if (!access_ok(uiov, sizeof(*uiov))) 5061 return -EFAULT; 5062 if (__get_user(clen, &uiov->iov_len)) 5063 return -EFAULT; 5064 if (clen < 0) 5065 return -EINVAL; 5066 sr->len = clen; 5067 iomsg->free_iov = NULL; 5068 } else { 5069 iomsg->free_iov = iomsg->fast_iov; 5070 ret = __import_iovec(READ, (struct iovec __user *)uiov, len, 5071 UIO_FASTIOV, &iomsg->free_iov, 5072 &iomsg->msg.msg_iter, true); 5073 if (ret < 0) 5074 return ret; 5075 } 5076 5077 return 0; 5078} 5079#endif 5080 5081static int io_recvmsg_copy_hdr(struct io_kiocb *req, 5082 struct io_async_msghdr *iomsg) 5083{ 5084 iomsg->msg.msg_name = &iomsg->addr; 5085 5086#ifdef CONFIG_COMPAT 5087 if (req->ctx->compat) 5088 return __io_compat_recvmsg_copy_hdr(req, iomsg); 5089#endif 5090 5091 return __io_recvmsg_copy_hdr(req, iomsg); 5092} 5093 5094static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, 5095 unsigned int issue_flags) 5096{ 5097 struct io_sr_msg *sr = &req->sr_msg; 5098 5099 return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); 5100} 5101 5102static int io_recvmsg_prep_async(struct io_kiocb *req) 5103{ 5104 int ret; 5105 5106 ret = io_recvmsg_copy_hdr(req, req->async_data); 5107 if (!ret) 5108 req->flags |= REQ_F_NEED_CLEANUP; 5109 return ret; 5110} 5111 5112static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5113{ 5114 struct io_sr_msg *sr = &req->sr_msg; 5115 5116 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5117 return -EINVAL; 5118 5119 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5120 sr->len = READ_ONCE(sqe->len); 5121 sr->bgid = READ_ONCE(sqe->buf_group); 5122 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 5123 if (sr->msg_flags & MSG_DONTWAIT) 5124 req->flags |= REQ_F_NOWAIT; 5125 5126#ifdef CONFIG_COMPAT 5127 if (req->ctx->compat) 5128 sr->msg_flags |= MSG_CMSG_COMPAT; 5129#endif 5130 return 0; 5131} 5132 5133static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) 5134{ 5135 struct io_async_msghdr iomsg, *kmsg; 5136 struct socket *sock; 5137 struct io_buffer *kbuf; 5138 unsigned flags; 5139 int ret, min_ret = 0; 5140 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5141 5142 sock = sock_from_file(req->file); 5143 if (unlikely(!sock)) 5144 return -ENOTSOCK; 5145 5146 if (req_has_async_data(req)) { 5147 kmsg = req->async_data; 5148 } else { 5149 ret = io_recvmsg_copy_hdr(req, &iomsg); 5150 if (ret) 5151 return ret; 5152 kmsg = &iomsg; 5153 } 5154 5155 if (req->flags & REQ_F_BUFFER_SELECT) { 5156 kbuf = io_recv_buffer_select(req, issue_flags); 5157 if (IS_ERR(kbuf)) 5158 return PTR_ERR(kbuf); 5159 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 5160 kmsg->fast_iov[0].iov_len = req->sr_msg.len; 5161 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 5162 1, req->sr_msg.len); 5163 } 5164 5165 flags = req->sr_msg.msg_flags; 5166 if (force_nonblock) 5167 flags |= MSG_DONTWAIT; 5168 if (flags & MSG_WAITALL) 5169 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 5170 5171 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, 5172 kmsg->uaddr, flags); 5173 if (ret < min_ret) { 5174 if (ret == -EAGAIN && force_nonblock) 5175 return io_setup_async_msg(req, kmsg); 5176 if (ret == -ERESTARTSYS) 5177 ret = -EINTR; 5178 req_set_fail(req); 5179 } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { 5180 req_set_fail(req); 5181 } 5182 5183 /* 
fast path, check for non-NULL to avoid function call */ 5184 if (kmsg->free_iov) 5185 kfree(kmsg->free_iov); 5186 req->flags &= ~REQ_F_NEED_CLEANUP; 5187 __io_req_complete(req, issue_flags, ret, io_put_kbuf(req)); 5188 return 0; 5189} 5190 5191static int io_recv(struct io_kiocb *req, unsigned int issue_flags) 5192{ 5193 struct io_buffer *kbuf; 5194 struct io_sr_msg *sr = &req->sr_msg; 5195 struct msghdr msg; 5196 void __user *buf = sr->buf; 5197 struct socket *sock; 5198 struct iovec iov; 5199 unsigned flags; 5200 int ret, min_ret = 0; 5201 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5202 5203 sock = sock_from_file(req->file); 5204 if (unlikely(!sock)) 5205 return -ENOTSOCK; 5206 5207 if (req->flags & REQ_F_BUFFER_SELECT) { 5208 kbuf = io_recv_buffer_select(req, issue_flags); 5209 if (IS_ERR(kbuf)) 5210 return PTR_ERR(kbuf); 5211 buf = u64_to_user_ptr(kbuf->addr); 5212 } 5213 5214 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); 5215 if (unlikely(ret)) 5216 goto out_free; 5217 5218 msg.msg_name = NULL; 5219 msg.msg_control = NULL; 5220 msg.msg_controllen = 0; 5221 msg.msg_namelen = 0; 5222 msg.msg_iocb = NULL; 5223 msg.msg_flags = 0; 5224 5225 flags = req->sr_msg.msg_flags; 5226 if (force_nonblock) 5227 flags |= MSG_DONTWAIT; 5228 if (flags & MSG_WAITALL) 5229 min_ret = iov_iter_count(&msg.msg_iter); 5230 5231 ret = sock_recvmsg(sock, &msg, flags); 5232 if (ret < min_ret) { 5233 if (ret == -EAGAIN && force_nonblock) 5234 return -EAGAIN; 5235 if (ret == -ERESTARTSYS) 5236 ret = -EINTR; 5237 req_set_fail(req); 5238 } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { 5239out_free: 5240 req_set_fail(req); 5241 } 5242 __io_req_complete(req, issue_flags, ret, io_put_kbuf(req)); 5243 return 0; 5244} 5245 5246static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5247{ 5248 struct io_accept *accept = &req->accept; 5249 5250 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5251 return -EINVAL; 5252 if (sqe->ioprio || sqe->len || sqe->buf_index) 5253 return -EINVAL; 5254 5255 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5256 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 5257 accept->flags = READ_ONCE(sqe->accept_flags); 5258 accept->nofile = rlimit(RLIMIT_NOFILE); 5259 5260 accept->file_slot = READ_ONCE(sqe->file_index); 5261 if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) || 5262 (accept->flags & SOCK_CLOEXEC))) 5263 return -EINVAL; 5264 if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 5265 return -EINVAL; 5266 if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) 5267 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 5268 return 0; 5269} 5270 5271static int io_accept(struct io_kiocb *req, unsigned int issue_flags) 5272{ 5273 struct io_accept *accept = &req->accept; 5274 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5275 unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; 5276 bool fixed = !!accept->file_slot; 5277 struct file *file; 5278 int ret, fd; 5279 5280 if (req->file->f_flags & O_NONBLOCK) 5281 req->flags |= REQ_F_NOWAIT; 5282 5283 if (!fixed) { 5284 fd = __get_unused_fd_flags(accept->flags, accept->nofile); 5285 if (unlikely(fd < 0)) 5286 return fd; 5287 } 5288 file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, 5289 accept->flags); 5290 if (IS_ERR(file)) { 5291 if (!fixed) 5292 put_unused_fd(fd); 5293 ret = PTR_ERR(file); 5294 if (ret == -EAGAIN && force_nonblock) 5295 return -EAGAIN; 5296 if (ret == -ERESTARTSYS) 5297 ret = -EINTR; 5298 req_set_fail(req); 5299 } else if (!fixed) { 5300 fd_install(fd, file); 5301 ret = fd; 5302 } else { 5303 ret = io_install_fixed_file(req, file, issue_flags, 5304 accept->file_slot - 1); 5305 } 5306 __io_req_complete(req, issue_flags, ret, 0); 5307 return 0; 5308} 5309 5310static int io_connect_prep_async(struct io_kiocb *req) 5311{ 5312 struct io_async_connect *io = req->async_data; 5313 struct io_connect *conn = &req->connect; 5314 5315 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); 5316} 5317 5318static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5319{ 5320 struct io_connect *conn = &req->connect; 5321 5322 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5323 return -EINVAL; 5324 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || 5325 sqe->splice_fd_in) 5326 return -EINVAL; 5327 5328 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5329 conn->addr_len = READ_ONCE(sqe->addr2); 5330 return 0; 5331} 5332 5333static int io_connect(struct io_kiocb *req, unsigned int issue_flags) 5334{ 5335 struct io_async_connect __io, *io; 5336 unsigned file_flags; 5337 int ret; 5338 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5339 5340 if (req_has_async_data(req)) { 5341 io = req->async_data; 5342 } else { 5343 ret = move_addr_to_kernel(req->connect.addr, 5344 req->connect.addr_len, 5345 &__io.address); 5346 if (ret) 5347 goto out; 5348 io = &__io; 5349 } 5350 5351 file_flags = force_nonblock ? 
O_NONBLOCK : 0; 5352 5353 ret = __sys_connect_file(req->file, &io->address, 5354 req->connect.addr_len, file_flags); 5355 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { 5356 if (req_has_async_data(req)) 5357 return -EAGAIN; 5358 if (io_alloc_async_data(req)) { 5359 ret = -ENOMEM; 5360 goto out; 5361 } 5362 memcpy(req->async_data, &__io, sizeof(__io)); 5363 return -EAGAIN; 5364 } 5365 if (ret == -ERESTARTSYS) 5366 ret = -EINTR; 5367out: 5368 if (ret < 0) 5369 req_set_fail(req); 5370 __io_req_complete(req, issue_flags, ret, 0); 5371 return 0; 5372} 5373#else /* !CONFIG_NET */ 5374#define IO_NETOP_FN(op) \ 5375static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \ 5376{ \ 5377 return -EOPNOTSUPP; \ 5378} 5379 5380#define IO_NETOP_PREP(op) \ 5381IO_NETOP_FN(op) \ 5382static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \ 5383{ \ 5384 return -EOPNOTSUPP; \ 5385} \ 5386 5387#define IO_NETOP_PREP_ASYNC(op) \ 5388IO_NETOP_PREP(op) \ 5389static int io_##op##_prep_async(struct io_kiocb *req) \ 5390{ \ 5391 return -EOPNOTSUPP; \ 5392} 5393 5394IO_NETOP_PREP_ASYNC(sendmsg); 5395IO_NETOP_PREP_ASYNC(recvmsg); 5396IO_NETOP_PREP_ASYNC(connect); 5397IO_NETOP_PREP(accept); 5398IO_NETOP_FN(send); 5399IO_NETOP_FN(recv); 5400#endif /* CONFIG_NET */ 5401 5402struct io_poll_table { 5403 struct poll_table_struct pt; 5404 struct io_kiocb *req; 5405 int nr_entries; 5406 int error; 5407}; 5408 5409#define IO_POLL_CANCEL_FLAG BIT(31) 5410#define IO_POLL_REF_MASK ((1u << 20)-1) 5411 5412/* 5413 * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can 5414 * bump it and acquire ownership. It's disallowed to modify requests while not 5415 * owning it, that prevents from races for enqueueing task_work's and b/w 5416 * arming poll and wakeups. 
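 *
 * A rough sketch of the protocol (illustrative only; "observed" is just the
 * reference count the task_work handler saw when it started):
 *
 *	wakeup / arming side:
 *		if (!(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK))
 *			queue task_work, ownership was just taken
 *
 *	task_work handler:
 *		do {
 *			observed = atomic_read(&req->poll_refs);
 *			process events
 *		} while (atomic_sub_return(observed & IO_POLL_REF_MASK,
 *					   &req->poll_refs));
 *
 * Any increment made while the handler owns the request leaves the final
 * subtraction non-zero, so the handler loops and re-checks rather than
 * losing that wakeup.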
5417 */ 5418static inline bool io_poll_get_ownership(struct io_kiocb *req) 5419{ 5420 return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); 5421} 5422 5423static void io_poll_mark_cancelled(struct io_kiocb *req) 5424{ 5425 atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); 5426} 5427 5428static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) 5429{ 5430 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ 5431 if (req->opcode == IORING_OP_POLL_ADD) 5432 return req->async_data; 5433 return req->apoll->double_poll; 5434} 5435 5436static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) 5437{ 5438 if (req->opcode == IORING_OP_POLL_ADD) 5439 return &req->poll; 5440 return &req->apoll->poll; 5441} 5442 5443static void io_poll_req_insert(struct io_kiocb *req) 5444{ 5445 struct io_ring_ctx *ctx = req->ctx; 5446 struct hlist_head *list; 5447 5448 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; 5449 hlist_add_head(&req->hash_node, list); 5450} 5451 5452static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, 5453 wait_queue_func_t wake_func) 5454{ 5455 poll->head = NULL; 5456#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) 5457 /* mask in events that we always want/need */ 5458 poll->events = events | IO_POLL_UNMASK; 5459 INIT_LIST_HEAD(&poll->wait.entry); 5460 init_waitqueue_func_entry(&poll->wait, wake_func); 5461} 5462 5463static inline void io_poll_remove_entry(struct io_poll_iocb *poll) 5464{ 5465 struct wait_queue_head *head = smp_load_acquire(&poll->head); 5466 5467 if (head) { 5468 spin_lock_irq(&head->lock); 5469 list_del_init(&poll->wait.entry); 5470 poll->head = NULL; 5471 spin_unlock_irq(&head->lock); 5472 } 5473} 5474 5475static void io_poll_remove_entries(struct io_kiocb *req) 5476{ 5477 struct io_poll_iocb *poll = io_poll_get_single(req); 5478 struct io_poll_iocb *poll_double = io_poll_get_double(req); 5479 5480 /* 5481 * While we hold the waitqueue lock and the waitqueue is nonempty, 5482 * wake_up_pollfree() will wait for us. However, taking the waitqueue 5483 * lock in the first place can race with the waitqueue being freed. 5484 * 5485 * We solve this as eventpoll does: by taking advantage of the fact that 5486 * all users of wake_up_pollfree() will RCU-delay the actual free. If 5487 * we enter rcu_read_lock() and see that the pointer to the queue is 5488 * non-NULL, we can then lock it without the memory being freed out from 5489 * under us. 5490 * 5491 * Keep holding rcu_read_lock() as long as we hold the queue lock, in 5492 * case the caller deletes the entry from the queue, leaving it empty. 5493 * In that case, only RCU prevents the queue memory from being freed. 5494 */ 5495 rcu_read_lock(); 5496 io_poll_remove_entry(poll); 5497 if (poll_double) 5498 io_poll_remove_entry(poll_double); 5499 rcu_read_unlock(); 5500} 5501 5502/* 5503 * All poll tw should go through this. Checks for poll events, manages 5504 * references, does rewait, etc. 5505 * 5506 * Returns a negative error on failure. >0 when no action require, which is 5507 * either spurious wakeup or multishot CQE is served. 0 when it's done with 5508 * the request, then the mask is stored in req->result. 
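 *
 * Callers act on this accordingly: a positive return means nothing further
 * to do; on 0, io_poll_task_func() completes the request from req->result
 * while io_apoll_task_func() resubmits the original request; a negative
 * return fails the request.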
5509 */ 5510static int io_poll_check_events(struct io_kiocb *req) 5511{ 5512 struct io_ring_ctx *ctx = req->ctx; 5513 struct io_poll_iocb *poll = io_poll_get_single(req); 5514 int v; 5515 5516 /* req->task == current here, checking PF_EXITING is safe */ 5517 if (unlikely(req->task->flags & PF_EXITING)) 5518 io_poll_mark_cancelled(req); 5519 5520 do { 5521 v = atomic_read(&req->poll_refs); 5522 5523 /* tw handler should be the owner, and so have some references */ 5524 if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) 5525 return 0; 5526 if (v & IO_POLL_CANCEL_FLAG) 5527 return -ECANCELED; 5528 5529 if (!req->result) { 5530 struct poll_table_struct pt = { ._key = poll->events }; 5531 5532 req->result = vfs_poll(req->file, &pt) & poll->events; 5533 } 5534 5535 /* multishot, just fill an CQE and proceed */ 5536 if (req->result && !(poll->events & EPOLLONESHOT)) { 5537 __poll_t mask = mangle_poll(req->result & poll->events); 5538 bool filled; 5539 5540 spin_lock(&ctx->completion_lock); 5541 filled = io_fill_cqe_aux(ctx, req->user_data, mask, 5542 IORING_CQE_F_MORE); 5543 io_commit_cqring(ctx); 5544 spin_unlock(&ctx->completion_lock); 5545 if (unlikely(!filled)) 5546 return -ECANCELED; 5547 io_cqring_ev_posted(ctx); 5548 } else if (req->result) { 5549 return 0; 5550 } 5551 5552 /* 5553 * Release all references, retry if someone tried to restart 5554 * task_work while we were executing it. 5555 */ 5556 } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); 5557 5558 return 1; 5559} 5560 5561static void io_poll_task_func(struct io_kiocb *req, bool *locked) 5562{ 5563 struct io_ring_ctx *ctx = req->ctx; 5564 int ret; 5565 5566 ret = io_poll_check_events(req); 5567 if (ret > 0) 5568 return; 5569 5570 if (!ret) { 5571 req->result = mangle_poll(req->result & req->poll.events); 5572 } else { 5573 req->result = ret; 5574 req_set_fail(req); 5575 } 5576 5577 io_poll_remove_entries(req); 5578 spin_lock(&ctx->completion_lock); 5579 hash_del(&req->hash_node); 5580 __io_req_complete_post(req, req->result, 0); 5581 io_commit_cqring(ctx); 5582 spin_unlock(&ctx->completion_lock); 5583 io_cqring_ev_posted(ctx); 5584} 5585 5586static void io_apoll_task_func(struct io_kiocb *req, bool *locked) 5587{ 5588 struct io_ring_ctx *ctx = req->ctx; 5589 int ret; 5590 5591 ret = io_poll_check_events(req); 5592 if (ret > 0) 5593 return; 5594 5595 io_poll_remove_entries(req); 5596 spin_lock(&ctx->completion_lock); 5597 hash_del(&req->hash_node); 5598 spin_unlock(&ctx->completion_lock); 5599 5600 if (!ret) 5601 io_req_task_submit(req, locked); 5602 else 5603 io_req_complete_failed(req, ret); 5604} 5605 5606static void __io_poll_execute(struct io_kiocb *req, int mask) 5607{ 5608 req->result = mask; 5609 if (req->opcode == IORING_OP_POLL_ADD) 5610 req->io_task_work.func = io_poll_task_func; 5611 else 5612 req->io_task_work.func = io_apoll_task_func; 5613 5614 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); 5615 io_req_task_work_add(req, false); 5616} 5617 5618static inline void io_poll_execute(struct io_kiocb *req, int res) 5619{ 5620 if (io_poll_get_ownership(req)) 5621 __io_poll_execute(req, res); 5622} 5623 5624static void io_poll_cancel_req(struct io_kiocb *req) 5625{ 5626 io_poll_mark_cancelled(req); 5627 /* kick tw, which should complete the request */ 5628 io_poll_execute(req, 0); 5629} 5630 5631static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 5632 void *key) 5633{ 5634 struct io_kiocb *req = wait->private; 5635 struct io_poll_iocb *poll = container_of(wait, 
struct io_poll_iocb, 5636 wait); 5637 __poll_t mask = key_to_poll(key); 5638 5639 if (unlikely(mask & POLLFREE)) { 5640 io_poll_mark_cancelled(req); 5641 /* we have to kick tw in case it's not already */ 5642 io_poll_execute(req, 0); 5643 5644 /* 5645 * If the waitqueue is being freed early but someone is already 5646 * holds ownership over it, we have to tear down the request as 5647 * best we can. That means immediately removing the request from 5648 * its waitqueue and preventing all further accesses to the 5649 * waitqueue via the request. 5650 */ 5651 list_del_init(&poll->wait.entry); 5652 5653 /* 5654 * Careful: this *must* be the last step, since as soon 5655 * as req->head is NULL'ed out, the request can be 5656 * completed and freed, since aio_poll_complete_work() 5657 * will no longer need to take the waitqueue lock. 5658 */ 5659 smp_store_release(&poll->head, NULL); 5660 return 1; 5661 } 5662 5663 /* for instances that support it check for an event match first */ 5664 if (mask && !(mask & poll->events)) 5665 return 0; 5666 5667 if (io_poll_get_ownership(req)) { 5668 /* optional, saves extra locking for removal in tw handler */ 5669 if (mask && poll->events & EPOLLONESHOT) { 5670 list_del_init(&poll->wait.entry); 5671 poll->head = NULL; 5672 } 5673 __io_poll_execute(req, mask); 5674 } 5675 return 1; 5676} 5677 5678static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, 5679 struct wait_queue_head *head, 5680 struct io_poll_iocb **poll_ptr) 5681{ 5682 struct io_kiocb *req = pt->req; 5683 5684 /* 5685 * The file being polled uses multiple waitqueues for poll handling 5686 * (e.g. one for read, one for write). Setup a separate io_poll_iocb 5687 * if this happens. 5688 */ 5689 if (unlikely(pt->nr_entries)) { 5690 struct io_poll_iocb *first = poll; 5691 5692 /* double add on the same waitqueue head, ignore */ 5693 if (first->head == head) 5694 return; 5695 /* already have a 2nd entry, fail a third attempt */ 5696 if (*poll_ptr) { 5697 if ((*poll_ptr)->head == head) 5698 return; 5699 pt->error = -EINVAL; 5700 return; 5701 } 5702 5703 poll = kmalloc(sizeof(*poll), GFP_ATOMIC); 5704 if (!poll) { 5705 pt->error = -ENOMEM; 5706 return; 5707 } 5708 io_init_poll_iocb(poll, first->events, first->wait.func); 5709 *poll_ptr = poll; 5710 if (req->opcode == IORING_OP_POLL_ADD) 5711 req->flags |= REQ_F_ASYNC_DATA; 5712 } 5713 5714 pt->nr_entries++; 5715 poll->head = head; 5716 poll->wait.private = req; 5717 5718 if (poll->events & EPOLLEXCLUSIVE) 5719 add_wait_queue_exclusive(head, &poll->wait); 5720 else 5721 add_wait_queue(head, &poll->wait); 5722} 5723 5724static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, 5725 struct poll_table_struct *p) 5726{ 5727 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 5728 5729 __io_queue_proc(&pt->req->poll, pt, head, 5730 (struct io_poll_iocb **) &pt->req->async_data); 5731} 5732 5733static int __io_arm_poll_handler(struct io_kiocb *req, 5734 struct io_poll_iocb *poll, 5735 struct io_poll_table *ipt, __poll_t mask) 5736{ 5737 struct io_ring_ctx *ctx = req->ctx; 5738 int v; 5739 5740 INIT_HLIST_NODE(&req->hash_node); 5741 io_init_poll_iocb(poll, mask, io_poll_wake); 5742 poll->file = req->file; 5743 poll->wait.private = req; 5744 5745 ipt->pt._key = mask; 5746 ipt->req = req; 5747 ipt->error = 0; 5748 ipt->nr_entries = 0; 5749 5750 /* 5751 * Take the ownership to delay any tw execution up until we're done 5752 * with poll arming. see io_poll_get_ownership(). 
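 *
 * If arming succeeds without an immediate event, this reference is dropped
 * by the atomic_dec_return() near the end of the function; should a wakeup
 * have raced in and bumped poll_refs meanwhile, the leftover reference is
 * noticed there and task_work is kicked on that wakeup's behalf.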
5753 */ 5754 atomic_set(&req->poll_refs, 1); 5755 mask = vfs_poll(req->file, &ipt->pt) & poll->events; 5756 5757 if (mask && (poll->events & EPOLLONESHOT)) { 5758 io_poll_remove_entries(req); 5759 /* no one else has access to the req, forget about the ref */ 5760 return mask; 5761 } 5762 if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { 5763 io_poll_remove_entries(req); 5764 if (!ipt->error) 5765 ipt->error = -EINVAL; 5766 return 0; 5767 } 5768 5769 spin_lock(&ctx->completion_lock); 5770 io_poll_req_insert(req); 5771 spin_unlock(&ctx->completion_lock); 5772 5773 if (mask) { 5774 /* can't multishot if failed, just queue the event we've got */ 5775 if (unlikely(ipt->error || !ipt->nr_entries)) 5776 poll->events |= EPOLLONESHOT; 5777 __io_poll_execute(req, mask); 5778 return 0; 5779 } 5780 5781 /* 5782 * Release ownership. If someone tried to queue a tw while it was 5783 * locked, kick it off for them. 5784 */ 5785 v = atomic_dec_return(&req->poll_refs); 5786 if (unlikely(v & IO_POLL_REF_MASK)) 5787 __io_poll_execute(req, 0); 5788 return 0; 5789} 5790 5791static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, 5792 struct poll_table_struct *p) 5793{ 5794 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 5795 struct async_poll *apoll = pt->req->apoll; 5796 5797 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); 5798} 5799 5800enum { 5801 IO_APOLL_OK, 5802 IO_APOLL_ABORTED, 5803 IO_APOLL_READY 5804}; 5805 5806static int io_arm_poll_handler(struct io_kiocb *req) 5807{ 5808 const struct io_op_def *def = &io_op_defs[req->opcode]; 5809 struct io_ring_ctx *ctx = req->ctx; 5810 struct async_poll *apoll; 5811 struct io_poll_table ipt; 5812 __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI; 5813 int ret; 5814 5815 if (!def->pollin && !def->pollout) 5816 return IO_APOLL_ABORTED; 5817 if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED)) 5818 return IO_APOLL_ABORTED; 5819 5820 if (def->pollin) { 5821 mask |= POLLIN | POLLRDNORM; 5822 5823 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ 5824 if ((req->opcode == IORING_OP_RECVMSG) && 5825 (req->sr_msg.msg_flags & MSG_ERRQUEUE)) 5826 mask &= ~POLLIN; 5827 } else { 5828 mask |= POLLOUT | POLLWRNORM; 5829 } 5830 5831 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); 5832 if (unlikely(!apoll)) 5833 return IO_APOLL_ABORTED; 5834 apoll->double_poll = NULL; 5835 req->apoll = apoll; 5836 req->flags |= REQ_F_POLLED; 5837 ipt.pt._qproc = io_async_queue_proc; 5838 5839 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); 5840 if (ret || ipt.error) 5841 return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; 5842 5843 trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, 5844 mask, apoll->poll.events); 5845 return IO_APOLL_OK; 5846} 5847 5848/* 5849 * Returns true if we found and killed one or more poll requests 5850 */ 5851static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, 5852 struct task_struct *tsk, bool cancel_all) 5853{ 5854 struct hlist_node *tmp; 5855 struct io_kiocb *req; 5856 bool found = false; 5857 int i; 5858 5859 spin_lock(&ctx->completion_lock); 5860 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 5861 struct hlist_head *list; 5862 5863 list = &ctx->cancel_hash[i]; 5864 hlist_for_each_entry_safe(req, tmp, list, hash_node) { 5865 if (io_match_task_safe(req, tsk, cancel_all)) { 5866 io_poll_cancel_req(req); 5867 found = true; 5868 } 5869 } 5870 } 5871 spin_unlock(&ctx->completion_lock); 5872 return found; 5873} 5874 5875static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, 5876 bool poll_only) 5877 __must_hold(&ctx->completion_lock) 5878{ 5879 struct hlist_head *list; 5880 struct io_kiocb *req; 5881 5882 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; 5883 hlist_for_each_entry(req, list, hash_node) { 5884 if (sqe_addr != req->user_data) 5885 continue; 5886 if (poll_only && req->opcode != IORING_OP_POLL_ADD) 5887 continue; 5888 return req; 5889 } 5890 return NULL; 5891} 5892 5893static bool io_poll_disarm(struct io_kiocb *req) 5894 __must_hold(&ctx->completion_lock) 5895{ 5896 if (!io_poll_get_ownership(req)) 5897 return false; 5898 io_poll_remove_entries(req); 5899 hash_del(&req->hash_node); 5900 return true; 5901} 5902 5903static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, 5904 bool poll_only) 5905 __must_hold(&ctx->completion_lock) 5906{ 5907 struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only); 5908 5909 if (!req) 5910 return -ENOENT; 5911 io_poll_cancel_req(req); 5912 return 0; 5913} 5914 5915static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, 5916 unsigned int flags) 5917{ 5918 u32 events; 5919 5920 events = READ_ONCE(sqe->poll32_events); 5921#ifdef __BIG_ENDIAN 5922 events = swahw32(events); 5923#endif 5924 if (!(flags & IORING_POLL_ADD_MULTI)) 5925 events |= EPOLLONESHOT; 5926 return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); 5927} 5928 5929static int io_poll_update_prep(struct io_kiocb *req, 5930 const struct io_uring_sqe *sqe) 5931{ 5932 struct io_poll_update *upd = &req->poll_update; 5933 u32 flags; 5934 5935 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5936 return -EINVAL; 5937 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 5938 return -EINVAL; 5939 flags = READ_ONCE(sqe->len); 5940 if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | 5941 IORING_POLL_ADD_MULTI)) 5942 return -EINVAL; 5943 /* meaningless without update */ 5944 if (flags == IORING_POLL_ADD_MULTI) 5945 return -EINVAL; 5946 5947 upd->old_user_data = READ_ONCE(sqe->addr); 5948 upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; 5949 upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; 5950 5951 upd->new_user_data = READ_ONCE(sqe->off); 5952 if (!upd->update_user_data && upd->new_user_data) 5953 return -EINVAL; 5954 if (upd->update_events) 5955 upd->events = io_poll_parse_events(sqe, flags); 5956 else if (sqe->poll32_events) 5957 return -EINVAL; 5958 5959 return 0; 5960} 5961 5962static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5963{ 5964 struct io_poll_iocb 
*poll = &req->poll; 5965 u32 flags; 5966 5967 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5968 return -EINVAL; 5969 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) 5970 return -EINVAL; 5971 flags = READ_ONCE(sqe->len); 5972 if (flags & ~IORING_POLL_ADD_MULTI) 5973 return -EINVAL; 5974 if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) 5975 return -EINVAL; 5976 5977 io_req_set_refcount(req); 5978 poll->events = io_poll_parse_events(sqe, flags); 5979 return 0; 5980} 5981 5982static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) 5983{ 5984 struct io_poll_iocb *poll = &req->poll; 5985 struct io_poll_table ipt; 5986 int ret; 5987 5988 ipt.pt._qproc = io_poll_queue_proc; 5989 5990 ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); 5991 ret = ret ?: ipt.error; 5992 if (ret) 5993 __io_req_complete(req, issue_flags, ret, 0); 5994 return 0; 5995} 5996 5997static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) 5998{ 5999 struct io_ring_ctx *ctx = req->ctx; 6000 struct io_kiocb *preq; 6001 int ret2, ret = 0; 6002 bool locked; 6003 6004 spin_lock(&ctx->completion_lock); 6005 preq = io_poll_find(ctx, req->poll_update.old_user_data, true); 6006 if (!preq || !io_poll_disarm(preq)) { 6007 spin_unlock(&ctx->completion_lock); 6008 ret = preq ? -EALREADY : -ENOENT; 6009 goto out; 6010 } 6011 spin_unlock(&ctx->completion_lock); 6012 6013 if (req->poll_update.update_events || req->poll_update.update_user_data) { 6014 /* only mask one event flags, keep behavior flags */ 6015 if (req->poll_update.update_events) { 6016 preq->poll.events &= ~0xffff; 6017 preq->poll.events |= req->poll_update.events & 0xffff; 6018 preq->poll.events |= IO_POLL_UNMASK; 6019 } 6020 if (req->poll_update.update_user_data) 6021 preq->user_data = req->poll_update.new_user_data; 6022 6023 ret2 = io_poll_add(preq, issue_flags); 6024 /* successfully updated, don't complete poll request */ 6025 if (!ret2) 6026 goto out; 6027 } 6028 6029 req_set_fail(preq); 6030 preq->result = -ECANCELED; 6031 locked = !(issue_flags & IO_URING_F_UNLOCKED); 6032 io_req_task_complete(preq, &locked); 6033out: 6034 if (ret < 0) 6035 req_set_fail(req); 6036 /* complete update request, we're done with it */ 6037 __io_req_complete(req, issue_flags, ret, 0); 6038 return 0; 6039} 6040 6041static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) 6042{ 6043 struct io_timeout_data *data = container_of(timer, 6044 struct io_timeout_data, timer); 6045 struct io_kiocb *req = data->req; 6046 struct io_ring_ctx *ctx = req->ctx; 6047 unsigned long flags; 6048 6049 spin_lock_irqsave(&ctx->timeout_lock, flags); 6050 list_del_init(&req->timeout.list); 6051 atomic_set(&req->ctx->cq_timeouts, 6052 atomic_read(&req->ctx->cq_timeouts) + 1); 6053 spin_unlock_irqrestore(&ctx->timeout_lock, flags); 6054 6055 if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) 6056 req_set_fail(req); 6057 6058 req->result = -ETIME; 6059 req->io_task_work.func = io_req_task_complete; 6060 io_req_task_work_add(req, false); 6061 return HRTIMER_NORESTART; 6062} 6063 6064static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, 6065 __u64 user_data) 6066 __must_hold(&ctx->timeout_lock) 6067{ 6068 struct io_timeout_data *io; 6069 struct io_kiocb *req; 6070 bool found = false; 6071 6072 list_for_each_entry(req, &ctx->timeout_list, timeout.list) { 6073 found = user_data == req->user_data; 6074 if (found) 6075 break; 6076 } 6077 if (!found) 6078 return ERR_PTR(-ENOENT); 6079 6080 io = req->async_data; 6081 if 
(hrtimer_try_to_cancel(&io->timer) == -1) 6082 return ERR_PTR(-EALREADY); 6083 list_del_init(&req->timeout.list); 6084 return req; 6085} 6086 6087static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 6088 __must_hold(&ctx->completion_lock) 6089 __must_hold(&ctx->timeout_lock) 6090{ 6091 struct io_kiocb *req = io_timeout_extract(ctx, user_data); 6092 6093 if (IS_ERR(req)) 6094 return PTR_ERR(req); 6095 6096 req_set_fail(req); 6097 io_fill_cqe_req(req, -ECANCELED, 0); 6098 io_put_req_deferred(req); 6099 return 0; 6100} 6101 6102static clockid_t io_timeout_get_clock(struct io_timeout_data *data) 6103{ 6104 switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { 6105 case IORING_TIMEOUT_BOOTTIME: 6106 return CLOCK_BOOTTIME; 6107 case IORING_TIMEOUT_REALTIME: 6108 return CLOCK_REALTIME; 6109 default: 6110 /* can't happen, vetted at prep time */ 6111 WARN_ON_ONCE(1); 6112 fallthrough; 6113 case 0: 6114 return CLOCK_MONOTONIC; 6115 } 6116} 6117 6118static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 6119 struct timespec64 *ts, enum hrtimer_mode mode) 6120 __must_hold(&ctx->timeout_lock) 6121{ 6122 struct io_timeout_data *io; 6123 struct io_kiocb *req; 6124 bool found = false; 6125 6126 list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { 6127 found = user_data == req->user_data; 6128 if (found) 6129 break; 6130 } 6131 if (!found) 6132 return -ENOENT; 6133 6134 io = req->async_data; 6135 if (hrtimer_try_to_cancel(&io->timer) == -1) 6136 return -EALREADY; 6137 hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); 6138 io->timer.function = io_link_timeout_fn; 6139 hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); 6140 return 0; 6141} 6142 6143static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 6144 struct timespec64 *ts, enum hrtimer_mode mode) 6145 __must_hold(&ctx->timeout_lock) 6146{ 6147 struct io_kiocb *req = io_timeout_extract(ctx, user_data); 6148 struct io_timeout_data *data; 6149 6150 if (IS_ERR(req)) 6151 return PTR_ERR(req); 6152 6153 req->timeout.off = 0; /* noseq */ 6154 data = req->async_data; 6155 list_add_tail(&req->timeout.list, &ctx->timeout_list); 6156 hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); 6157 data->timer.function = io_timeout_fn; 6158 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); 6159 return 0; 6160} 6161 6162static int io_timeout_remove_prep(struct io_kiocb *req, 6163 const struct io_uring_sqe *sqe) 6164{ 6165 struct io_timeout_rem *tr = &req->timeout_rem; 6166 6167 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6168 return -EINVAL; 6169 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6170 return -EINVAL; 6171 if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) 6172 return -EINVAL; 6173 6174 tr->ltimeout = false; 6175 tr->addr = READ_ONCE(sqe->addr); 6176 tr->flags = READ_ONCE(sqe->timeout_flags); 6177 if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { 6178 if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) 6179 return -EINVAL; 6180 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) 6181 tr->ltimeout = true; 6182 if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) 6183 return -EINVAL; 6184 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) 6185 return -EFAULT; 6186 if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) 6187 return -EINVAL; 6188 } else if (tr->flags) { 6189 /* timeout removal doesn't support flags */ 6190 return -EINVAL; 6191 } 6192 6193 return 0; 6194} 6195 6196static inline enum hrtimer_mode 
io_translate_timeout_mode(unsigned int flags) 6197{ 6198 return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS 6199 : HRTIMER_MODE_REL; 6200} 6201 6202/* 6203 * Remove or update an existing timeout command 6204 */ 6205static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) 6206{ 6207 struct io_timeout_rem *tr = &req->timeout_rem; 6208 struct io_ring_ctx *ctx = req->ctx; 6209 int ret; 6210 6211 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { 6212 spin_lock(&ctx->completion_lock); 6213 spin_lock_irq(&ctx->timeout_lock); 6214 ret = io_timeout_cancel(ctx, tr->addr); 6215 spin_unlock_irq(&ctx->timeout_lock); 6216 spin_unlock(&ctx->completion_lock); 6217 } else { 6218 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); 6219 6220 spin_lock_irq(&ctx->timeout_lock); 6221 if (tr->ltimeout) 6222 ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); 6223 else 6224 ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); 6225 spin_unlock_irq(&ctx->timeout_lock); 6226 } 6227 6228 if (ret < 0) 6229 req_set_fail(req); 6230 io_req_complete_post(req, ret, 0); 6231 return 0; 6232} 6233 6234static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, 6235 bool is_timeout_link) 6236{ 6237 struct io_timeout_data *data; 6238 unsigned flags; 6239 u32 off = READ_ONCE(sqe->off); 6240 6241 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6242 return -EINVAL; 6243 if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || 6244 sqe->splice_fd_in) 6245 return -EINVAL; 6246 if (off && is_timeout_link) 6247 return -EINVAL; 6248 flags = READ_ONCE(sqe->timeout_flags); 6249 if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | 6250 IORING_TIMEOUT_ETIME_SUCCESS)) 6251 return -EINVAL; 6252 /* more than one clock specified is invalid, obviously */ 6253 if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) 6254 return -EINVAL; 6255 6256 INIT_LIST_HEAD(&req->timeout.list); 6257 req->timeout.off = off; 6258 if (unlikely(off && !req->ctx->off_timeout_used)) 6259 req->ctx->off_timeout_used = true; 6260 6261 if (WARN_ON_ONCE(req_has_async_data(req))) 6262 return -EFAULT; 6263 if (io_alloc_async_data(req)) 6264 return -ENOMEM; 6265 6266 data = req->async_data; 6267 data->req = req; 6268 data->flags = flags; 6269 6270 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) 6271 return -EFAULT; 6272 6273 if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) 6274 return -EINVAL; 6275 6276 data->mode = io_translate_timeout_mode(flags); 6277 hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); 6278 6279 if (is_timeout_link) { 6280 struct io_submit_link *link = &req->ctx->submit_state.link; 6281 6282 if (!link->head) 6283 return -EINVAL; 6284 if (link->last->opcode == IORING_OP_LINK_TIMEOUT) 6285 return -EINVAL; 6286 req->timeout.head = link->last; 6287 link->last->flags |= REQ_F_ARM_LTIMEOUT; 6288 } 6289 return 0; 6290} 6291 6292static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) 6293{ 6294 struct io_ring_ctx *ctx = req->ctx; 6295 struct io_timeout_data *data = req->async_data; 6296 struct list_head *entry; 6297 u32 tail, off = req->timeout.off; 6298 6299 spin_lock_irq(&ctx->timeout_lock); 6300 6301 /* 6302 * sqe->off holds how many events that need to occur for this 6303 * timeout event to be satisfied. If it isn't set, then this is 6304 * a pure timeout request, sequence isn't used. 
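 *
 * As a rough userspace illustration (liburing-style, assuming its
 * io_uring_prep_timeout() helper, which stores the count in sqe->off):
 *
 *        struct __kernel_timespec ts = { .tv_sec = 1 };
 *        io_uring_prep_timeout(sqe, &ts, 8, 0);
 *
 * should complete with res == 0 once eight other completions have been
 * posted, or with res == -ETIME if the one-second timer fires first.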
6305 */ 6306 if (io_is_timeout_noseq(req)) { 6307 entry = ctx->timeout_list.prev; 6308 goto add; 6309 } 6310 6311 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 6312 req->timeout.target_seq = tail + off; 6313 6314 /* Update the last seq here in case io_flush_timeouts() hasn't. 6315 * This is safe because ->completion_lock is held, and submissions 6316 * and completions are never mixed in the same ->completion_lock section. 6317 */ 6318 ctx->cq_last_tm_flush = tail; 6319 6320 /* 6321 * Insertion sort, ensuring the first entry in the list is always 6322 * the one we need first. 6323 */ 6324 list_for_each_prev(entry, &ctx->timeout_list) { 6325 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, 6326 timeout.list); 6327 6328 if (io_is_timeout_noseq(nxt)) 6329 continue; 6330 /* nxt.seq is behind @tail, otherwise would've been completed */ 6331 if (off >= nxt->timeout.target_seq - tail) 6332 break; 6333 } 6334add: 6335 list_add(&req->timeout.list, entry); 6336 data->timer.function = io_timeout_fn; 6337 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); 6338 spin_unlock_irq(&ctx->timeout_lock); 6339 return 0; 6340} 6341 6342struct io_cancel_data { 6343 struct io_ring_ctx *ctx; 6344 u64 user_data; 6345}; 6346 6347static bool io_cancel_cb(struct io_wq_work *work, void *data) 6348{ 6349 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6350 struct io_cancel_data *cd = data; 6351 6352 return req->ctx == cd->ctx && req->user_data == cd->user_data; 6353} 6354 6355static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, 6356 struct io_ring_ctx *ctx) 6357{ 6358 struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; 6359 enum io_wq_cancel cancel_ret; 6360 int ret = 0; 6361 6362 if (!tctx || !tctx->io_wq) 6363 return -ENOENT; 6364 6365 cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); 6366 switch (cancel_ret) { 6367 case IO_WQ_CANCEL_OK: 6368 ret = 0; 6369 break; 6370 case IO_WQ_CANCEL_RUNNING: 6371 ret = -EALREADY; 6372 break; 6373 case IO_WQ_CANCEL_NOTFOUND: 6374 ret = -ENOENT; 6375 break; 6376 } 6377 6378 return ret; 6379} 6380 6381static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) 6382{ 6383 struct io_ring_ctx *ctx = req->ctx; 6384 int ret; 6385 6386 WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); 6387 6388 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); 6389 /* 6390 * Fall-through even for -EALREADY, as we may have poll armed 6391 * that need unarming. 
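 *
 * The fallbacks below mirror the places a request may currently live:
 * the io-wq queues were tried above, armed poll requests are checked
 * next, and finally pending timeouts; the latter two are done under
 * ->completion_lock (timeouts additionally under ->timeout_lock).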
6392 */ 6393 if (!ret) 6394 return 0; 6395 6396 spin_lock(&ctx->completion_lock); 6397 ret = io_poll_cancel(ctx, sqe_addr, false); 6398 if (ret != -ENOENT) 6399 goto out; 6400 6401 spin_lock_irq(&ctx->timeout_lock); 6402 ret = io_timeout_cancel(ctx, sqe_addr); 6403 spin_unlock_irq(&ctx->timeout_lock); 6404out: 6405 spin_unlock(&ctx->completion_lock); 6406 return ret; 6407} 6408 6409static int io_async_cancel_prep(struct io_kiocb *req, 6410 const struct io_uring_sqe *sqe) 6411{ 6412 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6413 return -EINVAL; 6414 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6415 return -EINVAL; 6416 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || 6417 sqe->splice_fd_in) 6418 return -EINVAL; 6419 6420 req->cancel.addr = READ_ONCE(sqe->addr); 6421 return 0; 6422} 6423 6424static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) 6425{ 6426 struct io_ring_ctx *ctx = req->ctx; 6427 u64 sqe_addr = req->cancel.addr; 6428 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 6429 struct io_tctx_node *node; 6430 int ret; 6431 6432 ret = io_try_cancel_userdata(req, sqe_addr); 6433 if (ret != -ENOENT) 6434 goto done; 6435 6436 /* slow path, try all io-wq's */ 6437 io_ring_submit_lock(ctx, needs_lock); 6438 ret = -ENOENT; 6439 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 6440 struct io_uring_task *tctx = node->task->io_uring; 6441 6442 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); 6443 if (ret != -ENOENT) 6444 break; 6445 } 6446 io_ring_submit_unlock(ctx, needs_lock); 6447done: 6448 if (ret < 0) 6449 req_set_fail(req); 6450 io_req_complete_post(req, ret, 0); 6451 return 0; 6452} 6453 6454static int io_rsrc_update_prep(struct io_kiocb *req, 6455 const struct io_uring_sqe *sqe) 6456{ 6457 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6458 return -EINVAL; 6459 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) 6460 return -EINVAL; 6461 6462 req->rsrc_update.offset = READ_ONCE(sqe->off); 6463 req->rsrc_update.nr_args = READ_ONCE(sqe->len); 6464 if (!req->rsrc_update.nr_args) 6465 return -EINVAL; 6466 req->rsrc_update.arg = READ_ONCE(sqe->addr); 6467 return 0; 6468} 6469 6470static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 6471{ 6472 struct io_ring_ctx *ctx = req->ctx; 6473 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 6474 struct io_uring_rsrc_update2 up; 6475 int ret; 6476 6477 up.offset = req->rsrc_update.offset; 6478 up.data = req->rsrc_update.arg; 6479 up.nr = 0; 6480 up.tags = 0; 6481 up.resv = 0; 6482 6483 io_ring_submit_lock(ctx, needs_lock); 6484 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, 6485 &up, req->rsrc_update.nr_args); 6486 io_ring_submit_unlock(ctx, needs_lock); 6487 6488 if (ret < 0) 6489 req_set_fail(req); 6490 __io_req_complete(req, issue_flags, ret, 0); 6491 return 0; 6492} 6493 6494static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 6495{ 6496 switch (req->opcode) { 6497 case IORING_OP_NOP: 6498 return 0; 6499 case IORING_OP_READV: 6500 case IORING_OP_READ_FIXED: 6501 case IORING_OP_READ: 6502 return io_read_prep(req, sqe); 6503 case IORING_OP_WRITEV: 6504 case IORING_OP_WRITE_FIXED: 6505 case IORING_OP_WRITE: 6506 return io_write_prep(req, sqe); 6507 case IORING_OP_POLL_ADD: 6508 return io_poll_add_prep(req, sqe); 6509 case IORING_OP_POLL_REMOVE: 6510 return io_poll_update_prep(req, sqe); 6511 case IORING_OP_FSYNC: 6512 return io_fsync_prep(req, sqe); 6513 case 
IORING_OP_SYNC_FILE_RANGE: 6514 return io_sfr_prep(req, sqe); 6515 case IORING_OP_SENDMSG: 6516 case IORING_OP_SEND: 6517 return io_sendmsg_prep(req, sqe); 6518 case IORING_OP_RECVMSG: 6519 case IORING_OP_RECV: 6520 return io_recvmsg_prep(req, sqe); 6521 case IORING_OP_CONNECT: 6522 return io_connect_prep(req, sqe); 6523 case IORING_OP_TIMEOUT: 6524 return io_timeout_prep(req, sqe, false); 6525 case IORING_OP_TIMEOUT_REMOVE: 6526 return io_timeout_remove_prep(req, sqe); 6527 case IORING_OP_ASYNC_CANCEL: 6528 return io_async_cancel_prep(req, sqe); 6529 case IORING_OP_LINK_TIMEOUT: 6530 return io_timeout_prep(req, sqe, true); 6531 case IORING_OP_ACCEPT: 6532 return io_accept_prep(req, sqe); 6533 case IORING_OP_FALLOCATE: 6534 return io_fallocate_prep(req, sqe); 6535 case IORING_OP_OPENAT: 6536 return io_openat_prep(req, sqe); 6537 case IORING_OP_CLOSE: 6538 return io_close_prep(req, sqe); 6539 case IORING_OP_FILES_UPDATE: 6540 return io_rsrc_update_prep(req, sqe); 6541 case IORING_OP_STATX: 6542 return io_statx_prep(req, sqe); 6543 case IORING_OP_FADVISE: 6544 return io_fadvise_prep(req, sqe); 6545 case IORING_OP_MADVISE: 6546 return io_madvise_prep(req, sqe); 6547 case IORING_OP_OPENAT2: 6548 return io_openat2_prep(req, sqe); 6549 case IORING_OP_EPOLL_CTL: 6550 return io_epoll_ctl_prep(req, sqe); 6551 case IORING_OP_SPLICE: 6552 return io_splice_prep(req, sqe); 6553 case IORING_OP_PROVIDE_BUFFERS: 6554 return io_provide_buffers_prep(req, sqe); 6555 case IORING_OP_REMOVE_BUFFERS: 6556 return io_remove_buffers_prep(req, sqe); 6557 case IORING_OP_TEE: 6558 return io_tee_prep(req, sqe); 6559 case IORING_OP_SHUTDOWN: 6560 return io_shutdown_prep(req, sqe); 6561 case IORING_OP_RENAMEAT: 6562 return io_renameat_prep(req, sqe); 6563 case IORING_OP_UNLINKAT: 6564 return io_unlinkat_prep(req, sqe); 6565 case IORING_OP_MKDIRAT: 6566 return io_mkdirat_prep(req, sqe); 6567 case IORING_OP_SYMLINKAT: 6568 return io_symlinkat_prep(req, sqe); 6569 case IORING_OP_LINKAT: 6570 return io_linkat_prep(req, sqe); 6571 } 6572 6573 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", 6574 req->opcode); 6575 return -EINVAL; 6576} 6577 6578static int io_req_prep_async(struct io_kiocb *req) 6579{ 6580 if (!io_op_defs[req->opcode].needs_async_setup) 6581 return 0; 6582 if (WARN_ON_ONCE(req_has_async_data(req))) 6583 return -EFAULT; 6584 if (io_alloc_async_data(req)) 6585 return -EAGAIN; 6586 6587 switch (req->opcode) { 6588 case IORING_OP_READV: 6589 return io_rw_prep_async(req, READ); 6590 case IORING_OP_WRITEV: 6591 return io_rw_prep_async(req, WRITE); 6592 case IORING_OP_SENDMSG: 6593 return io_sendmsg_prep_async(req); 6594 case IORING_OP_RECVMSG: 6595 return io_recvmsg_prep_async(req); 6596 case IORING_OP_CONNECT: 6597 return io_connect_prep_async(req); 6598 } 6599 printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", 6600 req->opcode); 6601 return -EFAULT; 6602} 6603 6604static u32 io_get_sequence(struct io_kiocb *req) 6605{ 6606 u32 seq = req->ctx->cached_sq_head; 6607 6608 /* need original cached_sq_head, but it was increased for each req */ 6609 io_for_each_link(req, req) 6610 seq--; 6611 return seq; 6612} 6613 6614static __cold void io_drain_req(struct io_kiocb *req) 6615{ 6616 struct io_ring_ctx *ctx = req->ctx; 6617 struct io_defer_entry *de; 6618 int ret; 6619 u32 seq = io_get_sequence(req); 6620 6621 /* Still need defer if there is pending req in defer list. 
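 * Even if this request's own sequence has already been satisfied, a
 * non-empty defer list still forces deferral so that draining keeps
 * its ordering guarantees.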
*/ 6622 spin_lock(&ctx->completion_lock); 6623 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { 6624 spin_unlock(&ctx->completion_lock); 6625queue: 6626 ctx->drain_active = false; 6627 io_req_task_queue(req); 6628 return; 6629 } 6630 spin_unlock(&ctx->completion_lock); 6631 6632 ret = io_req_prep_async(req); 6633 if (ret) { 6634fail: 6635 io_req_complete_failed(req, ret); 6636 return; 6637 } 6638 io_prep_async_link(req); 6639 de = kmalloc(sizeof(*de), GFP_KERNEL); 6640 if (!de) { 6641 ret = -ENOMEM; 6642 goto fail; 6643 } 6644 6645 spin_lock(&ctx->completion_lock); 6646 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { 6647 spin_unlock(&ctx->completion_lock); 6648 kfree(de); 6649 goto queue; 6650 } 6651 6652 trace_io_uring_defer(ctx, req, req->user_data); 6653 de->req = req; 6654 de->seq = seq; 6655 list_add_tail(&de->list, &ctx->defer_list); 6656 spin_unlock(&ctx->completion_lock); 6657} 6658 6659static void io_clean_op(struct io_kiocb *req) 6660{ 6661 if (req->flags & REQ_F_BUFFER_SELECTED) 6662 io_put_kbuf(req); 6663 6664 if (req->flags & REQ_F_NEED_CLEANUP) { 6665 switch (req->opcode) { 6666 case IORING_OP_READV: 6667 case IORING_OP_READ_FIXED: 6668 case IORING_OP_READ: 6669 case IORING_OP_WRITEV: 6670 case IORING_OP_WRITE_FIXED: 6671 case IORING_OP_WRITE: { 6672 struct io_async_rw *io = req->async_data; 6673 6674 kfree(io->free_iovec); 6675 break; 6676 } 6677 case IORING_OP_RECVMSG: 6678 case IORING_OP_SENDMSG: { 6679 struct io_async_msghdr *io = req->async_data; 6680 6681 kfree(io->free_iov); 6682 break; 6683 } 6684 case IORING_OP_SPLICE: 6685 case IORING_OP_TEE: 6686 if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED)) 6687 io_put_file(req->splice.file_in); 6688 break; 6689 case IORING_OP_OPENAT: 6690 case IORING_OP_OPENAT2: 6691 if (req->open.filename) 6692 putname(req->open.filename); 6693 break; 6694 case IORING_OP_RENAMEAT: 6695 putname(req->rename.oldpath); 6696 putname(req->rename.newpath); 6697 break; 6698 case IORING_OP_UNLINKAT: 6699 putname(req->unlink.filename); 6700 break; 6701 case IORING_OP_MKDIRAT: 6702 putname(req->mkdir.filename); 6703 break; 6704 case IORING_OP_SYMLINKAT: 6705 putname(req->symlink.oldpath); 6706 putname(req->symlink.newpath); 6707 break; 6708 case IORING_OP_LINKAT: 6709 putname(req->hardlink.oldpath); 6710 putname(req->hardlink.newpath); 6711 break; 6712 } 6713 } 6714 if ((req->flags & REQ_F_POLLED) && req->apoll) { 6715 kfree(req->apoll->double_poll); 6716 kfree(req->apoll); 6717 req->apoll = NULL; 6718 } 6719 if (req->flags & REQ_F_INFLIGHT) { 6720 struct io_uring_task *tctx = req->task->io_uring; 6721 6722 atomic_dec(&tctx->inflight_tracked); 6723 } 6724 if (req->flags & REQ_F_CREDS) 6725 put_cred(req->creds); 6726 if (req->flags & REQ_F_ASYNC_DATA) { 6727 kfree(req->async_data); 6728 req->async_data = NULL; 6729 } 6730 req->flags &= ~IO_REQ_CLEAN_FLAGS; 6731} 6732 6733static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) 6734{ 6735 const struct cred *creds = NULL; 6736 int ret; 6737 6738 if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) 6739 creds = override_creds(req->creds); 6740 6741 if (!io_op_defs[req->opcode].audit_skip) 6742 audit_uring_entry(req->opcode); 6743 6744 switch (req->opcode) { 6745 case IORING_OP_NOP: 6746 ret = io_nop(req, issue_flags); 6747 break; 6748 case IORING_OP_READV: 6749 case IORING_OP_READ_FIXED: 6750 case IORING_OP_READ: 6751 ret = io_read(req, issue_flags); 6752 break; 6753 case IORING_OP_WRITEV: 6754 case IORING_OP_WRITE_FIXED: 6755 
case IORING_OP_WRITE: 6756 ret = io_write(req, issue_flags); 6757 break; 6758 case IORING_OP_FSYNC: 6759 ret = io_fsync(req, issue_flags); 6760 break; 6761 case IORING_OP_POLL_ADD: 6762 ret = io_poll_add(req, issue_flags); 6763 break; 6764 case IORING_OP_POLL_REMOVE: 6765 ret = io_poll_update(req, issue_flags); 6766 break; 6767 case IORING_OP_SYNC_FILE_RANGE: 6768 ret = io_sync_file_range(req, issue_flags); 6769 break; 6770 case IORING_OP_SENDMSG: 6771 ret = io_sendmsg(req, issue_flags); 6772 break; 6773 case IORING_OP_SEND: 6774 ret = io_send(req, issue_flags); 6775 break; 6776 case IORING_OP_RECVMSG: 6777 ret = io_recvmsg(req, issue_flags); 6778 break; 6779 case IORING_OP_RECV: 6780 ret = io_recv(req, issue_flags); 6781 break; 6782 case IORING_OP_TIMEOUT: 6783 ret = io_timeout(req, issue_flags); 6784 break; 6785 case IORING_OP_TIMEOUT_REMOVE: 6786 ret = io_timeout_remove(req, issue_flags); 6787 break; 6788 case IORING_OP_ACCEPT: 6789 ret = io_accept(req, issue_flags); 6790 break; 6791 case IORING_OP_CONNECT: 6792 ret = io_connect(req, issue_flags); 6793 break; 6794 case IORING_OP_ASYNC_CANCEL: 6795 ret = io_async_cancel(req, issue_flags); 6796 break; 6797 case IORING_OP_FALLOCATE: 6798 ret = io_fallocate(req, issue_flags); 6799 break; 6800 case IORING_OP_OPENAT: 6801 ret = io_openat(req, issue_flags); 6802 break; 6803 case IORING_OP_CLOSE: 6804 ret = io_close(req, issue_flags); 6805 break; 6806 case IORING_OP_FILES_UPDATE: 6807 ret = io_files_update(req, issue_flags); 6808 break; 6809 case IORING_OP_STATX: 6810 ret = io_statx(req, issue_flags); 6811 break; 6812 case IORING_OP_FADVISE: 6813 ret = io_fadvise(req, issue_flags); 6814 break; 6815 case IORING_OP_MADVISE: 6816 ret = io_madvise(req, issue_flags); 6817 break; 6818 case IORING_OP_OPENAT2: 6819 ret = io_openat2(req, issue_flags); 6820 break; 6821 case IORING_OP_EPOLL_CTL: 6822 ret = io_epoll_ctl(req, issue_flags); 6823 break; 6824 case IORING_OP_SPLICE: 6825 ret = io_splice(req, issue_flags); 6826 break; 6827 case IORING_OP_PROVIDE_BUFFERS: 6828 ret = io_provide_buffers(req, issue_flags); 6829 break; 6830 case IORING_OP_REMOVE_BUFFERS: 6831 ret = io_remove_buffers(req, issue_flags); 6832 break; 6833 case IORING_OP_TEE: 6834 ret = io_tee(req, issue_flags); 6835 break; 6836 case IORING_OP_SHUTDOWN: 6837 ret = io_shutdown(req, issue_flags); 6838 break; 6839 case IORING_OP_RENAMEAT: 6840 ret = io_renameat(req, issue_flags); 6841 break; 6842 case IORING_OP_UNLINKAT: 6843 ret = io_unlinkat(req, issue_flags); 6844 break; 6845 case IORING_OP_MKDIRAT: 6846 ret = io_mkdirat(req, issue_flags); 6847 break; 6848 case IORING_OP_SYMLINKAT: 6849 ret = io_symlinkat(req, issue_flags); 6850 break; 6851 case IORING_OP_LINKAT: 6852 ret = io_linkat(req, issue_flags); 6853 break; 6854 default: 6855 ret = -EINVAL; 6856 break; 6857 } 6858 6859 if (!io_op_defs[req->opcode].audit_skip) 6860 audit_uring_exit(!ret, ret); 6861 6862 if (creds) 6863 revert_creds(creds); 6864 if (ret) 6865 return ret; 6866 /* If the op doesn't have a file, we're not polling for it */ 6867 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file) 6868 io_iopoll_req_issued(req, issue_flags); 6869 6870 return 0; 6871} 6872 6873static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) 6874{ 6875 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6876 6877 req = io_put_req_find_next(req); 6878 return req ? 
&req->work : NULL; 6879} 6880 6881static void io_wq_submit_work(struct io_wq_work *work) 6882{ 6883 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6884 unsigned int issue_flags = IO_URING_F_UNLOCKED; 6885 bool needs_poll = false; 6886 struct io_kiocb *timeout; 6887 int ret = 0; 6888 6889 /* one will be dropped by ->io_free_work() after returning to io-wq */ 6890 if (!(req->flags & REQ_F_REFCOUNT)) 6891 __io_req_set_refcount(req, 2); 6892 else 6893 req_ref_get(req); 6894 6895 timeout = io_prep_linked_timeout(req); 6896 if (timeout) 6897 io_queue_linked_timeout(timeout); 6898 6899 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ 6900 if (work->flags & IO_WQ_WORK_CANCEL) { 6901 io_req_task_queue_fail(req, -ECANCELED); 6902 return; 6903 } 6904 6905 if (req->flags & REQ_F_FORCE_ASYNC) { 6906 const struct io_op_def *def = &io_op_defs[req->opcode]; 6907 bool opcode_poll = def->pollin || def->pollout; 6908 6909 if (opcode_poll && file_can_poll(req->file)) { 6910 needs_poll = true; 6911 issue_flags |= IO_URING_F_NONBLOCK; 6912 } 6913 } 6914 6915 do { 6916 ret = io_issue_sqe(req, issue_flags); 6917 if (ret != -EAGAIN) 6918 break; 6919 /* 6920 * We can get EAGAIN for iopolled IO even though we're 6921 * forcing a sync submission from here, since we can't 6922 * wait for request slots on the block side. 6923 */ 6924 if (!needs_poll) { 6925 cond_resched(); 6926 continue; 6927 } 6928 6929 if (io_arm_poll_handler(req) == IO_APOLL_OK) 6930 return; 6931 /* aborted or ready, in either case retry blocking */ 6932 needs_poll = false; 6933 issue_flags &= ~IO_URING_F_NONBLOCK; 6934 } while (1); 6935 6936 /* avoid locking problems by failing it from a clean context */ 6937 if (ret) 6938 io_req_task_queue_fail(req, ret); 6939} 6940 6941static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, 6942 unsigned i) 6943{ 6944 return &table->files[i]; 6945} 6946 6947static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, 6948 int index) 6949{ 6950 struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index); 6951 6952 return (struct file *) (slot->file_ptr & FFS_MASK); 6953} 6954 6955static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) 6956{ 6957 unsigned long file_ptr = (unsigned long) file; 6958 6959 file_ptr |= io_file_get_flags(file); 6960 file_slot->file_ptr = file_ptr; 6961} 6962 6963static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, 6964 struct io_kiocb *req, int fd) 6965{ 6966 struct file *file; 6967 unsigned long file_ptr; 6968 6969 if (unlikely((unsigned int)fd >= ctx->nr_user_files)) 6970 return NULL; 6971 fd = array_index_nospec(fd, ctx->nr_user_files); 6972 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; 6973 file = (struct file *) (file_ptr & FFS_MASK); 6974 file_ptr &= ~FFS_MASK; 6975 /* mask in overlapping REQ_F and FFS bits */ 6976 req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); 6977 io_req_set_rsrc_node(req, ctx); 6978 return file; 6979} 6980 6981static struct file *io_file_get_normal(struct io_ring_ctx *ctx, 6982 struct io_kiocb *req, int fd) 6983{ 6984 struct file *file = fget(fd); 6985 6986 trace_io_uring_file_get(ctx, fd); 6987 6988 /* we don't allow fixed io_uring files */ 6989 if (file && unlikely(file->f_op == &io_uring_fops)) 6990 io_req_track_inflight(req); 6991 return file; 6992} 6993 6994static inline struct file *io_file_get(struct io_ring_ctx *ctx, 6995 struct io_kiocb *req, int fd, bool fixed) 6996{ 6997 if (fixed) 6998 return 
io_file_get_fixed(ctx, req, fd); 6999 else 7000 return io_file_get_normal(ctx, req, fd); 7001} 7002 7003static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) 7004{ 7005 struct io_kiocb *prev = req->timeout.prev; 7006 int ret = -ENOENT; 7007 7008 if (prev) { 7009 if (!(req->task->flags & PF_EXITING)) 7010 ret = io_try_cancel_userdata(req, prev->user_data); 7011 io_req_complete_post(req, ret ?: -ETIME, 0); 7012 io_put_req(prev); 7013 } else { 7014 io_req_complete_post(req, -ETIME, 0); 7015 } 7016} 7017 7018static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) 7019{ 7020 struct io_timeout_data *data = container_of(timer, 7021 struct io_timeout_data, timer); 7022 struct io_kiocb *prev, *req = data->req; 7023 struct io_ring_ctx *ctx = req->ctx; 7024 unsigned long flags; 7025 7026 spin_lock_irqsave(&ctx->timeout_lock, flags); 7027 prev = req->timeout.head; 7028 req->timeout.head = NULL; 7029 7030 /* 7031 * We don't expect the list to be empty, that will only happen if we 7032 * race with the completion of the linked work. 7033 */ 7034 if (prev) { 7035 io_remove_next_linked(prev); 7036 if (!req_ref_inc_not_zero(prev)) 7037 prev = NULL; 7038 } 7039 list_del(&req->timeout.list); 7040 req->timeout.prev = prev; 7041 spin_unlock_irqrestore(&ctx->timeout_lock, flags); 7042 7043 req->io_task_work.func = io_req_task_link_timeout; 7044 io_req_task_work_add(req, false); 7045 return HRTIMER_NORESTART; 7046} 7047 7048static void io_queue_linked_timeout(struct io_kiocb *req) 7049{ 7050 struct io_ring_ctx *ctx = req->ctx; 7051 7052 spin_lock_irq(&ctx->timeout_lock); 7053 /* 7054 * If the back reference is NULL, then our linked request finished 7055 * before we got a chance to setup the timer 7056 */ 7057 if (req->timeout.head) { 7058 struct io_timeout_data *data = req->async_data; 7059 7060 data->timer.function = io_link_timeout_fn; 7061 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), 7062 data->mode); 7063 list_add_tail(&req->timeout.list, &ctx->ltimeout_list); 7064 } 7065 spin_unlock_irq(&ctx->timeout_lock); 7066 /* drop submission reference */ 7067 io_put_req(req); 7068} 7069 7070static void io_queue_sqe_arm_apoll(struct io_kiocb *req) 7071 __must_hold(&req->ctx->uring_lock) 7072{ 7073 struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); 7074 7075 switch (io_arm_poll_handler(req)) { 7076 case IO_APOLL_READY: 7077 io_req_task_queue(req); 7078 break; 7079 case IO_APOLL_ABORTED: 7080 /* 7081 * Queued up for async execution, worker will release 7082 * submit reference when the iocb is actually submitted. 
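 *
 * (IO_APOLL_OK needs no handling here: poll has been armed and the
 * request will be retried from the poll wakeup path.)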
7083 */ 7084 io_queue_async_work(req, NULL); 7085 break; 7086 } 7087 7088 if (linked_timeout) 7089 io_queue_linked_timeout(linked_timeout); 7090} 7091 7092static inline void __io_queue_sqe(struct io_kiocb *req) 7093 __must_hold(&req->ctx->uring_lock) 7094{ 7095 struct io_kiocb *linked_timeout; 7096 int ret; 7097 7098 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); 7099 7100 if (req->flags & REQ_F_COMPLETE_INLINE) { 7101 io_req_add_compl_list(req); 7102 return; 7103 } 7104 /* 7105 * We async punt it if the file wasn't marked NOWAIT, or if the file 7106 * doesn't support non-blocking read/write attempts 7107 */ 7108 if (likely(!ret)) { 7109 linked_timeout = io_prep_linked_timeout(req); 7110 if (linked_timeout) 7111 io_queue_linked_timeout(linked_timeout); 7112 } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { 7113 io_queue_sqe_arm_apoll(req); 7114 } else { 7115 io_req_complete_failed(req, ret); 7116 } 7117} 7118 7119static void io_queue_sqe_fallback(struct io_kiocb *req) 7120 __must_hold(&req->ctx->uring_lock) 7121{ 7122 if (req->flags & REQ_F_FAIL) { 7123 io_req_complete_fail_submit(req); 7124 } else if (unlikely(req->ctx->drain_active)) { 7125 io_drain_req(req); 7126 } else { 7127 int ret = io_req_prep_async(req); 7128 7129 if (unlikely(ret)) 7130 io_req_complete_failed(req, ret); 7131 else 7132 io_queue_async_work(req, NULL); 7133 } 7134} 7135 7136static inline void io_queue_sqe(struct io_kiocb *req) 7137 __must_hold(&req->ctx->uring_lock) 7138{ 7139 if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) 7140 __io_queue_sqe(req); 7141 else 7142 io_queue_sqe_fallback(req); 7143} 7144 7145/* 7146 * Check SQE restrictions (opcode and flags). 7147 * 7148 * Returns 'true' if SQE is allowed, 'false' otherwise. 7149 */ 7150static inline bool io_check_restriction(struct io_ring_ctx *ctx, 7151 struct io_kiocb *req, 7152 unsigned int sqe_flags) 7153{ 7154 if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) 7155 return false; 7156 7157 if ((sqe_flags & ctx->restrictions.sqe_flags_required) != 7158 ctx->restrictions.sqe_flags_required) 7159 return false; 7160 7161 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | 7162 ctx->restrictions.sqe_flags_required)) 7163 return false; 7164 7165 return true; 7166} 7167 7168static void io_init_req_drain(struct io_kiocb *req) 7169{ 7170 struct io_ring_ctx *ctx = req->ctx; 7171 struct io_kiocb *head = ctx->submit_state.link.head; 7172 7173 ctx->drain_active = true; 7174 if (head) { 7175 /* 7176 * If we need to drain a request in the middle of a link, drain 7177 * the head request and the next request/link after the current 7178 * link. Considering sequential execution of links, 7179 * REQ_F_IO_DRAIN will be maintained for every request of our 7180 * link. 
7181 */ 7182 head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; 7183 ctx->drain_next = true; 7184 } 7185} 7186 7187static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, 7188 const struct io_uring_sqe *sqe) 7189 __must_hold(&ctx->uring_lock) 7190{ 7191 unsigned int sqe_flags; 7192 int personality; 7193 u8 opcode; 7194 7195 /* req is partially pre-initialised, see io_preinit_req() */ 7196 req->opcode = opcode = READ_ONCE(sqe->opcode); 7197 /* same numerical values with corresponding REQ_F_*, safe to copy */ 7198 req->flags = sqe_flags = READ_ONCE(sqe->flags); 7199 req->user_data = READ_ONCE(sqe->user_data); 7200 req->file = NULL; 7201 req->fixed_rsrc_refs = NULL; 7202 req->task = current; 7203 7204 if (unlikely(opcode >= IORING_OP_LAST)) { 7205 req->opcode = 0; 7206 return -EINVAL; 7207 } 7208 if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { 7209 /* enforce forwards compatibility on users */ 7210 if (sqe_flags & ~SQE_VALID_FLAGS) 7211 return -EINVAL; 7212 if ((sqe_flags & IOSQE_BUFFER_SELECT) && 7213 !io_op_defs[opcode].buffer_select) 7214 return -EOPNOTSUPP; 7215 if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) 7216 ctx->drain_disabled = true; 7217 if (sqe_flags & IOSQE_IO_DRAIN) { 7218 if (ctx->drain_disabled) 7219 return -EOPNOTSUPP; 7220 io_init_req_drain(req); 7221 } 7222 } 7223 if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { 7224 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) 7225 return -EACCES; 7226 /* knock it to the slow queue path, will be drained there */ 7227 if (ctx->drain_active) 7228 req->flags |= REQ_F_FORCE_ASYNC; 7229 /* if there is no link, we're at "next" request and need to drain */ 7230 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { 7231 ctx->drain_next = false; 7232 ctx->drain_active = true; 7233 req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; 7234 } 7235 } 7236 7237 if (io_op_defs[opcode].needs_file) { 7238 struct io_submit_state *state = &ctx->submit_state; 7239 7240 /* 7241 * Plug now if we have more than 2 IO left after this, and the 7242 * target is potentially a read/write to block based storage. 
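 *
 * Plugging batches the block-layer submissions generated by this
 * syscall so they can be dispatched (and possibly merged) together.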
7243 */ 7244 if (state->need_plug && io_op_defs[opcode].plug) { 7245 state->plug_started = true; 7246 state->need_plug = false; 7247 blk_start_plug_nr_ios(&state->plug, state->submit_nr); 7248 } 7249 7250 req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), 7251 (sqe_flags & IOSQE_FIXED_FILE)); 7252 if (unlikely(!req->file)) 7253 return -EBADF; 7254 } 7255 7256 personality = READ_ONCE(sqe->personality); 7257 if (personality) { 7258 int ret; 7259 7260 req->creds = xa_load(&ctx->personalities, personality); 7261 if (!req->creds) 7262 return -EINVAL; 7263 get_cred(req->creds); 7264 ret = security_uring_override_creds(req->creds); 7265 if (ret) { 7266 put_cred(req->creds); 7267 return ret; 7268 } 7269 req->flags |= REQ_F_CREDS; 7270 } 7271 7272 return io_req_prep(req, sqe); 7273} 7274 7275static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 7276 const struct io_uring_sqe *sqe) 7277 __must_hold(&ctx->uring_lock) 7278{ 7279 struct io_submit_link *link = &ctx->submit_state.link; 7280 int ret; 7281 7282 ret = io_init_req(ctx, req, sqe); 7283 if (unlikely(ret)) { 7284 trace_io_uring_req_failed(sqe, ret); 7285 7286 /* fail even hard links since we don't submit */ 7287 if (link->head) { 7288 /* 7289 * we can judge a link req is failed or cancelled by if 7290 * REQ_F_FAIL is set, but the head is an exception since 7291 * it may be set REQ_F_FAIL because of other req's failure 7292 * so let's leverage req->result to distinguish if a head 7293 * is set REQ_F_FAIL because of its failure or other req's 7294 * failure so that we can set the correct ret code for it. 7295 * init result here to avoid affecting the normal path. 7296 */ 7297 if (!(link->head->flags & REQ_F_FAIL)) 7298 req_fail_link_node(link->head, -ECANCELED); 7299 } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { 7300 /* 7301 * the current req is a normal req, we should return 7302 * error and thus break the submittion loop. 7303 */ 7304 io_req_complete_failed(req, ret); 7305 return ret; 7306 } 7307 req_fail_link_node(req, ret); 7308 } 7309 7310 /* don't need @sqe from now on */ 7311 trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, 7312 req->flags, true, 7313 ctx->flags & IORING_SETUP_SQPOLL); 7314 7315 /* 7316 * If we already have a head request, queue this one for async 7317 * submittal once the head completes. If we don't have a head but 7318 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be 7319 * submitted sync once the chain is complete. If none of those 7320 * conditions are true (normal request), then just queue it. 7321 */ 7322 if (link->head) { 7323 struct io_kiocb *head = link->head; 7324 7325 if (!(req->flags & REQ_F_FAIL)) { 7326 ret = io_req_prep_async(req); 7327 if (unlikely(ret)) { 7328 req_fail_link_node(req, ret); 7329 if (!(head->flags & REQ_F_FAIL)) 7330 req_fail_link_node(head, -ECANCELED); 7331 } 7332 } 7333 trace_io_uring_link(ctx, req, head); 7334 link->last->link = req; 7335 link->last = req; 7336 7337 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) 7338 return 0; 7339 /* last request of a link, enqueue the link */ 7340 link->head = NULL; 7341 req = head; 7342 } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 7343 link->head = req; 7344 link->last = req; 7345 return 0; 7346 } 7347 7348 io_queue_sqe(req); 7349 return 0; 7350} 7351 7352/* 7353 * Batched submission is done, ensure local IO is flushed out. 
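 * Called from io_submit_sqes() after its submission loop: any pending
 * link head is queued first, then batched completions are flushed and
 * the block plug (if started) is finished.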
7354 */ 7355static void io_submit_state_end(struct io_ring_ctx *ctx) 7356{ 7357 struct io_submit_state *state = &ctx->submit_state; 7358 7359 if (state->link.head) 7360 io_queue_sqe(state->link.head); 7361 /* flush only after queuing links as they can generate completions */ 7362 io_submit_flush_completions(ctx); 7363 if (state->plug_started) 7364 blk_finish_plug(&state->plug); 7365} 7366 7367/* 7368 * Start submission side cache. 7369 */ 7370static void io_submit_state_start(struct io_submit_state *state, 7371 unsigned int max_ios) 7372{ 7373 state->plug_started = false; 7374 state->need_plug = max_ios > 2; 7375 state->submit_nr = max_ios; 7376 /* set only head, no need to init link_last in advance */ 7377 state->link.head = NULL; 7378} 7379 7380static void io_commit_sqring(struct io_ring_ctx *ctx) 7381{ 7382 struct io_rings *rings = ctx->rings; 7383 7384 /* 7385 * Ensure any loads from the SQEs are done at this point, 7386 * since once we write the new head, the application could 7387 * write new data to them. 7388 */ 7389 smp_store_release(&rings->sq.head, ctx->cached_sq_head); 7390} 7391 7392/* 7393 * Fetch an sqe, if one is available. Note this returns a pointer to memory 7394 * that is mapped by userspace. This means that care needs to be taken to 7395 * ensure that reads are stable, as we cannot rely on userspace always 7396 * being a good citizen. If members of the sqe are validated and then later 7397 * used, it's important that those reads are done through READ_ONCE() to 7398 * prevent a re-load down the line. 7399 */ 7400static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) 7401{ 7402 unsigned head, mask = ctx->sq_entries - 1; 7403 unsigned sq_idx = ctx->cached_sq_head++ & mask; 7404 7405 /* 7406 * The cached sq head (or cq tail) serves two purposes: 7407 * 7408 * 1) allows us to batch the cost of updating the user visible 7409 * head updates. 7410 * 2) allows the kernel side to track the head on its own, even 7411 * though the application is the one updating it. 7412 */ 7413 head = READ_ONCE(ctx->sq_array[sq_idx]); 7414 if (likely(head < ctx->sq_entries)) 7415 return &ctx->sq_sqes[head]; 7416 7417 /* drop invalid entries */ 7418 ctx->cq_extra--; 7419 WRITE_ONCE(ctx->rings->sq_dropped, 7420 READ_ONCE(ctx->rings->sq_dropped) + 1); 7421 return NULL; 7422} 7423 7424static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) 7425 __must_hold(&ctx->uring_lock) 7426{ 7427 unsigned int entries = io_sqring_entries(ctx); 7428 int submitted = 0; 7429 7430 if (unlikely(!entries)) 7431 return 0; 7432 /* make sure SQ entry isn't read before tail */ 7433 nr = min3(nr, ctx->sq_entries, entries); 7434 io_get_task_refs(nr); 7435 7436 io_submit_state_start(&ctx->submit_state, nr); 7437 do { 7438 const struct io_uring_sqe *sqe; 7439 struct io_kiocb *req; 7440 7441 if (unlikely(!io_alloc_req_refill(ctx))) { 7442 if (!submitted) 7443 submitted = -EAGAIN; 7444 break; 7445 } 7446 req = io_alloc_req(ctx); 7447 sqe = io_get_sqe(ctx); 7448 if (unlikely(!sqe)) { 7449 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 7450 break; 7451 } 7452 /* will complete beyond this point, count as submitted */ 7453 submitted++; 7454 if (io_submit_sqe(ctx, req, sqe)) 7455 break; 7456 } while (submitted < nr); 7457 7458 if (unlikely(submitted != nr)) { 7459 int ref_used = (submitted == -EAGAIN) ? 
0 : submitted; 7460 int unused = nr - ref_used; 7461 7462 current->io_uring->cached_refs += unused; 7463 } 7464 7465 io_submit_state_end(ctx); 7466 /* Commit SQ ring head once we've consumed and submitted all SQEs */ 7467 io_commit_sqring(ctx); 7468 7469 return submitted; 7470} 7471 7472static inline bool io_sqd_events_pending(struct io_sq_data *sqd) 7473{ 7474 return READ_ONCE(sqd->state); 7475} 7476 7477static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) 7478{ 7479 /* Tell userspace we may need a wakeup call */ 7480 spin_lock(&ctx->completion_lock); 7481 WRITE_ONCE(ctx->rings->sq_flags, 7482 ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); 7483 spin_unlock(&ctx->completion_lock); 7484} 7485 7486static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) 7487{ 7488 spin_lock(&ctx->completion_lock); 7489 WRITE_ONCE(ctx->rings->sq_flags, 7490 ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); 7491 spin_unlock(&ctx->completion_lock); 7492} 7493 7494static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) 7495{ 7496 unsigned int to_submit; 7497 int ret = 0; 7498 7499 to_submit = io_sqring_entries(ctx); 7500 /* if we're handling multiple rings, cap submit size for fairness */ 7501 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) 7502 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; 7503 7504 if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { 7505 const struct cred *creds = NULL; 7506 7507 if (ctx->sq_creds != current_cred()) 7508 creds = override_creds(ctx->sq_creds); 7509 7510 mutex_lock(&ctx->uring_lock); 7511 if (!wq_list_empty(&ctx->iopoll_list)) 7512 io_do_iopoll(ctx, true); 7513 7514 /* 7515 * Don't submit if refs are dying, good for io_uring_register(), 7516 * but also it is relied upon by io_ring_exit_work() 7517 */ 7518 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && 7519 !(ctx->flags & IORING_SETUP_R_DISABLED)) 7520 ret = io_submit_sqes(ctx, to_submit); 7521 mutex_unlock(&ctx->uring_lock); 7522 7523 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) 7524 wake_up(&ctx->sqo_sq_wait); 7525 if (creds) 7526 revert_creds(creds); 7527 } 7528 7529 return ret; 7530} 7531 7532static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) 7533{ 7534 struct io_ring_ctx *ctx; 7535 unsigned sq_thread_idle = 0; 7536 7537 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7538 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); 7539 sqd->sq_thread_idle = sq_thread_idle; 7540} 7541 7542static bool io_sqd_handle_event(struct io_sq_data *sqd) 7543{ 7544 bool did_sig = false; 7545 struct ksignal ksig; 7546 7547 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || 7548 signal_pending(current)) { 7549 mutex_unlock(&sqd->lock); 7550 if (signal_pending(current)) 7551 did_sig = get_signal(&ksig); 7552 cond_resched(); 7553 mutex_lock(&sqd->lock); 7554 } 7555 return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7556} 7557 7558static int io_sq_thread(void *data) 7559{ 7560 struct io_sq_data *sqd = data; 7561 struct io_ring_ctx *ctx; 7562 unsigned long timeout = 0; 7563 char buf[TASK_COMM_LEN]; 7564 DEFINE_WAIT(wait); 7565 7566 snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); 7567 set_task_comm(current, buf); 7568 7569 if (sqd->sq_cpu != -1) 7570 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); 7571 else 7572 set_cpus_allowed_ptr(current, cpu_online_mask); 7573 current->flags |= PF_NO_SETAFFINITY; 7574 7575 audit_alloc_kernel(current); 7576 7577 mutex_lock(&sqd->lock); 7578 while (1) { 7579 bool cap_entries, 
sqt_spin = false; 7580 7581 if (io_sqd_events_pending(sqd) || signal_pending(current)) { 7582 if (io_sqd_handle_event(sqd)) 7583 break; 7584 timeout = jiffies + sqd->sq_thread_idle; 7585 } 7586 7587 cap_entries = !list_is_singular(&sqd->ctx_list); 7588 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 7589 int ret = __io_sq_thread(ctx, cap_entries); 7590 7591 if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) 7592 sqt_spin = true; 7593 } 7594 if (io_run_task_work()) 7595 sqt_spin = true; 7596 7597 if (sqt_spin || !time_after(jiffies, timeout)) { 7598 cond_resched(); 7599 if (sqt_spin) 7600 timeout = jiffies + sqd->sq_thread_idle; 7601 continue; 7602 } 7603 7604 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); 7605 if (!io_sqd_events_pending(sqd) && !current->task_works) { 7606 bool needs_sched = true; 7607 7608 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 7609 io_ring_set_wakeup_flag(ctx); 7610 7611 if ((ctx->flags & IORING_SETUP_IOPOLL) && 7612 !wq_list_empty(&ctx->iopoll_list)) { 7613 needs_sched = false; 7614 break; 7615 } 7616 if (io_sqring_entries(ctx)) { 7617 needs_sched = false; 7618 break; 7619 } 7620 } 7621 7622 if (needs_sched) { 7623 mutex_unlock(&sqd->lock); 7624 schedule(); 7625 mutex_lock(&sqd->lock); 7626 } 7627 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7628 io_ring_clear_wakeup_flag(ctx); 7629 } 7630 7631 finish_wait(&sqd->wait, &wait); 7632 timeout = jiffies + sqd->sq_thread_idle; 7633 } 7634 7635 io_uring_cancel_generic(true, sqd); 7636 sqd->thread = NULL; 7637 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7638 io_ring_set_wakeup_flag(ctx); 7639 io_run_task_work(); 7640 mutex_unlock(&sqd->lock); 7641 7642 audit_free(current); 7643 7644 complete(&sqd->exited); 7645 do_exit(0); 7646} 7647 7648struct io_wait_queue { 7649 struct wait_queue_entry wq; 7650 struct io_ring_ctx *ctx; 7651 unsigned cq_tail; 7652 unsigned nr_timeouts; 7653}; 7654 7655static inline bool io_should_wake(struct io_wait_queue *iowq) 7656{ 7657 struct io_ring_ctx *ctx = iowq->ctx; 7658 int dist = ctx->cached_cq_tail - (int) iowq->cq_tail; 7659 7660 /* 7661 * Wake up if we have enough events, or if a timeout occurred since we 7662 * started waiting. For timeouts, we always want to return to userspace, 7663 * regardless of event count. 7664 */ 7665 return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; 7666} 7667 7668static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, 7669 int wake_flags, void *key) 7670{ 7671 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, 7672 wq); 7673 7674 /* 7675 * Cannot safely flush overflowed CQEs from here, ensure we wake up 7676 * the task, and the next invocation will do it. 
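 *
 * Returning -1 below leaves this entry on the wait queue without
 * waking the task, so a later completion can still wake it.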
7677 */ 7678 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) 7679 return autoremove_wake_function(curr, mode, wake_flags, key); 7680 return -1; 7681} 7682 7683static int io_run_task_work_sig(void) 7684{ 7685 if (io_run_task_work()) 7686 return 1; 7687 if (!signal_pending(current)) 7688 return 0; 7689 if (test_thread_flag(TIF_NOTIFY_SIGNAL)) 7690 return -ERESTARTSYS; 7691 return -EINTR; 7692} 7693 7694/* when returns >0, the caller should retry */ 7695static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 7696 struct io_wait_queue *iowq, 7697 ktime_t timeout) 7698{ 7699 int ret; 7700 7701 /* make sure we run task_work before checking for signals */ 7702 ret = io_run_task_work_sig(); 7703 if (ret || io_should_wake(iowq)) 7704 return ret; 7705 /* let the caller flush overflows, retry */ 7706 if (test_bit(0, &ctx->check_cq_overflow)) 7707 return 1; 7708 7709 if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) 7710 return -ETIME; 7711 return 1; 7712} 7713 7714/* 7715 * Wait until events become available, if we don't already have some. The 7716 * application must reap them itself, as they reside on the shared cq ring. 7717 */ 7718static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, 7719 const sigset_t __user *sig, size_t sigsz, 7720 struct __kernel_timespec __user *uts) 7721{ 7722 struct io_wait_queue iowq; 7723 struct io_rings *rings = ctx->rings; 7724 ktime_t timeout = KTIME_MAX; 7725 int ret; 7726 7727 do { 7728 io_cqring_overflow_flush(ctx); 7729 if (io_cqring_events(ctx) >= min_events) 7730 return 0; 7731 if (!io_run_task_work()) 7732 break; 7733 } while (1); 7734 7735 if (uts) { 7736 struct timespec64 ts; 7737 7738 if (get_timespec64(&ts, uts)) 7739 return -EFAULT; 7740 timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); 7741 } 7742 7743 if (sig) { 7744#ifdef CONFIG_COMPAT 7745 if (in_compat_syscall()) 7746 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, 7747 sigsz); 7748 else 7749#endif 7750 ret = set_user_sigmask(sig, sigsz); 7751 7752 if (ret) 7753 return ret; 7754 } 7755 7756 init_waitqueue_func_entry(&iowq.wq, io_wake_function); 7757 iowq.wq.private = current; 7758 INIT_LIST_HEAD(&iowq.wq.entry); 7759 iowq.ctx = ctx; 7760 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 7761 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; 7762 7763 trace_io_uring_cqring_wait(ctx, min_events); 7764 do { 7765 /* if we can't even flush overflow, don't wait for more */ 7766 if (!io_cqring_overflow_flush(ctx)) { 7767 ret = -EBUSY; 7768 break; 7769 } 7770 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, 7771 TASK_INTERRUPTIBLE); 7772 ret = io_cqring_wait_schedule(ctx, &iowq, timeout); 7773 finish_wait(&ctx->cq_wait, &iowq.wq); 7774 cond_resched(); 7775 } while (ret > 0); 7776 7777 restore_saved_sigmask_unless(ret == -EINTR); 7778 7779 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; 7780} 7781 7782static void io_free_page_table(void **table, size_t size) 7783{ 7784 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); 7785 7786 for (i = 0; i < nr_tables; i++) 7787 kfree(table[i]); 7788 kfree(table); 7789} 7790 7791static __cold void **io_alloc_page_table(size_t size) 7792{ 7793 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); 7794 size_t init_size = size; 7795 void **table; 7796 7797 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); 7798 if (!table) 7799 return NULL; 7800 7801 for (i = 0; i < nr_tables; i++) { 7802 unsigned int this_size = min_t(size_t, size, PAGE_SIZE); 7803 7804 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); 7805 if (!table[i]) { 7806 io_free_page_table(table, init_size); 7807 return NULL; 7808 } 7809 size -= this_size; 7810 } 7811 return table; 7812} 7813 7814static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) 7815{ 7816 percpu_ref_exit(&ref_node->refs); 7817 kfree(ref_node); 7818} 7819 7820static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) 7821{ 7822 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); 7823 struct io_ring_ctx *ctx = node->rsrc_data->ctx; 7824 unsigned long flags; 7825 bool first_add = false; 7826 unsigned long delay = HZ; 7827 7828 spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); 7829 node->done = true; 7830 7831 /* if we are mid-quiesce then do not delay */ 7832 if (node->rsrc_data->quiesce) 7833 delay = 0; 7834 7835 while (!list_empty(&ctx->rsrc_ref_list)) { 7836 node = list_first_entry(&ctx->rsrc_ref_list, 7837 struct io_rsrc_node, node); 7838 /* recycle ref nodes in order */ 7839 if (!node->done) 7840 break; 7841 list_del(&node->node); 7842 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); 7843 } 7844 spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); 7845 7846 if (first_add) 7847 mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); 7848} 7849 7850static struct io_rsrc_node *io_rsrc_node_alloc(void) 7851{ 7852 struct io_rsrc_node *ref_node; 7853 7854 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); 7855 if (!ref_node) 7856 return NULL; 7857 7858 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, 7859 0, GFP_KERNEL)) { 7860 kfree(ref_node); 7861 return NULL; 7862 } 7863 INIT_LIST_HEAD(&ref_node->node); 7864 INIT_LIST_HEAD(&ref_node->rsrc_list); 7865 ref_node->done = false; 7866 return ref_node; 7867} 7868 7869static void io_rsrc_node_switch(struct io_ring_ctx *ctx, 7870 struct io_rsrc_data *data_to_kill) 7871 __must_hold(&ctx->uring_lock) 7872{ 7873 WARN_ON_ONCE(!ctx->rsrc_backup_node); 7874 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); 7875 7876 io_rsrc_refs_drop(ctx); 7877 7878 if (data_to_kill) { 7879 struct io_rsrc_node *rsrc_node = ctx->rsrc_node; 7880 7881 rsrc_node->rsrc_data = data_to_kill; 7882 spin_lock_irq(&ctx->rsrc_ref_lock); 7883 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); 7884 spin_unlock_irq(&ctx->rsrc_ref_lock); 7885 7886 atomic_inc(&data_to_kill->refs); 7887 percpu_ref_kill(&rsrc_node->refs); 7888 ctx->rsrc_node = NULL; 7889 } 7890 7891 if (!ctx->rsrc_node) { 7892 ctx->rsrc_node = ctx->rsrc_backup_node; 7893 ctx->rsrc_backup_node = NULL; 7894 } 7895} 7896 7897static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) 7898{ 7899 if (ctx->rsrc_backup_node) 7900 return 0; 7901 ctx->rsrc_backup_node = io_rsrc_node_alloc(); 7902 return ctx->rsrc_backup_node ? 
0 : -ENOMEM; 7903} 7904 7905static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data, 7906 struct io_ring_ctx *ctx) 7907{ 7908 int ret; 7909 7910 /* As we may drop ->uring_lock, other task may have started quiesce */ 7911 if (data->quiesce) 7912 return -ENXIO; 7913 7914 data->quiesce = true; 7915 do { 7916 ret = io_rsrc_node_switch_start(ctx); 7917 if (ret) 7918 break; 7919 io_rsrc_node_switch(ctx, data); 7920 7921 /* kill initial ref, already quiesced if zero */ 7922 if (atomic_dec_and_test(&data->refs)) 7923 break; 7924 mutex_unlock(&ctx->uring_lock); 7925 flush_delayed_work(&ctx->rsrc_put_work); 7926 ret = wait_for_completion_interruptible(&data->done); 7927 if (!ret) { 7928 mutex_lock(&ctx->uring_lock); 7929 if (atomic_read(&data->refs) > 0) { 7930 /* 7931 * it has been revived by another thread while 7932 * we were unlocked 7933 */ 7934 mutex_unlock(&ctx->uring_lock); 7935 } else { 7936 break; 7937 } 7938 } 7939 7940 atomic_inc(&data->refs); 7941 /* wait for all works potentially completing data->done */ 7942 flush_delayed_work(&ctx->rsrc_put_work); 7943 reinit_completion(&data->done); 7944 7945 ret = io_run_task_work_sig(); 7946 mutex_lock(&ctx->uring_lock); 7947 } while (ret >= 0); 7948 data->quiesce = false; 7949 7950 return ret; 7951} 7952 7953static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) 7954{ 7955 unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; 7956 unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; 7957 7958 return &data->tags[table_idx][off]; 7959} 7960 7961static void io_rsrc_data_free(struct io_rsrc_data *data) 7962{ 7963 size_t size = data->nr * sizeof(data->tags[0][0]); 7964 7965 if (data->tags) 7966 io_free_page_table((void **)data->tags, size); 7967 kfree(data); 7968} 7969 7970static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, 7971 u64 __user *utags, unsigned nr, 7972 struct io_rsrc_data **pdata) 7973{ 7974 struct io_rsrc_data *data; 7975 int ret = -ENOMEM; 7976 unsigned i; 7977 7978 data = kzalloc(sizeof(*data), GFP_KERNEL); 7979 if (!data) 7980 return -ENOMEM; 7981 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); 7982 if (!data->tags) { 7983 kfree(data); 7984 return -ENOMEM; 7985 } 7986 7987 data->nr = nr; 7988 data->ctx = ctx; 7989 data->do_put = do_put; 7990 if (utags) { 7991 ret = -EFAULT; 7992 for (i = 0; i < nr; i++) { 7993 u64 *tag_slot = io_get_tag_slot(data, i); 7994 7995 if (copy_from_user(tag_slot, &utags[i], 7996 sizeof(*tag_slot))) 7997 goto fail; 7998 } 7999 } 8000 8001 atomic_set(&data->refs, 1); 8002 init_completion(&data->done); 8003 *pdata = data; 8004 return 0; 8005fail: 8006 io_rsrc_data_free(data); 8007 return ret; 8008} 8009 8010static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) 8011{ 8012 table->files = kvcalloc(nr_files, sizeof(table->files[0]), 8013 GFP_KERNEL_ACCOUNT); 8014 return !!table->files; 8015} 8016 8017static void io_free_file_tables(struct io_file_table *table) 8018{ 8019 kvfree(table->files); 8020 table->files = NULL; 8021} 8022 8023static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) 8024{ 8025#if defined(CONFIG_UNIX) 8026 if (ctx->ring_sock) { 8027 struct sock *sock = ctx->ring_sock->sk; 8028 struct sk_buff *skb; 8029 8030 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) 8031 kfree_skb(skb); 8032 } 8033#else 8034 int i; 8035 8036 for (i = 0; i < ctx->nr_user_files; i++) { 8037 struct file *file; 8038 8039 file = io_file_from_index(ctx, i); 8040 if (file) 8041 fput(file); 8042 } 
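	/*
	 * In the CONFIG_UNIX build above, these references are instead held
	 * by SCM_RIGHTS skbs queued on the ring socket and are released by
	 * freeing those skbs rather than by explicit fput().
	 */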
8043#endif 8044 io_free_file_tables(&ctx->file_table); 8045 io_rsrc_data_free(ctx->file_data); 8046 ctx->file_data = NULL; 8047 ctx->nr_user_files = 0; 8048} 8049 8050static int io_sqe_files_unregister(struct io_ring_ctx *ctx) 8051{ 8052 int ret; 8053 8054 if (!ctx->file_data) 8055 return -ENXIO; 8056 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); 8057 if (!ret) 8058 __io_sqe_files_unregister(ctx); 8059 return ret; 8060} 8061 8062static void io_sq_thread_unpark(struct io_sq_data *sqd) 8063 __releases(&sqd->lock) 8064{ 8065 WARN_ON_ONCE(sqd->thread == current); 8066 8067 /* 8068 * Do the dance but not conditional clear_bit() because it'd race with 8069 * other threads incrementing park_pending and setting the bit. 8070 */ 8071 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 8072 if (atomic_dec_return(&sqd->park_pending)) 8073 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 8074 mutex_unlock(&sqd->lock); 8075} 8076 8077static void io_sq_thread_park(struct io_sq_data *sqd) 8078 __acquires(&sqd->lock) 8079{ 8080 WARN_ON_ONCE(sqd->thread == current); 8081 8082 atomic_inc(&sqd->park_pending); 8083 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 8084 mutex_lock(&sqd->lock); 8085 if (sqd->thread) 8086 wake_up_process(sqd->thread); 8087} 8088 8089static void io_sq_thread_stop(struct io_sq_data *sqd) 8090{ 8091 WARN_ON_ONCE(sqd->thread == current); 8092 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); 8093 8094 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 8095 mutex_lock(&sqd->lock); 8096 if (sqd->thread) 8097 wake_up_process(sqd->thread); 8098 mutex_unlock(&sqd->lock); 8099 wait_for_completion(&sqd->exited); 8100} 8101 8102static void io_put_sq_data(struct io_sq_data *sqd) 8103{ 8104 if (refcount_dec_and_test(&sqd->refs)) { 8105 WARN_ON_ONCE(atomic_read(&sqd->park_pending)); 8106 8107 io_sq_thread_stop(sqd); 8108 kfree(sqd); 8109 } 8110} 8111 8112static void io_sq_thread_finish(struct io_ring_ctx *ctx) 8113{ 8114 struct io_sq_data *sqd = ctx->sq_data; 8115 8116 if (sqd) { 8117 io_sq_thread_park(sqd); 8118 list_del_init(&ctx->sqd_list); 8119 io_sqd_update_thread_idle(sqd); 8120 io_sq_thread_unpark(sqd); 8121 8122 io_put_sq_data(sqd); 8123 ctx->sq_data = NULL; 8124 } 8125} 8126 8127static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) 8128{ 8129 struct io_ring_ctx *ctx_attach; 8130 struct io_sq_data *sqd; 8131 struct fd f; 8132 8133 f = fdget(p->wq_fd); 8134 if (!f.file) 8135 return ERR_PTR(-ENXIO); 8136 if (f.file->f_op != &io_uring_fops) { 8137 fdput(f); 8138 return ERR_PTR(-EINVAL); 8139 } 8140 8141 ctx_attach = f.file->private_data; 8142 sqd = ctx_attach->sq_data; 8143 if (!sqd) { 8144 fdput(f); 8145 return ERR_PTR(-EINVAL); 8146 } 8147 if (sqd->task_tgid != current->tgid) { 8148 fdput(f); 8149 return ERR_PTR(-EPERM); 8150 } 8151 8152 refcount_inc(&sqd->refs); 8153 fdput(f); 8154 return sqd; 8155} 8156 8157static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, 8158 bool *attached) 8159{ 8160 struct io_sq_data *sqd; 8161 8162 *attached = false; 8163 if (p->flags & IORING_SETUP_ATTACH_WQ) { 8164 sqd = io_attach_sq_data(p); 8165 if (!IS_ERR(sqd)) { 8166 *attached = true; 8167 return sqd; 8168 } 8169 /* fall through for EPERM case, setup new sqd/task */ 8170 if (PTR_ERR(sqd) != -EPERM) 8171 return sqd; 8172 } 8173 8174 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); 8175 if (!sqd) 8176 return ERR_PTR(-ENOMEM); 8177 8178 atomic_set(&sqd->park_pending, 0); 8179 refcount_set(&sqd->refs, 1); 8180 INIT_LIST_HEAD(&sqd->ctx_list); 8181 mutex_init(&sqd->lock); 8182 
init_waitqueue_head(&sqd->wait); 8183 init_completion(&sqd->exited); 8184 return sqd; 8185} 8186 8187#if defined(CONFIG_UNIX) 8188/* 8189 * Ensure the UNIX gc is aware of our file set, so we are certain that 8190 * the io_uring can be safely unregistered on process exit, even if we have 8191 * loops in the file referencing. 8192 */ 8193static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) 8194{ 8195 struct sock *sk = ctx->ring_sock->sk; 8196 struct scm_fp_list *fpl; 8197 struct sk_buff *skb; 8198 int i, nr_files; 8199 8200 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); 8201 if (!fpl) 8202 return -ENOMEM; 8203 8204 skb = alloc_skb(0, GFP_KERNEL); 8205 if (!skb) { 8206 kfree(fpl); 8207 return -ENOMEM; 8208 } 8209 8210 skb->sk = sk; 8211 8212 nr_files = 0; 8213 fpl->user = get_uid(current_user()); 8214 for (i = 0; i < nr; i++) { 8215 struct file *file = io_file_from_index(ctx, i + offset); 8216 8217 if (!file) 8218 continue; 8219 fpl->fp[nr_files] = get_file(file); 8220 unix_inflight(fpl->user, fpl->fp[nr_files]); 8221 nr_files++; 8222 } 8223 8224 if (nr_files) { 8225 fpl->max = SCM_MAX_FD; 8226 fpl->count = nr_files; 8227 UNIXCB(skb).fp = fpl; 8228 skb->destructor = unix_destruct_scm; 8229 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 8230 skb_queue_head(&sk->sk_receive_queue, skb); 8231 8232 for (i = 0; i < nr_files; i++) 8233 fput(fpl->fp[i]); 8234 } else { 8235 kfree_skb(skb); 8236 kfree(fpl); 8237 } 8238 8239 return 0; 8240} 8241 8242/* 8243 * If UNIX sockets are enabled, fd passing can cause a reference cycle which 8244 * causes regular reference counting to break down. We rely on the UNIX 8245 * garbage collection to take care of this problem for us. 8246 */ 8247static int io_sqe_files_scm(struct io_ring_ctx *ctx) 8248{ 8249 unsigned left, total; 8250 int ret = 0; 8251 8252 total = 0; 8253 left = ctx->nr_user_files; 8254 while (left) { 8255 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); 8256 8257 ret = __io_sqe_files_scm(ctx, this_files, total); 8258 if (ret) 8259 break; 8260 left -= this_files; 8261 total += this_files; 8262 } 8263 8264 if (!ret) 8265 return 0; 8266 8267 while (total < ctx->nr_user_files) { 8268 struct file *file = io_file_from_index(ctx, total); 8269 8270 if (file) 8271 fput(file); 8272 total++; 8273 } 8274 8275 return ret; 8276} 8277#else 8278static int io_sqe_files_scm(struct io_ring_ctx *ctx) 8279{ 8280 return 0; 8281} 8282#endif 8283 8284static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 8285{ 8286 struct file *file = prsrc->file; 8287#if defined(CONFIG_UNIX) 8288 struct sock *sock = ctx->ring_sock->sk; 8289 struct sk_buff_head list, *head = &sock->sk_receive_queue; 8290 struct sk_buff *skb; 8291 int i; 8292 8293 __skb_queue_head_init(&list); 8294 8295 /* 8296 * Find the skb that holds this file in its SCM_RIGHTS. When found, 8297 * remove this entry and rearrange the file array. 
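 * Any skb that still carries files after its entry is removed (and any skb
 * that never referenced this file at all) is parked on a private list and
 * re-queued onto the socket's receive queue once the scan is done, so no
 * other in-flight files are lost.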
8298 */ 8299 skb = skb_dequeue(head); 8300 while (skb) { 8301 struct scm_fp_list *fp; 8302 8303 fp = UNIXCB(skb).fp; 8304 for (i = 0; i < fp->count; i++) { 8305 int left; 8306 8307 if (fp->fp[i] != file) 8308 continue; 8309 8310 unix_notinflight(fp->user, fp->fp[i]); 8311 left = fp->count - 1 - i; 8312 if (left) { 8313 memmove(&fp->fp[i], &fp->fp[i + 1], 8314 left * sizeof(struct file *)); 8315 } 8316 fp->count--; 8317 if (!fp->count) { 8318 kfree_skb(skb); 8319 skb = NULL; 8320 } else { 8321 __skb_queue_tail(&list, skb); 8322 } 8323 fput(file); 8324 file = NULL; 8325 break; 8326 } 8327 8328 if (!file) 8329 break; 8330 8331 __skb_queue_tail(&list, skb); 8332 8333 skb = skb_dequeue(head); 8334 } 8335 8336 if (skb_peek(&list)) { 8337 spin_lock_irq(&head->lock); 8338 while ((skb = __skb_dequeue(&list)) != NULL) 8339 __skb_queue_tail(head, skb); 8340 spin_unlock_irq(&head->lock); 8341 } 8342#else 8343 fput(file); 8344#endif 8345} 8346 8347static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) 8348{ 8349 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; 8350 struct io_ring_ctx *ctx = rsrc_data->ctx; 8351 struct io_rsrc_put *prsrc, *tmp; 8352 8353 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { 8354 list_del(&prsrc->list); 8355 8356 if (prsrc->tag) { 8357 bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; 8358 8359 io_ring_submit_lock(ctx, lock_ring); 8360 spin_lock(&ctx->completion_lock); 8361 io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); 8362 io_commit_cqring(ctx); 8363 spin_unlock(&ctx->completion_lock); 8364 io_cqring_ev_posted(ctx); 8365 io_ring_submit_unlock(ctx, lock_ring); 8366 } 8367 8368 rsrc_data->do_put(ctx, prsrc); 8369 kfree(prsrc); 8370 } 8371 8372 io_rsrc_node_destroy(ref_node); 8373 if (atomic_dec_and_test(&rsrc_data->refs)) 8374 complete(&rsrc_data->done); 8375} 8376 8377static void io_rsrc_put_work(struct work_struct *work) 8378{ 8379 struct io_ring_ctx *ctx; 8380 struct llist_node *node; 8381 8382 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); 8383 node = llist_del_all(&ctx->rsrc_put_llist); 8384 8385 while (node) { 8386 struct io_rsrc_node *ref_node; 8387 struct llist_node *next = node->next; 8388 8389 ref_node = llist_entry(node, struct io_rsrc_node, llist); 8390 __io_rsrc_put_work(ref_node); 8391 node = next; 8392 } 8393} 8394 8395static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, 8396 unsigned nr_args, u64 __user *tags) 8397{ 8398 __s32 __user *fds = (__s32 __user *) arg; 8399 struct file *file; 8400 int fd, ret; 8401 unsigned i; 8402 8403 if (ctx->file_data) 8404 return -EBUSY; 8405 if (!nr_args) 8406 return -EINVAL; 8407 if (nr_args > IORING_MAX_FIXED_FILES) 8408 return -EMFILE; 8409 if (nr_args > rlimit(RLIMIT_NOFILE)) 8410 return -EMFILE; 8411 ret = io_rsrc_node_switch_start(ctx); 8412 if (ret) 8413 return ret; 8414 ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, 8415 &ctx->file_data); 8416 if (ret) 8417 return ret; 8418 8419 ret = -ENOMEM; 8420 if (!io_alloc_file_tables(&ctx->file_table, nr_args)) 8421 goto out_free; 8422 8423 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { 8424 if (copy_from_user(&fd, &fds[i], sizeof(fd))) { 8425 ret = -EFAULT; 8426 goto out_fput; 8427 } 8428 /* allow sparse sets */ 8429 if (fd == -1) { 8430 ret = -EINVAL; 8431 if (unlikely(*io_get_tag_slot(ctx->file_data, i))) 8432 goto out_fput; 8433 continue; 8434 } 8435 8436 file = fget(fd); 8437 ret = -EBADF; 8438 if (unlikely(!file)) 8439 goto out_fput; 8440 8441 /* 8442 * Don't allow io_uring instances to 
be registered. If UNIX 8443 * isn't enabled, then this causes a reference cycle and this 8444 * instance can never get freed. If UNIX is enabled we'll 8445 * handle it just fine, but there's still no point in allowing 8446 * a ring fd as it doesn't support regular read/write anyway. 8447 */ 8448 if (file->f_op == &io_uring_fops) { 8449 fput(file); 8450 goto out_fput; 8451 } 8452 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); 8453 } 8454 8455 ret = io_sqe_files_scm(ctx); 8456 if (ret) { 8457 __io_sqe_files_unregister(ctx); 8458 return ret; 8459 } 8460 8461 io_rsrc_node_switch(ctx, NULL); 8462 return ret; 8463out_fput: 8464 for (i = 0; i < ctx->nr_user_files; i++) { 8465 file = io_file_from_index(ctx, i); 8466 if (file) 8467 fput(file); 8468 } 8469 io_free_file_tables(&ctx->file_table); 8470 ctx->nr_user_files = 0; 8471out_free: 8472 io_rsrc_data_free(ctx->file_data); 8473 ctx->file_data = NULL; 8474 return ret; 8475} 8476 8477static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, 8478 int index) 8479{ 8480#if defined(CONFIG_UNIX) 8481 struct sock *sock = ctx->ring_sock->sk; 8482 struct sk_buff_head *head = &sock->sk_receive_queue; 8483 struct sk_buff *skb; 8484 8485 /* 8486 * See if we can merge this file into an existing skb SCM_RIGHTS 8487 * file set. If there's no room, fall back to allocating a new skb 8488 * and filling it in. 8489 */ 8490 spin_lock_irq(&head->lock); 8491 skb = skb_peek(head); 8492 if (skb) { 8493 struct scm_fp_list *fpl = UNIXCB(skb).fp; 8494 8495 if (fpl->count < SCM_MAX_FD) { 8496 __skb_unlink(skb, head); 8497 spin_unlock_irq(&head->lock); 8498 fpl->fp[fpl->count] = get_file(file); 8499 unix_inflight(fpl->user, fpl->fp[fpl->count]); 8500 fpl->count++; 8501 spin_lock_irq(&head->lock); 8502 __skb_queue_head(head, skb); 8503 } else { 8504 skb = NULL; 8505 } 8506 } 8507 spin_unlock_irq(&head->lock); 8508 8509 if (skb) { 8510 fput(file); 8511 return 0; 8512 } 8513 8514 return __io_sqe_files_scm(ctx, 1, index); 8515#else 8516 return 0; 8517#endif 8518} 8519 8520static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, 8521 struct io_rsrc_node *node, void *rsrc) 8522{ 8523 struct io_rsrc_put *prsrc; 8524 8525 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); 8526 if (!prsrc) 8527 return -ENOMEM; 8528 8529 prsrc->tag = *io_get_tag_slot(data, idx); 8530 prsrc->rsrc = rsrc; 8531 list_add(&prsrc->list, &node->rsrc_list); 8532 return 0; 8533} 8534 8535static int io_install_fixed_file(struct io_kiocb *req, struct file *file, 8536 unsigned int issue_flags, u32 slot_index) 8537{ 8538 struct io_ring_ctx *ctx = req->ctx; 8539 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 8540 bool needs_switch = false; 8541 struct io_fixed_file *file_slot; 8542 int ret = -EBADF; 8543 8544 io_ring_submit_lock(ctx, needs_lock); 8545 if (file->f_op == &io_uring_fops) 8546 goto err; 8547 ret = -ENXIO; 8548 if (!ctx->file_data) 8549 goto err; 8550 ret = -EINVAL; 8551 if (slot_index >= ctx->nr_user_files) 8552 goto err; 8553 8554 slot_index = array_index_nospec(slot_index, ctx->nr_user_files); 8555 file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); 8556 8557 if (file_slot->file_ptr) { 8558 struct file *old_file; 8559 8560 ret = io_rsrc_node_switch_start(ctx); 8561 if (ret) 8562 goto err; 8563 8564 old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); 8565 ret = io_queue_rsrc_removal(ctx->file_data, slot_index, 8566 ctx->rsrc_node, old_file); 8567 if (ret) 8568 goto err; 8569 file_slot->file_ptr = 0; 8570 needs_switch = true; 8571 } 
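	/*
	 * The slot is now guaranteed to be empty: clear any stale tag and
	 * install the new file below. If registration fails, the slot is
	 * zeroed again and the common error path drops the file reference.
	 */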
8572 8573 *io_get_tag_slot(ctx->file_data, slot_index) = 0; 8574 io_fixed_file_set(file_slot, file); 8575 ret = io_sqe_file_register(ctx, file, slot_index); 8576 if (ret) { 8577 file_slot->file_ptr = 0; 8578 goto err; 8579 } 8580 8581 ret = 0; 8582err: 8583 if (needs_switch) 8584 io_rsrc_node_switch(ctx, ctx->file_data); 8585 io_ring_submit_unlock(ctx, needs_lock); 8586 if (ret) 8587 fput(file); 8588 return ret; 8589} 8590 8591static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) 8592{ 8593 unsigned int offset = req->close.file_slot - 1; 8594 struct io_ring_ctx *ctx = req->ctx; 8595 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 8596 struct io_fixed_file *file_slot; 8597 struct file *file; 8598 int ret, i; 8599 8600 io_ring_submit_lock(ctx, needs_lock); 8601 ret = -ENXIO; 8602 if (unlikely(!ctx->file_data)) 8603 goto out; 8604 ret = -EINVAL; 8605 if (offset >= ctx->nr_user_files) 8606 goto out; 8607 ret = io_rsrc_node_switch_start(ctx); 8608 if (ret) 8609 goto out; 8610 8611 i = array_index_nospec(offset, ctx->nr_user_files); 8612 file_slot = io_fixed_file_slot(&ctx->file_table, i); 8613 ret = -EBADF; 8614 if (!file_slot->file_ptr) 8615 goto out; 8616 8617 file = (struct file *)(file_slot->file_ptr & FFS_MASK); 8618 ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); 8619 if (ret) 8620 goto out; 8621 8622 file_slot->file_ptr = 0; 8623 io_rsrc_node_switch(ctx, ctx->file_data); 8624 ret = 0; 8625out: 8626 io_ring_submit_unlock(ctx, needs_lock); 8627 return ret; 8628} 8629 8630static int __io_sqe_files_update(struct io_ring_ctx *ctx, 8631 struct io_uring_rsrc_update2 *up, 8632 unsigned nr_args) 8633{ 8634 u64 __user *tags = u64_to_user_ptr(up->tags); 8635 __s32 __user *fds = u64_to_user_ptr(up->data); 8636 struct io_rsrc_data *data = ctx->file_data; 8637 struct io_fixed_file *file_slot; 8638 struct file *file; 8639 int fd, i, err = 0; 8640 unsigned int done; 8641 bool needs_switch = false; 8642 8643 if (!ctx->file_data) 8644 return -ENXIO; 8645 if (up->offset + nr_args > ctx->nr_user_files) 8646 return -EINVAL; 8647 8648 for (done = 0; done < nr_args; done++) { 8649 u64 tag = 0; 8650 8651 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || 8652 copy_from_user(&fd, &fds[done], sizeof(fd))) { 8653 err = -EFAULT; 8654 break; 8655 } 8656 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { 8657 err = -EINVAL; 8658 break; 8659 } 8660 if (fd == IORING_REGISTER_FILES_SKIP) 8661 continue; 8662 8663 i = array_index_nospec(up->offset + done, ctx->nr_user_files); 8664 file_slot = io_fixed_file_slot(&ctx->file_table, i); 8665 8666 if (file_slot->file_ptr) { 8667 file = (struct file *)(file_slot->file_ptr & FFS_MASK); 8668 err = io_queue_rsrc_removal(data, up->offset + done, 8669 ctx->rsrc_node, file); 8670 if (err) 8671 break; 8672 file_slot->file_ptr = 0; 8673 needs_switch = true; 8674 } 8675 if (fd != -1) { 8676 file = fget(fd); 8677 if (!file) { 8678 err = -EBADF; 8679 break; 8680 } 8681 /* 8682 * Don't allow io_uring instances to be registered. If 8683 * UNIX isn't enabled, then this causes a reference 8684 * cycle and this instance can never get freed. If UNIX 8685 * is enabled we'll handle it just fine, but there's 8686 * still no point in allowing a ring fd as it doesn't 8687 * support regular read/write anyway. 
8688 */ 8689 if (file->f_op == &io_uring_fops) { 8690 fput(file); 8691 err = -EBADF; 8692 break; 8693 } 8694 *io_get_tag_slot(data, up->offset + done) = tag; 8695 io_fixed_file_set(file_slot, file); 8696 err = io_sqe_file_register(ctx, file, i); 8697 if (err) { 8698 file_slot->file_ptr = 0; 8699 fput(file); 8700 break; 8701 } 8702 } 8703 } 8704 8705 if (needs_switch) 8706 io_rsrc_node_switch(ctx, data); 8707 return done ? done : err; 8708} 8709 8710static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, 8711 struct task_struct *task) 8712{ 8713 struct io_wq_hash *hash; 8714 struct io_wq_data data; 8715 unsigned int concurrency; 8716 8717 mutex_lock(&ctx->uring_lock); 8718 hash = ctx->hash_map; 8719 if (!hash) { 8720 hash = kzalloc(sizeof(*hash), GFP_KERNEL); 8721 if (!hash) { 8722 mutex_unlock(&ctx->uring_lock); 8723 return ERR_PTR(-ENOMEM); 8724 } 8725 refcount_set(&hash->refs, 1); 8726 init_waitqueue_head(&hash->wait); 8727 ctx->hash_map = hash; 8728 } 8729 mutex_unlock(&ctx->uring_lock); 8730 8731 data.hash = hash; 8732 data.task = task; 8733 data.free_work = io_wq_free_work; 8734 data.do_work = io_wq_submit_work; 8735 8736 /* Do QD, or 4 * CPUS, whatever is smallest */ 8737 concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); 8738 8739 return io_wq_create(concurrency, &data); 8740} 8741 8742static __cold int io_uring_alloc_task_context(struct task_struct *task, 8743 struct io_ring_ctx *ctx) 8744{ 8745 struct io_uring_task *tctx; 8746 int ret; 8747 8748 tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); 8749 if (unlikely(!tctx)) 8750 return -ENOMEM; 8751 8752 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); 8753 if (unlikely(ret)) { 8754 kfree(tctx); 8755 return ret; 8756 } 8757 8758 tctx->io_wq = io_init_wq_offload(ctx, task); 8759 if (IS_ERR(tctx->io_wq)) { 8760 ret = PTR_ERR(tctx->io_wq); 8761 percpu_counter_destroy(&tctx->inflight); 8762 kfree(tctx); 8763 return ret; 8764 } 8765 8766 xa_init(&tctx->xa); 8767 init_waitqueue_head(&tctx->wait); 8768 atomic_set(&tctx->in_idle, 0); 8769 atomic_set(&tctx->inflight_tracked, 0); 8770 task->io_uring = tctx; 8771 spin_lock_init(&tctx->task_lock); 8772 INIT_WQ_LIST(&tctx->task_list); 8773 INIT_WQ_LIST(&tctx->prior_task_list); 8774 init_task_work(&tctx->task_work, tctx_task_work); 8775 return 0; 8776} 8777 8778void __io_uring_free(struct task_struct *tsk) 8779{ 8780 struct io_uring_task *tctx = tsk->io_uring; 8781 8782 WARN_ON_ONCE(!xa_empty(&tctx->xa)); 8783 WARN_ON_ONCE(tctx->io_wq); 8784 WARN_ON_ONCE(tctx->cached_refs); 8785 8786 percpu_counter_destroy(&tctx->inflight); 8787 kfree(tctx); 8788 tsk->io_uring = NULL; 8789} 8790 8791static __cold int io_sq_offload_create(struct io_ring_ctx *ctx, 8792 struct io_uring_params *p) 8793{ 8794 int ret; 8795 8796 /* Retain compatibility with failing for an invalid attach attempt */ 8797 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == 8798 IORING_SETUP_ATTACH_WQ) { 8799 struct fd f; 8800 8801 f = fdget(p->wq_fd); 8802 if (!f.file) 8803 return -ENXIO; 8804 if (f.file->f_op != &io_uring_fops) { 8805 fdput(f); 8806 return -EINVAL; 8807 } 8808 fdput(f); 8809 } 8810 if (ctx->flags & IORING_SETUP_SQPOLL) { 8811 struct task_struct *tsk; 8812 struct io_sq_data *sqd; 8813 bool attached; 8814 8815 ret = security_uring_sqpoll(); 8816 if (ret) 8817 return ret; 8818 8819 sqd = io_get_sq_data(p, &attached); 8820 if (IS_ERR(sqd)) { 8821 ret = PTR_ERR(sqd); 8822 goto err; 8823 } 8824 8825 ctx->sq_creds = get_current_cred(); 8826 ctx->sq_data = sqd; 8827 ctx->sq_thread_idle = 
msecs_to_jiffies(p->sq_thread_idle); 8828 if (!ctx->sq_thread_idle) 8829 ctx->sq_thread_idle = HZ; 8830 8831 io_sq_thread_park(sqd); 8832 list_add(&ctx->sqd_list, &sqd->ctx_list); 8833 io_sqd_update_thread_idle(sqd); 8834 /* don't attach to a dying SQPOLL thread, would be racy */ 8835 ret = (attached && !sqd->thread) ? -ENXIO : 0; 8836 io_sq_thread_unpark(sqd); 8837 8838 if (ret < 0) 8839 goto err; 8840 if (attached) 8841 return 0; 8842 8843 if (p->flags & IORING_SETUP_SQ_AFF) { 8844 int cpu = p->sq_thread_cpu; 8845 8846 ret = -EINVAL; 8847 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) 8848 goto err_sqpoll; 8849 sqd->sq_cpu = cpu; 8850 } else { 8851 sqd->sq_cpu = -1; 8852 } 8853 8854 sqd->task_pid = current->pid; 8855 sqd->task_tgid = current->tgid; 8856 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); 8857 if (IS_ERR(tsk)) { 8858 ret = PTR_ERR(tsk); 8859 goto err_sqpoll; 8860 } 8861 8862 sqd->thread = tsk; 8863 ret = io_uring_alloc_task_context(tsk, ctx); 8864 wake_up_new_task(tsk); 8865 if (ret) 8866 goto err; 8867 } else if (p->flags & IORING_SETUP_SQ_AFF) { 8868 /* Can't have SQ_AFF without SQPOLL */ 8869 ret = -EINVAL; 8870 goto err; 8871 } 8872 8873 return 0; 8874err_sqpoll: 8875 complete(&ctx->sq_data->exited); 8876err: 8877 io_sq_thread_finish(ctx); 8878 return ret; 8879} 8880 8881static inline void __io_unaccount_mem(struct user_struct *user, 8882 unsigned long nr_pages) 8883{ 8884 atomic_long_sub(nr_pages, &user->locked_vm); 8885} 8886 8887static inline int __io_account_mem(struct user_struct *user, 8888 unsigned long nr_pages) 8889{ 8890 unsigned long page_limit, cur_pages, new_pages; 8891 8892 /* Don't allow more pages than we can safely lock */ 8893 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 8894 8895 do { 8896 cur_pages = atomic_long_read(&user->locked_vm); 8897 new_pages = cur_pages + nr_pages; 8898 if (new_pages > page_limit) 8899 return -ENOMEM; 8900 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, 8901 new_pages) != cur_pages); 8902 8903 return 0; 8904} 8905 8906static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 8907{ 8908 if (ctx->user) 8909 __io_unaccount_mem(ctx->user, nr_pages); 8910 8911 if (ctx->mm_account) 8912 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); 8913} 8914 8915static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 8916{ 8917 int ret; 8918 8919 if (ctx->user) { 8920 ret = __io_account_mem(ctx->user, nr_pages); 8921 if (ret) 8922 return ret; 8923 } 8924 8925 if (ctx->mm_account) 8926 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); 8927 8928 return 0; 8929} 8930 8931static void io_mem_free(void *ptr) 8932{ 8933 struct page *page; 8934 8935 if (!ptr) 8936 return; 8937 8938 page = virt_to_head_page(ptr); 8939 if (put_page_testzero(page)) 8940 free_compound_page(page); 8941} 8942 8943static void *io_mem_alloc(size_t size) 8944{ 8945 gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; 8946 8947 return (void *) __get_free_pages(gfp, get_order(size)); 8948} 8949 8950static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, 8951 size_t *sq_offset) 8952{ 8953 struct io_rings *rings; 8954 size_t off, sq_array_size; 8955 8956 off = struct_size(rings, cqes, cq_entries); 8957 if (off == SIZE_MAX) 8958 return SIZE_MAX; 8959 8960#ifdef CONFIG_SMP 8961 off = ALIGN(off, SMP_CACHE_BYTES); 8962 if (off == 0) 8963 return SIZE_MAX; 8964#endif 8965 8966 if (sq_offset) 8967 *sq_offset = off; 8968 8969 sq_array_size = array_size(sizeof(u32), sq_entries); 8970 if 
(sq_array_size == SIZE_MAX) 8971 return SIZE_MAX; 8972 8973 if (check_add_overflow(off, sq_array_size, &off)) 8974 return SIZE_MAX; 8975 8976 return off; 8977} 8978 8979static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) 8980{ 8981 struct io_mapped_ubuf *imu = *slot; 8982 unsigned int i; 8983 8984 if (imu != ctx->dummy_ubuf) { 8985 for (i = 0; i < imu->nr_bvecs; i++) 8986 unpin_user_page(imu->bvec[i].bv_page); 8987 if (imu->acct_pages) 8988 io_unaccount_mem(ctx, imu->acct_pages); 8989 kvfree(imu); 8990 } 8991 *slot = NULL; 8992} 8993 8994static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 8995{ 8996 io_buffer_unmap(ctx, &prsrc->buf); 8997 prsrc->buf = NULL; 8998} 8999 9000static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 9001{ 9002 unsigned int i; 9003 9004 for (i = 0; i < ctx->nr_user_bufs; i++) 9005 io_buffer_unmap(ctx, &ctx->user_bufs[i]); 9006 kfree(ctx->user_bufs); 9007 io_rsrc_data_free(ctx->buf_data); 9008 ctx->user_bufs = NULL; 9009 ctx->buf_data = NULL; 9010 ctx->nr_user_bufs = 0; 9011} 9012 9013static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 9014{ 9015 int ret; 9016 9017 if (!ctx->buf_data) 9018 return -ENXIO; 9019 9020 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); 9021 if (!ret) 9022 __io_sqe_buffers_unregister(ctx); 9023 return ret; 9024} 9025 9026static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, 9027 void __user *arg, unsigned index) 9028{ 9029 struct iovec __user *src; 9030 9031#ifdef CONFIG_COMPAT 9032 if (ctx->compat) { 9033 struct compat_iovec __user *ciovs; 9034 struct compat_iovec ciov; 9035 9036 ciovs = (struct compat_iovec __user *) arg; 9037 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) 9038 return -EFAULT; 9039 9040 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); 9041 dst->iov_len = ciov.iov_len; 9042 return 0; 9043 } 9044#endif 9045 src = (struct iovec __user *) arg; 9046 if (copy_from_user(dst, &src[index], sizeof(*dst))) 9047 return -EFAULT; 9048 return 0; 9049} 9050 9051/* 9052 * Not super efficient, but this is just a registration time. And we do cache 9053 * the last compound head, so generally we'll only do a full search if we don't 9054 * match that one. 9055 * 9056 * We check if the given compound head page has already been accounted, to 9057 * avoid double accounting it. This allows us to account the full size of the 9058 * page, not just the constituent pages of a huge page. 
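 * As a concrete example (assuming 4 KiB base pages): a buffer backed by one
 * 2 MiB huge page is charged as 512 pages exactly once, via
 * page_size(hpage) >> PAGE_SHIFT in io_buffer_account_pin(), regardless of
 * how many of its constituent pages are pinned or how many registered
 * buffers share that same head page.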
9059 */ 9060static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, 9061 int nr_pages, struct page *hpage) 9062{ 9063 int i, j; 9064 9065 /* check current page array */ 9066 for (i = 0; i < nr_pages; i++) { 9067 if (!PageCompound(pages[i])) 9068 continue; 9069 if (compound_head(pages[i]) == hpage) 9070 return true; 9071 } 9072 9073 /* check previously registered pages */ 9074 for (i = 0; i < ctx->nr_user_bufs; i++) { 9075 struct io_mapped_ubuf *imu = ctx->user_bufs[i]; 9076 9077 for (j = 0; j < imu->nr_bvecs; j++) { 9078 if (!PageCompound(imu->bvec[j].bv_page)) 9079 continue; 9080 if (compound_head(imu->bvec[j].bv_page) == hpage) 9081 return true; 9082 } 9083 } 9084 9085 return false; 9086} 9087 9088static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, 9089 int nr_pages, struct io_mapped_ubuf *imu, 9090 struct page **last_hpage) 9091{ 9092 int i, ret; 9093 9094 imu->acct_pages = 0; 9095 for (i = 0; i < nr_pages; i++) { 9096 if (!PageCompound(pages[i])) { 9097 imu->acct_pages++; 9098 } else { 9099 struct page *hpage; 9100 9101 hpage = compound_head(pages[i]); 9102 if (hpage == *last_hpage) 9103 continue; 9104 *last_hpage = hpage; 9105 if (headpage_already_acct(ctx, pages, i, hpage)) 9106 continue; 9107 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; 9108 } 9109 } 9110 9111 if (!imu->acct_pages) 9112 return 0; 9113 9114 ret = io_account_mem(ctx, imu->acct_pages); 9115 if (ret) 9116 imu->acct_pages = 0; 9117 return ret; 9118} 9119 9120static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, 9121 struct io_mapped_ubuf **pimu, 9122 struct page **last_hpage) 9123{ 9124 struct io_mapped_ubuf *imu = NULL; 9125 struct vm_area_struct **vmas = NULL; 9126 struct page **pages = NULL; 9127 unsigned long off, start, end, ubuf; 9128 size_t size; 9129 int ret, pret, nr_pages, i; 9130 9131 if (!iov->iov_base) { 9132 *pimu = ctx->dummy_ubuf; 9133 return 0; 9134 } 9135 9136 ubuf = (unsigned long) iov->iov_base; 9137 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; 9138 start = ubuf >> PAGE_SHIFT; 9139 nr_pages = end - start; 9140 9141 *pimu = NULL; 9142 ret = -ENOMEM; 9143 9144 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); 9145 if (!pages) 9146 goto done; 9147 9148 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), 9149 GFP_KERNEL); 9150 if (!vmas) 9151 goto done; 9152 9153 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); 9154 if (!imu) 9155 goto done; 9156 9157 ret = 0; 9158 mmap_read_lock(current->mm); 9159 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, 9160 pages, vmas); 9161 if (pret == nr_pages) { 9162 /* don't support file backed memory */ 9163 for (i = 0; i < nr_pages; i++) { 9164 struct vm_area_struct *vma = vmas[i]; 9165 9166 if (vma_is_shmem(vma)) 9167 continue; 9168 if (vma->vm_file && 9169 !is_file_hugepages(vma->vm_file)) { 9170 ret = -EOPNOTSUPP; 9171 break; 9172 } 9173 } 9174 } else { 9175 ret = pret < 0 ? 
pret : -EFAULT; 9176 } 9177 mmap_read_unlock(current->mm); 9178 if (ret) { 9179 /* 9180 * if we did partial map, or found file backed vmas, 9181 * release any pages we did get 9182 */ 9183 if (pret > 0) 9184 unpin_user_pages(pages, pret); 9185 goto done; 9186 } 9187 9188 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); 9189 if (ret) { 9190 unpin_user_pages(pages, pret); 9191 goto done; 9192 } 9193 9194 off = ubuf & ~PAGE_MASK; 9195 size = iov->iov_len; 9196 for (i = 0; i < nr_pages; i++) { 9197 size_t vec_len; 9198 9199 vec_len = min_t(size_t, size, PAGE_SIZE - off); 9200 imu->bvec[i].bv_page = pages[i]; 9201 imu->bvec[i].bv_len = vec_len; 9202 imu->bvec[i].bv_offset = off; 9203 off = 0; 9204 size -= vec_len; 9205 } 9206 /* store original address for later verification */ 9207 imu->ubuf = ubuf; 9208 imu->ubuf_end = ubuf + iov->iov_len; 9209 imu->nr_bvecs = nr_pages; 9210 *pimu = imu; 9211 ret = 0; 9212done: 9213 if (ret) 9214 kvfree(imu); 9215 kvfree(pages); 9216 kvfree(vmas); 9217 return ret; 9218} 9219 9220static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) 9221{ 9222 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); 9223 return ctx->user_bufs ? 0 : -ENOMEM; 9224} 9225 9226static int io_buffer_validate(struct iovec *iov) 9227{ 9228 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); 9229 9230 /* 9231 * Don't impose further limits on the size and buffer 9232 * constraints here, we'll -EINVAL later when IO is 9233 * submitted if they are wrong. 9234 */ 9235 if (!iov->iov_base) 9236 return iov->iov_len ? -EFAULT : 0; 9237 if (!iov->iov_len) 9238 return -EFAULT; 9239 9240 /* arbitrary limit, but we need something */ 9241 if (iov->iov_len > SZ_1G) 9242 return -EFAULT; 9243 9244 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) 9245 return -EOVERFLOW; 9246 9247 return 0; 9248} 9249 9250static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, 9251 unsigned int nr_args, u64 __user *tags) 9252{ 9253 struct page *last_hpage = NULL; 9254 struct io_rsrc_data *data; 9255 int i, ret; 9256 struct iovec iov; 9257 9258 if (ctx->user_bufs) 9259 return -EBUSY; 9260 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) 9261 return -EINVAL; 9262 ret = io_rsrc_node_switch_start(ctx); 9263 if (ret) 9264 return ret; 9265 ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); 9266 if (ret) 9267 return ret; 9268 ret = io_buffers_map_alloc(ctx, nr_args); 9269 if (ret) { 9270 io_rsrc_data_free(data); 9271 return ret; 9272 } 9273 9274 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { 9275 ret = io_copy_iov(ctx, &iov, arg, i); 9276 if (ret) 9277 break; 9278 ret = io_buffer_validate(&iov); 9279 if (ret) 9280 break; 9281 if (!iov.iov_base && *io_get_tag_slot(data, i)) { 9282 ret = -EINVAL; 9283 break; 9284 } 9285 9286 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], 9287 &last_hpage); 9288 if (ret) 9289 break; 9290 } 9291 9292 WARN_ON_ONCE(ctx->buf_data); 9293 9294 ctx->buf_data = data; 9295 if (ret) 9296 __io_sqe_buffers_unregister(ctx); 9297 else 9298 io_rsrc_node_switch(ctx, NULL); 9299 return ret; 9300} 9301 9302static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, 9303 struct io_uring_rsrc_update2 *up, 9304 unsigned int nr_args) 9305{ 9306 u64 __user *tags = u64_to_user_ptr(up->tags); 9307 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); 9308 struct page *last_hpage = NULL; 9309 bool needs_switch = false; 9310 __u32 done; 9311 int i, err; 9312 9313 if 
(!ctx->buf_data) 9314 return -ENXIO; 9315 if (up->offset + nr_args > ctx->nr_user_bufs) 9316 return -EINVAL; 9317 9318 for (done = 0; done < nr_args; done++) { 9319 struct io_mapped_ubuf *imu; 9320 int offset = up->offset + done; 9321 u64 tag = 0; 9322 9323 err = io_copy_iov(ctx, &iov, iovs, done); 9324 if (err) 9325 break; 9326 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { 9327 err = -EFAULT; 9328 break; 9329 } 9330 err = io_buffer_validate(&iov); 9331 if (err) 9332 break; 9333 if (!iov.iov_base && tag) { 9334 err = -EINVAL; 9335 break; 9336 } 9337 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); 9338 if (err) 9339 break; 9340 9341 i = array_index_nospec(offset, ctx->nr_user_bufs); 9342 if (ctx->user_bufs[i] != ctx->dummy_ubuf) { 9343 err = io_queue_rsrc_removal(ctx->buf_data, offset, 9344 ctx->rsrc_node, ctx->user_bufs[i]); 9345 if (unlikely(err)) { 9346 io_buffer_unmap(ctx, &imu); 9347 break; 9348 } 9349 ctx->user_bufs[i] = NULL; 9350 needs_switch = true; 9351 } 9352 9353 ctx->user_bufs[i] = imu; 9354 *io_get_tag_slot(ctx->buf_data, offset) = tag; 9355 } 9356 9357 if (needs_switch) 9358 io_rsrc_node_switch(ctx, ctx->buf_data); 9359 return done ? done : err; 9360} 9361 9362static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) 9363{ 9364 __s32 __user *fds = arg; 9365 int fd; 9366 9367 if (ctx->cq_ev_fd) 9368 return -EBUSY; 9369 9370 if (copy_from_user(&fd, fds, sizeof(*fds))) 9371 return -EFAULT; 9372 9373 ctx->cq_ev_fd = eventfd_ctx_fdget(fd); 9374 if (IS_ERR(ctx->cq_ev_fd)) { 9375 int ret = PTR_ERR(ctx->cq_ev_fd); 9376 9377 ctx->cq_ev_fd = NULL; 9378 return ret; 9379 } 9380 9381 return 0; 9382} 9383 9384static int io_eventfd_unregister(struct io_ring_ctx *ctx) 9385{ 9386 if (ctx->cq_ev_fd) { 9387 eventfd_ctx_put(ctx->cq_ev_fd); 9388 ctx->cq_ev_fd = NULL; 9389 return 0; 9390 } 9391 9392 return -ENXIO; 9393} 9394 9395static void io_destroy_buffers(struct io_ring_ctx *ctx) 9396{ 9397 struct io_buffer *buf; 9398 unsigned long index; 9399 9400 xa_for_each(&ctx->io_buffers, index, buf) 9401 __io_remove_buffers(ctx, buf, index, -1U); 9402} 9403 9404static void io_req_caches_free(struct io_ring_ctx *ctx) 9405{ 9406 struct io_submit_state *state = &ctx->submit_state; 9407 int nr = 0; 9408 9409 mutex_lock(&ctx->uring_lock); 9410 io_flush_cached_locked_reqs(ctx, state); 9411 9412 while (state->free_list.next) { 9413 struct io_wq_work_node *node; 9414 struct io_kiocb *req; 9415 9416 node = wq_stack_extract(&state->free_list); 9417 req = container_of(node, struct io_kiocb, comp_list); 9418 kmem_cache_free(req_cachep, req); 9419 nr++; 9420 } 9421 if (nr) 9422 percpu_ref_put_many(&ctx->refs, nr); 9423 mutex_unlock(&ctx->uring_lock); 9424} 9425 9426static void io_wait_rsrc_data(struct io_rsrc_data *data) 9427{ 9428 if (data && !atomic_dec_and_test(&data->refs)) 9429 wait_for_completion(&data->done); 9430} 9431 9432static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) 9433{ 9434 io_sq_thread_finish(ctx); 9435 9436 if (ctx->mm_account) { 9437 mmdrop(ctx->mm_account); 9438 ctx->mm_account = NULL; 9439 } 9440 9441 io_rsrc_refs_drop(ctx); 9442 /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ 9443 io_wait_rsrc_data(ctx->buf_data); 9444 io_wait_rsrc_data(ctx->file_data); 9445 9446 mutex_lock(&ctx->uring_lock); 9447 if (ctx->buf_data) 9448 __io_sqe_buffers_unregister(ctx); 9449 if (ctx->file_data) 9450 __io_sqe_files_unregister(ctx); 9451 if (ctx->rings) 9452 __io_cqring_overflow_flush(ctx, true); 9453 mutex_unlock(&ctx->uring_lock); 
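	/*
	 * By now all ctx references are gone and the rsrc data has been
	 * drained above, so the remaining teardown below runs without
	 * uring_lock.
	 */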
9454 io_eventfd_unregister(ctx); 9455 io_destroy_buffers(ctx); 9456 if (ctx->sq_creds) 9457 put_cred(ctx->sq_creds); 9458 9459 /* there are no registered resources left, nobody uses it */ 9460 if (ctx->rsrc_node) 9461 io_rsrc_node_destroy(ctx->rsrc_node); 9462 if (ctx->rsrc_backup_node) 9463 io_rsrc_node_destroy(ctx->rsrc_backup_node); 9464 flush_delayed_work(&ctx->rsrc_put_work); 9465 flush_delayed_work(&ctx->fallback_work); 9466 9467 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); 9468 WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); 9469 9470#if defined(CONFIG_UNIX) 9471 if (ctx->ring_sock) { 9472 ctx->ring_sock->file = NULL; /* so that iput() is called */ 9473 sock_release(ctx->ring_sock); 9474 } 9475#endif 9476 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); 9477 9478 io_mem_free(ctx->rings); 9479 io_mem_free(ctx->sq_sqes); 9480 9481 percpu_ref_exit(&ctx->refs); 9482 free_uid(ctx->user); 9483 io_req_caches_free(ctx); 9484 if (ctx->hash_map) 9485 io_wq_put_hash(ctx->hash_map); 9486 kfree(ctx->cancel_hash); 9487 kfree(ctx->dummy_ubuf); 9488 kfree(ctx); 9489} 9490 9491static __poll_t io_uring_poll(struct file *file, poll_table *wait) 9492{ 9493 struct io_ring_ctx *ctx = file->private_data; 9494 __poll_t mask = 0; 9495 9496 poll_wait(file, &ctx->cq_wait, wait); 9497 /* 9498 * synchronizes with barrier from wq_has_sleeper call in 9499 * io_commit_cqring 9500 */ 9501 smp_rmb(); 9502 if (!io_sqring_full(ctx)) 9503 mask |= EPOLLOUT | EPOLLWRNORM; 9504 9505 /* 9506 * Don't flush cqring overflow list here, just do a simple check. 9507 * Otherwise there could possible be ABBA deadlock: 9508 * CPU0 CPU1 9509 * ---- ---- 9510 * lock(&ctx->uring_lock); 9511 * lock(&ep->mtx); 9512 * lock(&ctx->uring_lock); 9513 * lock(&ep->mtx); 9514 * 9515 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this 9516 * pushs them to do the flush. 9517 */ 9518 if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) 9519 mask |= EPOLLIN | EPOLLRDNORM; 9520 9521 return mask; 9522} 9523 9524static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) 9525{ 9526 const struct cred *creds; 9527 9528 creds = xa_erase(&ctx->personalities, id); 9529 if (creds) { 9530 put_cred(creds); 9531 return 0; 9532 } 9533 9534 return -EINVAL; 9535} 9536 9537struct io_tctx_exit { 9538 struct callback_head task_work; 9539 struct completion completion; 9540 struct io_ring_ctx *ctx; 9541}; 9542 9543static __cold void io_tctx_exit_cb(struct callback_head *cb) 9544{ 9545 struct io_uring_task *tctx = current->io_uring; 9546 struct io_tctx_exit *work; 9547 9548 work = container_of(cb, struct io_tctx_exit, task_work); 9549 /* 9550 * When @in_idle, we're in cancellation and it's racy to remove the 9551 * node. It'll be removed by the end of cancellation, just ignore it. 
9552 */ 9553 if (!atomic_read(&tctx->in_idle)) 9554 io_uring_del_tctx_node((unsigned long)work->ctx); 9555 complete(&work->completion); 9556} 9557 9558static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) 9559{ 9560 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 9561 9562 return req->ctx == data; 9563} 9564 9565static __cold void io_ring_exit_work(struct work_struct *work) 9566{ 9567 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); 9568 unsigned long timeout = jiffies + HZ * 60 * 5; 9569 unsigned long interval = HZ / 20; 9570 struct io_tctx_exit exit; 9571 struct io_tctx_node *node; 9572 int ret; 9573 9574 /* 9575 * If we're doing polled IO and end up having requests being 9576 * submitted async (out-of-line), then completions can come in while 9577 * we're waiting for refs to drop. We need to reap these manually, 9578 * as nobody else will be looking for them. 9579 */ 9580 do { 9581 io_uring_try_cancel_requests(ctx, NULL, true); 9582 if (ctx->sq_data) { 9583 struct io_sq_data *sqd = ctx->sq_data; 9584 struct task_struct *tsk; 9585 9586 io_sq_thread_park(sqd); 9587 tsk = sqd->thread; 9588 if (tsk && tsk->io_uring && tsk->io_uring->io_wq) 9589 io_wq_cancel_cb(tsk->io_uring->io_wq, 9590 io_cancel_ctx_cb, ctx, true); 9591 io_sq_thread_unpark(sqd); 9592 } 9593 9594 io_req_caches_free(ctx); 9595 9596 if (WARN_ON_ONCE(time_after(jiffies, timeout))) { 9597 /* there is little hope left, don't run it too often */ 9598 interval = HZ * 60; 9599 } 9600 } while (!wait_for_completion_timeout(&ctx->ref_comp, interval)); 9601 9602 init_completion(&exit.completion); 9603 init_task_work(&exit.task_work, io_tctx_exit_cb); 9604 exit.ctx = ctx; 9605 /* 9606 * Some may use the context even when all refs and requests have been put, 9607 * and they are free to do so while still holding uring_lock or 9608 * completion_lock, see io_req_task_submit(). Apart from other work, 9609 * this lock/unlock section also waits for them to finish.
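 * The bare completion_lock lock/unlock pair just before io_ring_ctx_free()
 * below serves the same purpose for anyone still holding completion_lock.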
9610 */ 9611 mutex_lock(&ctx->uring_lock); 9612 while (!list_empty(&ctx->tctx_list)) { 9613 WARN_ON_ONCE(time_after(jiffies, timeout)); 9614 9615 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, 9616 ctx_node); 9617 /* don't spin on a single task if cancellation failed */ 9618 list_rotate_left(&ctx->tctx_list); 9619 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); 9620 if (WARN_ON_ONCE(ret)) 9621 continue; 9622 9623 mutex_unlock(&ctx->uring_lock); 9624 wait_for_completion(&exit.completion); 9625 mutex_lock(&ctx->uring_lock); 9626 } 9627 mutex_unlock(&ctx->uring_lock); 9628 spin_lock(&ctx->completion_lock); 9629 spin_unlock(&ctx->completion_lock); 9630 9631 io_ring_ctx_free(ctx); 9632} 9633 9634/* Returns true if we found and killed one or more timeouts */ 9635static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, 9636 struct task_struct *tsk, bool cancel_all) 9637{ 9638 struct io_kiocb *req, *tmp; 9639 int canceled = 0; 9640 9641 spin_lock(&ctx->completion_lock); 9642 spin_lock_irq(&ctx->timeout_lock); 9643 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { 9644 if (io_match_task(req, tsk, cancel_all)) { 9645 io_kill_timeout(req, -ECANCELED); 9646 canceled++; 9647 } 9648 } 9649 spin_unlock_irq(&ctx->timeout_lock); 9650 if (canceled != 0) 9651 io_commit_cqring(ctx); 9652 spin_unlock(&ctx->completion_lock); 9653 if (canceled != 0) 9654 io_cqring_ev_posted(ctx); 9655 return canceled != 0; 9656} 9657 9658static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) 9659{ 9660 unsigned long index; 9661 struct creds *creds; 9662 9663 mutex_lock(&ctx->uring_lock); 9664 percpu_ref_kill(&ctx->refs); 9665 if (ctx->rings) 9666 __io_cqring_overflow_flush(ctx, true); 9667 xa_for_each(&ctx->personalities, index, creds) 9668 io_unregister_personality(ctx, index); 9669 mutex_unlock(&ctx->uring_lock); 9670 9671 io_kill_timeouts(ctx, NULL, true); 9672 io_poll_remove_all(ctx, NULL, true); 9673 9674 /* if we failed setting up the ctx, we might not have any rings */ 9675 io_iopoll_try_reap_events(ctx); 9676 9677 INIT_WORK(&ctx->exit_work, io_ring_exit_work); 9678 /* 9679 * Use system_unbound_wq to avoid spawning tons of event kworkers 9680 * if we're exiting a ton of rings at the same time. It just adds 9681 * noise and overhead, there's no discernable change in runtime 9682 * over using system_wq. 
9683 */ 9684 queue_work(system_unbound_wq, &ctx->exit_work); 9685} 9686 9687static int io_uring_release(struct inode *inode, struct file *file) 9688{ 9689 struct io_ring_ctx *ctx = file->private_data; 9690 9691 file->private_data = NULL; 9692 io_ring_ctx_wait_and_kill(ctx); 9693 return 0; 9694} 9695 9696struct io_task_cancel { 9697 struct task_struct *task; 9698 bool all; 9699}; 9700 9701static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 9702{ 9703 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 9704 struct io_task_cancel *cancel = data; 9705 9706 return io_match_task_safe(req, cancel->task, cancel->all); 9707} 9708 9709static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, 9710 struct task_struct *task, 9711 bool cancel_all) 9712{ 9713 struct io_defer_entry *de; 9714 LIST_HEAD(list); 9715 9716 spin_lock(&ctx->completion_lock); 9717 list_for_each_entry_reverse(de, &ctx->defer_list, list) { 9718 if (io_match_task_safe(de->req, task, cancel_all)) { 9719 list_cut_position(&list, &ctx->defer_list, &de->list); 9720 break; 9721 } 9722 } 9723 spin_unlock(&ctx->completion_lock); 9724 if (list_empty(&list)) 9725 return false; 9726 9727 while (!list_empty(&list)) { 9728 de = list_first_entry(&list, struct io_defer_entry, list); 9729 list_del_init(&de->list); 9730 io_req_complete_failed(de->req, -ECANCELED); 9731 kfree(de); 9732 } 9733 return true; 9734} 9735 9736static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) 9737{ 9738 struct io_tctx_node *node; 9739 enum io_wq_cancel cret; 9740 bool ret = false; 9741 9742 mutex_lock(&ctx->uring_lock); 9743 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 9744 struct io_uring_task *tctx = node->task->io_uring; 9745 9746 /* 9747 * io_wq will stay alive while we hold uring_lock, because it's 9748 * killed after ctx nodes, which requires to take the lock. 9749 */ 9750 if (!tctx || !tctx->io_wq) 9751 continue; 9752 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); 9753 ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 9754 } 9755 mutex_unlock(&ctx->uring_lock); 9756 9757 return ret; 9758} 9759 9760static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 9761 struct task_struct *task, 9762 bool cancel_all) 9763{ 9764 struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; 9765 struct io_uring_task *tctx = task ? task->io_uring : NULL; 9766 9767 while (1) { 9768 enum io_wq_cancel cret; 9769 bool ret = false; 9770 9771 if (!task) { 9772 ret |= io_uring_try_cancel_iowq(ctx); 9773 } else if (tctx && tctx->io_wq) { 9774 /* 9775 * Cancels requests of all rings, not only @ctx, but 9776 * it's fine as the task is in exit/exec. 
9777 */ 9778 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, 9779 &cancel, true); 9780 ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 9781 } 9782 9783 /* SQPOLL thread does its own polling */ 9784 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || 9785 (ctx->sq_data && ctx->sq_data->thread == current)) { 9786 while (!wq_list_empty(&ctx->iopoll_list)) { 9787 io_iopoll_try_reap_events(ctx); 9788 ret = true; 9789 } 9790 } 9791 9792 ret |= io_cancel_defer_files(ctx, task, cancel_all); 9793 ret |= io_poll_remove_all(ctx, task, cancel_all); 9794 ret |= io_kill_timeouts(ctx, task, cancel_all); 9795 if (task) 9796 ret |= io_run_task_work(); 9797 if (!ret) 9798 break; 9799 cond_resched(); 9800 } 9801} 9802 9803static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) 9804{ 9805 struct io_uring_task *tctx = current->io_uring; 9806 struct io_tctx_node *node; 9807 int ret; 9808 9809 if (unlikely(!tctx)) { 9810 ret = io_uring_alloc_task_context(current, ctx); 9811 if (unlikely(ret)) 9812 return ret; 9813 9814 tctx = current->io_uring; 9815 if (ctx->iowq_limits_set) { 9816 unsigned int limits[2] = { ctx->iowq_limits[0], 9817 ctx->iowq_limits[1], }; 9818 9819 ret = io_wq_max_workers(tctx->io_wq, limits); 9820 if (ret) 9821 return ret; 9822 } 9823 } 9824 if (!xa_load(&tctx->xa, (unsigned long)ctx)) { 9825 node = kmalloc(sizeof(*node), GFP_KERNEL); 9826 if (!node) 9827 return -ENOMEM; 9828 node->ctx = ctx; 9829 node->task = current; 9830 9831 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, 9832 node, GFP_KERNEL)); 9833 if (ret) { 9834 kfree(node); 9835 return ret; 9836 } 9837 9838 mutex_lock(&ctx->uring_lock); 9839 list_add(&node->ctx_node, &ctx->tctx_list); 9840 mutex_unlock(&ctx->uring_lock); 9841 } 9842 tctx->last = ctx; 9843 return 0; 9844} 9845 9846/* 9847 * Note that this task has used io_uring. We use it for cancelation purposes. 9848 */ 9849static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) 9850{ 9851 struct io_uring_task *tctx = current->io_uring; 9852 9853 if (likely(tctx && tctx->last == ctx)) 9854 return 0; 9855 return __io_uring_add_tctx_node(ctx); 9856} 9857 9858/* 9859 * Remove this io_uring_file -> task mapping. 9860 */ 9861static __cold void io_uring_del_tctx_node(unsigned long index) 9862{ 9863 struct io_uring_task *tctx = current->io_uring; 9864 struct io_tctx_node *node; 9865 9866 if (!tctx) 9867 return; 9868 node = xa_erase(&tctx->xa, index); 9869 if (!node) 9870 return; 9871 9872 WARN_ON_ONCE(current != node->task); 9873 WARN_ON_ONCE(list_empty(&node->ctx_node)); 9874 9875 mutex_lock(&node->ctx->uring_lock); 9876 list_del(&node->ctx_node); 9877 mutex_unlock(&node->ctx->uring_lock); 9878 9879 if (tctx->last == node->ctx) 9880 tctx->last = NULL; 9881 kfree(node); 9882} 9883 9884static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) 9885{ 9886 struct io_wq *wq = tctx->io_wq; 9887 struct io_tctx_node *node; 9888 unsigned long index; 9889 9890 xa_for_each(&tctx->xa, index, node) { 9891 io_uring_del_tctx_node(index); 9892 cond_resched(); 9893 } 9894 if (wq) { 9895 /* 9896 * Must be after io_uring_del_tctx_node() (removes nodes under 9897 * uring_lock) to avoid race with io_uring_try_cancel_iowq(). 
9898 */ 9899 io_wq_put_and_exit(wq); 9900 tctx->io_wq = NULL; 9901 } 9902} 9903 9904static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) 9905{ 9906 if (tracked) 9907 return atomic_read(&tctx->inflight_tracked); 9908 return percpu_counter_sum(&tctx->inflight); 9909} 9910 9911/* 9912 * Find any io_uring ctx that this task has registered or done IO on, and cancel 9913 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. 9914 */ 9915static __cold void io_uring_cancel_generic(bool cancel_all, 9916 struct io_sq_data *sqd) 9917{ 9918 struct io_uring_task *tctx = current->io_uring; 9919 struct io_ring_ctx *ctx; 9920 s64 inflight; 9921 DEFINE_WAIT(wait); 9922 9923 WARN_ON_ONCE(sqd && sqd->thread != current); 9924 9925 if (!current->io_uring) 9926 return; 9927 if (tctx->io_wq) 9928 io_wq_exit_start(tctx->io_wq); 9929 9930 atomic_inc(&tctx->in_idle); 9931 do { 9932 io_uring_drop_tctx_refs(current); 9933 /* read completions before cancelations */ 9934 inflight = tctx_inflight(tctx, !cancel_all); 9935 if (!inflight) 9936 break; 9937 9938 if (!sqd) { 9939 struct io_tctx_node *node; 9940 unsigned long index; 9941 9942 xa_for_each(&tctx->xa, index, node) { 9943 /* sqpoll task will cancel all its requests */ 9944 if (node->ctx->sq_data) 9945 continue; 9946 io_uring_try_cancel_requests(node->ctx, current, 9947 cancel_all); 9948 } 9949 } else { 9950 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 9951 io_uring_try_cancel_requests(ctx, current, 9952 cancel_all); 9953 } 9954 9955 prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); 9956 io_run_task_work(); 9957 io_uring_drop_tctx_refs(current); 9958 9959 /* 9960 * If we've seen completions, retry without waiting. This 9961 * avoids a race where a completion comes in before we did 9962 * prepare_to_wait(). 9963 */ 9964 if (inflight == tctx_inflight(tctx, !cancel_all)) 9965 schedule(); 9966 finish_wait(&tctx->wait, &wait); 9967 } while (1); 9968 9969 io_uring_clean_tctx(tctx); 9970 if (cancel_all) { 9971 /* 9972 * We shouldn't run task_works after cancel, so just leave 9973 * ->in_idle set for normal exit. 
9974 */ 9975 atomic_dec(&tctx->in_idle); 9976 /* for exec all current's requests should be gone, kill tctx */ 9977 __io_uring_free(current); 9978 } 9979} 9980 9981void __io_uring_cancel(bool cancel_all) 9982{ 9983 io_uring_cancel_generic(cancel_all, NULL); 9984} 9985 9986static void *io_uring_validate_mmap_request(struct file *file, 9987 loff_t pgoff, size_t sz) 9988{ 9989 struct io_ring_ctx *ctx = file->private_data; 9990 loff_t offset = pgoff << PAGE_SHIFT; 9991 struct page *page; 9992 void *ptr; 9993 9994 switch (offset) { 9995 case IORING_OFF_SQ_RING: 9996 case IORING_OFF_CQ_RING: 9997 ptr = ctx->rings; 9998 break; 9999 case IORING_OFF_SQES: 10000 ptr = ctx->sq_sqes; 10001 break; 10002 default: 10003 return ERR_PTR(-EINVAL); 10004 } 10005 10006 page = virt_to_head_page(ptr); 10007 if (sz > page_size(page)) 10008 return ERR_PTR(-EINVAL); 10009 10010 return ptr; 10011} 10012 10013#ifdef CONFIG_MMU 10014 10015static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 10016{ 10017 size_t sz = vma->vm_end - vma->vm_start; 10018 unsigned long pfn; 10019 void *ptr; 10020 10021 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); 10022 if (IS_ERR(ptr)) 10023 return PTR_ERR(ptr); 10024 10025 pfn = virt_to_phys(ptr) >> PAGE_SHIFT; 10026 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); 10027} 10028 10029#else /* !CONFIG_MMU */ 10030 10031static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 10032{ 10033 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL; 10034} 10035 10036static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) 10037{ 10038 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; 10039} 10040 10041static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, 10042 unsigned long addr, unsigned long len, 10043 unsigned long pgoff, unsigned long flags) 10044{ 10045 void *ptr; 10046 10047 ptr = io_uring_validate_mmap_request(file, pgoff, len); 10048 if (IS_ERR(ptr)) 10049 return PTR_ERR(ptr); 10050 10051 return (unsigned long) ptr; 10052} 10053 10054#endif /* !CONFIG_MMU */ 10055 10056static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) 10057{ 10058 DEFINE_WAIT(wait); 10059 10060 do { 10061 if (!io_sqring_full(ctx)) 10062 break; 10063 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); 10064 10065 if (!io_sqring_full(ctx)) 10066 break; 10067 schedule(); 10068 } while (!signal_pending(current)); 10069 10070 finish_wait(&ctx->sqo_sq_wait, &wait); 10071 return 0; 10072} 10073 10074static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, 10075 struct __kernel_timespec __user **ts, 10076 const sigset_t __user **sig) 10077{ 10078 struct io_uring_getevents_arg arg; 10079 10080 /* 10081 * If EXT_ARG isn't set, then we have no timespec and the argp pointer 10082 * is just a pointer to the sigset_t. 10083 */ 10084 if (!(flags & IORING_ENTER_EXT_ARG)) { 10085 *sig = (const sigset_t __user *) argp; 10086 *ts = NULL; 10087 return 0; 10088 } 10089 10090 /* 10091 * EXT_ARG is set - ensure we agree on the size of it and copy in our 10092 * timespec and sigset_t pointers if good. 
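 *
 * Illustrative userspace sketch only (not part of this file): with
 * IORING_ENTER_EXT_ARG set, argp points at a struct io_uring_getevents_arg
 * and argsz must be the size of that struct rather than the sigset size:
 *
 *	struct __kernel_timespec timeout = { .tv_sec = 1 };
 *	sigset_t mask;	/* filled in by the application */
 *	struct io_uring_getevents_arg arg = {
 *		.sigmask	= (__u64)(uintptr_t)&mask,
 *		.sigmask_sz	= _NSIG / 8,
 *		.ts		= (__u64)(uintptr_t)&timeout,
 *	};
 *	io_uring_enter(ring_fd, 0, 1,
 *		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
 *		       &arg, sizeof(arg));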
10093 */ 10094 if (*argsz != sizeof(arg)) 10095 return -EINVAL; 10096 if (copy_from_user(&arg, argp, sizeof(arg))) 10097 return -EFAULT; 10098 *sig = u64_to_user_ptr(arg.sigmask); 10099 *argsz = arg.sigmask_sz; 10100 *ts = u64_to_user_ptr(arg.ts); 10101 return 0; 10102} 10103 10104SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, 10105 u32, min_complete, u32, flags, const void __user *, argp, 10106 size_t, argsz) 10107{ 10108 struct io_ring_ctx *ctx; 10109 int submitted = 0; 10110 struct fd f; 10111 long ret; 10112 10113 io_run_task_work(); 10114 10115 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 10116 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) 10117 return -EINVAL; 10118 10119 f = fdget(fd); 10120 if (unlikely(!f.file)) 10121 return -EBADF; 10122 10123 ret = -EOPNOTSUPP; 10124 if (unlikely(f.file->f_op != &io_uring_fops)) 10125 goto out_fput; 10126 10127 ret = -ENXIO; 10128 ctx = f.file->private_data; 10129 if (unlikely(!percpu_ref_tryget(&ctx->refs))) 10130 goto out_fput; 10131 10132 ret = -EBADFD; 10133 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) 10134 goto out; 10135 10136 /* 10137 * For SQ polling, the thread will do all submissions and completions. 10138 * Just return the requested submit count, and wake the thread if 10139 * we were asked to. 10140 */ 10141 ret = 0; 10142 if (ctx->flags & IORING_SETUP_SQPOLL) { 10143 io_cqring_overflow_flush(ctx); 10144 10145 if (unlikely(ctx->sq_data->thread == NULL)) { 10146 ret = -EOWNERDEAD; 10147 goto out; 10148 } 10149 if (flags & IORING_ENTER_SQ_WAKEUP) 10150 wake_up(&ctx->sq_data->wait); 10151 if (flags & IORING_ENTER_SQ_WAIT) { 10152 ret = io_sqpoll_wait_sq(ctx); 10153 if (ret) 10154 goto out; 10155 } 10156 submitted = to_submit; 10157 } else if (to_submit) { 10158 ret = io_uring_add_tctx_node(ctx); 10159 if (unlikely(ret)) 10160 goto out; 10161 mutex_lock(&ctx->uring_lock); 10162 submitted = io_submit_sqes(ctx, to_submit); 10163 mutex_unlock(&ctx->uring_lock); 10164 10165 if (submitted != to_submit) 10166 goto out; 10167 } 10168 if (flags & IORING_ENTER_GETEVENTS) { 10169 const sigset_t __user *sig; 10170 struct __kernel_timespec __user *ts; 10171 10172 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 10173 if (unlikely(ret)) 10174 goto out; 10175 10176 min_complete = min(min_complete, ctx->cq_entries); 10177 10178 /* 10179 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user 10180 * space applications don't need to do io completion events 10181 * polling again, they can rely on io_sq_thread to do polling 10182 * work, which can reduce cpu usage and uring_lock contention. 10183 */ 10184 if (ctx->flags & IORING_SETUP_IOPOLL && 10185 !(ctx->flags & IORING_SETUP_SQPOLL)) { 10186 ret = io_iopoll_check(ctx, min_complete); 10187 } else { 10188 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); 10189 } 10190 } 10191 10192out: 10193 percpu_ref_put(&ctx->refs); 10194out_fput: 10195 fdput(f); 10196 return submitted ? 
submitted : ret; 10197} 10198 10199#ifdef CONFIG_PROC_FS 10200static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, 10201 const struct cred *cred) 10202{ 10203 struct user_namespace *uns = seq_user_ns(m); 10204 struct group_info *gi; 10205 kernel_cap_t cap; 10206 unsigned __capi; 10207 int g; 10208 10209 seq_printf(m, "%5d\n", id); 10210 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); 10211 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); 10212 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); 10213 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); 10214 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); 10215 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); 10216 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); 10217 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); 10218 seq_puts(m, "\n\tGroups:\t"); 10219 gi = cred->group_info; 10220 for (g = 0; g < gi->ngroups; g++) { 10221 seq_put_decimal_ull(m, g ? " " : "", 10222 from_kgid_munged(uns, gi->gid[g])); 10223 } 10224 seq_puts(m, "\n\tCapEff:\t"); 10225 cap = cred->cap_effective; 10226 CAP_FOR_EACH_U32(__capi) 10227 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); 10228 seq_putc(m, '\n'); 10229 return 0; 10230} 10231 10232static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, 10233 struct seq_file *m) 10234{ 10235 struct io_sq_data *sq = NULL; 10236 struct io_overflow_cqe *ocqe; 10237 struct io_rings *r = ctx->rings; 10238 unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; 10239 unsigned int sq_head = READ_ONCE(r->sq.head); 10240 unsigned int sq_tail = READ_ONCE(r->sq.tail); 10241 unsigned int cq_head = READ_ONCE(r->cq.head); 10242 unsigned int cq_tail = READ_ONCE(r->cq.tail); 10243 unsigned int sq_entries, cq_entries; 10244 bool has_lock; 10245 unsigned int i; 10246 10247 /* 10248 * we may get imprecise sqe and cqe info if uring is actively running 10249 * since we get cached_sq_head and cached_cq_tail without uring_lock 10250 * and sq_tail and cq_head are changed by userspace. But it's ok since 10251 * we usually use these info when it is stuck. 
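	 *
	 * (Hedged illustration, values made up: the output produced below is
	 * what an application sees when reading /proc/<pid>/fdinfo/<ring fd>,
	 * roughly of the form
	 *
	 *	SqMask:	0x7
	 *	SqHead:	4
	 *	SqTail:	4
	 *	CachedSqHead:	4
	 *	CqMask:	0xf
	 *	...
	 *	SqThread:	-1
	 *	UserFiles:	0
	 *	UserBufs:	0
	 *	PollList:
	 *	CqOverflowList:
	 *
	 * followed by any pending poll requests and overflowed CQEs.)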
10252 */ 10253 seq_printf(m, "SqMask:\t0x%x\n", sq_mask); 10254 seq_printf(m, "SqHead:\t%u\n", sq_head); 10255 seq_printf(m, "SqTail:\t%u\n", sq_tail); 10256 seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); 10257 seq_printf(m, "CqMask:\t0x%x\n", cq_mask); 10258 seq_printf(m, "CqHead:\t%u\n", cq_head); 10259 seq_printf(m, "CqTail:\t%u\n", cq_tail); 10260 seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); 10261 seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); 10262 sq_entries = min(sq_tail - sq_head, ctx->sq_entries); 10263 for (i = 0; i < sq_entries; i++) { 10264 unsigned int entry = i + sq_head; 10265 unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); 10266 struct io_uring_sqe *sqe; 10267 10268 if (sq_idx > sq_mask) 10269 continue; 10270 sqe = &ctx->sq_sqes[sq_idx]; 10271 seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", 10272 sq_idx, sqe->opcode, sqe->fd, sqe->flags, 10273 sqe->user_data); 10274 } 10275 seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); 10276 cq_entries = min(cq_tail - cq_head, ctx->cq_entries); 10277 for (i = 0; i < cq_entries; i++) { 10278 unsigned int entry = i + cq_head; 10279 struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask]; 10280 10281 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", 10282 entry & cq_mask, cqe->user_data, cqe->res, 10283 cqe->flags); 10284 } 10285 10286 /* 10287 * Avoid ABBA deadlock between the seq lock and the io_uring mutex, 10288 * since fdinfo case grabs it in the opposite direction of normal use 10289 * cases. If we fail to get the lock, we just don't iterate any 10290 * structures that could be going away outside the io_uring mutex. 10291 */ 10292 has_lock = mutex_trylock(&ctx->uring_lock); 10293 10294 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { 10295 sq = ctx->sq_data; 10296 if (!sq->thread) 10297 sq = NULL; 10298 } 10299 10300 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); 10301 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? 
task_cpu(sq->thread) : -1); 10302 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); 10303 for (i = 0; has_lock && i < ctx->nr_user_files; i++) { 10304 struct file *f = io_file_from_index(ctx, i); 10305 10306 if (f) 10307 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); 10308 else 10309 seq_printf(m, "%5u: <none>\n", i); 10310 } 10311 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); 10312 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { 10313 struct io_mapped_ubuf *buf = ctx->user_bufs[i]; 10314 unsigned int len = buf->ubuf_end - buf->ubuf; 10315 10316 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); 10317 } 10318 if (has_lock && !xa_empty(&ctx->personalities)) { 10319 unsigned long index; 10320 const struct cred *cred; 10321 10322 seq_printf(m, "Personalities:\n"); 10323 xa_for_each(&ctx->personalities, index, cred) 10324 io_uring_show_cred(m, index, cred); 10325 } 10326 if (has_lock) 10327 mutex_unlock(&ctx->uring_lock); 10328 10329 seq_puts(m, "PollList:\n"); 10330 spin_lock(&ctx->completion_lock); 10331 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 10332 struct hlist_head *list = &ctx->cancel_hash[i]; 10333 struct io_kiocb *req; 10334 10335 hlist_for_each_entry(req, list, hash_node) 10336 seq_printf(m, " op=%d, task_works=%d\n", req->opcode, 10337 req->task->task_works != NULL); 10338 } 10339 10340 seq_puts(m, "CqOverflowList:\n"); 10341 list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { 10342 struct io_uring_cqe *cqe = &ocqe->cqe; 10343 10344 seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", 10345 cqe->user_data, cqe->res, cqe->flags); 10346 10347 } 10348 10349 spin_unlock(&ctx->completion_lock); 10350} 10351 10352static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) 10353{ 10354 struct io_ring_ctx *ctx = f->private_data; 10355 10356 if (percpu_ref_tryget(&ctx->refs)) { 10357 __io_uring_show_fdinfo(ctx, m); 10358 percpu_ref_put(&ctx->refs); 10359 } 10360} 10361#endif 10362 10363static const struct file_operations io_uring_fops = { 10364 .release = io_uring_release, 10365 .mmap = io_uring_mmap, 10366#ifndef CONFIG_MMU 10367 .get_unmapped_area = io_uring_nommu_get_unmapped_area, 10368 .mmap_capabilities = io_uring_nommu_mmap_capabilities, 10369#endif 10370 .poll = io_uring_poll, 10371#ifdef CONFIG_PROC_FS 10372 .show_fdinfo = io_uring_show_fdinfo, 10373#endif 10374}; 10375 10376static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, 10377 struct io_uring_params *p) 10378{ 10379 struct io_rings *rings; 10380 size_t size, sq_array_offset; 10381 10382 /* make sure these are sane, as we already accounted them */ 10383 ctx->sq_entries = p->sq_entries; 10384 ctx->cq_entries = p->cq_entries; 10385 10386 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); 10387 if (size == SIZE_MAX) 10388 return -EOVERFLOW; 10389 10390 rings = io_mem_alloc(size); 10391 if (!rings) 10392 return -ENOMEM; 10393 10394 ctx->rings = rings; 10395 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 10396 rings->sq_ring_mask = p->sq_entries - 1; 10397 rings->cq_ring_mask = p->cq_entries - 1; 10398 rings->sq_ring_entries = p->sq_entries; 10399 rings->cq_ring_entries = p->cq_entries; 10400 10401 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); 10402 if (size == SIZE_MAX) { 10403 io_mem_free(ctx->rings); 10404 ctx->rings = NULL; 10405 return -EOVERFLOW; 10406 } 10407 10408 ctx->sq_sqes = io_mem_alloc(size); 10409 if (!ctx->sq_sqes) { 10410 io_mem_free(ctx->rings); 10411 ctx->rings = NULL; 10412 return -ENOMEM; 
10413 } 10414 10415 return 0; 10416} 10417 10418static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) 10419{ 10420 int ret, fd; 10421 10422 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); 10423 if (fd < 0) 10424 return fd; 10425 10426 ret = io_uring_add_tctx_node(ctx); 10427 if (ret) { 10428 put_unused_fd(fd); 10429 return ret; 10430 } 10431 fd_install(fd, file); 10432 return fd; 10433} 10434 10435/* 10436 * Allocate an anonymous fd, this is what constitutes the application 10437 * visible backing of an io_uring instance. The application mmaps this 10438 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, 10439 * we have to tie this fd to a socket for file garbage collection purposes. 10440 */ 10441static struct file *io_uring_get_file(struct io_ring_ctx *ctx) 10442{ 10443 struct file *file; 10444#if defined(CONFIG_UNIX) 10445 int ret; 10446 10447 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, 10448 &ctx->ring_sock); 10449 if (ret) 10450 return ERR_PTR(ret); 10451#endif 10452 10453 file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, 10454 O_RDWR | O_CLOEXEC, NULL); 10455#if defined(CONFIG_UNIX) 10456 if (IS_ERR(file)) { 10457 sock_release(ctx->ring_sock); 10458 ctx->ring_sock = NULL; 10459 } else { 10460 ctx->ring_sock->file = file; 10461 } 10462#endif 10463 return file; 10464} 10465 10466static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, 10467 struct io_uring_params __user *params) 10468{ 10469 struct io_ring_ctx *ctx; 10470 struct file *file; 10471 int ret; 10472 10473 if (!entries) 10474 return -EINVAL; 10475 if (entries > IORING_MAX_ENTRIES) { 10476 if (!(p->flags & IORING_SETUP_CLAMP)) 10477 return -EINVAL; 10478 entries = IORING_MAX_ENTRIES; 10479 } 10480 10481 /* 10482 * Use twice as many entries for the CQ ring. It's possible for the 10483 * application to drive a higher depth than the size of the SQ ring, 10484 * since the sqes are only used at submission time. This allows for 10485 * some flexibility in overcommitting a bit. If the application has 10486 * set IORING_SETUP_CQSIZE, it will have passed in the desired number 10487 * of CQ ring entries manually. 10488 */ 10489 p->sq_entries = roundup_pow_of_two(entries); 10490 if (p->flags & IORING_SETUP_CQSIZE) { 10491 /* 10492 * If IORING_SETUP_CQSIZE is set, we do the same roundup 10493 * to a power-of-two, if it isn't already. We do NOT impose 10494 * any cq vs sq ring sizing. 10495 */ 10496 if (!p->cq_entries) 10497 return -EINVAL; 10498 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { 10499 if (!(p->flags & IORING_SETUP_CLAMP)) 10500 return -EINVAL; 10501 p->cq_entries = IORING_MAX_CQ_ENTRIES; 10502 } 10503 p->cq_entries = roundup_pow_of_two(p->cq_entries); 10504 if (p->cq_entries < p->sq_entries) 10505 return -EINVAL; 10506 } else { 10507 p->cq_entries = 2 * p->sq_entries; 10508 } 10509 10510 ctx = io_ring_ctx_alloc(p); 10511 if (!ctx) 10512 return -ENOMEM; 10513 ctx->compat = in_compat_syscall(); 10514 if (!capable(CAP_IPC_LOCK)) 10515 ctx->user = get_uid(current_user()); 10516 10517 /* 10518 * This is just grabbed for accounting purposes. When a process exits, 10519 * the mm is exited and dropped before the files, hence we need to hang 10520 * on to this mm purely for the purposes of being able to unaccount 10521 * memory (locked/pinned vm). It's not used for anything else. 
10522 */ 10523 mmgrab(current->mm); 10524 ctx->mm_account = current->mm; 10525 10526 ret = io_allocate_scq_urings(ctx, p); 10527 if (ret) 10528 goto err; 10529 10530 ret = io_sq_offload_create(ctx, p); 10531 if (ret) 10532 goto err; 10533 /* always set a rsrc node */ 10534 ret = io_rsrc_node_switch_start(ctx); 10535 if (ret) 10536 goto err; 10537 io_rsrc_node_switch(ctx, NULL); 10538 10539 memset(&p->sq_off, 0, sizeof(p->sq_off)); 10540 p->sq_off.head = offsetof(struct io_rings, sq.head); 10541 p->sq_off.tail = offsetof(struct io_rings, sq.tail); 10542 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); 10543 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); 10544 p->sq_off.flags = offsetof(struct io_rings, sq_flags); 10545 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); 10546 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; 10547 10548 memset(&p->cq_off, 0, sizeof(p->cq_off)); 10549 p->cq_off.head = offsetof(struct io_rings, cq.head); 10550 p->cq_off.tail = offsetof(struct io_rings, cq.tail); 10551 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); 10552 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); 10553 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); 10554 p->cq_off.cqes = offsetof(struct io_rings, cqes); 10555 p->cq_off.flags = offsetof(struct io_rings, cq_flags); 10556 10557 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | 10558 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | 10559 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | 10560 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | 10561 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | 10562 IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP; 10563 10564 if (copy_to_user(params, p, sizeof(*p))) { 10565 ret = -EFAULT; 10566 goto err; 10567 } 10568 10569 file = io_uring_get_file(ctx); 10570 if (IS_ERR(file)) { 10571 ret = PTR_ERR(file); 10572 goto err; 10573 } 10574 10575 /* 10576 * Install ring fd as the very last thing, so we don't risk someone 10577 * having closed it before we finish setup 10578 */ 10579 ret = io_uring_install_fd(ctx, file); 10580 if (ret < 0) { 10581 /* fput will clean it up */ 10582 fput(file); 10583 return ret; 10584 } 10585 10586 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); 10587 return ret; 10588err: 10589 io_ring_ctx_wait_and_kill(ctx); 10590 return ret; 10591} 10592 10593/* 10594 * Sets up an aio uring context, and returns the fd. Applications asks for a 10595 * ring size, we return the actual sq/cq ring sizes (among other things) in the 10596 * params structure passed in. 
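 *
 * As a hedged sketch of the userspace side (not defined here): the returned
 * sq_off/cq_off values are byte offsets into the regions mmap'ed at
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING, so an application would
 * typically size its mappings as
 *
 *	sq_bytes = p.sq_off.array + p.sq_entries * sizeof(__u32);
 *	cq_bytes = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
 *
 * and, when IORING_FEAT_SINGLE_MMAP is advertised, cover both rings with a
 * single mapping of the larger of the two sizes.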
10597 */ 10598static long io_uring_setup(u32 entries, struct io_uring_params __user *params) 10599{ 10600 struct io_uring_params p; 10601 int i; 10602 10603 if (copy_from_user(&p, params, sizeof(p))) 10604 return -EFAULT; 10605 for (i = 0; i < ARRAY_SIZE(p.resv); i++) { 10606 if (p.resv[i]) 10607 return -EINVAL; 10608 } 10609 10610 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | 10611 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | 10612 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | 10613 IORING_SETUP_R_DISABLED)) 10614 return -EINVAL; 10615 10616 return io_uring_create(entries, &p, params); 10617} 10618 10619SYSCALL_DEFINE2(io_uring_setup, u32, entries, 10620 struct io_uring_params __user *, params) 10621{ 10622 return io_uring_setup(entries, params); 10623} 10624 10625static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, 10626 unsigned nr_args) 10627{ 10628 struct io_uring_probe *p; 10629 size_t size; 10630 int i, ret; 10631 10632 size = struct_size(p, ops, nr_args); 10633 if (size == SIZE_MAX) 10634 return -EOVERFLOW; 10635 p = kzalloc(size, GFP_KERNEL); 10636 if (!p) 10637 return -ENOMEM; 10638 10639 ret = -EFAULT; 10640 if (copy_from_user(p, arg, size)) 10641 goto out; 10642 ret = -EINVAL; 10643 if (memchr_inv(p, 0, size)) 10644 goto out; 10645 10646 p->last_op = IORING_OP_LAST - 1; 10647 if (nr_args > IORING_OP_LAST) 10648 nr_args = IORING_OP_LAST; 10649 10650 for (i = 0; i < nr_args; i++) { 10651 p->ops[i].op = i; 10652 if (!io_op_defs[i].not_supported) 10653 p->ops[i].flags = IO_URING_OP_SUPPORTED; 10654 } 10655 p->ops_len = i; 10656 10657 ret = 0; 10658 if (copy_to_user(arg, p, size)) 10659 ret = -EFAULT; 10660out: 10661 kfree(p); 10662 return ret; 10663} 10664 10665static int io_register_personality(struct io_ring_ctx *ctx) 10666{ 10667 const struct cred *creds; 10668 u32 id; 10669 int ret; 10670 10671 creds = get_current_cred(); 10672 10673 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, 10674 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); 10675 if (ret < 0) { 10676 put_cred(creds); 10677 return ret; 10678 } 10679 return id; 10680} 10681 10682static __cold int io_register_restrictions(struct io_ring_ctx *ctx, 10683 void __user *arg, unsigned int nr_args) 10684{ 10685 struct io_uring_restriction *res; 10686 size_t size; 10687 int i, ret; 10688 10689 /* Restrictions allowed only if rings started disabled */ 10690 if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 10691 return -EBADFD; 10692 10693 /* We allow only a single restrictions registration */ 10694 if (ctx->restrictions.registered) 10695 return -EBUSY; 10696 10697 if (!arg || nr_args > IORING_MAX_RESTRICTIONS) 10698 return -EINVAL; 10699 10700 size = array_size(nr_args, sizeof(*res)); 10701 if (size == SIZE_MAX) 10702 return -EOVERFLOW; 10703 10704 res = memdup_user(arg, size); 10705 if (IS_ERR(res)) 10706 return PTR_ERR(res); 10707 10708 ret = 0; 10709 10710 for (i = 0; i < nr_args; i++) { 10711 switch (res[i].opcode) { 10712 case IORING_RESTRICTION_REGISTER_OP: 10713 if (res[i].register_op >= IORING_REGISTER_LAST) { 10714 ret = -EINVAL; 10715 goto out; 10716 } 10717 10718 __set_bit(res[i].register_op, 10719 ctx->restrictions.register_op); 10720 break; 10721 case IORING_RESTRICTION_SQE_OP: 10722 if (res[i].sqe_op >= IORING_OP_LAST) { 10723 ret = -EINVAL; 10724 goto out; 10725 } 10726 10727 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); 10728 break; 10729 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: 10730 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; 10731 break; 10732 
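		/*
		 * A hedged userspace sketch (not part of this file) of how the
		 * whole restriction flow is driven: the ring is created with
		 * IORING_SETUP_R_DISABLED, the allowed operations are
		 * registered, and only then is the ring enabled, e.g.
		 *
		 *	struct io_uring_restriction res[2] = {
		 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
		 *		  .sqe_op = IORING_OP_READV },
		 *		{ .opcode = IORING_RESTRICTION_REGISTER_OP,
		 *		  .register_op = IORING_REGISTER_BUFFERS },
		 *	};
		 *
		 *	syscall(__NR_io_uring_register, ring_fd,
		 *		IORING_REGISTER_RESTRICTIONS, res, 2);
		 *	syscall(__NR_io_uring_register, ring_fd,
		 *		IORING_REGISTER_ENABLE_RINGS, NULL, 0);
		 *
		 * after which only the whitelisted SQE opcodes, register
		 * opcodes and SQE flags are accepted on this ring.
		 */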
case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: 10733 ctx->restrictions.sqe_flags_required = res[i].sqe_flags; 10734 break; 10735 default: 10736 ret = -EINVAL; 10737 goto out; 10738 } 10739 } 10740 10741out: 10742 /* Reset all restrictions if an error happened */ 10743 if (ret != 0) 10744 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); 10745 else 10746 ctx->restrictions.registered = true; 10747 10748 kfree(res); 10749 return ret; 10750} 10751 10752static int io_register_enable_rings(struct io_ring_ctx *ctx) 10753{ 10754 if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 10755 return -EBADFD; 10756 10757 if (ctx->restrictions.registered) 10758 ctx->restricted = 1; 10759 10760 ctx->flags &= ~IORING_SETUP_R_DISABLED; 10761 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) 10762 wake_up(&ctx->sq_data->wait); 10763 return 0; 10764} 10765 10766static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, 10767 struct io_uring_rsrc_update2 *up, 10768 unsigned nr_args) 10769{ 10770 __u32 tmp; 10771 int err; 10772 10773 if (up->resv) 10774 return -EINVAL; 10775 if (check_add_overflow(up->offset, nr_args, &tmp)) 10776 return -EOVERFLOW; 10777 err = io_rsrc_node_switch_start(ctx); 10778 if (err) 10779 return err; 10780 10781 switch (type) { 10782 case IORING_RSRC_FILE: 10783 return __io_sqe_files_update(ctx, up, nr_args); 10784 case IORING_RSRC_BUFFER: 10785 return __io_sqe_buffers_update(ctx, up, nr_args); 10786 } 10787 return -EINVAL; 10788} 10789 10790static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, 10791 unsigned nr_args) 10792{ 10793 struct io_uring_rsrc_update2 up; 10794 10795 if (!nr_args) 10796 return -EINVAL; 10797 memset(&up, 0, sizeof(up)); 10798 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) 10799 return -EFAULT; 10800 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); 10801} 10802 10803static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, 10804 unsigned size, unsigned type) 10805{ 10806 struct io_uring_rsrc_update2 up; 10807 10808 if (size != sizeof(up)) 10809 return -EINVAL; 10810 if (copy_from_user(&up, arg, sizeof(up))) 10811 return -EFAULT; 10812 if (!up.nr || up.resv) 10813 return -EINVAL; 10814 return __io_register_rsrc_update(ctx, type, &up, up.nr); 10815} 10816 10817static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, 10818 unsigned int size, unsigned int type) 10819{ 10820 struct io_uring_rsrc_register rr; 10821 10822 /* keep it extendible */ 10823 if (size != sizeof(rr)) 10824 return -EINVAL; 10825 10826 memset(&rr, 0, sizeof(rr)); 10827 if (copy_from_user(&rr, arg, size)) 10828 return -EFAULT; 10829 if (!rr.nr || rr.resv || rr.resv2) 10830 return -EINVAL; 10831 10832 switch (type) { 10833 case IORING_RSRC_FILE: 10834 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), 10835 rr.nr, u64_to_user_ptr(rr.tags)); 10836 case IORING_RSRC_BUFFER: 10837 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), 10838 rr.nr, u64_to_user_ptr(rr.tags)); 10839 } 10840 return -EINVAL; 10841} 10842 10843static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, 10844 void __user *arg, unsigned len) 10845{ 10846 struct io_uring_task *tctx = current->io_uring; 10847 cpumask_var_t new_mask; 10848 int ret; 10849 10850 if (!tctx || !tctx->io_wq) 10851 return -EINVAL; 10852 10853 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 10854 return -ENOMEM; 10855 10856 cpumask_clear(new_mask); 10857 if (len > cpumask_size()) 10858 len = cpumask_size(); 
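	/*
	 * Hedged illustration of the caller's side (not defined here): the
	 * mask arrives in cpu_set_t layout with its byte length in nr_args,
	 * e.g.
	 *
	 *	cpu_set_t set;
	 *
	 *	CPU_ZERO(&set);
	 *	CPU_SET(2, &set);
	 *	syscall(__NR_io_uring_register, ring_fd,
	 *		IORING_REGISTER_IOWQ_AFF, &set, sizeof(set));
	 *
	 * Oversized masks were clamped above, so the copy below never reads
	 * more than the kernel's own cpumask can hold.
	 */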
10859 10860 if (copy_from_user(new_mask, arg, len)) { 10861 free_cpumask_var(new_mask); 10862 return -EFAULT; 10863 } 10864 10865 ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); 10866 free_cpumask_var(new_mask); 10867 return ret; 10868} 10869 10870static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) 10871{ 10872 struct io_uring_task *tctx = current->io_uring; 10873 10874 if (!tctx || !tctx->io_wq) 10875 return -EINVAL; 10876 10877 return io_wq_cpu_affinity(tctx->io_wq, NULL); 10878} 10879 10880static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, 10881 void __user *arg) 10882 __must_hold(&ctx->uring_lock) 10883{ 10884 struct io_tctx_node *node; 10885 struct io_uring_task *tctx = NULL; 10886 struct io_sq_data *sqd = NULL; 10887 __u32 new_count[2]; 10888 int i, ret; 10889 10890 if (copy_from_user(new_count, arg, sizeof(new_count))) 10891 return -EFAULT; 10892 for (i = 0; i < ARRAY_SIZE(new_count); i++) 10893 if (new_count[i] > INT_MAX) 10894 return -EINVAL; 10895 10896 if (ctx->flags & IORING_SETUP_SQPOLL) { 10897 sqd = ctx->sq_data; 10898 if (sqd) { 10899 /* 10900 * Observe the correct sqd->lock -> ctx->uring_lock 10901 * ordering. Fine to drop uring_lock here, we hold 10902 * a ref to the ctx. 10903 */ 10904 refcount_inc(&sqd->refs); 10905 mutex_unlock(&ctx->uring_lock); 10906 mutex_lock(&sqd->lock); 10907 mutex_lock(&ctx->uring_lock); 10908 if (sqd->thread) 10909 tctx = sqd->thread->io_uring; 10910 } 10911 } else { 10912 tctx = current->io_uring; 10913 } 10914 10915 BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); 10916 10917 for (i = 0; i < ARRAY_SIZE(new_count); i++) 10918 if (new_count[i]) 10919 ctx->iowq_limits[i] = new_count[i]; 10920 ctx->iowq_limits_set = true; 10921 10922 if (tctx && tctx->io_wq) { 10923 ret = io_wq_max_workers(tctx->io_wq, new_count); 10924 if (ret) 10925 goto err; 10926 } else { 10927 memset(new_count, 0, sizeof(new_count)); 10928 } 10929 10930 if (sqd) { 10931 mutex_unlock(&sqd->lock); 10932 io_put_sq_data(sqd); 10933 } 10934 10935 if (copy_to_user(arg, new_count, sizeof(new_count))) 10936 return -EFAULT; 10937 10938 /* that's it for SQPOLL, only the SQPOLL task creates requests */ 10939 if (sqd) 10940 return 0; 10941 10942 /* now propagate the restriction to all registered users */ 10943 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 10944 struct io_uring_task *tctx = node->task->io_uring; 10945 10946 if (WARN_ON_ONCE(!tctx->io_wq)) 10947 continue; 10948 10949 for (i = 0; i < ARRAY_SIZE(new_count); i++) 10950 new_count[i] = ctx->iowq_limits[i]; 10951 /* ignore errors, it always returns zero anyway */ 10952 (void)io_wq_max_workers(tctx->io_wq, new_count); 10953 } 10954 return 0; 10955err: 10956 if (sqd) { 10957 mutex_unlock(&sqd->lock); 10958 io_put_sq_data(sqd); 10959 } 10960 return ret; 10961} 10962 10963static bool io_register_op_must_quiesce(int op) 10964{ 10965 switch (op) { 10966 case IORING_REGISTER_BUFFERS: 10967 case IORING_UNREGISTER_BUFFERS: 10968 case IORING_REGISTER_FILES: 10969 case IORING_UNREGISTER_FILES: 10970 case IORING_REGISTER_FILES_UPDATE: 10971 case IORING_REGISTER_PROBE: 10972 case IORING_REGISTER_PERSONALITY: 10973 case IORING_UNREGISTER_PERSONALITY: 10974 case IORING_REGISTER_FILES2: 10975 case IORING_REGISTER_FILES_UPDATE2: 10976 case IORING_REGISTER_BUFFERS2: 10977 case IORING_REGISTER_BUFFERS_UPDATE: 10978 case IORING_REGISTER_IOWQ_AFF: 10979 case IORING_UNREGISTER_IOWQ_AFF: 10980 case IORING_REGISTER_IOWQ_MAX_WORKERS: 10981 return false; 10982 default: 10983 return true; 10984 } 
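	/*
	 * Roughly speaking, the opcodes listed above either go through the
	 * rsrc node machinery or only touch io-wq state, so they can run
	 * without idling the ring. Everything hitting the default branch
	 * (eventfd registration, restrictions, enabling rings, ...) is still
	 * serialized against inflight requests via io_ctx_quiesce() below.
	 */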
10985} 10986 10987static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx) 10988{ 10989 long ret; 10990 10991 percpu_ref_kill(&ctx->refs); 10992 10993 /* 10994 * Drop uring mutex before waiting for references to exit. If another 10995 * thread is currently inside io_uring_enter() it might need to grab the 10996 * uring_lock to make progress. If we hold it here across the drain 10997 * wait, then we can deadlock. It's safe to drop the mutex here, since 10998 * no new references will come in after we've killed the percpu ref. 10999 */ 11000 mutex_unlock(&ctx->uring_lock); 11001 do { 11002 ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ); 11003 if (ret) { 11004 ret = min(0L, ret); 11005 break; 11006 } 11007 11008 ret = io_run_task_work_sig(); 11009 io_req_caches_free(ctx); 11010 } while (ret >= 0); 11011 mutex_lock(&ctx->uring_lock); 11012 11013 if (ret) 11014 io_refs_resurrect(&ctx->refs, &ctx->ref_comp); 11015 return ret; 11016} 11017 11018static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, 11019 void __user *arg, unsigned nr_args) 11020 __releases(ctx->uring_lock) 11021 __acquires(ctx->uring_lock) 11022{ 11023 int ret; 11024 11025 /* 11026 * We're inside the ring mutex, if the ref is already dying, then 11027 * someone else killed the ctx or is already going through 11028 * io_uring_register(). 11029 */ 11030 if (percpu_ref_is_dying(&ctx->refs)) 11031 return -ENXIO; 11032 11033 if (ctx->restricted) { 11034 if (opcode >= IORING_REGISTER_LAST) 11035 return -EINVAL; 11036 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); 11037 if (!test_bit(opcode, ctx->restrictions.register_op)) 11038 return -EACCES; 11039 } 11040 11041 if (io_register_op_must_quiesce(opcode)) { 11042 ret = io_ctx_quiesce(ctx); 11043 if (ret) 11044 return ret; 11045 } 11046 11047 switch (opcode) { 11048 case IORING_REGISTER_BUFFERS: 11049 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); 11050 break; 11051 case IORING_UNREGISTER_BUFFERS: 11052 ret = -EINVAL; 11053 if (arg || nr_args) 11054 break; 11055 ret = io_sqe_buffers_unregister(ctx); 11056 break; 11057 case IORING_REGISTER_FILES: 11058 ret = io_sqe_files_register(ctx, arg, nr_args, NULL); 11059 break; 11060 case IORING_UNREGISTER_FILES: 11061 ret = -EINVAL; 11062 if (arg || nr_args) 11063 break; 11064 ret = io_sqe_files_unregister(ctx); 11065 break; 11066 case IORING_REGISTER_FILES_UPDATE: 11067 ret = io_register_files_update(ctx, arg, nr_args); 11068 break; 11069 case IORING_REGISTER_EVENTFD: 11070 case IORING_REGISTER_EVENTFD_ASYNC: 11071 ret = -EINVAL; 11072 if (nr_args != 1) 11073 break; 11074 ret = io_eventfd_register(ctx, arg); 11075 if (ret) 11076 break; 11077 if (opcode == IORING_REGISTER_EVENTFD_ASYNC) 11078 ctx->eventfd_async = 1; 11079 else 11080 ctx->eventfd_async = 0; 11081 break; 11082 case IORING_UNREGISTER_EVENTFD: 11083 ret = -EINVAL; 11084 if (arg || nr_args) 11085 break; 11086 ret = io_eventfd_unregister(ctx); 11087 break; 11088 case IORING_REGISTER_PROBE: 11089 ret = -EINVAL; 11090 if (!arg || nr_args > 256) 11091 break; 11092 ret = io_probe(ctx, arg, nr_args); 11093 break; 11094 case IORING_REGISTER_PERSONALITY: 11095 ret = -EINVAL; 11096 if (arg || nr_args) 11097 break; 11098 ret = io_register_personality(ctx); 11099 break; 11100 case IORING_UNREGISTER_PERSONALITY: 11101 ret = -EINVAL; 11102 if (arg) 11103 break; 11104 ret = io_unregister_personality(ctx, nr_args); 11105 break; 11106 case IORING_REGISTER_ENABLE_RINGS: 11107 ret = -EINVAL; 11108 if (arg || nr_args) 11109 break; 11110 ret = 
io_register_enable_rings(ctx); 11111 break; 11112 case IORING_REGISTER_RESTRICTIONS: 11113 ret = io_register_restrictions(ctx, arg, nr_args); 11114 break; 11115 case IORING_REGISTER_FILES2: 11116 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); 11117 break; 11118 case IORING_REGISTER_FILES_UPDATE2: 11119 ret = io_register_rsrc_update(ctx, arg, nr_args, 11120 IORING_RSRC_FILE); 11121 break; 11122 case IORING_REGISTER_BUFFERS2: 11123 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); 11124 break; 11125 case IORING_REGISTER_BUFFERS_UPDATE: 11126 ret = io_register_rsrc_update(ctx, arg, nr_args, 11127 IORING_RSRC_BUFFER); 11128 break; 11129 case IORING_REGISTER_IOWQ_AFF: 11130 ret = -EINVAL; 11131 if (!arg || !nr_args) 11132 break; 11133 ret = io_register_iowq_aff(ctx, arg, nr_args); 11134 break; 11135 case IORING_UNREGISTER_IOWQ_AFF: 11136 ret = -EINVAL; 11137 if (arg || nr_args) 11138 break; 11139 ret = io_unregister_iowq_aff(ctx); 11140 break; 11141 case IORING_REGISTER_IOWQ_MAX_WORKERS: 11142 ret = -EINVAL; 11143 if (!arg || nr_args != 2) 11144 break; 11145 ret = io_register_iowq_max_workers(ctx, arg); 11146 break; 11147 default: 11148 ret = -EINVAL; 11149 break; 11150 } 11151 11152 if (io_register_op_must_quiesce(opcode)) { 11153 /* bring the ctx back to life */ 11154 percpu_ref_reinit(&ctx->refs); 11155 reinit_completion(&ctx->ref_comp); 11156 } 11157 return ret; 11158} 11159 11160SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, 11161 void __user *, arg, unsigned int, nr_args) 11162{ 11163 struct io_ring_ctx *ctx; 11164 long ret = -EBADF; 11165 struct fd f; 11166 11167 f = fdget(fd); 11168 if (!f.file) 11169 return -EBADF; 11170 11171 ret = -EOPNOTSUPP; 11172 if (f.file->f_op != &io_uring_fops) 11173 goto out_fput; 11174 11175 ctx = f.file->private_data; 11176 11177 io_run_task_work(); 11178 11179 mutex_lock(&ctx->uring_lock); 11180 ret = __io_uring_register(ctx, opcode, arg, nr_args); 11181 mutex_unlock(&ctx->uring_lock); 11182 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, 11183 ctx->cq_ev_fd != NULL, ret); 11184out_fput: 11185 fdput(f); 11186 return ret; 11187} 11188 11189static int __init io_uring_init(void) 11190{ 11191#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ 11192 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ 11193 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ 11194} while (0) 11195 11196#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ 11197 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename) 11198 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64); 11199 BUILD_BUG_SQE_ELEM(0, __u8, opcode); 11200 BUILD_BUG_SQE_ELEM(1, __u8, flags); 11201 BUILD_BUG_SQE_ELEM(2, __u16, ioprio); 11202 BUILD_BUG_SQE_ELEM(4, __s32, fd); 11203 BUILD_BUG_SQE_ELEM(8, __u64, off); 11204 BUILD_BUG_SQE_ELEM(8, __u64, addr2); 11205 BUILD_BUG_SQE_ELEM(16, __u64, addr); 11206 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); 11207 BUILD_BUG_SQE_ELEM(24, __u32, len); 11208 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); 11209 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); 11210 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); 11211 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); 11212 BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); 11213 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); 11214 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); 11215 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); 11216 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); 11217 
BUILD_BUG_SQE_ELEM(28, __u32, accept_flags); 11218 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags); 11219 BUILD_BUG_SQE_ELEM(28, __u32, open_flags); 11220 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); 11221 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); 11222 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); 11223 BUILD_BUG_SQE_ELEM(32, __u64, user_data); 11224 BUILD_BUG_SQE_ELEM(40, __u16, buf_index); 11225 BUILD_BUG_SQE_ELEM(40, __u16, buf_group); 11226 BUILD_BUG_SQE_ELEM(42, __u16, personality); 11227 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); 11228 BUILD_BUG_SQE_ELEM(44, __u32, file_index); 11229 11230 BUILD_BUG_ON(sizeof(struct io_uring_files_update) != 11231 sizeof(struct io_uring_rsrc_update)); 11232 BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > 11233 sizeof(struct io_uring_rsrc_update2)); 11234 11235 /* ->buf_index is u16 */ 11236 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); 11237 11238 /* should fit into one byte */ 11239 BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); 11240 BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); 11241 BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); 11242 11243 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); 11244 BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); 11245 11246 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | 11247 SLAB_ACCOUNT); 11248 return 0; 11249}; 11250__initcall(io_uring_init);
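/*
 * Hedged, illustrative userspace sketch only: nothing below is kernel code
 * or part of this file. It shows how the syscalls defined above fit together
 * for a single IORING_OP_NOP, with error handling and the memory barriers
 * described at the top of this file omitted; see liburing for a complete
 * implementation. It assumes <linux/io_uring.h>, <sys/mman.h> and the raw
 * __NR_io_uring_setup/__NR_io_uring_enter syscall numbers.
 *
 *	struct io_uring_params p = { 0 };
 *	int fd = syscall(__NR_io_uring_setup, 8, &p);
 *
 *	size_t sq_bytes = p.sq_off.array + p.sq_entries * sizeof(__u32);
 *	size_t cq_bytes = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
 *
 *	char *sq = mmap(NULL, sq_bytes, PROT_READ | PROT_WRITE,
 *			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
 *	char *cq = mmap(NULL, cq_bytes, PROT_READ | PROT_WRITE,
 *			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
 *	struct io_uring_sqe *sqes = mmap(NULL, p.sq_entries * sizeof(*sqes),
 *			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			fd, IORING_OFF_SQES);
 *
 *	unsigned *sq_tail = (unsigned *)(sq + p.sq_off.tail);
 *	unsigned *sq_array = (unsigned *)(sq + p.sq_off.array);
 *	unsigned sq_mask = *(unsigned *)(sq + p.sq_off.ring_mask);
 *	unsigned cq_mask = *(unsigned *)(cq + p.cq_off.ring_mask);
 *
 *	sqes[0].opcode = IORING_OP_NOP;
 *	sqes[0].user_data = 0x42;
 *	sq_array[*sq_tail & sq_mask] = 0;
 *	(*sq_tail)++;		// needs a release store in real code
 *
 *	syscall(__NR_io_uring_enter, fd, 1, 1, IORING_ENTER_GETEVENTS, NULL, 0);
 *
 *	struct io_uring_cqe *cqes = (void *)(cq + p.cq_off.cqes);
 *	unsigned *cq_head = (unsigned *)(cq + p.cq_off.head);
 *	// cqes[*cq_head & cq_mask].user_data == 0x42, .res == 0
 */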