// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqring (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
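
/*
 * A minimal userspace sketch of the submission-side ordering described in
 * the comment above, using C11 atomics in place of the kernel smp_*
 * macros. struct app_sq and its fields are illustrative placeholders for
 * the mmap'ed SQ ring fields, not the real liburing API; see liburing for
 * the canonical implementation. Not part of the kernel build.
 */
#if 0
#include <stdatomic.h>
#include <linux/io_uring.h>	/* uapi struct io_uring_sqe */

struct app_sq {
	_Atomic unsigned int *khead;	/* written by the kernel */
	_Atomic unsigned int *ktail;	/* written by the application */
	unsigned int *array;		/* sqe index array */
	struct io_uring_sqe *sqes;
	unsigned int ring_mask;
	unsigned int ring_entries;
};

static int app_submit_one(struct app_sq *sq, const struct io_uring_sqe *sqe)
{
	/* only the application writes the tail, so a relaxed load is fine */
	unsigned int tail = atomic_load_explicit(sq->ktail,
						 memory_order_relaxed);
	/* acquire-load of head orders the head load before the SQE stores */
	unsigned int head = atomic_load_explicit(sq->khead,
						 memory_order_acquire);

	if (tail - head == sq->ring_entries)
		return -1;			/* SQ ring is full */

	sq->sqes[tail & sq->ring_mask] = *sqe;
	sq->array[tail & sq->ring_mask] = tail & sq->ring_mask;
	/*
	 * Release-store of the tail publishes the entry: it orders the
	 * sqe/array stores before the tail update, pairing with the
	 * kernel's smp_load_acquire() of the SQ tail.
	 */
	atomic_store_explicit(sq->ktail, tail + 1, memory_order_release);
	return 0;
}
#endif
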
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "internal.h"
#include "io-wq.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

/*
 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
 */
#define IORING_FILE_TABLE_SHIFT	9
#define IORING_MAX_FILES_TABLE	(1U << IORING_FILE_TABLE_SHIFT)
#define IORING_FILE_TABLE_MASK	(IORING_MAX_FILES_TABLE - 1)
#define IORING_MAX_FIXED_FILES	(64 * IORING_MAX_FILES_TABLE)
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
				IOSQE_BUFFER_SELECT)

struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring		sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32			sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32			sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	u32			sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32			cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32			cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
};
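
/*
 * A matching userspace sketch of the completion side of the ring defined
 * above, again with C11 atomics standing in for the kernel barrier macros
 * and with illustrative struct/field names rather than the real liburing
 * API. Not part of the kernel build.
 */
#if 0
#include <stdatomic.h>
#include <linux/io_uring.h>	/* uapi struct io_uring_cqe */

struct app_cq {
	_Atomic unsigned int *khead;	/* written by the application */
	_Atomic unsigned int *ktail;	/* written by the kernel */
	struct io_uring_cqe *cqes;
	unsigned int ring_mask;
};

static unsigned int app_reap_cqes(struct app_cq *cq, struct io_uring_cqe *out,
				  unsigned int max)
{
	/* only the application writes the head, so a relaxed load is fine */
	unsigned int head = atomic_load_explicit(cq->khead,
						 memory_order_relaxed);
	/* acquire-load of the tail pairs with the kernel's tail update */
	unsigned int tail = atomic_load_explicit(cq->ktail,
						 memory_order_acquire);
	unsigned int nr = 0;

	while (head != tail && nr < max)
		out[nr++] = cq->cqes[head++ & cq->ring_mask];
	/*
	 * Release-store of the head orders the CQE loads before the head
	 * store, so the kernel only reuses slots that have been fully
	 * consumed.
	 */
	atomic_store_explicit(cq->khead, head, memory_order_release);
	return nr;
}
#endif
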
enum io_uring_cmd_flags {
	IO_URING_F_NONBLOCK		= 1,
	IO_URING_F_COMPLETE_DEFER	= 2,
};

struct io_mapped_ubuf {
	u64		ubuf;
	u64		ubuf_end;
	unsigned int	nr_bvecs;
	unsigned long	acct_pages;
	struct bio_vec	bvec[];
};

struct io_ring_ctx;

struct io_overflow_cqe {
	struct io_uring_cqe cqe;
	struct list_head list;
};

struct io_fixed_file {
	/* file * with additional FFS_* flags */
	unsigned long file_ptr;
};

struct io_rsrc_put {
	struct list_head list;
	u64 tag;
	union {
		void *rsrc;
		struct file *file;
		struct io_mapped_ubuf *buf;
	};
};

struct io_file_table {
	/* two level table */
	struct io_fixed_file **files;
};
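
/*
 * A hypothetical helper sketching the two-level lookup implied by
 * IORING_FILE_TABLE_SHIFT/MASK and struct io_file_table above: the top
 * level array holds pages of 512 slots, so slot i lives at
 * files[i >> 9][i & 511]. Illustrative only, not the accessor used later
 * in this file; not part of the build.
 */
#if 0
static struct io_fixed_file *example_fixed_file_slot(struct io_file_table *table,
						     unsigned int i)
{
	return &table->files[i >> IORING_FILE_TABLE_SHIFT]
			    [i & IORING_FILE_TABLE_MASK];
}
#endif
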
struct io_rsrc_node {
	struct percpu_ref		refs;
	struct list_head		node;
	struct list_head		rsrc_list;
	struct io_rsrc_data		*rsrc_data;
	struct llist_node		llist;
	bool				done;
};

typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);

struct io_rsrc_data {
	struct io_ring_ctx		*ctx;

	u64				*tags;
	rsrc_put_fn			*do_put;
	atomic_t			refs;
	struct completion		done;
	bool				quiesce;
};

struct io_buffer {
	struct list_head list;
	__u64 addr;
	__u32 len;
	__u16 bid;
};

struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
};

enum {
	IO_SQ_THREAD_SHOULD_STOP = 0,
	IO_SQ_THREAD_SHOULD_PARK,
};

struct io_sq_data {
	refcount_t		refs;
	atomic_t		park_pending;
	struct mutex		lock;

	/* ctx's that are using this sqd */
	struct list_head	ctx_list;

	struct task_struct	*thread;
	struct wait_queue_head	wait;

	unsigned		sq_thread_idle;
	int			sq_cpu;
	pid_t			task_pid;
	pid_t			task_tgid;

	unsigned long		state;
	struct completion	exited;
	struct callback_head	*park_task_work;
};

#define IO_IOPOLL_BATCH			8
#define IO_COMPL_BATCH			32
#define IO_REQ_CACHE_SIZE		32
#define IO_REQ_ALLOC_BATCH		8

struct io_comp_state {
	struct io_kiocb		*reqs[IO_COMPL_BATCH];
	unsigned int		nr;
	unsigned int		locked_free_nr;
	/* inline/task_work completion list, under ->uring_lock */
	struct list_head	free_list;
	/* IRQ completion list, under ->completion_lock */
	struct list_head	locked_free_list;
};

struct io_submit_link {
	struct io_kiocb		*head;
	struct io_kiocb		*last;
};

struct io_submit_state {
	struct blk_plug		plug;
	struct io_submit_link	link;

	/*
	 * io_kiocb alloc cache
	 */
	void			*reqs[IO_REQ_CACHE_SIZE];
	unsigned int		free_reqs;

	bool			plug_started;

	/*
	 * Batch completion logic
	 */
	struct io_comp_state	comp;

	/*
	 * File reference cache
	 */
	struct file		*file;
	unsigned int		fd;
	unsigned int		file_refs;
	unsigned int		ios_left;
};

struct io_ring_ctx {
	struct {
		struct percpu_ref	refs;
	} ____cacheline_aligned_in_smp;

	struct {
		unsigned int		flags;
		unsigned int		compat: 1;
		unsigned int		drain_next: 1;
		unsigned int		eventfd_async: 1;
		unsigned int		restricted: 1;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32			*sq_array;
		unsigned		cached_sq_head;
		unsigned		sq_entries;
		unsigned		sq_mask;
		unsigned		sq_thread_idle;
		unsigned		cached_sq_dropped;
		unsigned		cached_cq_overflow;
		unsigned long		sq_check_overflow;

		/* hashed buffered write serialization */
		struct io_wq_hash	*hash_map;

		struct list_head	defer_list;
		struct list_head	timeout_list;
		struct list_head	cq_overflow_list;

		struct io_uring_sqe	*sq_sqes;
	} ____cacheline_aligned_in_smp;

	struct {
		struct mutex		uring_lock;
		wait_queue_head_t	wait;
	} ____cacheline_aligned_in_smp;

	struct io_submit_state		submit_state;

	struct io_rings	*rings;

	/* Only used for accounting purposes */
	struct mm_struct	*mm_account;

	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
	struct io_sq_data	*sq_data;	/* if using sq thread polling */

	struct wait_queue_head	sqo_sq_wait;
	struct list_head	sqd_list;

	/*
	 * If used, fixed file set. Writers must ensure that ->refs is dead,
	 * readers must ensure that ->refs is alive as long as the file* is
	 * used. Only updated through io_uring_register(2).
	 */
	struct io_rsrc_data	*file_data;
	struct io_file_table	file_table;
	unsigned		nr_user_files;

	/* if used, fixed mapped user buffers */
	struct io_rsrc_data	*buf_data;
	unsigned		nr_user_bufs;
	struct io_mapped_ubuf	**user_bufs;

	struct user_struct	*user;

	struct completion	ref_comp;

#if defined(CONFIG_UNIX)
	struct socket		*ring_sock;
#endif

	struct xarray		io_buffers;

	struct xarray		personalities;
	u32			pers_next;

	struct {
		unsigned		cached_cq_tail;
		unsigned		cq_entries;
		unsigned		cq_mask;
		atomic_t		cq_timeouts;
		unsigned		cq_last_tm_flush;
		unsigned		cq_extra;
		unsigned long		cq_check_overflow;
		struct wait_queue_head	cq_wait;
		struct fasync_struct	*cq_fasync;
		struct eventfd_ctx	*cq_ev_fd;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t		completion_lock;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct list_head	iopoll_list;
		struct hlist_head	*cancel_hash;
		unsigned		cancel_hash_bits;
		bool			poll_multi_file;
	} ____cacheline_aligned_in_smp;

	struct delayed_work		rsrc_put_work;
	struct llist_head		rsrc_put_llist;
	struct list_head		rsrc_ref_list;
	spinlock_t			rsrc_ref_lock;
	struct io_rsrc_node		*rsrc_node;
	struct io_rsrc_node		*rsrc_backup_node;
	struct io_mapped_ubuf		*dummy_ubuf;

	struct io_restriction		restrictions;

	/* exit task_work */
	struct callback_head		*exit_task_work;

	/* Keep this last, we don't need it for the fast path */
	struct work_struct		exit_work;
	struct list_head		tctx_list;
};

struct io_uring_task {
	/* submission side */
	struct xarray		xa;
	struct wait_queue_head	wait;
	const struct io_ring_ctx *last;
	struct io_wq		*io_wq;
	struct percpu_counter	inflight;
	atomic_t		inflight_tracked;
	atomic_t		in_idle;

	spinlock_t		task_lock;
	struct io_wq_work_list	task_list;
	unsigned long		task_state;
	struct callback_head	task_work;
};

/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */
struct io_poll_iocb {
	struct file			*file;
	struct wait_queue_head		*head;
	__poll_t			events;
	bool				done;
	bool				canceled;
	struct wait_queue_entry		wait;
};

struct io_poll_update {
	struct file			*file;
	u64				old_user_data;
	u64				new_user_data;
	__poll_t			events;
	bool				update_events;
	bool				update_user_data;
};

struct io_close {
	struct file			*file;
	int				fd;
};

struct io_timeout_data {
	struct io_kiocb			*req;
	struct hrtimer			timer;
	struct timespec64		ts;
	enum hrtimer_mode		mode;
};

struct io_accept {
	struct file			*file;
	struct sockaddr __user		*addr;
	int __user			*addr_len;
	int				flags;
	unsigned long			nofile;
};

struct io_sync {
	struct file			*file;
	loff_t				len;
	loff_t				off;
	int				flags;
	int				mode;
};

struct io_cancel {
	struct file			*file;
	u64				addr;
};

struct io_timeout {
	struct file			*file;
	u32				off;
	u32				target_seq;
	struct list_head		list;
	/* head of the link, used by linked timeouts only */
	struct io_kiocb			*head;
};

struct io_timeout_rem {
	struct file			*file;
	u64				addr;

	/* timeout update */
	struct timespec64		ts;
	u32				flags;
};

struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u64				len;
};

struct io_connect {
	struct file			*file;
	struct sockaddr __user		*addr;
	int				addr_len;
};

struct io_sr_msg {
	struct file			*file;
	union {
		struct compat_msghdr __user	*umsg_compat;
		struct user_msghdr __user	*umsg;
		void __user			*buf;
	};
	int				msg_flags;
	int				bgid;
	size_t				len;
	struct io_buffer		*kbuf;
};

struct io_open {
	struct file			*file;
	int				dfd;
	struct filename			*filename;
	struct open_how			how;
	unsigned long			nofile;
};

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

struct io_fadvise {
	struct file			*file;
	u64				offset;
	u32				len;
	u32				advice;
};

struct io_madvise {
	struct file			*file;
	u64				addr;
	u32				len;
	u32				advice;
};

struct io_epoll {
	struct file			*file;
	int				epfd;
	int				op;
	int				fd;
	struct epoll_event		event;
};

struct io_splice {
	struct file			*file_out;
	struct file			*file_in;
	loff_t				off_out;
	loff_t				off_in;
	u64				len;
	unsigned int			flags;
};

struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};

struct io_statx {
	struct file			*file;
	int				dfd;
	unsigned int			mask;
	unsigned int			flags;
	const char __user		*filename;
	struct statx __user		*buffer;
};

struct io_shutdown {
	struct file			*file;
	int				how;
};

struct io_rename {
	struct file			*file;
	int				old_dfd;
	int				new_dfd;
	struct filename			*oldpath;
	struct filename			*newpath;
	int				flags;
};

struct io_unlink {
	struct file			*file;
	int				dfd;
	int				flags;
	struct filename			*filename;
};

struct io_completion {
	struct file			*file;
	struct list_head		list;
	u32				cflags;
};

struct io_async_connect {
	struct sockaddr_storage		address;
};

struct io_async_msghdr {
	struct iovec			fast_iov[UIO_FASTIOV];
	/* points to an allocated iov, if NULL we use fast_iov instead */
	struct iovec			*free_iov;
	struct sockaddr __user		*uaddr;
	struct msghdr			msg;
	struct sockaddr_storage		addr;
};

struct io_async_rw {
	struct iovec			fast_iov[UIO_FASTIOV];
	const struct iovec		*free_iovec;
	struct iov_iter			iter;
	size_t				bytes_done;
	struct wait_page_queue		wpq;
};

enum {
	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,

	/* first byte is taken by user flags, shift it to not overlap */
	REQ_F_FAIL_LINK_BIT	= 8,
	REQ_F_INFLIGHT_BIT,
	REQ_F_CUR_POS_BIT,
	REQ_F_NOWAIT_BIT,
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_POLLED_BIT,
	REQ_F_BUFFER_SELECTED_BIT,
	REQ_F_LTIMEOUT_ACTIVE_BIT,
	REQ_F_COMPLETE_INLINE_BIT,
	REQ_F_REISSUE_BIT,
	REQ_F_DONT_REISSUE_BIT,
	/* keep async read/write and isreg together and in order */
	REQ_F_ASYNC_READ_BIT,
	REQ_F_ASYNC_WRITE_BIT,
	REQ_F_ISREG_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
};

enum {
	/* ctx owns file */
	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
	/* IOSQE_BUFFER_SELECT */
	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),

	/* fail rest of links */
	REQ_F_FAIL_LINK		= BIT(REQ_F_FAIL_LINK_BIT),
	/* on inflight list, should be cancelled and waited on exit reliably */
	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
	/* has or had linked timeout */
	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
	/* already went through poll handler */
	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
	/* linked timeout is active, i.e. prepared by link's head */
	REQ_F_LTIMEOUT_ACTIVE	= BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
	/* completion is deferred through io_comp_state */
	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),
	/* caller should reissue async */
	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
	/* don't attempt request reissue, see io_rw_reissue() */
	REQ_F_DONT_REISSUE	= BIT(REQ_F_DONT_REISSUE_BIT),
	/* supports async reads */
	REQ_F_ASYNC_READ	= BIT(REQ_F_ASYNC_READ_BIT),
	/* supports async writes */
	REQ_F_ASYNC_WRITE	= BIT(REQ_F_ASYNC_WRITE_BIT),
	/* regular file */
	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
};

struct async_poll {
	struct io_poll_iocb	poll;
	struct io_poll_iocb	*double_poll;
};

struct io_task_work {
	struct io_wq_work_node	node;
	task_work_func_t	func;
};

enum {
	IORING_RSRC_FILE		= 0,
	IORING_RSRC_BUFFER		= 1,
};

/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'ki_filp' in this struct.
 */
struct io_kiocb {
	union {
		struct file		*file;
		struct io_rw		rw;
		struct io_poll_iocb	poll;
		struct io_poll_update	poll_update;
		struct io_accept	accept;
		struct io_sync		sync;
		struct io_cancel	cancel;
		struct io_timeout	timeout;
		struct io_timeout_rem	timeout_rem;
		struct io_connect	connect;
		struct io_sr_msg	sr_msg;
		struct io_open		open;
		struct io_close		close;
		struct io_rsrc_update	rsrc_update;
		struct io_fadvise	fadvise;
		struct io_madvise	madvise;
		struct io_epoll		epoll;
		struct io_splice	splice;
		struct io_provide_buf	pbuf;
		struct io_statx		statx;
		struct io_shutdown	shutdown;
		struct io_rename	rename;
		struct io_unlink	unlink;
		/* use only after cleaning per-op data, see io_clean_op() */
		struct io_completion	compl;
	};

	/* opcode allocated if it needs to store data for async defer */
	void				*async_data;
	u8				opcode;
	/* polled IO has completed */
	u8				iopoll_completed;

	u16				buf_index;
	u32				result;

	struct io_ring_ctx		*ctx;
	unsigned int			flags;
	atomic_t			refs;
	struct task_struct		*task;
	u64				user_data;

	struct io_kiocb			*link;
	struct percpu_ref		*fixed_rsrc_refs;

	/* used with ctx->iopoll_list with reads/writes */
	struct list_head		inflight_entry;
	union {
		struct io_task_work	io_task_work;
		struct callback_head	task_work;
	};
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	struct hlist_node		hash_node;
	struct async_poll		*apoll;
	struct io_wq_work		work;
	/* store used ubuf, so we can prevent reloading */
	struct io_mapped_ubuf		*imu;
};

struct io_tctx_node {
	struct list_head	ctx_node;
	struct task_struct	*task;
	struct io_ring_ctx	*ctx;
};

struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};

struct io_op_def {
	/* needs req->file assigned */
	unsigned		needs_file : 1;
	/* hash wq insertion if file is a regular file */
	unsigned		hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned		unbound_nonreg_file : 1;
	/* opcode is not supported by this kernel */
	unsigned		not_supported : 1;
	/* set if opcode supports polled "wait" */
	unsigned		pollin : 1;
	unsigned		pollout : 1;
	/* op supports buffer selection */
	unsigned		buffer_select : 1;
	/* do prep async if is going to be punted */
	unsigned		needs_async_setup : 1;
	/* should block plug */
	unsigned		plug : 1;
	/* size of async data needed, if any */
	unsigned short		async_size;
};

static const struct io_op_def io_op_defs[] = {
	[IORING_OP_NOP] = {},
	[IORING_OP_READV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_setup	= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITEV] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FSYNC] = {
		.needs_file		= 1,
	},
	[IORING_OP_READ_FIXED] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE_FIXED] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_POLL_ADD] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_POLL_REMOVE] = {},
	[IORING_OP_SYNC_FILE_RANGE] = {
		.needs_file		= 1,
	},
	[IORING_OP_SENDMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_RECVMSG] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_msghdr),
	},
	[IORING_OP_TIMEOUT] = {
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_TIMEOUT_REMOVE] = {
		/* used by timeout updates' prep() */
	},
	[IORING_OP_ACCEPT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
	},
	[IORING_OP_ASYNC_CANCEL] = {},
	[IORING_OP_LINK_TIMEOUT] = {
		.async_size		= sizeof(struct io_timeout_data),
	},
	[IORING_OP_CONNECT] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.needs_async_setup	= 1,
		.async_size		= sizeof(struct io_async_connect),
	},
	[IORING_OP_FALLOCATE] = {
		.needs_file		= 1,
	},
	[IORING_OP_OPENAT] = {},
	[IORING_OP_CLOSE] = {},
	[IORING_OP_FILES_UPDATE] = {},
	[IORING_OP_STATX] = {},
	[IORING_OP_READ] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
		.plug			= 1,
		.async_size		= sizeof(struct io_async_rw),
	},
	[IORING_OP_FADVISE] = {
		.needs_file		= 1,
	},
	[IORING_OP_MADVISE] = {},
	[IORING_OP_SEND] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollout		= 1,
	},
	[IORING_OP_RECV] = {
		.needs_file		= 1,
		.unbound_nonreg_file	= 1,
		.pollin			= 1,
		.buffer_select		= 1,
	},
	[IORING_OP_OPENAT2] = {
	},
	[IORING_OP_EPOLL_CTL] = {
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_SPLICE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_PROVIDE_BUFFERS] = {},
	[IORING_OP_REMOVE_BUFFERS] = {},
	[IORING_OP_TEE] = {
		.needs_file		= 1,
		.hash_reg_file		= 1,
		.unbound_nonreg_file	= 1,
	},
	[IORING_OP_SHUTDOWN] = {
		.needs_file		= 1,
	},
	[IORING_OP_RENAMEAT] = {},
	[IORING_OP_UNLINKAT] = {},
};

static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_task_file(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 struct files_struct *files);
static void io_uring_cancel_sqpoll(struct io_sq_data *sqd);
static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);

static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
				 long res, unsigned int cflags);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req, int nr);
static void io_dismantle_req(struct io_kiocb *req);
static void io_put_task(struct task_struct *task, int nr);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_submit_state *state,
				struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static void io_req_task_queue(struct io_kiocb *req);
static void io_submit_flush_completions(struct io_comp_state *cs,
					struct io_ring_ctx *ctx);
static bool io_poll_remove_waitqs(struct io_kiocb *req);
static int io_req_prep_async(struct io_kiocb *req);

static struct kmem_cache *req_cachep;

static const struct file_operations io_uring_fops;

struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
	if (file->f_op == &io_uring_fops) {
		struct io_ring_ctx *ctx = file->private_data;

		return ctx->ring_sock->sk;
	}
#endif
	return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);

#define io_for_each_link(pos, head) \
	for (pos = (head); pos; pos = pos->link)

static inline void io_req_set_rsrc_node(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (!req->fixed_rsrc_refs) {
		req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
		percpu_ref_get(req->fixed_rsrc_refs);
	}
}

static void io_refs_resurrect(struct percpu_ref *ref, struct
completion *compl) 1098{ 1099 bool got = percpu_ref_tryget(ref); 1100 1101 /* already at zero, wait for ->release() */ 1102 if (!got) 1103 wait_for_completion(compl); 1104 percpu_ref_resurrect(ref); 1105 if (got) 1106 percpu_ref_put(ref); 1107} 1108 1109static bool io_match_task(struct io_kiocb *head, 1110 struct task_struct *task, 1111 struct files_struct *files) 1112{ 1113 struct io_kiocb *req; 1114 1115 if (task && head->task != task) 1116 return false; 1117 if (!files) 1118 return true; 1119 1120 io_for_each_link(req, head) { 1121 if (req->flags & REQ_F_INFLIGHT) 1122 return true; 1123 } 1124 return false; 1125} 1126 1127static inline void req_set_fail_links(struct io_kiocb *req) 1128{ 1129 if (req->flags & REQ_F_LINK) 1130 req->flags |= REQ_F_FAIL_LINK; 1131} 1132 1133static void io_ring_ctx_ref_free(struct percpu_ref *ref) 1134{ 1135 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); 1136 1137 complete(&ctx->ref_comp); 1138} 1139 1140static inline bool io_is_timeout_noseq(struct io_kiocb *req) 1141{ 1142 return !req->timeout.off; 1143} 1144 1145static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) 1146{ 1147 struct io_ring_ctx *ctx; 1148 int hash_bits; 1149 1150 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1151 if (!ctx) 1152 return NULL; 1153 1154 /* 1155 * Use 5 bits less than the max cq entries, that should give us around 1156 * 32 entries per hash list if totally full and uniformly spread. 1157 */ 1158 hash_bits = ilog2(p->cq_entries); 1159 hash_bits -= 5; 1160 if (hash_bits <= 0) 1161 hash_bits = 1; 1162 ctx->cancel_hash_bits = hash_bits; 1163 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), 1164 GFP_KERNEL); 1165 if (!ctx->cancel_hash) 1166 goto err; 1167 __hash_init(ctx->cancel_hash, 1U << hash_bits); 1168 1169 ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); 1170 if (!ctx->dummy_ubuf) 1171 goto err; 1172 /* set invalid range, so io_import_fixed() fails meeting it */ 1173 ctx->dummy_ubuf->ubuf = -1UL; 1174 1175 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 1176 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 1177 goto err; 1178 1179 ctx->flags = p->flags; 1180 init_waitqueue_head(&ctx->sqo_sq_wait); 1181 INIT_LIST_HEAD(&ctx->sqd_list); 1182 init_waitqueue_head(&ctx->cq_wait); 1183 INIT_LIST_HEAD(&ctx->cq_overflow_list); 1184 init_completion(&ctx->ref_comp); 1185 xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); 1186 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); 1187 mutex_init(&ctx->uring_lock); 1188 init_waitqueue_head(&ctx->wait); 1189 spin_lock_init(&ctx->completion_lock); 1190 INIT_LIST_HEAD(&ctx->iopoll_list); 1191 INIT_LIST_HEAD(&ctx->defer_list); 1192 INIT_LIST_HEAD(&ctx->timeout_list); 1193 spin_lock_init(&ctx->rsrc_ref_lock); 1194 INIT_LIST_HEAD(&ctx->rsrc_ref_list); 1195 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); 1196 init_llist_head(&ctx->rsrc_put_llist); 1197 INIT_LIST_HEAD(&ctx->tctx_list); 1198 INIT_LIST_HEAD(&ctx->submit_state.comp.free_list); 1199 INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list); 1200 return ctx; 1201err: 1202 kfree(ctx->dummy_ubuf); 1203 kfree(ctx->cancel_hash); 1204 kfree(ctx); 1205 return NULL; 1206} 1207 1208static bool req_need_defer(struct io_kiocb *req, u32 seq) 1209{ 1210 if (unlikely(req->flags & REQ_F_IO_DRAIN)) { 1211 struct io_ring_ctx *ctx = req->ctx; 1212 1213 return seq + ctx->cq_extra != ctx->cached_cq_tail 1214 + READ_ONCE(ctx->cached_cq_overflow); 1215 } 1216 1217 return false; 1218} 1219 1220static void 
io_req_track_inflight(struct io_kiocb *req) 1221{ 1222 if (!(req->flags & REQ_F_INFLIGHT)) { 1223 req->flags |= REQ_F_INFLIGHT; 1224 atomic_inc(&current->io_uring->inflight_tracked); 1225 } 1226} 1227 1228static void io_prep_async_work(struct io_kiocb *req) 1229{ 1230 const struct io_op_def *def = &io_op_defs[req->opcode]; 1231 struct io_ring_ctx *ctx = req->ctx; 1232 1233 if (!req->work.creds) 1234 req->work.creds = get_current_cred(); 1235 1236 req->work.list.next = NULL; 1237 req->work.flags = 0; 1238 if (req->flags & REQ_F_FORCE_ASYNC) 1239 req->work.flags |= IO_WQ_WORK_CONCURRENT; 1240 1241 if (req->flags & REQ_F_ISREG) { 1242 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) 1243 io_wq_hash_work(&req->work, file_inode(req->file)); 1244 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { 1245 if (def->unbound_nonreg_file) 1246 req->work.flags |= IO_WQ_WORK_UNBOUND; 1247 } 1248 1249 switch (req->opcode) { 1250 case IORING_OP_SPLICE: 1251 case IORING_OP_TEE: 1252 if (!S_ISREG(file_inode(req->splice.file_in)->i_mode)) 1253 req->work.flags |= IO_WQ_WORK_UNBOUND; 1254 break; 1255 } 1256} 1257 1258static void io_prep_async_link(struct io_kiocb *req) 1259{ 1260 struct io_kiocb *cur; 1261 1262 io_for_each_link(cur, req) 1263 io_prep_async_work(cur); 1264} 1265 1266static void io_queue_async_work(struct io_kiocb *req) 1267{ 1268 struct io_ring_ctx *ctx = req->ctx; 1269 struct io_kiocb *link = io_prep_linked_timeout(req); 1270 struct io_uring_task *tctx = req->task->io_uring; 1271 1272 BUG_ON(!tctx); 1273 BUG_ON(!tctx->io_wq); 1274 1275 /* init ->work of the whole link before punting */ 1276 io_prep_async_link(req); 1277 trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, 1278 &req->work, req->flags); 1279 io_wq_enqueue(tctx->io_wq, &req->work); 1280 if (link) 1281 io_queue_linked_timeout(link); 1282} 1283 1284static void io_kill_timeout(struct io_kiocb *req, int status) 1285 __must_hold(&req->ctx->completion_lock) 1286{ 1287 struct io_timeout_data *io = req->async_data; 1288 1289 if (hrtimer_try_to_cancel(&io->timer) != -1) { 1290 atomic_set(&req->ctx->cq_timeouts, 1291 atomic_read(&req->ctx->cq_timeouts) + 1); 1292 list_del_init(&req->timeout.list); 1293 io_cqring_fill_event(req->ctx, req->user_data, status, 0); 1294 io_put_req_deferred(req, 1); 1295 } 1296} 1297 1298static void __io_queue_deferred(struct io_ring_ctx *ctx) 1299{ 1300 do { 1301 struct io_defer_entry *de = list_first_entry(&ctx->defer_list, 1302 struct io_defer_entry, list); 1303 1304 if (req_need_defer(de->req, de->seq)) 1305 break; 1306 list_del_init(&de->list); 1307 io_req_task_queue(de->req); 1308 kfree(de); 1309 } while (!list_empty(&ctx->defer_list)); 1310} 1311 1312static void io_flush_timeouts(struct io_ring_ctx *ctx) 1313{ 1314 u32 seq; 1315 1316 if (list_empty(&ctx->timeout_list)) 1317 return; 1318 1319 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 1320 1321 do { 1322 u32 events_needed, events_got; 1323 struct io_kiocb *req = list_first_entry(&ctx->timeout_list, 1324 struct io_kiocb, timeout.list); 1325 1326 if (io_is_timeout_noseq(req)) 1327 break; 1328 1329 /* 1330 * Since seq can easily wrap around over time, subtract 1331 * the last seq at which timeouts were flushed before comparing. 1332 * Assuming not more than 2^31-1 events have happened since, 1333 * these subtractions won't have wrapped, so we can check if 1334 * target is in [last_seq, current_seq] by comparing the two. 
1335 */ 1336 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush; 1337 events_got = seq - ctx->cq_last_tm_flush; 1338 if (events_got < events_needed) 1339 break; 1340 1341 list_del_init(&req->timeout.list); 1342 io_kill_timeout(req, 0); 1343 } while (!list_empty(&ctx->timeout_list)); 1344 1345 ctx->cq_last_tm_flush = seq; 1346} 1347 1348static void io_commit_cqring(struct io_ring_ctx *ctx) 1349{ 1350 io_flush_timeouts(ctx); 1351 1352 /* order cqe stores with ring update */ 1353 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); 1354 1355 if (unlikely(!list_empty(&ctx->defer_list))) 1356 __io_queue_deferred(ctx); 1357} 1358 1359static inline bool io_sqring_full(struct io_ring_ctx *ctx) 1360{ 1361 struct io_rings *r = ctx->rings; 1362 1363 return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries; 1364} 1365 1366static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) 1367{ 1368 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); 1369} 1370 1371static inline struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) 1372{ 1373 struct io_rings *rings = ctx->rings; 1374 unsigned tail; 1375 1376 /* 1377 * writes to the cq entry need to come after reading head; the 1378 * control dependency is enough as we're using WRITE_ONCE to 1379 * fill the cq entry 1380 */ 1381 if (__io_cqring_events(ctx) == rings->cq_ring_entries) 1382 return NULL; 1383 1384 tail = ctx->cached_cq_tail++; 1385 return &rings->cqes[tail & ctx->cq_mask]; 1386} 1387 1388static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) 1389{ 1390 if (likely(!ctx->cq_ev_fd)) 1391 return false; 1392 if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) 1393 return false; 1394 return !ctx->eventfd_async || io_wq_current_is_worker(); 1395} 1396 1397static void io_cqring_ev_posted(struct io_ring_ctx *ctx) 1398{ 1399 /* see waitqueue_active() comment */ 1400 smp_mb(); 1401 1402 if (waitqueue_active(&ctx->wait)) 1403 wake_up(&ctx->wait); 1404 if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) 1405 wake_up(&ctx->sq_data->wait); 1406 if (io_should_trigger_evfd(ctx)) 1407 eventfd_signal(ctx->cq_ev_fd, 1); 1408 if (waitqueue_active(&ctx->cq_wait)) { 1409 wake_up_interruptible(&ctx->cq_wait); 1410 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); 1411 } 1412} 1413 1414static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) 1415{ 1416 /* see waitqueue_active() comment */ 1417 smp_mb(); 1418 1419 if (ctx->flags & IORING_SETUP_SQPOLL) { 1420 if (waitqueue_active(&ctx->wait)) 1421 wake_up(&ctx->wait); 1422 } 1423 if (io_should_trigger_evfd(ctx)) 1424 eventfd_signal(ctx->cq_ev_fd, 1); 1425 if (waitqueue_active(&ctx->cq_wait)) { 1426 wake_up_interruptible(&ctx->cq_wait); 1427 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); 1428 } 1429} 1430 1431/* Returns true if there are no backlogged entries after the flush */ 1432static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) 1433{ 1434 struct io_rings *rings = ctx->rings; 1435 unsigned long flags; 1436 bool all_flushed, posted; 1437 1438 if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries) 1439 return false; 1440 1441 posted = false; 1442 spin_lock_irqsave(&ctx->completion_lock, flags); 1443 while (!list_empty(&ctx->cq_overflow_list)) { 1444 struct io_uring_cqe *cqe = io_get_cqring(ctx); 1445 struct io_overflow_cqe *ocqe; 1446 1447 if (!cqe && !force) 1448 break; 1449 ocqe = list_first_entry(&ctx->cq_overflow_list, 1450 struct io_overflow_cqe, list); 1451 if (cqe) 1452 memcpy(cqe, 
&ocqe->cqe, sizeof(*cqe)); 1453 else 1454 WRITE_ONCE(ctx->rings->cq_overflow, 1455 ++ctx->cached_cq_overflow); 1456 posted = true; 1457 list_del(&ocqe->list); 1458 kfree(ocqe); 1459 } 1460 1461 all_flushed = list_empty(&ctx->cq_overflow_list); 1462 if (all_flushed) { 1463 clear_bit(0, &ctx->sq_check_overflow); 1464 clear_bit(0, &ctx->cq_check_overflow); 1465 ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; 1466 } 1467 1468 if (posted) 1469 io_commit_cqring(ctx); 1470 spin_unlock_irqrestore(&ctx->completion_lock, flags); 1471 if (posted) 1472 io_cqring_ev_posted(ctx); 1473 return all_flushed; 1474} 1475 1476static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) 1477{ 1478 bool ret = true; 1479 1480 if (test_bit(0, &ctx->cq_check_overflow)) { 1481 /* iopoll syncs against uring_lock, not completion_lock */ 1482 if (ctx->flags & IORING_SETUP_IOPOLL) 1483 mutex_lock(&ctx->uring_lock); 1484 ret = __io_cqring_overflow_flush(ctx, force); 1485 if (ctx->flags & IORING_SETUP_IOPOLL) 1486 mutex_unlock(&ctx->uring_lock); 1487 } 1488 1489 return ret; 1490} 1491 1492/* 1493 * Shamelessly stolen from the mm implementation of page reference checking, 1494 * see commit f958d7b528b1 for details. 1495 */ 1496#define req_ref_zero_or_close_to_overflow(req) \ 1497 ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) 1498 1499static inline bool req_ref_inc_not_zero(struct io_kiocb *req) 1500{ 1501 return atomic_inc_not_zero(&req->refs); 1502} 1503 1504static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs) 1505{ 1506 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 1507 return atomic_sub_and_test(refs, &req->refs); 1508} 1509 1510static inline bool req_ref_put_and_test(struct io_kiocb *req) 1511{ 1512 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 1513 return atomic_dec_and_test(&req->refs); 1514} 1515 1516static inline void req_ref_put(struct io_kiocb *req) 1517{ 1518 WARN_ON_ONCE(req_ref_put_and_test(req)); 1519} 1520 1521static inline void req_ref_get(struct io_kiocb *req) 1522{ 1523 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 1524 atomic_inc(&req->refs); 1525} 1526 1527static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, 1528 long res, unsigned int cflags) 1529{ 1530 struct io_overflow_cqe *ocqe; 1531 1532 ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); 1533 if (!ocqe) { 1534 /* 1535 * If we're in ring overflow flush mode, or in task cancel mode, 1536 * or cannot allocate an overflow entry, then we need to drop it 1537 * on the floor. 1538 */ 1539 WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow); 1540 return false; 1541 } 1542 if (list_empty(&ctx->cq_overflow_list)) { 1543 set_bit(0, &ctx->sq_check_overflow); 1544 set_bit(0, &ctx->cq_check_overflow); 1545 ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; 1546 } 1547 ocqe->cqe.user_data = user_data; 1548 ocqe->cqe.res = res; 1549 ocqe->cqe.flags = cflags; 1550 list_add_tail(&ocqe->list, &ctx->cq_overflow_list); 1551 return true; 1552} 1553 1554static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, 1555 long res, unsigned int cflags) 1556{ 1557 struct io_uring_cqe *cqe; 1558 1559 trace_io_uring_complete(ctx, user_data, res, cflags); 1560 1561 /* 1562 * If we can't get a cq entry, userspace overflowed the 1563 * submission (by quite a lot). Increment the overflow count in 1564 * the ring. 
1565 */ 1566 cqe = io_get_cqring(ctx); 1567 if (likely(cqe)) { 1568 WRITE_ONCE(cqe->user_data, user_data); 1569 WRITE_ONCE(cqe->res, res); 1570 WRITE_ONCE(cqe->flags, cflags); 1571 return true; 1572 } 1573 return io_cqring_event_overflow(ctx, user_data, res, cflags); 1574} 1575 1576/* not as hot to bloat with inlining */ 1577static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, 1578 long res, unsigned int cflags) 1579{ 1580 return __io_cqring_fill_event(ctx, user_data, res, cflags); 1581} 1582 1583static void io_req_complete_post(struct io_kiocb *req, long res, 1584 unsigned int cflags) 1585{ 1586 struct io_ring_ctx *ctx = req->ctx; 1587 unsigned long flags; 1588 1589 spin_lock_irqsave(&ctx->completion_lock, flags); 1590 __io_cqring_fill_event(ctx, req->user_data, res, cflags); 1591 /* 1592 * If we're the last reference to this request, add to our locked 1593 * free_list cache. 1594 */ 1595 if (req_ref_put_and_test(req)) { 1596 struct io_comp_state *cs = &ctx->submit_state.comp; 1597 1598 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 1599 if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) 1600 io_disarm_next(req); 1601 if (req->link) { 1602 io_req_task_queue(req->link); 1603 req->link = NULL; 1604 } 1605 } 1606 io_dismantle_req(req); 1607 io_put_task(req->task, 1); 1608 list_add(&req->compl.list, &cs->locked_free_list); 1609 cs->locked_free_nr++; 1610 } else { 1611 if (!percpu_ref_tryget(&ctx->refs)) 1612 req = NULL; 1613 } 1614 io_commit_cqring(ctx); 1615 spin_unlock_irqrestore(&ctx->completion_lock, flags); 1616 1617 if (req) { 1618 io_cqring_ev_posted(ctx); 1619 percpu_ref_put(&ctx->refs); 1620 } 1621} 1622 1623static inline bool io_req_needs_clean(struct io_kiocb *req) 1624{ 1625 return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | 1626 REQ_F_POLLED | REQ_F_INFLIGHT); 1627} 1628 1629static void io_req_complete_state(struct io_kiocb *req, long res, 1630 unsigned int cflags) 1631{ 1632 if (io_req_needs_clean(req)) 1633 io_clean_op(req); 1634 req->result = res; 1635 req->compl.cflags = cflags; 1636 req->flags |= REQ_F_COMPLETE_INLINE; 1637} 1638 1639static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, 1640 long res, unsigned cflags) 1641{ 1642 if (issue_flags & IO_URING_F_COMPLETE_DEFER) 1643 io_req_complete_state(req, res, cflags); 1644 else 1645 io_req_complete_post(req, res, cflags); 1646} 1647 1648static inline void io_req_complete(struct io_kiocb *req, long res) 1649{ 1650 __io_req_complete(req, 0, res, 0); 1651} 1652 1653static void io_req_complete_failed(struct io_kiocb *req, long res) 1654{ 1655 req_set_fail_links(req); 1656 io_put_req(req); 1657 io_req_complete_post(req, res, 0); 1658} 1659 1660static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, 1661 struct io_comp_state *cs) 1662{ 1663 spin_lock_irq(&ctx->completion_lock); 1664 list_splice_init(&cs->locked_free_list, &cs->free_list); 1665 cs->locked_free_nr = 0; 1666 spin_unlock_irq(&ctx->completion_lock); 1667} 1668 1669/* Returns true IFF there are requests in the cache */ 1670static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) 1671{ 1672 struct io_submit_state *state = &ctx->submit_state; 1673 struct io_comp_state *cs = &state->comp; 1674 int nr; 1675 1676 /* 1677 * If we have more than a batch's worth of requests in our IRQ side 1678 * locked cache, grab the lock and move them over to our submission 1679 * side cache. 
1680 */ 1681 if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) 1682 io_flush_cached_locked_reqs(ctx, cs); 1683 1684 nr = state->free_reqs; 1685 while (!list_empty(&cs->free_list)) { 1686 struct io_kiocb *req = list_first_entry(&cs->free_list, 1687 struct io_kiocb, compl.list); 1688 1689 list_del(&req->compl.list); 1690 state->reqs[nr++] = req; 1691 if (nr == ARRAY_SIZE(state->reqs)) 1692 break; 1693 } 1694 1695 state->free_reqs = nr; 1696 return nr != 0; 1697} 1698 1699static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) 1700{ 1701 struct io_submit_state *state = &ctx->submit_state; 1702 1703 BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs)); 1704 1705 if (!state->free_reqs) { 1706 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 1707 int ret; 1708 1709 if (io_flush_cached_reqs(ctx)) 1710 goto got_req; 1711 1712 ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, 1713 state->reqs); 1714 1715 /* 1716 * Bulk alloc is all-or-nothing. If we fail to get a batch, 1717 * retry single alloc to be on the safe side. 1718 */ 1719 if (unlikely(ret <= 0)) { 1720 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); 1721 if (!state->reqs[0]) 1722 return NULL; 1723 ret = 1; 1724 } 1725 state->free_reqs = ret; 1726 } 1727got_req: 1728 state->free_reqs--; 1729 return state->reqs[state->free_reqs]; 1730} 1731 1732static inline void io_put_file(struct file *file) 1733{ 1734 if (file) 1735 fput(file); 1736} 1737 1738static void io_dismantle_req(struct io_kiocb *req) 1739{ 1740 unsigned int flags = req->flags; 1741 1742 if (io_req_needs_clean(req)) 1743 io_clean_op(req); 1744 if (!(flags & REQ_F_FIXED_FILE)) 1745 io_put_file(req->file); 1746 if (req->fixed_rsrc_refs) 1747 percpu_ref_put(req->fixed_rsrc_refs); 1748 if (req->async_data) 1749 kfree(req->async_data); 1750 if (req->work.creds) { 1751 put_cred(req->work.creds); 1752 req->work.creds = NULL; 1753 } 1754} 1755 1756/* must to be called somewhat shortly after putting a request */ 1757static inline void io_put_task(struct task_struct *task, int nr) 1758{ 1759 struct io_uring_task *tctx = task->io_uring; 1760 1761 percpu_counter_sub(&tctx->inflight, nr); 1762 if (unlikely(atomic_read(&tctx->in_idle))) 1763 wake_up(&tctx->wait); 1764 put_task_struct_many(task, nr); 1765} 1766 1767static void __io_free_req(struct io_kiocb *req) 1768{ 1769 struct io_ring_ctx *ctx = req->ctx; 1770 1771 io_dismantle_req(req); 1772 io_put_task(req->task, 1); 1773 1774 kmem_cache_free(req_cachep, req); 1775 percpu_ref_put(&ctx->refs); 1776} 1777 1778static inline void io_remove_next_linked(struct io_kiocb *req) 1779{ 1780 struct io_kiocb *nxt = req->link; 1781 1782 req->link = nxt->link; 1783 nxt->link = NULL; 1784} 1785 1786static bool io_kill_linked_timeout(struct io_kiocb *req) 1787 __must_hold(&req->ctx->completion_lock) 1788{ 1789 struct io_kiocb *link = req->link; 1790 1791 /* 1792 * Can happen if a linked timeout fired and link had been like 1793 * req -> link t-out -> link t-out [-> ...] 
1794 */ 1795 if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) { 1796 struct io_timeout_data *io = link->async_data; 1797 1798 io_remove_next_linked(req); 1799 link->timeout.head = NULL; 1800 if (hrtimer_try_to_cancel(&io->timer) != -1) { 1801 io_cqring_fill_event(link->ctx, link->user_data, 1802 -ECANCELED, 0); 1803 io_put_req_deferred(link, 1); 1804 return true; 1805 } 1806 } 1807 return false; 1808} 1809 1810static void io_fail_links(struct io_kiocb *req) 1811 __must_hold(&req->ctx->completion_lock) 1812{ 1813 struct io_kiocb *nxt, *link = req->link; 1814 1815 req->link = NULL; 1816 while (link) { 1817 nxt = link->link; 1818 link->link = NULL; 1819 1820 trace_io_uring_fail_link(req, link); 1821 io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); 1822 io_put_req_deferred(link, 2); 1823 link = nxt; 1824 } 1825} 1826 1827static bool io_disarm_next(struct io_kiocb *req) 1828 __must_hold(&req->ctx->completion_lock) 1829{ 1830 bool posted = false; 1831 1832 if (likely(req->flags & REQ_F_LINK_TIMEOUT)) 1833 posted = io_kill_linked_timeout(req); 1834 if (unlikely((req->flags & REQ_F_FAIL_LINK) && 1835 !(req->flags & REQ_F_HARDLINK))) { 1836 posted |= (req->link != NULL); 1837 io_fail_links(req); 1838 } 1839 return posted; 1840} 1841 1842static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) 1843{ 1844 struct io_kiocb *nxt; 1845 1846 /* 1847 * If LINK is set, we have dependent requests in this chain. If we 1848 * didn't fail this request, queue the first one up, moving any other 1849 * dependencies to the next request. In case of failure, fail the rest 1850 * of the chain. 1851 */ 1852 if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) { 1853 struct io_ring_ctx *ctx = req->ctx; 1854 unsigned long flags; 1855 bool posted; 1856 1857 spin_lock_irqsave(&ctx->completion_lock, flags); 1858 posted = io_disarm_next(req); 1859 if (posted) 1860 io_commit_cqring(req->ctx); 1861 spin_unlock_irqrestore(&ctx->completion_lock, flags); 1862 if (posted) 1863 io_cqring_ev_posted(ctx); 1864 } 1865 nxt = req->link; 1866 req->link = NULL; 1867 return nxt; 1868} 1869 1870static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) 1871{ 1872 if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) 1873 return NULL; 1874 return __io_req_find_next(req); 1875} 1876 1877static void ctx_flush_and_put(struct io_ring_ctx *ctx) 1878{ 1879 if (!ctx) 1880 return; 1881 if (ctx->submit_state.comp.nr) { 1882 mutex_lock(&ctx->uring_lock); 1883 io_submit_flush_completions(&ctx->submit_state.comp, ctx); 1884 mutex_unlock(&ctx->uring_lock); 1885 } 1886 percpu_ref_put(&ctx->refs); 1887} 1888 1889static bool __tctx_task_work(struct io_uring_task *tctx) 1890{ 1891 struct io_ring_ctx *ctx = NULL; 1892 struct io_wq_work_list list; 1893 struct io_wq_work_node *node; 1894 1895 if (wq_list_empty(&tctx->task_list)) 1896 return false; 1897 1898 spin_lock_irq(&tctx->task_lock); 1899 list = tctx->task_list; 1900 INIT_WQ_LIST(&tctx->task_list); 1901 spin_unlock_irq(&tctx->task_lock); 1902 1903 node = list.first; 1904 while (node) { 1905 struct io_wq_work_node *next = node->next; 1906 struct io_kiocb *req; 1907 1908 req = container_of(node, struct io_kiocb, io_task_work.node); 1909 if (req->ctx != ctx) { 1910 ctx_flush_and_put(ctx); 1911 ctx = req->ctx; 1912 percpu_ref_get(&ctx->refs); 1913 } 1914 1915 req->task_work.func(&req->task_work); 1916 node = next; 1917 } 1918 1919 ctx_flush_and_put(ctx); 1920 return list.first != NULL; 1921} 1922 1923static void tctx_task_work(struct callback_head *cb) 1924{ 1925 
struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); 1926 1927 clear_bit(0, &tctx->task_state); 1928 1929 while (__tctx_task_work(tctx)) 1930 cond_resched(); 1931} 1932 1933static int io_req_task_work_add(struct io_kiocb *req) 1934{ 1935 struct task_struct *tsk = req->task; 1936 struct io_uring_task *tctx = tsk->io_uring; 1937 enum task_work_notify_mode notify; 1938 struct io_wq_work_node *node, *prev; 1939 unsigned long flags; 1940 int ret = 0; 1941 1942 if (unlikely(tsk->flags & PF_EXITING)) 1943 return -ESRCH; 1944 1945 WARN_ON_ONCE(!tctx); 1946 1947 spin_lock_irqsave(&tctx->task_lock, flags); 1948 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); 1949 spin_unlock_irqrestore(&tctx->task_lock, flags); 1950 1951 /* task_work already pending, we're done */ 1952 if (test_bit(0, &tctx->task_state) || 1953 test_and_set_bit(0, &tctx->task_state)) 1954 return 0; 1955 1956 /* 1957 * SQPOLL kernel thread doesn't need notification, just a wakeup. For 1958 * all other cases, use TWA_SIGNAL unconditionally to ensure we're 1959 * processing task_work. There's no reliable way to tell if TWA_RESUME 1960 * will do the job. 1961 */ 1962 notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; 1963 1964 if (!task_work_add(tsk, &tctx->task_work, notify)) { 1965 wake_up_process(tsk); 1966 return 0; 1967 } 1968 1969 /* 1970 * Slow path - we failed, find and delete work. if the work is not 1971 * in the list, it got run and we're fine. 1972 */ 1973 spin_lock_irqsave(&tctx->task_lock, flags); 1974 wq_list_for_each(node, prev, &tctx->task_list) { 1975 if (&req->io_task_work.node == node) { 1976 wq_list_del(&tctx->task_list, node, prev); 1977 ret = 1; 1978 break; 1979 } 1980 } 1981 spin_unlock_irqrestore(&tctx->task_lock, flags); 1982 clear_bit(0, &tctx->task_state); 1983 return ret; 1984} 1985 1986static bool io_run_task_work_head(struct callback_head **work_head) 1987{ 1988 struct callback_head *work, *next; 1989 bool executed = false; 1990 1991 do { 1992 work = xchg(work_head, NULL); 1993 if (!work) 1994 break; 1995 1996 do { 1997 next = work->next; 1998 work->func(work); 1999 work = next; 2000 cond_resched(); 2001 } while (work); 2002 executed = true; 2003 } while (1); 2004 2005 return executed; 2006} 2007 2008static void io_task_work_add_head(struct callback_head **work_head, 2009 struct callback_head *task_work) 2010{ 2011 struct callback_head *head; 2012 2013 do { 2014 head = READ_ONCE(*work_head); 2015 task_work->next = head; 2016 } while (cmpxchg(work_head, head, task_work) != head); 2017} 2018 2019static void io_req_task_work_add_fallback(struct io_kiocb *req, 2020 task_work_func_t cb) 2021{ 2022 init_task_work(&req->task_work, cb); 2023 io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work); 2024} 2025 2026static void io_req_task_cancel(struct callback_head *cb) 2027{ 2028 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); 2029 struct io_ring_ctx *ctx = req->ctx; 2030 2031 /* ctx is guaranteed to stay alive while we hold uring_lock */ 2032 mutex_lock(&ctx->uring_lock); 2033 io_req_complete_failed(req, req->result); 2034 mutex_unlock(&ctx->uring_lock); 2035} 2036 2037static void __io_req_task_submit(struct io_kiocb *req) 2038{ 2039 struct io_ring_ctx *ctx = req->ctx; 2040 2041 /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ 2042 mutex_lock(&ctx->uring_lock); 2043 if (!(current->flags & PF_EXITING) && !current->in_execve) 2044 __io_queue_sqe(req); 2045 else 2046 io_req_complete_failed(req, 
-EFAULT); 2047 mutex_unlock(&ctx->uring_lock); 2048} 2049 2050static void io_req_task_submit(struct callback_head *cb) 2051{ 2052 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); 2053 2054 __io_req_task_submit(req); 2055} 2056 2057static void io_req_task_queue_fail(struct io_kiocb *req, int ret) 2058{ 2059 req->result = ret; 2060 req->task_work.func = io_req_task_cancel; 2061 2062 if (unlikely(io_req_task_work_add(req))) 2063 io_req_task_work_add_fallback(req, io_req_task_cancel); 2064} 2065 2066static void io_req_task_queue(struct io_kiocb *req) 2067{ 2068 req->task_work.func = io_req_task_submit; 2069 2070 if (unlikely(io_req_task_work_add(req))) 2071 io_req_task_queue_fail(req, -ECANCELED); 2072} 2073 2074static inline void io_queue_next(struct io_kiocb *req) 2075{ 2076 struct io_kiocb *nxt = io_req_find_next(req); 2077 2078 if (nxt) 2079 io_req_task_queue(nxt); 2080} 2081 2082static void io_free_req(struct io_kiocb *req) 2083{ 2084 io_queue_next(req); 2085 __io_free_req(req); 2086} 2087 2088struct req_batch { 2089 struct task_struct *task; 2090 int task_refs; 2091 int ctx_refs; 2092}; 2093 2094static inline void io_init_req_batch(struct req_batch *rb) 2095{ 2096 rb->task_refs = 0; 2097 rb->ctx_refs = 0; 2098 rb->task = NULL; 2099} 2100 2101static void io_req_free_batch_finish(struct io_ring_ctx *ctx, 2102 struct req_batch *rb) 2103{ 2104 if (rb->task) 2105 io_put_task(rb->task, rb->task_refs); 2106 if (rb->ctx_refs) 2107 percpu_ref_put_many(&ctx->refs, rb->ctx_refs); 2108} 2109 2110static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, 2111 struct io_submit_state *state) 2112{ 2113 io_queue_next(req); 2114 io_dismantle_req(req); 2115 2116 if (req->task != rb->task) { 2117 if (rb->task) 2118 io_put_task(rb->task, rb->task_refs); 2119 rb->task = req->task; 2120 rb->task_refs = 0; 2121 } 2122 rb->task_refs++; 2123 rb->ctx_refs++; 2124 2125 if (state->free_reqs != ARRAY_SIZE(state->reqs)) 2126 state->reqs[state->free_reqs++] = req; 2127 else 2128 list_add(&req->compl.list, &state->comp.free_list); 2129} 2130 2131static void io_submit_flush_completions(struct io_comp_state *cs, 2132 struct io_ring_ctx *ctx) 2133{ 2134 int i, nr = cs->nr; 2135 struct io_kiocb *req; 2136 struct req_batch rb; 2137 2138 io_init_req_batch(&rb); 2139 spin_lock_irq(&ctx->completion_lock); 2140 for (i = 0; i < nr; i++) { 2141 req = cs->reqs[i]; 2142 __io_cqring_fill_event(ctx, req->user_data, req->result, 2143 req->compl.cflags); 2144 } 2145 io_commit_cqring(ctx); 2146 spin_unlock_irq(&ctx->completion_lock); 2147 2148 io_cqring_ev_posted(ctx); 2149 for (i = 0; i < nr; i++) { 2150 req = cs->reqs[i]; 2151 2152 /* submission and completion refs */ 2153 if (req_ref_sub_and_test(req, 2)) 2154 io_req_free_batch(&rb, req, &ctx->submit_state); 2155 } 2156 2157 io_req_free_batch_finish(ctx, &rb); 2158 cs->nr = 0; 2159} 2160 2161/* 2162 * Drop reference to request, return next in chain (if there is one) if this 2163 * was the last reference to this request. 
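 *
 * When requests are chained with IOSQE_IO_LINK, "next in chain" is the
 * dependent request that gets queued once this one finishes. A minimal
 * userspace sketch of such a chain, assuming liburing helpers and a
 * caller-provided ring, fd, buf and len -- a write followed by a linked
 * fsync:
 *
 *	struct io_uring_sqe *sqe;
 *
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_write(sqe, fd, buf, len, 0);
 *	io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);
 *
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC);
 *
 *	io_uring_submit(&ring);
 *
 * The fsync is only issued after the write completes; if the write fails,
 * the remainder of the chain is completed with -ECANCELED, as io_fail_links()
 * above does.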
2164 */ 2165static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) 2166{ 2167 struct io_kiocb *nxt = NULL; 2168 2169 if (req_ref_put_and_test(req)) { 2170 nxt = io_req_find_next(req); 2171 __io_free_req(req); 2172 } 2173 return nxt; 2174} 2175 2176static inline void io_put_req(struct io_kiocb *req) 2177{ 2178 if (req_ref_put_and_test(req)) 2179 io_free_req(req); 2180} 2181 2182static void io_put_req_deferred_cb(struct callback_head *cb) 2183{ 2184 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); 2185 2186 io_free_req(req); 2187} 2188 2189static void io_free_req_deferred(struct io_kiocb *req) 2190{ 2191 req->task_work.func = io_put_req_deferred_cb; 2192 if (unlikely(io_req_task_work_add(req))) 2193 io_req_task_work_add_fallback(req, io_put_req_deferred_cb); 2194} 2195 2196static inline void io_put_req_deferred(struct io_kiocb *req, int refs) 2197{ 2198 if (req_ref_sub_and_test(req, refs)) 2199 io_free_req_deferred(req); 2200} 2201 2202static unsigned io_cqring_events(struct io_ring_ctx *ctx) 2203{ 2204 /* See comment at the top of this file */ 2205 smp_rmb(); 2206 return __io_cqring_events(ctx); 2207} 2208 2209static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) 2210{ 2211 struct io_rings *rings = ctx->rings; 2212 2213 /* make sure SQ entry isn't read before tail */ 2214 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; 2215} 2216 2217static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) 2218{ 2219 unsigned int cflags; 2220 2221 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; 2222 cflags |= IORING_CQE_F_BUFFER; 2223 req->flags &= ~REQ_F_BUFFER_SELECTED; 2224 kfree(kbuf); 2225 return cflags; 2226} 2227 2228static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) 2229{ 2230 struct io_buffer *kbuf; 2231 2232 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 2233 return io_put_kbuf(req, kbuf); 2234} 2235 2236static inline bool io_run_task_work(void) 2237{ 2238 /* 2239 * Not safe to run on exiting task, and the task_work handling will 2240 * not add work to such a task. 
2241 */ 2242 if (unlikely(current->flags & PF_EXITING)) 2243 return false; 2244 if (current->task_works) { 2245 __set_current_state(TASK_RUNNING); 2246 task_work_run(); 2247 return true; 2248 } 2249 2250 return false; 2251} 2252 2253/* 2254 * Find and free completed poll iocbs 2255 */ 2256static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, 2257 struct list_head *done) 2258{ 2259 struct req_batch rb; 2260 struct io_kiocb *req; 2261 2262 /* order with ->result store in io_complete_rw_iopoll() */ 2263 smp_rmb(); 2264 2265 io_init_req_batch(&rb); 2266 while (!list_empty(done)) { 2267 int cflags = 0; 2268 2269 req = list_first_entry(done, struct io_kiocb, inflight_entry); 2270 list_del(&req->inflight_entry); 2271 2272 if (READ_ONCE(req->result) == -EAGAIN && 2273 !(req->flags & REQ_F_DONT_REISSUE)) { 2274 req->iopoll_completed = 0; 2275 req_ref_get(req); 2276 io_queue_async_work(req); 2277 continue; 2278 } 2279 2280 if (req->flags & REQ_F_BUFFER_SELECTED) 2281 cflags = io_put_rw_kbuf(req); 2282 2283 __io_cqring_fill_event(ctx, req->user_data, req->result, cflags); 2284 (*nr_events)++; 2285 2286 if (req_ref_put_and_test(req)) 2287 io_req_free_batch(&rb, req, &ctx->submit_state); 2288 } 2289 2290 io_commit_cqring(ctx); 2291 io_cqring_ev_posted_iopoll(ctx); 2292 io_req_free_batch_finish(ctx, &rb); 2293} 2294 2295static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, 2296 long min) 2297{ 2298 struct io_kiocb *req, *tmp; 2299 LIST_HEAD(done); 2300 bool spin; 2301 int ret; 2302 2303 /* 2304 * Only spin for completions if we don't have multiple devices hanging 2305 * off our complete list, and we're under the requested amount. 2306 */ 2307 spin = !ctx->poll_multi_file && *nr_events < min; 2308 2309 ret = 0; 2310 list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { 2311 struct kiocb *kiocb = &req->rw.kiocb; 2312 2313 /* 2314 * Move completed and retryable entries to our local lists. 2315 * If we find a request that requires polling, break out 2316 * and complete those lists first, if we have entries there. 2317 */ 2318 if (READ_ONCE(req->iopoll_completed)) { 2319 list_move_tail(&req->inflight_entry, &done); 2320 continue; 2321 } 2322 if (!list_empty(&done)) 2323 break; 2324 2325 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); 2326 if (ret < 0) 2327 break; 2328 2329 /* iopoll may have completed current req */ 2330 if (READ_ONCE(req->iopoll_completed)) 2331 list_move_tail(&req->inflight_entry, &done); 2332 2333 if (ret && spin) 2334 spin = false; 2335 ret = 0; 2336 } 2337 2338 if (!list_empty(&done)) 2339 io_iopoll_complete(ctx, nr_events, &done); 2340 2341 return ret; 2342} 2343 2344/* 2345 * We can't just wait for polled events to come to us, we have to actively 2346 * find and complete them. 2347 */ 2348static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) 2349{ 2350 if (!(ctx->flags & IORING_SETUP_IOPOLL)) 2351 return; 2352 2353 mutex_lock(&ctx->uring_lock); 2354 while (!list_empty(&ctx->iopoll_list)) { 2355 unsigned int nr_events = 0; 2356 2357 io_do_iopoll(ctx, &nr_events, 0); 2358 2359 /* let it sleep and repeat later if can't complete a request */ 2360 if (nr_events == 0) 2361 break; 2362 /* 2363 * Ensure we allow local-to-the-cpu processing to take place, 2364 * in this case we need to ensure that we reap all events. 2365 * Also let task_work, etc. 
to progress by releasing the mutex 2366 */ 2367 if (need_resched()) { 2368 mutex_unlock(&ctx->uring_lock); 2369 cond_resched(); 2370 mutex_lock(&ctx->uring_lock); 2371 } 2372 } 2373 mutex_unlock(&ctx->uring_lock); 2374} 2375 2376static int io_iopoll_check(struct io_ring_ctx *ctx, long min) 2377{ 2378 unsigned int nr_events = 0; 2379 int ret = 0; 2380 2381 /* 2382 * We disallow the app entering submit/complete with polling, but we 2383 * still need to lock the ring to prevent racing with polled issue 2384 * that got punted to a workqueue. 2385 */ 2386 mutex_lock(&ctx->uring_lock); 2387 /* 2388 * Don't enter poll loop if we already have events pending. 2389 * If we do, we can potentially be spinning for commands that 2390 * already triggered a CQE (eg in error). 2391 */ 2392 if (test_bit(0, &ctx->cq_check_overflow)) 2393 __io_cqring_overflow_flush(ctx, false); 2394 if (io_cqring_events(ctx)) 2395 goto out; 2396 do { 2397 /* 2398 * If a submit got punted to a workqueue, we can have the 2399 * application entering polling for a command before it gets 2400 * issued. That app will hold the uring_lock for the duration 2401 * of the poll right here, so we need to take a breather every 2402 * now and then to ensure that the issue has a chance to add 2403 * the poll to the issued list. Otherwise we can spin here 2404 * forever, while the workqueue is stuck trying to acquire the 2405 * very same mutex. 2406 */ 2407 if (list_empty(&ctx->iopoll_list)) { 2408 mutex_unlock(&ctx->uring_lock); 2409 io_run_task_work(); 2410 mutex_lock(&ctx->uring_lock); 2411 2412 if (list_empty(&ctx->iopoll_list)) 2413 break; 2414 } 2415 ret = io_do_iopoll(ctx, &nr_events, min); 2416 } while (!ret && nr_events < min && !need_resched()); 2417out: 2418 mutex_unlock(&ctx->uring_lock); 2419 return ret; 2420} 2421 2422static void kiocb_end_write(struct io_kiocb *req) 2423{ 2424 /* 2425 * Tell lockdep we inherited freeze protection from submission 2426 * thread. 2427 */ 2428 if (req->flags & REQ_F_ISREG) { 2429 struct super_block *sb = file_inode(req->file)->i_sb; 2430 2431 __sb_writers_acquired(sb, SB_FREEZE_WRITE); 2432 sb_end_write(sb); 2433 } 2434} 2435 2436#ifdef CONFIG_BLOCK 2437static bool io_resubmit_prep(struct io_kiocb *req) 2438{ 2439 struct io_async_rw *rw = req->async_data; 2440 2441 if (!rw) 2442 return !io_req_prep_async(req); 2443 /* may have left rw->iter inconsistent on -EIOCBQUEUED */ 2444 iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter)); 2445 return true; 2446} 2447 2448static bool io_rw_should_reissue(struct io_kiocb *req) 2449{ 2450 umode_t mode = file_inode(req->file)->i_mode; 2451 struct io_ring_ctx *ctx = req->ctx; 2452 2453 if (!S_ISBLK(mode) && !S_ISREG(mode)) 2454 return false; 2455 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && 2456 !(ctx->flags & IORING_SETUP_IOPOLL))) 2457 return false; 2458 /* 2459 * If ref is dying, we might be running poll reap from the exit work. 2460 * Don't attempt to reissue from that path, just let it fail with 2461 * -EAGAIN. 
 */
	if (percpu_ref_is_dying(&ctx->refs))
		return false;
	return true;
}
#else
static bool io_resubmit_prep(struct io_kiocb *req)
{
	return false;
}
static bool io_rw_should_reissue(struct io_kiocb *req)
{
	return false;
}
#endif

static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
			     unsigned int issue_flags)
{
	int cflags = 0;

	if (req->rw.kiocb.ki_flags & IOCB_WRITE)
		kiocb_end_write(req);
	if (res != req->result) {
		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
		    io_rw_should_reissue(req)) {
			req->flags |= REQ_F_REISSUE;
			return;
		}
		req_set_fail_links(req);
	}
	if (req->flags & REQ_F_BUFFER_SELECTED)
		cflags = io_put_rw_kbuf(req);
	__io_req_complete(req, issue_flags, res, cflags);
}

static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

	__io_complete_rw(req, res, res2, 0);
}

static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

	if (kiocb->ki_flags & IOCB_WRITE)
		kiocb_end_write(req);
	if (unlikely(res != req->result)) {
		if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
		    io_resubmit_prep(req))) {
			req_set_fail_links(req);
			req->flags |= REQ_F_DONT_REISSUE;
		}
	}

	WRITE_ONCE(req->result, res);
	/* order with io_iopoll_complete() checking ->result */
	smp_wmb();
	WRITE_ONCE(req->iopoll_completed, 1);
}

/*
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from an io_do_iopoll() thread before the issuer is done
 * accessing the kiocb cookie.
 */
static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
{
	struct io_ring_ctx *ctx = req->ctx;

	/*
	 * Track whether we have multiple files in our lists. This will impact
	 * how we do polling eventually, not spinning if we're on potentially
	 * different devices.
	 */
	if (list_empty(&ctx->iopoll_list)) {
		ctx->poll_multi_file = false;
	} else if (!ctx->poll_multi_file) {
		struct io_kiocb *list_req;

		list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
						inflight_entry);
		if (list_req->file != req->file)
			ctx->poll_multi_file = true;
	}

	/*
	 * For fast devices, IO may have already completed. If it has, add
	 * it to the front so we find it first.
	 */
	if (READ_ONCE(req->iopoll_completed))
		list_add(&req->inflight_entry, &ctx->iopoll_list);
	else
		list_add_tail(&req->inflight_entry, &ctx->iopoll_list);

	/*
	 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in the sq
	 * thread task context or in an io worker task context. If the current
	 * task context is the sq thread, we don't need to check whether we
	 * should wake up the sq thread.
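 *
 * The application-side counterpart of this wakeup is checking the SQ ring
 * flags for IORING_SQ_NEED_WAKEUP and entering the kernel if it is set. A
 * rough sketch, assuming sq_flags points at the mmapped SQ ring flags word,
 * ring_fd is the io_uring fd and to_submit is the number of new SQEs (the
 * ordering rules for this check are described at the top of this file):
 *
 *	if (*(volatile unsigned *)sq_flags & IORING_SQ_NEED_WAKEUP)
 *		syscall(__NR_io_uring_enter, ring_fd, to_submit, 0,
 *			IORING_ENTER_SQ_WAKEUP, NULL, 0);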
2564 */ 2565 if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) && 2566 wq_has_sleeper(&ctx->sq_data->wait)) 2567 wake_up(&ctx->sq_data->wait); 2568} 2569 2570static inline void io_state_file_put(struct io_submit_state *state) 2571{ 2572 if (state->file_refs) { 2573 fput_many(state->file, state->file_refs); 2574 state->file_refs = 0; 2575 } 2576} 2577 2578/* 2579 * Get as many references to a file as we have IOs left in this submission, 2580 * assuming most submissions are for one file, or at least that each file 2581 * has more than one submission. 2582 */ 2583static struct file *__io_file_get(struct io_submit_state *state, int fd) 2584{ 2585 if (!state) 2586 return fget(fd); 2587 2588 if (state->file_refs) { 2589 if (state->fd == fd) { 2590 state->file_refs--; 2591 return state->file; 2592 } 2593 io_state_file_put(state); 2594 } 2595 state->file = fget_many(fd, state->ios_left); 2596 if (unlikely(!state->file)) 2597 return NULL; 2598 2599 state->fd = fd; 2600 state->file_refs = state->ios_left - 1; 2601 return state->file; 2602} 2603 2604static bool io_bdev_nowait(struct block_device *bdev) 2605{ 2606 return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); 2607} 2608 2609/* 2610 * If we tracked the file through the SCM inflight mechanism, we could support 2611 * any file. For now, just ensure that anything potentially problematic is done 2612 * inline. 2613 */ 2614static bool __io_file_supports_async(struct file *file, int rw) 2615{ 2616 umode_t mode = file_inode(file)->i_mode; 2617 2618 if (S_ISBLK(mode)) { 2619 if (IS_ENABLED(CONFIG_BLOCK) && 2620 io_bdev_nowait(I_BDEV(file->f_mapping->host))) 2621 return true; 2622 return false; 2623 } 2624 if (S_ISCHR(mode) || S_ISSOCK(mode)) 2625 return true; 2626 if (S_ISREG(mode)) { 2627 if (IS_ENABLED(CONFIG_BLOCK) && 2628 io_bdev_nowait(file->f_inode->i_sb->s_bdev) && 2629 file->f_op != &io_uring_fops) 2630 return true; 2631 return false; 2632 } 2633 2634 /* any ->read/write should understand O_NONBLOCK */ 2635 if (file->f_flags & O_NONBLOCK) 2636 return true; 2637 2638 if (!(file->f_mode & FMODE_NOWAIT)) 2639 return false; 2640 2641 if (rw == READ) 2642 return file->f_op->read_iter != NULL; 2643 2644 return file->f_op->write_iter != NULL; 2645} 2646 2647static bool io_file_supports_async(struct io_kiocb *req, int rw) 2648{ 2649 if (rw == READ && (req->flags & REQ_F_ASYNC_READ)) 2650 return true; 2651 else if (rw == WRITE && (req->flags & REQ_F_ASYNC_WRITE)) 2652 return true; 2653 2654 return __io_file_supports_async(req->file, rw); 2655} 2656 2657static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2658{ 2659 struct io_ring_ctx *ctx = req->ctx; 2660 struct kiocb *kiocb = &req->rw.kiocb; 2661 struct file *file = req->file; 2662 unsigned ioprio; 2663 int ret; 2664 2665 if (!(req->flags & REQ_F_ISREG) && S_ISREG(file_inode(file)->i_mode)) 2666 req->flags |= REQ_F_ISREG; 2667 2668 kiocb->ki_pos = READ_ONCE(sqe->off); 2669 if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) { 2670 req->flags |= REQ_F_CUR_POS; 2671 kiocb->ki_pos = file->f_pos; 2672 } 2673 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); 2674 kiocb->ki_flags = iocb_flags(kiocb->ki_filp); 2675 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); 2676 if (unlikely(ret)) 2677 return ret; 2678 2679 /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */ 2680 if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK)) 2681 req->flags |= REQ_F_NOWAIT; 2682 2683 ioprio = READ_ONCE(sqe->ioprio); 2684 if (ioprio) { 2685 ret = 
ioprio_check_cap(ioprio); 2686 if (ret) 2687 return ret; 2688 2689 kiocb->ki_ioprio = ioprio; 2690 } else 2691 kiocb->ki_ioprio = get_current_ioprio(); 2692 2693 if (ctx->flags & IORING_SETUP_IOPOLL) { 2694 if (!(kiocb->ki_flags & IOCB_DIRECT) || 2695 !kiocb->ki_filp->f_op->iopoll) 2696 return -EOPNOTSUPP; 2697 2698 kiocb->ki_flags |= IOCB_HIPRI; 2699 kiocb->ki_complete = io_complete_rw_iopoll; 2700 req->iopoll_completed = 0; 2701 } else { 2702 if (kiocb->ki_flags & IOCB_HIPRI) 2703 return -EINVAL; 2704 kiocb->ki_complete = io_complete_rw; 2705 } 2706 2707 if (req->opcode == IORING_OP_READ_FIXED || 2708 req->opcode == IORING_OP_WRITE_FIXED) { 2709 req->imu = NULL; 2710 io_req_set_rsrc_node(req); 2711 } 2712 2713 req->rw.addr = READ_ONCE(sqe->addr); 2714 req->rw.len = READ_ONCE(sqe->len); 2715 req->buf_index = READ_ONCE(sqe->buf_index); 2716 return 0; 2717} 2718 2719static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) 2720{ 2721 switch (ret) { 2722 case -EIOCBQUEUED: 2723 break; 2724 case -ERESTARTSYS: 2725 case -ERESTARTNOINTR: 2726 case -ERESTARTNOHAND: 2727 case -ERESTART_RESTARTBLOCK: 2728 /* 2729 * We can't just restart the syscall, since previously 2730 * submitted sqes may already be in progress. Just fail this 2731 * IO with EINTR. 2732 */ 2733 ret = -EINTR; 2734 fallthrough; 2735 default: 2736 kiocb->ki_complete(kiocb, ret, 0); 2737 } 2738} 2739 2740static void kiocb_done(struct kiocb *kiocb, ssize_t ret, 2741 unsigned int issue_flags) 2742{ 2743 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2744 struct io_async_rw *io = req->async_data; 2745 bool check_reissue = kiocb->ki_complete == io_complete_rw; 2746 2747 /* add previously done IO, if any */ 2748 if (io && io->bytes_done > 0) { 2749 if (ret < 0) 2750 ret = io->bytes_done; 2751 else 2752 ret += io->bytes_done; 2753 } 2754 2755 if (req->flags & REQ_F_CUR_POS) 2756 req->file->f_pos = kiocb->ki_pos; 2757 if (ret >= 0 && kiocb->ki_complete == io_complete_rw) 2758 __io_complete_rw(req, ret, 0, issue_flags); 2759 else 2760 io_rw_done(kiocb, ret); 2761 2762 if (check_reissue && req->flags & REQ_F_REISSUE) { 2763 req->flags &= ~REQ_F_REISSUE; 2764 if (io_resubmit_prep(req)) { 2765 req_ref_get(req); 2766 io_queue_async_work(req); 2767 } else { 2768 int cflags = 0; 2769 2770 req_set_fail_links(req); 2771 if (req->flags & REQ_F_BUFFER_SELECTED) 2772 cflags = io_put_rw_kbuf(req); 2773 __io_req_complete(req, issue_flags, ret, cflags); 2774 } 2775 } 2776} 2777 2778static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, 2779 struct io_mapped_ubuf *imu) 2780{ 2781 size_t len = req->rw.len; 2782 u64 buf_end, buf_addr = req->rw.addr; 2783 size_t offset; 2784 2785 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) 2786 return -EFAULT; 2787 /* not inside the mapped region */ 2788 if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) 2789 return -EFAULT; 2790 2791 /* 2792 * May not be a start of buffer, set size appropriately 2793 * and advance us to the beginning. 2794 */ 2795 offset = buf_addr - imu->ubuf; 2796 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); 2797 2798 if (offset) { 2799 /* 2800 * Don't use iov_iter_advance() here, as it's really slow for 2801 * using the latter parts of a big fixed buffer - it iterates 2802 * over each segment manually. 
We can cheat a bit here, because 2803 * we know that: 2804 * 2805 * 1) it's a BVEC iter, we set it up 2806 * 2) all bvecs are PAGE_SIZE in size, except potentially the 2807 * first and last bvec 2808 * 2809 * So just find our index, and adjust the iterator afterwards. 2810 * If the offset is within the first bvec (or the whole first 2811 * bvec, just use iov_iter_advance(). This makes it easier 2812 * since we can just skip the first segment, which may not 2813 * be PAGE_SIZE aligned. 2814 */ 2815 const struct bio_vec *bvec = imu->bvec; 2816 2817 if (offset <= bvec->bv_len) { 2818 iov_iter_advance(iter, offset); 2819 } else { 2820 unsigned long seg_skip; 2821 2822 /* skip first vec */ 2823 offset -= bvec->bv_len; 2824 seg_skip = 1 + (offset >> PAGE_SHIFT); 2825 2826 iter->bvec = bvec + seg_skip; 2827 iter->nr_segs -= seg_skip; 2828 iter->count -= bvec->bv_len + offset; 2829 iter->iov_offset = offset & ~PAGE_MASK; 2830 } 2831 } 2832 2833 return 0; 2834} 2835 2836static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) 2837{ 2838 struct io_ring_ctx *ctx = req->ctx; 2839 struct io_mapped_ubuf *imu = req->imu; 2840 u16 index, buf_index = req->buf_index; 2841 2842 if (likely(!imu)) { 2843 if (unlikely(buf_index >= ctx->nr_user_bufs)) 2844 return -EFAULT; 2845 index = array_index_nospec(buf_index, ctx->nr_user_bufs); 2846 imu = READ_ONCE(ctx->user_bufs[index]); 2847 req->imu = imu; 2848 } 2849 return __io_import_fixed(req, rw, iter, imu); 2850} 2851 2852static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) 2853{ 2854 if (needs_lock) 2855 mutex_unlock(&ctx->uring_lock); 2856} 2857 2858static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) 2859{ 2860 /* 2861 * "Normal" inline submissions always hold the uring_lock, since we 2862 * grab it from the system call. Same is true for the SQPOLL offload. 2863 * The only exception is when we've detached the request and issue it 2864 * from an async worker thread, grab the lock for that case. 
2865 */ 2866 if (needs_lock) 2867 mutex_lock(&ctx->uring_lock); 2868} 2869 2870static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, 2871 int bgid, struct io_buffer *kbuf, 2872 bool needs_lock) 2873{ 2874 struct io_buffer *head; 2875 2876 if (req->flags & REQ_F_BUFFER_SELECTED) 2877 return kbuf; 2878 2879 io_ring_submit_lock(req->ctx, needs_lock); 2880 2881 lockdep_assert_held(&req->ctx->uring_lock); 2882 2883 head = xa_load(&req->ctx->io_buffers, bgid); 2884 if (head) { 2885 if (!list_empty(&head->list)) { 2886 kbuf = list_last_entry(&head->list, struct io_buffer, 2887 list); 2888 list_del(&kbuf->list); 2889 } else { 2890 kbuf = head; 2891 xa_erase(&req->ctx->io_buffers, bgid); 2892 } 2893 if (*len > kbuf->len) 2894 *len = kbuf->len; 2895 } else { 2896 kbuf = ERR_PTR(-ENOBUFS); 2897 } 2898 2899 io_ring_submit_unlock(req->ctx, needs_lock); 2900 2901 return kbuf; 2902} 2903 2904static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, 2905 bool needs_lock) 2906{ 2907 struct io_buffer *kbuf; 2908 u16 bgid; 2909 2910 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 2911 bgid = req->buf_index; 2912 kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); 2913 if (IS_ERR(kbuf)) 2914 return kbuf; 2915 req->rw.addr = (u64) (unsigned long) kbuf; 2916 req->flags |= REQ_F_BUFFER_SELECTED; 2917 return u64_to_user_ptr(kbuf->addr); 2918} 2919 2920#ifdef CONFIG_COMPAT 2921static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, 2922 bool needs_lock) 2923{ 2924 struct compat_iovec __user *uiov; 2925 compat_ssize_t clen; 2926 void __user *buf; 2927 ssize_t len; 2928 2929 uiov = u64_to_user_ptr(req->rw.addr); 2930 if (!access_ok(uiov, sizeof(*uiov))) 2931 return -EFAULT; 2932 if (__get_user(clen, &uiov->iov_len)) 2933 return -EFAULT; 2934 if (clen < 0) 2935 return -EINVAL; 2936 2937 len = clen; 2938 buf = io_rw_buffer_select(req, &len, needs_lock); 2939 if (IS_ERR(buf)) 2940 return PTR_ERR(buf); 2941 iov[0].iov_base = buf; 2942 iov[0].iov_len = (compat_size_t) len; 2943 return 0; 2944} 2945#endif 2946 2947static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 2948 bool needs_lock) 2949{ 2950 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); 2951 void __user *buf; 2952 ssize_t len; 2953 2954 if (copy_from_user(iov, uiov, sizeof(*uiov))) 2955 return -EFAULT; 2956 2957 len = iov[0].iov_len; 2958 if (len < 0) 2959 return -EINVAL; 2960 buf = io_rw_buffer_select(req, &len, needs_lock); 2961 if (IS_ERR(buf)) 2962 return PTR_ERR(buf); 2963 iov[0].iov_base = buf; 2964 iov[0].iov_len = len; 2965 return 0; 2966} 2967 2968static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 2969 bool needs_lock) 2970{ 2971 if (req->flags & REQ_F_BUFFER_SELECTED) { 2972 struct io_buffer *kbuf; 2973 2974 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 2975 iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 2976 iov[0].iov_len = kbuf->len; 2977 return 0; 2978 } 2979 if (req->rw.len != 1) 2980 return -EINVAL; 2981 2982#ifdef CONFIG_COMPAT 2983 if (req->ctx->compat) 2984 return io_compat_import(req, iov, needs_lock); 2985#endif 2986 2987 return __io_iov_buffer_select(req, iov, needs_lock); 2988} 2989 2990static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, 2991 struct iov_iter *iter, bool needs_lock) 2992{ 2993 void __user *buf = u64_to_user_ptr(req->rw.addr); 2994 size_t sqe_len = req->rw.len; 2995 u8 opcode = req->opcode; 2996 ssize_t ret; 2997 2998 if (opcode == IORING_OP_READ_FIXED 
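 *
 * One structure this lock guards is the ctx->io_buffers xarray used by
 * io_buffer_select() below. The buffer groups it holds are populated from
 * userspace with IORING_OP_PROVIDE_BUFFERS and consumed with
 * IOSQE_BUFFER_SELECT; a minimal sketch, assuming liburing helpers and
 * caller-provided ring, fd, bufs, BUF_LEN, NR_BUFS and BGID (completion
 * handling elided):
 *
 *	struct io_uring_sqe *sqe;
 *
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_provide_buffers(sqe, bufs, BUF_LEN, NR_BUFS, BGID, 0);
 *	io_uring_submit(&ring);
 *
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_read(sqe, fd, NULL, BUF_LEN, 0);
 *	io_uring_sqe_set_flags(sqe, IOSQE_BUFFER_SELECT);
 *	sqe->buf_group = BGID;
 *	io_uring_submit(&ring);
 *
 * On completion, IORING_CQE_F_BUFFER is set in cqe->flags and the chosen
 * buffer id sits above IORING_CQE_BUFFER_SHIFT, which is how io_put_kbuf()
 * reports it.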
|| opcode == IORING_OP_WRITE_FIXED) { 2999 *iovec = NULL; 3000 return io_import_fixed(req, rw, iter); 3001 } 3002 3003 /* buffer index only valid with fixed read/write, or buffer select */ 3004 if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)) 3005 return -EINVAL; 3006 3007 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { 3008 if (req->flags & REQ_F_BUFFER_SELECT) { 3009 buf = io_rw_buffer_select(req, &sqe_len, needs_lock); 3010 if (IS_ERR(buf)) 3011 return PTR_ERR(buf); 3012 req->rw.len = sqe_len; 3013 } 3014 3015 ret = import_single_range(rw, buf, sqe_len, *iovec, iter); 3016 *iovec = NULL; 3017 return ret; 3018 } 3019 3020 if (req->flags & REQ_F_BUFFER_SELECT) { 3021 ret = io_iov_buffer_select(req, *iovec, needs_lock); 3022 if (!ret) 3023 iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len); 3024 *iovec = NULL; 3025 return ret; 3026 } 3027 3028 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, 3029 req->ctx->compat); 3030} 3031 3032static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) 3033{ 3034 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; 3035} 3036 3037/* 3038 * For files that don't have ->read_iter() and ->write_iter(), handle them 3039 * by looping over ->read() or ->write() manually. 3040 */ 3041static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) 3042{ 3043 struct kiocb *kiocb = &req->rw.kiocb; 3044 struct file *file = req->file; 3045 ssize_t ret = 0; 3046 3047 /* 3048 * Don't support polled IO through this interface, and we can't 3049 * support non-blocking either. For the latter, this just causes 3050 * the kiocb to be handled from an async context. 3051 */ 3052 if (kiocb->ki_flags & IOCB_HIPRI) 3053 return -EOPNOTSUPP; 3054 if (kiocb->ki_flags & IOCB_NOWAIT) 3055 return -EAGAIN; 3056 3057 while (iov_iter_count(iter)) { 3058 struct iovec iovec; 3059 ssize_t nr; 3060 3061 if (!iov_iter_is_bvec(iter)) { 3062 iovec = iov_iter_iovec(iter); 3063 } else { 3064 iovec.iov_base = u64_to_user_ptr(req->rw.addr); 3065 iovec.iov_len = req->rw.len; 3066 } 3067 3068 if (rw == READ) { 3069 nr = file->f_op->read(file, iovec.iov_base, 3070 iovec.iov_len, io_kiocb_ppos(kiocb)); 3071 } else { 3072 nr = file->f_op->write(file, iovec.iov_base, 3073 iovec.iov_len, io_kiocb_ppos(kiocb)); 3074 } 3075 3076 if (nr < 0) { 3077 if (!ret) 3078 ret = nr; 3079 break; 3080 } 3081 ret += nr; 3082 if (nr != iovec.iov_len) 3083 break; 3084 req->rw.len -= nr; 3085 req->rw.addr += nr; 3086 iov_iter_advance(iter, nr); 3087 } 3088 3089 return ret; 3090} 3091 3092static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, 3093 const struct iovec *fast_iov, struct iov_iter *iter) 3094{ 3095 struct io_async_rw *rw = req->async_data; 3096 3097 memcpy(&rw->iter, iter, sizeof(*iter)); 3098 rw->free_iovec = iovec; 3099 rw->bytes_done = 0; 3100 /* can only be fixed buffers, no need to do anything */ 3101 if (iov_iter_is_bvec(iter)) 3102 return; 3103 if (!iovec) { 3104 unsigned iov_off = 0; 3105 3106 rw->iter.iov = rw->fast_iov; 3107 if (iter->iov != fast_iov) { 3108 iov_off = iter->iov - fast_iov; 3109 rw->iter.iov += iov_off; 3110 } 3111 if (rw->fast_iov != fast_iov) 3112 memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, 3113 sizeof(struct iovec) * iter->nr_segs); 3114 } else { 3115 req->flags |= REQ_F_NEED_CLEANUP; 3116 } 3117} 3118 3119static inline int io_alloc_async_data(struct io_kiocb *req) 3120{ 3121 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); 3122 req->async_data = 
kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
	return req->async_data == NULL;
}

static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
			     const struct iovec *fast_iov,
			     struct iov_iter *iter, bool force)
{
	if (!force && !io_op_defs[req->opcode].needs_async_setup)
		return 0;
	if (!req->async_data) {
		if (io_alloc_async_data(req)) {
			kfree(iovec);
			return -ENOMEM;
		}

		io_req_map_rw(req, iovec, fast_iov, iter);
	}
	return 0;
}

static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
	struct io_async_rw *iorw = req->async_data;
	struct iovec *iov = iorw->fast_iov;
	int ret;

	ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
	if (unlikely(ret < 0))
		return ret;

	iorw->bytes_done = 0;
	iorw->free_iovec = iov;
	if (iov)
		req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	if (unlikely(!(req->file->f_mode & FMODE_READ)))
		return -EBADF;
	return io_prep_rw(req, sqe);
}

/*
 * This is our waitqueue callback handler, registered through lock_page_async()
 * when we initially tried to do the IO with the iocb and armed our waitqueue.
 * This gets called when the page is unlocked, and we generally expect that to
 * happen when the page IO is completed and the page is now uptodate. This will
 * queue a task_work based retry of the operation, attempting to copy the data
 * again. If the latter fails because the page was NOT uptodate, then we will
 * do a thread based blocking retry of the operation. That's the unexpected
 * slow path.
 */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
			     int sync, void *arg)
{
	struct wait_page_queue *wpq;
	struct io_kiocb *req = wait->private;
	struct wait_page_key *key = arg;

	wpq = container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wpq, key))
		return 0;

	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
	list_del_init(&wait->entry);

	/* submit ref gets dropped, acquire a new one */
	req_ref_get(req);
	io_req_task_queue(req);
	return 1;
}

/*
 * This controls whether a given IO request should be armed for async page
 * based retry. If we return false here, the request is handed to the async
 * worker threads for retry. If we're doing buffered reads on a regular file,
 * we prepare a private wait_page_queue entry and retry the operation. This
 * will either succeed because the page is now uptodate and unlocked, or it
 * will register a callback when the page is unlocked at IO completion. Through
 * that callback, io_uring uses task_work to setup a retry of the operation.
 * That retry will attempt the buffered read again. The retry will generally
 * succeed, or in rare cases where it fails, we then fall back to using the
 * async worker threads for a blocking retry.
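 *
 * In other words, the target here is a plain buffered read of a regular
 * file. A minimal sketch of a request that can take this path, assuming
 * liburing and a caller-provided ring and buf, with the file opened
 * without O_DIRECT (the path name is just an example):
 *
 *	int fd = open("somefile", O_RDONLY);
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *	io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);
 *	io_uring_submit(&ring);
 *
 * If the data isn't in the page cache, the retry is driven from task_work
 * when the page unlock fires, rather than by punting to an io-wq worker.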
3209 */ 3210static bool io_rw_should_retry(struct io_kiocb *req) 3211{ 3212 struct io_async_rw *rw = req->async_data; 3213 struct wait_page_queue *wait = &rw->wpq; 3214 struct kiocb *kiocb = &req->rw.kiocb; 3215 3216 /* never retry for NOWAIT, we just complete with -EAGAIN */ 3217 if (req->flags & REQ_F_NOWAIT) 3218 return false; 3219 3220 /* Only for buffered IO */ 3221 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) 3222 return false; 3223 3224 /* 3225 * just use poll if we can, and don't attempt if the fs doesn't 3226 * support callback based unlocks 3227 */ 3228 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) 3229 return false; 3230 3231 wait->wait.func = io_async_buf_func; 3232 wait->wait.private = req; 3233 wait->wait.flags = 0; 3234 INIT_LIST_HEAD(&wait->wait.entry); 3235 kiocb->ki_flags |= IOCB_WAITQ; 3236 kiocb->ki_flags &= ~IOCB_NOWAIT; 3237 kiocb->ki_waitq = wait; 3238 return true; 3239} 3240 3241static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) 3242{ 3243 if (req->file->f_op->read_iter) 3244 return call_read_iter(req->file, &req->rw.kiocb, iter); 3245 else if (req->file->f_op->read) 3246 return loop_rw_iter(READ, req, iter); 3247 else 3248 return -EINVAL; 3249} 3250 3251static int io_read(struct io_kiocb *req, unsigned int issue_flags) 3252{ 3253 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 3254 struct kiocb *kiocb = &req->rw.kiocb; 3255 struct iov_iter __iter, *iter = &__iter; 3256 struct io_async_rw *rw = req->async_data; 3257 ssize_t io_size, ret, ret2; 3258 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 3259 3260 if (rw) { 3261 iter = &rw->iter; 3262 iovec = NULL; 3263 } else { 3264 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); 3265 if (ret < 0) 3266 return ret; 3267 } 3268 io_size = iov_iter_count(iter); 3269 req->result = io_size; 3270 3271 /* Ensure we clear previously set non-block flag */ 3272 if (!force_nonblock) 3273 kiocb->ki_flags &= ~IOCB_NOWAIT; 3274 else 3275 kiocb->ki_flags |= IOCB_NOWAIT; 3276 3277 /* If the file doesn't support async, just async punt */ 3278 if (force_nonblock && !io_file_supports_async(req, READ)) { 3279 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); 3280 return ret ?: -EAGAIN; 3281 } 3282 3283 ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size); 3284 if (unlikely(ret)) { 3285 kfree(iovec); 3286 return ret; 3287 } 3288 3289 ret = io_iter_do_read(req, iter); 3290 3291 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { 3292 req->flags &= ~REQ_F_REISSUE; 3293 /* IOPOLL retry should happen for io-wq threads */ 3294 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) 3295 goto done; 3296 /* no retry on NONBLOCK nor RWF_NOWAIT */ 3297 if (req->flags & REQ_F_NOWAIT) 3298 goto done; 3299 /* some cases will consume bytes even on error returns */ 3300 iov_iter_revert(iter, io_size - iov_iter_count(iter)); 3301 ret = 0; 3302 } else if (ret == -EIOCBQUEUED) { 3303 goto out_free; 3304 } else if (ret <= 0 || ret == io_size || !force_nonblock || 3305 (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) { 3306 /* read all, failed, already did sync or don't want to retry */ 3307 goto done; 3308 } 3309 3310 ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true); 3311 if (ret2) 3312 return ret2; 3313 3314 iovec = NULL; 3315 rw = req->async_data; 3316 /* now use our persistent iterator, if we aren't already */ 3317 iter = &rw->iter; 3318 3319 do { 3320 io_size -= ret; 3321 rw->bytes_done += ret; 3322 /* if 
we can retry, do so with the callbacks armed */ 3323 if (!io_rw_should_retry(req)) { 3324 kiocb->ki_flags &= ~IOCB_WAITQ; 3325 return -EAGAIN; 3326 } 3327 3328 /* 3329 * Now retry read with the IOCB_WAITQ parts set in the iocb. If 3330 * we get -EIOCBQUEUED, then we'll get a notification when the 3331 * desired page gets unlocked. We can also get a partial read 3332 * here, and if we do, then just retry at the new offset. 3333 */ 3334 ret = io_iter_do_read(req, iter); 3335 if (ret == -EIOCBQUEUED) 3336 return 0; 3337 /* we got some bytes, but not all. retry. */ 3338 kiocb->ki_flags &= ~IOCB_WAITQ; 3339 } while (ret > 0 && ret < io_size); 3340done: 3341 kiocb_done(kiocb, ret, issue_flags); 3342out_free: 3343 /* it's faster to check here then delegate to kfree */ 3344 if (iovec) 3345 kfree(iovec); 3346 return 0; 3347} 3348 3349static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3350{ 3351 if (unlikely(!(req->file->f_mode & FMODE_WRITE))) 3352 return -EBADF; 3353 return io_prep_rw(req, sqe); 3354} 3355 3356static int io_write(struct io_kiocb *req, unsigned int issue_flags) 3357{ 3358 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 3359 struct kiocb *kiocb = &req->rw.kiocb; 3360 struct iov_iter __iter, *iter = &__iter; 3361 struct io_async_rw *rw = req->async_data; 3362 ssize_t ret, ret2, io_size; 3363 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 3364 3365 if (rw) { 3366 iter = &rw->iter; 3367 iovec = NULL; 3368 } else { 3369 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); 3370 if (ret < 0) 3371 return ret; 3372 } 3373 io_size = iov_iter_count(iter); 3374 req->result = io_size; 3375 3376 /* Ensure we clear previously set non-block flag */ 3377 if (!force_nonblock) 3378 kiocb->ki_flags &= ~IOCB_NOWAIT; 3379 else 3380 kiocb->ki_flags |= IOCB_NOWAIT; 3381 3382 /* If the file doesn't support async, just async punt */ 3383 if (force_nonblock && !io_file_supports_async(req, WRITE)) 3384 goto copy_iov; 3385 3386 /* file path doesn't support NOWAIT for non-direct_IO */ 3387 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && 3388 (req->flags & REQ_F_ISREG)) 3389 goto copy_iov; 3390 3391 ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size); 3392 if (unlikely(ret)) 3393 goto out_free; 3394 3395 /* 3396 * Open-code file_start_write here to grab freeze protection, 3397 * which will be released by another thread in 3398 * io_complete_rw(). Fool lockdep by telling it the lock got 3399 * released so that it doesn't complain about the held lock when 3400 * we return to userspace. 3401 */ 3402 if (req->flags & REQ_F_ISREG) { 3403 sb_start_write(file_inode(req->file)->i_sb); 3404 __sb_writers_release(file_inode(req->file)->i_sb, 3405 SB_FREEZE_WRITE); 3406 } 3407 kiocb->ki_flags |= IOCB_WRITE; 3408 3409 if (req->file->f_op->write_iter) 3410 ret2 = call_write_iter(req->file, kiocb, iter); 3411 else if (req->file->f_op->write) 3412 ret2 = loop_rw_iter(WRITE, req, iter); 3413 else 3414 ret2 = -EINVAL; 3415 3416 if (req->flags & REQ_F_REISSUE) { 3417 req->flags &= ~REQ_F_REISSUE; 3418 ret2 = -EAGAIN; 3419 } 3420 3421 /* 3422 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just 3423 * retry them without IOCB_NOWAIT. 
3424 */ 3425 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) 3426 ret2 = -EAGAIN; 3427 /* no retry on NONBLOCK nor RWF_NOWAIT */ 3428 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) 3429 goto done; 3430 if (!force_nonblock || ret2 != -EAGAIN) { 3431 /* IOPOLL retry should happen for io-wq threads */ 3432 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) 3433 goto copy_iov; 3434done: 3435 kiocb_done(kiocb, ret2, issue_flags); 3436 } else { 3437copy_iov: 3438 /* some cases will consume bytes even on error returns */ 3439 iov_iter_revert(iter, io_size - iov_iter_count(iter)); 3440 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); 3441 return ret ?: -EAGAIN; 3442 } 3443out_free: 3444 /* it's reportedly faster than delegating the null check to kfree() */ 3445 if (iovec) 3446 kfree(iovec); 3447 return ret; 3448} 3449 3450static int io_renameat_prep(struct io_kiocb *req, 3451 const struct io_uring_sqe *sqe) 3452{ 3453 struct io_rename *ren = &req->rename; 3454 const char __user *oldf, *newf; 3455 3456 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3457 return -EBADF; 3458 3459 ren->old_dfd = READ_ONCE(sqe->fd); 3460 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3461 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3462 ren->new_dfd = READ_ONCE(sqe->len); 3463 ren->flags = READ_ONCE(sqe->rename_flags); 3464 3465 ren->oldpath = getname(oldf); 3466 if (IS_ERR(ren->oldpath)) 3467 return PTR_ERR(ren->oldpath); 3468 3469 ren->newpath = getname(newf); 3470 if (IS_ERR(ren->newpath)) { 3471 putname(ren->oldpath); 3472 return PTR_ERR(ren->newpath); 3473 } 3474 3475 req->flags |= REQ_F_NEED_CLEANUP; 3476 return 0; 3477} 3478 3479static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) 3480{ 3481 struct io_rename *ren = &req->rename; 3482 int ret; 3483 3484 if (issue_flags & IO_URING_F_NONBLOCK) 3485 return -EAGAIN; 3486 3487 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, 3488 ren->newpath, ren->flags); 3489 3490 req->flags &= ~REQ_F_NEED_CLEANUP; 3491 if (ret < 0) 3492 req_set_fail_links(req); 3493 io_req_complete(req, ret); 3494 return 0; 3495} 3496 3497static int io_unlinkat_prep(struct io_kiocb *req, 3498 const struct io_uring_sqe *sqe) 3499{ 3500 struct io_unlink *un = &req->unlink; 3501 const char __user *fname; 3502 3503 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3504 return -EBADF; 3505 3506 un->dfd = READ_ONCE(sqe->fd); 3507 3508 un->flags = READ_ONCE(sqe->unlink_flags); 3509 if (un->flags & ~AT_REMOVEDIR) 3510 return -EINVAL; 3511 3512 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3513 un->filename = getname(fname); 3514 if (IS_ERR(un->filename)) 3515 return PTR_ERR(un->filename); 3516 3517 req->flags |= REQ_F_NEED_CLEANUP; 3518 return 0; 3519} 3520 3521static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) 3522{ 3523 struct io_unlink *un = &req->unlink; 3524 int ret; 3525 3526 if (issue_flags & IO_URING_F_NONBLOCK) 3527 return -EAGAIN; 3528 3529 if (un->flags & AT_REMOVEDIR) 3530 ret = do_rmdir(un->dfd, un->filename); 3531 else 3532 ret = do_unlinkat(un->dfd, un->filename); 3533 3534 req->flags &= ~REQ_F_NEED_CLEANUP; 3535 if (ret < 0) 3536 req_set_fail_links(req); 3537 io_req_complete(req, ret); 3538 return 0; 3539} 3540 3541static int io_shutdown_prep(struct io_kiocb *req, 3542 const struct io_uring_sqe *sqe) 3543{ 3544#if defined(CONFIG_NET) 3545 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3546 return -EINVAL; 3547 if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || 3548 sqe->buf_index) 3549 
return -EINVAL; 3550 3551 req->shutdown.how = READ_ONCE(sqe->len); 3552 return 0; 3553#else 3554 return -EOPNOTSUPP; 3555#endif 3556} 3557 3558static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) 3559{ 3560#if defined(CONFIG_NET) 3561 struct socket *sock; 3562 int ret; 3563 3564 if (issue_flags & IO_URING_F_NONBLOCK) 3565 return -EAGAIN; 3566 3567 sock = sock_from_file(req->file); 3568 if (unlikely(!sock)) 3569 return -ENOTSOCK; 3570 3571 ret = __sys_shutdown_sock(sock, req->shutdown.how); 3572 if (ret < 0) 3573 req_set_fail_links(req); 3574 io_req_complete(req, ret); 3575 return 0; 3576#else 3577 return -EOPNOTSUPP; 3578#endif 3579} 3580 3581static int __io_splice_prep(struct io_kiocb *req, 3582 const struct io_uring_sqe *sqe) 3583{ 3584 struct io_splice* sp = &req->splice; 3585 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; 3586 3587 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3588 return -EINVAL; 3589 3590 sp->file_in = NULL; 3591 sp->len = READ_ONCE(sqe->len); 3592 sp->flags = READ_ONCE(sqe->splice_flags); 3593 3594 if (unlikely(sp->flags & ~valid_flags)) 3595 return -EINVAL; 3596 3597 sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), 3598 (sp->flags & SPLICE_F_FD_IN_FIXED)); 3599 if (!sp->file_in) 3600 return -EBADF; 3601 req->flags |= REQ_F_NEED_CLEANUP; 3602 return 0; 3603} 3604 3605static int io_tee_prep(struct io_kiocb *req, 3606 const struct io_uring_sqe *sqe) 3607{ 3608 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) 3609 return -EINVAL; 3610 return __io_splice_prep(req, sqe); 3611} 3612 3613static int io_tee(struct io_kiocb *req, unsigned int issue_flags) 3614{ 3615 struct io_splice *sp = &req->splice; 3616 struct file *in = sp->file_in; 3617 struct file *out = sp->file_out; 3618 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 3619 long ret = 0; 3620 3621 if (issue_flags & IO_URING_F_NONBLOCK) 3622 return -EAGAIN; 3623 if (sp->len) 3624 ret = do_tee(in, out, sp->len, flags); 3625 3626 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 3627 io_put_file(in); 3628 req->flags &= ~REQ_F_NEED_CLEANUP; 3629 3630 if (ret != sp->len) 3631 req_set_fail_links(req); 3632 io_req_complete(req, ret); 3633 return 0; 3634} 3635 3636static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3637{ 3638 struct io_splice* sp = &req->splice; 3639 3640 sp->off_in = READ_ONCE(sqe->splice_off_in); 3641 sp->off_out = READ_ONCE(sqe->off); 3642 return __io_splice_prep(req, sqe); 3643} 3644 3645static int io_splice(struct io_kiocb *req, unsigned int issue_flags) 3646{ 3647 struct io_splice *sp = &req->splice; 3648 struct file *in = sp->file_in; 3649 struct file *out = sp->file_out; 3650 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 3651 loff_t *poff_in, *poff_out; 3652 long ret = 0; 3653 3654 if (issue_flags & IO_URING_F_NONBLOCK) 3655 return -EAGAIN; 3656 3657 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; 3658 poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; 3659 3660 if (sp->len) 3661 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); 3662 3663 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 3664 io_put_file(in); 3665 req->flags &= ~REQ_F_NEED_CLEANUP; 3666 3667 if (ret != sp->len) 3668 req_set_fail_links(req); 3669 io_req_complete(req, ret); 3670 return 0; 3671} 3672 3673/* 3674 * IORING_OP_NOP just posts a completion event, nothing else. 
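 *
 * That makes it a convenient smoke test for a ring. A minimal sketch,
 * assuming liburing and a ring set up with io_uring_queue_init():
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *	struct io_uring_cqe *cqe;
 *
 *	io_uring_prep_nop(sqe);
 *	io_uring_submit(&ring);
 *	io_uring_wait_cqe(&ring, &cqe);
 *	io_uring_cqe_seen(&ring, cqe);
 *
 * The resulting CQE has res == 0, posted by the __io_req_complete() call
 * below.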
 */
static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;

	__io_req_complete(req, issue_flags, 0, 0);
	return 0;
}

static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (!req->file)
		return -EBADF;

	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
		return -EINVAL;

	req->sync.flags = READ_ONCE(sqe->fsync_flags);
	if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
		return -EINVAL;

	req->sync.off = READ_ONCE(sqe->off);
	req->sync.len = READ_ONCE(sqe->len);
	return 0;
}

static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
{
	loff_t end = req->sync.off + req->sync.len;
	int ret;

	/* fsync always requires a blocking context */
	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	ret = vfs_fsync_range(req->file, req->sync.off,
				end > 0 ? end : LLONG_MAX,
				req->sync.flags & IORING_FSYNC_DATASYNC);
	if (ret < 0)
		req_set_fail_links(req);
	io_req_complete(req, ret);
	return 0;
}

static int io_fallocate_prep(struct io_kiocb *req,
			     const struct io_uring_sqe *sqe)
{
	if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
		return -EINVAL;
	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;

	req->sync.off = READ_ONCE(sqe->off);
	req->sync.len = READ_ONCE(sqe->addr);
	req->sync.mode = READ_ONCE(sqe->len);
	return 0;
}

static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret;

	/* fallocate always requires a blocking context */
	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;
	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
				req->sync.len);
	if (ret < 0)
		req_set_fail_links(req);
	io_req_complete(req, ret);
	return 0;
}

static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	const char __user *fname;
	int ret;

	if (unlikely(sqe->ioprio || sqe->buf_index))
		return -EINVAL;
	if (unlikely(req->flags & REQ_F_FIXED_FILE))
		return -EBADF;

	/* open.how should already be initialised */
	if (!(req->open.how.flags & O_PATH) && force_o_largefile())
		req->open.how.flags |= O_LARGEFILE;

	req->open.dfd = READ_ONCE(sqe->fd);
	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
	req->open.filename = getname(fname);
	if (IS_ERR(req->open.filename)) {
		ret = PTR_ERR(req->open.filename);
		req->open.filename = NULL;
		return ret;
	}
	req->open.nofile = rlimit(RLIMIT_NOFILE);
	req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	u64 flags, mode;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	mode = READ_ONCE(sqe->len);
	flags = READ_ONCE(sqe->open_flags);
	req->open.how = build_open_how(flags, mode);
	return __io_openat_prep(req, sqe);
}

static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct open_how __user *how;
	size_t len;
3798 int ret; 3799 3800 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3801 return -EINVAL; 3802 how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3803 len = READ_ONCE(sqe->len); 3804 if (len < OPEN_HOW_SIZE_VER0) 3805 return -EINVAL; 3806 3807 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, 3808 len); 3809 if (ret) 3810 return ret; 3811 3812 return __io_openat_prep(req, sqe); 3813} 3814 3815static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) 3816{ 3817 struct open_flags op; 3818 struct file *file; 3819 bool nonblock_set; 3820 bool resolve_nonblock; 3821 int ret; 3822 3823 ret = build_open_flags(&req->open.how, &op); 3824 if (ret) 3825 goto err; 3826 nonblock_set = op.open_flag & O_NONBLOCK; 3827 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED; 3828 if (issue_flags & IO_URING_F_NONBLOCK) { 3829 /* 3830 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, 3831 * it'll always -EAGAIN 3832 */ 3833 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) 3834 return -EAGAIN; 3835 op.lookup_flags |= LOOKUP_CACHED; 3836 op.open_flag |= O_NONBLOCK; 3837 } 3838 3839 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); 3840 if (ret < 0) 3841 goto err; 3842 3843 file = do_filp_open(req->open.dfd, req->open.filename, &op); 3844 /* only retry if RESOLVE_CACHED wasn't already set by application */ 3845 if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) && 3846 file == ERR_PTR(-EAGAIN)) { 3847 /* 3848 * We could hang on to this 'fd', but seems like marginal 3849 * gain for something that is now known to be a slower path. 3850 * So just put it, and we'll get a new one when we retry. 3851 */ 3852 put_unused_fd(ret); 3853 return -EAGAIN; 3854 } 3855 3856 if (IS_ERR(file)) { 3857 put_unused_fd(ret); 3858 ret = PTR_ERR(file); 3859 } else { 3860 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) 3861 file->f_flags &= ~O_NONBLOCK; 3862 fsnotify_open(file); 3863 fd_install(ret, file); 3864 } 3865err: 3866 putname(req->open.filename); 3867 req->flags &= ~REQ_F_NEED_CLEANUP; 3868 if (ret < 0) 3869 req_set_fail_links(req); 3870 __io_req_complete(req, issue_flags, ret, 0); 3871 return 0; 3872} 3873 3874static int io_openat(struct io_kiocb *req, unsigned int issue_flags) 3875{ 3876 return io_openat2(req, issue_flags); 3877} 3878 3879static int io_remove_buffers_prep(struct io_kiocb *req, 3880 const struct io_uring_sqe *sqe) 3881{ 3882 struct io_provide_buf *p = &req->pbuf; 3883 u64 tmp; 3884 3885 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off) 3886 return -EINVAL; 3887 3888 tmp = READ_ONCE(sqe->fd); 3889 if (!tmp || tmp > USHRT_MAX) 3890 return -EINVAL; 3891 3892 memset(p, 0, sizeof(*p)); 3893 p->nbufs = tmp; 3894 p->bgid = READ_ONCE(sqe->buf_group); 3895 return 0; 3896} 3897 3898static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, 3899 int bgid, unsigned nbufs) 3900{ 3901 unsigned i = 0; 3902 3903 /* shouldn't happen */ 3904 if (!nbufs) 3905 return 0; 3906 3907 /* the head kbuf is the list itself */ 3908 while (!list_empty(&buf->list)) { 3909 struct io_buffer *nxt; 3910 3911 nxt = list_first_entry(&buf->list, struct io_buffer, list); 3912 list_del(&nxt->list); 3913 kfree(nxt); 3914 if (++i == nbufs) 3915 return i; 3916 } 3917 i++; 3918 kfree(buf); 3919 xa_erase(&ctx->io_buffers, bgid); 3920 3921 return i; 3922} 3923 3924static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) 3925{ 3926 struct io_provide_buf *p = &req->pbuf; 3927 struct io_ring_ctx 
*ctx = req->ctx; 3928 struct io_buffer *head; 3929 int ret = 0; 3930 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 3931 3932 io_ring_submit_lock(ctx, !force_nonblock); 3933 3934 lockdep_assert_held(&ctx->uring_lock); 3935 3936 ret = -ENOENT; 3937 head = xa_load(&ctx->io_buffers, p->bgid); 3938 if (head) 3939 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); 3940 if (ret < 0) 3941 req_set_fail_links(req); 3942 3943 /* complete before unlock, IOPOLL may need the lock */ 3944 __io_req_complete(req, issue_flags, ret, 0); 3945 io_ring_submit_unlock(ctx, !force_nonblock); 3946 return 0; 3947} 3948 3949static int io_provide_buffers_prep(struct io_kiocb *req, 3950 const struct io_uring_sqe *sqe) 3951{ 3952 unsigned long size, tmp_check; 3953 struct io_provide_buf *p = &req->pbuf; 3954 u64 tmp; 3955 3956 if (sqe->ioprio || sqe->rw_flags) 3957 return -EINVAL; 3958 3959 tmp = READ_ONCE(sqe->fd); 3960 if (!tmp || tmp > USHRT_MAX) 3961 return -E2BIG; 3962 p->nbufs = tmp; 3963 p->addr = READ_ONCE(sqe->addr); 3964 p->len = READ_ONCE(sqe->len); 3965 3966 if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, 3967 &size)) 3968 return -EOVERFLOW; 3969 if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) 3970 return -EOVERFLOW; 3971 3972 size = (unsigned long)p->len * p->nbufs; 3973 if (!access_ok(u64_to_user_ptr(p->addr), size)) 3974 return -EFAULT; 3975 3976 p->bgid = READ_ONCE(sqe->buf_group); 3977 tmp = READ_ONCE(sqe->off); 3978 if (tmp > USHRT_MAX) 3979 return -E2BIG; 3980 p->bid = tmp; 3981 return 0; 3982} 3983 3984static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) 3985{ 3986 struct io_buffer *buf; 3987 u64 addr = pbuf->addr; 3988 int i, bid = pbuf->bid; 3989 3990 for (i = 0; i < pbuf->nbufs; i++) { 3991 buf = kmalloc(sizeof(*buf), GFP_KERNEL); 3992 if (!buf) 3993 break; 3994 3995 buf->addr = addr; 3996 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); 3997 buf->bid = bid; 3998 addr += pbuf->len; 3999 bid++; 4000 if (!*head) { 4001 INIT_LIST_HEAD(&buf->list); 4002 *head = buf; 4003 } else { 4004 list_add_tail(&buf->list, &(*head)->list); 4005 } 4006 } 4007 4008 return i ? 
i : -ENOMEM; 4009} 4010 4011static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) 4012{ 4013 struct io_provide_buf *p = &req->pbuf; 4014 struct io_ring_ctx *ctx = req->ctx; 4015 struct io_buffer *head, *list; 4016 int ret = 0; 4017 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4018 4019 io_ring_submit_lock(ctx, !force_nonblock); 4020 4021 lockdep_assert_held(&ctx->uring_lock); 4022 4023 list = head = xa_load(&ctx->io_buffers, p->bgid); 4024 4025 ret = io_add_buffers(p, &head); 4026 if (ret >= 0 && !list) { 4027 ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL); 4028 if (ret < 0) 4029 __io_remove_buffers(ctx, head, p->bgid, -1U); 4030 } 4031 if (ret < 0) 4032 req_set_fail_links(req); 4033 /* complete before unlock, IOPOLL may need the lock */ 4034 __io_req_complete(req, issue_flags, ret, 0); 4035 io_ring_submit_unlock(ctx, !force_nonblock); 4036 return 0; 4037} 4038 4039static int io_epoll_ctl_prep(struct io_kiocb *req, 4040 const struct io_uring_sqe *sqe) 4041{ 4042#if defined(CONFIG_EPOLL) 4043 if (sqe->ioprio || sqe->buf_index) 4044 return -EINVAL; 4045 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4046 return -EINVAL; 4047 4048 req->epoll.epfd = READ_ONCE(sqe->fd); 4049 req->epoll.op = READ_ONCE(sqe->len); 4050 req->epoll.fd = READ_ONCE(sqe->off); 4051 4052 if (ep_op_has_event(req->epoll.op)) { 4053 struct epoll_event __user *ev; 4054 4055 ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4056 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) 4057 return -EFAULT; 4058 } 4059 4060 return 0; 4061#else 4062 return -EOPNOTSUPP; 4063#endif 4064} 4065 4066static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) 4067{ 4068#if defined(CONFIG_EPOLL) 4069 struct io_epoll *ie = &req->epoll; 4070 int ret; 4071 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4072 4073 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); 4074 if (force_nonblock && ret == -EAGAIN) 4075 return -EAGAIN; 4076 4077 if (ret < 0) 4078 req_set_fail_links(req); 4079 __io_req_complete(req, issue_flags, ret, 0); 4080 return 0; 4081#else 4082 return -EOPNOTSUPP; 4083#endif 4084} 4085 4086static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4087{ 4088#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 4089 if (sqe->ioprio || sqe->buf_index || sqe->off) 4090 return -EINVAL; 4091 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4092 return -EINVAL; 4093 4094 req->madvise.addr = READ_ONCE(sqe->addr); 4095 req->madvise.len = READ_ONCE(sqe->len); 4096 req->madvise.advice = READ_ONCE(sqe->fadvise_advice); 4097 return 0; 4098#else 4099 return -EOPNOTSUPP; 4100#endif 4101} 4102 4103static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) 4104{ 4105#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 4106 struct io_madvise *ma = &req->madvise; 4107 int ret; 4108 4109 if (issue_flags & IO_URING_F_NONBLOCK) 4110 return -EAGAIN; 4111 4112 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); 4113 if (ret < 0) 4114 req_set_fail_links(req); 4115 io_req_complete(req, ret); 4116 return 0; 4117#else 4118 return -EOPNOTSUPP; 4119#endif 4120} 4121 4122static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4123{ 4124 if (sqe->ioprio || sqe->buf_index || sqe->addr) 4125 return -EINVAL; 4126 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4127 return -EINVAL; 4128 4129 req->fadvise.offset = READ_ONCE(sqe->off); 4130 req->fadvise.len = READ_ONCE(sqe->len); 4131 
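	/*
	 * The advice value travels in the same SQE field that madvise uses
	 * (sqe->fadvise_advice, read below), so the request mirrors
	 * posix_fadvise(fd, offset, len, advice); liburing's
	 * io_uring_prep_fadvise() helper fills these fields on the
	 * application side.
	 */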
req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); 4132 return 0; 4133} 4134 4135static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) 4136{ 4137 struct io_fadvise *fa = &req->fadvise; 4138 int ret; 4139 4140 if (issue_flags & IO_URING_F_NONBLOCK) { 4141 switch (fa->advice) { 4142 case POSIX_FADV_NORMAL: 4143 case POSIX_FADV_RANDOM: 4144 case POSIX_FADV_SEQUENTIAL: 4145 break; 4146 default: 4147 return -EAGAIN; 4148 } 4149 } 4150 4151 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); 4152 if (ret < 0) 4153 req_set_fail_links(req); 4154 __io_req_complete(req, issue_flags, ret, 0); 4155 return 0; 4156} 4157 4158static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4159{ 4160 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4161 return -EINVAL; 4162 if (sqe->ioprio || sqe->buf_index) 4163 return -EINVAL; 4164 if (req->flags & REQ_F_FIXED_FILE) 4165 return -EBADF; 4166 4167 req->statx.dfd = READ_ONCE(sqe->fd); 4168 req->statx.mask = READ_ONCE(sqe->len); 4169 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4170 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4171 req->statx.flags = READ_ONCE(sqe->statx_flags); 4172 4173 return 0; 4174} 4175 4176static int io_statx(struct io_kiocb *req, unsigned int issue_flags) 4177{ 4178 struct io_statx *ctx = &req->statx; 4179 int ret; 4180 4181 if (issue_flags & IO_URING_F_NONBLOCK) 4182 return -EAGAIN; 4183 4184 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, 4185 ctx->buffer); 4186 4187 if (ret < 0) 4188 req_set_fail_links(req); 4189 io_req_complete(req, ret); 4190 return 0; 4191} 4192 4193static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4194{ 4195 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4196 return -EINVAL; 4197 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || 4198 sqe->rw_flags || sqe->buf_index) 4199 return -EINVAL; 4200 if (req->flags & REQ_F_FIXED_FILE) 4201 return -EBADF; 4202 4203 req->close.fd = READ_ONCE(sqe->fd); 4204 return 0; 4205} 4206 4207static int io_close(struct io_kiocb *req, unsigned int issue_flags) 4208{ 4209 struct files_struct *files = current->files; 4210 struct io_close *close = &req->close; 4211 struct fdtable *fdt; 4212 struct file *file = NULL; 4213 int ret = -EBADF; 4214 4215 spin_lock(&files->file_lock); 4216 fdt = files_fdtable(files); 4217 if (close->fd >= fdt->max_fds) { 4218 spin_unlock(&files->file_lock); 4219 goto err; 4220 } 4221 file = fdt->fd[close->fd]; 4222 if (!file || file->f_op == &io_uring_fops) { 4223 spin_unlock(&files->file_lock); 4224 file = NULL; 4225 goto err; 4226 } 4227 4228 /* if the file has a flush method, be safe and punt to async */ 4229 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { 4230 spin_unlock(&files->file_lock); 4231 return -EAGAIN; 4232 } 4233 4234 ret = __close_fd_get_file(close->fd, &file); 4235 spin_unlock(&files->file_lock); 4236 if (ret < 0) { 4237 if (ret == -ENOENT) 4238 ret = -EBADF; 4239 goto err; 4240 } 4241 4242 /* No ->flush() or already async, safely close from here */ 4243 ret = filp_close(file, current->files); 4244err: 4245 if (ret < 0) 4246 req_set_fail_links(req); 4247 if (file) 4248 fput(file); 4249 __io_req_complete(req, issue_flags, ret, 0); 4250 return 0; 4251} 4252 4253static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4254{ 4255 struct io_ring_ctx *ctx = req->ctx; 4256 4257 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4258 return -EINVAL; 4259 if (unlikely(sqe->addr || 
sqe->ioprio || sqe->buf_index)) 4260 return -EINVAL; 4261 4262 req->sync.off = READ_ONCE(sqe->off); 4263 req->sync.len = READ_ONCE(sqe->len); 4264 req->sync.flags = READ_ONCE(sqe->sync_range_flags); 4265 return 0; 4266} 4267 4268static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) 4269{ 4270 int ret; 4271 4272 /* sync_file_range always requires a blocking context */ 4273 if (issue_flags & IO_URING_F_NONBLOCK) 4274 return -EAGAIN; 4275 4276 ret = sync_file_range(req->file, req->sync.off, req->sync.len, 4277 req->sync.flags); 4278 if (ret < 0) 4279 req_set_fail_links(req); 4280 io_req_complete(req, ret); 4281 return 0; 4282} 4283 4284#if defined(CONFIG_NET) 4285static int io_setup_async_msg(struct io_kiocb *req, 4286 struct io_async_msghdr *kmsg) 4287{ 4288 struct io_async_msghdr *async_msg = req->async_data; 4289 4290 if (async_msg) 4291 return -EAGAIN; 4292 if (io_alloc_async_data(req)) { 4293 kfree(kmsg->free_iov); 4294 return -ENOMEM; 4295 } 4296 async_msg = req->async_data; 4297 req->flags |= REQ_F_NEED_CLEANUP; 4298 memcpy(async_msg, kmsg, sizeof(*kmsg)); 4299 async_msg->msg.msg_name = &async_msg->addr; 4300 /* if were using fast_iov, set it to the new one */ 4301 if (!async_msg->free_iov) 4302 async_msg->msg.msg_iter.iov = async_msg->fast_iov; 4303 4304 return -EAGAIN; 4305} 4306 4307static int io_sendmsg_copy_hdr(struct io_kiocb *req, 4308 struct io_async_msghdr *iomsg) 4309{ 4310 iomsg->msg.msg_name = &iomsg->addr; 4311 iomsg->free_iov = iomsg->fast_iov; 4312 return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, 4313 req->sr_msg.msg_flags, &iomsg->free_iov); 4314} 4315 4316static int io_sendmsg_prep_async(struct io_kiocb *req) 4317{ 4318 int ret; 4319 4320 ret = io_sendmsg_copy_hdr(req, req->async_data); 4321 if (!ret) 4322 req->flags |= REQ_F_NEED_CLEANUP; 4323 return ret; 4324} 4325 4326static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4327{ 4328 struct io_sr_msg *sr = &req->sr_msg; 4329 4330 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4331 return -EINVAL; 4332 4333 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4334 sr->len = READ_ONCE(sqe->len); 4335 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 4336 if (sr->msg_flags & MSG_DONTWAIT) 4337 req->flags |= REQ_F_NOWAIT; 4338 4339#ifdef CONFIG_COMPAT 4340 if (req->ctx->compat) 4341 sr->msg_flags |= MSG_CMSG_COMPAT; 4342#endif 4343 return 0; 4344} 4345 4346static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) 4347{ 4348 struct io_async_msghdr iomsg, *kmsg; 4349 struct socket *sock; 4350 unsigned flags; 4351 int min_ret = 0; 4352 int ret; 4353 4354 sock = sock_from_file(req->file); 4355 if (unlikely(!sock)) 4356 return -ENOTSOCK; 4357 4358 kmsg = req->async_data; 4359 if (!kmsg) { 4360 ret = io_sendmsg_copy_hdr(req, &iomsg); 4361 if (ret) 4362 return ret; 4363 kmsg = &iomsg; 4364 } 4365 4366 flags = req->sr_msg.msg_flags; 4367 if (issue_flags & IO_URING_F_NONBLOCK) 4368 flags |= MSG_DONTWAIT; 4369 if (flags & MSG_WAITALL) 4370 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 4371 4372 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 4373 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) 4374 return io_setup_async_msg(req, kmsg); 4375 if (ret == -ERESTARTSYS) 4376 ret = -EINTR; 4377 4378 /* fast path, check for non-NULL to avoid function call */ 4379 if (kmsg->free_iov) 4380 kfree(kmsg->free_iov); 4381 req->flags &= ~REQ_F_NEED_CLEANUP; 4382 if (ret < min_ret) 4383 req_set_fail_links(req); 4384 __io_req_complete(req, issue_flags, 
ret, 0); 4385 return 0; 4386} 4387 4388static int io_send(struct io_kiocb *req, unsigned int issue_flags) 4389{ 4390 struct io_sr_msg *sr = &req->sr_msg; 4391 struct msghdr msg; 4392 struct iovec iov; 4393 struct socket *sock; 4394 unsigned flags; 4395 int min_ret = 0; 4396 int ret; 4397 4398 sock = sock_from_file(req->file); 4399 if (unlikely(!sock)) 4400 return -ENOTSOCK; 4401 4402 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); 4403 if (unlikely(ret)) 4404 return ret; 4405 4406 msg.msg_name = NULL; 4407 msg.msg_control = NULL; 4408 msg.msg_controllen = 0; 4409 msg.msg_namelen = 0; 4410 4411 flags = req->sr_msg.msg_flags; 4412 if (issue_flags & IO_URING_F_NONBLOCK) 4413 flags |= MSG_DONTWAIT; 4414 if (flags & MSG_WAITALL) 4415 min_ret = iov_iter_count(&msg.msg_iter); 4416 4417 msg.msg_flags = flags; 4418 ret = sock_sendmsg(sock, &msg); 4419 if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) 4420 return -EAGAIN; 4421 if (ret == -ERESTARTSYS) 4422 ret = -EINTR; 4423 4424 if (ret < min_ret) 4425 req_set_fail_links(req); 4426 __io_req_complete(req, issue_flags, ret, 0); 4427 return 0; 4428} 4429 4430static int __io_recvmsg_copy_hdr(struct io_kiocb *req, 4431 struct io_async_msghdr *iomsg) 4432{ 4433 struct io_sr_msg *sr = &req->sr_msg; 4434 struct iovec __user *uiov; 4435 size_t iov_len; 4436 int ret; 4437 4438 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, 4439 &iomsg->uaddr, &uiov, &iov_len); 4440 if (ret) 4441 return ret; 4442 4443 if (req->flags & REQ_F_BUFFER_SELECT) { 4444 if (iov_len > 1) 4445 return -EINVAL; 4446 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) 4447 return -EFAULT; 4448 sr->len = iomsg->fast_iov[0].iov_len; 4449 iomsg->free_iov = NULL; 4450 } else { 4451 iomsg->free_iov = iomsg->fast_iov; 4452 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, 4453 &iomsg->free_iov, &iomsg->msg.msg_iter, 4454 false); 4455 if (ret > 0) 4456 ret = 0; 4457 } 4458 4459 return ret; 4460} 4461 4462#ifdef CONFIG_COMPAT 4463static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, 4464 struct io_async_msghdr *iomsg) 4465{ 4466 struct io_sr_msg *sr = &req->sr_msg; 4467 struct compat_iovec __user *uiov; 4468 compat_uptr_t ptr; 4469 compat_size_t len; 4470 int ret; 4471 4472 ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, 4473 &ptr, &len); 4474 if (ret) 4475 return ret; 4476 4477 uiov = compat_ptr(ptr); 4478 if (req->flags & REQ_F_BUFFER_SELECT) { 4479 compat_ssize_t clen; 4480 4481 if (len > 1) 4482 return -EINVAL; 4483 if (!access_ok(uiov, sizeof(*uiov))) 4484 return -EFAULT; 4485 if (__get_user(clen, &uiov->iov_len)) 4486 return -EFAULT; 4487 if (clen < 0) 4488 return -EINVAL; 4489 sr->len = clen; 4490 iomsg->free_iov = NULL; 4491 } else { 4492 iomsg->free_iov = iomsg->fast_iov; 4493 ret = __import_iovec(READ, (struct iovec __user *)uiov, len, 4494 UIO_FASTIOV, &iomsg->free_iov, 4495 &iomsg->msg.msg_iter, true); 4496 if (ret < 0) 4497 return ret; 4498 } 4499 4500 return 0; 4501} 4502#endif 4503 4504static int io_recvmsg_copy_hdr(struct io_kiocb *req, 4505 struct io_async_msghdr *iomsg) 4506{ 4507 iomsg->msg.msg_name = &iomsg->addr; 4508 4509#ifdef CONFIG_COMPAT 4510 if (req->ctx->compat) 4511 return __io_compat_recvmsg_copy_hdr(req, iomsg); 4512#endif 4513 4514 return __io_recvmsg_copy_hdr(req, iomsg); 4515} 4516 4517static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, 4518 bool needs_lock) 4519{ 4520 struct io_sr_msg *sr = &req->sr_msg; 4521 struct io_buffer *kbuf; 4522 4523 kbuf = 
io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); 4524 if (IS_ERR(kbuf)) 4525 return kbuf; 4526 4527 sr->kbuf = kbuf; 4528 req->flags |= REQ_F_BUFFER_SELECTED; 4529 return kbuf; 4530} 4531 4532static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) 4533{ 4534 return io_put_kbuf(req, req->sr_msg.kbuf); 4535} 4536 4537static int io_recvmsg_prep_async(struct io_kiocb *req) 4538{ 4539 int ret; 4540 4541 ret = io_recvmsg_copy_hdr(req, req->async_data); 4542 if (!ret) 4543 req->flags |= REQ_F_NEED_CLEANUP; 4544 return ret; 4545} 4546 4547static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4548{ 4549 struct io_sr_msg *sr = &req->sr_msg; 4550 4551 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4552 return -EINVAL; 4553 4554 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4555 sr->len = READ_ONCE(sqe->len); 4556 sr->bgid = READ_ONCE(sqe->buf_group); 4557 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 4558 if (sr->msg_flags & MSG_DONTWAIT) 4559 req->flags |= REQ_F_NOWAIT; 4560 4561#ifdef CONFIG_COMPAT 4562 if (req->ctx->compat) 4563 sr->msg_flags |= MSG_CMSG_COMPAT; 4564#endif 4565 return 0; 4566} 4567 4568static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) 4569{ 4570 struct io_async_msghdr iomsg, *kmsg; 4571 struct socket *sock; 4572 struct io_buffer *kbuf; 4573 unsigned flags; 4574 int min_ret = 0; 4575 int ret, cflags = 0; 4576 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4577 4578 sock = sock_from_file(req->file); 4579 if (unlikely(!sock)) 4580 return -ENOTSOCK; 4581 4582 kmsg = req->async_data; 4583 if (!kmsg) { 4584 ret = io_recvmsg_copy_hdr(req, &iomsg); 4585 if (ret) 4586 return ret; 4587 kmsg = &iomsg; 4588 } 4589 4590 if (req->flags & REQ_F_BUFFER_SELECT) { 4591 kbuf = io_recv_buffer_select(req, !force_nonblock); 4592 if (IS_ERR(kbuf)) 4593 return PTR_ERR(kbuf); 4594 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 4595 kmsg->fast_iov[0].iov_len = req->sr_msg.len; 4596 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 4597 1, req->sr_msg.len); 4598 } 4599 4600 flags = req->sr_msg.msg_flags; 4601 if (force_nonblock) 4602 flags |= MSG_DONTWAIT; 4603 if (flags & MSG_WAITALL) 4604 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 4605 4606 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, 4607 kmsg->uaddr, flags); 4608 if (force_nonblock && ret == -EAGAIN) 4609 return io_setup_async_msg(req, kmsg); 4610 if (ret == -ERESTARTSYS) 4611 ret = -EINTR; 4612 4613 if (req->flags & REQ_F_BUFFER_SELECTED) 4614 cflags = io_put_recv_kbuf(req); 4615 /* fast path, check for non-NULL to avoid function call */ 4616 if (kmsg->free_iov) 4617 kfree(kmsg->free_iov); 4618 req->flags &= ~REQ_F_NEED_CLEANUP; 4619 if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) 4620 req_set_fail_links(req); 4621 __io_req_complete(req, issue_flags, ret, cflags); 4622 return 0; 4623} 4624 4625static int io_recv(struct io_kiocb *req, unsigned int issue_flags) 4626{ 4627 struct io_buffer *kbuf; 4628 struct io_sr_msg *sr = &req->sr_msg; 4629 struct msghdr msg; 4630 void __user *buf = sr->buf; 4631 struct socket *sock; 4632 struct iovec iov; 4633 unsigned flags; 4634 int min_ret = 0; 4635 int ret, cflags = 0; 4636 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4637 4638 sock = sock_from_file(req->file); 4639 if (unlikely(!sock)) 4640 return -ENOTSOCK; 4641 4642 if (req->flags & REQ_F_BUFFER_SELECT) { 4643 kbuf = io_recv_buffer_select(req, !force_nonblock); 
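	/*
	 * With IOSQE_BUFFER_SELECT the data lands in a kernel-selected buffer
	 * from group sr->bgid rather than in the user-supplied sr->buf; the
	 * chosen buffer id is reported back through the CQE flags via
	 * io_put_recv_kbuf() at completion time.
	 */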
4644 if (IS_ERR(kbuf)) 4645 return PTR_ERR(kbuf); 4646 buf = u64_to_user_ptr(kbuf->addr); 4647 } 4648 4649 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); 4650 if (unlikely(ret)) 4651 goto out_free; 4652 4653 msg.msg_name = NULL; 4654 msg.msg_control = NULL; 4655 msg.msg_controllen = 0; 4656 msg.msg_namelen = 0; 4657 msg.msg_iocb = NULL; 4658 msg.msg_flags = 0; 4659 4660 flags = req->sr_msg.msg_flags; 4661 if (force_nonblock) 4662 flags |= MSG_DONTWAIT; 4663 if (flags & MSG_WAITALL) 4664 min_ret = iov_iter_count(&msg.msg_iter); 4665 4666 ret = sock_recvmsg(sock, &msg, flags); 4667 if (force_nonblock && ret == -EAGAIN) 4668 return -EAGAIN; 4669 if (ret == -ERESTARTSYS) 4670 ret = -EINTR; 4671out_free: 4672 if (req->flags & REQ_F_BUFFER_SELECTED) 4673 cflags = io_put_recv_kbuf(req); 4674 if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) 4675 req_set_fail_links(req); 4676 __io_req_complete(req, issue_flags, ret, cflags); 4677 return 0; 4678} 4679 4680static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4681{ 4682 struct io_accept *accept = &req->accept; 4683 4684 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4685 return -EINVAL; 4686 if (sqe->ioprio || sqe->len || sqe->buf_index) 4687 return -EINVAL; 4688 4689 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4690 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4691 accept->flags = READ_ONCE(sqe->accept_flags); 4692 accept->nofile = rlimit(RLIMIT_NOFILE); 4693 return 0; 4694} 4695 4696static int io_accept(struct io_kiocb *req, unsigned int issue_flags) 4697{ 4698 struct io_accept *accept = &req->accept; 4699 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4700 unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; 4701 int ret; 4702 4703 if (req->file->f_flags & O_NONBLOCK) 4704 req->flags |= REQ_F_NOWAIT; 4705 4706 ret = __sys_accept4_file(req->file, file_flags, accept->addr, 4707 accept->addr_len, accept->flags, 4708 accept->nofile); 4709 if (ret == -EAGAIN && force_nonblock) 4710 return -EAGAIN; 4711 if (ret < 0) { 4712 if (ret == -ERESTARTSYS) 4713 ret = -EINTR; 4714 req_set_fail_links(req); 4715 } 4716 __io_req_complete(req, issue_flags, ret, 0); 4717 return 0; 4718} 4719 4720static int io_connect_prep_async(struct io_kiocb *req) 4721{ 4722 struct io_async_connect *io = req->async_data; 4723 struct io_connect *conn = &req->connect; 4724 4725 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); 4726} 4727 4728static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4729{ 4730 struct io_connect *conn = &req->connect; 4731 4732 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4733 return -EINVAL; 4734 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) 4735 return -EINVAL; 4736 4737 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4738 conn->addr_len = READ_ONCE(sqe->addr2); 4739 return 0; 4740} 4741 4742static int io_connect(struct io_kiocb *req, unsigned int issue_flags) 4743{ 4744 struct io_async_connect __io, *io; 4745 unsigned file_flags; 4746 int ret; 4747 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4748 4749 if (req->async_data) { 4750 io = req->async_data; 4751 } else { 4752 ret = move_addr_to_kernel(req->connect.addr, 4753 req->connect.addr_len, 4754 &__io.address); 4755 if (ret) 4756 goto out; 4757 io = &__io; 4758 } 4759 4760 file_flags = force_nonblock ? 
O_NONBLOCK : 0; 4761 4762 ret = __sys_connect_file(req->file, &io->address, 4763 req->connect.addr_len, file_flags); 4764 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { 4765 if (req->async_data) 4766 return -EAGAIN; 4767 if (io_alloc_async_data(req)) { 4768 ret = -ENOMEM; 4769 goto out; 4770 } 4771 memcpy(req->async_data, &__io, sizeof(__io)); 4772 return -EAGAIN; 4773 } 4774 if (ret == -ERESTARTSYS) 4775 ret = -EINTR; 4776out: 4777 if (ret < 0) 4778 req_set_fail_links(req); 4779 __io_req_complete(req, issue_flags, ret, 0); 4780 return 0; 4781} 4782#else /* !CONFIG_NET */ 4783#define IO_NETOP_FN(op) \ 4784static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \ 4785{ \ 4786 return -EOPNOTSUPP; \ 4787} 4788 4789#define IO_NETOP_PREP(op) \ 4790IO_NETOP_FN(op) \ 4791static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \ 4792{ \ 4793 return -EOPNOTSUPP; \ 4794} \ 4795 4796#define IO_NETOP_PREP_ASYNC(op) \ 4797IO_NETOP_PREP(op) \ 4798static int io_##op##_prep_async(struct io_kiocb *req) \ 4799{ \ 4800 return -EOPNOTSUPP; \ 4801} 4802 4803IO_NETOP_PREP_ASYNC(sendmsg); 4804IO_NETOP_PREP_ASYNC(recvmsg); 4805IO_NETOP_PREP_ASYNC(connect); 4806IO_NETOP_PREP(accept); 4807IO_NETOP_FN(send); 4808IO_NETOP_FN(recv); 4809#endif /* CONFIG_NET */ 4810 4811struct io_poll_table { 4812 struct poll_table_struct pt; 4813 struct io_kiocb *req; 4814 int error; 4815}; 4816 4817static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, 4818 __poll_t mask, task_work_func_t func) 4819{ 4820 int ret; 4821 4822 /* for instances that support it check for an event match first: */ 4823 if (mask && !(mask & poll->events)) 4824 return 0; 4825 4826 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); 4827 4828 list_del_init(&poll->wait.entry); 4829 4830 req->result = mask; 4831 req->task_work.func = func; 4832 4833 /* 4834 * If this fails, then the task is exiting. When a task exits, the 4835 * work gets canceled, so just cancel this request as well instead 4836 * of executing it. We can't safely execute it anyway, as we may not 4837 * have the state needed for it. 
4838 */ 4839 ret = io_req_task_work_add(req); 4840 if (unlikely(ret)) { 4841 WRITE_ONCE(poll->canceled, true); 4842 io_req_task_work_add_fallback(req, func); 4843 } 4844 return 1; 4845} 4846 4847static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) 4848 __acquires(&req->ctx->completion_lock) 4849{ 4850 struct io_ring_ctx *ctx = req->ctx; 4851 4852 if (!req->result && !READ_ONCE(poll->canceled)) { 4853 struct poll_table_struct pt = { ._key = poll->events }; 4854 4855 req->result = vfs_poll(req->file, &pt) & poll->events; 4856 } 4857 4858 spin_lock_irq(&ctx->completion_lock); 4859 if (!req->result && !READ_ONCE(poll->canceled)) { 4860 add_wait_queue(poll->head, &poll->wait); 4861 return true; 4862 } 4863 4864 return false; 4865} 4866 4867static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) 4868{ 4869 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ 4870 if (req->opcode == IORING_OP_POLL_ADD) 4871 return req->async_data; 4872 return req->apoll->double_poll; 4873} 4874 4875static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) 4876{ 4877 if (req->opcode == IORING_OP_POLL_ADD) 4878 return &req->poll; 4879 return &req->apoll->poll; 4880} 4881 4882static void io_poll_remove_double(struct io_kiocb *req) 4883 __must_hold(&req->ctx->completion_lock) 4884{ 4885 struct io_poll_iocb *poll = io_poll_get_double(req); 4886 4887 lockdep_assert_held(&req->ctx->completion_lock); 4888 4889 if (poll && poll->head) { 4890 struct wait_queue_head *head = poll->head; 4891 4892 spin_lock(&head->lock); 4893 list_del_init(&poll->wait.entry); 4894 if (poll->wait.private) 4895 req_ref_put(req); 4896 poll->head = NULL; 4897 spin_unlock(&head->lock); 4898 } 4899} 4900 4901static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) 4902 __must_hold(&req->ctx->completion_lock) 4903{ 4904 struct io_ring_ctx *ctx = req->ctx; 4905 unsigned flags = IORING_CQE_F_MORE; 4906 int error; 4907 4908 if (READ_ONCE(req->poll.canceled)) { 4909 error = -ECANCELED; 4910 req->poll.events |= EPOLLONESHOT; 4911 } else { 4912 error = mangle_poll(mask); 4913 } 4914 if (req->poll.events & EPOLLONESHOT) 4915 flags = 0; 4916 if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { 4917 io_poll_remove_waitqs(req); 4918 req->poll.done = true; 4919 flags = 0; 4920 } 4921 if (flags & IORING_CQE_F_MORE) 4922 ctx->cq_extra++; 4923 4924 io_commit_cqring(ctx); 4925 return !(flags & IORING_CQE_F_MORE); 4926} 4927 4928static void io_poll_task_func(struct callback_head *cb) 4929{ 4930 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); 4931 struct io_ring_ctx *ctx = req->ctx; 4932 struct io_kiocb *nxt; 4933 4934 if (io_poll_rewait(req, &req->poll)) { 4935 spin_unlock_irq(&ctx->completion_lock); 4936 } else { 4937 bool done; 4938 4939 done = io_poll_complete(req, req->result); 4940 if (done) { 4941 hash_del(&req->hash_node); 4942 } else { 4943 req->result = 0; 4944 add_wait_queue(req->poll.head, &req->poll.wait); 4945 } 4946 spin_unlock_irq(&ctx->completion_lock); 4947 io_cqring_ev_posted(ctx); 4948 4949 if (done) { 4950 nxt = io_put_req_find_next(req); 4951 if (nxt) 4952 __io_req_task_submit(nxt); 4953 } 4954 } 4955} 4956 4957static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, 4958 int sync, void *key) 4959{ 4960 struct io_kiocb *req = wait->private; 4961 struct io_poll_iocb *poll = io_poll_get_single(req); 4962 __poll_t mask = key_to_poll(key); 4963 4964 /* for instances that support it check for an event match first: */ 4965 
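	/*
	 * A wakeup that carries no event mask is never filtered; only wakeups
	 * reporting events outside the set this poll request asked for are
	 * ignored.
	 */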
if (mask && !(mask & poll->events)) 4966 return 0; 4967 if (!(poll->events & EPOLLONESHOT)) 4968 return poll->wait.func(&poll->wait, mode, sync, key); 4969 4970 list_del_init(&wait->entry); 4971 4972 if (poll && poll->head) { 4973 bool done; 4974 4975 spin_lock(&poll->head->lock); 4976 done = list_empty(&poll->wait.entry); 4977 if (!done) 4978 list_del_init(&poll->wait.entry); 4979 /* make sure double remove sees this as being gone */ 4980 wait->private = NULL; 4981 spin_unlock(&poll->head->lock); 4982 if (!done) { 4983 /* use wait func handler, so it matches the rq type */ 4984 poll->wait.func(&poll->wait, mode, sync, key); 4985 } 4986 } 4987 req_ref_put(req); 4988 return 1; 4989} 4990 4991static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, 4992 wait_queue_func_t wake_func) 4993{ 4994 poll->head = NULL; 4995 poll->done = false; 4996 poll->canceled = false; 4997#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) 4998 /* mask in events that we always want/need */ 4999 poll->events = events | IO_POLL_UNMASK; 5000 INIT_LIST_HEAD(&poll->wait.entry); 5001 init_waitqueue_func_entry(&poll->wait, wake_func); 5002} 5003 5004static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, 5005 struct wait_queue_head *head, 5006 struct io_poll_iocb **poll_ptr) 5007{ 5008 struct io_kiocb *req = pt->req; 5009 5010 /* 5011 * If poll->head is already set, it's because the file being polled 5012 * uses multiple waitqueues for poll handling (eg one for read, one 5013 * for write). Setup a separate io_poll_iocb if this happens. 5014 */ 5015 if (unlikely(poll->head)) { 5016 struct io_poll_iocb *poll_one = poll; 5017 5018 /* already have a 2nd entry, fail a third attempt */ 5019 if (*poll_ptr) { 5020 pt->error = -EINVAL; 5021 return; 5022 } 5023 /* 5024 * Can't handle multishot for double wait for now, turn it 5025 * into one-shot mode. 
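	 * The original entry is downgraded by OR-ing in EPOLLONESHOT just
	 * below.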
5026 */ 5027 if (!(poll_one->events & EPOLLONESHOT)) 5028 poll_one->events |= EPOLLONESHOT; 5029 /* double add on the same waitqueue head, ignore */ 5030 if (poll_one->head == head) 5031 return; 5032 poll = kmalloc(sizeof(*poll), GFP_ATOMIC); 5033 if (!poll) { 5034 pt->error = -ENOMEM; 5035 return; 5036 } 5037 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); 5038 req_ref_get(req); 5039 poll->wait.private = req; 5040 *poll_ptr = poll; 5041 } 5042 5043 pt->error = 0; 5044 poll->head = head; 5045 5046 if (poll->events & EPOLLEXCLUSIVE) 5047 add_wait_queue_exclusive(head, &poll->wait); 5048 else 5049 add_wait_queue(head, &poll->wait); 5050} 5051 5052static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, 5053 struct poll_table_struct *p) 5054{ 5055 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 5056 struct async_poll *apoll = pt->req->apoll; 5057 5058 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); 5059} 5060 5061static void io_async_task_func(struct callback_head *cb) 5062{ 5063 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); 5064 struct async_poll *apoll = req->apoll; 5065 struct io_ring_ctx *ctx = req->ctx; 5066 5067 trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); 5068 5069 if (io_poll_rewait(req, &apoll->poll)) { 5070 spin_unlock_irq(&ctx->completion_lock); 5071 return; 5072 } 5073 5074 hash_del(&req->hash_node); 5075 io_poll_remove_double(req); 5076 spin_unlock_irq(&ctx->completion_lock); 5077 5078 if (!READ_ONCE(apoll->poll.canceled)) 5079 __io_req_task_submit(req); 5080 else 5081 io_req_complete_failed(req, -ECANCELED); 5082} 5083 5084static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 5085 void *key) 5086{ 5087 struct io_kiocb *req = wait->private; 5088 struct io_poll_iocb *poll = &req->apoll->poll; 5089 5090 trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data, 5091 key_to_poll(key)); 5092 5093 return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func); 5094} 5095 5096static void io_poll_req_insert(struct io_kiocb *req) 5097{ 5098 struct io_ring_ctx *ctx = req->ctx; 5099 struct hlist_head *list; 5100 5101 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; 5102 hlist_add_head(&req->hash_node, list); 5103} 5104 5105static __poll_t __io_arm_poll_handler(struct io_kiocb *req, 5106 struct io_poll_iocb *poll, 5107 struct io_poll_table *ipt, __poll_t mask, 5108 wait_queue_func_t wake_func) 5109 __acquires(&ctx->completion_lock) 5110{ 5111 struct io_ring_ctx *ctx = req->ctx; 5112 bool cancel = false; 5113 5114 INIT_HLIST_NODE(&req->hash_node); 5115 io_init_poll_iocb(poll, mask, wake_func); 5116 poll->file = req->file; 5117 poll->wait.private = req; 5118 5119 ipt->pt._key = mask; 5120 ipt->req = req; 5121 ipt->error = -EINVAL; 5122 5123 mask = vfs_poll(req->file, &ipt->pt) & poll->events; 5124 5125 spin_lock_irq(&ctx->completion_lock); 5126 if (likely(poll->head)) { 5127 spin_lock(&poll->head->lock); 5128 if (unlikely(list_empty(&poll->wait.entry))) { 5129 if (ipt->error) 5130 cancel = true; 5131 ipt->error = 0; 5132 mask = 0; 5133 } 5134 if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error) 5135 list_del_init(&poll->wait.entry); 5136 else if (cancel) 5137 WRITE_ONCE(poll->canceled, true); 5138 else if (!poll->done) /* actually waiting for an event */ 5139 io_poll_req_insert(req); 5140 spin_unlock(&poll->head->lock); 5141 } 5142 5143 return mask; 5144} 5145 5146static bool io_arm_poll_handler(struct 
io_kiocb *req) 5147{ 5148 const struct io_op_def *def = &io_op_defs[req->opcode]; 5149 struct io_ring_ctx *ctx = req->ctx; 5150 struct async_poll *apoll; 5151 struct io_poll_table ipt; 5152 __poll_t mask, ret; 5153 int rw; 5154 5155 if (!req->file || !file_can_poll(req->file)) 5156 return false; 5157 if (req->flags & REQ_F_POLLED) 5158 return false; 5159 if (def->pollin) 5160 rw = READ; 5161 else if (def->pollout) 5162 rw = WRITE; 5163 else 5164 return false; 5165 /* if we can't nonblock try, then no point in arming a poll handler */ 5166 if (!io_file_supports_async(req, rw)) 5167 return false; 5168 5169 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); 5170 if (unlikely(!apoll)) 5171 return false; 5172 apoll->double_poll = NULL; 5173 5174 req->flags |= REQ_F_POLLED; 5175 req->apoll = apoll; 5176 5177 mask = EPOLLONESHOT; 5178 if (def->pollin) 5179 mask |= POLLIN | POLLRDNORM; 5180 if (def->pollout) 5181 mask |= POLLOUT | POLLWRNORM; 5182 5183 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ 5184 if ((req->opcode == IORING_OP_RECVMSG) && 5185 (req->sr_msg.msg_flags & MSG_ERRQUEUE)) 5186 mask &= ~POLLIN; 5187 5188 mask |= POLLERR | POLLPRI; 5189 5190 ipt.pt._qproc = io_async_queue_proc; 5191 5192 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, 5193 io_async_wake); 5194 if (ret || ipt.error) { 5195 io_poll_remove_double(req); 5196 spin_unlock_irq(&ctx->completion_lock); 5197 return false; 5198 } 5199 spin_unlock_irq(&ctx->completion_lock); 5200 trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask, 5201 apoll->poll.events); 5202 return true; 5203} 5204 5205static bool __io_poll_remove_one(struct io_kiocb *req, 5206 struct io_poll_iocb *poll, bool do_cancel) 5207 __must_hold(&req->ctx->completion_lock) 5208{ 5209 bool do_complete = false; 5210 5211 if (!poll->head) 5212 return false; 5213 spin_lock(&poll->head->lock); 5214 if (do_cancel) 5215 WRITE_ONCE(poll->canceled, true); 5216 if (!list_empty(&poll->wait.entry)) { 5217 list_del_init(&poll->wait.entry); 5218 do_complete = true; 5219 } 5220 spin_unlock(&poll->head->lock); 5221 hash_del(&req->hash_node); 5222 return do_complete; 5223} 5224 5225static bool io_poll_remove_waitqs(struct io_kiocb *req) 5226 __must_hold(&req->ctx->completion_lock) 5227{ 5228 bool do_complete; 5229 5230 io_poll_remove_double(req); 5231 do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); 5232 5233 if (req->opcode != IORING_OP_POLL_ADD && do_complete) { 5234 /* non-poll requests have submit ref still */ 5235 req_ref_put(req); 5236 } 5237 return do_complete; 5238} 5239 5240static bool io_poll_remove_one(struct io_kiocb *req) 5241 __must_hold(&req->ctx->completion_lock) 5242{ 5243 bool do_complete; 5244 5245 do_complete = io_poll_remove_waitqs(req); 5246 if (do_complete) { 5247 io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); 5248 io_commit_cqring(req->ctx); 5249 req_set_fail_links(req); 5250 io_put_req_deferred(req, 1); 5251 } 5252 5253 return do_complete; 5254} 5255 5256/* 5257 * Returns true if we found and killed one or more poll requests 5258 */ 5259static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, 5260 struct files_struct *files) 5261{ 5262 struct hlist_node *tmp; 5263 struct io_kiocb *req; 5264 int posted = 0, i; 5265 5266 spin_lock_irq(&ctx->completion_lock); 5267 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 5268 struct hlist_head *list; 5269 5270 list = &ctx->cancel_hash[i]; 5271 hlist_for_each_entry_safe(req, tmp, list, hash_node) { 5272 if 
(io_match_task(req, tsk, files)) 5273 posted += io_poll_remove_one(req); 5274 } 5275 } 5276 spin_unlock_irq(&ctx->completion_lock); 5277 5278 if (posted) 5279 io_cqring_ev_posted(ctx); 5280 5281 return posted != 0; 5282} 5283 5284static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, 5285 bool poll_only) 5286 __must_hold(&ctx->completion_lock) 5287{ 5288 struct hlist_head *list; 5289 struct io_kiocb *req; 5290 5291 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; 5292 hlist_for_each_entry(req, list, hash_node) { 5293 if (sqe_addr != req->user_data) 5294 continue; 5295 if (poll_only && req->opcode != IORING_OP_POLL_ADD) 5296 continue; 5297 return req; 5298 } 5299 return NULL; 5300} 5301 5302static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, 5303 bool poll_only) 5304 __must_hold(&ctx->completion_lock) 5305{ 5306 struct io_kiocb *req; 5307 5308 req = io_poll_find(ctx, sqe_addr, poll_only); 5309 if (!req) 5310 return -ENOENT; 5311 if (io_poll_remove_one(req)) 5312 return 0; 5313 5314 return -EALREADY; 5315} 5316 5317static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, 5318 unsigned int flags) 5319{ 5320 u32 events; 5321 5322 events = READ_ONCE(sqe->poll32_events); 5323#ifdef __BIG_ENDIAN 5324 events = swahw32(events); 5325#endif 5326 if (!(flags & IORING_POLL_ADD_MULTI)) 5327 events |= EPOLLONESHOT; 5328 return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); 5329} 5330 5331static int io_poll_update_prep(struct io_kiocb *req, 5332 const struct io_uring_sqe *sqe) 5333{ 5334 struct io_poll_update *upd = &req->poll_update; 5335 u32 flags; 5336 5337 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5338 return -EINVAL; 5339 if (sqe->ioprio || sqe->buf_index) 5340 return -EINVAL; 5341 flags = READ_ONCE(sqe->len); 5342 if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | 5343 IORING_POLL_ADD_MULTI)) 5344 return -EINVAL; 5345 /* meaningless without update */ 5346 if (flags == IORING_POLL_ADD_MULTI) 5347 return -EINVAL; 5348 5349 upd->old_user_data = READ_ONCE(sqe->addr); 5350 upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; 5351 upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; 5352 5353 upd->new_user_data = READ_ONCE(sqe->off); 5354 if (!upd->update_user_data && upd->new_user_data) 5355 return -EINVAL; 5356 if (upd->update_events) 5357 upd->events = io_poll_parse_events(sqe, flags); 5358 else if (sqe->poll32_events) 5359 return -EINVAL; 5360 5361 return 0; 5362} 5363 5364static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 5365 void *key) 5366{ 5367 struct io_kiocb *req = wait->private; 5368 struct io_poll_iocb *poll = &req->poll; 5369 5370 return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func); 5371} 5372 5373static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, 5374 struct poll_table_struct *p) 5375{ 5376 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 5377 5378 __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data); 5379} 5380 5381static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5382{ 5383 struct io_poll_iocb *poll = &req->poll; 5384 u32 flags; 5385 5386 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5387 return -EINVAL; 5388 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) 5389 return -EINVAL; 5390 flags = READ_ONCE(sqe->len); 5391 if (flags & ~IORING_POLL_ADD_MULTI) 5392 return -EINVAL; 
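	/*
	 * Illustrative userspace setup, sketched with liburing-style helpers
	 * (not part of this file): multishot poll is requested by setting
	 * IORING_POLL_ADD_MULTI in sqe->len, e.g.
	 *
	 *	sqe = io_uring_get_sqe(&ring);
	 *	io_uring_prep_poll_add(sqe, fd, POLLIN);
	 *	sqe->len |= IORING_POLL_ADD_MULTI;
	 *
	 * Without that flag, io_poll_parse_events() below forces EPOLLONESHOT
	 * and the request completes after the first event.
	 */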
5393 5394 poll->events = io_poll_parse_events(sqe, flags); 5395 return 0; 5396} 5397 5398static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) 5399{ 5400 struct io_poll_iocb *poll = &req->poll; 5401 struct io_ring_ctx *ctx = req->ctx; 5402 struct io_poll_table ipt; 5403 __poll_t mask; 5404 5405 ipt.pt._qproc = io_poll_queue_proc; 5406 5407 mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, 5408 io_poll_wake); 5409 5410 if (mask) { /* no async, we'd stolen it */ 5411 ipt.error = 0; 5412 io_poll_complete(req, mask); 5413 } 5414 spin_unlock_irq(&ctx->completion_lock); 5415 5416 if (mask) { 5417 io_cqring_ev_posted(ctx); 5418 if (poll->events & EPOLLONESHOT) 5419 io_put_req(req); 5420 } 5421 return ipt.error; 5422} 5423 5424static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) 5425{ 5426 struct io_ring_ctx *ctx = req->ctx; 5427 struct io_kiocb *preq; 5428 bool completing; 5429 int ret; 5430 5431 spin_lock_irq(&ctx->completion_lock); 5432 preq = io_poll_find(ctx, req->poll_update.old_user_data, true); 5433 if (!preq) { 5434 ret = -ENOENT; 5435 goto err; 5436 } 5437 5438 if (!req->poll_update.update_events && !req->poll_update.update_user_data) { 5439 completing = true; 5440 ret = io_poll_remove_one(preq) ? 0 : -EALREADY; 5441 goto err; 5442 } 5443 5444 /* 5445 * Don't allow racy completion with singleshot, as we cannot safely 5446 * update those. For multishot, if we're racing with completion, just 5447 * let completion re-add it. 5448 */ 5449 completing = !__io_poll_remove_one(preq, &preq->poll, false); 5450 if (completing && (preq->poll.events & EPOLLONESHOT)) { 5451 ret = -EALREADY; 5452 goto err; 5453 } 5454 /* we now have a detached poll request. reissue. */ 5455 ret = 0; 5456err: 5457 if (ret < 0) { 5458 spin_unlock_irq(&ctx->completion_lock); 5459 req_set_fail_links(req); 5460 io_req_complete(req, ret); 5461 return 0; 5462 } 5463 /* only mask one event flags, keep behavior flags */ 5464 if (req->poll_update.update_events) { 5465 preq->poll.events &= ~0xffff; 5466 preq->poll.events |= req->poll_update.events & 0xffff; 5467 preq->poll.events |= IO_POLL_UNMASK; 5468 } 5469 if (req->poll_update.update_user_data) 5470 preq->user_data = req->poll_update.new_user_data; 5471 spin_unlock_irq(&ctx->completion_lock); 5472 5473 /* complete update request, we're done with it */ 5474 io_req_complete(req, ret); 5475 5476 if (!completing) { 5477 ret = io_poll_add(preq, issue_flags); 5478 if (ret < 0) { 5479 req_set_fail_links(preq); 5480 io_req_complete(preq, ret); 5481 } 5482 } 5483 return 0; 5484} 5485 5486static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) 5487{ 5488 struct io_timeout_data *data = container_of(timer, 5489 struct io_timeout_data, timer); 5490 struct io_kiocb *req = data->req; 5491 struct io_ring_ctx *ctx = req->ctx; 5492 unsigned long flags; 5493 5494 spin_lock_irqsave(&ctx->completion_lock, flags); 5495 list_del_init(&req->timeout.list); 5496 atomic_set(&req->ctx->cq_timeouts, 5497 atomic_read(&req->ctx->cq_timeouts) + 1); 5498 5499 io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); 5500 io_commit_cqring(ctx); 5501 spin_unlock_irqrestore(&ctx->completion_lock, flags); 5502 5503 io_cqring_ev_posted(ctx); 5504 req_set_fail_links(req); 5505 io_put_req(req); 5506 return HRTIMER_NORESTART; 5507} 5508 5509static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, 5510 __u64 user_data) 5511 __must_hold(&ctx->completion_lock) 5512{ 5513 struct io_timeout_data *io; 5514 struct io_kiocb *req; 5515 bool found = false; 
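	/*
	 * Linear scan of the timeout list for a matching user_data; a timeout
	 * whose hrtimer is already firing cannot be cancelled here and is
	 * reported as -EALREADY.
	 */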
5516 5517 list_for_each_entry(req, &ctx->timeout_list, timeout.list) { 5518 found = user_data == req->user_data; 5519 if (found) 5520 break; 5521 } 5522 if (!found) 5523 return ERR_PTR(-ENOENT); 5524 5525 io = req->async_data; 5526 if (hrtimer_try_to_cancel(&io->timer) == -1) 5527 return ERR_PTR(-EALREADY); 5528 list_del_init(&req->timeout.list); 5529 return req; 5530} 5531 5532static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 5533 __must_hold(&ctx->completion_lock) 5534{ 5535 struct io_kiocb *req = io_timeout_extract(ctx, user_data); 5536 5537 if (IS_ERR(req)) 5538 return PTR_ERR(req); 5539 5540 req_set_fail_links(req); 5541 io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); 5542 io_put_req_deferred(req, 1); 5543 return 0; 5544} 5545 5546static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 5547 struct timespec64 *ts, enum hrtimer_mode mode) 5548 __must_hold(&ctx->completion_lock) 5549{ 5550 struct io_kiocb *req = io_timeout_extract(ctx, user_data); 5551 struct io_timeout_data *data; 5552 5553 if (IS_ERR(req)) 5554 return PTR_ERR(req); 5555 5556 req->timeout.off = 0; /* noseq */ 5557 data = req->async_data; 5558 list_add_tail(&req->timeout.list, &ctx->timeout_list); 5559 hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode); 5560 data->timer.function = io_timeout_fn; 5561 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); 5562 return 0; 5563} 5564 5565static int io_timeout_remove_prep(struct io_kiocb *req, 5566 const struct io_uring_sqe *sqe) 5567{ 5568 struct io_timeout_rem *tr = &req->timeout_rem; 5569 5570 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5571 return -EINVAL; 5572 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 5573 return -EINVAL; 5574 if (sqe->ioprio || sqe->buf_index || sqe->len) 5575 return -EINVAL; 5576 5577 tr->addr = READ_ONCE(sqe->addr); 5578 tr->flags = READ_ONCE(sqe->timeout_flags); 5579 if (tr->flags & IORING_TIMEOUT_UPDATE) { 5580 if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS)) 5581 return -EINVAL; 5582 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) 5583 return -EFAULT; 5584 } else if (tr->flags) { 5585 /* timeout removal doesn't support flags */ 5586 return -EINVAL; 5587 } 5588 5589 return 0; 5590} 5591 5592static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) 5593{ 5594 return (flags & IORING_TIMEOUT_ABS) ? 
HRTIMER_MODE_ABS 5595 : HRTIMER_MODE_REL; 5596} 5597 5598/* 5599 * Remove or update an existing timeout command 5600 */ 5601static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) 5602{ 5603 struct io_timeout_rem *tr = &req->timeout_rem; 5604 struct io_ring_ctx *ctx = req->ctx; 5605 int ret; 5606 5607 spin_lock_irq(&ctx->completion_lock); 5608 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) 5609 ret = io_timeout_cancel(ctx, tr->addr); 5610 else 5611 ret = io_timeout_update(ctx, tr->addr, &tr->ts, 5612 io_translate_timeout_mode(tr->flags)); 5613 5614 io_cqring_fill_event(ctx, req->user_data, ret, 0); 5615 io_commit_cqring(ctx); 5616 spin_unlock_irq(&ctx->completion_lock); 5617 io_cqring_ev_posted(ctx); 5618 if (ret < 0) 5619 req_set_fail_links(req); 5620 io_put_req(req); 5621 return 0; 5622} 5623 5624static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, 5625 bool is_timeout_link) 5626{ 5627 struct io_timeout_data *data; 5628 unsigned flags; 5629 u32 off = READ_ONCE(sqe->off); 5630 5631 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5632 return -EINVAL; 5633 if (sqe->ioprio || sqe->buf_index || sqe->len != 1) 5634 return -EINVAL; 5635 if (off && is_timeout_link) 5636 return -EINVAL; 5637 flags = READ_ONCE(sqe->timeout_flags); 5638 if (flags & ~IORING_TIMEOUT_ABS) 5639 return -EINVAL; 5640 5641 req->timeout.off = off; 5642 5643 if (!req->async_data && io_alloc_async_data(req)) 5644 return -ENOMEM; 5645 5646 data = req->async_data; 5647 data->req = req; 5648 5649 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) 5650 return -EFAULT; 5651 5652 data->mode = io_translate_timeout_mode(flags); 5653 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); 5654 if (is_timeout_link) 5655 io_req_track_inflight(req); 5656 return 0; 5657} 5658 5659static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) 5660{ 5661 struct io_ring_ctx *ctx = req->ctx; 5662 struct io_timeout_data *data = req->async_data; 5663 struct list_head *entry; 5664 u32 tail, off = req->timeout.off; 5665 5666 spin_lock_irq(&ctx->completion_lock); 5667 5668 /* 5669 * sqe->off holds how many events that need to occur for this 5670 * timeout event to be satisfied. If it isn't set, then this is 5671 * a pure timeout request, sequence isn't used. 5672 */ 5673 if (io_is_timeout_noseq(req)) { 5674 entry = ctx->timeout_list.prev; 5675 goto add; 5676 } 5677 5678 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 5679 req->timeout.target_seq = tail + off; 5680 5681 /* Update the last seq here in case io_flush_timeouts() hasn't. 5682 * This is safe because ->completion_lock is held, and submissions 5683 * and completions are never mixed in the same ->completion_lock section. 5684 */ 5685 ctx->cq_last_tm_flush = tail; 5686 5687 /* 5688 * Insertion sort, ensuring the first entry in the list is always 5689 * the one we need first. 
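	 * The list is walked from the tail (list_for_each_prev), since a
	 * freshly armed timeout usually has the largest target_seq and thus
	 * belongs near the end.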
5690 */ 5691 list_for_each_prev(entry, &ctx->timeout_list) { 5692 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, 5693 timeout.list); 5694 5695 if (io_is_timeout_noseq(nxt)) 5696 continue; 5697 /* nxt.seq is behind @tail, otherwise would've been completed */ 5698 if (off >= nxt->timeout.target_seq - tail) 5699 break; 5700 } 5701add: 5702 list_add(&req->timeout.list, entry); 5703 data->timer.function = io_timeout_fn; 5704 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); 5705 spin_unlock_irq(&ctx->completion_lock); 5706 return 0; 5707} 5708 5709struct io_cancel_data { 5710 struct io_ring_ctx *ctx; 5711 u64 user_data; 5712}; 5713 5714static bool io_cancel_cb(struct io_wq_work *work, void *data) 5715{ 5716 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 5717 struct io_cancel_data *cd = data; 5718 5719 return req->ctx == cd->ctx && req->user_data == cd->user_data; 5720} 5721 5722static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, 5723 struct io_ring_ctx *ctx) 5724{ 5725 struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; 5726 enum io_wq_cancel cancel_ret; 5727 int ret = 0; 5728 5729 if (!tctx || !tctx->io_wq) 5730 return -ENOENT; 5731 5732 cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); 5733 switch (cancel_ret) { 5734 case IO_WQ_CANCEL_OK: 5735 ret = 0; 5736 break; 5737 case IO_WQ_CANCEL_RUNNING: 5738 ret = -EALREADY; 5739 break; 5740 case IO_WQ_CANCEL_NOTFOUND: 5741 ret = -ENOENT; 5742 break; 5743 } 5744 5745 return ret; 5746} 5747 5748static void io_async_find_and_cancel(struct io_ring_ctx *ctx, 5749 struct io_kiocb *req, __u64 sqe_addr, 5750 int success_ret) 5751{ 5752 unsigned long flags; 5753 int ret; 5754 5755 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); 5756 spin_lock_irqsave(&ctx->completion_lock, flags); 5757 if (ret != -ENOENT) 5758 goto done; 5759 ret = io_timeout_cancel(ctx, sqe_addr); 5760 if (ret != -ENOENT) 5761 goto done; 5762 ret = io_poll_cancel(ctx, sqe_addr, false); 5763done: 5764 if (!ret) 5765 ret = success_ret; 5766 io_cqring_fill_event(ctx, req->user_data, ret, 0); 5767 io_commit_cqring(ctx); 5768 spin_unlock_irqrestore(&ctx->completion_lock, flags); 5769 io_cqring_ev_posted(ctx); 5770 5771 if (ret < 0) 5772 req_set_fail_links(req); 5773} 5774 5775static int io_async_cancel_prep(struct io_kiocb *req, 5776 const struct io_uring_sqe *sqe) 5777{ 5778 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5779 return -EINVAL; 5780 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 5781 return -EINVAL; 5782 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) 5783 return -EINVAL; 5784 5785 req->cancel.addr = READ_ONCE(sqe->addr); 5786 return 0; 5787} 5788 5789static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) 5790{ 5791 struct io_ring_ctx *ctx = req->ctx; 5792 u64 sqe_addr = req->cancel.addr; 5793 struct io_tctx_node *node; 5794 int ret; 5795 5796 /* tasks should wait for their io-wq threads, so safe w/o sync */ 5797 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); 5798 spin_lock_irq(&ctx->completion_lock); 5799 if (ret != -ENOENT) 5800 goto done; 5801 ret = io_timeout_cancel(ctx, sqe_addr); 5802 if (ret != -ENOENT) 5803 goto done; 5804 ret = io_poll_cancel(ctx, sqe_addr, false); 5805 if (ret != -ENOENT) 5806 goto done; 5807 spin_unlock_irq(&ctx->completion_lock); 5808 5809 /* slow path, try all io-wq's */ 5810 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 5811 ret = 
-ENOENT; 5812 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 5813 struct io_uring_task *tctx = node->task->io_uring; 5814 5815 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); 5816 if (ret != -ENOENT) 5817 break; 5818 } 5819 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 5820 5821 spin_lock_irq(&ctx->completion_lock); 5822done: 5823 io_cqring_fill_event(ctx, req->user_data, ret, 0); 5824 io_commit_cqring(ctx); 5825 spin_unlock_irq(&ctx->completion_lock); 5826 io_cqring_ev_posted(ctx); 5827 5828 if (ret < 0) 5829 req_set_fail_links(req); 5830 io_put_req(req); 5831 return 0; 5832} 5833 5834static int io_rsrc_update_prep(struct io_kiocb *req, 5835 const struct io_uring_sqe *sqe) 5836{ 5837 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 5838 return -EINVAL; 5839 if (sqe->ioprio || sqe->rw_flags) 5840 return -EINVAL; 5841 5842 req->rsrc_update.offset = READ_ONCE(sqe->off); 5843 req->rsrc_update.nr_args = READ_ONCE(sqe->len); 5844 if (!req->rsrc_update.nr_args) 5845 return -EINVAL; 5846 req->rsrc_update.arg = READ_ONCE(sqe->addr); 5847 return 0; 5848} 5849 5850static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 5851{ 5852 struct io_ring_ctx *ctx = req->ctx; 5853 struct io_uring_rsrc_update2 up; 5854 int ret; 5855 5856 if (issue_flags & IO_URING_F_NONBLOCK) 5857 return -EAGAIN; 5858 5859 up.offset = req->rsrc_update.offset; 5860 up.data = req->rsrc_update.arg; 5861 up.nr = 0; 5862 up.tags = 0; 5863 up.resv = 0; 5864 5865 mutex_lock(&ctx->uring_lock); 5866 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, 5867 &up, req->rsrc_update.nr_args); 5868 mutex_unlock(&ctx->uring_lock); 5869 5870 if (ret < 0) 5871 req_set_fail_links(req); 5872 __io_req_complete(req, issue_flags, ret, 0); 5873 return 0; 5874} 5875 5876static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5877{ 5878 switch (req->opcode) { 5879 case IORING_OP_NOP: 5880 return 0; 5881 case IORING_OP_READV: 5882 case IORING_OP_READ_FIXED: 5883 case IORING_OP_READ: 5884 return io_read_prep(req, sqe); 5885 case IORING_OP_WRITEV: 5886 case IORING_OP_WRITE_FIXED: 5887 case IORING_OP_WRITE: 5888 return io_write_prep(req, sqe); 5889 case IORING_OP_POLL_ADD: 5890 return io_poll_add_prep(req, sqe); 5891 case IORING_OP_POLL_REMOVE: 5892 return io_poll_update_prep(req, sqe); 5893 case IORING_OP_FSYNC: 5894 return io_fsync_prep(req, sqe); 5895 case IORING_OP_SYNC_FILE_RANGE: 5896 return io_sfr_prep(req, sqe); 5897 case IORING_OP_SENDMSG: 5898 case IORING_OP_SEND: 5899 return io_sendmsg_prep(req, sqe); 5900 case IORING_OP_RECVMSG: 5901 case IORING_OP_RECV: 5902 return io_recvmsg_prep(req, sqe); 5903 case IORING_OP_CONNECT: 5904 return io_connect_prep(req, sqe); 5905 case IORING_OP_TIMEOUT: 5906 return io_timeout_prep(req, sqe, false); 5907 case IORING_OP_TIMEOUT_REMOVE: 5908 return io_timeout_remove_prep(req, sqe); 5909 case IORING_OP_ASYNC_CANCEL: 5910 return io_async_cancel_prep(req, sqe); 5911 case IORING_OP_LINK_TIMEOUT: 5912 return io_timeout_prep(req, sqe, true); 5913 case IORING_OP_ACCEPT: 5914 return io_accept_prep(req, sqe); 5915 case IORING_OP_FALLOCATE: 5916 return io_fallocate_prep(req, sqe); 5917 case IORING_OP_OPENAT: 5918 return io_openat_prep(req, sqe); 5919 case IORING_OP_CLOSE: 5920 return io_close_prep(req, sqe); 5921 case IORING_OP_FILES_UPDATE: 5922 return io_rsrc_update_prep(req, sqe); 5923 case IORING_OP_STATX: 5924 return io_statx_prep(req, sqe); 5925 case IORING_OP_FADVISE: 5926 return io_fadvise_prep(req, sqe); 5927 case 
IORING_OP_MADVISE: 5928 return io_madvise_prep(req, sqe); 5929 case IORING_OP_OPENAT2: 5930 return io_openat2_prep(req, sqe); 5931 case IORING_OP_EPOLL_CTL: 5932 return io_epoll_ctl_prep(req, sqe); 5933 case IORING_OP_SPLICE: 5934 return io_splice_prep(req, sqe); 5935 case IORING_OP_PROVIDE_BUFFERS: 5936 return io_provide_buffers_prep(req, sqe); 5937 case IORING_OP_REMOVE_BUFFERS: 5938 return io_remove_buffers_prep(req, sqe); 5939 case IORING_OP_TEE: 5940 return io_tee_prep(req, sqe); 5941 case IORING_OP_SHUTDOWN: 5942 return io_shutdown_prep(req, sqe); 5943 case IORING_OP_RENAMEAT: 5944 return io_renameat_prep(req, sqe); 5945 case IORING_OP_UNLINKAT: 5946 return io_unlinkat_prep(req, sqe); 5947 } 5948 5949 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", 5950 req->opcode); 5951 return -EINVAL; 5952} 5953 5954static int io_req_prep_async(struct io_kiocb *req) 5955{ 5956 if (!io_op_defs[req->opcode].needs_async_setup) 5957 return 0; 5958 if (WARN_ON_ONCE(req->async_data)) 5959 return -EFAULT; 5960 if (io_alloc_async_data(req)) 5961 return -EAGAIN; 5962 5963 switch (req->opcode) { 5964 case IORING_OP_READV: 5965 return io_rw_prep_async(req, READ); 5966 case IORING_OP_WRITEV: 5967 return io_rw_prep_async(req, WRITE); 5968 case IORING_OP_SENDMSG: 5969 return io_sendmsg_prep_async(req); 5970 case IORING_OP_RECVMSG: 5971 return io_recvmsg_prep_async(req); 5972 case IORING_OP_CONNECT: 5973 return io_connect_prep_async(req); 5974 } 5975 printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", 5976 req->opcode); 5977 return -EFAULT; 5978} 5979 5980static u32 io_get_sequence(struct io_kiocb *req) 5981{ 5982 struct io_kiocb *pos; 5983 struct io_ring_ctx *ctx = req->ctx; 5984 u32 total_submitted, nr_reqs = 0; 5985 5986 io_for_each_link(pos, req) 5987 nr_reqs++; 5988 5989 total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; 5990 return total_submitted - nr_reqs; 5991} 5992 5993static int io_req_defer(struct io_kiocb *req) 5994{ 5995 struct io_ring_ctx *ctx = req->ctx; 5996 struct io_defer_entry *de; 5997 int ret; 5998 u32 seq; 5999 6000 /* Still need defer if there is pending req in defer list. 
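A request flagged REQ_F_IO_DRAIN takes this path as well, so that its sequence can be checked.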
*/ 6001 if (likely(list_empty_careful(&ctx->defer_list) && 6002 !(req->flags & REQ_F_IO_DRAIN))) 6003 return 0; 6004 6005 seq = io_get_sequence(req); 6006 /* Still a chance to pass the sequence check */ 6007 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) 6008 return 0; 6009 6010 ret = io_req_prep_async(req); 6011 if (ret) 6012 return ret; 6013 io_prep_async_link(req); 6014 de = kmalloc(sizeof(*de), GFP_KERNEL); 6015 if (!de) 6016 return -ENOMEM; 6017 6018 spin_lock_irq(&ctx->completion_lock); 6019 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { 6020 spin_unlock_irq(&ctx->completion_lock); 6021 kfree(de); 6022 io_queue_async_work(req); 6023 return -EIOCBQUEUED; 6024 } 6025 6026 trace_io_uring_defer(ctx, req, req->user_data); 6027 de->req = req; 6028 de->seq = seq; 6029 list_add_tail(&de->list, &ctx->defer_list); 6030 spin_unlock_irq(&ctx->completion_lock); 6031 return -EIOCBQUEUED; 6032} 6033 6034static void io_clean_op(struct io_kiocb *req) 6035{ 6036 if (req->flags & REQ_F_BUFFER_SELECTED) { 6037 switch (req->opcode) { 6038 case IORING_OP_READV: 6039 case IORING_OP_READ_FIXED: 6040 case IORING_OP_READ: 6041 kfree((void *)(unsigned long)req->rw.addr); 6042 break; 6043 case IORING_OP_RECVMSG: 6044 case IORING_OP_RECV: 6045 kfree(req->sr_msg.kbuf); 6046 break; 6047 } 6048 req->flags &= ~REQ_F_BUFFER_SELECTED; 6049 } 6050 6051 if (req->flags & REQ_F_NEED_CLEANUP) { 6052 switch (req->opcode) { 6053 case IORING_OP_READV: 6054 case IORING_OP_READ_FIXED: 6055 case IORING_OP_READ: 6056 case IORING_OP_WRITEV: 6057 case IORING_OP_WRITE_FIXED: 6058 case IORING_OP_WRITE: { 6059 struct io_async_rw *io = req->async_data; 6060 if (io->free_iovec) 6061 kfree(io->free_iovec); 6062 break; 6063 } 6064 case IORING_OP_RECVMSG: 6065 case IORING_OP_SENDMSG: { 6066 struct io_async_msghdr *io = req->async_data; 6067 6068 kfree(io->free_iov); 6069 break; 6070 } 6071 case IORING_OP_SPLICE: 6072 case IORING_OP_TEE: 6073 if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED)) 6074 io_put_file(req->splice.file_in); 6075 break; 6076 case IORING_OP_OPENAT: 6077 case IORING_OP_OPENAT2: 6078 if (req->open.filename) 6079 putname(req->open.filename); 6080 break; 6081 case IORING_OP_RENAMEAT: 6082 putname(req->rename.oldpath); 6083 putname(req->rename.newpath); 6084 break; 6085 case IORING_OP_UNLINKAT: 6086 putname(req->unlink.filename); 6087 break; 6088 } 6089 req->flags &= ~REQ_F_NEED_CLEANUP; 6090 } 6091 if ((req->flags & REQ_F_POLLED) && req->apoll) { 6092 kfree(req->apoll->double_poll); 6093 kfree(req->apoll); 6094 req->apoll = NULL; 6095 } 6096 if (req->flags & REQ_F_INFLIGHT) { 6097 struct io_uring_task *tctx = req->task->io_uring; 6098 6099 atomic_dec(&tctx->inflight_tracked); 6100 req->flags &= ~REQ_F_INFLIGHT; 6101 } 6102} 6103 6104static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) 6105{ 6106 struct io_ring_ctx *ctx = req->ctx; 6107 const struct cred *creds = NULL; 6108 int ret; 6109 6110 if (req->work.creds && req->work.creds != current_cred()) 6111 creds = override_creds(req->work.creds); 6112 6113 switch (req->opcode) { 6114 case IORING_OP_NOP: 6115 ret = io_nop(req, issue_flags); 6116 break; 6117 case IORING_OP_READV: 6118 case IORING_OP_READ_FIXED: 6119 case IORING_OP_READ: 6120 ret = io_read(req, issue_flags); 6121 break; 6122 case IORING_OP_WRITEV: 6123 case IORING_OP_WRITE_FIXED: 6124 case IORING_OP_WRITE: 6125 ret = io_write(req, issue_flags); 6126 break; 6127 case IORING_OP_FSYNC: 6128 ret = io_fsync(req, issue_flags); 6129 break; 6130 case 
IORING_OP_POLL_ADD: 6131 ret = io_poll_add(req, issue_flags); 6132 break; 6133 case IORING_OP_POLL_REMOVE: 6134 ret = io_poll_update(req, issue_flags); 6135 break; 6136 case IORING_OP_SYNC_FILE_RANGE: 6137 ret = io_sync_file_range(req, issue_flags); 6138 break; 6139 case IORING_OP_SENDMSG: 6140 ret = io_sendmsg(req, issue_flags); 6141 break; 6142 case IORING_OP_SEND: 6143 ret = io_send(req, issue_flags); 6144 break; 6145 case IORING_OP_RECVMSG: 6146 ret = io_recvmsg(req, issue_flags); 6147 break; 6148 case IORING_OP_RECV: 6149 ret = io_recv(req, issue_flags); 6150 break; 6151 case IORING_OP_TIMEOUT: 6152 ret = io_timeout(req, issue_flags); 6153 break; 6154 case IORING_OP_TIMEOUT_REMOVE: 6155 ret = io_timeout_remove(req, issue_flags); 6156 break; 6157 case IORING_OP_ACCEPT: 6158 ret = io_accept(req, issue_flags); 6159 break; 6160 case IORING_OP_CONNECT: 6161 ret = io_connect(req, issue_flags); 6162 break; 6163 case IORING_OP_ASYNC_CANCEL: 6164 ret = io_async_cancel(req, issue_flags); 6165 break; 6166 case IORING_OP_FALLOCATE: 6167 ret = io_fallocate(req, issue_flags); 6168 break; 6169 case IORING_OP_OPENAT: 6170 ret = io_openat(req, issue_flags); 6171 break; 6172 case IORING_OP_CLOSE: 6173 ret = io_close(req, issue_flags); 6174 break; 6175 case IORING_OP_FILES_UPDATE: 6176 ret = io_files_update(req, issue_flags); 6177 break; 6178 case IORING_OP_STATX: 6179 ret = io_statx(req, issue_flags); 6180 break; 6181 case IORING_OP_FADVISE: 6182 ret = io_fadvise(req, issue_flags); 6183 break; 6184 case IORING_OP_MADVISE: 6185 ret = io_madvise(req, issue_flags); 6186 break; 6187 case IORING_OP_OPENAT2: 6188 ret = io_openat2(req, issue_flags); 6189 break; 6190 case IORING_OP_EPOLL_CTL: 6191 ret = io_epoll_ctl(req, issue_flags); 6192 break; 6193 case IORING_OP_SPLICE: 6194 ret = io_splice(req, issue_flags); 6195 break; 6196 case IORING_OP_PROVIDE_BUFFERS: 6197 ret = io_provide_buffers(req, issue_flags); 6198 break; 6199 case IORING_OP_REMOVE_BUFFERS: 6200 ret = io_remove_buffers(req, issue_flags); 6201 break; 6202 case IORING_OP_TEE: 6203 ret = io_tee(req, issue_flags); 6204 break; 6205 case IORING_OP_SHUTDOWN: 6206 ret = io_shutdown(req, issue_flags); 6207 break; 6208 case IORING_OP_RENAMEAT: 6209 ret = io_renameat(req, issue_flags); 6210 break; 6211 case IORING_OP_UNLINKAT: 6212 ret = io_unlinkat(req, issue_flags); 6213 break; 6214 default: 6215 ret = -EINVAL; 6216 break; 6217 } 6218 6219 if (creds) 6220 revert_creds(creds); 6221 6222 if (ret) 6223 return ret; 6224 6225 /* If the op doesn't have a file, we're not polling for it */ 6226 if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) { 6227 const bool in_async = io_wq_current_is_worker(); 6228 6229 /* workqueue context doesn't hold uring_lock, grab it now */ 6230 if (in_async) 6231 mutex_lock(&ctx->uring_lock); 6232 6233 io_iopoll_req_issued(req, in_async); 6234 6235 if (in_async) 6236 mutex_unlock(&ctx->uring_lock); 6237 } 6238 6239 return 0; 6240} 6241 6242static void io_wq_submit_work(struct io_wq_work *work) 6243{ 6244 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6245 struct io_kiocb *timeout; 6246 int ret = 0; 6247 6248 timeout = io_prep_linked_timeout(req); 6249 if (timeout) 6250 io_queue_linked_timeout(timeout); 6251 6252 if (work->flags & IO_WQ_WORK_CANCEL) 6253 ret = -ECANCELED; 6254 6255 if (!ret) { 6256 do { 6257 ret = io_issue_sqe(req, 0); 6258 /* 6259 * We can get EAGAIN for polled IO even though we're 6260 * forcing a sync submission from here, since we can't 6261 * wait for request slots on the block side. 
6262 */ 6263 if (ret != -EAGAIN) 6264 break; 6265 cond_resched(); 6266 } while (1); 6267 } 6268 6269 /* avoid locking problems by failing it from a clean context */ 6270 if (ret) { 6271 /* io-wq is going to take one down */ 6272 req_ref_get(req); 6273 io_req_task_queue_fail(req, ret); 6274 } 6275} 6276 6277#define FFS_ASYNC_READ 0x1UL 6278#define FFS_ASYNC_WRITE 0x2UL 6279#ifdef CONFIG_64BIT 6280#define FFS_ISREG 0x4UL 6281#else 6282#define FFS_ISREG 0x0UL 6283#endif 6284#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) 6285 6286static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, 6287 unsigned i) 6288{ 6289 struct io_fixed_file *table_l2; 6290 6291 table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT]; 6292 return &table_l2[i & IORING_FILE_TABLE_MASK]; 6293} 6294 6295static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, 6296 int index) 6297{ 6298 struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index); 6299 6300 return (struct file *) (slot->file_ptr & FFS_MASK); 6301} 6302 6303static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) 6304{ 6305 unsigned long file_ptr = (unsigned long) file; 6306 6307 if (__io_file_supports_async(file, READ)) 6308 file_ptr |= FFS_ASYNC_READ; 6309 if (__io_file_supports_async(file, WRITE)) 6310 file_ptr |= FFS_ASYNC_WRITE; 6311 if (S_ISREG(file_inode(file)->i_mode)) 6312 file_ptr |= FFS_ISREG; 6313 file_slot->file_ptr = file_ptr; 6314} 6315 6316static struct file *io_file_get(struct io_submit_state *state, 6317 struct io_kiocb *req, int fd, bool fixed) 6318{ 6319 struct io_ring_ctx *ctx = req->ctx; 6320 struct file *file; 6321 6322 if (fixed) { 6323 unsigned long file_ptr; 6324 6325 if (unlikely((unsigned int)fd >= ctx->nr_user_files)) 6326 return NULL; 6327 fd = array_index_nospec(fd, ctx->nr_user_files); 6328 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; 6329 file = (struct file *) (file_ptr & FFS_MASK); 6330 file_ptr &= ~FFS_MASK; 6331 /* mask in overlapping REQ_F and FFS bits */ 6332 req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT); 6333 io_req_set_rsrc_node(req); 6334 } else { 6335 trace_io_uring_file_get(ctx, fd); 6336 file = __io_file_get(state, fd); 6337 6338 /* we don't allow fixed io_uring files */ 6339 if (file && unlikely(file->f_op == &io_uring_fops)) 6340 io_req_track_inflight(req); 6341 } 6342 6343 return file; 6344} 6345 6346static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) 6347{ 6348 struct io_timeout_data *data = container_of(timer, 6349 struct io_timeout_data, timer); 6350 struct io_kiocb *prev, *req = data->req; 6351 struct io_ring_ctx *ctx = req->ctx; 6352 unsigned long flags; 6353 6354 spin_lock_irqsave(&ctx->completion_lock, flags); 6355 prev = req->timeout.head; 6356 req->timeout.head = NULL; 6357 6358 /* 6359 * We don't expect the list to be empty, that will only happen if we 6360 * race with the completion of the linked work. 
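	 * If the linked request is still around it is cancelled with -ETIME;
	 * otherwise the timeout request itself is completed with -ETIME below.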
6361 */ 6362 if (prev) { 6363 io_remove_next_linked(prev); 6364 if (!req_ref_inc_not_zero(prev)) 6365 prev = NULL; 6366 } 6367 spin_unlock_irqrestore(&ctx->completion_lock, flags); 6368 6369 if (prev) { 6370 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); 6371 io_put_req_deferred(prev, 1); 6372 io_put_req_deferred(req, 1); 6373 } else { 6374 io_req_complete_post(req, -ETIME, 0); 6375 } 6376 return HRTIMER_NORESTART; 6377} 6378 6379static void io_queue_linked_timeout(struct io_kiocb *req) 6380{ 6381 struct io_ring_ctx *ctx = req->ctx; 6382 6383 spin_lock_irq(&ctx->completion_lock); 6384 /* 6385 * If the back reference is NULL, then our linked request finished 6386 * before we got a chance to setup the timer 6387 */ 6388 if (req->timeout.head) { 6389 struct io_timeout_data *data = req->async_data; 6390 6391 data->timer.function = io_link_timeout_fn; 6392 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), 6393 data->mode); 6394 } 6395 spin_unlock_irq(&ctx->completion_lock); 6396 /* drop submission reference */ 6397 io_put_req(req); 6398} 6399 6400static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) 6401{ 6402 struct io_kiocb *nxt = req->link; 6403 6404 if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) || 6405 nxt->opcode != IORING_OP_LINK_TIMEOUT) 6406 return NULL; 6407 6408 nxt->timeout.head = req; 6409 nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; 6410 req->flags |= REQ_F_LINK_TIMEOUT; 6411 return nxt; 6412} 6413 6414static void __io_queue_sqe(struct io_kiocb *req) 6415{ 6416 struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); 6417 int ret; 6418 6419 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); 6420 6421 /* 6422 * We async punt it if the file wasn't marked NOWAIT, or if the file 6423 * doesn't support non-blocking read/write attempts 6424 */ 6425 if (likely(!ret)) { 6426 /* drop submission reference */ 6427 if (req->flags & REQ_F_COMPLETE_INLINE) { 6428 struct io_ring_ctx *ctx = req->ctx; 6429 struct io_comp_state *cs = &ctx->submit_state.comp; 6430 6431 cs->reqs[cs->nr++] = req; 6432 if (cs->nr == ARRAY_SIZE(cs->reqs)) 6433 io_submit_flush_completions(cs, ctx); 6434 } else { 6435 io_put_req(req); 6436 } 6437 } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { 6438 if (!io_arm_poll_handler(req)) { 6439 /* 6440 * Queued up for async execution, worker will release 6441 * submit reference when the iocb is actually submitted. 6442 */ 6443 io_queue_async_work(req); 6444 } 6445 } else { 6446 io_req_complete_failed(req, ret); 6447 } 6448 if (linked_timeout) 6449 io_queue_linked_timeout(linked_timeout); 6450} 6451 6452static void io_queue_sqe(struct io_kiocb *req) 6453{ 6454 int ret; 6455 6456 ret = io_req_defer(req); 6457 if (ret) { 6458 if (ret != -EIOCBQUEUED) { 6459fail_req: 6460 io_req_complete_failed(req, ret); 6461 } 6462 } else if (req->flags & REQ_F_FORCE_ASYNC) { 6463 ret = io_req_prep_async(req); 6464 if (unlikely(ret)) 6465 goto fail_req; 6466 io_queue_async_work(req); 6467 } else { 6468 __io_queue_sqe(req); 6469 } 6470} 6471 6472/* 6473 * Check SQE restrictions (opcode and flags). 6474 * 6475 * Returns 'true' if SQE is allowed, 'false' otherwise. 
6476 */ 6477static inline bool io_check_restriction(struct io_ring_ctx *ctx, 6478 struct io_kiocb *req, 6479 unsigned int sqe_flags) 6480{ 6481 if (!ctx->restricted) 6482 return true; 6483 6484 if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) 6485 return false; 6486 6487 if ((sqe_flags & ctx->restrictions.sqe_flags_required) != 6488 ctx->restrictions.sqe_flags_required) 6489 return false; 6490 6491 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | 6492 ctx->restrictions.sqe_flags_required)) 6493 return false; 6494 6495 return true; 6496} 6497 6498static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, 6499 const struct io_uring_sqe *sqe) 6500{ 6501 struct io_submit_state *state; 6502 unsigned int sqe_flags; 6503 int personality, ret = 0; 6504 6505 req->opcode = READ_ONCE(sqe->opcode); 6506 /* same numerical values with corresponding REQ_F_*, safe to copy */ 6507 req->flags = sqe_flags = READ_ONCE(sqe->flags); 6508 req->user_data = READ_ONCE(sqe->user_data); 6509 req->async_data = NULL; 6510 req->file = NULL; 6511 req->ctx = ctx; 6512 req->link = NULL; 6513 req->fixed_rsrc_refs = NULL; 6514 /* one is dropped after submission, the other at completion */ 6515 atomic_set(&req->refs, 2); 6516 req->task = current; 6517 req->result = 0; 6518 req->work.creds = NULL; 6519 6520 /* enforce forwards compatibility on users */ 6521 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) 6522 return -EINVAL; 6523 if (unlikely(req->opcode >= IORING_OP_LAST)) 6524 return -EINVAL; 6525 if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) 6526 return -EACCES; 6527 6528 if ((sqe_flags & IOSQE_BUFFER_SELECT) && 6529 !io_op_defs[req->opcode].buffer_select) 6530 return -EOPNOTSUPP; 6531 6532 personality = READ_ONCE(sqe->personality); 6533 if (personality) { 6534 req->work.creds = xa_load(&ctx->personalities, personality); 6535 if (!req->work.creds) 6536 return -EINVAL; 6537 get_cred(req->work.creds); 6538 } 6539 state = &ctx->submit_state; 6540 6541 /* 6542 * Plug now if we have more than 1 IO left after this, and the target 6543 * is potentially a read/write to block based storage. 6544 */ 6545 if (!state->plug_started && state->ios_left > 1 && 6546 io_op_defs[req->opcode].plug) { 6547 blk_start_plug(&state->plug); 6548 state->plug_started = true; 6549 } 6550 6551 if (io_op_defs[req->opcode].needs_file) { 6552 bool fixed = req->flags & REQ_F_FIXED_FILE; 6553 6554 req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); 6555 if (unlikely(!req->file)) 6556 ret = -EBADF; 6557 } 6558 6559 state->ios_left--; 6560 return ret; 6561} 6562 6563static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 6564 const struct io_uring_sqe *sqe) 6565{ 6566 struct io_submit_link *link = &ctx->submit_state.link; 6567 int ret; 6568 6569 ret = io_init_req(ctx, req, sqe); 6570 if (unlikely(ret)) { 6571fail_req: 6572 if (link->head) { 6573 /* fail even hard links since we don't submit */ 6574 link->head->flags |= REQ_F_FAIL_LINK; 6575 io_req_complete_failed(link->head, -ECANCELED); 6576 link->head = NULL; 6577 } 6578 io_req_complete_failed(req, ret); 6579 return ret; 6580 } 6581 ret = io_req_prep(req, sqe); 6582 if (unlikely(ret)) 6583 goto fail_req; 6584 6585 /* don't need @sqe from now on */ 6586 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, 6587 true, ctx->flags & IORING_SETUP_SQPOLL); 6588 6589 /* 6590 * If we already have a head request, queue this one for async 6591 * submittal once the head completes. 
If we don't have a head but
	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
	 * submitted sync once the chain is complete. If none of those
	 * conditions are true (normal request), then just queue it.
	 */
	if (link->head) {
		struct io_kiocb *head = link->head;

		/*
		 * Taking sequential execution of a link, draining both sides
		 * of the link also fulfills IOSQE_IO_DRAIN semantics for all
		 * requests in the link. So, it drains the head and the
		 * next after the link request. The last one is done via
		 * drain_next flag to persist the effect across calls.
		 */
		if (req->flags & REQ_F_IO_DRAIN) {
			head->flags |= REQ_F_IO_DRAIN;
			ctx->drain_next = 1;
		}
		ret = io_req_prep_async(req);
		if (unlikely(ret))
			goto fail_req;
		trace_io_uring_link(ctx, req, head);
		link->last->link = req;
		link->last = req;

		/* last request of a link, enqueue the link */
		if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
			io_queue_sqe(head);
			link->head = NULL;
		}
	} else {
		if (unlikely(ctx->drain_next)) {
			req->flags |= REQ_F_IO_DRAIN;
			ctx->drain_next = 0;
		}
		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
			link->head = req;
			link->last = req;
		} else {
			io_queue_sqe(req);
		}
	}

	return 0;
}

/*
 * Batched submission is done, ensure local IO is flushed out.
 */
static void io_submit_state_end(struct io_submit_state *state,
				struct io_ring_ctx *ctx)
{
	if (state->link.head)
		io_queue_sqe(state->link.head);
	if (state->comp.nr)
		io_submit_flush_completions(&state->comp, ctx);
	if (state->plug_started)
		blk_finish_plug(&state->plug);
	io_state_file_put(state);
}

/*
 * Start submission side cache.
 */
static void io_submit_state_start(struct io_submit_state *state,
				  unsigned int max_ios)
{
	state->plug_started = false;
	state->ios_left = max_ios;
	/* set only head, no need to init link_last in advance */
	state->link.head = NULL;
}

static void io_commit_sqring(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	/*
	 * Ensure any loads from the SQEs are done at this point,
	 * since once we write the new head, the application could
	 * write new data to them.
	 */
	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}

/*
 * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
 * that is mapped by userspace. This means that care needs to be taken to
 * ensure that reads are stable, as we cannot rely on userspace always
 * being a good citizen. If members of the sqe are validated and then later
 * used, it's important that those reads are done through READ_ONCE() to
 * prevent a re-load down the line.
 */
static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
	u32 *sq_array = ctx->sq_array;
	unsigned head;

	/*
	 * The cached sq head (or cq tail) serves two purposes:
	 *
	 * 1) allows us to batch the cost of updating the user visible
	 *    head.
	 * 2) allows the kernel side to track the head on its own, even
	 *    though the application is the one updating it.
6697 */ 6698 head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]); 6699 if (likely(head < ctx->sq_entries)) 6700 return &ctx->sq_sqes[head]; 6701 6702 /* drop invalid entries */ 6703 ctx->cached_sq_dropped++; 6704 WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped); 6705 return NULL; 6706} 6707 6708static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) 6709{ 6710 int submitted = 0; 6711 6712 /* make sure SQ entry isn't read before tail */ 6713 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); 6714 6715 if (!percpu_ref_tryget_many(&ctx->refs, nr)) 6716 return -EAGAIN; 6717 6718 percpu_counter_add(&current->io_uring->inflight, nr); 6719 refcount_add(nr, &current->usage); 6720 io_submit_state_start(&ctx->submit_state, nr); 6721 6722 while (submitted < nr) { 6723 const struct io_uring_sqe *sqe; 6724 struct io_kiocb *req; 6725 6726 req = io_alloc_req(ctx); 6727 if (unlikely(!req)) { 6728 if (!submitted) 6729 submitted = -EAGAIN; 6730 break; 6731 } 6732 sqe = io_get_sqe(ctx); 6733 if (unlikely(!sqe)) { 6734 kmem_cache_free(req_cachep, req); 6735 break; 6736 } 6737 /* will complete beyond this point, count as submitted */ 6738 submitted++; 6739 if (io_submit_sqe(ctx, req, sqe)) 6740 break; 6741 } 6742 6743 if (unlikely(submitted != nr)) { 6744 int ref_used = (submitted == -EAGAIN) ? 0 : submitted; 6745 struct io_uring_task *tctx = current->io_uring; 6746 int unused = nr - ref_used; 6747 6748 percpu_ref_put_many(&ctx->refs, unused); 6749 percpu_counter_sub(&tctx->inflight, unused); 6750 put_task_struct_many(current, unused); 6751 } 6752 6753 io_submit_state_end(&ctx->submit_state, ctx); 6754 /* Commit SQ ring head once we've consumed and submitted all SQEs */ 6755 io_commit_sqring(ctx); 6756 6757 return submitted; 6758} 6759 6760static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) 6761{ 6762 /* Tell userspace we may need a wakeup call */ 6763 spin_lock_irq(&ctx->completion_lock); 6764 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; 6765 spin_unlock_irq(&ctx->completion_lock); 6766} 6767 6768static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) 6769{ 6770 spin_lock_irq(&ctx->completion_lock); 6771 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; 6772 spin_unlock_irq(&ctx->completion_lock); 6773} 6774 6775static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) 6776{ 6777 unsigned int to_submit; 6778 int ret = 0; 6779 6780 to_submit = io_sqring_entries(ctx); 6781 /* if we're handling multiple rings, cap submit size for fairness */ 6782 if (cap_entries && to_submit > 8) 6783 to_submit = 8; 6784 6785 if (!list_empty(&ctx->iopoll_list) || to_submit) { 6786 unsigned nr_events = 0; 6787 6788 mutex_lock(&ctx->uring_lock); 6789 if (!list_empty(&ctx->iopoll_list)) 6790 io_do_iopoll(ctx, &nr_events, 0); 6791 6792 /* 6793 * Don't submit if refs are dying, good for io_uring_register(), 6794 * but also it is relied upon by io_ring_exit_work() 6795 */ 6796 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && 6797 !(ctx->flags & IORING_SETUP_R_DISABLED)) 6798 ret = io_submit_sqes(ctx, to_submit); 6799 mutex_unlock(&ctx->uring_lock); 6800 } 6801 6802 if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) 6803 wake_up(&ctx->sqo_sq_wait); 6804 6805 return ret; 6806} 6807 6808static void io_sqd_update_thread_idle(struct io_sq_data *sqd) 6809{ 6810 struct io_ring_ctx *ctx; 6811 unsigned sq_thread_idle = 0; 6812 6813 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 6814 sq_thread_idle = max(sq_thread_idle, 
ctx->sq_thread_idle);
	sqd->sq_thread_idle = sq_thread_idle;
}

static int io_sq_thread(void *data)
{
	struct io_sq_data *sqd = data;
	struct io_ring_ctx *ctx;
	unsigned long timeout = 0;
	char buf[TASK_COMM_LEN];
	DEFINE_WAIT(wait);

	snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
	set_task_comm(current, buf);

	if (sqd->sq_cpu != -1)
		set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
	else
		set_cpus_allowed_ptr(current, cpu_online_mask);
	current->flags |= PF_NO_SETAFFINITY;

	mutex_lock(&sqd->lock);
	/* a user may have exited before the thread started */
	io_run_task_work_head(&sqd->park_task_work);

	while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) {
		int ret;
		bool cap_entries, sqt_spin, needs_sched;

		if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
		    signal_pending(current)) {
			bool did_sig = false;

			mutex_unlock(&sqd->lock);
			if (signal_pending(current)) {
				struct ksignal ksig;

				did_sig = get_signal(&ksig);
			}
			cond_resched();
			mutex_lock(&sqd->lock);
			io_run_task_work();
			io_run_task_work_head(&sqd->park_task_work);
			if (did_sig)
				break;
			timeout = jiffies + sqd->sq_thread_idle;
			continue;
		}
		sqt_spin = false;
		cap_entries = !list_is_singular(&sqd->ctx_list);
		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
			const struct cred *creds = NULL;

			if (ctx->sq_creds != current_cred())
				creds = override_creds(ctx->sq_creds);
			ret = __io_sq_thread(ctx, cap_entries);
			if (creds)
				revert_creds(creds);
			if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
				sqt_spin = true;
		}

		if (sqt_spin || !time_after(jiffies, timeout)) {
			io_run_task_work();
			cond_resched();
			if (sqt_spin)
				timeout = jiffies + sqd->sq_thread_idle;
			continue;
		}

		prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
		if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_ring_set_wakeup_flag(ctx);

			needs_sched = true;
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
				if ((ctx->flags & IORING_SETUP_IOPOLL) &&
				    !list_empty_careful(&ctx->iopoll_list)) {
					needs_sched = false;
					break;
				}
				if (io_sqring_entries(ctx)) {
					needs_sched = false;
					break;
				}
			}

			if (needs_sched) {
				mutex_unlock(&sqd->lock);
				schedule();
				mutex_lock(&sqd->lock);
			}
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_ring_clear_wakeup_flag(ctx);
		}

		finish_wait(&sqd->wait, &wait);
		io_run_task_work_head(&sqd->park_task_work);
		timeout = jiffies + sqd->sq_thread_idle;
	}

	io_uring_cancel_sqpoll(sqd);
	sqd->thread = NULL;
	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
		io_ring_set_wakeup_flag(ctx);
	io_run_task_work();
	io_run_task_work_head(&sqd->park_task_work);
	mutex_unlock(&sqd->lock);

	complete(&sqd->exited);
	do_exit(0);
}

struct io_wait_queue {
	struct wait_queue_entry wq;
	struct io_ring_ctx *ctx;
	unsigned to_wait;
	unsigned nr_timeouts;
};

static inline bool io_should_wake(struct io_wait_queue *iowq)
{
	struct io_ring_ctx *ctx = iowq->ctx;

	/*
	 * Wake up if we have enough events, or if a timeout occurred since we
started waiting. For timeouts, we always want to return to userspace, 6942 * regardless of event count. 6943 */ 6944 return io_cqring_events(ctx) >= iowq->to_wait || 6945 atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; 6946} 6947 6948static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, 6949 int wake_flags, void *key) 6950{ 6951 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, 6952 wq); 6953 6954 /* 6955 * Cannot safely flush overflowed CQEs from here, ensure we wake up 6956 * the task, and the next invocation will do it. 6957 */ 6958 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow)) 6959 return autoremove_wake_function(curr, mode, wake_flags, key); 6960 return -1; 6961} 6962 6963static int io_run_task_work_sig(void) 6964{ 6965 if (io_run_task_work()) 6966 return 1; 6967 if (!signal_pending(current)) 6968 return 0; 6969 if (test_thread_flag(TIF_NOTIFY_SIGNAL)) 6970 return -ERESTARTSYS; 6971 return -EINTR; 6972} 6973 6974/* when returns >0, the caller should retry */ 6975static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 6976 struct io_wait_queue *iowq, 6977 signed long *timeout) 6978{ 6979 int ret; 6980 6981 /* make sure we run task_work before checking for signals */ 6982 ret = io_run_task_work_sig(); 6983 if (ret || io_should_wake(iowq)) 6984 return ret; 6985 /* let the caller flush overflows, retry */ 6986 if (test_bit(0, &ctx->cq_check_overflow)) 6987 return 1; 6988 6989 *timeout = schedule_timeout(*timeout); 6990 return !*timeout ? -ETIME : 1; 6991} 6992 6993/* 6994 * Wait until events become available, if we don't already have some. The 6995 * application must reap them itself, as they reside on the shared cq ring. 6996 */ 6997static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, 6998 const sigset_t __user *sig, size_t sigsz, 6999 struct __kernel_timespec __user *uts) 7000{ 7001 struct io_wait_queue iowq = { 7002 .wq = { 7003 .private = current, 7004 .func = io_wake_function, 7005 .entry = LIST_HEAD_INIT(iowq.wq.entry), 7006 }, 7007 .ctx = ctx, 7008 .to_wait = min_events, 7009 }; 7010 struct io_rings *rings = ctx->rings; 7011 signed long timeout = MAX_SCHEDULE_TIMEOUT; 7012 int ret; 7013 7014 do { 7015 io_cqring_overflow_flush(ctx, false); 7016 if (io_cqring_events(ctx) >= min_events) 7017 return 0; 7018 if (!io_run_task_work()) 7019 break; 7020 } while (1); 7021 7022 if (sig) { 7023#ifdef CONFIG_COMPAT 7024 if (in_compat_syscall()) 7025 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, 7026 sigsz); 7027 else 7028#endif 7029 ret = set_user_sigmask(sig, sigsz); 7030 7031 if (ret) 7032 return ret; 7033 } 7034 7035 if (uts) { 7036 struct timespec64 ts; 7037 7038 if (get_timespec64(&ts, uts)) 7039 return -EFAULT; 7040 timeout = timespec64_to_jiffies(&ts); 7041 } 7042 7043 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 7044 trace_io_uring_cqring_wait(ctx, min_events); 7045 do { 7046 /* if we can't even flush overflow, don't wait for more */ 7047 if (!io_cqring_overflow_flush(ctx, false)) { 7048 ret = -EBUSY; 7049 break; 7050 } 7051 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, 7052 TASK_INTERRUPTIBLE); 7053 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); 7054 finish_wait(&ctx->wait, &iowq.wq); 7055 cond_resched(); 7056 } while (ret > 0); 7057 7058 restore_saved_sigmask_unless(ret == -EINTR); 7059 7060 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; 7061} 7062 7063static void io_free_file_tables(struct io_file_table *table, unsigned nr_files) 7064{ 7065 unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE); 7066 7067 for (i = 0; i < nr_tables; i++) 7068 kfree(table->files[i]); 7069 kfree(table->files); 7070 table->files = NULL; 7071} 7072 7073static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx) 7074{ 7075 spin_lock_bh(&ctx->rsrc_ref_lock); 7076} 7077 7078static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx) 7079{ 7080 spin_unlock_bh(&ctx->rsrc_ref_lock); 7081} 7082 7083static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) 7084{ 7085 percpu_ref_exit(&ref_node->refs); 7086 kfree(ref_node); 7087} 7088 7089static void io_rsrc_node_switch(struct io_ring_ctx *ctx, 7090 struct io_rsrc_data *data_to_kill) 7091{ 7092 WARN_ON_ONCE(!ctx->rsrc_backup_node); 7093 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); 7094 7095 if (data_to_kill) { 7096 struct io_rsrc_node *rsrc_node = ctx->rsrc_node; 7097 7098 rsrc_node->rsrc_data = data_to_kill; 7099 io_rsrc_ref_lock(ctx); 7100 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); 7101 io_rsrc_ref_unlock(ctx); 7102 7103 atomic_inc(&data_to_kill->refs); 7104 percpu_ref_kill(&rsrc_node->refs); 7105 ctx->rsrc_node = NULL; 7106 } 7107 7108 if (!ctx->rsrc_node) { 7109 ctx->rsrc_node = ctx->rsrc_backup_node; 7110 ctx->rsrc_backup_node = NULL; 7111 } 7112} 7113 7114static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) 7115{ 7116 if (ctx->rsrc_backup_node) 7117 return 0; 7118 ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); 7119 return ctx->rsrc_backup_node ? 0 : -ENOMEM; 7120} 7121 7122static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) 7123{ 7124 int ret; 7125 7126 /* As we may drop ->uring_lock, other task may have started quiesce */ 7127 if (data->quiesce) 7128 return -ENXIO; 7129 7130 data->quiesce = true; 7131 do { 7132 ret = io_rsrc_node_switch_start(ctx); 7133 if (ret) 7134 break; 7135 io_rsrc_node_switch(ctx, data); 7136 7137 /* kill initial ref, already quiesced if zero */ 7138 if (atomic_dec_and_test(&data->refs)) 7139 break; 7140 flush_delayed_work(&ctx->rsrc_put_work); 7141 ret = wait_for_completion_interruptible(&data->done); 7142 if (!ret) 7143 break; 7144 7145 atomic_inc(&data->refs); 7146 /* wait for all works potentially completing data->done */ 7147 flush_delayed_work(&ctx->rsrc_put_work); 7148 reinit_completion(&data->done); 7149 7150 mutex_unlock(&ctx->uring_lock); 7151 ret = io_run_task_work_sig(); 7152 mutex_lock(&ctx->uring_lock); 7153 } while (ret >= 0); 7154 data->quiesce = false; 7155 7156 return ret; 7157} 7158 7159static void io_rsrc_data_free(struct io_rsrc_data *data) 7160{ 7161 kvfree(data->tags); 7162 kfree(data); 7163} 7164 7165static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx, 7166 rsrc_put_fn *do_put, 7167 unsigned nr) 7168{ 7169 struct io_rsrc_data *data; 7170 7171 data = kzalloc(sizeof(*data), GFP_KERNEL); 7172 if (!data) 7173 return NULL; 7174 7175 data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL); 7176 if (!data->tags) { 7177 kfree(data); 7178 return NULL; 7179 } 7180 7181 atomic_set(&data->refs, 1); 7182 data->ctx = ctx; 7183 data->do_put = do_put; 7184 init_completion(&data->done); 7185 return data; 7186} 7187 7188static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) 7189{ 7190#if defined(CONFIG_UNIX) 7191 if (ctx->ring_sock) { 7192 struct sock *sock = ctx->ring_sock->sk; 7193 struct sk_buff *skb; 7194 7195 while ((skb = 
skb_dequeue(&sock->sk_receive_queue)) != NULL) 7196 kfree_skb(skb); 7197 } 7198#else 7199 int i; 7200 7201 for (i = 0; i < ctx->nr_user_files; i++) { 7202 struct file *file; 7203 7204 file = io_file_from_index(ctx, i); 7205 if (file) 7206 fput(file); 7207 } 7208#endif 7209 io_free_file_tables(&ctx->file_table, ctx->nr_user_files); 7210 io_rsrc_data_free(ctx->file_data); 7211 ctx->file_data = NULL; 7212 ctx->nr_user_files = 0; 7213} 7214 7215static int io_sqe_files_unregister(struct io_ring_ctx *ctx) 7216{ 7217 int ret; 7218 7219 if (!ctx->file_data) 7220 return -ENXIO; 7221 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); 7222 if (!ret) 7223 __io_sqe_files_unregister(ctx); 7224 return ret; 7225} 7226 7227static void io_sq_thread_unpark(struct io_sq_data *sqd) 7228 __releases(&sqd->lock) 7229{ 7230 WARN_ON_ONCE(sqd->thread == current); 7231 7232 /* 7233 * Do the dance but not conditional clear_bit() because it'd race with 7234 * other threads incrementing park_pending and setting the bit. 7235 */ 7236 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7237 if (atomic_dec_return(&sqd->park_pending)) 7238 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7239 mutex_unlock(&sqd->lock); 7240} 7241 7242static void io_sq_thread_park(struct io_sq_data *sqd) 7243 __acquires(&sqd->lock) 7244{ 7245 WARN_ON_ONCE(sqd->thread == current); 7246 7247 atomic_inc(&sqd->park_pending); 7248 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7249 mutex_lock(&sqd->lock); 7250 if (sqd->thread) 7251 wake_up_process(sqd->thread); 7252} 7253 7254static void io_sq_thread_stop(struct io_sq_data *sqd) 7255{ 7256 WARN_ON_ONCE(sqd->thread == current); 7257 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); 7258 7259 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7260 mutex_lock(&sqd->lock); 7261 if (sqd->thread) 7262 wake_up_process(sqd->thread); 7263 mutex_unlock(&sqd->lock); 7264 wait_for_completion(&sqd->exited); 7265} 7266 7267static void io_put_sq_data(struct io_sq_data *sqd) 7268{ 7269 if (refcount_dec_and_test(&sqd->refs)) { 7270 WARN_ON_ONCE(atomic_read(&sqd->park_pending)); 7271 7272 io_sq_thread_stop(sqd); 7273 kfree(sqd); 7274 } 7275} 7276 7277static void io_sq_thread_finish(struct io_ring_ctx *ctx) 7278{ 7279 struct io_sq_data *sqd = ctx->sq_data; 7280 7281 if (sqd) { 7282 io_sq_thread_park(sqd); 7283 list_del_init(&ctx->sqd_list); 7284 io_sqd_update_thread_idle(sqd); 7285 io_sq_thread_unpark(sqd); 7286 7287 io_put_sq_data(sqd); 7288 ctx->sq_data = NULL; 7289 } 7290} 7291 7292static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) 7293{ 7294 struct io_ring_ctx *ctx_attach; 7295 struct io_sq_data *sqd; 7296 struct fd f; 7297 7298 f = fdget(p->wq_fd); 7299 if (!f.file) 7300 return ERR_PTR(-ENXIO); 7301 if (f.file->f_op != &io_uring_fops) { 7302 fdput(f); 7303 return ERR_PTR(-EINVAL); 7304 } 7305 7306 ctx_attach = f.file->private_data; 7307 sqd = ctx_attach->sq_data; 7308 if (!sqd) { 7309 fdput(f); 7310 return ERR_PTR(-EINVAL); 7311 } 7312 if (sqd->task_tgid != current->tgid) { 7313 fdput(f); 7314 return ERR_PTR(-EPERM); 7315 } 7316 7317 refcount_inc(&sqd->refs); 7318 fdput(f); 7319 return sqd; 7320} 7321 7322static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, 7323 bool *attached) 7324{ 7325 struct io_sq_data *sqd; 7326 7327 *attached = false; 7328 if (p->flags & IORING_SETUP_ATTACH_WQ) { 7329 sqd = io_attach_sq_data(p); 7330 if (!IS_ERR(sqd)) { 7331 *attached = true; 7332 return sqd; 7333 } 7334 /* fall through for EPERM case, setup new sqd/task */ 7335 if (PTR_ERR(sqd) != 
-EPERM) 7336 return sqd; 7337 } 7338 7339 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); 7340 if (!sqd) 7341 return ERR_PTR(-ENOMEM); 7342 7343 atomic_set(&sqd->park_pending, 0); 7344 refcount_set(&sqd->refs, 1); 7345 INIT_LIST_HEAD(&sqd->ctx_list); 7346 mutex_init(&sqd->lock); 7347 init_waitqueue_head(&sqd->wait); 7348 init_completion(&sqd->exited); 7349 return sqd; 7350} 7351 7352#if defined(CONFIG_UNIX) 7353/* 7354 * Ensure the UNIX gc is aware of our file set, so we are certain that 7355 * the io_uring can be safely unregistered on process exit, even if we have 7356 * loops in the file referencing. 7357 */ 7358static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) 7359{ 7360 struct sock *sk = ctx->ring_sock->sk; 7361 struct scm_fp_list *fpl; 7362 struct sk_buff *skb; 7363 int i, nr_files; 7364 7365 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); 7366 if (!fpl) 7367 return -ENOMEM; 7368 7369 skb = alloc_skb(0, GFP_KERNEL); 7370 if (!skb) { 7371 kfree(fpl); 7372 return -ENOMEM; 7373 } 7374 7375 skb->sk = sk; 7376 7377 nr_files = 0; 7378 fpl->user = get_uid(current_user()); 7379 for (i = 0; i < nr; i++) { 7380 struct file *file = io_file_from_index(ctx, i + offset); 7381 7382 if (!file) 7383 continue; 7384 fpl->fp[nr_files] = get_file(file); 7385 unix_inflight(fpl->user, fpl->fp[nr_files]); 7386 nr_files++; 7387 } 7388 7389 if (nr_files) { 7390 fpl->max = SCM_MAX_FD; 7391 fpl->count = nr_files; 7392 UNIXCB(skb).fp = fpl; 7393 skb->destructor = unix_destruct_scm; 7394 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 7395 skb_queue_head(&sk->sk_receive_queue, skb); 7396 7397 for (i = 0; i < nr_files; i++) 7398 fput(fpl->fp[i]); 7399 } else { 7400 kfree_skb(skb); 7401 kfree(fpl); 7402 } 7403 7404 return 0; 7405} 7406 7407/* 7408 * If UNIX sockets are enabled, fd passing can cause a reference cycle which 7409 * causes regular reference counting to break down. We rely on the UNIX 7410 * garbage collection to take care of this problem for us. 
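 * Registered files are attached to the ring socket's receive queue as
 * SCM_RIGHTS skbs, in batches of up to SCM_MAX_FD files (see
 * __io_sqe_files_scm() above), which is how the gc gets to see them.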
7411 */ 7412static int io_sqe_files_scm(struct io_ring_ctx *ctx) 7413{ 7414 unsigned left, total; 7415 int ret = 0; 7416 7417 total = 0; 7418 left = ctx->nr_user_files; 7419 while (left) { 7420 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); 7421 7422 ret = __io_sqe_files_scm(ctx, this_files, total); 7423 if (ret) 7424 break; 7425 left -= this_files; 7426 total += this_files; 7427 } 7428 7429 if (!ret) 7430 return 0; 7431 7432 while (total < ctx->nr_user_files) { 7433 struct file *file = io_file_from_index(ctx, total); 7434 7435 if (file) 7436 fput(file); 7437 total++; 7438 } 7439 7440 return ret; 7441} 7442#else 7443static int io_sqe_files_scm(struct io_ring_ctx *ctx) 7444{ 7445 return 0; 7446} 7447#endif 7448 7449static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) 7450{ 7451 unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE); 7452 7453 table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL); 7454 if (!table->files) 7455 return false; 7456 7457 for (i = 0; i < nr_tables; i++) { 7458 unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE); 7459 7460 table->files[i] = kcalloc(this_files, sizeof(*table->files[i]), 7461 GFP_KERNEL); 7462 if (!table->files[i]) 7463 break; 7464 nr_files -= this_files; 7465 } 7466 7467 if (i == nr_tables) 7468 return true; 7469 7470 io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE); 7471 return false; 7472} 7473 7474static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 7475{ 7476 struct file *file = prsrc->file; 7477#if defined(CONFIG_UNIX) 7478 struct sock *sock = ctx->ring_sock->sk; 7479 struct sk_buff_head list, *head = &sock->sk_receive_queue; 7480 struct sk_buff *skb; 7481 int i; 7482 7483 __skb_queue_head_init(&list); 7484 7485 /* 7486 * Find the skb that holds this file in its SCM_RIGHTS. When found, 7487 * remove this entry and rearrange the file array. 
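	 * If that leaves the skb without any files it is freed, otherwise it
	 * is put back on the receive queue.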
7488 */ 7489 skb = skb_dequeue(head); 7490 while (skb) { 7491 struct scm_fp_list *fp; 7492 7493 fp = UNIXCB(skb).fp; 7494 for (i = 0; i < fp->count; i++) { 7495 int left; 7496 7497 if (fp->fp[i] != file) 7498 continue; 7499 7500 unix_notinflight(fp->user, fp->fp[i]); 7501 left = fp->count - 1 - i; 7502 if (left) { 7503 memmove(&fp->fp[i], &fp->fp[i + 1], 7504 left * sizeof(struct file *)); 7505 } 7506 fp->count--; 7507 if (!fp->count) { 7508 kfree_skb(skb); 7509 skb = NULL; 7510 } else { 7511 __skb_queue_tail(&list, skb); 7512 } 7513 fput(file); 7514 file = NULL; 7515 break; 7516 } 7517 7518 if (!file) 7519 break; 7520 7521 __skb_queue_tail(&list, skb); 7522 7523 skb = skb_dequeue(head); 7524 } 7525 7526 if (skb_peek(&list)) { 7527 spin_lock_irq(&head->lock); 7528 while ((skb = __skb_dequeue(&list)) != NULL) 7529 __skb_queue_tail(head, skb); 7530 spin_unlock_irq(&head->lock); 7531 } 7532#else 7533 fput(file); 7534#endif 7535} 7536 7537static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) 7538{ 7539 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; 7540 struct io_ring_ctx *ctx = rsrc_data->ctx; 7541 struct io_rsrc_put *prsrc, *tmp; 7542 7543 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { 7544 list_del(&prsrc->list); 7545 7546 if (prsrc->tag) { 7547 bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; 7548 unsigned long flags; 7549 7550 io_ring_submit_lock(ctx, lock_ring); 7551 spin_lock_irqsave(&ctx->completion_lock, flags); 7552 io_cqring_fill_event(ctx, prsrc->tag, 0, 0); 7553 ctx->cq_extra++; 7554 io_commit_cqring(ctx); 7555 spin_unlock_irqrestore(&ctx->completion_lock, flags); 7556 io_cqring_ev_posted(ctx); 7557 io_ring_submit_unlock(ctx, lock_ring); 7558 } 7559 7560 rsrc_data->do_put(ctx, prsrc); 7561 kfree(prsrc); 7562 } 7563 7564 io_rsrc_node_destroy(ref_node); 7565 if (atomic_dec_and_test(&rsrc_data->refs)) 7566 complete(&rsrc_data->done); 7567} 7568 7569static void io_rsrc_put_work(struct work_struct *work) 7570{ 7571 struct io_ring_ctx *ctx; 7572 struct llist_node *node; 7573 7574 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); 7575 node = llist_del_all(&ctx->rsrc_put_llist); 7576 7577 while (node) { 7578 struct io_rsrc_node *ref_node; 7579 struct llist_node *next = node->next; 7580 7581 ref_node = llist_entry(node, struct io_rsrc_node, llist); 7582 __io_rsrc_put_work(ref_node); 7583 node = next; 7584 } 7585} 7586 7587static void io_rsrc_node_ref_zero(struct percpu_ref *ref) 7588{ 7589 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); 7590 struct io_ring_ctx *ctx = node->rsrc_data->ctx; 7591 bool first_add = false; 7592 7593 io_rsrc_ref_lock(ctx); 7594 node->done = true; 7595 7596 while (!list_empty(&ctx->rsrc_ref_list)) { 7597 node = list_first_entry(&ctx->rsrc_ref_list, 7598 struct io_rsrc_node, node); 7599 /* recycle ref nodes in order */ 7600 if (!node->done) 7601 break; 7602 list_del(&node->node); 7603 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); 7604 } 7605 io_rsrc_ref_unlock(ctx); 7606 7607 if (first_add) 7608 mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); 7609} 7610 7611static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) 7612{ 7613 struct io_rsrc_node *ref_node; 7614 7615 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); 7616 if (!ref_node) 7617 return NULL; 7618 7619 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, 7620 0, GFP_KERNEL)) { 7621 kfree(ref_node); 7622 return NULL; 7623 } 7624 INIT_LIST_HEAD(&ref_node->node); 7625 
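	/*
	 * ->node links the node into ctx->rsrc_ref_list so refs can be
	 * recycled in order (see io_rsrc_node_ref_zero()), while ->rsrc_list
	 * collects the io_rsrc_put entries queued via io_queue_rsrc_removal().
	 */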
INIT_LIST_HEAD(&ref_node->rsrc_list); 7626 ref_node->done = false; 7627 return ref_node; 7628} 7629 7630static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, 7631 unsigned nr_args, u64 __user *tags) 7632{ 7633 __s32 __user *fds = (__s32 __user *) arg; 7634 struct file *file; 7635 int fd, ret; 7636 unsigned i; 7637 struct io_rsrc_data *file_data; 7638 7639 if (ctx->file_data) 7640 return -EBUSY; 7641 if (!nr_args) 7642 return -EINVAL; 7643 if (nr_args > IORING_MAX_FIXED_FILES) 7644 return -EMFILE; 7645 ret = io_rsrc_node_switch_start(ctx); 7646 if (ret) 7647 return ret; 7648 7649 file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args); 7650 if (!file_data) 7651 return -ENOMEM; 7652 ctx->file_data = file_data; 7653 ret = -ENOMEM; 7654 if (!io_alloc_file_tables(&ctx->file_table, nr_args)) 7655 goto out_free; 7656 7657 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { 7658 u64 tag = 0; 7659 7660 if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) || 7661 copy_from_user(&fd, &fds[i], sizeof(fd))) { 7662 ret = -EFAULT; 7663 goto out_fput; 7664 } 7665 /* allow sparse sets */ 7666 if (fd == -1) { 7667 ret = -EINVAL; 7668 if (unlikely(tag)) 7669 goto out_fput; 7670 continue; 7671 } 7672 7673 file = fget(fd); 7674 ret = -EBADF; 7675 if (unlikely(!file)) 7676 goto out_fput; 7677 7678 /* 7679 * Don't allow io_uring instances to be registered. If UNIX 7680 * isn't enabled, then this causes a reference cycle and this 7681 * instance can never get freed. If UNIX is enabled we'll 7682 * handle it just fine, but there's still no point in allowing 7683 * a ring fd as it doesn't support regular read/write anyway. 7684 */ 7685 if (file->f_op == &io_uring_fops) { 7686 fput(file); 7687 goto out_fput; 7688 } 7689 ctx->file_data->tags[i] = tag; 7690 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); 7691 } 7692 7693 ret = io_sqe_files_scm(ctx); 7694 if (ret) { 7695 __io_sqe_files_unregister(ctx); 7696 return ret; 7697 } 7698 7699 io_rsrc_node_switch(ctx, NULL); 7700 return ret; 7701out_fput: 7702 for (i = 0; i < ctx->nr_user_files; i++) { 7703 file = io_file_from_index(ctx, i); 7704 if (file) 7705 fput(file); 7706 } 7707 io_free_file_tables(&ctx->file_table, nr_args); 7708 ctx->nr_user_files = 0; 7709out_free: 7710 io_rsrc_data_free(ctx->file_data); 7711 ctx->file_data = NULL; 7712 return ret; 7713} 7714 7715static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, 7716 int index) 7717{ 7718#if defined(CONFIG_UNIX) 7719 struct sock *sock = ctx->ring_sock->sk; 7720 struct sk_buff_head *head = &sock->sk_receive_queue; 7721 struct sk_buff *skb; 7722 7723 /* 7724 * See if we can merge this file into an existing skb SCM_RIGHTS 7725 * file set. If there's no room, fall back to allocating a new skb 7726 * and filling it in. 
7727 */ 7728 spin_lock_irq(&head->lock); 7729 skb = skb_peek(head); 7730 if (skb) { 7731 struct scm_fp_list *fpl = UNIXCB(skb).fp; 7732 7733 if (fpl->count < SCM_MAX_FD) { 7734 __skb_unlink(skb, head); 7735 spin_unlock_irq(&head->lock); 7736 fpl->fp[fpl->count] = get_file(file); 7737 unix_inflight(fpl->user, fpl->fp[fpl->count]); 7738 fpl->count++; 7739 spin_lock_irq(&head->lock); 7740 __skb_queue_head(head, skb); 7741 } else { 7742 skb = NULL; 7743 } 7744 } 7745 spin_unlock_irq(&head->lock); 7746 7747 if (skb) { 7748 fput(file); 7749 return 0; 7750 } 7751 7752 return __io_sqe_files_scm(ctx, 1, index); 7753#else 7754 return 0; 7755#endif 7756} 7757 7758static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, 7759 struct io_rsrc_node *node, void *rsrc) 7760{ 7761 struct io_rsrc_put *prsrc; 7762 7763 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); 7764 if (!prsrc) 7765 return -ENOMEM; 7766 7767 prsrc->tag = data->tags[idx]; 7768 prsrc->rsrc = rsrc; 7769 list_add(&prsrc->list, &node->rsrc_list); 7770 return 0; 7771} 7772 7773static int __io_sqe_files_update(struct io_ring_ctx *ctx, 7774 struct io_uring_rsrc_update2 *up, 7775 unsigned nr_args) 7776{ 7777 u64 __user *tags = u64_to_user_ptr(up->tags); 7778 __s32 __user *fds = u64_to_user_ptr(up->data); 7779 struct io_rsrc_data *data = ctx->file_data; 7780 struct io_fixed_file *file_slot; 7781 struct file *file; 7782 int fd, i, err = 0; 7783 unsigned int done; 7784 bool needs_switch = false; 7785 7786 if (!ctx->file_data) 7787 return -ENXIO; 7788 if (up->offset + nr_args > ctx->nr_user_files) 7789 return -EINVAL; 7790 7791 for (done = 0; done < nr_args; done++) { 7792 u64 tag = 0; 7793 7794 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || 7795 copy_from_user(&fd, &fds[done], sizeof(fd))) { 7796 err = -EFAULT; 7797 break; 7798 } 7799 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { 7800 err = -EINVAL; 7801 break; 7802 } 7803 if (fd == IORING_REGISTER_FILES_SKIP) 7804 continue; 7805 7806 i = array_index_nospec(up->offset + done, ctx->nr_user_files); 7807 file_slot = io_fixed_file_slot(&ctx->file_table, i); 7808 7809 if (file_slot->file_ptr) { 7810 file = (struct file *)(file_slot->file_ptr & FFS_MASK); 7811 err = io_queue_rsrc_removal(data, up->offset + done, 7812 ctx->rsrc_node, file); 7813 if (err) 7814 break; 7815 file_slot->file_ptr = 0; 7816 needs_switch = true; 7817 } 7818 if (fd != -1) { 7819 file = fget(fd); 7820 if (!file) { 7821 err = -EBADF; 7822 break; 7823 } 7824 /* 7825 * Don't allow io_uring instances to be registered. If 7826 * UNIX isn't enabled, then this causes a reference 7827 * cycle and this instance can never get freed. If UNIX 7828 * is enabled we'll handle it just fine, but there's 7829 * still no point in allowing a ring fd as it doesn't 7830 * support regular read/write anyway. 7831 */ 7832 if (file->f_op == &io_uring_fops) { 7833 fput(file); 7834 err = -EBADF; 7835 break; 7836 } 7837 data->tags[up->offset + done] = tag; 7838 io_fixed_file_set(file_slot, file); 7839 err = io_sqe_file_register(ctx, file, i); 7840 if (err) { 7841 file_slot->file_ptr = 0; 7842 fput(file); 7843 break; 7844 } 7845 } 7846 } 7847 7848 if (needs_switch) 7849 io_rsrc_node_switch(ctx, data); 7850 return done ? done : err; 7851} 7852 7853static struct io_wq_work *io_free_work(struct io_wq_work *work) 7854{ 7855 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 7856 7857 req = io_put_req_find_next(req); 7858 return req ? 
&req->work : NULL; 7859} 7860 7861static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, 7862 struct task_struct *task) 7863{ 7864 struct io_wq_hash *hash; 7865 struct io_wq_data data; 7866 unsigned int concurrency; 7867 7868 hash = ctx->hash_map; 7869 if (!hash) { 7870 hash = kzalloc(sizeof(*hash), GFP_KERNEL); 7871 if (!hash) 7872 return ERR_PTR(-ENOMEM); 7873 refcount_set(&hash->refs, 1); 7874 init_waitqueue_head(&hash->wait); 7875 ctx->hash_map = hash; 7876 } 7877 7878 data.hash = hash; 7879 data.task = task; 7880 data.free_work = io_free_work; 7881 data.do_work = io_wq_submit_work; 7882 7883 /* Do QD, or 4 * CPUS, whatever is smallest */ 7884 concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); 7885 7886 return io_wq_create(concurrency, &data); 7887} 7888 7889static int io_uring_alloc_task_context(struct task_struct *task, 7890 struct io_ring_ctx *ctx) 7891{ 7892 struct io_uring_task *tctx; 7893 int ret; 7894 7895 tctx = kmalloc(sizeof(*tctx), GFP_KERNEL); 7896 if (unlikely(!tctx)) 7897 return -ENOMEM; 7898 7899 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); 7900 if (unlikely(ret)) { 7901 kfree(tctx); 7902 return ret; 7903 } 7904 7905 tctx->io_wq = io_init_wq_offload(ctx, task); 7906 if (IS_ERR(tctx->io_wq)) { 7907 ret = PTR_ERR(tctx->io_wq); 7908 percpu_counter_destroy(&tctx->inflight); 7909 kfree(tctx); 7910 return ret; 7911 } 7912 7913 xa_init(&tctx->xa); 7914 init_waitqueue_head(&tctx->wait); 7915 tctx->last = NULL; 7916 atomic_set(&tctx->in_idle, 0); 7917 atomic_set(&tctx->inflight_tracked, 0); 7918 task->io_uring = tctx; 7919 spin_lock_init(&tctx->task_lock); 7920 INIT_WQ_LIST(&tctx->task_list); 7921 tctx->task_state = 0; 7922 init_task_work(&tctx->task_work, tctx_task_work); 7923 return 0; 7924} 7925 7926void __io_uring_free(struct task_struct *tsk) 7927{ 7928 struct io_uring_task *tctx = tsk->io_uring; 7929 7930 WARN_ON_ONCE(!xa_empty(&tctx->xa)); 7931 WARN_ON_ONCE(tctx->io_wq); 7932 7933 percpu_counter_destroy(&tctx->inflight); 7934 kfree(tctx); 7935 tsk->io_uring = NULL; 7936} 7937 7938static int io_sq_offload_create(struct io_ring_ctx *ctx, 7939 struct io_uring_params *p) 7940{ 7941 int ret; 7942 7943 /* Retain compatibility with failing for an invalid attach attempt */ 7944 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == 7945 IORING_SETUP_ATTACH_WQ) { 7946 struct fd f; 7947 7948 f = fdget(p->wq_fd); 7949 if (!f.file) 7950 return -ENXIO; 7951 fdput(f); 7952 if (f.file->f_op != &io_uring_fops) 7953 return -EINVAL; 7954 } 7955 if (ctx->flags & IORING_SETUP_SQPOLL) { 7956 struct task_struct *tsk; 7957 struct io_sq_data *sqd; 7958 bool attached; 7959 7960 sqd = io_get_sq_data(p, &attached); 7961 if (IS_ERR(sqd)) { 7962 ret = PTR_ERR(sqd); 7963 goto err; 7964 } 7965 7966 ctx->sq_creds = get_current_cred(); 7967 ctx->sq_data = sqd; 7968 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); 7969 if (!ctx->sq_thread_idle) 7970 ctx->sq_thread_idle = HZ; 7971 7972 io_sq_thread_park(sqd); 7973 list_add(&ctx->sqd_list, &sqd->ctx_list); 7974 io_sqd_update_thread_idle(sqd); 7975 /* don't attach to a dying SQPOLL thread, would be racy */ 7976 ret = (attached && !sqd->thread) ? 
-ENXIO : 0; 7977 io_sq_thread_unpark(sqd); 7978 7979 if (ret < 0) 7980 goto err; 7981 if (attached) 7982 return 0; 7983 7984 if (p->flags & IORING_SETUP_SQ_AFF) { 7985 int cpu = p->sq_thread_cpu; 7986 7987 ret = -EINVAL; 7988 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) 7989 goto err_sqpoll; 7990 sqd->sq_cpu = cpu; 7991 } else { 7992 sqd->sq_cpu = -1; 7993 } 7994 7995 sqd->task_pid = current->pid; 7996 sqd->task_tgid = current->tgid; 7997 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); 7998 if (IS_ERR(tsk)) { 7999 ret = PTR_ERR(tsk); 8000 goto err_sqpoll; 8001 } 8002 8003 sqd->thread = tsk; 8004 ret = io_uring_alloc_task_context(tsk, ctx); 8005 wake_up_new_task(tsk); 8006 if (ret) 8007 goto err; 8008 } else if (p->flags & IORING_SETUP_SQ_AFF) { 8009 /* Can't have SQ_AFF without SQPOLL */ 8010 ret = -EINVAL; 8011 goto err; 8012 } 8013 8014 return 0; 8015err_sqpoll: 8016 complete(&ctx->sq_data->exited); 8017err: 8018 io_sq_thread_finish(ctx); 8019 return ret; 8020} 8021 8022static inline void __io_unaccount_mem(struct user_struct *user, 8023 unsigned long nr_pages) 8024{ 8025 atomic_long_sub(nr_pages, &user->locked_vm); 8026} 8027 8028static inline int __io_account_mem(struct user_struct *user, 8029 unsigned long nr_pages) 8030{ 8031 unsigned long page_limit, cur_pages, new_pages; 8032 8033 /* Don't allow more pages than we can safely lock */ 8034 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 8035 8036 do { 8037 cur_pages = atomic_long_read(&user->locked_vm); 8038 new_pages = cur_pages + nr_pages; 8039 if (new_pages > page_limit) 8040 return -ENOMEM; 8041 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, 8042 new_pages) != cur_pages); 8043 8044 return 0; 8045} 8046 8047static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 8048{ 8049 if (ctx->user) 8050 __io_unaccount_mem(ctx->user, nr_pages); 8051 8052 if (ctx->mm_account) 8053 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); 8054} 8055 8056static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 8057{ 8058 int ret; 8059 8060 if (ctx->user) { 8061 ret = __io_account_mem(ctx->user, nr_pages); 8062 if (ret) 8063 return ret; 8064 } 8065 8066 if (ctx->mm_account) 8067 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); 8068 8069 return 0; 8070} 8071 8072static void io_mem_free(void *ptr) 8073{ 8074 struct page *page; 8075 8076 if (!ptr) 8077 return; 8078 8079 page = virt_to_head_page(ptr); 8080 if (put_page_testzero(page)) 8081 free_compound_page(page); 8082} 8083 8084static void *io_mem_alloc(size_t size) 8085{ 8086 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | 8087 __GFP_NORETRY | __GFP_ACCOUNT; 8088 8089 return (void *) __get_free_pages(gfp_flags, get_order(size)); 8090} 8091 8092static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, 8093 size_t *sq_offset) 8094{ 8095 struct io_rings *rings; 8096 size_t off, sq_array_size; 8097 8098 off = struct_size(rings, cqes, cq_entries); 8099 if (off == SIZE_MAX) 8100 return SIZE_MAX; 8101 8102#ifdef CONFIG_SMP 8103 off = ALIGN(off, SMP_CACHE_BYTES); 8104 if (off == 0) 8105 return SIZE_MAX; 8106#endif 8107 8108 if (sq_offset) 8109 *sq_offset = off; 8110 8111 sq_array_size = array_size(sizeof(u32), sq_entries); 8112 if (sq_array_size == SIZE_MAX) 8113 return SIZE_MAX; 8114 8115 if (check_add_overflow(off, sq_array_size, &off)) 8116 return SIZE_MAX; 8117 8118 return off; 8119} 8120 8121static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) 8122{ 8123 struct 
io_mapped_ubuf *imu = *slot; 8124 unsigned int i; 8125 8126 if (imu != ctx->dummy_ubuf) { 8127 for (i = 0; i < imu->nr_bvecs; i++) 8128 unpin_user_page(imu->bvec[i].bv_page); 8129 if (imu->acct_pages) 8130 io_unaccount_mem(ctx, imu->acct_pages); 8131 kvfree(imu); 8132 } 8133 *slot = NULL; 8134} 8135 8136static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 8137{ 8138 io_buffer_unmap(ctx, &prsrc->buf); 8139 prsrc->buf = NULL; 8140} 8141 8142static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 8143{ 8144 unsigned int i; 8145 8146 for (i = 0; i < ctx->nr_user_bufs; i++) 8147 io_buffer_unmap(ctx, &ctx->user_bufs[i]); 8148 kfree(ctx->user_bufs); 8149 io_rsrc_data_free(ctx->buf_data); 8150 ctx->user_bufs = NULL; 8151 ctx->buf_data = NULL; 8152 ctx->nr_user_bufs = 0; 8153} 8154 8155static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 8156{ 8157 int ret; 8158 8159 if (!ctx->buf_data) 8160 return -ENXIO; 8161 8162 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); 8163 if (!ret) 8164 __io_sqe_buffers_unregister(ctx); 8165 return ret; 8166} 8167 8168static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, 8169 void __user *arg, unsigned index) 8170{ 8171 struct iovec __user *src; 8172 8173#ifdef CONFIG_COMPAT 8174 if (ctx->compat) { 8175 struct compat_iovec __user *ciovs; 8176 struct compat_iovec ciov; 8177 8178 ciovs = (struct compat_iovec __user *) arg; 8179 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) 8180 return -EFAULT; 8181 8182 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); 8183 dst->iov_len = ciov.iov_len; 8184 return 0; 8185 } 8186#endif 8187 src = (struct iovec __user *) arg; 8188 if (copy_from_user(dst, &src[index], sizeof(*dst))) 8189 return -EFAULT; 8190 return 0; 8191} 8192 8193/* 8194 * Not super efficient, but this is just a registration time. And we do cache 8195 * the last compound head, so generally we'll only do a full search if we don't 8196 * match that one. 8197 * 8198 * We check if the given compound head page has already been accounted, to 8199 * avoid double accounting it. This allows us to account the full size of the 8200 * page, not just the constituent pages of a huge page. 
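 * For example, when several registered buffers pin pages belonging to the
 * same huge page, the head page is charged once at its full size instead
 * of once per buffer.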
8201 */ 8202static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, 8203 int nr_pages, struct page *hpage) 8204{ 8205 int i, j; 8206 8207 /* check current page array */ 8208 for (i = 0; i < nr_pages; i++) { 8209 if (!PageCompound(pages[i])) 8210 continue; 8211 if (compound_head(pages[i]) == hpage) 8212 return true; 8213 } 8214 8215 /* check previously registered pages */ 8216 for (i = 0; i < ctx->nr_user_bufs; i++) { 8217 struct io_mapped_ubuf *imu = ctx->user_bufs[i]; 8218 8219 for (j = 0; j < imu->nr_bvecs; j++) { 8220 if (!PageCompound(imu->bvec[j].bv_page)) 8221 continue; 8222 if (compound_head(imu->bvec[j].bv_page) == hpage) 8223 return true; 8224 } 8225 } 8226 8227 return false; 8228} 8229 8230static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, 8231 int nr_pages, struct io_mapped_ubuf *imu, 8232 struct page **last_hpage) 8233{ 8234 int i, ret; 8235 8236 imu->acct_pages = 0; 8237 for (i = 0; i < nr_pages; i++) { 8238 if (!PageCompound(pages[i])) { 8239 imu->acct_pages++; 8240 } else { 8241 struct page *hpage; 8242 8243 hpage = compound_head(pages[i]); 8244 if (hpage == *last_hpage) 8245 continue; 8246 *last_hpage = hpage; 8247 if (headpage_already_acct(ctx, pages, i, hpage)) 8248 continue; 8249 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; 8250 } 8251 } 8252 8253 if (!imu->acct_pages) 8254 return 0; 8255 8256 ret = io_account_mem(ctx, imu->acct_pages); 8257 if (ret) 8258 imu->acct_pages = 0; 8259 return ret; 8260} 8261 8262static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, 8263 struct io_mapped_ubuf **pimu, 8264 struct page **last_hpage) 8265{ 8266 struct io_mapped_ubuf *imu = NULL; 8267 struct vm_area_struct **vmas = NULL; 8268 struct page **pages = NULL; 8269 unsigned long off, start, end, ubuf; 8270 size_t size; 8271 int ret, pret, nr_pages, i; 8272 8273 if (!iov->iov_base) { 8274 *pimu = ctx->dummy_ubuf; 8275 return 0; 8276 } 8277 8278 ubuf = (unsigned long) iov->iov_base; 8279 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; 8280 start = ubuf >> PAGE_SHIFT; 8281 nr_pages = end - start; 8282 8283 *pimu = NULL; 8284 ret = -ENOMEM; 8285 8286 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); 8287 if (!pages) 8288 goto done; 8289 8290 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), 8291 GFP_KERNEL); 8292 if (!vmas) 8293 goto done; 8294 8295 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); 8296 if (!imu) 8297 goto done; 8298 8299 ret = 0; 8300 mmap_read_lock(current->mm); 8301 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, 8302 pages, vmas); 8303 if (pret == nr_pages) { 8304 /* don't support file backed memory */ 8305 for (i = 0; i < nr_pages; i++) { 8306 struct vm_area_struct *vma = vmas[i]; 8307 8308 if (vma->vm_file && 8309 !is_file_hugepages(vma->vm_file)) { 8310 ret = -EOPNOTSUPP; 8311 break; 8312 } 8313 } 8314 } else { 8315 ret = pret < 0 ? 
pret : -EFAULT; 8316 } 8317 mmap_read_unlock(current->mm); 8318 if (ret) { 8319 /* 8320 * if we did partial map, or found file backed vmas, 8321 * release any pages we did get 8322 */ 8323 if (pret > 0) 8324 unpin_user_pages(pages, pret); 8325 goto done; 8326 } 8327 8328 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); 8329 if (ret) { 8330 unpin_user_pages(pages, pret); 8331 goto done; 8332 } 8333 8334 off = ubuf & ~PAGE_MASK; 8335 size = iov->iov_len; 8336 for (i = 0; i < nr_pages; i++) { 8337 size_t vec_len; 8338 8339 vec_len = min_t(size_t, size, PAGE_SIZE - off); 8340 imu->bvec[i].bv_page = pages[i]; 8341 imu->bvec[i].bv_len = vec_len; 8342 imu->bvec[i].bv_offset = off; 8343 off = 0; 8344 size -= vec_len; 8345 } 8346 /* store original address for later verification */ 8347 imu->ubuf = ubuf; 8348 imu->ubuf_end = ubuf + iov->iov_len; 8349 imu->nr_bvecs = nr_pages; 8350 *pimu = imu; 8351 ret = 0; 8352done: 8353 if (ret) 8354 kvfree(imu); 8355 kvfree(pages); 8356 kvfree(vmas); 8357 return ret; 8358} 8359 8360static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) 8361{ 8362 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); 8363 return ctx->user_bufs ? 0 : -ENOMEM; 8364} 8365 8366static int io_buffer_validate(struct iovec *iov) 8367{ 8368 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); 8369 8370 /* 8371 * Don't impose further limits on the size and buffer 8372 * constraints here, we'll -EINVAL later when IO is 8373 * submitted if they are wrong. 8374 */ 8375 if (!iov->iov_base) 8376 return iov->iov_len ? -EFAULT : 0; 8377 if (!iov->iov_len) 8378 return -EFAULT; 8379 8380 /* arbitrary limit, but we need something */ 8381 if (iov->iov_len > SZ_1G) 8382 return -EFAULT; 8383 8384 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) 8385 return -EOVERFLOW; 8386 8387 return 0; 8388} 8389 8390static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, 8391 unsigned int nr_args, u64 __user *tags) 8392{ 8393 struct page *last_hpage = NULL; 8394 struct io_rsrc_data *data; 8395 int i, ret; 8396 struct iovec iov; 8397 8398 if (ctx->user_bufs) 8399 return -EBUSY; 8400 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) 8401 return -EINVAL; 8402 ret = io_rsrc_node_switch_start(ctx); 8403 if (ret) 8404 return ret; 8405 data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args); 8406 if (!data) 8407 return -ENOMEM; 8408 ret = io_buffers_map_alloc(ctx, nr_args); 8409 if (ret) { 8410 io_rsrc_data_free(data); 8411 return ret; 8412 } 8413 8414 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { 8415 u64 tag = 0; 8416 8417 if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) { 8418 ret = -EFAULT; 8419 break; 8420 } 8421 ret = io_copy_iov(ctx, &iov, arg, i); 8422 if (ret) 8423 break; 8424 ret = io_buffer_validate(&iov); 8425 if (ret) 8426 break; 8427 if (!iov.iov_base && tag) { 8428 ret = -EINVAL; 8429 break; 8430 } 8431 8432 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], 8433 &last_hpage); 8434 if (ret) 8435 break; 8436 data->tags[i] = tag; 8437 } 8438 8439 WARN_ON_ONCE(ctx->buf_data); 8440 8441 ctx->buf_data = data; 8442 if (ret) 8443 __io_sqe_buffers_unregister(ctx); 8444 else 8445 io_rsrc_node_switch(ctx, NULL); 8446 return ret; 8447} 8448 8449static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, 8450 struct io_uring_rsrc_update2 *up, 8451 unsigned int nr_args) 8452{ 8453 u64 __user *tags = u64_to_user_ptr(up->tags); 8454 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); 
8455 struct page *last_hpage = NULL; 8456 bool needs_switch = false; 8457 __u32 done; 8458 int i, err; 8459 8460 if (!ctx->buf_data) 8461 return -ENXIO; 8462 if (up->offset + nr_args > ctx->nr_user_bufs) 8463 return -EINVAL; 8464 8465 for (done = 0; done < nr_args; done++) { 8466 struct io_mapped_ubuf *imu; 8467 int offset = up->offset + done; 8468 u64 tag = 0; 8469 8470 err = io_copy_iov(ctx, &iov, iovs, done); 8471 if (err) 8472 break; 8473 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { 8474 err = -EFAULT; 8475 break; 8476 } 8477 err = io_buffer_validate(&iov); 8478 if (err) 8479 break; 8480 if (!iov.iov_base && tag) { 8481 err = -EINVAL; 8482 break; 8483 } 8484 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); 8485 if (err) 8486 break; 8487 8488 i = array_index_nospec(offset, ctx->nr_user_bufs); 8489 if (ctx->user_bufs[i] != ctx->dummy_ubuf) { 8490 err = io_queue_rsrc_removal(ctx->buf_data, offset, 8491 ctx->rsrc_node, ctx->user_bufs[i]); 8492 if (unlikely(err)) { 8493 io_buffer_unmap(ctx, &imu); 8494 break; 8495 } 8496 ctx->user_bufs[i] = NULL; 8497 needs_switch = true; 8498 } 8499 8500 ctx->user_bufs[i] = imu; 8501 ctx->buf_data->tags[offset] = tag; 8502 } 8503 8504 if (needs_switch) 8505 io_rsrc_node_switch(ctx, ctx->buf_data); 8506 return done ? done : err; 8507} 8508 8509static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) 8510{ 8511 __s32 __user *fds = arg; 8512 int fd; 8513 8514 if (ctx->cq_ev_fd) 8515 return -EBUSY; 8516 8517 if (copy_from_user(&fd, fds, sizeof(*fds))) 8518 return -EFAULT; 8519 8520 ctx->cq_ev_fd = eventfd_ctx_fdget(fd); 8521 if (IS_ERR(ctx->cq_ev_fd)) { 8522 int ret = PTR_ERR(ctx->cq_ev_fd); 8523 ctx->cq_ev_fd = NULL; 8524 return ret; 8525 } 8526 8527 return 0; 8528} 8529 8530static int io_eventfd_unregister(struct io_ring_ctx *ctx) 8531{ 8532 if (ctx->cq_ev_fd) { 8533 eventfd_ctx_put(ctx->cq_ev_fd); 8534 ctx->cq_ev_fd = NULL; 8535 return 0; 8536 } 8537 8538 return -ENXIO; 8539} 8540 8541static void io_destroy_buffers(struct io_ring_ctx *ctx) 8542{ 8543 struct io_buffer *buf; 8544 unsigned long index; 8545 8546 xa_for_each(&ctx->io_buffers, index, buf) 8547 __io_remove_buffers(ctx, buf, index, -1U); 8548} 8549 8550static void io_req_cache_free(struct list_head *list, struct task_struct *tsk) 8551{ 8552 struct io_kiocb *req, *nxt; 8553 8554 list_for_each_entry_safe(req, nxt, list, compl.list) { 8555 if (tsk && req->task != tsk) 8556 continue; 8557 list_del(&req->compl.list); 8558 kmem_cache_free(req_cachep, req); 8559 } 8560} 8561 8562static void io_req_caches_free(struct io_ring_ctx *ctx) 8563{ 8564 struct io_submit_state *submit_state = &ctx->submit_state; 8565 struct io_comp_state *cs = &ctx->submit_state.comp; 8566 8567 mutex_lock(&ctx->uring_lock); 8568 8569 if (submit_state->free_reqs) { 8570 kmem_cache_free_bulk(req_cachep, submit_state->free_reqs, 8571 submit_state->reqs); 8572 submit_state->free_reqs = 0; 8573 } 8574 8575 io_flush_cached_locked_reqs(ctx, cs); 8576 io_req_cache_free(&cs->free_list, NULL); 8577 mutex_unlock(&ctx->uring_lock); 8578} 8579 8580static bool io_wait_rsrc_data(struct io_rsrc_data *data) 8581{ 8582 if (!data) 8583 return false; 8584 if (!atomic_dec_and_test(&data->refs)) 8585 wait_for_completion(&data->done); 8586 return true; 8587} 8588 8589static void io_ring_ctx_free(struct io_ring_ctx *ctx) 8590{ 8591 io_sq_thread_finish(ctx); 8592 8593 if (ctx->mm_account) { 8594 mmdrop(ctx->mm_account); 8595 ctx->mm_account = NULL; 8596 } 8597 8598 mutex_lock(&ctx->uring_lock); 8599 if 
(io_wait_rsrc_data(ctx->buf_data))
		__io_sqe_buffers_unregister(ctx);
	if (io_wait_rsrc_data(ctx->file_data))
		__io_sqe_files_unregister(ctx);
	if (ctx->rings)
		__io_cqring_overflow_flush(ctx, true);
	mutex_unlock(&ctx->uring_lock);
	io_eventfd_unregister(ctx);
	io_destroy_buffers(ctx);
	if (ctx->sq_creds)
		put_cred(ctx->sq_creds);

	/* there are no registered resources left, nobody uses it */
	if (ctx->rsrc_node)
		io_rsrc_node_destroy(ctx->rsrc_node);
	if (ctx->rsrc_backup_node)
		io_rsrc_node_destroy(ctx->rsrc_backup_node);
	flush_delayed_work(&ctx->rsrc_put_work);

	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
	WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
		sock_release(ctx->ring_sock);
	}
#endif

	io_mem_free(ctx->rings);
	io_mem_free(ctx->sq_sqes);

	percpu_ref_exit(&ctx->refs);
	free_uid(ctx->user);
	io_req_caches_free(ctx);
	if (ctx->hash_map)
		io_wq_put_hash(ctx->hash_map);
	kfree(ctx->cancel_hash);
	kfree(ctx->dummy_ubuf);
	kfree(ctx);
}

static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
	struct io_ring_ctx *ctx = file->private_data;
	__poll_t mask = 0;

	poll_wait(file, &ctx->cq_wait, wait);
	/*
	 * synchronizes with barrier from wq_has_sleeper call in
	 * io_commit_cqring
	 */
	smp_rmb();
	if (!io_sqring_full(ctx))
		mask |= EPOLLOUT | EPOLLWRNORM;

	/*
	 * Don't flush cqring overflow list here, just do a simple check.
	 * Otherwise there could possibly be an ABBA deadlock:
	 *      CPU0                    CPU1
	 *      ----                    ----
	 * lock(&ctx->uring_lock);
	 *                              lock(&ep->mtx);
	 *                              lock(&ctx->uring_lock);
	 * lock(&ep->mtx);
	 *
	 * Users may get EPOLLIN meanwhile seeing nothing in the cqring, which
	 * pushes them to do the flush.
	 */
	if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}

static int io_uring_fasync(int fd, struct file *file, int on)
{
	struct io_ring_ctx *ctx = file->private_data;

	return fasync_helper(fd, file, on, &ctx->cq_fasync);
}

static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

static inline bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
{
	return io_run_task_work_head(&ctx->exit_task_work);
}

struct io_tctx_exit {
	struct callback_head		task_work;
	struct completion		completion;
	struct io_ring_ctx		*ctx;
};

static void io_tctx_exit_cb(struct callback_head *cb)
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_exit *work;

	work = container_of(cb, struct io_tctx_exit, task_work);
	/*
	 * When @in_idle, we're in cancellation and it's racy to remove the
	 * node. It'll be removed by the end of cancellation, just ignore it.
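	 * (Normal task exit tears the node down via io_uring_clean_tctx() once
	 * cancellation completes, so skipping it here does not leak the node.)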
	 */
	if (!atomic_read(&tctx->in_idle))
		io_uring_del_task_file((unsigned long)work->ctx);
	complete(&work->completion);
}

static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	return req->ctx == data;
}

static void io_ring_exit_work(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
	unsigned long timeout = jiffies + HZ * 60 * 5;
	struct io_tctx_exit exit;
	struct io_tctx_node *node;
	int ret;

	/*
	 * If we're doing polled IO and end up having requests being
	 * submitted async (out-of-line), then completions can come in while
	 * we're waiting for refs to drop. We need to reap these manually,
	 * as nobody else will be looking for them.
	 */
	do {
		io_uring_try_cancel_requests(ctx, NULL, NULL);
		if (ctx->sq_data) {
			struct io_sq_data *sqd = ctx->sq_data;
			struct task_struct *tsk;

			io_sq_thread_park(sqd);
			tsk = sqd->thread;
			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
				io_wq_cancel_cb(tsk->io_uring->io_wq,
						io_cancel_ctx_cb, ctx, true);
			io_sq_thread_unpark(sqd);
		}

		WARN_ON_ONCE(time_after(jiffies, timeout));
	} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));

	init_completion(&exit.completion);
	init_task_work(&exit.task_work, io_tctx_exit_cb);
	exit.ctx = ctx;
	/*
	 * Some may use the context even when all refs and requests have been
	 * put, and they are free to do so while still holding uring_lock or
	 * completion_lock, see __io_req_task_submit(). Apart from other work,
	 * this lock/unlock section also waits for them to finish.
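	 * The completion_lock acquire/release further down serves the same
	 * purpose for anyone still holding completion_lock.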
8766 */ 8767 mutex_lock(&ctx->uring_lock); 8768 while (!list_empty(&ctx->tctx_list)) { 8769 WARN_ON_ONCE(time_after(jiffies, timeout)); 8770 8771 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, 8772 ctx_node); 8773 /* don't spin on a single task if cancellation failed */ 8774 list_rotate_left(&ctx->tctx_list); 8775 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); 8776 if (WARN_ON_ONCE(ret)) 8777 continue; 8778 wake_up_process(node->task); 8779 8780 mutex_unlock(&ctx->uring_lock); 8781 wait_for_completion(&exit.completion); 8782 mutex_lock(&ctx->uring_lock); 8783 } 8784 mutex_unlock(&ctx->uring_lock); 8785 spin_lock_irq(&ctx->completion_lock); 8786 spin_unlock_irq(&ctx->completion_lock); 8787 8788 io_ring_ctx_free(ctx); 8789} 8790 8791/* Returns true if we found and killed one or more timeouts */ 8792static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, 8793 struct files_struct *files) 8794{ 8795 struct io_kiocb *req, *tmp; 8796 int canceled = 0; 8797 8798 spin_lock_irq(&ctx->completion_lock); 8799 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { 8800 if (io_match_task(req, tsk, files)) { 8801 io_kill_timeout(req, -ECANCELED); 8802 canceled++; 8803 } 8804 } 8805 if (canceled != 0) 8806 io_commit_cqring(ctx); 8807 spin_unlock_irq(&ctx->completion_lock); 8808 if (canceled != 0) 8809 io_cqring_ev_posted(ctx); 8810 return canceled != 0; 8811} 8812 8813static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) 8814{ 8815 unsigned long index; 8816 struct creds *creds; 8817 8818 mutex_lock(&ctx->uring_lock); 8819 percpu_ref_kill(&ctx->refs); 8820 if (ctx->rings) 8821 __io_cqring_overflow_flush(ctx, true); 8822 xa_for_each(&ctx->personalities, index, creds) 8823 io_unregister_personality(ctx, index); 8824 mutex_unlock(&ctx->uring_lock); 8825 8826 io_kill_timeouts(ctx, NULL, NULL); 8827 io_poll_remove_all(ctx, NULL, NULL); 8828 8829 /* if we failed setting up the ctx, we might not have any rings */ 8830 io_iopoll_try_reap_events(ctx); 8831 8832 INIT_WORK(&ctx->exit_work, io_ring_exit_work); 8833 /* 8834 * Use system_unbound_wq to avoid spawning tons of event kworkers 8835 * if we're exiting a ton of rings at the same time. It just adds 8836 * noise and overhead, there's no discernable change in runtime 8837 * over using system_wq. 
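	 * (io_ring_exit_work() may also run for a long time; it only warns
	 * after five minutes of waiting, so an unbound worker is a natural
	 * fit for it.)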
8838 */ 8839 queue_work(system_unbound_wq, &ctx->exit_work); 8840} 8841 8842static int io_uring_release(struct inode *inode, struct file *file) 8843{ 8844 struct io_ring_ctx *ctx = file->private_data; 8845 8846 file->private_data = NULL; 8847 io_ring_ctx_wait_and_kill(ctx); 8848 return 0; 8849} 8850 8851struct io_task_cancel { 8852 struct task_struct *task; 8853 struct files_struct *files; 8854}; 8855 8856static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 8857{ 8858 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 8859 struct io_task_cancel *cancel = data; 8860 bool ret; 8861 8862 if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) { 8863 unsigned long flags; 8864 struct io_ring_ctx *ctx = req->ctx; 8865 8866 /* protect against races with linked timeouts */ 8867 spin_lock_irqsave(&ctx->completion_lock, flags); 8868 ret = io_match_task(req, cancel->task, cancel->files); 8869 spin_unlock_irqrestore(&ctx->completion_lock, flags); 8870 } else { 8871 ret = io_match_task(req, cancel->task, cancel->files); 8872 } 8873 return ret; 8874} 8875 8876static bool io_cancel_defer_files(struct io_ring_ctx *ctx, 8877 struct task_struct *task, 8878 struct files_struct *files) 8879{ 8880 struct io_defer_entry *de; 8881 LIST_HEAD(list); 8882 8883 spin_lock_irq(&ctx->completion_lock); 8884 list_for_each_entry_reverse(de, &ctx->defer_list, list) { 8885 if (io_match_task(de->req, task, files)) { 8886 list_cut_position(&list, &ctx->defer_list, &de->list); 8887 break; 8888 } 8889 } 8890 spin_unlock_irq(&ctx->completion_lock); 8891 if (list_empty(&list)) 8892 return false; 8893 8894 while (!list_empty(&list)) { 8895 de = list_first_entry(&list, struct io_defer_entry, list); 8896 list_del_init(&de->list); 8897 io_req_complete_failed(de->req, -ECANCELED); 8898 kfree(de); 8899 } 8900 return true; 8901} 8902 8903static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) 8904{ 8905 struct io_tctx_node *node; 8906 enum io_wq_cancel cret; 8907 bool ret = false; 8908 8909 mutex_lock(&ctx->uring_lock); 8910 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 8911 struct io_uring_task *tctx = node->task->io_uring; 8912 8913 /* 8914 * io_wq will stay alive while we hold uring_lock, because it's 8915 * killed after ctx nodes, which requires to take the lock. 8916 */ 8917 if (!tctx || !tctx->io_wq) 8918 continue; 8919 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); 8920 ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 8921 } 8922 mutex_unlock(&ctx->uring_lock); 8923 8924 return ret; 8925} 8926 8927static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 8928 struct task_struct *task, 8929 struct files_struct *files) 8930{ 8931 struct io_task_cancel cancel = { .task = task, .files = files, }; 8932 struct io_uring_task *tctx = task ? task->io_uring : NULL; 8933 8934 while (1) { 8935 enum io_wq_cancel cret; 8936 bool ret = false; 8937 8938 if (!task) { 8939 ret |= io_uring_try_cancel_iowq(ctx); 8940 } else if (tctx && tctx->io_wq) { 8941 /* 8942 * Cancels requests of all rings, not only @ctx, but 8943 * it's fine as the task is in exit/exec. 
8944 */ 8945 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, 8946 &cancel, true); 8947 ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 8948 } 8949 8950 /* SQPOLL thread does its own polling */ 8951 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && !files) || 8952 (ctx->sq_data && ctx->sq_data->thread == current)) { 8953 while (!list_empty_careful(&ctx->iopoll_list)) { 8954 io_iopoll_try_reap_events(ctx); 8955 ret = true; 8956 } 8957 } 8958 8959 ret |= io_cancel_defer_files(ctx, task, files); 8960 ret |= io_poll_remove_all(ctx, task, files); 8961 ret |= io_kill_timeouts(ctx, task, files); 8962 ret |= io_run_task_work(); 8963 ret |= io_run_ctx_fallback(ctx); 8964 if (!ret) 8965 break; 8966 cond_resched(); 8967 } 8968} 8969 8970static int __io_uring_add_task_file(struct io_ring_ctx *ctx) 8971{ 8972 struct io_uring_task *tctx = current->io_uring; 8973 struct io_tctx_node *node; 8974 int ret; 8975 8976 if (unlikely(!tctx)) { 8977 ret = io_uring_alloc_task_context(current, ctx); 8978 if (unlikely(ret)) 8979 return ret; 8980 tctx = current->io_uring; 8981 } 8982 if (!xa_load(&tctx->xa, (unsigned long)ctx)) { 8983 node = kmalloc(sizeof(*node), GFP_KERNEL); 8984 if (!node) 8985 return -ENOMEM; 8986 node->ctx = ctx; 8987 node->task = current; 8988 8989 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, 8990 node, GFP_KERNEL)); 8991 if (ret) { 8992 kfree(node); 8993 return ret; 8994 } 8995 8996 mutex_lock(&ctx->uring_lock); 8997 list_add(&node->ctx_node, &ctx->tctx_list); 8998 mutex_unlock(&ctx->uring_lock); 8999 } 9000 tctx->last = ctx; 9001 return 0; 9002} 9003 9004/* 9005 * Note that this task has used io_uring. We use it for cancelation purposes. 9006 */ 9007static inline int io_uring_add_task_file(struct io_ring_ctx *ctx) 9008{ 9009 struct io_uring_task *tctx = current->io_uring; 9010 9011 if (likely(tctx && tctx->last == ctx)) 9012 return 0; 9013 return __io_uring_add_task_file(ctx); 9014} 9015 9016/* 9017 * Remove this io_uring_file -> task mapping. 9018 */ 9019static void io_uring_del_task_file(unsigned long index) 9020{ 9021 struct io_uring_task *tctx = current->io_uring; 9022 struct io_tctx_node *node; 9023 9024 if (!tctx) 9025 return; 9026 node = xa_erase(&tctx->xa, index); 9027 if (!node) 9028 return; 9029 9030 WARN_ON_ONCE(current != node->task); 9031 WARN_ON_ONCE(list_empty(&node->ctx_node)); 9032 9033 mutex_lock(&node->ctx->uring_lock); 9034 list_del(&node->ctx_node); 9035 mutex_unlock(&node->ctx->uring_lock); 9036 9037 if (tctx->last == node->ctx) 9038 tctx->last = NULL; 9039 kfree(node); 9040} 9041 9042static void io_uring_clean_tctx(struct io_uring_task *tctx) 9043{ 9044 struct io_wq *wq = tctx->io_wq; 9045 struct io_tctx_node *node; 9046 unsigned long index; 9047 9048 xa_for_each(&tctx->xa, index, node) 9049 io_uring_del_task_file(index); 9050 if (wq) { 9051 /* 9052 * Must be after io_uring_del_task_file() (removes nodes under 9053 * uring_lock) to avoid race with io_uring_try_cancel_iowq(). 
9054 */ 9055 tctx->io_wq = NULL; 9056 io_wq_put_and_exit(wq); 9057 } 9058} 9059 9060static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) 9061{ 9062 if (tracked) 9063 return atomic_read(&tctx->inflight_tracked); 9064 return percpu_counter_sum(&tctx->inflight); 9065} 9066 9067static void io_uring_try_cancel(struct files_struct *files) 9068{ 9069 struct io_uring_task *tctx = current->io_uring; 9070 struct io_tctx_node *node; 9071 unsigned long index; 9072 9073 xa_for_each(&tctx->xa, index, node) { 9074 struct io_ring_ctx *ctx = node->ctx; 9075 9076 /* sqpoll task will cancel all its requests */ 9077 if (!ctx->sq_data) 9078 io_uring_try_cancel_requests(ctx, current, files); 9079 } 9080} 9081 9082/* should only be called by SQPOLL task */ 9083static void io_uring_cancel_sqpoll(struct io_sq_data *sqd) 9084{ 9085 struct io_uring_task *tctx = current->io_uring; 9086 struct io_ring_ctx *ctx; 9087 s64 inflight; 9088 DEFINE_WAIT(wait); 9089 9090 if (!current->io_uring) 9091 return; 9092 if (tctx->io_wq) 9093 io_wq_exit_start(tctx->io_wq); 9094 9095 WARN_ON_ONCE(!sqd || sqd->thread != current); 9096 9097 atomic_inc(&tctx->in_idle); 9098 do { 9099 /* read completions before cancelations */ 9100 inflight = tctx_inflight(tctx, false); 9101 if (!inflight) 9102 break; 9103 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 9104 io_uring_try_cancel_requests(ctx, current, NULL); 9105 9106 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); 9107 /* 9108 * If we've seen completions, retry without waiting. This 9109 * avoids a race where a completion comes in before we did 9110 * prepare_to_wait(). 9111 */ 9112 if (inflight == tctx_inflight(tctx, false)) 9113 schedule(); 9114 finish_wait(&tctx->wait, &wait); 9115 } while (1); 9116 atomic_dec(&tctx->in_idle); 9117} 9118 9119/* 9120 * Find any io_uring fd that this task has registered or done IO on, and cancel 9121 * requests. 9122 */ 9123void __io_uring_cancel(struct files_struct *files) 9124{ 9125 struct io_uring_task *tctx = current->io_uring; 9126 DEFINE_WAIT(wait); 9127 s64 inflight; 9128 9129 if (tctx->io_wq) 9130 io_wq_exit_start(tctx->io_wq); 9131 9132 /* make sure overflow events are dropped */ 9133 atomic_inc(&tctx->in_idle); 9134 do { 9135 /* read completions before cancelations */ 9136 inflight = tctx_inflight(tctx, !!files); 9137 if (!inflight) 9138 break; 9139 io_uring_try_cancel(files); 9140 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); 9141 9142 /* 9143 * If we've seen completions, retry without waiting. This 9144 * avoids a race where a completion comes in before we did 9145 * prepare_to_wait(). 
9146 */ 9147 if (inflight == tctx_inflight(tctx, !!files)) 9148 schedule(); 9149 finish_wait(&tctx->wait, &wait); 9150 } while (1); 9151 atomic_dec(&tctx->in_idle); 9152 9153 io_uring_clean_tctx(tctx); 9154 if (!files) { 9155 /* for exec all current's requests should be gone, kill tctx */ 9156 __io_uring_free(current); 9157 } 9158} 9159 9160static void *io_uring_validate_mmap_request(struct file *file, 9161 loff_t pgoff, size_t sz) 9162{ 9163 struct io_ring_ctx *ctx = file->private_data; 9164 loff_t offset = pgoff << PAGE_SHIFT; 9165 struct page *page; 9166 void *ptr; 9167 9168 switch (offset) { 9169 case IORING_OFF_SQ_RING: 9170 case IORING_OFF_CQ_RING: 9171 ptr = ctx->rings; 9172 break; 9173 case IORING_OFF_SQES: 9174 ptr = ctx->sq_sqes; 9175 break; 9176 default: 9177 return ERR_PTR(-EINVAL); 9178 } 9179 9180 page = virt_to_head_page(ptr); 9181 if (sz > page_size(page)) 9182 return ERR_PTR(-EINVAL); 9183 9184 return ptr; 9185} 9186 9187#ifdef CONFIG_MMU 9188 9189static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 9190{ 9191 size_t sz = vma->vm_end - vma->vm_start; 9192 unsigned long pfn; 9193 void *ptr; 9194 9195 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); 9196 if (IS_ERR(ptr)) 9197 return PTR_ERR(ptr); 9198 9199 pfn = virt_to_phys(ptr) >> PAGE_SHIFT; 9200 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); 9201} 9202 9203#else /* !CONFIG_MMU */ 9204 9205static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 9206{ 9207 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL; 9208} 9209 9210static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) 9211{ 9212 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; 9213} 9214 9215static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, 9216 unsigned long addr, unsigned long len, 9217 unsigned long pgoff, unsigned long flags) 9218{ 9219 void *ptr; 9220 9221 ptr = io_uring_validate_mmap_request(file, pgoff, len); 9222 if (IS_ERR(ptr)) 9223 return PTR_ERR(ptr); 9224 9225 return (unsigned long) ptr; 9226} 9227 9228#endif /* !CONFIG_MMU */ 9229 9230static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) 9231{ 9232 DEFINE_WAIT(wait); 9233 9234 do { 9235 if (!io_sqring_full(ctx)) 9236 break; 9237 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); 9238 9239 if (!io_sqring_full(ctx)) 9240 break; 9241 schedule(); 9242 } while (!signal_pending(current)); 9243 9244 finish_wait(&ctx->sqo_sq_wait, &wait); 9245 return 0; 9246} 9247 9248static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, 9249 struct __kernel_timespec __user **ts, 9250 const sigset_t __user **sig) 9251{ 9252 struct io_uring_getevents_arg arg; 9253 9254 /* 9255 * If EXT_ARG isn't set, then we have no timespec and the argp pointer 9256 * is just a pointer to the sigset_t. 9257 */ 9258 if (!(flags & IORING_ENTER_EXT_ARG)) { 9259 *sig = (const sigset_t __user *) argp; 9260 *ts = NULL; 9261 return 0; 9262 } 9263 9264 /* 9265 * EXT_ARG is set - ensure we agree on the size of it and copy in our 9266 * timespec and sigset_t pointers if good. 
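	 * struct io_uring_getevents_arg carries the sigmask pointer and size
	 * plus an optional timespec; *argsz is rewritten to the sigmask size
	 * so the wait path sees the same value as in the non-EXT_ARG case.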
9267 */ 9268 if (*argsz != sizeof(arg)) 9269 return -EINVAL; 9270 if (copy_from_user(&arg, argp, sizeof(arg))) 9271 return -EFAULT; 9272 *sig = u64_to_user_ptr(arg.sigmask); 9273 *argsz = arg.sigmask_sz; 9274 *ts = u64_to_user_ptr(arg.ts); 9275 return 0; 9276} 9277 9278SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, 9279 u32, min_complete, u32, flags, const void __user *, argp, 9280 size_t, argsz) 9281{ 9282 struct io_ring_ctx *ctx; 9283 int submitted = 0; 9284 struct fd f; 9285 long ret; 9286 9287 io_run_task_work(); 9288 9289 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 9290 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) 9291 return -EINVAL; 9292 9293 f = fdget(fd); 9294 if (unlikely(!f.file)) 9295 return -EBADF; 9296 9297 ret = -EOPNOTSUPP; 9298 if (unlikely(f.file->f_op != &io_uring_fops)) 9299 goto out_fput; 9300 9301 ret = -ENXIO; 9302 ctx = f.file->private_data; 9303 if (unlikely(!percpu_ref_tryget(&ctx->refs))) 9304 goto out_fput; 9305 9306 ret = -EBADFD; 9307 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) 9308 goto out; 9309 9310 /* 9311 * For SQ polling, the thread will do all submissions and completions. 9312 * Just return the requested submit count, and wake the thread if 9313 * we were asked to. 9314 */ 9315 ret = 0; 9316 if (ctx->flags & IORING_SETUP_SQPOLL) { 9317 io_cqring_overflow_flush(ctx, false); 9318 9319 ret = -EOWNERDEAD; 9320 if (unlikely(ctx->sq_data->thread == NULL)) { 9321 goto out; 9322 } 9323 if (flags & IORING_ENTER_SQ_WAKEUP) 9324 wake_up(&ctx->sq_data->wait); 9325 if (flags & IORING_ENTER_SQ_WAIT) { 9326 ret = io_sqpoll_wait_sq(ctx); 9327 if (ret) 9328 goto out; 9329 } 9330 submitted = to_submit; 9331 } else if (to_submit) { 9332 ret = io_uring_add_task_file(ctx); 9333 if (unlikely(ret)) 9334 goto out; 9335 mutex_lock(&ctx->uring_lock); 9336 submitted = io_submit_sqes(ctx, to_submit); 9337 mutex_unlock(&ctx->uring_lock); 9338 9339 if (submitted != to_submit) 9340 goto out; 9341 } 9342 if (flags & IORING_ENTER_GETEVENTS) { 9343 const sigset_t __user *sig; 9344 struct __kernel_timespec __user *ts; 9345 9346 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 9347 if (unlikely(ret)) 9348 goto out; 9349 9350 min_complete = min(min_complete, ctx->cq_entries); 9351 9352 /* 9353 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user 9354 * space applications don't need to do io completion events 9355 * polling again, they can rely on io_sq_thread to do polling 9356 * work, which can reduce cpu usage and uring_lock contention. 9357 */ 9358 if (ctx->flags & IORING_SETUP_IOPOLL && 9359 !(ctx->flags & IORING_SETUP_SQPOLL)) { 9360 ret = io_iopoll_check(ctx, min_complete); 9361 } else { 9362 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); 9363 } 9364 } 9365 9366out: 9367 percpu_ref_put(&ctx->refs); 9368out_fput: 9369 fdput(f); 9370 return submitted ? 
submitted : ret; 9371} 9372 9373#ifdef CONFIG_PROC_FS 9374static int io_uring_show_cred(struct seq_file *m, unsigned int id, 9375 const struct cred *cred) 9376{ 9377 struct user_namespace *uns = seq_user_ns(m); 9378 struct group_info *gi; 9379 kernel_cap_t cap; 9380 unsigned __capi; 9381 int g; 9382 9383 seq_printf(m, "%5d\n", id); 9384 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); 9385 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); 9386 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); 9387 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); 9388 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); 9389 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); 9390 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); 9391 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); 9392 seq_puts(m, "\n\tGroups:\t"); 9393 gi = cred->group_info; 9394 for (g = 0; g < gi->ngroups; g++) { 9395 seq_put_decimal_ull(m, g ? " " : "", 9396 from_kgid_munged(uns, gi->gid[g])); 9397 } 9398 seq_puts(m, "\n\tCapEff:\t"); 9399 cap = cred->cap_effective; 9400 CAP_FOR_EACH_U32(__capi) 9401 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); 9402 seq_putc(m, '\n'); 9403 return 0; 9404} 9405 9406static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) 9407{ 9408 struct io_sq_data *sq = NULL; 9409 bool has_lock; 9410 int i; 9411 9412 /* 9413 * Avoid ABBA deadlock between the seq lock and the io_uring mutex, 9414 * since fdinfo case grabs it in the opposite direction of normal use 9415 * cases. If we fail to get the lock, we just don't iterate any 9416 * structures that could be going away outside the io_uring mutex. 9417 */ 9418 has_lock = mutex_trylock(&ctx->uring_lock); 9419 9420 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { 9421 sq = ctx->sq_data; 9422 if (!sq->thread) 9423 sq = NULL; 9424 } 9425 9426 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); 9427 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? 
task_cpu(sq->thread) : -1); 9428 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); 9429 for (i = 0; has_lock && i < ctx->nr_user_files; i++) { 9430 struct file *f = io_file_from_index(ctx, i); 9431 9432 if (f) 9433 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); 9434 else 9435 seq_printf(m, "%5u: <none>\n", i); 9436 } 9437 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); 9438 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { 9439 struct io_mapped_ubuf *buf = ctx->user_bufs[i]; 9440 unsigned int len = buf->ubuf_end - buf->ubuf; 9441 9442 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); 9443 } 9444 if (has_lock && !xa_empty(&ctx->personalities)) { 9445 unsigned long index; 9446 const struct cred *cred; 9447 9448 seq_printf(m, "Personalities:\n"); 9449 xa_for_each(&ctx->personalities, index, cred) 9450 io_uring_show_cred(m, index, cred); 9451 } 9452 seq_printf(m, "PollList:\n"); 9453 spin_lock_irq(&ctx->completion_lock); 9454 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 9455 struct hlist_head *list = &ctx->cancel_hash[i]; 9456 struct io_kiocb *req; 9457 9458 hlist_for_each_entry(req, list, hash_node) 9459 seq_printf(m, " op=%d, task_works=%d\n", req->opcode, 9460 req->task->task_works != NULL); 9461 } 9462 spin_unlock_irq(&ctx->completion_lock); 9463 if (has_lock) 9464 mutex_unlock(&ctx->uring_lock); 9465} 9466 9467static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) 9468{ 9469 struct io_ring_ctx *ctx = f->private_data; 9470 9471 if (percpu_ref_tryget(&ctx->refs)) { 9472 __io_uring_show_fdinfo(ctx, m); 9473 percpu_ref_put(&ctx->refs); 9474 } 9475} 9476#endif 9477 9478static const struct file_operations io_uring_fops = { 9479 .release = io_uring_release, 9480 .mmap = io_uring_mmap, 9481#ifndef CONFIG_MMU 9482 .get_unmapped_area = io_uring_nommu_get_unmapped_area, 9483 .mmap_capabilities = io_uring_nommu_mmap_capabilities, 9484#endif 9485 .poll = io_uring_poll, 9486 .fasync = io_uring_fasync, 9487#ifdef CONFIG_PROC_FS 9488 .show_fdinfo = io_uring_show_fdinfo, 9489#endif 9490}; 9491 9492static int io_allocate_scq_urings(struct io_ring_ctx *ctx, 9493 struct io_uring_params *p) 9494{ 9495 struct io_rings *rings; 9496 size_t size, sq_array_offset; 9497 9498 /* make sure these are sane, as we already accounted them */ 9499 ctx->sq_entries = p->sq_entries; 9500 ctx->cq_entries = p->cq_entries; 9501 9502 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); 9503 if (size == SIZE_MAX) 9504 return -EOVERFLOW; 9505 9506 rings = io_mem_alloc(size); 9507 if (!rings) 9508 return -ENOMEM; 9509 9510 ctx->rings = rings; 9511 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 9512 rings->sq_ring_mask = p->sq_entries - 1; 9513 rings->cq_ring_mask = p->cq_entries - 1; 9514 rings->sq_ring_entries = p->sq_entries; 9515 rings->cq_ring_entries = p->cq_entries; 9516 ctx->sq_mask = rings->sq_ring_mask; 9517 ctx->cq_mask = rings->cq_ring_mask; 9518 9519 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); 9520 if (size == SIZE_MAX) { 9521 io_mem_free(ctx->rings); 9522 ctx->rings = NULL; 9523 return -EOVERFLOW; 9524 } 9525 9526 ctx->sq_sqes = io_mem_alloc(size); 9527 if (!ctx->sq_sqes) { 9528 io_mem_free(ctx->rings); 9529 ctx->rings = NULL; 9530 return -ENOMEM; 9531 } 9532 9533 return 0; 9534} 9535 9536static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) 9537{ 9538 int ret, fd; 9539 9540 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); 9541 if (fd < 0) 9542 return fd; 9543 9544 ret = io_uring_add_task_file(ctx); 9545 if 
(ret) { 9546 put_unused_fd(fd); 9547 return ret; 9548 } 9549 fd_install(fd, file); 9550 return fd; 9551} 9552 9553/* 9554 * Allocate an anonymous fd, this is what constitutes the application 9555 * visible backing of an io_uring instance. The application mmaps this 9556 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, 9557 * we have to tie this fd to a socket for file garbage collection purposes. 9558 */ 9559static struct file *io_uring_get_file(struct io_ring_ctx *ctx) 9560{ 9561 struct file *file; 9562#if defined(CONFIG_UNIX) 9563 int ret; 9564 9565 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, 9566 &ctx->ring_sock); 9567 if (ret) 9568 return ERR_PTR(ret); 9569#endif 9570 9571 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, 9572 O_RDWR | O_CLOEXEC); 9573#if defined(CONFIG_UNIX) 9574 if (IS_ERR(file)) { 9575 sock_release(ctx->ring_sock); 9576 ctx->ring_sock = NULL; 9577 } else { 9578 ctx->ring_sock->file = file; 9579 } 9580#endif 9581 return file; 9582} 9583 9584static int io_uring_create(unsigned entries, struct io_uring_params *p, 9585 struct io_uring_params __user *params) 9586{ 9587 struct io_ring_ctx *ctx; 9588 struct file *file; 9589 int ret; 9590 9591 if (!entries) 9592 return -EINVAL; 9593 if (entries > IORING_MAX_ENTRIES) { 9594 if (!(p->flags & IORING_SETUP_CLAMP)) 9595 return -EINVAL; 9596 entries = IORING_MAX_ENTRIES; 9597 } 9598 9599 /* 9600 * Use twice as many entries for the CQ ring. It's possible for the 9601 * application to drive a higher depth than the size of the SQ ring, 9602 * since the sqes are only used at submission time. This allows for 9603 * some flexibility in overcommitting a bit. If the application has 9604 * set IORING_SETUP_CQSIZE, it will have passed in the desired number 9605 * of CQ ring entries manually. 9606 */ 9607 p->sq_entries = roundup_pow_of_two(entries); 9608 if (p->flags & IORING_SETUP_CQSIZE) { 9609 /* 9610 * If IORING_SETUP_CQSIZE is set, we do the same roundup 9611 * to a power-of-two, if it isn't already. We do NOT impose 9612 * any cq vs sq ring sizing. 9613 */ 9614 if (!p->cq_entries) 9615 return -EINVAL; 9616 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { 9617 if (!(p->flags & IORING_SETUP_CLAMP)) 9618 return -EINVAL; 9619 p->cq_entries = IORING_MAX_CQ_ENTRIES; 9620 } 9621 p->cq_entries = roundup_pow_of_two(p->cq_entries); 9622 if (p->cq_entries < p->sq_entries) 9623 return -EINVAL; 9624 } else { 9625 p->cq_entries = 2 * p->sq_entries; 9626 } 9627 9628 ctx = io_ring_ctx_alloc(p); 9629 if (!ctx) 9630 return -ENOMEM; 9631 ctx->compat = in_compat_syscall(); 9632 if (!capable(CAP_IPC_LOCK)) 9633 ctx->user = get_uid(current_user()); 9634 9635 /* 9636 * This is just grabbed for accounting purposes. When a process exits, 9637 * the mm is exited and dropped before the files, hence we need to hang 9638 * on to this mm purely for the purposes of being able to unaccount 9639 * memory (locked/pinned vm). It's not used for anything else. 
	 */
	mmgrab(current->mm);
	ctx->mm_account = current->mm;

	ret = io_allocate_scq_urings(ctx, p);
	if (ret)
		goto err;

	ret = io_sq_offload_create(ctx, p);
	if (ret)
		goto err;
	/* always set a rsrc node */
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		goto err;
	io_rsrc_node_switch(ctx, NULL);

	memset(&p->sq_off, 0, sizeof(p->sq_off));
	p->sq_off.head = offsetof(struct io_rings, sq.head);
	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;

	memset(&p->cq_off, 0, sizeof(p->cq_off));
	p->cq_off.head = offsetof(struct io_rings, cq.head);
	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
	p->cq_off.cqes = offsetof(struct io_rings, cqes);
	p->cq_off.flags = offsetof(struct io_rings, cq_flags);

	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
			IORING_FEAT_RSRC_TAGS;

	if (copy_to_user(params, p, sizeof(*p))) {
		ret = -EFAULT;
		goto err;
	}

	file = io_uring_get_file(ctx);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto err;
	}

	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
	ret = io_uring_install_fd(ctx, file);
	if (ret < 0) {
		/* fput will clean it up */
		fput(file);
		return ret;
	}

	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
	return ret;
err:
	io_ring_ctx_wait_and_kill(ctx);
	return ret;
}

/*
 * Sets up an io_uring context and returns the fd. Applications ask for a
 * ring size; we return the actual sq/cq ring sizes (among other things) in
 * the params structure passed in.
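 * (The requested size is rounded up to a power of two; sizes above
 * IORING_MAX_ENTRIES are rejected unless IORING_SETUP_CLAMP was given, in
 * which case they are clamped.)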
9715 */ 9716static long io_uring_setup(u32 entries, struct io_uring_params __user *params) 9717{ 9718 struct io_uring_params p; 9719 int i; 9720 9721 if (copy_from_user(&p, params, sizeof(p))) 9722 return -EFAULT; 9723 for (i = 0; i < ARRAY_SIZE(p.resv); i++) { 9724 if (p.resv[i]) 9725 return -EINVAL; 9726 } 9727 9728 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | 9729 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | 9730 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | 9731 IORING_SETUP_R_DISABLED)) 9732 return -EINVAL; 9733 9734 return io_uring_create(entries, &p, params); 9735} 9736 9737SYSCALL_DEFINE2(io_uring_setup, u32, entries, 9738 struct io_uring_params __user *, params) 9739{ 9740 return io_uring_setup(entries, params); 9741} 9742 9743static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) 9744{ 9745 struct io_uring_probe *p; 9746 size_t size; 9747 int i, ret; 9748 9749 size = struct_size(p, ops, nr_args); 9750 if (size == SIZE_MAX) 9751 return -EOVERFLOW; 9752 p = kzalloc(size, GFP_KERNEL); 9753 if (!p) 9754 return -ENOMEM; 9755 9756 ret = -EFAULT; 9757 if (copy_from_user(p, arg, size)) 9758 goto out; 9759 ret = -EINVAL; 9760 if (memchr_inv(p, 0, size)) 9761 goto out; 9762 9763 p->last_op = IORING_OP_LAST - 1; 9764 if (nr_args > IORING_OP_LAST) 9765 nr_args = IORING_OP_LAST; 9766 9767 for (i = 0; i < nr_args; i++) { 9768 p->ops[i].op = i; 9769 if (!io_op_defs[i].not_supported) 9770 p->ops[i].flags = IO_URING_OP_SUPPORTED; 9771 } 9772 p->ops_len = i; 9773 9774 ret = 0; 9775 if (copy_to_user(arg, p, size)) 9776 ret = -EFAULT; 9777out: 9778 kfree(p); 9779 return ret; 9780} 9781 9782static int io_register_personality(struct io_ring_ctx *ctx) 9783{ 9784 const struct cred *creds; 9785 u32 id; 9786 int ret; 9787 9788 creds = get_current_cred(); 9789 9790 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, 9791 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); 9792 if (!ret) 9793 return id; 9794 put_cred(creds); 9795 return ret; 9796} 9797 9798static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, 9799 unsigned int nr_args) 9800{ 9801 struct io_uring_restriction *res; 9802 size_t size; 9803 int i, ret; 9804 9805 /* Restrictions allowed only if rings started disabled */ 9806 if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 9807 return -EBADFD; 9808 9809 /* We allow only a single restrictions registration */ 9810 if (ctx->restrictions.registered) 9811 return -EBUSY; 9812 9813 if (!arg || nr_args > IORING_MAX_RESTRICTIONS) 9814 return -EINVAL; 9815 9816 size = array_size(nr_args, sizeof(*res)); 9817 if (size == SIZE_MAX) 9818 return -EOVERFLOW; 9819 9820 res = memdup_user(arg, size); 9821 if (IS_ERR(res)) 9822 return PTR_ERR(res); 9823 9824 ret = 0; 9825 9826 for (i = 0; i < nr_args; i++) { 9827 switch (res[i].opcode) { 9828 case IORING_RESTRICTION_REGISTER_OP: 9829 if (res[i].register_op >= IORING_REGISTER_LAST) { 9830 ret = -EINVAL; 9831 goto out; 9832 } 9833 9834 __set_bit(res[i].register_op, 9835 ctx->restrictions.register_op); 9836 break; 9837 case IORING_RESTRICTION_SQE_OP: 9838 if (res[i].sqe_op >= IORING_OP_LAST) { 9839 ret = -EINVAL; 9840 goto out; 9841 } 9842 9843 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); 9844 break; 9845 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: 9846 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; 9847 break; 9848 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: 9849 ctx->restrictions.sqe_flags_required = res[i].sqe_flags; 9850 break; 9851 default: 9852 ret = -EINVAL; 9853 goto out; 
9854 } 9855 } 9856 9857out: 9858 /* Reset all restrictions if an error happened */ 9859 if (ret != 0) 9860 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); 9861 else 9862 ctx->restrictions.registered = true; 9863 9864 kfree(res); 9865 return ret; 9866} 9867 9868static int io_register_enable_rings(struct io_ring_ctx *ctx) 9869{ 9870 if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 9871 return -EBADFD; 9872 9873 if (ctx->restrictions.registered) 9874 ctx->restricted = 1; 9875 9876 ctx->flags &= ~IORING_SETUP_R_DISABLED; 9877 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) 9878 wake_up(&ctx->sq_data->wait); 9879 return 0; 9880} 9881 9882static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, 9883 struct io_uring_rsrc_update2 *up, 9884 unsigned nr_args) 9885{ 9886 __u32 tmp; 9887 int err; 9888 9889 if (up->resv) 9890 return -EINVAL; 9891 if (check_add_overflow(up->offset, nr_args, &tmp)) 9892 return -EOVERFLOW; 9893 err = io_rsrc_node_switch_start(ctx); 9894 if (err) 9895 return err; 9896 9897 switch (type) { 9898 case IORING_RSRC_FILE: 9899 return __io_sqe_files_update(ctx, up, nr_args); 9900 case IORING_RSRC_BUFFER: 9901 return __io_sqe_buffers_update(ctx, up, nr_args); 9902 } 9903 return -EINVAL; 9904} 9905 9906static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, 9907 unsigned nr_args) 9908{ 9909 struct io_uring_rsrc_update2 up; 9910 9911 if (!nr_args) 9912 return -EINVAL; 9913 memset(&up, 0, sizeof(up)); 9914 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) 9915 return -EFAULT; 9916 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); 9917} 9918 9919static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, 9920 unsigned size, unsigned type) 9921{ 9922 struct io_uring_rsrc_update2 up; 9923 9924 if (size != sizeof(up)) 9925 return -EINVAL; 9926 if (copy_from_user(&up, arg, sizeof(up))) 9927 return -EFAULT; 9928 if (!up.nr || up.resv) 9929 return -EINVAL; 9930 return __io_register_rsrc_update(ctx, type, &up, up.nr); 9931} 9932 9933static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, 9934 unsigned int size, unsigned int type) 9935{ 9936 struct io_uring_rsrc_register rr; 9937 9938 /* keep it extendible */ 9939 if (size != sizeof(rr)) 9940 return -EINVAL; 9941 9942 memset(&rr, 0, sizeof(rr)); 9943 if (copy_from_user(&rr, arg, size)) 9944 return -EFAULT; 9945 if (!rr.nr || rr.resv || rr.resv2) 9946 return -EINVAL; 9947 9948 switch (type) { 9949 case IORING_RSRC_FILE: 9950 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), 9951 rr.nr, u64_to_user_ptr(rr.tags)); 9952 case IORING_RSRC_BUFFER: 9953 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), 9954 rr.nr, u64_to_user_ptr(rr.tags)); 9955 } 9956 return -EINVAL; 9957} 9958 9959static bool io_register_op_must_quiesce(int op) 9960{ 9961 switch (op) { 9962 case IORING_REGISTER_BUFFERS: 9963 case IORING_UNREGISTER_BUFFERS: 9964 case IORING_REGISTER_FILES: 9965 case IORING_UNREGISTER_FILES: 9966 case IORING_REGISTER_FILES_UPDATE: 9967 case IORING_REGISTER_PROBE: 9968 case IORING_REGISTER_PERSONALITY: 9969 case IORING_UNREGISTER_PERSONALITY: 9970 case IORING_REGISTER_FILES2: 9971 case IORING_REGISTER_FILES_UPDATE2: 9972 case IORING_REGISTER_BUFFERS2: 9973 case IORING_REGISTER_BUFFERS_UPDATE: 9974 return false; 9975 default: 9976 return true; 9977 } 9978} 9979 9980static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, 9981 void __user *arg, unsigned nr_args) 9982 
__releases(ctx->uring_lock) 9983 __acquires(ctx->uring_lock) 9984{ 9985 int ret; 9986 9987 /* 9988 * We're inside the ring mutex, if the ref is already dying, then 9989 * someone else killed the ctx or is already going through 9990 * io_uring_register(). 9991 */ 9992 if (percpu_ref_is_dying(&ctx->refs)) 9993 return -ENXIO; 9994 9995 if (ctx->restricted) { 9996 if (opcode >= IORING_REGISTER_LAST) 9997 return -EINVAL; 9998 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); 9999 if (!test_bit(opcode, ctx->restrictions.register_op)) 10000 return -EACCES; 10001 } 10002 10003 if (io_register_op_must_quiesce(opcode)) { 10004 percpu_ref_kill(&ctx->refs); 10005 10006 /* 10007 * Drop uring mutex before waiting for references to exit. If 10008 * another thread is currently inside io_uring_enter() it might 10009 * need to grab the uring_lock to make progress. If we hold it 10010 * here across the drain wait, then we can deadlock. It's safe 10011 * to drop the mutex here, since no new references will come in 10012 * after we've killed the percpu ref. 10013 */ 10014 mutex_unlock(&ctx->uring_lock); 10015 do { 10016 ret = wait_for_completion_interruptible(&ctx->ref_comp); 10017 if (!ret) 10018 break; 10019 ret = io_run_task_work_sig(); 10020 if (ret < 0) 10021 break; 10022 } while (1); 10023 mutex_lock(&ctx->uring_lock); 10024 10025 if (ret) { 10026 io_refs_resurrect(&ctx->refs, &ctx->ref_comp); 10027 return ret; 10028 } 10029 } 10030 10031 switch (opcode) { 10032 case IORING_REGISTER_BUFFERS: 10033 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); 10034 break; 10035 case IORING_UNREGISTER_BUFFERS: 10036 ret = -EINVAL; 10037 if (arg || nr_args) 10038 break; 10039 ret = io_sqe_buffers_unregister(ctx); 10040 break; 10041 case IORING_REGISTER_FILES: 10042 ret = io_sqe_files_register(ctx, arg, nr_args, NULL); 10043 break; 10044 case IORING_UNREGISTER_FILES: 10045 ret = -EINVAL; 10046 if (arg || nr_args) 10047 break; 10048 ret = io_sqe_files_unregister(ctx); 10049 break; 10050 case IORING_REGISTER_FILES_UPDATE: 10051 ret = io_register_files_update(ctx, arg, nr_args); 10052 break; 10053 case IORING_REGISTER_EVENTFD: 10054 case IORING_REGISTER_EVENTFD_ASYNC: 10055 ret = -EINVAL; 10056 if (nr_args != 1) 10057 break; 10058 ret = io_eventfd_register(ctx, arg); 10059 if (ret) 10060 break; 10061 if (opcode == IORING_REGISTER_EVENTFD_ASYNC) 10062 ctx->eventfd_async = 1; 10063 else 10064 ctx->eventfd_async = 0; 10065 break; 10066 case IORING_UNREGISTER_EVENTFD: 10067 ret = -EINVAL; 10068 if (arg || nr_args) 10069 break; 10070 ret = io_eventfd_unregister(ctx); 10071 break; 10072 case IORING_REGISTER_PROBE: 10073 ret = -EINVAL; 10074 if (!arg || nr_args > 256) 10075 break; 10076 ret = io_probe(ctx, arg, nr_args); 10077 break; 10078 case IORING_REGISTER_PERSONALITY: 10079 ret = -EINVAL; 10080 if (arg || nr_args) 10081 break; 10082 ret = io_register_personality(ctx); 10083 break; 10084 case IORING_UNREGISTER_PERSONALITY: 10085 ret = -EINVAL; 10086 if (arg) 10087 break; 10088 ret = io_unregister_personality(ctx, nr_args); 10089 break; 10090 case IORING_REGISTER_ENABLE_RINGS: 10091 ret = -EINVAL; 10092 if (arg || nr_args) 10093 break; 10094 ret = io_register_enable_rings(ctx); 10095 break; 10096 case IORING_REGISTER_RESTRICTIONS: 10097 ret = io_register_restrictions(ctx, arg, nr_args); 10098 break; 10099 case IORING_REGISTER_FILES2: 10100 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); 10101 break; 10102 case IORING_REGISTER_FILES_UPDATE2: 10103 ret = io_register_rsrc_update(ctx, arg, nr_args, 
10104 IORING_RSRC_FILE); 10105 break; 10106 case IORING_REGISTER_BUFFERS2: 10107 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); 10108 break; 10109 case IORING_REGISTER_BUFFERS_UPDATE: 10110 ret = io_register_rsrc_update(ctx, arg, nr_args, 10111 IORING_RSRC_BUFFER); 10112 break; 10113 default: 10114 ret = -EINVAL; 10115 break; 10116 } 10117 10118 if (io_register_op_must_quiesce(opcode)) { 10119 /* bring the ctx back to life */ 10120 percpu_ref_reinit(&ctx->refs); 10121 reinit_completion(&ctx->ref_comp); 10122 } 10123 return ret; 10124} 10125 10126SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, 10127 void __user *, arg, unsigned int, nr_args) 10128{ 10129 struct io_ring_ctx *ctx; 10130 long ret = -EBADF; 10131 struct fd f; 10132 10133 f = fdget(fd); 10134 if (!f.file) 10135 return -EBADF; 10136 10137 ret = -EOPNOTSUPP; 10138 if (f.file->f_op != &io_uring_fops) 10139 goto out_fput; 10140 10141 ctx = f.file->private_data; 10142 10143 io_run_task_work(); 10144 10145 mutex_lock(&ctx->uring_lock); 10146 ret = __io_uring_register(ctx, opcode, arg, nr_args); 10147 mutex_unlock(&ctx->uring_lock); 10148 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, 10149 ctx->cq_ev_fd != NULL, ret); 10150out_fput: 10151 fdput(f); 10152 return ret; 10153} 10154 10155static int __init io_uring_init(void) 10156{ 10157#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ 10158 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ 10159 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ 10160} while (0) 10161 10162#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ 10163 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename) 10164 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64); 10165 BUILD_BUG_SQE_ELEM(0, __u8, opcode); 10166 BUILD_BUG_SQE_ELEM(1, __u8, flags); 10167 BUILD_BUG_SQE_ELEM(2, __u16, ioprio); 10168 BUILD_BUG_SQE_ELEM(4, __s32, fd); 10169 BUILD_BUG_SQE_ELEM(8, __u64, off); 10170 BUILD_BUG_SQE_ELEM(8, __u64, addr2); 10171 BUILD_BUG_SQE_ELEM(16, __u64, addr); 10172 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); 10173 BUILD_BUG_SQE_ELEM(24, __u32, len); 10174 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); 10175 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); 10176 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); 10177 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); 10178 BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); 10179 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); 10180 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); 10181 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); 10182 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); 10183 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags); 10184 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags); 10185 BUILD_BUG_SQE_ELEM(28, __u32, open_flags); 10186 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); 10187 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); 10188 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); 10189 BUILD_BUG_SQE_ELEM(32, __u64, user_data); 10190 BUILD_BUG_SQE_ELEM(40, __u16, buf_index); 10191 BUILD_BUG_SQE_ELEM(42, __u16, personality); 10192 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); 10193 10194 BUILD_BUG_ON(sizeof(struct io_uring_files_update) != 10195 sizeof(struct io_uring_rsrc_update)); 10196 BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > 10197 sizeof(struct io_uring_rsrc_update2)); 10198 /* should fit into one byte */ 10199 BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); 10200 10201 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); 10202 
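	/* every REQ_F_* flag bit must fit within the width of an int */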
BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); 10203 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | 10204 SLAB_ACCOUNT); 10205 return 0; 10206}; 10207__initcall(io_uring_init);