fs/io_uring.c at v5.16-rc2 · tjh.dev/kernel

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / io_uring.c
at v5.16-rc2 11127 lines 281 kB view raw
wrap content
    1// SPDX-License-Identifier: GPL-2.0
    2/*
    3 * Shared application/kernel submission and completion ring pairs, for
    4 * supporting fast/efficient IO.
    5 *
    6 * A note on the read/write ordering memory barriers that are matched between
    7 * the application and kernel side.
    8 *
    9 * After the application reads the CQ ring tail, it must use an
   10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
   11 * before writing the tail (using smp_load_acquire to read the tail will
   12 * do). It also needs a smp_mb() before updating CQ head (ordering the
   13 * entry load(s) with the head store), pairing with an implicit barrier
   14 * through a control-dependency in io_get_cqe (smp_store_release to
   15 * store head will do). Failure to do so could lead to reading invalid
   16 * CQ entries.
   17 *
   18 * Likewise, the application must use an appropriate smp_wmb() before
   19 * writing the SQ tail (ordering SQ entry stores with the tail store),
   20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
   21 * to store the tail will do). And it needs a barrier ordering the SQ
   22 * head load before writing new SQ entries (smp_load_acquire to read
   23 * head will do).
   24 *
   25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
   26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
   27 * updating the SQ tail; a full memory barrier smp_mb() is needed
   28 * between.
   29 *
   30 * Also see the examples in the liburing library:
   31 *
   32 *	git://git.kernel.dk/liburing
   33 *
   34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
   35 * on data shared between the kernel and application. This is done both
   36 * for ordering purposes and to ensure that once a value is loaded from
   37 * data that the application could potentially modify, it remains stable.
   38 *
   39 * Copyright (C) 2018-2019 Jens Axboe
   40 * Copyright (c) 2018-2019 Christoph Hellwig
   41 */
   42#include <linux/kernel.h>
   43#include <linux/init.h>
   44#include <linux/errno.h>
   45#include <linux/syscalls.h>
   46#include <linux/compat.h>
   47#include <net/compat.h>
   48#include <linux/refcount.h>
   49#include <linux/uio.h>
   50#include <linux/bits.h>
   51
   52#include <linux/sched/signal.h>
   53#include <linux/fs.h>
   54#include <linux/file.h>
   55#include <linux/fdtable.h>
   56#include <linux/mm.h>
   57#include <linux/mman.h>
   58#include <linux/percpu.h>
   59#include <linux/slab.h>
   60#include <linux/blkdev.h>
   61#include <linux/bvec.h>
   62#include <linux/net.h>
   63#include <net/sock.h>
   64#include <net/af_unix.h>
   65#include <net/scm.h>
   66#include <linux/anon_inodes.h>
   67#include <linux/sched/mm.h>
   68#include <linux/uaccess.h>
   69#include <linux/nospec.h>
   70#include <linux/sizes.h>
   71#include <linux/hugetlb.h>
   72#include <linux/highmem.h>
   73#include <linux/namei.h>
   74#include <linux/fsnotify.h>
   75#include <linux/fadvise.h>
   76#include <linux/eventpoll.h>
   77#include <linux/splice.h>
   78#include <linux/task_work.h>
   79#include <linux/pagemap.h>
   80#include <linux/io_uring.h>
   81#include <linux/tracehook.h>
   82#include <linux/audit.h>
   83#include <linux/security.h>
   84
   85#define CREATE_TRACE_POINTS
   86#include <trace/events/io_uring.h>
   87
   88#include <uapi/linux/io_uring.h>
   89
   90#include "internal.h"
   91#include "io-wq.h"
   92
   93#define IORING_MAX_ENTRIES	32768
     /* CQ ring limit, defined as twice IORING_MAX_ENTRIES */
   94#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
   95#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
   96
   97/* only define max */
   98#define IORING_MAX_FIXED_FILES	(1U << 15)
     /*
      * Sum of the three *_LAST enumerators.
      * NOTE(review): presumably caps how many restriction entries may be
      * registered - confirm at the use site.
      */
   99#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
  100				 IORING_REGISTER_LAST + IORING_OP_LAST)
  101
     /*
      * One tag table has 1 << (PAGE_SHIFT - 3) entries, i.e. PAGE_SIZE / 8;
      * the mask extracts an index within one table.
      * NOTE(review): presumably sized so that a table of u64 tags (see
      * io_rsrc_data.tags) fills exactly one page - confirm.
      */
  102#define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
  103#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
  104#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)
  105
  106#define IORING_MAX_REG_BUFFERS	(1U << 14)
  107
  108#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
  109			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)
  110
     /* SQE_COMMON_FLAGS plus IOSQE_BUFFER_SELECT and IOSQE_IO_DRAIN */
  111#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN)
  112
     /*
      * NOTE(review): presumably the request flags that signal extra teardown
      * work is needed - confirm where this mask is tested.
      */
  113#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
  114				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
  115				REQ_F_ASYNC_DATA)
  116
  117#define IO_TCTX_REFS_CACHE_NR	(1U << 10)
  118
  119struct io_uring {
     	/*
     	 * One head/tail index pair (used twice in struct io_rings, for the
     	 * SQ and the CQ). Each index is marked ____cacheline_aligned_in_smp,
     	 * so on SMP builds head and tail start on separate cache lines.
     	 */
  120	u32 head ____cacheline_aligned_in_smp;
  121	u32 tail ____cacheline_aligned_in_smp;
  122};
  123
  124/*
  125 * This data is shared with the application through the mmap at offsets
  126 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
  127 *
  128 * The offsets to the member fields are published through struct
  129 * io_sqring_offsets when calling io_uring_setup.
  130 */
  131struct io_rings {
     	/*
     	 * Field order: SQ/CQ index pairs, the constant masks and sizes,
     	 * kernel-written counters/flags, then the CQE array as a trailing
     	 * flexible array member.
     	 */
  132	/*
  133	 * Head and tail offsets into the ring; the offsets need to be
  134	 * masked to get valid indices.
  135	 *
  136	 * The kernel controls head of the sq ring and the tail of the cq ring,
  137	 * and the application controls tail of the sq ring and the head of the
  138	 * cq ring.
  139	 */
  140	struct io_uring		sq, cq;
  141	/*
  142	 * Bitmasks to apply to head and tail offsets (constant, equals
  143	 * ring_entries - 1)
  144	 */
  145	u32			sq_ring_mask, cq_ring_mask;
  146	/* Ring sizes (constant, power of 2) */
  147	u32			sq_ring_entries, cq_ring_entries;
  148	/*
  149	 * Number of invalid entries dropped by the kernel due to
  150	 * invalid index stored in array
  151	 *
  152	 * Written by the kernel, shouldn't be modified by the
  153	 * application (i.e. get number of "new events" by comparing to
  154	 * cached value).
  155	 *
  156	 * After a new SQ head value was read by the application this
  157	 * counter includes all submissions that were dropped reaching
  158	 * the new SQ head (and possibly more).
  159	 */
  160	u32			sq_dropped;
  161	/*
  162	 * Runtime SQ flags
  163	 *
  164	 * Written by the kernel, shouldn't be modified by the
  165	 * application.
  166	 *
  167	 * The application needs a full memory barrier before checking
  168	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
  169	 */
  170	u32			sq_flags;
  171	/*
  172	 * Runtime CQ flags
  173	 *
  174	 * Written by the application, shouldn't be modified by the
  175	 * kernel.
  176	 */
  177	u32			cq_flags;
  178	/*
  179	 * Number of completion events lost because the queue was full;
  180	 * this should be avoided by the application by making sure
  181	 * there are not more requests pending than there is space in
  182	 * the completion queue.
  183	 *
  184	 * Written by the kernel, shouldn't be modified by the
  185	 * application (i.e. get number of "new events" by comparing to
  186	 * cached value).
  187	 *
  188	 * As completion events come in out of order this counter is not
  189	 * ordered with any other data.
  190	 */
  191	u32			cq_overflow;
  192	/*
  193	 * Ring buffer of completion events.
  194	 *
  195	 * The kernel writes completion events fresh every time they are
  196	 * produced, so the application is allowed to modify pending
  197	 * entries.
  198	 */
     	/* flexible array member; its start is aligned to a cache line on SMP */
  199	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
  200};
  201
  202enum io_uring_cmd_flags {
     	/*
     	 * Bit flags: values 1 and 2 plus the sign bit.
     	 * NOTE(review): presumably what the "issue_flags" parameters of the
     	 * handlers declared in this file carry - confirm against callers.
     	 */
  203	IO_URING_F_COMPLETE_DEFER	= 1,
  204	IO_URING_F_UNLOCKED		= 2,
  205	/* int's last bit, sign checks are usually faster than a bit test */
  206	IO_URING_F_NONBLOCK		= INT_MIN,
  207};
  208
  209struct io_mapped_ubuf {
     	/*
     	 * Describes one mapped user buffer as an address range plus a
     	 * trailing array of bio_vec segments.
     	 * NOTE(review): whether ubuf_end is inclusive or exclusive is not
     	 * visible here - confirm where the range is checked.
     	 */
  210	u64		ubuf;
  211	u64		ubuf_end;
     	/* number of entries in bvec[] */
  212	unsigned int	nr_bvecs;
     	/* NOTE(review): presumably pages charged for memory accounting */
  213	unsigned long	acct_pages;
     	/* flexible array member, sized at allocation time */
  214	struct bio_vec	bvec[];
  215};
  216
  217struct io_ring_ctx;
     /* ^ forward declaration: io_ring_ctx is used by pointer before its definition */
  218
  219struct io_overflow_cqe {
     	/*
     	 * A completion entry together with list linkage.
     	 * NOTE(review): presumably queued on io_ring_ctx.cq_overflow_list
     	 * when it cannot be posted to the CQ ring - confirm.
     	 */
  220	struct io_uring_cqe cqe;
  221	struct list_head list;
  222};
  223
  224struct io_fixed_file {
     	/*
     	 * One slot of io_file_table.files. The file pointer is stored as an
     	 * integer so that flag bits can share the same word.
     	 */
  225	/* file * with additional FFS_* flags */
  226	unsigned long file_ptr;
  227};
  228
  229struct io_rsrc_put {
     	/*
     	 * One resource handed to an rsrc_put_fn callback: list linkage, a
     	 * u64 tag, and the resource itself.
     	 */
  230	struct list_head list;
  231	u64 tag;
     	/* the same pointer viewed untyped, as a file, or as a mapped buffer */
  232	union {
  233		void *rsrc;
  234		struct file *file;
  235		struct io_mapped_ubuf *buf;
  236	};
  237};
  238
  239struct io_file_table {
     	/*
     	 * Array of fixed-file slots.
     	 * NOTE(review): the element count is not stored here; presumably
     	 * io_ring_ctx.nr_user_files - confirm.
     	 */
  240	struct io_fixed_file *files;
  241};
  242
  243struct io_rsrc_node {
     	/*
     	 * Reference-counted node tying a list of resources to their owning
     	 * io_rsrc_data.
     	 * NOTE(review): list/llist membership (presumably ctx->rsrc_ref_list
     	 * and ctx->rsrc_put_llist) and the exact meaning of "done" are not
     	 * visible in this part of the file - confirm.
     	 */
  244	struct percpu_ref		refs;
  245	struct list_head		node;
     	/* NOTE(review): presumably a list of struct io_rsrc_put entries */
  246	struct list_head		rsrc_list;
  247	struct io_rsrc_data		*rsrc_data;
  248	struct llist_node		llist;
  249	bool				done;
  250};
  251
  252typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
     /* ^ function type (not a pointer type); io_rsrc_data.do_put points to one */
  253
  254struct io_rsrc_data {
     	/* Bookkeeping for one set of registered resources of a ring. */
  255	struct io_ring_ctx		*ctx;
  256
     	/*
     	 * two-level table of u64 tags.
     	 * NOTE(review): presumably indexed using IO_RSRC_TAG_TABLE_SHIFT/MASK
     	 */
  257	u64				**tags;
  258	unsigned int			nr;
     	/* callback that releases one resource */
  259	rsrc_put_fn			*do_put;
  260	atomic_t			refs;
  261	struct completion		done;
  262	bool				quiesce;
  263};
  264
  265struct io_buffer {
     	/*
     	 * One buffer described by address, length and a 16-bit id, with
     	 * list linkage.
     	 * NOTE(review): presumably a buffer handed over through struct
     	 * io_provide_buf (same addr/len/bid field names) - confirm.
     	 */
  266	struct list_head list;
  267	__u64 addr;
  268	__u32 len;
  269	__u16 bid;
  270};
  271
  272struct io_restriction {
     	/* bitmap with IORING_REGISTER_LAST bits: one per register opcode */
  273	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
     	/* bitmap with IORING_OP_LAST bits: one per sqe opcode */
  274	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
     	/* sqe flag masks; a u8 each */
  275	u8 sqe_flags_allowed;
  276	u8 sqe_flags_required;
  277	bool registered;
  278};
  279
  280enum {
     	/*
     	 * Consecutive values 0 and 1.
     	 * NOTE(review): presumably bit numbers within io_sq_data.state -
     	 * confirm where they are set and tested.
     	 */
  281	IO_SQ_THREAD_SHOULD_STOP = 0,
  282	IO_SQ_THREAD_SHOULD_PARK,
  283};
  284
  285struct io_sq_data {
     	/*
     	 * State of one SQ polling thread; io_ring_ctx.sq_data points here
     	 * "if using sq thread polling".
     	 */
  286	refcount_t		refs;
  287	atomic_t		park_pending;
  288	struct mutex		lock;
  289
  290	/* ctx's that are using this sqd */
  291	struct list_head	ctx_list;
  292
  293	struct task_struct	*thread;
  294	struct wait_queue_head	wait;
  295
  296	unsigned		sq_thread_idle;
  297	int			sq_cpu;
  298	pid_t			task_pid;
  299	pid_t			task_tgid;
  300
     	/* NOTE(review): presumably holds the IO_SQ_THREAD_SHOULD_* bits */
  301	unsigned long		state;
  302	struct completion	exited;
  303};
  304
  305#define IO_COMPL_BATCH			32
     /* NOTE(review): the two below presumably bound the request cache and its refill batch - confirm at use sites */
  306#define IO_REQ_CACHE_SIZE		32
  307#define IO_REQ_ALLOC_BATCH		8
  308
  309struct io_submit_link {
     	/*
     	 * First and last request of a chain.
     	 * NOTE(review): presumably the link chain currently being assembled
     	 * during submission (embedded in io_submit_state) - confirm.
     	 */
  310	struct io_kiocb		*head;
  311	struct io_kiocb		*last;
  312};
  313
  314struct io_submit_state {
     	/* Per-ring submission scratch state; embedded in io_ring_ctx. */
  315	/* inline/task_work completion list, under ->uring_lock */
  316	struct io_wq_work_node	free_list;
  317	/* batch completion logic */
  318	struct io_wq_work_list	compl_reqs;
  319	struct io_submit_link	link;
  320
     	/* block-layer plugging state; "plug" is the plug object itself */
  321	bool			plug_started;
  322	bool			need_plug;
  323	unsigned short		submit_nr;
  324	struct blk_plug		plug;
  325};
  326
  327struct io_ring_ctx {
     	/*
     	 * Per-ring context. Fields are grouped into anonymous structs; the
     	 * groups marked ____cacheline_aligned_in_smp each start on their own
     	 * cache line on SMP builds.
     	 */
  328	/* const or read-mostly hot data */
  329	struct {
  330		struct percpu_ref	refs;
  331
  332		struct io_rings		*rings;
  333		unsigned int		flags;
  334		unsigned int		compat: 1;
  335		unsigned int		drain_next: 1;
  336		unsigned int		eventfd_async: 1;
  337		unsigned int		restricted: 1;
  338		unsigned int		off_timeout_used: 1;
  339		unsigned int		drain_active: 1;
  340	} ____cacheline_aligned_in_smp;
  341
  342	/* submission data */
  343	struct {
  344		struct mutex		uring_lock;
  345
  346		/*
  347		 * Ring buffer of indices into array of io_uring_sqe, which is
  348		 * mmapped by the application using the IORING_OFF_SQES offset.
  349		 *
  350		 * This indirection could e.g. be used to assign fixed
  351		 * io_uring_sqe entries to operations and only submit them to
  352		 * the queue when needed.
  353		 *
  354		 * The kernel modifies neither the indices array nor the entries
  355		 * array.
  356		 */
  357		u32			*sq_array;
  358		struct io_uring_sqe	*sq_sqes;
  359		unsigned		cached_sq_head;
  360		unsigned		sq_entries;
  361		struct list_head	defer_list;
  362
  363		/*
  364		 * Fixed resources fast path, should be accessed only under
  365		 * uring_lock, and updated through io_uring_register(2)
  366		 */
  367		struct io_rsrc_node	*rsrc_node;
  368		int			rsrc_cached_refs;
  369		struct io_file_table	file_table;
  370		unsigned		nr_user_files;
  371		unsigned		nr_user_bufs;
  372		struct io_mapped_ubuf	**user_bufs;
  373
  374		struct io_submit_state	submit_state;
  375		struct list_head	timeout_list;
  376		struct list_head	ltimeout_list;
  377		struct list_head	cq_overflow_list;
  378		struct xarray		io_buffers;
  379		struct xarray		personalities;
  380		u32			pers_next;
  381		unsigned		sq_thread_idle;
  382	} ____cacheline_aligned_in_smp;
  383
  384	/* IRQ completion list, under ->completion_lock */
  385	struct io_wq_work_list	locked_free_list;
  386	unsigned int		locked_free_nr;
  387
  388	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
  389	struct io_sq_data	*sq_data;	/* if using sq thread polling */
  390
  391	struct wait_queue_head	sqo_sq_wait;
  392	struct list_head	sqd_list;
  393
  394	unsigned long		check_cq_overflow;
  395
     	/* completion-queue side state */
  396	struct {
  397		unsigned		cached_cq_tail;
  398		unsigned		cq_entries;
  399		struct eventfd_ctx	*cq_ev_fd;
  400		struct wait_queue_head	cq_wait;
  401		unsigned		cq_extra;
  402		atomic_t		cq_timeouts;
  403		unsigned		cq_last_tm_flush;
  404	} ____cacheline_aligned_in_smp;
  405
     	/* locks plus the iopoll list and the cancel hash table */
  406	struct {
  407		spinlock_t		completion_lock;
  408
  409		spinlock_t		timeout_lock;
  410
  411		/*
  412		 * ->iopoll_list is protected by the ctx->uring_lock for
  413		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
  414		 * For SQPOLL, only the single threaded io_sq_thread() will
  415		 * manipulate the list, hence no extra locking is needed there.
  416		 */
  417		struct io_wq_work_list	iopoll_list;
  418		struct hlist_head	*cancel_hash;
  419		unsigned		cancel_hash_bits;
  420		bool			poll_multi_queue;
  421	} ____cacheline_aligned_in_smp;
  422
  423	struct io_restriction		restrictions;
  424
  425	/* slow path rsrc auxiliary data, used by update/register */
  426	struct {
  427		struct io_rsrc_node		*rsrc_backup_node;
  428		struct io_mapped_ubuf		*dummy_ubuf;
  429		struct io_rsrc_data		*file_data;
  430		struct io_rsrc_data		*buf_data;
  431
  432		struct delayed_work		rsrc_put_work;
  433		struct llist_head		rsrc_put_llist;
  434		struct list_head		rsrc_ref_list;
  435		spinlock_t			rsrc_ref_lock;
  436	};
  437
  438	/* Keep this last, we don't need it for the fast path */
  439	struct {
     		/* ring_sock exists only when the kernel is built with CONFIG_UNIX */
  440		#if defined(CONFIG_UNIX)
  441			struct socket		*ring_sock;
  442		#endif
  443		/* hashed buffered write serialization */
  444		struct io_wq_hash		*hash_map;
  445
  446		/* Only used for accounting purposes */
  447		struct user_struct		*user;
  448		struct mm_struct		*mm_account;
  449
  450		/* ctx exit and cancelation */
  451		struct llist_head		fallback_llist;
  452		struct delayed_work		fallback_work;
  453		struct work_struct		exit_work;
  454		struct list_head		tctx_list;
  455		struct completion		ref_comp;
  456		u32				iowq_limits[2];
  457		bool				iowq_limits_set;
  458	};
  459};
  460
  461struct io_uring_task {
     	/*
     	 * io_uring state kept per task.
     	 * NOTE(review): how it is attached to task_struct is not visible in
     	 * this part of the file.
     	 */
  462	/* submission side */
  463	int			cached_refs;
  464	struct xarray		xa;
  465	struct wait_queue_head	wait;
  466	const struct io_ring_ctx *last;
  467	struct io_wq		*io_wq;
  468	struct percpu_counter	inflight;
  469	atomic_t		inflight_tracked;
  470	atomic_t		in_idle;
  471
     	/* task_work queueing: list of pending items guarded by task_lock */
  472	spinlock_t		task_lock;
  473	struct io_wq_work_list	task_list;
  474	struct callback_head	task_work;
  475	bool			task_running;
  476};
  477
  478/*
  479 * First field must be the file pointer in all the
  480 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
  481 */
  482struct io_poll_iocb {
     	/*
     	 * Poll state: the "poll" member of the io_kiocb command union, also
     	 * embedded in struct async_poll. "file" must stay first.
     	 */
  483	struct file			*file;
     	/* wait queue head this poll refers to (pointer, not owned storage) */
  484	struct wait_queue_head		*head;
  485	__poll_t			events;
  486	bool				done;
  487	bool				canceled;
     	/* this request's own wait queue entry */
  488	struct wait_queue_entry		wait;
  489};
  490
  491struct io_poll_update {
     	/* "poll_update" member of the io_kiocb command union; file first */
  492	struct file			*file;
  493	u64				old_user_data;
  494	u64				new_user_data;
  495	__poll_t			events;
     	/* select which of events / user_data the update applies to */
  496	bool				update_events;
  497	bool				update_user_data;
  498};
  499
  500struct io_close {
     	/* "close" member of the io_kiocb command union; file first */
  501	struct file			*file;
  502	int				fd;
     	/* NOTE(review): presumably a fixed-file slot (cf. io_close_fixed()) */
  503	u32				file_slot;
  504};
  505
  506struct io_timeout_data {
     	/*
     	 * Async data for timeouts: io_op_defs uses sizeof() of this struct
     	 * as async_size for IORING_OP_TIMEOUT and IORING_OP_LINK_TIMEOUT.
     	 * Not a member of the io_kiocb union, hence no leading file pointer.
     	 */
     	/* back-pointer to the owning request */
  507	struct io_kiocb			*req;
  508	struct hrtimer			timer;
  509	struct timespec64		ts;
  510	enum hrtimer_mode		mode;
  511	u32				flags;
  512};
  513
  514struct io_accept {
     	/* "accept" member of the io_kiocb command union; file first */
  515	struct file			*file;
     	/* user-space pointers (__user): must not be dereferenced directly */
  516	struct sockaddr __user		*addr;
  517	int __user			*addr_len;
  518	int				flags;
  519	u32				file_slot;
  520	unsigned long			nofile;
  521};
  522
  523struct io_sync {
     	/*
     	 * "sync" member of the io_kiocb command union; file first.
     	 * NOTE(review): presumably shared by the fsync-like opcodes, which
     	 * would explain both "flags" and "mode" - confirm in the prep code.
     	 */
  524	struct file			*file;
  525	loff_t				len;
  526	loff_t				off;
  527	int				flags;
  528	int				mode;
  529};
  530
  531struct io_cancel {
     	/* "cancel" member of the io_kiocb command union; file first */
  532	struct file			*file;
     	/* NOTE(review): presumably the user_data of the request to cancel */
  533	u64				addr;
  534};
  535
  536struct io_timeout {
     	/* "timeout" member of the io_kiocb command union; file first */
  537	struct file			*file;
  538	u32				off;
  539	u32				target_seq;
     	/* NOTE(review): presumably links into ctx->timeout_list or ->ltimeout_list */
  540	struct list_head		list;
  541	/* head of the link, used by linked timeouts only */
  542	struct io_kiocb			*head;
  543	/* for linked completions */
  544	struct io_kiocb			*prev;
  545};
  546
  547struct io_timeout_rem {
     	/* "timeout_rem" member of the io_kiocb command union; file first */
  548	struct file			*file;
  549	u64				addr;
  550
  551	/* timeout update */
  552	struct timespec64		ts;
  553	u32				flags;
     	/* NOTE(review): presumably set when the target is a linked timeout */
  554	bool				ltimeout;
  555};
  556
  557struct io_rw {
     	/*
     	 * "rw" member of the io_kiocb command union. The leading file pointer
     	 * required of union members comes from the embedded kiocb.
     	 */
  558	/* NOTE: kiocb has the file as the first member, so don't do it here */
  559	struct kiocb			kiocb;
  560	u64				addr;
  561	u64				len;
  562};
  563
  564struct io_connect {
     	/* "connect" member of the io_kiocb command union; file first */
  565	struct file			*file;
     	/* user-space address and its length */
  566	struct sockaddr __user		*addr;
  567	int				addr_len;
  568};
  569
  570struct io_sr_msg {
     	/* "sr_msg" member of the io_kiocb command union; file first */
  571	struct file			*file;
     	/* one user pointer, typed as compat msghdr, native msghdr, or raw buffer */
  572	union {
  573		struct compat_msghdr __user	*umsg_compat;
  574		struct user_msghdr __user	*umsg;
  575		void __user			*buf;
  576	};
  577	int				msg_flags;
     	/* NOTE(review): presumably the buffer group id used with buffer selection */
  578	int				bgid;
  579	size_t				len;
  580};
  581
  582struct io_open {
     	/* "open" member of the io_kiocb command union; file first */
  583	struct file			*file;
  584	int				dfd;
  585	u32				file_slot;
  586	struct filename			*filename;
  587	struct open_how			how;
  588	unsigned long			nofile;
  589};
  590
  591struct io_rsrc_update {
     	/*
     	 * "rsrc_update" member of the io_kiocb command union; file first.
     	 * NOTE(review): arg is presumably a user address carried as u64,
     	 * with nr_args entries applied starting at offset - confirm.
     	 */
  592	struct file			*file;
  593	u64				arg;
  594	u32				nr_args;
  595	u32				offset;
  596};
  597
  598struct io_fadvise {
     	/* "fadvise" member of the io_kiocb command union; file first */
  599	struct file			*file;
  600	u64				offset;
  601	u32				len;
  602	u32				advice;
  603};
  604
  605struct io_madvise {
     	/* "madvise" member of the io_kiocb command union; file first */
  606	struct file			*file;
  607	u64				addr;
  608	u32				len;
  609	u32				advice;
  610};
  611
  612struct io_epoll {
     	/* "epoll" member of the io_kiocb command union; file first */
  613	struct file			*file;
  614	int				epfd;
  615	int				op;
  616	int				fd;
  617	struct epoll_event		event;
  618};
  619
  620struct io_splice {
     	/*
     	 * "splice" member of the io_kiocb command union. file_out is the
     	 * first member, so it occupies the position of the union's "file".
     	 */
  621	struct file			*file_out;
  622	struct file			*file_in;
  623	loff_t				off_out;
  624	loff_t				off_in;
  625	u64				len;
  626	unsigned int			flags;
  627};
  628
  629struct io_provide_buf {
     	/*
     	 * "pbuf" member of the io_kiocb command union; file first.
     	 * NOTE(review): presumably nbufs buffers of len bytes starting at
     	 * addr, in group bgid, with ids starting at bid - confirm.
     	 */
  630	struct file			*file;
  631	__u64				addr;
  632	__u32				len;
  633	__u32				bgid;
  634	__u16				nbufs;
  635	__u16				bid;
  636};
  637
  638struct io_statx {
     	/* "statx" member of the io_kiocb command union; file first */
  639	struct file			*file;
  640	int				dfd;
  641	unsigned int			mask;
  642	unsigned int			flags;
     	/* both are user-space pointers */
  643	const char __user		*filename;
  644	struct statx __user		*buffer;
  645};
  646
  647struct io_shutdown {
     	/* "shutdown" member of the io_kiocb command union; file first */
  648	struct file			*file;
  649	int				how;
  650};
  651
  652struct io_rename {
     	/* "rename" member of the io_kiocb command union; file first */
  653	struct file			*file;
  654	int				old_dfd;
  655	int				new_dfd;
  656	struct filename			*oldpath;
  657	struct filename			*newpath;
  658	int				flags;
  659};
  660
  661struct io_unlink {
     	/* "unlink" member of the io_kiocb command union; file first */
  662	struct file			*file;
  663	int				dfd;
  664	int				flags;
  665	struct filename			*filename;
  666};
  667
  668struct io_mkdir {
     	/* "mkdir" member of the io_kiocb command union; file first */
  669	struct file			*file;
  670	int				dfd;
  671	umode_t				mode;
  672	struct filename			*filename;
  673};
  674
  675struct io_symlink {
     	/* "symlink" member of the io_kiocb command union; file first */
  676	struct file			*file;
     	/* only the new path has a directory fd */
  677	int				new_dfd;
  678	struct filename			*oldpath;
  679	struct filename			*newpath;
  680};
  681
  682struct io_hardlink {
     	/* "hardlink" member of the io_kiocb command union; file first */
  683	struct file			*file;
  684	int				old_dfd;
  685	int				new_dfd;
  686	struct filename			*oldpath;
  687	struct filename			*newpath;
  688	int				flags;
  689};
  690
  691struct io_async_connect {
     	/* async data for IORING_OP_CONNECT (its async_size in io_op_defs) */
  692	struct sockaddr_storage		address;
  693};
  694
  695struct io_async_msghdr {
     	/*
     	 * async data for IORING_OP_SENDMSG and IORING_OP_RECVMSG (their
     	 * async_size in io_op_defs)
     	 */
     	/* inline iovec storage, UIO_FASTIOV entries */
  696	struct iovec			fast_iov[UIO_FASTIOV];
  697	/* points to an allocated iov, if NULL we use fast_iov instead */
  698	struct iovec			*free_iov;
  699	struct sockaddr __user		*uaddr;
  700	struct msghdr			msg;
  701	struct sockaddr_storage		addr;
  702};
  703
  704struct io_rw_state {
     	/*
     	 * Iterator, a saved iterator state, and inline iovec storage;
     	 * embedded as member "s" of struct io_async_rw.
     	 */
  705	struct iov_iter			iter;
  706	struct iov_iter_state		iter_state;
  707	struct iovec			fast_iov[UIO_FASTIOV];
  708};
  709
  710struct io_async_rw {
     	/*
     	 * async data for the read/write opcodes (async_size of READV, WRITEV,
     	 * READ_FIXED, WRITE_FIXED, READ and WRITE in io_op_defs)
     	 */
  711	struct io_rw_state		s;
     	/* NOTE(review): presumably a heap iovec to free, NULL when s.fast_iov is used */
  712	const struct iovec		*free_iovec;
  713	size_t				bytes_done;
  714	struct wait_page_queue		wpq;
  715};
  716
  717enum {
     	/*
     	 * Bit numbers for the REQ_F_* masks defined in the next enum. The
     	 * first six reuse the IOSQE_*_BIT values, so those request flags
     	 * share bit positions with the corresponding sqe flags.
     	 */
  718	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
  719	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
  720	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
  721	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
  722	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
  723	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
  724
  725	/* first byte is taken by user flags, shift it to not overlap */
  726	REQ_F_FAIL_BIT		= 8,
  727	REQ_F_INFLIGHT_BIT,
  728	REQ_F_CUR_POS_BIT,
  729	REQ_F_NOWAIT_BIT,
  730	REQ_F_LINK_TIMEOUT_BIT,
  731	REQ_F_NEED_CLEANUP_BIT,
  732	REQ_F_POLLED_BIT,
  733	REQ_F_BUFFER_SELECTED_BIT,
  734	REQ_F_COMPLETE_INLINE_BIT,
  735	REQ_F_REISSUE_BIT,
  736	REQ_F_CREDS_BIT,
  737	REQ_F_REFCOUNT_BIT,
  738	REQ_F_ARM_LTIMEOUT_BIT,
  739	REQ_F_ASYNC_DATA_BIT,
  740	/* keep async read/write and isreg together and in order */
  741	REQ_F_SUPPORT_NOWAIT_BIT,
  742	REQ_F_ISREG_BIT,
  743
  744	/* not a real bit, just to check we're not overflowing the space */
  745	__REQ_F_LAST_BIT,
  746};
  747
  748enum {
     	/*
     	 * Single-bit masks, each BIT() of the matching REQ_F_*_BIT number.
     	 * The enumerators here are listed in a different order than the bit
     	 * numbers; the values come from the _BIT enum, not from position.
     	 */
  749	/* ctx owns file */
  750	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
  751	/* drain existing IO first */
  752	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
  753	/* linked sqes */
  754	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
  755	/* doesn't sever on completion < 0 */
  756	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
  757	/* IOSQE_ASYNC */
  758	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
  759	/* IOSQE_BUFFER_SELECT */
  760	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
  761
  762	/* fail rest of links */
  763	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
  764	/* on inflight list, should be cancelled and waited on exit reliably */
  765	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
  766	/* read/write uses file position */
  767	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
  768	/* must not punt to workers */
  769	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
  770	/* has or had linked timeout */
  771	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
  772	/* needs cleanup */
  773	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
  774	/* already went through poll handler */
  775	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
  776	/* buffer already selected */
  777	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
  778	/* completion is deferred through io_comp_state */
     	/*
     	 * NOTE(review): no io_comp_state is defined in the visible part of
     	 * this file; the name may be stale - confirm and update.
     	 */
  779	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),
  780	/* caller should reissue async */
  781	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
  782	/* supports async reads/writes */
  783	REQ_F_SUPPORT_NOWAIT	= BIT(REQ_F_SUPPORT_NOWAIT_BIT),
  784	/* regular file */
  785	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
  786	/* has creds assigned */
  787	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
  788	/* skip refcounting if not set */
  789	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
  790	/* there is a linked timeout that has to be armed */
  791	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
  792	/* ->async_data allocated */
  793	REQ_F_ASYNC_DATA	= BIT(REQ_F_ASYNC_DATA_BIT),
  794};
  795
  796struct async_poll {
     	/*
     	 * Internal poll state reached through io_kiocb.apoll: one embedded
     	 * poll entry plus a pointer to a second one.
     	 * NOTE(review): when double_poll is allocated is not visible here.
     	 */
  797	struct io_poll_iocb	poll;
  798	struct io_poll_iocb	*double_poll;
  799};
  800
  801typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
     /* ^ task-work callback pointer type; "locked" is passed by pointer (in/out semantics: TODO confirm) */
  802
  803struct io_task_work {
     	/* Queue linkage plus the callback to run; embedded in io_kiocb. */
     	/* two alternative linkages sharing storage: work list or fallback llist */
  804	union {
  805		struct io_wq_work_node	node;
  806		struct llist_node	fallback_node;
  807	};
  808	io_req_tw_func_t		func;
  809};
  810
  811enum {
     	/*
     	 * Resource kinds.
     	 * NOTE(review): presumably the values accepted as the "type"
     	 * argument of __io_register_rsrc_update() - confirm.
     	 */
  812	IORING_RSRC_FILE		= 0,
  813	IORING_RSRC_BUFFER		= 1,
  814};
  815
  816/*
  817 * NOTE! Each of the iocb union members has the file pointer
  818 * as the first entry in their struct definition. So you can
  819 * access the file pointer through any of the sub-structs,
  820 * or directly as just 'file' in this struct.
  821 */
  822struct io_kiocb {
     	/* per-opcode command data; every member starts with the file pointer */
  823	union {
  824		struct file		*file;
  825		struct io_rw		rw;
  826		struct io_poll_iocb	poll;
  827		struct io_poll_update	poll_update;
  828		struct io_accept	accept;
  829		struct io_sync		sync;
  830		struct io_cancel	cancel;
  831		struct io_timeout	timeout;
  832		struct io_timeout_rem	timeout_rem;
  833		struct io_connect	connect;
  834		struct io_sr_msg	sr_msg;
  835		struct io_open		open;
  836		struct io_close		close;
  837		struct io_rsrc_update	rsrc_update;
  838		struct io_fadvise	fadvise;
  839		struct io_madvise	madvise;
  840		struct io_epoll		epoll;
  841		struct io_splice	splice;
  842		struct io_provide_buf	pbuf;
  843		struct io_statx		statx;
  844		struct io_shutdown	shutdown;
  845		struct io_rename	rename;
  846		struct io_unlink	unlink;
  847		struct io_mkdir		mkdir;
  848		struct io_symlink	symlink;
  849		struct io_hardlink	hardlink;
  850	};
  851
     	/* IORING_OP_* value; NOTE(review): presumably the index into io_op_defs[] */
  852	u8				opcode;
  853	/* polled IO has completed */
  854	u8				iopoll_completed;
  855	u16				buf_index;
     	/* REQ_F_* mask bits */
  856	unsigned int			flags;
  857
  858	u64				user_data;
  859	u32				result;
  860	u32				cflags;
  861
  862	struct io_ring_ctx		*ctx;
  863	struct task_struct		*task;
  864
  865	struct percpu_ref		*fixed_rsrc_refs;
  866	/* store used ubuf, so we can prevent reloading */
  867	struct io_mapped_ubuf		*imu;
  868
  869	/* used by request caches, completion batching and iopoll */
  870	struct io_wq_work_node		comp_list;
     	/* request refcount; per REQ_F_REFCOUNT, refcounting is skipped unless that flag is set */
  871	atomic_t			refs;
  872	struct io_kiocb			*link;
  873	struct io_task_work		io_task_work;
  874	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
  875	struct hlist_node		hash_node;
  876	/* internal polling, see IORING_FEAT_FAST_POLL */
  877	struct async_poll		*apoll;
  878	/* opcode allocated if it needs to store data for async defer */
  879	void				*async_data;
  880	struct io_wq_work		work;
  881	/* custom credentials, valid IFF REQ_F_CREDS is set */
  882	const struct cred		*creds;
  883	/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
  884	struct io_buffer		*kbuf;
  885};
  886
  887struct io_tctx_node {
     	/*
     	 * Associates one task with one ring context.
     	 * NOTE(review): ctx_node presumably links into io_ring_ctx.tctx_list
     	 * - confirm.
     	 */
  888	struct list_head	ctx_node;
  889	struct task_struct	*task;
  890	struct io_ring_ctx	*ctx;
  891};
  892
  893struct io_defer_entry {
     	/*
     	 * A request together with a sequence number and list linkage.
     	 * NOTE(review): presumably an entry of io_ring_ctx.defer_list -
     	 * confirm.
     	 */
  894	struct list_head	list;
  895	struct io_kiocb		*req;
  896	u32			seq;
  897};
  898
  899struct io_op_def {
     	/*
     	 * Static properties of one opcode: a set of one-bit flags and the
     	 * size of its async data. Element type of io_op_defs[].
     	 */
  900	/* needs req->file assigned */
  901	unsigned		needs_file : 1;
  902	/* should block plug */
  903	unsigned		plug : 1;
  904	/* hash wq insertion if file is a regular file */
  905	unsigned		hash_reg_file : 1;
  906	/* unbound wq insertion if file is a non-regular file */
  907	unsigned		unbound_nonreg_file : 1;
  908	/* set if opcode supports polled "wait" */
  909	unsigned		pollin : 1;
  910	unsigned		pollout : 1;
  911	/* op supports buffer selection */
  912	unsigned		buffer_select : 1;
  913	/* do prep async if is going to be punted */
  914	unsigned		needs_async_setup : 1;
  915	/* opcode is not supported by this kernel */
  916	unsigned		not_supported : 1;
  917	/* skip auditing */
  918	unsigned		audit_skip : 1;
  919	/* size of async data needed, if any */
  920	unsigned short		async_size;
  921};
  922
  923static const struct io_op_def io_op_defs[] = {
     	/*
     	 * One entry per opcode, placed by designated initializer at index
     	 * IORING_OP_*. Members not named in an entry are zero, so "= {}"
     	 * means no flag set and no async data.
     	 */
  924	[IORING_OP_NOP] = {},
  925	[IORING_OP_READV] = {
  926		.needs_file		= 1,
  927		.unbound_nonreg_file	= 1,
  928		.pollin			= 1,
  929		.buffer_select		= 1,
  930		.needs_async_setup	= 1,
  931		.plug			= 1,
  932		.audit_skip		= 1,
  933		.async_size		= sizeof(struct io_async_rw),
  934	},
  935	[IORING_OP_WRITEV] = {
  936		.needs_file		= 1,
  937		.hash_reg_file		= 1,
  938		.unbound_nonreg_file	= 1,
  939		.pollout		= 1,
  940		.needs_async_setup	= 1,
  941		.plug			= 1,
  942		.audit_skip		= 1,
  943		.async_size		= sizeof(struct io_async_rw),
  944	},
  945	[IORING_OP_FSYNC] = {
  946		.needs_file		= 1,
  947		.audit_skip		= 1,
  948	},
  949	[IORING_OP_READ_FIXED] = {
  950		.needs_file		= 1,
  951		.unbound_nonreg_file	= 1,
  952		.pollin			= 1,
  953		.plug			= 1,
  954		.audit_skip		= 1,
  955		.async_size		= sizeof(struct io_async_rw),
  956	},
  957	[IORING_OP_WRITE_FIXED] = {
  958		.needs_file		= 1,
  959		.hash_reg_file		= 1,
  960		.unbound_nonreg_file	= 1,
  961		.pollout		= 1,
  962		.plug			= 1,
  963		.audit_skip		= 1,
  964		.async_size		= sizeof(struct io_async_rw),
  965	},
  966	[IORING_OP_POLL_ADD] = {
  967		.needs_file		= 1,
  968		.unbound_nonreg_file	= 1,
  969		.audit_skip		= 1,
  970	},
  971	[IORING_OP_POLL_REMOVE] = {
  972		.audit_skip		= 1,
  973	},
  974	[IORING_OP_SYNC_FILE_RANGE] = {
  975		.needs_file		= 1,
  976		.audit_skip		= 1,
  977	},
  978	[IORING_OP_SENDMSG] = {
  979		.needs_file		= 1,
  980		.unbound_nonreg_file	= 1,
  981		.pollout		= 1,
  982		.needs_async_setup	= 1,
  983		.async_size		= sizeof(struct io_async_msghdr),
  984	},
  985	[IORING_OP_RECVMSG] = {
  986		.needs_file		= 1,
  987		.unbound_nonreg_file	= 1,
  988		.pollin			= 1,
  989		.buffer_select		= 1,
  990		.needs_async_setup	= 1,
  991		.async_size		= sizeof(struct io_async_msghdr),
  992	},
  993	[IORING_OP_TIMEOUT] = {
  994		.audit_skip		= 1,
  995		.async_size		= sizeof(struct io_timeout_data),
  996	},
  997	[IORING_OP_TIMEOUT_REMOVE] = {
  998		/* used by timeout updates' prep() */
  999		.audit_skip		= 1,
 1000	},
 1001	[IORING_OP_ACCEPT] = {
 1002		.needs_file		= 1,
 1003		.unbound_nonreg_file	= 1,
 1004		.pollin			= 1,
 1005	},
 1006	[IORING_OP_ASYNC_CANCEL] = {
 1007		.audit_skip		= 1,
 1008	},
 1009	[IORING_OP_LINK_TIMEOUT] = {
 1010		.audit_skip		= 1,
 1011		.async_size		= sizeof(struct io_timeout_data),
 1012	},
 1013	[IORING_OP_CONNECT] = {
 1014		.needs_file		= 1,
 1015		.unbound_nonreg_file	= 1,
 1016		.pollout		= 1,
 1017		.needs_async_setup	= 1,
 1018		.async_size		= sizeof(struct io_async_connect),
 1019	},
 1020	[IORING_OP_FALLOCATE] = {
 1021		.needs_file		= 1,
 1022	},
 1023	[IORING_OP_OPENAT] = {},
 1024	[IORING_OP_CLOSE] = {},
 1025	[IORING_OP_FILES_UPDATE] = {
 1026		.audit_skip		= 1,
 1027	},
 1028	[IORING_OP_STATX] = {
 1029		.audit_skip		= 1,
 1030	},
 1031	[IORING_OP_READ] = {
 1032		.needs_file		= 1,
 1033		.unbound_nonreg_file	= 1,
 1034		.pollin			= 1,
 1035		.buffer_select		= 1,
 1036		.plug			= 1,
 1037		.audit_skip		= 1,
 1038		.async_size		= sizeof(struct io_async_rw),
 1039	},
 1040	[IORING_OP_WRITE] = {
 1041		.needs_file		= 1,
 1042		.hash_reg_file		= 1,
 1043		.unbound_nonreg_file	= 1,
 1044		.pollout		= 1,
 1045		.plug			= 1,
 1046		.audit_skip		= 1,
 1047		.async_size		= sizeof(struct io_async_rw),
 1048	},
 1049	[IORING_OP_FADVISE] = {
 1050		.needs_file		= 1,
 1051		.audit_skip		= 1,
 1052	},
 1053	[IORING_OP_MADVISE] = {},
 1054	[IORING_OP_SEND] = {
 1055		.needs_file		= 1,
 1056		.unbound_nonreg_file	= 1,
 1057		.pollout		= 1,
 1058		.audit_skip		= 1,
 1059	},
 1060	[IORING_OP_RECV] = {
 1061		.needs_file		= 1,
 1062		.unbound_nonreg_file	= 1,
 1063		.pollin			= 1,
 1064		.buffer_select		= 1,
 1065		.audit_skip		= 1,
 1066	},
 1067	[IORING_OP_OPENAT2] = {
 1068	},
 1069	[IORING_OP_EPOLL_CTL] = {
 1070		.unbound_nonreg_file	= 1,
 1071		.audit_skip		= 1,
 1072	},
 1073	[IORING_OP_SPLICE] = {
 1074		.needs_file		= 1,
 1075		.hash_reg_file		= 1,
 1076		.unbound_nonreg_file	= 1,
 1077		.audit_skip		= 1,
 1078	},
 1079	[IORING_OP_PROVIDE_BUFFERS] = {
 1080		.audit_skip		= 1,
 1081	},
 1082	[IORING_OP_REMOVE_BUFFERS] = {
 1083		.audit_skip		= 1,
 1084	},
 1085	[IORING_OP_TEE] = {
 1086		.needs_file		= 1,
 1087		.hash_reg_file		= 1,
 1088		.unbound_nonreg_file	= 1,
 1089		.audit_skip		= 1,
 1090	},
 1091	[IORING_OP_SHUTDOWN] = {
 1092		.needs_file		= 1,
 1093	},
 1094	[IORING_OP_RENAMEAT] = {},
 1095	[IORING_OP_UNLINKAT] = {},
 1096	[IORING_OP_MKDIRAT] = {},
 1097	[IORING_OP_SYMLINKAT] = {},
 1098	[IORING_OP_LINKAT] = {},
 1099};
 1100
 1101/* requests with any of those set should undergo io_disarm_next() */
 1102#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
 1103
 1104static bool io_disarm_next(struct io_kiocb *req);
 1105static void io_uring_del_tctx_node(unsigned long index);
 1106static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 1107					 struct task_struct *task,
 1108					 bool cancel_all);
 1109static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 1110
 1111static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
 1112				 s32 res, u32 cflags);
 1113static void io_put_req(struct io_kiocb *req);
 1114static void io_put_req_deferred(struct io_kiocb *req);
 1115static void io_dismantle_req(struct io_kiocb *req);
 1116static void io_queue_linked_timeout(struct io_kiocb *req);
 1117static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
 1118				     struct io_uring_rsrc_update2 *up,
 1119				     unsigned nr_args);
 1120static void io_clean_op(struct io_kiocb *req);
 1121static struct file *io_file_get(struct io_ring_ctx *ctx,
 1122				struct io_kiocb *req, int fd, bool fixed);
 1123static void __io_queue_sqe(struct io_kiocb *req);
 1124static void io_rsrc_put_work(struct work_struct *work);
 1125
 1126static void io_req_task_queue(struct io_kiocb *req);
 1127static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
 1128static int io_req_prep_async(struct io_kiocb *req);
 1129
 1130static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
 1131				 unsigned int issue_flags, u32 slot_index);
 1132static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
 1133
 1134static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
 1135
 1136static struct kmem_cache *req_cachep;
 1137
 1138static const struct file_operations io_uring_fops;
 1139
 1140struct sock *io_uring_get_socket(struct file *file)
 1141{
 1142#if defined(CONFIG_UNIX)
 1143	if (file->f_op == &io_uring_fops) {
 1144		struct io_ring_ctx *ctx = file->private_data;
 1145
 1146		return ctx->ring_sock->sk;
 1147	}
 1148#endif
 1149	return NULL;
 1150}
 1151EXPORT_SYMBOL(io_uring_get_socket);
 1152
 1153static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
 1154{
 1155	if (!*locked) {
 1156		mutex_lock(&ctx->uring_lock);
 1157		*locked = true;
 1158	}
 1159}
 1160
 1161#define io_for_each_link(pos, head) \
 1162	for (pos = (head); pos; pos = pos->link)
 1163
 1164/*
 1165 * Shamelessly stolen from the mm implementation of page reference checking,
 1166 * see commit f958d7b528b1 for details.
 1167 */
 1168#define req_ref_zero_or_close_to_overflow(req)	\
 1169	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
 1170
 1171static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
 1172{
 1173	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
 1174	return atomic_inc_not_zero(&req->refs);
 1175}
 1176
 1177static inline bool req_ref_put_and_test(struct io_kiocb *req)
 1178{
 1179	if (likely(!(req->flags & REQ_F_REFCOUNT)))
 1180		return true;
 1181
 1182	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
 1183	return atomic_dec_and_test(&req->refs);
 1184}
 1185
 1186static inline void req_ref_put(struct io_kiocb *req)
 1187{
 1188	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
 1189	WARN_ON_ONCE(req_ref_put_and_test(req));
 1190}
 1191
 1192static inline void req_ref_get(struct io_kiocb *req)
 1193{
 1194	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
 1195	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
 1196	atomic_inc(&req->refs);
 1197}
 1198
 1199static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
 1200{
 1201	if (!wq_list_empty(&ctx->submit_state.compl_reqs))
 1202		__io_submit_flush_completions(ctx);
 1203}
 1204
 1205static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
 1206{
 1207	if (!(req->flags & REQ_F_REFCOUNT)) {
 1208		req->flags |= REQ_F_REFCOUNT;
 1209		atomic_set(&req->refs, nr);
 1210	}
 1211}
 1212
 1213static inline void io_req_set_refcount(struct io_kiocb *req)
 1214{
 1215	__io_req_set_refcount(req, 1);
 1216}
 1217
 1218#define IO_RSRC_REF_BATCH	100
 1219
 1220static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
 1221					  struct io_ring_ctx *ctx)
 1222	__must_hold(&ctx->uring_lock)
 1223{
 1224	struct percpu_ref *ref = req->fixed_rsrc_refs;
 1225
 1226	if (ref) {
 1227		if (ref == &ctx->rsrc_node->refs)
 1228			ctx->rsrc_cached_refs++;
 1229		else
 1230			percpu_ref_put(ref);
 1231	}
 1232}
 1233
 1234static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
 1235{
 1236	if (req->fixed_rsrc_refs)
 1237		percpu_ref_put(req->fixed_rsrc_refs);
 1238}
 1239
 1240static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
 1241	__must_hold(&ctx->uring_lock)
 1242{
 1243	if (ctx->rsrc_cached_refs) {
 1244		percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
 1245		ctx->rsrc_cached_refs = 0;
 1246	}
 1247}
 1248
 1249static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
 1250	__must_hold(&ctx->uring_lock)
 1251{
 1252	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
 1253	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
 1254}
 1255
 1256static inline void io_req_set_rsrc_node(struct io_kiocb *req,
 1257					struct io_ring_ctx *ctx)
 1258{
 1259	if (!req->fixed_rsrc_refs) {
 1260		req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
 1261		ctx->rsrc_cached_refs--;
 1262		if (unlikely(ctx->rsrc_cached_refs < 0))
 1263			io_rsrc_refs_refill(ctx);
 1264	}
 1265}
 1266
 1267static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
 1268{
 1269	bool got = percpu_ref_tryget(ref);
 1270
 1271	/* already at zero, wait for ->release() */
 1272	if (!got)
 1273		wait_for_completion(compl);
 1274	percpu_ref_resurrect(ref);
 1275	if (got)
 1276		percpu_ref_put(ref);
 1277}
 1278
 1279static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
 1280			  bool cancel_all)
 1281{
 1282	struct io_kiocb *req;
 1283
 1284	if (task && head->task != task)
 1285		return false;
 1286	if (cancel_all)
 1287		return true;
 1288
 1289	io_for_each_link(req, head) {
 1290		if (req->flags & REQ_F_INFLIGHT)
 1291			return true;
 1292	}
 1293	return false;
 1294}
 1295
 1296static inline bool req_has_async_data(struct io_kiocb *req)
 1297{
 1298	return req->flags & REQ_F_ASYNC_DATA;
 1299}
 1300
 1301static inline void req_set_fail(struct io_kiocb *req)
 1302{
 1303	req->flags |= REQ_F_FAIL;
 1304}
 1305
 1306static inline void req_fail_link_node(struct io_kiocb *req, int res)
 1307{
 1308	req_set_fail(req);
 1309	req->result = res;
 1310}
 1311
 1312static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
 1313{
 1314	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 1315
 1316	complete(&ctx->ref_comp);
 1317}
 1318
 1319static inline bool io_is_timeout_noseq(struct io_kiocb *req)
 1320{
 1321	return !req->timeout.off;
 1322}
 1323
 1324static __cold void io_fallback_req_func(struct work_struct *work)
 1325{
 1326	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
 1327						fallback_work.work);
 1328	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
 1329	struct io_kiocb *req, *tmp;
 1330	bool locked = false;
 1331
 1332	percpu_ref_get(&ctx->refs);
 1333	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
 1334		req->io_task_work.func(req, &locked);
 1335
 1336	if (locked) {
 1337		io_submit_flush_completions(ctx);
 1338		mutex_unlock(&ctx->uring_lock);
 1339	}
 1340	percpu_ref_put(&ctx->refs);
 1341}
 1342
 1343static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 1344{
 1345	struct io_ring_ctx *ctx;
 1346	int hash_bits;
 1347
 1348	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 1349	if (!ctx)
 1350		return NULL;
 1351
 1352	/*
 1353	 * Use 5 bits less than the max cq entries, that should give us around
 1354	 * 32 entries per hash list if totally full and uniformly spread.
 1355	 */
 1356	hash_bits = ilog2(p->cq_entries);
 1357	hash_bits -= 5;
 1358	if (hash_bits <= 0)
 1359		hash_bits = 1;
 1360	ctx->cancel_hash_bits = hash_bits;
 1361	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
 1362					GFP_KERNEL);
 1363	if (!ctx->cancel_hash)
 1364		goto err;
 1365	__hash_init(ctx->cancel_hash, 1U << hash_bits);
 1366
 1367	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
 1368	if (!ctx->dummy_ubuf)
 1369		goto err;
 1370	/* set invalid range, so io_import_fixed() fails meeting it */
 1371	ctx->dummy_ubuf->ubuf = -1UL;
 1372
 1373	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 1374			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 1375		goto err;
 1376
 1377	ctx->flags = p->flags;
 1378	init_waitqueue_head(&ctx->sqo_sq_wait);
 1379	INIT_LIST_HEAD(&ctx->sqd_list);
 1380	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 1381	init_completion(&ctx->ref_comp);
 1382	xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
 1383	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 1384	mutex_init(&ctx->uring_lock);
 1385	init_waitqueue_head(&ctx->cq_wait);
 1386	spin_lock_init(&ctx->completion_lock);
 1387	spin_lock_init(&ctx->timeout_lock);
 1388	INIT_WQ_LIST(&ctx->iopoll_list);
 1389	INIT_LIST_HEAD(&ctx->defer_list);
 1390	INIT_LIST_HEAD(&ctx->timeout_list);
 1391	INIT_LIST_HEAD(&ctx->ltimeout_list);
 1392	spin_lock_init(&ctx->rsrc_ref_lock);
 1393	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 1394	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
 1395	init_llist_head(&ctx->rsrc_put_llist);
 1396	INIT_LIST_HEAD(&ctx->tctx_list);
 1397	ctx->submit_state.free_list.next = NULL;
 1398	INIT_WQ_LIST(&ctx->locked_free_list);
 1399	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
 1400	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
 1401	return ctx;
 1402err:
 1403	kfree(ctx->dummy_ubuf);
 1404	kfree(ctx->cancel_hash);
 1405	kfree(ctx);
 1406	return NULL;
 1407}
 1408
 1409static void io_account_cq_overflow(struct io_ring_ctx *ctx)
 1410{
 1411	struct io_rings *r = ctx->rings;
 1412
 1413	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
 1414	ctx->cq_extra--;
 1415}
 1416
 1417static bool req_need_defer(struct io_kiocb *req, u32 seq)
 1418{
 1419	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
 1420		struct io_ring_ctx *ctx = req->ctx;
 1421
 1422		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
 1423	}
 1424
 1425	return false;
 1426}
 1427
 1428#define FFS_NOWAIT		0x1UL
 1429#define FFS_ISREG		0x2UL
 1430#define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG)
 1431
 1432static inline bool io_req_ffs_set(struct io_kiocb *req)
 1433{
 1434	return req->flags & REQ_F_FIXED_FILE;
 1435}
 1436
 1437static inline void io_req_track_inflight(struct io_kiocb *req)
 1438{
 1439	if (!(req->flags & REQ_F_INFLIGHT)) {
 1440		req->flags |= REQ_F_INFLIGHT;
 1441		atomic_inc(&current->io_uring->inflight_tracked);
 1442	}
 1443}
 1444
 1445static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
 1446{
 1447	if (WARN_ON_ONCE(!req->link))
 1448		return NULL;
 1449
 1450	req->flags &= ~REQ_F_ARM_LTIMEOUT;
 1451	req->flags |= REQ_F_LINK_TIMEOUT;
 1452
 1453	/* linked timeouts should have two refs once prep'ed */
 1454	io_req_set_refcount(req);
 1455	__io_req_set_refcount(req->link, 2);
 1456	return req->link;
 1457}
 1458
 1459static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 1460{
 1461	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
 1462		return NULL;
 1463	return __io_prep_linked_timeout(req);
 1464}
 1465
 1466static void io_prep_async_work(struct io_kiocb *req)
 1467{
 1468	const struct io_op_def *def = &io_op_defs[req->opcode];
 1469	struct io_ring_ctx *ctx = req->ctx;
 1470
 1471	if (!(req->flags & REQ_F_CREDS)) {
 1472		req->flags |= REQ_F_CREDS;
 1473		req->creds = get_current_cred();
 1474	}
 1475
 1476	req->work.list.next = NULL;
 1477	req->work.flags = 0;
 1478	if (req->flags & REQ_F_FORCE_ASYNC)
 1479		req->work.flags |= IO_WQ_WORK_CONCURRENT;
 1480
 1481	if (req->flags & REQ_F_ISREG) {
 1482		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
 1483			io_wq_hash_work(&req->work, file_inode(req->file));
 1484	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
 1485		if (def->unbound_nonreg_file)
 1486			req->work.flags |= IO_WQ_WORK_UNBOUND;
 1487	}
 1488
 1489	switch (req->opcode) {
 1490	case IORING_OP_SPLICE:
 1491	case IORING_OP_TEE:
 1492		if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
 1493			req->work.flags |= IO_WQ_WORK_UNBOUND;
 1494		break;
 1495	}
 1496}
 1497
 1498static void io_prep_async_link(struct io_kiocb *req)
 1499{
 1500	struct io_kiocb *cur;
 1501
 1502	if (req->flags & REQ_F_LINK_TIMEOUT) {
 1503		struct io_ring_ctx *ctx = req->ctx;
 1504
 1505		spin_lock(&ctx->completion_lock);
 1506		io_for_each_link(cur, req)
 1507			io_prep_async_work(cur);
 1508		spin_unlock(&ctx->completion_lock);
 1509	} else {
 1510		io_for_each_link(cur, req)
 1511			io_prep_async_work(cur);
 1512	}
 1513}
 1514
 1515static inline void io_req_add_compl_list(struct io_kiocb *req)
 1516{
 1517	struct io_submit_state *state = &req->ctx->submit_state;
 1518
 1519	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
 1520}
 1521
 1522static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
 1523{
 1524	struct io_ring_ctx *ctx = req->ctx;
 1525	struct io_kiocb *link = io_prep_linked_timeout(req);
 1526	struct io_uring_task *tctx = req->task->io_uring;
 1527
 1528	BUG_ON(!tctx);
 1529	BUG_ON(!tctx->io_wq);
 1530
 1531	/* init ->work of the whole link before punting */
 1532	io_prep_async_link(req);
 1533
 1534	/*
 1535	 * Not expected to happen, but if we do have a bug where this _can_
 1536	 * happen, catch it here and ensure the request is marked as
 1537	 * canceled. That will make io-wq go through the usual work cancel
 1538	 * procedure rather than attempt to run this request (or create a new
 1539	 * worker for it).
 1540	 */
 1541	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
 1542		req->work.flags |= IO_WQ_WORK_CANCEL;
 1543
 1544	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
 1545					&req->work, req->flags);
 1546	io_wq_enqueue(tctx->io_wq, &req->work);
 1547	if (link)
 1548		io_queue_linked_timeout(link);
 1549}
 1550
 1551static void io_kill_timeout(struct io_kiocb *req, int status)
 1552	__must_hold(&req->ctx->completion_lock)
 1553	__must_hold(&req->ctx->timeout_lock)
 1554{
 1555	struct io_timeout_data *io = req->async_data;
 1556
 1557	if (hrtimer_try_to_cancel(&io->timer) != -1) {
 1558		if (status)
 1559			req_set_fail(req);
 1560		atomic_set(&req->ctx->cq_timeouts,
 1561			atomic_read(&req->ctx->cq_timeouts) + 1);
 1562		list_del_init(&req->timeout.list);
 1563		io_cqring_fill_event(req->ctx, req->user_data, status, 0);
 1564		io_put_req_deferred(req);
 1565	}
 1566}
 1567
 1568static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 1569{
 1570	while (!list_empty(&ctx->defer_list)) {
 1571		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
 1572						struct io_defer_entry, list);
 1573
 1574		if (req_need_defer(de->req, de->seq))
 1575			break;
 1576		list_del_init(&de->list);
 1577		io_req_task_queue(de->req);
 1578		kfree(de);
 1579	}
 1580}
 1581
 1582static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
 1583	__must_hold(&ctx->completion_lock)
 1584{
 1585	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
 1586
 1587	spin_lock_irq(&ctx->timeout_lock);
 1588	while (!list_empty(&ctx->timeout_list)) {
 1589		u32 events_needed, events_got;
 1590		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
 1591						struct io_kiocb, timeout.list);
 1592
 1593		if (io_is_timeout_noseq(req))
 1594			break;
 1595
 1596		/*
 1597		 * Since seq can easily wrap around over time, subtract
 1598		 * the last seq at which timeouts were flushed before comparing.
 1599		 * Assuming not more than 2^31-1 events have happened since,
 1600		 * these subtractions won't have wrapped, so we can check if
 1601		 * target is in [last_seq, current_seq] by comparing the two.
 1602		 */
 1603		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
 1604		events_got = seq - ctx->cq_last_tm_flush;
 1605		if (events_got < events_needed)
 1606			break;
 1607
 1608		list_del_init(&req->timeout.list);
 1609		io_kill_timeout(req, 0);
 1610	}
 1611	ctx->cq_last_tm_flush = seq;
 1612	spin_unlock_irq(&ctx->timeout_lock);
 1613}
 1614
 1615static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 1616{
 1617	if (ctx->off_timeout_used)
 1618		io_flush_timeouts(ctx);
 1619	if (ctx->drain_active)
 1620		io_queue_deferred(ctx);
 1621}
 1622
 1623static inline void io_commit_cqring(struct io_ring_ctx *ctx)
 1624{
 1625	if (unlikely(ctx->off_timeout_used || ctx->drain_active))
 1626		__io_commit_cqring_flush(ctx);
 1627	/* order cqe stores with ring update */
 1628	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
 1629}
 1630
 1631static inline bool io_sqring_full(struct io_ring_ctx *ctx)
 1632{
 1633	struct io_rings *r = ctx->rings;
 1634
 1635	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
 1636}
 1637
 1638static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 1639{
 1640	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
 1641}
 1642
 1643static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 1644{
 1645	struct io_rings *rings = ctx->rings;
 1646	unsigned tail, mask = ctx->cq_entries - 1;
 1647
 1648	/*
 1649	 * writes to the cq entry need to come after reading head; the
 1650	 * control dependency is enough as we're using WRITE_ONCE to
 1651	 * fill the cq entry
 1652	 */
 1653	if (__io_cqring_events(ctx) == ctx->cq_entries)
 1654		return NULL;
 1655
 1656	tail = ctx->cached_cq_tail++;
 1657	return &rings->cqes[tail & mask];
 1658}
 1659
 1660static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
 1661{
 1662	if (likely(!ctx->cq_ev_fd))
 1663		return false;
 1664	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
 1665		return false;
 1666	return !ctx->eventfd_async || io_wq_current_is_worker();
 1667}
 1668
 1669/*
 1670 * This should only get called when at least one event has been posted.
 1671 * Some applications rely on the eventfd notification count only changing
 1672 * IFF a new CQE has been added to the CQ ring. There's no depedency on
 1673 * 1:1 relationship between how many times this function is called (and
 1674 * hence the eventfd count) and number of CQEs posted to the CQ ring.
 1675 */
 1676static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 1677{
 1678	/*
 1679	 * wake_up_all() may seem excessive, but io_wake_function() and
 1680	 * io_should_wake() handle the termination of the loop and only
 1681	 * wake as many waiters as we need to.
 1682	 */
 1683	if (wq_has_sleeper(&ctx->cq_wait))
 1684		wake_up_all(&ctx->cq_wait);
 1685	if (io_should_trigger_evfd(ctx))
 1686		eventfd_signal(ctx->cq_ev_fd, 1);
 1687}
 1688
 1689static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
 1690{
 1691	/* see waitqueue_active() comment */
 1692	smp_mb();
 1693
 1694	if (ctx->flags & IORING_SETUP_SQPOLL) {
 1695		if (waitqueue_active(&ctx->cq_wait))
 1696			wake_up_all(&ctx->cq_wait);
 1697	}
 1698	if (io_should_trigger_evfd(ctx))
 1699		eventfd_signal(ctx->cq_ev_fd, 1);
 1700}
 1701
 1702/* Returns true if there are no backlogged entries after the flush */
 1703static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 1704{
 1705	bool all_flushed, posted;
 1706
 1707	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
 1708		return false;
 1709
 1710	posted = false;
 1711	spin_lock(&ctx->completion_lock);
 1712	while (!list_empty(&ctx->cq_overflow_list)) {
 1713		struct io_uring_cqe *cqe = io_get_cqe(ctx);
 1714		struct io_overflow_cqe *ocqe;
 1715
 1716		if (!cqe && !force)
 1717			break;
 1718		ocqe = list_first_entry(&ctx->cq_overflow_list,
 1719					struct io_overflow_cqe, list);
 1720		if (cqe)
 1721			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
 1722		else
 1723			io_account_cq_overflow(ctx);
 1724
 1725		posted = true;
 1726		list_del(&ocqe->list);
 1727		kfree(ocqe);
 1728	}
 1729
 1730	all_flushed = list_empty(&ctx->cq_overflow_list);
 1731	if (all_flushed) {
 1732		clear_bit(0, &ctx->check_cq_overflow);
 1733		WRITE_ONCE(ctx->rings->sq_flags,
 1734			   ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
 1735	}
 1736
 1737	if (posted)
 1738		io_commit_cqring(ctx);
 1739	spin_unlock(&ctx->completion_lock);
 1740	if (posted)
 1741		io_cqring_ev_posted(ctx);
 1742	return all_flushed;
 1743}
 1744
 1745static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 1746{
 1747	bool ret = true;
 1748
 1749	if (test_bit(0, &ctx->check_cq_overflow)) {
 1750		/* iopoll syncs against uring_lock, not completion_lock */
 1751		if (ctx->flags & IORING_SETUP_IOPOLL)
 1752			mutex_lock(&ctx->uring_lock);
 1753		ret = __io_cqring_overflow_flush(ctx, false);
 1754		if (ctx->flags & IORING_SETUP_IOPOLL)
 1755			mutex_unlock(&ctx->uring_lock);
 1756	}
 1757
 1758	return ret;
 1759}
 1760
 1761/* must to be called somewhat shortly after putting a request */
 1762static inline void io_put_task(struct task_struct *task, int nr)
 1763{
 1764	struct io_uring_task *tctx = task->io_uring;
 1765
 1766	if (likely(task == current)) {
 1767		tctx->cached_refs += nr;
 1768	} else {
 1769		percpu_counter_sub(&tctx->inflight, nr);
 1770		if (unlikely(atomic_read(&tctx->in_idle)))
 1771			wake_up(&tctx->wait);
 1772		put_task_struct_many(task, nr);
 1773	}
 1774}
 1775
 1776static void io_task_refs_refill(struct io_uring_task *tctx)
 1777{
 1778	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
 1779
 1780	percpu_counter_add(&tctx->inflight, refill);
 1781	refcount_add(refill, &current->usage);
 1782	tctx->cached_refs += refill;
 1783}
 1784
 1785static inline void io_get_task_refs(int nr)
 1786{
 1787	struct io_uring_task *tctx = current->io_uring;
 1788
 1789	tctx->cached_refs -= nr;
 1790	if (unlikely(tctx->cached_refs < 0))
 1791		io_task_refs_refill(tctx);
 1792}
 1793
 1794static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 1795				     s32 res, u32 cflags)
 1796{
 1797	struct io_overflow_cqe *ocqe;
 1798
 1799	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
 1800	if (!ocqe) {
 1801		/*
 1802		 * If we're in ring overflow flush mode, or in task cancel mode,
 1803		 * or cannot allocate an overflow entry, then we need to drop it
 1804		 * on the floor.
 1805		 */
 1806		io_account_cq_overflow(ctx);
 1807		return false;
 1808	}
 1809	if (list_empty(&ctx->cq_overflow_list)) {
 1810		set_bit(0, &ctx->check_cq_overflow);
 1811		WRITE_ONCE(ctx->rings->sq_flags,
 1812			   ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
 1813
 1814	}
 1815	ocqe->cqe.user_data = user_data;
 1816	ocqe->cqe.res = res;
 1817	ocqe->cqe.flags = cflags;
 1818	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
 1819	return true;
 1820}
 1821
 1822static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
 1823					  s32 res, u32 cflags)
 1824{
 1825	struct io_uring_cqe *cqe;
 1826
 1827	trace_io_uring_complete(ctx, user_data, res, cflags);
 1828
 1829	/*
 1830	 * If we can't get a cq entry, userspace overflowed the
 1831	 * submission (by quite a lot). Increment the overflow count in
 1832	 * the ring.
 1833	 */
 1834	cqe = io_get_cqe(ctx);
 1835	if (likely(cqe)) {
 1836		WRITE_ONCE(cqe->user_data, user_data);
 1837		WRITE_ONCE(cqe->res, res);
 1838		WRITE_ONCE(cqe->flags, cflags);
 1839		return true;
 1840	}
 1841	return io_cqring_event_overflow(ctx, user_data, res, cflags);
 1842}
 1843
 1844/* not as hot to bloat with inlining */
 1845static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
 1846					  s32 res, u32 cflags)
 1847{
 1848	return __io_cqring_fill_event(ctx, user_data, res, cflags);
 1849}
 1850
 1851static void io_req_complete_post(struct io_kiocb *req, s32 res,
 1852				 u32 cflags)
 1853{
 1854	struct io_ring_ctx *ctx = req->ctx;
 1855
 1856	spin_lock(&ctx->completion_lock);
 1857	__io_cqring_fill_event(ctx, req->user_data, res, cflags);
 1858	/*
 1859	 * If we're the last reference to this request, add to our locked
 1860	 * free_list cache.
 1861	 */
 1862	if (req_ref_put_and_test(req)) {
 1863		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
 1864			if (req->flags & IO_DISARM_MASK)
 1865				io_disarm_next(req);
 1866			if (req->link) {
 1867				io_req_task_queue(req->link);
 1868				req->link = NULL;
 1869			}
 1870		}
 1871		io_req_put_rsrc(req, ctx);
 1872		io_dismantle_req(req);
 1873		io_put_task(req->task, 1);
 1874		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
 1875		ctx->locked_free_nr++;
 1876	}
 1877	io_commit_cqring(ctx);
 1878	spin_unlock(&ctx->completion_lock);
 1879	io_cqring_ev_posted(ctx);
 1880}
 1881
 1882static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
 1883					 u32 cflags)
 1884{
 1885	req->result = res;
 1886	req->cflags = cflags;
 1887	req->flags |= REQ_F_COMPLETE_INLINE;
 1888}
 1889
 1890static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
 1891				     s32 res, u32 cflags)
 1892{
 1893	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
 1894		io_req_complete_state(req, res, cflags);
 1895	else
 1896		io_req_complete_post(req, res, cflags);
 1897}
 1898
 1899static inline void io_req_complete(struct io_kiocb *req, s32 res)
 1900{
 1901	__io_req_complete(req, 0, res, 0);
 1902}
 1903
 1904static void io_req_complete_failed(struct io_kiocb *req, s32 res)
 1905{
 1906	req_set_fail(req);
 1907	io_req_complete_post(req, res, 0);
 1908}
 1909
 1910static void io_req_complete_fail_submit(struct io_kiocb *req)
 1911{
 1912	/*
 1913	 * We don't submit, fail them all, for that replace hardlinks with
 1914	 * normal links. Extra REQ_F_LINK is tolerated.
 1915	 */
 1916	req->flags &= ~REQ_F_HARDLINK;
 1917	req->flags |= REQ_F_LINK;
 1918	io_req_complete_failed(req, req->result);
 1919}
 1920
 1921/*
 1922 * Don't initialise the fields below on every allocation, but do that in
 1923 * advance and keep them valid across allocations.
 1924 */
 1925static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
 1926{
 1927	req->ctx = ctx;
 1928	req->link = NULL;
 1929	req->async_data = NULL;
 1930	/* not necessary, but safer to zero */
 1931	req->result = 0;
 1932}
 1933
 1934static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
 1935					struct io_submit_state *state)
 1936{
 1937	spin_lock(&ctx->completion_lock);
 1938	wq_list_splice(&ctx->locked_free_list, &state->free_list);
 1939	ctx->locked_free_nr = 0;
 1940	spin_unlock(&ctx->completion_lock);
 1941}
 1942
 1943/* Returns true IFF there are requests in the cache */
 1944static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
 1945{
 1946	struct io_submit_state *state = &ctx->submit_state;
 1947
 1948	/*
 1949	 * If we have more than a batch's worth of requests in our IRQ side
 1950	 * locked cache, grab the lock and move them over to our submission
 1951	 * side cache.
 1952	 */
 1953	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
 1954		io_flush_cached_locked_reqs(ctx, state);
 1955	return !!state->free_list.next;
 1956}
 1957
 1958/*
 1959 * A request might get retired back into the request caches even before opcode
 1960 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 1961 * Because of that, io_alloc_req() should be called only under ->uring_lock
 1962 * and with extra caution to not get a request that is still worked on.
 1963 */
 1964static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 1965	__must_hold(&ctx->uring_lock)
 1966{
 1967	struct io_submit_state *state = &ctx->submit_state;
 1968	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 1969	void *reqs[IO_REQ_ALLOC_BATCH];
 1970	struct io_kiocb *req;
 1971	int ret, i;
 1972
 1973	if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
 1974		return true;
 1975
 1976	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
 1977
 1978	/*
 1979	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
 1980	 * retry single alloc to be on the safe side.
 1981	 */
 1982	if (unlikely(ret <= 0)) {
 1983		reqs[0] = kmem_cache_alloc(req_cachep, gfp);
 1984		if (!reqs[0])
 1985			return false;
 1986		ret = 1;
 1987	}
 1988
 1989	percpu_ref_get_many(&ctx->refs, ret);
 1990	for (i = 0; i < ret; i++) {
 1991		req = reqs[i];
 1992
 1993		io_preinit_req(req, ctx);
 1994		wq_stack_add_head(&req->comp_list, &state->free_list);
 1995	}
 1996	return true;
 1997}
 1998
 1999static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
 2000{
 2001	if (unlikely(!ctx->submit_state.free_list.next))
 2002		return __io_alloc_req_refill(ctx);
 2003	return true;
 2004}
 2005
 2006static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 2007{
 2008	struct io_wq_work_node *node;
 2009
 2010	node = wq_stack_extract(&ctx->submit_state.free_list);
 2011	return container_of(node, struct io_kiocb, comp_list);
 2012}
 2013
 2014static inline void io_put_file(struct file *file)
 2015{
 2016	if (file)
 2017		fput(file);
 2018}
 2019
 2020static inline void io_dismantle_req(struct io_kiocb *req)
 2021{
 2022	unsigned int flags = req->flags;
 2023
 2024	if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
 2025		io_clean_op(req);
 2026	if (!(flags & REQ_F_FIXED_FILE))
 2027		io_put_file(req->file);
 2028}
 2029
 2030static __cold void __io_free_req(struct io_kiocb *req)
 2031{
 2032	struct io_ring_ctx *ctx = req->ctx;
 2033
 2034	io_req_put_rsrc(req, ctx);
 2035	io_dismantle_req(req);
 2036	io_put_task(req->task, 1);
 2037
 2038	spin_lock(&ctx->completion_lock);
 2039	wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
 2040	ctx->locked_free_nr++;
 2041	spin_unlock(&ctx->completion_lock);
 2042}
 2043
 2044static inline void io_remove_next_linked(struct io_kiocb *req)
 2045{
 2046	struct io_kiocb *nxt = req->link;
 2047
 2048	req->link = nxt->link;
 2049	nxt->link = NULL;
 2050}
 2051
 2052static bool io_kill_linked_timeout(struct io_kiocb *req)
 2053	__must_hold(&req->ctx->completion_lock)
 2054	__must_hold(&req->ctx->timeout_lock)
 2055{
 2056	struct io_kiocb *link = req->link;
 2057
 2058	if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
 2059		struct io_timeout_data *io = link->async_data;
 2060
 2061		io_remove_next_linked(req);
 2062		link->timeout.head = NULL;
 2063		if (hrtimer_try_to_cancel(&io->timer) != -1) {
 2064			list_del(&link->timeout.list);
 2065			io_cqring_fill_event(link->ctx, link->user_data,
 2066					     -ECANCELED, 0);
 2067			io_put_req_deferred(link);
 2068			return true;
 2069		}
 2070	}
 2071	return false;
 2072}
 2073
 2074static void io_fail_links(struct io_kiocb *req)
 2075	__must_hold(&req->ctx->completion_lock)
 2076{
 2077	struct io_kiocb *nxt, *link = req->link;
 2078
 2079	req->link = NULL;
 2080	while (link) {
 2081		long res = -ECANCELED;
 2082
 2083		if (link->flags & REQ_F_FAIL)
 2084			res = link->result;
 2085
 2086		nxt = link->link;
 2087		link->link = NULL;
 2088
 2089		trace_io_uring_fail_link(req, link);
 2090		io_cqring_fill_event(link->ctx, link->user_data, res, 0);
 2091		io_put_req_deferred(link);
 2092		link = nxt;
 2093	}
 2094}
 2095
 2096static bool io_disarm_next(struct io_kiocb *req)
 2097	__must_hold(&req->ctx->completion_lock)
 2098{
 2099	bool posted = false;
 2100
 2101	if (req->flags & REQ_F_ARM_LTIMEOUT) {
 2102		struct io_kiocb *link = req->link;
 2103
 2104		req->flags &= ~REQ_F_ARM_LTIMEOUT;
 2105		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
 2106			io_remove_next_linked(req);
 2107			io_cqring_fill_event(link->ctx, link->user_data,
 2108					     -ECANCELED, 0);
 2109			io_put_req_deferred(link);
 2110			posted = true;
 2111		}
 2112	} else if (req->flags & REQ_F_LINK_TIMEOUT) {
 2113		struct io_ring_ctx *ctx = req->ctx;
 2114
 2115		spin_lock_irq(&ctx->timeout_lock);
 2116		posted = io_kill_linked_timeout(req);
 2117		spin_unlock_irq(&ctx->timeout_lock);
 2118	}
 2119	if (unlikely((req->flags & REQ_F_FAIL) &&
 2120		     !(req->flags & REQ_F_HARDLINK))) {
 2121		posted |= (req->link != NULL);
 2122		io_fail_links(req);
 2123	}
 2124	return posted;
 2125}
 2126
 2127static void __io_req_find_next_prep(struct io_kiocb *req)
 2128{
 2129	struct io_ring_ctx *ctx = req->ctx;
 2130	bool posted;
 2131
 2132	spin_lock(&ctx->completion_lock);
 2133	posted = io_disarm_next(req);
 2134	if (posted)
 2135		io_commit_cqring(req->ctx);
 2136	spin_unlock(&ctx->completion_lock);
 2137	if (posted)
 2138		io_cqring_ev_posted(ctx);
 2139}
 2140
 2141static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 2142{
 2143	struct io_kiocb *nxt;
 2144
 2145	if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
 2146		return NULL;
 2147	/*
 2148	 * If LINK is set, we have dependent requests in this chain. If we
 2149	 * didn't fail this request, queue the first one up, moving any other
 2150	 * dependencies to the next request. In case of failure, fail the rest
 2151	 * of the chain.
 2152	 */
 2153	if (unlikely(req->flags & IO_DISARM_MASK))
 2154		__io_req_find_next_prep(req);
 2155	nxt = req->link;
 2156	req->link = NULL;
 2157	return nxt;
 2158}
 2159
 2160static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 2161{
 2162	if (!ctx)
 2163		return;
 2164	if (*locked) {
 2165		io_submit_flush_completions(ctx);
 2166		mutex_unlock(&ctx->uring_lock);
 2167		*locked = false;
 2168	}
 2169	percpu_ref_put(&ctx->refs);
 2170}
 2171
/*
 * task_work callback: drain the io_uring_task's pending request list and
 * invoke each request's io_task_work.func handler.
 *
 * The whole list is detached under task_lock and then walked without that
 * lock held. While consecutive requests belong to the same ring, a single
 * reference on that ring (and its uring_lock, if the trylock succeeded) is
 * kept across them; switching rings flushes and releases the previous one.
 */
static void tctx_task_work(struct callback_head *cb)
{
	bool locked = false;
	struct io_ring_ctx *ctx = NULL;
	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
						  task_work);

	while (1) {
		struct io_wq_work_node *node;

		/* nothing queued right now: flush what was batched under uring_lock */
		if (!tctx->task_list.first && locked)
			io_submit_flush_completions(ctx);

		/* grab the entire pending list in one go */
		spin_lock_irq(&tctx->task_lock);
		node = tctx->task_list.first;
		INIT_WQ_LIST(&tctx->task_list);
		/*
		 * List was empty: clear task_running so that the next
		 * io_req_task_work_add() queues the task_work again.
		 */
		if (!node)
			tctx->task_running = false;
		spin_unlock_irq(&tctx->task_lock);
		if (!node)
			break;

		do {
			/* fetch ->next before the handler is run on this request */
			struct io_wq_work_node *next = node->next;
			struct io_kiocb *req = container_of(node, struct io_kiocb,
							    io_task_work.node);

			/* ring changed: finish the old one, pin (and try to lock) the new one */
			if (req->ctx != ctx) {
				ctx_flush_and_put(ctx, &locked);
				ctx = req->ctx;
				/* if not contended, grab and improve batching */
				locked = mutex_trylock(&ctx->uring_lock);
				percpu_ref_get(&ctx->refs);
			}
			req->io_task_work.func(req, &locked);
			node = next;
		} while (node);

		cond_resched();
	}

	/* release whatever ring the last batch left us holding */
	ctx_flush_and_put(ctx, &locked);
}
 2215
/*
 * Queue @req on its task's io_uring task_work list so that
 * req->io_task_work.func gets called from tctx_task_work(). The caller must
 * have set io_task_work.func beforehand.
 *
 * Only the add that flips task_running from false to true queues the
 * task_work itself. If task_work_add() fails, every request currently on
 * the list is moved to its ring's fallback list and the ring's fallback
 * work is scheduled instead.
 */
static void io_req_task_work_add(struct io_kiocb *req)
{
	struct task_struct *tsk = req->task;
	struct io_uring_task *tctx = tsk->io_uring;
	enum task_work_notify_mode notify;
	struct io_wq_work_node *node;
	unsigned long flags;
	bool running;

	WARN_ON_ONCE(!tctx);

	/* irqsave variant: NOTE(review) presumably callable from IRQ context — confirm */
	spin_lock_irqsave(&tctx->task_lock, flags);
	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
	running = tctx->task_running;
	if (!running)
		tctx->task_running = true;
	spin_unlock_irqrestore(&tctx->task_lock, flags);

	/* task_work already pending, we're done */
	if (running)
		return;

	/*
	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
	 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
	 * processing task_work. There's no reliable way to tell if TWA_RESUME
	 * will do the job.
	 */
	notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
	if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
		if (notify == TWA_NONE)
			wake_up_process(tsk);
		return;
	}

	/*
	 * task_work_add() failed (presumably the task is exiting — confirm).
	 * Undo task_running and take over everything that is queued, which
	 * may include requests added by others in the meantime.
	 */
	spin_lock_irqsave(&tctx->task_lock, flags);
	tctx->task_running = false;
	node = tctx->task_list.first;
	INIT_WQ_LIST(&tctx->task_list);
	spin_unlock_irqrestore(&tctx->task_lock, flags);

	while (node) {
		req = container_of(node, struct io_kiocb, io_task_work.node);
		/*
		 * Advance before llist_add(): NOTE(review) ->node and
		 * ->fallback_node presumably share storage, so ->next must be
		 * read first — confirm against the io_task_work definition.
		 */
		node = node->next;
		/* llist_add() returning true is used as "work not yet scheduled" */
		if (llist_add(&req->io_task_work.fallback_node,
			      &req->ctx->fallback_llist))
			schedule_delayed_work(&req->ctx->fallback_work, 1);
	}
}
 2265
 2266static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
 2267{
 2268	struct io_ring_ctx *ctx = req->ctx;
 2269
 2270	/* not needed for normal modes, but SQPOLL depends on it */
 2271	io_tw_lock(ctx, locked);
 2272	io_req_complete_failed(req, req->result);
 2273}
 2274
 2275static void io_req_task_submit(struct io_kiocb *req, bool *locked)
 2276{
 2277	struct io_ring_ctx *ctx = req->ctx;
 2278
 2279	io_tw_lock(ctx, locked);
 2280	/* req->task == current here, checking PF_EXITING is safe */
 2281	if (likely(!(req->task->flags & PF_EXITING)))
 2282		__io_queue_sqe(req);
 2283	else
 2284		io_req_complete_failed(req, -EFAULT);
 2285}
 2286
/*
 * Record @ret in req->result and queue @req to task_work, where
 * io_req_task_cancel() will complete it as failed with that value.
 */
static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
	req->result = ret;
	req->io_task_work.func = io_req_task_cancel;
	io_req_task_work_add(req);
}
 2293
/* Queue @req to task_work, where io_req_task_submit() will issue it. */
static void io_req_task_queue(struct io_kiocb *req)
{
	req->io_task_work.func = io_req_task_submit;
	io_req_task_work_add(req);
}
 2299
/*
 * Queue @req to task_work with io_queue_async_work() as its handler; used
 * by kiocb_done() to retry a read/write that was flagged REQ_F_REISSUE.
 */
static void io_req_task_queue_reissue(struct io_kiocb *req)
{
	req->io_task_work.func = io_queue_async_work;
	io_req_task_work_add(req);
}
 2305
 2306static inline void io_queue_next(struct io_kiocb *req)
 2307{
 2308	struct io_kiocb *nxt = io_req_find_next(req);
 2309
 2310	if (nxt)
 2311		io_req_task_queue(nxt);
 2312}
 2313
/*
 * Free @req. Any request linked behind it is queued for submission first,
 * since the link is read from @req itself.
 */
static void io_free_req(struct io_kiocb *req)
{
	io_queue_next(req);
	__io_free_req(req);
}
 2319
/*
 * task_work handler wrapping io_free_req(); @locked is unused. Installed by
 * io_put_req_deferred().
 */
static void io_free_req_work(struct io_kiocb *req, bool *locked)
{
	io_free_req(req);
}
 2324
/*
 * Release a chain of requests linked through ->comp_list, starting at
 * @node, and push each released request onto ctx->submit_state.free_list.
 *
 * @node is dereferenced before the loop condition is checked, so it must
 * not be NULL. Task reference drops are accumulated over runs of
 * consecutive requests sharing the same ->task and handed to io_put_task()
 * in a single call per run.
 */
static void io_free_batch_list(struct io_ring_ctx *ctx,
				struct io_wq_work_node *node)
	__must_hold(&ctx->uring_lock)
{
	struct task_struct *task = NULL;
	int task_refs = 0;

	do {
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    comp_list);

		if (unlikely(req->flags & REQ_F_REFCOUNT)) {
			/*
			 * Advance first: if ours was not the last reference the
			 * request is skipped, and 'continue' goes straight to
			 * the loop condition with @node already moved on.
			 */
			node = req->comp_list.next;
			if (!req_ref_put_and_test(req))
				continue;
		}

		io_req_put_rsrc_locked(req, ctx);
		io_queue_next(req);
		io_dismantle_req(req);

		/* task changed: flush the refs counted for the previous one */
		if (req->task != task) {
			if (task)
				io_put_task(task, task_refs);
			task = req->task;
			task_refs = 0;
		}
		task_refs++;
		/* read the successor before ->comp_list is reused for the free list */
		node = req->comp_list.next;
		wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
	} while (node);

	/* drop the refs counted for the final run */
	if (task)
		io_put_task(task, task_refs);
}
 2360
 2361static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 2362	__must_hold(&ctx->uring_lock)
 2363{
 2364	struct io_wq_work_node *node, *prev;
 2365	struct io_submit_state *state = &ctx->submit_state;
 2366
 2367	spin_lock(&ctx->completion_lock);
 2368	wq_list_for_each(node, prev, &state->compl_reqs) {
 2369		struct io_kiocb *req = container_of(node, struct io_kiocb,
 2370						    comp_list);
 2371
 2372		__io_cqring_fill_event(ctx, req->user_data, req->result,
 2373					req->cflags);
 2374	}
 2375	io_commit_cqring(ctx);
 2376	spin_unlock(&ctx->completion_lock);
 2377	io_cqring_ev_posted(ctx);
 2378
 2379	io_free_batch_list(ctx, state->compl_reqs.first);
 2380	INIT_WQ_LIST(&state->compl_reqs);
 2381}
 2382
 2383/*
 2384 * Drop reference to request, return next in chain (if there is one) if this
 2385 * was the last reference to this request.
 2386 */
 2387static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
 2388{
 2389	struct io_kiocb *nxt = NULL;
 2390
 2391	if (req_ref_put_and_test(req)) {
 2392		nxt = io_req_find_next(req);
 2393		__io_free_req(req);
 2394	}
 2395	return nxt;
 2396}
 2397
 2398static inline void io_put_req(struct io_kiocb *req)
 2399{
 2400	if (req_ref_put_and_test(req))
 2401		io_free_req(req);
 2402}
 2403
 2404static inline void io_put_req_deferred(struct io_kiocb *req)
 2405{
 2406	if (req_ref_put_and_test(req)) {
 2407		req->io_task_work.func = io_free_req_work;
 2408		io_req_task_work_add(req);
 2409	}
 2410}
 2411
/*
 * Return __io_cqring_events() for @ctx, with a read barrier issued before
 * the count is sampled.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
	/* See comment at the top of this file */
	smp_rmb();
	return __io_cqring_events(ctx);
}
 2418
 2419static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 2420{
 2421	struct io_rings *rings = ctx->rings;
 2422
 2423	/* make sure SQ entry isn't read before tail */
 2424	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
 2425}
 2426
 2427static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
 2428{
 2429	unsigned int cflags;
 2430
 2431	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
 2432	cflags |= IORING_CQE_F_BUFFER;
 2433	req->flags &= ~REQ_F_BUFFER_SELECTED;
 2434	kfree(kbuf);
 2435	return cflags;
 2436}
 2437
 2438static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
 2439{
 2440	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
 2441		return 0;
 2442	return io_put_kbuf(req, req->kbuf);
 2443}
 2444
 2445static inline bool io_run_task_work(void)
 2446{
 2447	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
 2448		__set_current_state(TASK_RUNNING);
 2449		tracehook_notify_signal();
 2450		return true;
 2451	}
 2452
 2453	return false;
 2454}
 2455
/*
 * Poll the requests on ctx->iopoll_list and post CQEs for the ones that
 * have completed.
 *
 * @force_nonspin: when true, BLK_POLL_ONESHOT is always added to the poll
 *		   flags, regardless of ctx->poll_multi_queue.
 *
 * Returns the number of CQEs filled, 0 if nothing could be reaped, or the
 * negative value returned by the file's ->iopoll() handler.
 */
static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
	struct io_wq_work_node *pos, *start, *prev;
	unsigned int poll_flags = BLK_POLL_NOSLEEP;
	DEFINE_IO_COMP_BATCH(iob);
	int nr_events = 0;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list.
	 */
	if (ctx->poll_multi_queue || force_nonspin)
		poll_flags |= BLK_POLL_ONESHOT;

	/*
	 * First pass: poll entries until one is found that has completed, or
	 * until ->iopoll() left something in the completion batch.
	 */
	wq_list_for_each(pos, start, &ctx->iopoll_list) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
		struct kiocb *kiocb = &req->rw.kiocb;
		int ret;

		/*
		 * Stop at the first request that is already marked complete;
		 * reaping resumes from this position below.
		 */
		if (READ_ONCE(req->iopoll_completed))
			break;

		ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
		if (unlikely(ret < 0))
			return ret;
		else if (ret)
			/* ->iopoll() reported something: don't spin on the rest */
			poll_flags |= BLK_POLL_ONESHOT;

		/* iopoll may have completed current req */
		if (!rq_list_empty(iob.req_list) ||
		    READ_ONCE(req->iopoll_completed))
			break;
	}

	/* run batched completions; otherwise bail if the walk hit the list end */
	if (!rq_list_empty(iob.req_list))
		iob.complete(&iob);
	else if (!pos)
		return 0;

	/*
	 * Second pass: continue from where the first pass stopped and fill a
	 * CQE for each consecutive completed request. @start is left alone;
	 * @prev is the cursor advanced by the walk (NOTE(review): presumably
	 * it ends up at the last reaped entry — confirm against the
	 * wq_list_for_each_resume() definition).
	 */
	prev = start;
	wq_list_for_each_resume(pos, prev) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);

		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
		if (!smp_load_acquire(&req->iopoll_completed))
			break;
		__io_cqring_fill_event(ctx, req->user_data, req->result,
					io_put_rw_kbuf(req));
		nr_events++;
	}

	if (unlikely(!nr_events))
		return 0;

	io_commit_cqring(ctx);
	io_cqring_ev_posted_iopoll(ctx);
	/* first reaped entry: the one after @start, or the list head if none */
	pos = start ? start->next : ctx->iopoll_list.first;
	/* detach the reaped run from iopoll_list, then release its requests */
	wq_list_cut(&ctx->iopoll_list, prev, start);
	io_free_batch_list(ctx, pos);
	return nr_events;
}
 2522
 2523/*
 2524 * We can't just wait for polled events to come to us, we have to actively
 2525 * find and complete them.
 2526 */
 2527static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
 2528{
 2529	if (!(ctx->flags & IORING_SETUP_IOPOLL))
 2530		return;
 2531
 2532	mutex_lock(&ctx->uring_lock);
 2533	while (!wq_list_empty(&ctx->iopoll_list)) {
 2534		/* let it sleep and repeat later if can't complete a request */
 2535		if (io_do_iopoll(ctx, true) == 0)
 2536			break;
 2537		/*
 2538		 * Ensure we allow local-to-the-cpu processing to take place,
 2539		 * in this case we need to ensure that we reap all events.
 2540		 * Also let task_work, etc. to progress by releasing the mutex
 2541		 */
 2542		if (need_resched()) {
 2543			mutex_unlock(&ctx->uring_lock);
 2544			cond_resched();
 2545			mutex_lock(&ctx->uring_lock);
 2546		}
 2547	}
 2548	mutex_unlock(&ctx->uring_lock);
 2549}
 2550
/*
 * Poll for completions on @ctx until at least @min events were reaped, a
 * reschedule is needed, or there is nothing left to poll for.
 *
 * Returns 0, or the negative error returned by io_do_iopoll(). The number
 * of reaped events is not returned: @ret is reset to 0 after each
 * successful round.
 */
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
	unsigned int nr_events = 0;
	int ret = 0;

	/*
	 * We disallow the app entering submit/complete with polling, but we
	 * still need to lock the ring to prevent racing with polled issue
	 * that got punted to a workqueue.
	 */
	mutex_lock(&ctx->uring_lock);
	/*
	 * Don't enter poll loop if we already have events pending.
	 * If we do, we can potentially be spinning for commands that
	 * already triggered a CQE (eg in error).
	 */
	if (test_bit(0, &ctx->check_cq_overflow))
		__io_cqring_overflow_flush(ctx, false);
	if (io_cqring_events(ctx))
		goto out;
	do {
		/*
		 * If a submit got punted to a workqueue, we can have the
		 * application entering polling for a command before it gets
		 * issued. That app will hold the uring_lock for the duration
		 * of the poll right here, so we need to take a breather every
		 * now and then to ensure that the issue has a chance to add
		 * the poll to the issued list. Otherwise we can spin here
		 * forever, while the workqueue is stuck trying to acquire the
		 * very same mutex.
		 */
		if (wq_list_empty(&ctx->iopoll_list)) {
			/* snapshot the CQ tail to detect CQEs posted while unlocked */
			u32 tail = ctx->cached_cq_tail;

			mutex_unlock(&ctx->uring_lock);
			io_run_task_work();
			mutex_lock(&ctx->uring_lock);

			/* some requests don't go through iopoll_list */
			if (tail != ctx->cached_cq_tail ||
			    wq_list_empty(&ctx->iopoll_list))
				break;
		}
		/* with @min == 0, io_do_iopoll() is told not to spin */
		ret = io_do_iopoll(ctx, !min);
		if (ret < 0)
			break;
		nr_events += ret;
		ret = 0;
	} while (nr_events < min && !need_resched());
out:
	mutex_unlock(&ctx->uring_lock);
	return ret;
}
 2604
 2605static void kiocb_end_write(struct io_kiocb *req)
 2606{
 2607	/*
 2608	 * Tell lockdep we inherited freeze protection from submission
 2609	 * thread.
 2610	 */
 2611	if (req->flags & REQ_F_ISREG) {
 2612		struct super_block *sb = file_inode(req->file)->i_sb;
 2613
 2614		__sb_writers_acquired(sb, SB_FREEZE_WRITE);
 2615		sb_end_write(sb);
 2616	}
 2617}
 2618
 2619#ifdef CONFIG_BLOCK
 2620static bool io_resubmit_prep(struct io_kiocb *req)
 2621{
 2622	struct io_async_rw *rw = req->async_data;
 2623
 2624	if (!req_has_async_data(req))
 2625		return !io_req_prep_async(req);
 2626	iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
 2627	return true;
 2628}
 2629
 2630static bool io_rw_should_reissue(struct io_kiocb *req)
 2631{
 2632	umode_t mode = file_inode(req->file)->i_mode;
 2633	struct io_ring_ctx *ctx = req->ctx;
 2634
 2635	if (!S_ISBLK(mode) && !S_ISREG(mode))
 2636		return false;
 2637	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
 2638	    !(ctx->flags & IORING_SETUP_IOPOLL)))
 2639		return false;
 2640	/*
 2641	 * If ref is dying, we might be running poll reap from the exit work.
 2642	 * Don't attempt to reissue from that path, just let it fail with
 2643	 * -EAGAIN.
 2644	 */
 2645	if (percpu_ref_is_dying(&ctx->refs))
 2646		return false;
 2647	/*
 2648	 * Play it safe and assume not safe to re-import and reissue if we're
 2649	 * not in the original thread group (or in task context).
 2650	 */
 2651	if (!same_thread_group(req->task, current) || !in_task())
 2652		return false;
 2653	return true;
 2654}
 2655#else
/* !CONFIG_BLOCK stub: a request can never be prepared for reissue. */
static bool io_resubmit_prep(struct io_kiocb *req)
{
	return false;
}
/* !CONFIG_BLOCK stub: reissue is never attempted. */
static bool io_rw_should_reissue(struct io_kiocb *req)
{
	return false;
}
 2664#endif
 2665
 2666static bool __io_complete_rw_common(struct io_kiocb *req, long res)
 2667{
 2668	if (req->rw.kiocb.ki_flags & IOCB_WRITE)
 2669		kiocb_end_write(req);
 2670	if (unlikely(res != req->result)) {
 2671		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
 2672		    io_rw_should_reissue(req)) {
 2673			req->flags |= REQ_F_REISSUE;
 2674			return true;
 2675		}
 2676		req_set_fail(req);
 2677		req->result = res;
 2678	}
 2679	return false;
 2680}
 2681
 2682static void io_req_task_complete(struct io_kiocb *req, bool *locked)
 2683{
 2684	unsigned int cflags = io_put_rw_kbuf(req);
 2685	int res = req->result;
 2686
 2687	if (*locked) {
 2688		io_req_complete_state(req, res, cflags);
 2689		io_req_add_compl_list(req);
 2690	} else {
 2691		io_req_complete_post(req, res, cflags);
 2692	}
 2693}
 2694
 2695static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
 2696			     unsigned int issue_flags)
 2697{
 2698	if (__io_complete_rw_common(req, res))
 2699		return;
 2700	__io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
 2701}
 2702
 2703static void io_complete_rw(struct kiocb *kiocb, long res)
 2704{
 2705	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 2706
 2707	if (__io_complete_rw_common(req, res))
 2708		return;
 2709	req->result = res;
 2710	req->io_task_work.func = io_req_task_complete;
 2711	io_req_task_work_add(req);
 2712}
 2713
 2714static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
 2715{
 2716	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 2717
 2718	if (kiocb->ki_flags & IOCB_WRITE)
 2719		kiocb_end_write(req);
 2720	if (unlikely(res != req->result)) {
 2721		if (res == -EAGAIN && io_rw_should_reissue(req)) {
 2722			req->flags |= REQ_F_REISSUE;
 2723			return;
 2724		}
 2725		req->result = res;
 2726	}
 2727
 2728	/* order with io_iopoll_complete() checking ->iopoll_completed */
 2729	smp_store_release(&req->iopoll_completed, 1);
 2730}
 2731
/*
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from a io_do_iopoll() thread before the issuer is done
 * accessing the kiocb cookie.
 *
 * uring_lock is taken here only when @issue_flags has IO_URING_F_UNLOCKED
 * set; in that case an idle SQPOLL thread is also woken up.
 */
static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;

	/* workqueue context doesn't hold uring_lock, grab it now */
	if (unlikely(needs_lock))
		mutex_lock(&ctx->uring_lock);

	/*
	 * Track whether we have multiple files in our lists. This will impact
	 * how we do polling eventually, not spinning if we're on potentially
	 * different devices.
	 */
	if (wq_list_empty(&ctx->iopoll_list)) {
		ctx->poll_multi_queue = false;
	} else if (!ctx->poll_multi_queue) {
		struct io_kiocb *list_req;

		/* only the first entry on the list is compared against */
		list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
					comp_list);
		if (list_req->file != req->file)
			ctx->poll_multi_queue = true;
	}

	/*
	 * For fast devices, IO may have already completed. If it has, add
	 * it to the front so we find it first.
	 */
	if (READ_ONCE(req->iopoll_completed))
		wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
	else
		wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);

	if (unlikely(needs_lock)) {
		/*
		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
		 * in sq thread task context or in io worker task context. If
		 * the current task context is the sq thread, we don't need to
		 * check whether we should wake up the sq thread.
		 */
		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
		    wq_has_sleeper(&ctx->sq_data->wait))
			wake_up(&ctx->sq_data->wait);

		mutex_unlock(&ctx->uring_lock);
	}
}
 2786
/*
 * Returns true if there is no block device behind the file (@bdev is
 * NULL) or if blk_queue_nowait() is true for its request queue.
 */
static bool io_bdev_nowait(struct block_device *bdev)
{
	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
}
 2791
 2792/*
 2793 * If we tracked the file through the SCM inflight mechanism, we could support
 2794 * any file. For now, just ensure that anything potentially problematic is done
 2795 * inline.
 2796 */
 2797static bool __io_file_supports_nowait(struct file *file, umode_t mode)
 2798{
 2799	if (S_ISBLK(mode)) {
 2800		if (IS_ENABLED(CONFIG_BLOCK) &&
 2801		    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
 2802			return true;
 2803		return false;
 2804	}
 2805	if (S_ISSOCK(mode))
 2806		return true;
 2807	if (S_ISREG(mode)) {
 2808		if (IS_ENABLED(CONFIG_BLOCK) &&
 2809		    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
 2810		    file->f_op != &io_uring_fops)
 2811			return true;
 2812		return false;
 2813	}
 2814
 2815	/* any ->read/write should understand O_NONBLOCK */
 2816	if (file->f_flags & O_NONBLOCK)
 2817		return true;
 2818	return file->f_mode & FMODE_NOWAIT;
 2819}
 2820
 2821/*
 2822 * If we tracked the file through the SCM inflight mechanism, we could support
 2823 * any file. For now, just ensure that anything potentially problematic is done
 2824 * inline.
 2825 */
 2826static unsigned int io_file_get_flags(struct file *file)
 2827{
 2828	umode_t mode = file_inode(file)->i_mode;
 2829	unsigned int res = 0;
 2830
 2831	if (S_ISREG(mode))
 2832		res |= FFS_ISREG;
 2833	if (__io_file_supports_nowait(file, mode))
 2834		res |= FFS_NOWAIT;
 2835	return res;
 2836}
 2837
/* True if @req has REQ_F_SUPPORT_NOWAIT set in its flags. */
static inline bool io_file_supports_nowait(struct io_kiocb *req)
{
	return req->flags & REQ_F_SUPPORT_NOWAIT;
}
 2842
 2843static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 2844{
 2845	struct io_ring_ctx *ctx = req->ctx;
 2846	struct kiocb *kiocb = &req->rw.kiocb;
 2847	struct file *file = req->file;
 2848	unsigned ioprio;
 2849	int ret;
 2850
 2851	if (!io_req_ffs_set(req))
 2852		req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
 2853
 2854	kiocb->ki_pos = READ_ONCE(sqe->off);
 2855	if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
 2856		req->flags |= REQ_F_CUR_POS;
 2857		kiocb->ki_pos = file->f_pos;
 2858	}
 2859	kiocb->ki_flags = iocb_flags(file);
 2860	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
 2861	if (unlikely(ret))
 2862		return ret;
 2863
 2864	/*
 2865	 * If the file is marked O_NONBLOCK, still allow retry for it if it
 2866	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
 2867	 * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
 2868	 */
 2869	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
 2870	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
 2871		req->flags |= REQ_F_NOWAIT;
 2872
 2873	if (ctx->flags & IORING_SETUP_IOPOLL) {
 2874		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
 2875			return -EOPNOTSUPP;
 2876
 2877		kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
 2878		kiocb->ki_complete = io_complete_rw_iopoll;
 2879		req->iopoll_completed = 0;
 2880	} else {
 2881		if (kiocb->ki_flags & IOCB_HIPRI)
 2882			return -EINVAL;
 2883		kiocb->ki_complete = io_complete_rw;
 2884	}
 2885
 2886	ioprio = READ_ONCE(sqe->ioprio);
 2887	if (ioprio) {
 2888		ret = ioprio_check_cap(ioprio);
 2889		if (ret)
 2890			return ret;
 2891
 2892		kiocb->ki_ioprio = ioprio;
 2893	} else {
 2894		kiocb->ki_ioprio = get_current_ioprio();
 2895	}
 2896
 2897	req->imu = NULL;
 2898	req->rw.addr = READ_ONCE(sqe->addr);
 2899	req->rw.len = READ_ONCE(sqe->len);
 2900	req->buf_index = READ_ONCE(sqe->buf_index);
 2901	return 0;
 2902}
 2903
 2904static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 2905{
 2906	switch (ret) {
 2907	case -EIOCBQUEUED:
 2908		break;
 2909	case -ERESTARTSYS:
 2910	case -ERESTARTNOINTR:
 2911	case -ERESTARTNOHAND:
 2912	case -ERESTART_RESTARTBLOCK:
 2913		/*
 2914		 * We can't just restart the syscall, since previously
 2915		 * submitted sqes may already be in progress. Just fail this
 2916		 * IO with EINTR.
 2917		 */
 2918		ret = -EINTR;
 2919		fallthrough;
 2920	default:
 2921		kiocb->ki_complete(kiocb, ret);
 2922	}
 2923}
 2924
/*
 * Finish a read/write after the file operation returned @ret: fold in
 * bytes transferred by earlier attempts, write back the file position if
 * the request follows it, run the completion, and finally deal with a
 * request that got flagged REQ_F_REISSUE along the way.
 */
static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
		       unsigned int issue_flags)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
	struct io_async_rw *io = req->async_data;

	/* add previously done IO, if any */
	if (req_has_async_data(req) && io->bytes_done > 0) {
		/* an error after partial progress reports the progress instead */
		if (ret < 0)
			ret = io->bytes_done;
		else
			ret += io->bytes_done;
	}

	if (req->flags & REQ_F_CUR_POS)
		req->file->f_pos = kiocb->ki_pos;
	/* non-polled success completes inline; everything else goes via ->ki_complete */
	if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
		__io_complete_rw(req, ret, 0, issue_flags);
	else
		io_rw_done(kiocb, ret);

	/* the completion paths above may have asked for a reissue instead */
	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		if (io_resubmit_prep(req)) {
			io_req_task_queue_reissue(req);
		} else {
			/* can't reissue: complete the request as failed with @ret */
			unsigned int cflags = io_put_rw_kbuf(req);
			struct io_ring_ctx *ctx = req->ctx;

			req_set_fail(req);
			/* uring_lock is held around the completion in both branches */
			if (issue_flags & IO_URING_F_UNLOCKED) {
				mutex_lock(&ctx->uring_lock);
				__io_req_complete(req, issue_flags, ret, cflags);
				mutex_unlock(&ctx->uring_lock);
			} else {
				__io_req_complete(req, issue_flags, ret, cflags);
			}
		}
	}
}
 2965
/*
 * Set up @iter as a bvec iterator over the part of the registered buffer
 * @imu that the request addresses (req->rw.addr, req->rw.len).
 *
 * Returns 0 on success, -EFAULT if addr + len overflows or the range is
 * not fully inside [imu->ubuf, imu->ubuf_end].
 */
static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
			     struct io_mapped_ubuf *imu)
{
	size_t len = req->rw.len;
	u64 buf_end, buf_addr = req->rw.addr;
	size_t offset;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or covers the whole
		 * first bvec), just use iov_iter_advance(). This makes it
		 * easier since we can just skip the first segment, which may
		 * not be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			/* the first bvec, plus every full page of the remaining offset */
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			/* bv_len + offset equals the original offset again */
			iter->count -= bvec->bv_len + offset;
			/* remainder within the page we landed on */
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}
 3023
 3024static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
 3025{
 3026	struct io_mapped_ubuf *imu = req->imu;
 3027	u16 index, buf_index = req->buf_index;
 3028
 3029	if (likely(!imu)) {
 3030		struct io_ring_ctx *ctx = req->ctx;
 3031
 3032		if (unlikely(buf_index >= ctx->nr_user_bufs))
 3033			return -EFAULT;
 3034		io_req_set_rsrc_node(req, ctx);
 3035		index = array_index_nospec(buf_index, ctx->nr_user_bufs);
 3036		imu = READ_ONCE(ctx->user_bufs[index]);
 3037		req->imu = imu;
 3038	}
 3039	return __io_import_fixed(req, rw, iter, imu);
 3040}
 3041
/*
 * Counterpart of io_ring_submit_lock(): release ctx->uring_lock if
 * @needs_lock is set, otherwise do nothing.
 */
static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
{
	if (needs_lock)
		mutex_unlock(&ctx->uring_lock);
}
 3047
/*
 * Take ctx->uring_lock if @needs_lock is set, otherwise do nothing. Must be
 * paired with io_ring_submit_unlock() using the same @needs_lock value.
 */
static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
{
	/*
	 * "Normal" inline submissions always hold the uring_lock, since we
	 * grab it from the system call. Same is true for the SQPOLL offload.
	 * The only exception is when we've detached the request and issue it
	 * from an async worker thread, grab the lock for that case.
	 */
	if (needs_lock)
		mutex_lock(&ctx->uring_lock);
}
 3059
 3060static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
 3061					  int bgid, unsigned int issue_flags)
 3062{
 3063	struct io_buffer *kbuf = req->kbuf;
 3064	struct io_buffer *head;
 3065	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 3066
 3067	if (req->flags & REQ_F_BUFFER_SELECTED)
 3068		return kbuf;
 3069
 3070	io_ring_submit_lock(req->ctx, needs_lock);
 3071
 3072	lockdep_assert_held(&req->ctx->uring_lock);
 3073
 3074	head = xa_load(&req->ctx->io_buffers, bgid);
 3075	if (head) {
 3076		if (!list_empty(&head->list)) {
 3077			kbuf = list_last_entry(&head->list, struct io_buffer,
 3078							list);
 3079			list_del(&kbuf->list);
 3080		} else {
 3081			kbuf = head;
 3082			xa_erase(&req->ctx->io_buffers, bgid);
 3083		}
 3084		if (*len > kbuf->len)
 3085			*len = kbuf->len;
 3086		req->flags |= REQ_F_BUFFER_SELECTED;
 3087		req->kbuf = kbuf;
 3088	} else {
 3089		kbuf = ERR_PTR(-ENOBUFS);
 3090	}
 3091
 3092	io_ring_submit_unlock(req->ctx, needs_lock);
 3093	return kbuf;
 3094}
 3095
 3096static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
 3097					unsigned int issue_flags)
 3098{
 3099	struct io_buffer *kbuf;
 3100	u16 bgid;
 3101
 3102	bgid = req->buf_index;
 3103	kbuf = io_buffer_select(req, len, bgid, issue_flags);
 3104	if (IS_ERR(kbuf))
 3105		return kbuf;
 3106	return u64_to_user_ptr(kbuf->addr);
 3107}
 3108
 3109#ifdef CONFIG_COMPAT
 3110static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
 3111				unsigned int issue_flags)
 3112{
 3113	struct compat_iovec __user *uiov;
 3114	compat_ssize_t clen;
 3115	void __user *buf;
 3116	ssize_t len;
 3117
 3118	uiov = u64_to_user_ptr(req->rw.addr);
 3119	if (!access_ok(uiov, sizeof(*uiov)))
 3120		return -EFAULT;
 3121	if (__get_user(clen, &uiov->iov_len))
 3122		return -EFAULT;
 3123	if (clen < 0)
 3124		return -EINVAL;
 3125
 3126	len = clen;
 3127	buf = io_rw_buffer_select(req, &len, issue_flags);
 3128	if (IS_ERR(buf))
 3129		return PTR_ERR(buf);
 3130	iov[0].iov_base = buf;
 3131	iov[0].iov_len = (compat_size_t) len;
 3132	return 0;
 3133}
 3134#endif
 3135
 3136static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 3137				      unsigned int issue_flags)
 3138{
 3139	struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
 3140	void __user *buf;
 3141	ssize_t len;
 3142
 3143	if (copy_from_user(iov, uiov, sizeof(*uiov)))
 3144		return -EFAULT;
 3145
 3146	len = iov[0].iov_len;
 3147	if (len < 0)
 3148		return -EINVAL;
 3149	buf = io_rw_buffer_select(req, &len, issue_flags);
 3150	if (IS_ERR(buf))
 3151		return PTR_ERR(buf);
 3152	iov[0].iov_base = buf;
 3153	iov[0].iov_len = len;
 3154	return 0;
 3155}
 3156
 3157static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 3158				    unsigned int issue_flags)
 3159{
 3160	if (req->flags & REQ_F_BUFFER_SELECTED) {
 3161		struct io_buffer *kbuf = req->kbuf;
 3162
 3163		iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
 3164		iov[0].iov_len = kbuf->len;
 3165		return 0;
 3166	}
 3167	if (req->rw.len != 1)
 3168		return -EINVAL;
 3169
 3170#ifdef CONFIG_COMPAT
 3171	if (req->ctx->compat)
 3172		return io_compat_import(req, iov, issue_flags);
 3173#endif
 3174
 3175	return __io_iov_buffer_select(req, iov, issue_flags);
 3176}
 3177
 3178static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
 3179				       struct io_rw_state *s,
 3180				       unsigned int issue_flags)
 3181{
 3182	struct iov_iter *iter = &s->iter;
 3183	u8 opcode = req->opcode;
 3184	struct iovec *iovec;
 3185	void __user *buf;
 3186	size_t sqe_len;
 3187	ssize_t ret;
 3188
 3189	BUILD_BUG_ON(ERR_PTR(0) != NULL);
 3190
 3191	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED)
 3192		return ERR_PTR(io_import_fixed(req, rw, iter));
 3193
 3194	/* buffer index only valid with fixed read/write, or buffer select  */
 3195	if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
 3196		return ERR_PTR(-EINVAL);
 3197
 3198	buf = u64_to_user_ptr(req->rw.addr);
 3199	sqe_len = req->rw.len;
 3200
 3201	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
 3202		if (req->flags & REQ_F_BUFFER_SELECT) {
 3203			buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
 3204			if (IS_ERR(buf))
 3205				return ERR_CAST(buf);
 3206			req->rw.len = sqe_len;
 3207		}
 3208
 3209		ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
 3210		return ERR_PTR(ret);
 3211	}
 3212
 3213	iovec = s->fast_iov;
 3214	if (req->flags & REQ_F_BUFFER_SELECT) {
 3215		ret = io_iov_buffer_select(req, iovec, issue_flags);
 3216		if (!ret)
 3217			iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
 3218		return ERR_PTR(ret);
 3219	}
 3220
 3221	ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
 3222			      req->ctx->compat);
 3223	if (unlikely(ret < 0))
 3224		return ERR_PTR(ret);
 3225	return iovec;
 3226}
 3227
 3228static inline int io_import_iovec(int rw, struct io_kiocb *req,
 3229				  struct iovec **iovec, struct io_rw_state *s,
 3230				  unsigned int issue_flags)
 3231{
 3232	*iovec = __io_import_iovec(rw, req, s, issue_flags);
 3233	if (unlikely(IS_ERR(*iovec)))
 3234		return PTR_ERR(*iovec);
 3235
 3236	iov_iter_save_state(&s->iter, &s->iter_state);
 3237	return 0;
 3238}
 3239
 3240static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
 3241{
 3242	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
 3243}
 3244
 3245/*
 3246 * For files that don't have ->read_iter() and ->write_iter(), handle them
 3247 * by looping over ->read() or ->write() manually.
 3248 */
 3249static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
 3250{
 3251	struct kiocb *kiocb = &req->rw.kiocb;
 3252	struct file *file = req->file;
 3253	ssize_t ret = 0;
 3254
 3255	/*
 3256	 * Don't support polled IO through this interface, and we can't
 3257	 * support non-blocking either. For the latter, this just causes
 3258	 * the kiocb to be handled from an async context.
 3259	 */
 3260	if (kiocb->ki_flags & IOCB_HIPRI)
 3261		return -EOPNOTSUPP;
 3262	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
 3263	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
 3264		return -EAGAIN;
 3265
 3266	while (iov_iter_count(iter)) {
 3267		struct iovec iovec;
 3268		ssize_t nr;
 3269
 3270		if (!iov_iter_is_bvec(iter)) {
 3271			iovec = iov_iter_iovec(iter);
 3272		} else {
 3273			iovec.iov_base = u64_to_user_ptr(req->rw.addr);
 3274			iovec.iov_len = req->rw.len;
 3275		}
 3276
 3277		if (rw == READ) {
 3278			nr = file->f_op->read(file, iovec.iov_base,
 3279					      iovec.iov_len, io_kiocb_ppos(kiocb));
 3280		} else {
 3281			nr = file->f_op->write(file, iovec.iov_base,
 3282					       iovec.iov_len, io_kiocb_ppos(kiocb));
 3283		}
 3284
 3285		if (nr < 0) {
 3286			if (!ret)
 3287				ret = nr;
 3288			break;
 3289		}
 3290		if (!iov_iter_is_bvec(iter)) {
 3291			iov_iter_advance(iter, nr);
 3292		} else {
 3293			req->rw.len -= nr;
 3294			req->rw.addr += nr;
 3295		}
 3296		ret += nr;
 3297		if (nr != iovec.iov_len)
 3298			break;
 3299	}
 3300
 3301	return ret;
 3302}
 3303
 3304static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
 3305			  const struct iovec *fast_iov, struct iov_iter *iter)
 3306{
 3307	struct io_async_rw *rw = req->async_data;
 3308
 3309	memcpy(&rw->s.iter, iter, sizeof(*iter));
 3310	rw->free_iovec = iovec;
 3311	rw->bytes_done = 0;
 3312	/* can only be fixed buffers, no need to do anything */
 3313	if (iov_iter_is_bvec(iter))
 3314		return;
 3315	if (!iovec) {
 3316		unsigned iov_off = 0;
 3317
 3318		rw->s.iter.iov = rw->s.fast_iov;
 3319		if (iter->iov != fast_iov) {
 3320			iov_off = iter->iov - fast_iov;
 3321			rw->s.iter.iov += iov_off;
 3322		}
 3323		if (rw->s.fast_iov != fast_iov)
 3324			memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
 3325			       sizeof(struct iovec) * iter->nr_segs);
 3326	} else {
 3327		req->flags |= REQ_F_NEED_CLEANUP;
 3328	}
 3329}
 3330
 3331static inline bool io_alloc_async_data(struct io_kiocb *req)
 3332{
 3333	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
 3334	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
 3335	if (req->async_data) {
 3336		req->flags |= REQ_F_ASYNC_DATA;
 3337		return false;
 3338	}
 3339	return true;
 3340}
 3341
 3342static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 3343			     struct io_rw_state *s, bool force)
 3344{
 3345	if (!force && !io_op_defs[req->opcode].needs_async_setup)
 3346		return 0;
 3347	if (!req_has_async_data(req)) {
 3348		struct io_async_rw *iorw;
 3349
 3350		if (io_alloc_async_data(req)) {
 3351			kfree(iovec);
 3352			return -ENOMEM;
 3353		}
 3354
 3355		io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
 3356		iorw = req->async_data;
 3357		/* we've copied and mapped the iter, ensure state is saved */
 3358		iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
 3359	}
 3360	return 0;
 3361}
 3362
 3363static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
 3364{
 3365	struct io_async_rw *iorw = req->async_data;
 3366	struct iovec *iov;
 3367	int ret;
 3368
 3369	/* submission path, ->uring_lock should already be taken */
 3370	ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
 3371	if (unlikely(ret < 0))
 3372		return ret;
 3373
 3374	iorw->bytes_done = 0;
 3375	iorw->free_iovec = iov;
 3376	if (iov)
 3377		req->flags |= REQ_F_NEED_CLEANUP;
 3378	return 0;
 3379}
 3380
 3381static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 3382{
 3383	if (unlikely(!(req->file->f_mode & FMODE_READ)))
 3384		return -EBADF;
 3385	return io_prep_rw(req, sqe);
 3386}
 3387
 3388/*
 3389 * This is our waitqueue callback handler, registered through __folio_lock_async()
 3390 * when we initially tried to do the IO with the iocb armed our waitqueue.
 3391 * This gets called when the page is unlocked, and we generally expect that to
 3392 * happen when the page IO is completed and the page is now uptodate. This will
 3393 * queue a task_work based retry of the operation, attempting to copy the data
 3394 * again. If the latter fails because the page was NOT uptodate, then we will
 3395 * do a thread based blocking retry of the operation. That's the unexpected
 3396 * slow path.
 3397 */
 3398static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 3399			     int sync, void *arg)
 3400{
 3401	struct wait_page_queue *wpq;
 3402	struct io_kiocb *req = wait->private;
 3403	struct wait_page_key *key = arg;
 3404
 3405	wpq = container_of(wait, struct wait_page_queue, wait);
 3406
 3407	if (!wake_page_match(wpq, key))
 3408		return 0;
 3409
 3410	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
 3411	list_del_init(&wait->entry);
 3412	io_req_task_queue(req);
 3413	return 1;
 3414}
 3415
 3416/*
 3417 * This controls whether a given IO request should be armed for async page
 3418 * based retry. If we return false here, the request is handed to the async
 3419 * worker threads for retry. If we're doing buffered reads on a regular file,
 3420 * we prepare a private wait_page_queue entry and retry the operation. This
 3421 * will either succeed because the page is now uptodate and unlocked, or it
 3422 * will register a callback when the page is unlocked at IO completion. Through
 3423 * that callback, io_uring uses task_work to setup a retry of the operation.
 3424 * That retry will attempt the buffered read again. The retry will generally
 3425 * succeed, or in rare cases where it fails, we then fall back to using the
 3426 * async worker threads for a blocking retry.
 3427 */
 3428static bool io_rw_should_retry(struct io_kiocb *req)
 3429{
 3430	struct io_async_rw *rw = req->async_data;
 3431	struct wait_page_queue *wait = &rw->wpq;
 3432	struct kiocb *kiocb = &req->rw.kiocb;
 3433
 3434	/* never retry for NOWAIT, we just complete with -EAGAIN */
 3435	if (req->flags & REQ_F_NOWAIT)
 3436		return false;
 3437
 3438	/* Only for buffered IO */
 3439	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
 3440		return false;
 3441
 3442	/*
 3443	 * just use poll if we can, and don't attempt if the fs doesn't
 3444	 * support callback based unlocks
 3445	 */
 3446	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
 3447		return false;
 3448
 3449	wait->wait.func = io_async_buf_func;
 3450	wait->wait.private = req;
 3451	wait->wait.flags = 0;
 3452	INIT_LIST_HEAD(&wait->wait.entry);
 3453	kiocb->ki_flags |= IOCB_WAITQ;
 3454	kiocb->ki_flags &= ~IOCB_NOWAIT;
 3455	kiocb->ki_waitq = wait;
 3456	return true;
 3457}
 3458
 3459static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
 3460{
 3461	if (likely(req->file->f_op->read_iter))
 3462		return call_read_iter(req->file, &req->rw.kiocb, iter);
 3463	else if (req->file->f_op->read)
 3464		return loop_rw_iter(READ, req, iter);
 3465	else
 3466		return -EINVAL;
 3467}
 3468
 3469static bool need_read_all(struct io_kiocb *req)
 3470{
 3471	return req->flags & REQ_F_ISREG ||
 3472		S_ISBLK(file_inode(req->file)->i_mode);
 3473}
 3474
/*
 * Issue handler for read requests.
 *
 * The iterator either lives on the stack (first attempt, freshly imported)
 * or in req->async_data (a previous attempt already set up async state).
 *
 * Returns 0 once the request has been completed through kiocb_done() or
 * has been queued (-EIOCBQUEUED from the read), -EAGAIN when the caller
 * should retry the request from another context, or a negative error from
 * importing the iovec, rw_verify_area() or setting up the async data.
 */
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw_state __s, *s = &__s;
	struct iovec *iovec;
	struct kiocb *kiocb = &req->rw.kiocb;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	struct io_async_rw *rw;
	ssize_t ret, ret2;

	if (!req_has_async_data(req)) {
		ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	} else {
		rw = req->async_data;
		s = &rw->s;
		/*
		 * We come here from an earlier attempt, restore our state to
		 * match in case it doesn't. It's cheap enough that we don't
		 * need to make this conditional.
		 */
		iov_iter_restore(&s->iter, &s->iter_state);
		/* nothing for this function to free in this case */
		iovec = NULL;
	}
	/* total number of bytes this request asks for */
	req->result = iov_iter_count(&s->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req))) {
			ret = io_setup_async_rw(req, iovec, s, true);
			return ret ?: -EAGAIN;
		}
		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}

	ret = io_iter_do_read(req, &s->iter);

	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
		req->flags &= ~REQ_F_REISSUE;
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
			goto done;
		/* no retry on NONBLOCK nor RWF_NOWAIT */
		if (req->flags & REQ_F_NOWAIT)
			goto done;
		/* nothing was read; the retry path below advances by ret */
		ret = 0;
	} else if (ret == -EIOCBQUEUED) {
		goto out_free;
	} else if (ret == req->result || ret <= 0 || !force_nonblock ||
		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
		/* read all, failed, already did sync or don't want to retry */
		goto done;
	}

	/*
	 * Don't depend on the iter state matching what was consumed, or being
	 * untouched in case of error. Restore it and we'll advance it
	 * manually if we need to.
	 */
	iov_iter_restore(&s->iter, &s->iter_state);

	ret2 = io_setup_async_rw(req, iovec, s, true);
	if (ret2)
		return ret2;

	/* the iovec, if any, was handed to io_setup_async_rw() above */
	iovec = NULL;
	rw = req->async_data;
	s = &rw->s;
	/*
	 * Now use our persistent iterator and state, if we aren't already.
	 * We've restored and mapped the iter to match.
	 */

	do {
		/*
		 * We end up here because of a partial read, either from
		 * above or inside this loop. Advance the iter by the bytes
		 * that were consumed.
		 */
		iov_iter_advance(&s->iter, ret);
		if (!iov_iter_count(&s->iter))
			break;
		rw->bytes_done += ret;
		iov_iter_save_state(&s->iter, &s->iter_state);

		/* if we can retry, do so with the callbacks armed */
		if (!io_rw_should_retry(req)) {
			kiocb->ki_flags &= ~IOCB_WAITQ;
			return -EAGAIN;
		}

		/*
		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
		 * we get -EIOCBQUEUED, then we'll get a notification when the
		 * desired page gets unlocked. We can also get a partial read
		 * here, and if we do, then just retry at the new offset.
		 */
		ret = io_iter_do_read(req, &s->iter);
		if (ret == -EIOCBQUEUED)
			return 0;
		/* we got some bytes, but not all. retry. */
		kiocb->ki_flags &= ~IOCB_WAITQ;
		iov_iter_restore(&s->iter, &s->iter_state);
	} while (ret > 0);
done:
	kiocb_done(kiocb, ret, issue_flags);
out_free:
	/* it's faster to check here than to delegate to kfree */
	if (iovec)
		kfree(iovec);
	return 0;
}
 3596
 3597static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 3598{
 3599	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
 3600		return -EBADF;
 3601	req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file));
 3602	return io_prep_rw(req, sqe);
 3603}
 3604
/*
 * Issue handler for write requests.
 *
 * The iterator either lives on the stack (first attempt, freshly imported)
 * or in req->async_data (a previous attempt already set up async state).
 *
 * Returns the result of rw_verify_area() (0 on success) after the request
 * was completed through kiocb_done(), -EAGAIN when the iterator has been
 * stashed for a retry from another context, or a negative error from
 * importing the iovec, rw_verify_area() or setting up the async data.
 */
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw_state __s, *s = &__s;
	struct iovec *iovec;
	struct kiocb *kiocb = &req->rw.kiocb;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	ssize_t ret, ret2;

	if (!req_has_async_data(req)) {
		ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	} else {
		struct io_async_rw *rw = req->async_data;

		/* earlier attempt: reuse the saved iterator, rewound to its saved state */
		s = &rw->s;
		iov_iter_restore(&s->iter, &s->iter_state);
		/* nothing for this function to free in this case */
		iovec = NULL;
	}
	/* total number of bytes this request asks for */
	req->result = iov_iter_count(&s->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req)))
			goto copy_iov;

		/* file path doesn't support NOWAIT for non-direct_IO */
		if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
		    (req->flags & REQ_F_ISREG))
			goto copy_iov;

		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
	if (unlikely(ret))
		goto out_free;

	/*
	 * Open-code file_start_write here to grab freeze protection,
	 * which will be released by another thread in
	 * io_complete_rw().  Fool lockdep by telling it the lock got
	 * released so that it doesn't complain about the held lock when
	 * we return to userspace.
	 */
	if (req->flags & REQ_F_ISREG) {
		sb_start_write(file_inode(req->file)->i_sb);
		__sb_writers_release(file_inode(req->file)->i_sb,
					SB_FREEZE_WRITE);
	}
	kiocb->ki_flags |= IOCB_WRITE;

	/* ->write_iter() if available, else the ->write() loop, else fail */
	if (likely(req->file->f_op->write_iter))
		ret2 = call_write_iter(req->file, kiocb, &s->iter);
	else if (req->file->f_op->write)
		ret2 = loop_rw_iter(WRITE, req, &s->iter);
	else
		ret2 = -EINVAL;

	/* a requested reissue is handled like -EAGAIN */
	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		ret2 = -EAGAIN;
	}

	/*
	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
	 * retry them without IOCB_NOWAIT.
	 */
	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
		ret2 = -EAGAIN;
	/* no retry on NONBLOCK nor RWF_NOWAIT */
	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
		goto done;
	if (!force_nonblock || ret2 != -EAGAIN) {
		/* IOPOLL retry should happen for io-wq threads */
		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
			goto copy_iov;
done:
		kiocb_done(kiocb, ret2, issue_flags);
	} else {
copy_iov:
		/* rewind the iterator and stash it for the retry */
		iov_iter_restore(&s->iter, &s->iter_state);
		ret = io_setup_async_rw(req, iovec, s, false);
		return ret ?: -EAGAIN;
	}
out_free:
	/* it's reportedly faster than delegating the null check to kfree() */
	if (iovec)
		kfree(iovec);
	return ret;
}
 3699
 3700static int io_renameat_prep(struct io_kiocb *req,
 3701			    const struct io_uring_sqe *sqe)
 3702{
 3703	struct io_rename *ren = &req->rename;
 3704	const char __user *oldf, *newf;
 3705
 3706	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3707		return -EINVAL;
 3708	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
 3709		return -EINVAL;
 3710	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 3711		return -EBADF;
 3712
 3713	ren->old_dfd = READ_ONCE(sqe->fd);
 3714	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 3715	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 3716	ren->new_dfd = READ_ONCE(sqe->len);
 3717	ren->flags = READ_ONCE(sqe->rename_flags);
 3718
 3719	ren->oldpath = getname(oldf);
 3720	if (IS_ERR(ren->oldpath))
 3721		return PTR_ERR(ren->oldpath);
 3722
 3723	ren->newpath = getname(newf);
 3724	if (IS_ERR(ren->newpath)) {
 3725		putname(ren->oldpath);
 3726		return PTR_ERR(ren->newpath);
 3727	}
 3728
 3729	req->flags |= REQ_F_NEED_CLEANUP;
 3730	return 0;
 3731}
 3732
 3733static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
 3734{
 3735	struct io_rename *ren = &req->rename;
 3736	int ret;
 3737
 3738	if (issue_flags & IO_URING_F_NONBLOCK)
 3739		return -EAGAIN;
 3740
 3741	ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
 3742				ren->newpath, ren->flags);
 3743
 3744	req->flags &= ~REQ_F_NEED_CLEANUP;
 3745	if (ret < 0)
 3746		req_set_fail(req);
 3747	io_req_complete(req, ret);
 3748	return 0;
 3749}
 3750
 3751static int io_unlinkat_prep(struct io_kiocb *req,
 3752			    const struct io_uring_sqe *sqe)
 3753{
 3754	struct io_unlink *un = &req->unlink;
 3755	const char __user *fname;
 3756
 3757	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3758		return -EINVAL;
 3759	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
 3760	    sqe->splice_fd_in)
 3761		return -EINVAL;
 3762	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 3763		return -EBADF;
 3764
 3765	un->dfd = READ_ONCE(sqe->fd);
 3766
 3767	un->flags = READ_ONCE(sqe->unlink_flags);
 3768	if (un->flags & ~AT_REMOVEDIR)
 3769		return -EINVAL;
 3770
 3771	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
 3772	un->filename = getname(fname);
 3773	if (IS_ERR(un->filename))
 3774		return PTR_ERR(un->filename);
 3775
 3776	req->flags |= REQ_F_NEED_CLEANUP;
 3777	return 0;
 3778}
 3779
 3780static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
 3781{
 3782	struct io_unlink *un = &req->unlink;
 3783	int ret;
 3784
 3785	if (issue_flags & IO_URING_F_NONBLOCK)
 3786		return -EAGAIN;
 3787
 3788	if (un->flags & AT_REMOVEDIR)
 3789		ret = do_rmdir(un->dfd, un->filename);
 3790	else
 3791		ret = do_unlinkat(un->dfd, un->filename);
 3792
 3793	req->flags &= ~REQ_F_NEED_CLEANUP;
 3794	if (ret < 0)
 3795		req_set_fail(req);
 3796	io_req_complete(req, ret);
 3797	return 0;
 3798}
 3799
 3800static int io_mkdirat_prep(struct io_kiocb *req,
 3801			    const struct io_uring_sqe *sqe)
 3802{
 3803	struct io_mkdir *mkd = &req->mkdir;
 3804	const char __user *fname;
 3805
 3806	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3807		return -EINVAL;
 3808	if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
 3809	    sqe->splice_fd_in)
 3810		return -EINVAL;
 3811	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 3812		return -EBADF;
 3813
 3814	mkd->dfd = READ_ONCE(sqe->fd);
 3815	mkd->mode = READ_ONCE(sqe->len);
 3816
 3817	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
 3818	mkd->filename = getname(fname);
 3819	if (IS_ERR(mkd->filename))
 3820		return PTR_ERR(mkd->filename);
 3821
 3822	req->flags |= REQ_F_NEED_CLEANUP;
 3823	return 0;
 3824}
 3825
 3826static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
 3827{
 3828	struct io_mkdir *mkd = &req->mkdir;
 3829	int ret;
 3830
 3831	if (issue_flags & IO_URING_F_NONBLOCK)
 3832		return -EAGAIN;
 3833
 3834	ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
 3835
 3836	req->flags &= ~REQ_F_NEED_CLEANUP;
 3837	if (ret < 0)
 3838		req_set_fail(req);
 3839	io_req_complete(req, ret);
 3840	return 0;
 3841}
 3842
 3843static int io_symlinkat_prep(struct io_kiocb *req,
 3844			    const struct io_uring_sqe *sqe)
 3845{
 3846	struct io_symlink *sl = &req->symlink;
 3847	const char __user *oldpath, *newpath;
 3848
 3849	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3850		return -EINVAL;
 3851	if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
 3852	    sqe->splice_fd_in)
 3853		return -EINVAL;
 3854	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 3855		return -EBADF;
 3856
 3857	sl->new_dfd = READ_ONCE(sqe->fd);
 3858	oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
 3859	newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 3860
 3861	sl->oldpath = getname(oldpath);
 3862	if (IS_ERR(sl->oldpath))
 3863		return PTR_ERR(sl->oldpath);
 3864
 3865	sl->newpath = getname(newpath);
 3866	if (IS_ERR(sl->newpath)) {
 3867		putname(sl->oldpath);
 3868		return PTR_ERR(sl->newpath);
 3869	}
 3870
 3871	req->flags |= REQ_F_NEED_CLEANUP;
 3872	return 0;
 3873}
 3874
 3875static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
 3876{
 3877	struct io_symlink *sl = &req->symlink;
 3878	int ret;
 3879
 3880	if (issue_flags & IO_URING_F_NONBLOCK)
 3881		return -EAGAIN;
 3882
 3883	ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
 3884
 3885	req->flags &= ~REQ_F_NEED_CLEANUP;
 3886	if (ret < 0)
 3887		req_set_fail(req);
 3888	io_req_complete(req, ret);
 3889	return 0;
 3890}
 3891
 3892static int io_linkat_prep(struct io_kiocb *req,
 3893			    const struct io_uring_sqe *sqe)
 3894{
 3895	struct io_hardlink *lnk = &req->hardlink;
 3896	const char __user *oldf, *newf;
 3897
 3898	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3899		return -EINVAL;
 3900	if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
 3901		return -EINVAL;
 3902	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 3903		return -EBADF;
 3904
 3905	lnk->old_dfd = READ_ONCE(sqe->fd);
 3906	lnk->new_dfd = READ_ONCE(sqe->len);
 3907	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 3908	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 3909	lnk->flags = READ_ONCE(sqe->hardlink_flags);
 3910
 3911	lnk->oldpath = getname(oldf);
 3912	if (IS_ERR(lnk->oldpath))
 3913		return PTR_ERR(lnk->oldpath);
 3914
 3915	lnk->newpath = getname(newf);
 3916	if (IS_ERR(lnk->newpath)) {
 3917		putname(lnk->oldpath);
 3918		return PTR_ERR(lnk->newpath);
 3919	}
 3920
 3921	req->flags |= REQ_F_NEED_CLEANUP;
 3922	return 0;
 3923}
 3924
 3925static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
 3926{
 3927	struct io_hardlink *lnk = &req->hardlink;
 3928	int ret;
 3929
 3930	if (issue_flags & IO_URING_F_NONBLOCK)
 3931		return -EAGAIN;
 3932
 3933	ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
 3934				lnk->newpath, lnk->flags);
 3935
 3936	req->flags &= ~REQ_F_NEED_CLEANUP;
 3937	if (ret < 0)
 3938		req_set_fail(req);
 3939	io_req_complete(req, ret);
 3940	return 0;
 3941}
 3942
 3943static int io_shutdown_prep(struct io_kiocb *req,
 3944			    const struct io_uring_sqe *sqe)
 3945{
 3946#if defined(CONFIG_NET)
 3947	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3948		return -EINVAL;
 3949	if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
 3950		     sqe->buf_index || sqe->splice_fd_in))
 3951		return -EINVAL;
 3952
 3953	req->shutdown.how = READ_ONCE(sqe->len);
 3954	return 0;
 3955#else
 3956	return -EOPNOTSUPP;
 3957#endif
 3958}
 3959
 3960static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
 3961{
 3962#if defined(CONFIG_NET)
 3963	struct socket *sock;
 3964	int ret;
 3965
 3966	if (issue_flags & IO_URING_F_NONBLOCK)
 3967		return -EAGAIN;
 3968
 3969	sock = sock_from_file(req->file);
 3970	if (unlikely(!sock))
 3971		return -ENOTSOCK;
 3972
 3973	ret = __sys_shutdown_sock(sock, req->shutdown.how);
 3974	if (ret < 0)
 3975		req_set_fail(req);
 3976	io_req_complete(req, ret);
 3977	return 0;
 3978#else
 3979	return -EOPNOTSUPP;
 3980#endif
 3981}
 3982
 3983static int __io_splice_prep(struct io_kiocb *req,
 3984			    const struct io_uring_sqe *sqe)
 3985{
 3986	struct io_splice *sp = &req->splice;
 3987	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
 3988
 3989	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3990		return -EINVAL;
 3991
 3992	sp->file_in = NULL;
 3993	sp->len = READ_ONCE(sqe->len);
 3994	sp->flags = READ_ONCE(sqe->splice_flags);
 3995
 3996	if (unlikely(sp->flags & ~valid_flags))
 3997		return -EINVAL;
 3998
 3999	sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
 4000				  (sp->flags & SPLICE_F_FD_IN_FIXED));
 4001	if (!sp->file_in)
 4002		return -EBADF;
 4003	req->flags |= REQ_F_NEED_CLEANUP;
 4004	return 0;
 4005}
 4006
 4007static int io_tee_prep(struct io_kiocb *req,
 4008		       const struct io_uring_sqe *sqe)
 4009{
 4010	if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
 4011		return -EINVAL;
 4012	return __io_splice_prep(req, sqe);
 4013}
 4014
 4015static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
 4016{
 4017	struct io_splice *sp = &req->splice;
 4018	struct file *in = sp->file_in;
 4019	struct file *out = sp->file_out;
 4020	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 4021	long ret = 0;
 4022
 4023	if (issue_flags & IO_URING_F_NONBLOCK)
 4024		return -EAGAIN;
 4025	if (sp->len)
 4026		ret = do_tee(in, out, sp->len, flags);
 4027
 4028	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
 4029		io_put_file(in);
 4030	req->flags &= ~REQ_F_NEED_CLEANUP;
 4031
 4032	if (ret != sp->len)
 4033		req_set_fail(req);
 4034	io_req_complete(req, ret);
 4035	return 0;
 4036}
 4037
 4038static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4039{
 4040	struct io_splice *sp = &req->splice;
 4041
 4042	sp->off_in = READ_ONCE(sqe->splice_off_in);
 4043	sp->off_out = READ_ONCE(sqe->off);
 4044	return __io_splice_prep(req, sqe);
 4045}
 4046
 4047static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
 4048{
 4049	struct io_splice *sp = &req->splice;
 4050	struct file *in = sp->file_in;
 4051	struct file *out = sp->file_out;
 4052	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 4053	loff_t *poff_in, *poff_out;
 4054	long ret = 0;
 4055
 4056	if (issue_flags & IO_URING_F_NONBLOCK)
 4057		return -EAGAIN;
 4058
 4059	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
 4060	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
 4061
 4062	if (sp->len)
 4063		ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
 4064
 4065	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
 4066		io_put_file(in);
 4067	req->flags &= ~REQ_F_NEED_CLEANUP;
 4068
 4069	if (ret != sp->len)
 4070		req_set_fail(req);
 4071	io_req_complete(req, ret);
 4072	return 0;
 4073}
 4074
 4075/*
 4076 * IORING_OP_NOP just posts a completion event, nothing else.
 4077 */
 4078static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 4079{
 4080	struct io_ring_ctx *ctx = req->ctx;
 4081
 4082	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 4083		return -EINVAL;
 4084
 4085	__io_req_complete(req, issue_flags, 0, 0);
 4086	return 0;
 4087}
 4088
 4089static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4090{
 4091	struct io_ring_ctx *ctx = req->ctx;
 4092
 4093	if (!req->file)
 4094		return -EBADF;
 4095
 4096	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 4097		return -EINVAL;
 4098	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
 4099		     sqe->splice_fd_in))
 4100		return -EINVAL;
 4101
 4102	req->sync.flags = READ_ONCE(sqe->fsync_flags);
 4103	if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
 4104		return -EINVAL;
 4105
 4106	req->sync.off = READ_ONCE(sqe->off);
 4107	req->sync.len = READ_ONCE(sqe->len);
 4108	return 0;
 4109}
 4110
 4111static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
 4112{
 4113	loff_t end = req->sync.off + req->sync.len;
 4114	int ret;
 4115
 4116	/* fsync always requires a blocking context */
 4117	if (issue_flags & IO_URING_F_NONBLOCK)
 4118		return -EAGAIN;
 4119
 4120	ret = vfs_fsync_range(req->file, req->sync.off,
 4121				end > 0 ? end : LLONG_MAX,
 4122				req->sync.flags & IORING_FSYNC_DATASYNC);
 4123	if (ret < 0)
 4124		req_set_fail(req);
 4125	io_req_complete(req, ret);
 4126	return 0;
 4127}
 4128
 4129static int io_fallocate_prep(struct io_kiocb *req,
 4130			     const struct io_uring_sqe *sqe)
 4131{
 4132	if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
 4133	    sqe->splice_fd_in)
 4134		return -EINVAL;
 4135	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4136		return -EINVAL;
 4137
 4138	req->sync.off = READ_ONCE(sqe->off);
 4139	req->sync.len = READ_ONCE(sqe->addr);
 4140	req->sync.mode = READ_ONCE(sqe->len);
 4141	return 0;
 4142}
 4143
 4144static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
 4145{
 4146	int ret;
 4147
 4148	/* fallocate always requiring blocking context */
 4149	if (issue_flags & IO_URING_F_NONBLOCK)
 4150		return -EAGAIN;
 4151	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
 4152				req->sync.len);
 4153	if (ret < 0)
 4154		req_set_fail(req);
 4155	io_req_complete(req, ret);
 4156	return 0;
 4157}
 4158
 4159static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4160{
 4161	const char __user *fname;
 4162	int ret;
 4163
 4164	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4165		return -EINVAL;
 4166	if (unlikely(sqe->ioprio || sqe->buf_index))
 4167		return -EINVAL;
 4168	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 4169		return -EBADF;
 4170
 4171	/* open.how should be already initialised */
 4172	if (!(req->open.how.flags & O_PATH) && force_o_largefile())
 4173		req->open.how.flags |= O_LARGEFILE;
 4174
 4175	req->open.dfd = READ_ONCE(sqe->fd);
 4176	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4177	req->open.filename = getname(fname);
 4178	if (IS_ERR(req->open.filename)) {
 4179		ret = PTR_ERR(req->open.filename);
 4180		req->open.filename = NULL;
 4181		return ret;
 4182	}
 4183
 4184	req->open.file_slot = READ_ONCE(sqe->file_index);
 4185	if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
 4186		return -EINVAL;
 4187
 4188	req->open.nofile = rlimit(RLIMIT_NOFILE);
 4189	req->flags |= REQ_F_NEED_CLEANUP;
 4190	return 0;
 4191}
 4192
 4193static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4194{
 4195	u64 mode = READ_ONCE(sqe->len);
 4196	u64 flags = READ_ONCE(sqe->open_flags);
 4197
 4198	req->open.how = build_open_how(flags, mode);
 4199	return __io_openat_prep(req, sqe);
 4200}
 4201
 4202static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4203{
 4204	struct open_how __user *how;
 4205	size_t len;
 4206	int ret;
 4207
 4208	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 4209	len = READ_ONCE(sqe->len);
 4210	if (len < OPEN_HOW_SIZE_VER0)
 4211		return -EINVAL;
 4212
 4213	ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
 4214					len);
 4215	if (ret)
 4216		return ret;
 4217
 4218	return __io_openat_prep(req, sqe);
 4219}
 4220
 4221static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
 4222{
 4223	struct open_flags op;
 4224	struct file *file;
 4225	bool resolve_nonblock, nonblock_set;
 4226	bool fixed = !!req->open.file_slot;
 4227	int ret;
 4228
 4229	ret = build_open_flags(&req->open.how, &op);
 4230	if (ret)
 4231		goto err;
 4232	nonblock_set = op.open_flag & O_NONBLOCK;
 4233	resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
 4234	if (issue_flags & IO_URING_F_NONBLOCK) {
 4235		/*
 4236		 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
 4237		 * it'll always -EAGAIN
 4238		 */
 4239		if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
 4240			return -EAGAIN;
 4241		op.lookup_flags |= LOOKUP_CACHED;
 4242		op.open_flag |= O_NONBLOCK;
 4243	}
 4244
 4245	if (!fixed) {
 4246		ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
 4247		if (ret < 0)
 4248			goto err;
 4249	}
 4250
 4251	file = do_filp_open(req->open.dfd, req->open.filename, &op);
 4252	if (IS_ERR(file)) {
 4253		/*
 4254		 * We could hang on to this 'fd' on retrying, but seems like
 4255		 * marginal gain for something that is now known to be a slower
 4256		 * path. So just put it, and we'll get a new one when we retry.
 4257		 */
 4258		if (!fixed)
 4259			put_unused_fd(ret);
 4260
 4261		ret = PTR_ERR(file);
 4262		/* only retry if RESOLVE_CACHED wasn't already set by application */
 4263		if (ret == -EAGAIN &&
 4264		    (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
 4265			return -EAGAIN;
 4266		goto err;
 4267	}
 4268
 4269	if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
 4270		file->f_flags &= ~O_NONBLOCK;
 4271	fsnotify_open(file);
 4272
 4273	if (!fixed)
 4274		fd_install(ret, file);
 4275	else
 4276		ret = io_install_fixed_file(req, file, issue_flags,
 4277					    req->open.file_slot - 1);
 4278err:
 4279	putname(req->open.filename);
 4280	req->flags &= ~REQ_F_NEED_CLEANUP;
 4281	if (ret < 0)
 4282		req_set_fail(req);
 4283	__io_req_complete(req, issue_flags, ret, 0);
 4284	return 0;
 4285}
 4286
 4287static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
 4288{
 4289	return io_openat2(req, issue_flags);
 4290}
 4291
 4292static int io_remove_buffers_prep(struct io_kiocb *req,
 4293				  const struct io_uring_sqe *sqe)
 4294{
 4295	struct io_provide_buf *p = &req->pbuf;
 4296	u64 tmp;
 4297
 4298	if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
 4299	    sqe->splice_fd_in)
 4300		return -EINVAL;
 4301
 4302	tmp = READ_ONCE(sqe->fd);
 4303	if (!tmp || tmp > USHRT_MAX)
 4304		return -EINVAL;
 4305
 4306	memset(p, 0, sizeof(*p));
 4307	p->nbufs = tmp;
 4308	p->bgid = READ_ONCE(sqe->buf_group);
 4309	return 0;
 4310}
 4311
 4312static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
 4313			       int bgid, unsigned nbufs)
 4314{
 4315	unsigned i = 0;
 4316
 4317	/* shouldn't happen */
 4318	if (!nbufs)
 4319		return 0;
 4320
 4321	/* the head kbuf is the list itself */
 4322	while (!list_empty(&buf->list)) {
 4323		struct io_buffer *nxt;
 4324
 4325		nxt = list_first_entry(&buf->list, struct io_buffer, list);
 4326		list_del(&nxt->list);
 4327		kfree(nxt);
 4328		if (++i == nbufs)
 4329			return i;
 4330	}
 4331	i++;
 4332	kfree(buf);
 4333	xa_erase(&ctx->io_buffers, bgid);
 4334
 4335	return i;
 4336}
 4337
 4338static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 4339{
 4340	struct io_provide_buf *p = &req->pbuf;
 4341	struct io_ring_ctx *ctx = req->ctx;
 4342	struct io_buffer *head;
 4343	int ret = 0;
 4344	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 4345
 4346	io_ring_submit_lock(ctx, needs_lock);
 4347
 4348	lockdep_assert_held(&ctx->uring_lock);
 4349
 4350	ret = -ENOENT;
 4351	head = xa_load(&ctx->io_buffers, p->bgid);
 4352	if (head)
 4353		ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
 4354	if (ret < 0)
 4355		req_set_fail(req);
 4356
 4357	/* complete before unlock, IOPOLL may need the lock */
 4358	__io_req_complete(req, issue_flags, ret, 0);
 4359	io_ring_submit_unlock(ctx, needs_lock);
 4360	return 0;
 4361}
 4362
 4363static int io_provide_buffers_prep(struct io_kiocb *req,
 4364				   const struct io_uring_sqe *sqe)
 4365{
 4366	unsigned long size, tmp_check;
 4367	struct io_provide_buf *p = &req->pbuf;
 4368	u64 tmp;
 4369
 4370	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
 4371		return -EINVAL;
 4372
 4373	tmp = READ_ONCE(sqe->fd);
 4374	if (!tmp || tmp > USHRT_MAX)
 4375		return -E2BIG;
 4376	p->nbufs = tmp;
 4377	p->addr = READ_ONCE(sqe->addr);
 4378	p->len = READ_ONCE(sqe->len);
 4379
 4380	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
 4381				&size))
 4382		return -EOVERFLOW;
 4383	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
 4384		return -EOVERFLOW;
 4385
 4386	size = (unsigned long)p->len * p->nbufs;
 4387	if (!access_ok(u64_to_user_ptr(p->addr), size))
 4388		return -EFAULT;
 4389
 4390	p->bgid = READ_ONCE(sqe->buf_group);
 4391	tmp = READ_ONCE(sqe->off);
 4392	if (tmp > USHRT_MAX)
 4393		return -E2BIG;
 4394	p->bid = tmp;
 4395	return 0;
 4396}
 4397
 4398static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
 4399{
 4400	struct io_buffer *buf;
 4401	u64 addr = pbuf->addr;
 4402	int i, bid = pbuf->bid;
 4403
 4404	for (i = 0; i < pbuf->nbufs; i++) {
 4405		buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
 4406		if (!buf)
 4407			break;
 4408
 4409		buf->addr = addr;
 4410		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
 4411		buf->bid = bid;
 4412		addr += pbuf->len;
 4413		bid++;
 4414		if (!*head) {
 4415			INIT_LIST_HEAD(&buf->list);
 4416			*head = buf;
 4417		} else {
 4418			list_add_tail(&buf->list, &(*head)->list);
 4419		}
 4420	}
 4421
 4422	return i ? i : -ENOMEM;
 4423}
 4424
 4425static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 4426{
 4427	struct io_provide_buf *p = &req->pbuf;
 4428	struct io_ring_ctx *ctx = req->ctx;
 4429	struct io_buffer *head, *list;
 4430	int ret = 0;
 4431	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 4432
 4433	io_ring_submit_lock(ctx, needs_lock);
 4434
 4435	lockdep_assert_held(&ctx->uring_lock);
 4436
 4437	list = head = xa_load(&ctx->io_buffers, p->bgid);
 4438
 4439	ret = io_add_buffers(p, &head);
 4440	if (ret >= 0 && !list) {
 4441		ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
 4442		if (ret < 0)
 4443			__io_remove_buffers(ctx, head, p->bgid, -1U);
 4444	}
 4445	if (ret < 0)
 4446		req_set_fail(req);
 4447	/* complete before unlock, IOPOLL may need the lock */
 4448	__io_req_complete(req, issue_flags, ret, 0);
 4449	io_ring_submit_unlock(ctx, needs_lock);
 4450	return 0;
 4451}
 4452
 4453static int io_epoll_ctl_prep(struct io_kiocb *req,
 4454			     const struct io_uring_sqe *sqe)
 4455{
 4456#if defined(CONFIG_EPOLL)
 4457	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
 4458		return -EINVAL;
 4459	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4460		return -EINVAL;
 4461
 4462	req->epoll.epfd = READ_ONCE(sqe->fd);
 4463	req->epoll.op = READ_ONCE(sqe->len);
 4464	req->epoll.fd = READ_ONCE(sqe->off);
 4465
 4466	if (ep_op_has_event(req->epoll.op)) {
 4467		struct epoll_event __user *ev;
 4468
 4469		ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4470		if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
 4471			return -EFAULT;
 4472	}
 4473
 4474	return 0;
 4475#else
 4476	return -EOPNOTSUPP;
 4477#endif
 4478}
 4479
 4480static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
 4481{
 4482#if defined(CONFIG_EPOLL)
 4483	struct io_epoll *ie = &req->epoll;
 4484	int ret;
 4485	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 4486
 4487	ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
 4488	if (force_nonblock && ret == -EAGAIN)
 4489		return -EAGAIN;
 4490
 4491	if (ret < 0)
 4492		req_set_fail(req);
 4493	__io_req_complete(req, issue_flags, ret, 0);
 4494	return 0;
 4495#else
 4496	return -EOPNOTSUPP;
 4497#endif
 4498}
 4499
 4500static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4501{
 4502#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
 4503	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
 4504		return -EINVAL;
 4505	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4506		return -EINVAL;
 4507
 4508	req->madvise.addr = READ_ONCE(sqe->addr);
 4509	req->madvise.len = READ_ONCE(sqe->len);
 4510	req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
 4511	return 0;
 4512#else
 4513	return -EOPNOTSUPP;
 4514#endif
 4515}
 4516
 4517static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
 4518{
 4519#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
 4520	struct io_madvise *ma = &req->madvise;
 4521	int ret;
 4522
 4523	if (issue_flags & IO_URING_F_NONBLOCK)
 4524		return -EAGAIN;
 4525
 4526	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
 4527	if (ret < 0)
 4528		req_set_fail(req);
 4529	io_req_complete(req, ret);
 4530	return 0;
 4531#else
 4532	return -EOPNOTSUPP;
 4533#endif
 4534}
 4535
 4536static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4537{
 4538	if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
 4539		return -EINVAL;
 4540	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4541		return -EINVAL;
 4542
 4543	req->fadvise.offset = READ_ONCE(sqe->off);
 4544	req->fadvise.len = READ_ONCE(sqe->len);
 4545	req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
 4546	return 0;
 4547}
 4548
 4549static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
 4550{
 4551	struct io_fadvise *fa = &req->fadvise;
 4552	int ret;
 4553
 4554	if (issue_flags & IO_URING_F_NONBLOCK) {
 4555		switch (fa->advice) {
 4556		case POSIX_FADV_NORMAL:
 4557		case POSIX_FADV_RANDOM:
 4558		case POSIX_FADV_SEQUENTIAL:
 4559			break;
 4560		default:
 4561			return -EAGAIN;
 4562		}
 4563	}
 4564
 4565	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
 4566	if (ret < 0)
 4567		req_set_fail(req);
 4568	__io_req_complete(req, issue_flags, ret, 0);
 4569	return 0;
 4570}
 4571
 4572static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4573{
 4574	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4575		return -EINVAL;
 4576	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
 4577		return -EINVAL;
 4578	if (req->flags & REQ_F_FIXED_FILE)
 4579		return -EBADF;
 4580
 4581	req->statx.dfd = READ_ONCE(sqe->fd);
 4582	req->statx.mask = READ_ONCE(sqe->len);
 4583	req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4584	req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 4585	req->statx.flags = READ_ONCE(sqe->statx_flags);
 4586
 4587	return 0;
 4588}
 4589
 4590static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
 4591{
 4592	struct io_statx *ctx = &req->statx;
 4593	int ret;
 4594
 4595	if (issue_flags & IO_URING_F_NONBLOCK)
 4596		return -EAGAIN;
 4597
 4598	ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
 4599		       ctx->buffer);
 4600
 4601	if (ret < 0)
 4602		req_set_fail(req);
 4603	io_req_complete(req, ret);
 4604	return 0;
 4605}
 4606
 4607static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4608{
 4609	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4610		return -EINVAL;
 4611	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
 4612	    sqe->rw_flags || sqe->buf_index)
 4613		return -EINVAL;
 4614	if (req->flags & REQ_F_FIXED_FILE)
 4615		return -EBADF;
 4616
 4617	req->close.fd = READ_ONCE(sqe->fd);
 4618	req->close.file_slot = READ_ONCE(sqe->file_index);
 4619	if (req->close.file_slot && req->close.fd)
 4620		return -EINVAL;
 4621
 4622	return 0;
 4623}
 4624
 4625static int io_close(struct io_kiocb *req, unsigned int issue_flags)
 4626{
 4627	struct files_struct *files = current->files;
 4628	struct io_close *close = &req->close;
 4629	struct fdtable *fdt;
 4630	struct file *file = NULL;
 4631	int ret = -EBADF;
 4632
 4633	if (req->close.file_slot) {
 4634		ret = io_close_fixed(req, issue_flags);
 4635		goto err;
 4636	}
 4637
 4638	spin_lock(&files->file_lock);
 4639	fdt = files_fdtable(files);
 4640	if (close->fd >= fdt->max_fds) {
 4641		spin_unlock(&files->file_lock);
 4642		goto err;
 4643	}
 4644	file = fdt->fd[close->fd];
 4645	if (!file || file->f_op == &io_uring_fops) {
 4646		spin_unlock(&files->file_lock);
 4647		file = NULL;
 4648		goto err;
 4649	}
 4650
 4651	/* if the file has a flush method, be safe and punt to async */
 4652	if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
 4653		spin_unlock(&files->file_lock);
 4654		return -EAGAIN;
 4655	}
 4656
 4657	ret = __close_fd_get_file(close->fd, &file);
 4658	spin_unlock(&files->file_lock);
 4659	if (ret < 0) {
 4660		if (ret == -ENOENT)
 4661			ret = -EBADF;
 4662		goto err;
 4663	}
 4664
 4665	/* No ->flush() or already async, safely close from here */
 4666	ret = filp_close(file, current->files);
 4667err:
 4668	if (ret < 0)
 4669		req_set_fail(req);
 4670	if (file)
 4671		fput(file);
 4672	__io_req_complete(req, issue_flags, ret, 0);
 4673	return 0;
 4674}
 4675
 4676static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4677{
 4678	struct io_ring_ctx *ctx = req->ctx;
 4679
 4680	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 4681		return -EINVAL;
 4682	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
 4683		     sqe->splice_fd_in))
 4684		return -EINVAL;
 4685
 4686	req->sync.off = READ_ONCE(sqe->off);
 4687	req->sync.len = READ_ONCE(sqe->len);
 4688	req->sync.flags = READ_ONCE(sqe->sync_range_flags);
 4689	return 0;
 4690}
 4691
 4692static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
 4693{
 4694	int ret;
 4695
 4696	/* sync_file_range always requires a blocking context */
 4697	if (issue_flags & IO_URING_F_NONBLOCK)
 4698		return -EAGAIN;
 4699
 4700	ret = sync_file_range(req->file, req->sync.off, req->sync.len,
 4701				req->sync.flags);
 4702	if (ret < 0)
 4703		req_set_fail(req);
 4704	io_req_complete(req, ret);
 4705	return 0;
 4706}
 4707
 4708#if defined(CONFIG_NET)
 4709static int io_setup_async_msg(struct io_kiocb *req,
 4710			      struct io_async_msghdr *kmsg)
 4711{
 4712	struct io_async_msghdr *async_msg = req->async_data;
 4713
 4714	if (async_msg)
 4715		return -EAGAIN;
 4716	if (io_alloc_async_data(req)) {
 4717		kfree(kmsg->free_iov);
 4718		return -ENOMEM;
 4719	}
 4720	async_msg = req->async_data;
 4721	req->flags |= REQ_F_NEED_CLEANUP;
 4722	memcpy(async_msg, kmsg, sizeof(*kmsg));
 4723	async_msg->msg.msg_name = &async_msg->addr;
 4724	/* if were using fast_iov, set it to the new one */
 4725	if (!async_msg->free_iov)
 4726		async_msg->msg.msg_iter.iov = async_msg->fast_iov;
 4727
 4728	return -EAGAIN;
 4729}
 4730
 4731static int io_sendmsg_copy_hdr(struct io_kiocb *req,
 4732			       struct io_async_msghdr *iomsg)
 4733{
 4734	iomsg->msg.msg_name = &iomsg->addr;
 4735	iomsg->free_iov = iomsg->fast_iov;
 4736	return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
 4737				   req->sr_msg.msg_flags, &iomsg->free_iov);
 4738}
 4739
 4740static int io_sendmsg_prep_async(struct io_kiocb *req)
 4741{
 4742	int ret;
 4743
 4744	ret = io_sendmsg_copy_hdr(req, req->async_data);
 4745	if (!ret)
 4746		req->flags |= REQ_F_NEED_CLEANUP;
 4747	return ret;
 4748}
 4749
 4750static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4751{
 4752	struct io_sr_msg *sr = &req->sr_msg;
 4753
 4754	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4755		return -EINVAL;
 4756
 4757	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4758	sr->len = READ_ONCE(sqe->len);
 4759	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
 4760	if (sr->msg_flags & MSG_DONTWAIT)
 4761		req->flags |= REQ_F_NOWAIT;
 4762
 4763#ifdef CONFIG_COMPAT
 4764	if (req->ctx->compat)
 4765		sr->msg_flags |= MSG_CMSG_COMPAT;
 4766#endif
 4767	return 0;
 4768}
 4769
 4770static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 4771{
 4772	struct io_async_msghdr iomsg, *kmsg;
 4773	struct socket *sock;
 4774	unsigned flags;
 4775	int min_ret = 0;
 4776	int ret;
 4777
 4778	sock = sock_from_file(req->file);
 4779	if (unlikely(!sock))
 4780		return -ENOTSOCK;
 4781
 4782	if (req_has_async_data(req)) {
 4783		kmsg = req->async_data;
 4784	} else {
 4785		ret = io_sendmsg_copy_hdr(req, &iomsg);
 4786		if (ret)
 4787			return ret;
 4788		kmsg = &iomsg;
 4789	}
 4790
 4791	flags = req->sr_msg.msg_flags;
 4792	if (issue_flags & IO_URING_F_NONBLOCK)
 4793		flags |= MSG_DONTWAIT;
 4794	if (flags & MSG_WAITALL)
 4795		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
 4796
 4797	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
 4798	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
 4799		return io_setup_async_msg(req, kmsg);
 4800	if (ret == -ERESTARTSYS)
 4801		ret = -EINTR;
 4802
 4803	/* fast path, check for non-NULL to avoid function call */
 4804	if (kmsg->free_iov)
 4805		kfree(kmsg->free_iov);
 4806	req->flags &= ~REQ_F_NEED_CLEANUP;
 4807	if (ret < min_ret)
 4808		req_set_fail(req);
 4809	__io_req_complete(req, issue_flags, ret, 0);
 4810	return 0;
 4811}
 4812
 4813static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 4814{
 4815	struct io_sr_msg *sr = &req->sr_msg;
 4816	struct msghdr msg;
 4817	struct iovec iov;
 4818	struct socket *sock;
 4819	unsigned flags;
 4820	int min_ret = 0;
 4821	int ret;
 4822
 4823	sock = sock_from_file(req->file);
 4824	if (unlikely(!sock))
 4825		return -ENOTSOCK;
 4826
 4827	ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
 4828	if (unlikely(ret))
 4829		return ret;
 4830
 4831	msg.msg_name = NULL;
 4832	msg.msg_control = NULL;
 4833	msg.msg_controllen = 0;
 4834	msg.msg_namelen = 0;
 4835
 4836	flags = req->sr_msg.msg_flags;
 4837	if (issue_flags & IO_URING_F_NONBLOCK)
 4838		flags |= MSG_DONTWAIT;
 4839	if (flags & MSG_WAITALL)
 4840		min_ret = iov_iter_count(&msg.msg_iter);
 4841
 4842	msg.msg_flags = flags;
 4843	ret = sock_sendmsg(sock, &msg);
 4844	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
 4845		return -EAGAIN;
 4846	if (ret == -ERESTARTSYS)
 4847		ret = -EINTR;
 4848
 4849	if (ret < min_ret)
 4850		req_set_fail(req);
 4851	__io_req_complete(req, issue_flags, ret, 0);
 4852	return 0;
 4853}
 4854
 4855static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
 4856				 struct io_async_msghdr *iomsg)
 4857{
 4858	struct io_sr_msg *sr = &req->sr_msg;
 4859	struct iovec __user *uiov;
 4860	size_t iov_len;
 4861	int ret;
 4862
 4863	ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
 4864					&iomsg->uaddr, &uiov, &iov_len);
 4865	if (ret)
 4866		return ret;
 4867
 4868	if (req->flags & REQ_F_BUFFER_SELECT) {
 4869		if (iov_len > 1)
 4870			return -EINVAL;
 4871		if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
 4872			return -EFAULT;
 4873		sr->len = iomsg->fast_iov[0].iov_len;
 4874		iomsg->free_iov = NULL;
 4875	} else {
 4876		iomsg->free_iov = iomsg->fast_iov;
 4877		ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
 4878				     &iomsg->free_iov, &iomsg->msg.msg_iter,
 4879				     false);
 4880		if (ret > 0)
 4881			ret = 0;
 4882	}
 4883
 4884	return ret;
 4885}
 4886
 4887#ifdef CONFIG_COMPAT
 4888static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 4889					struct io_async_msghdr *iomsg)
 4890{
 4891	struct io_sr_msg *sr = &req->sr_msg;
 4892	struct compat_iovec __user *uiov;
 4893	compat_uptr_t ptr;
 4894	compat_size_t len;
 4895	int ret;
 4896
 4897	ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
 4898				  &ptr, &len);
 4899	if (ret)
 4900		return ret;
 4901
 4902	uiov = compat_ptr(ptr);
 4903	if (req->flags & REQ_F_BUFFER_SELECT) {
 4904		compat_ssize_t clen;
 4905
 4906		if (len > 1)
 4907			return -EINVAL;
 4908		if (!access_ok(uiov, sizeof(*uiov)))
 4909			return -EFAULT;
 4910		if (__get_user(clen, &uiov->iov_len))
 4911			return -EFAULT;
 4912		if (clen < 0)
 4913			return -EINVAL;
 4914		sr->len = clen;
 4915		iomsg->free_iov = NULL;
 4916	} else {
 4917		iomsg->free_iov = iomsg->fast_iov;
 4918		ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
 4919				   UIO_FASTIOV, &iomsg->free_iov,
 4920				   &iomsg->msg.msg_iter, true);
 4921		if (ret < 0)
 4922			return ret;
 4923	}
 4924
 4925	return 0;
 4926}
 4927#endif
 4928
 4929static int io_recvmsg_copy_hdr(struct io_kiocb *req,
 4930			       struct io_async_msghdr *iomsg)
 4931{
 4932	iomsg->msg.msg_name = &iomsg->addr;
 4933
 4934#ifdef CONFIG_COMPAT
 4935	if (req->ctx->compat)
 4936		return __io_compat_recvmsg_copy_hdr(req, iomsg);
 4937#endif
 4938
 4939	return __io_recvmsg_copy_hdr(req, iomsg);
 4940}
 4941
 4942static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
 4943					       unsigned int issue_flags)
 4944{
 4945	struct io_sr_msg *sr = &req->sr_msg;
 4946
 4947	return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
 4948}
 4949
 4950static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
 4951{
 4952	return io_put_kbuf(req, req->kbuf);
 4953}
 4954
 4955static int io_recvmsg_prep_async(struct io_kiocb *req)
 4956{
 4957	int ret;
 4958
 4959	ret = io_recvmsg_copy_hdr(req, req->async_data);
 4960	if (!ret)
 4961		req->flags |= REQ_F_NEED_CLEANUP;
 4962	return ret;
 4963}
 4964
 4965static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4966{
 4967	struct io_sr_msg *sr = &req->sr_msg;
 4968
 4969	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4970		return -EINVAL;
 4971
 4972	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4973	sr->len = READ_ONCE(sqe->len);
 4974	sr->bgid = READ_ONCE(sqe->buf_group);
 4975	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
 4976	if (sr->msg_flags & MSG_DONTWAIT)
 4977		req->flags |= REQ_F_NOWAIT;
 4978
 4979#ifdef CONFIG_COMPAT
 4980	if (req->ctx->compat)
 4981		sr->msg_flags |= MSG_CMSG_COMPAT;
 4982#endif
 4983	return 0;
 4984}
 4985
 4986static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 4987{
 4988	struct io_async_msghdr iomsg, *kmsg;
 4989	struct socket *sock;
 4990	struct io_buffer *kbuf;
 4991	unsigned flags;
 4992	int min_ret = 0;
 4993	int ret, cflags = 0;
 4994	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 4995
 4996	sock = sock_from_file(req->file);
 4997	if (unlikely(!sock))
 4998		return -ENOTSOCK;
 4999
 5000	if (req_has_async_data(req)) {
 5001		kmsg = req->async_data;
 5002	} else {
 5003		ret = io_recvmsg_copy_hdr(req, &iomsg);
 5004		if (ret)
 5005			return ret;
 5006		kmsg = &iomsg;
 5007	}
 5008
 5009	if (req->flags & REQ_F_BUFFER_SELECT) {
 5010		kbuf = io_recv_buffer_select(req, issue_flags);
 5011		if (IS_ERR(kbuf))
 5012			return PTR_ERR(kbuf);
 5013		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
 5014		kmsg->fast_iov[0].iov_len = req->sr_msg.len;
 5015		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
 5016				1, req->sr_msg.len);
 5017	}
 5018
 5019	flags = req->sr_msg.msg_flags;
 5020	if (force_nonblock)
 5021		flags |= MSG_DONTWAIT;
 5022	if (flags & MSG_WAITALL)
 5023		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
 5024
 5025	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
 5026					kmsg->uaddr, flags);
 5027	if (force_nonblock && ret == -EAGAIN)
 5028		return io_setup_async_msg(req, kmsg);
 5029	if (ret == -ERESTARTSYS)
 5030		ret = -EINTR;
 5031
 5032	if (req->flags & REQ_F_BUFFER_SELECTED)
 5033		cflags = io_put_recv_kbuf(req);
 5034	/* fast path, check for non-NULL to avoid function call */
 5035	if (kmsg->free_iov)
 5036		kfree(kmsg->free_iov);
 5037	req->flags &= ~REQ_F_NEED_CLEANUP;
 5038	if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
 5039		req_set_fail(req);
 5040	__io_req_complete(req, issue_flags, ret, cflags);
 5041	return 0;
 5042}
 5043
 5044static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 5045{
 5046	struct io_buffer *kbuf;
 5047	struct io_sr_msg *sr = &req->sr_msg;
 5048	struct msghdr msg;
 5049	void __user *buf = sr->buf;
 5050	struct socket *sock;
 5051	struct iovec iov;
 5052	unsigned flags;
 5053	int min_ret = 0;
 5054	int ret, cflags = 0;
 5055	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 5056
 5057	sock = sock_from_file(req->file);
 5058	if (unlikely(!sock))
 5059		return -ENOTSOCK;
 5060
 5061	if (req->flags & REQ_F_BUFFER_SELECT) {
 5062		kbuf = io_recv_buffer_select(req, issue_flags);
 5063		if (IS_ERR(kbuf))
 5064			return PTR_ERR(kbuf);
 5065		buf = u64_to_user_ptr(kbuf->addr);
 5066	}
 5067
 5068	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
 5069	if (unlikely(ret))
 5070		goto out_free;
 5071
 5072	msg.msg_name = NULL;
 5073	msg.msg_control = NULL;
 5074	msg.msg_controllen = 0;
 5075	msg.msg_namelen = 0;
 5076	msg.msg_iocb = NULL;
 5077	msg.msg_flags = 0;
 5078
 5079	flags = req->sr_msg.msg_flags;
 5080	if (force_nonblock)
 5081		flags |= MSG_DONTWAIT;
 5082	if (flags & MSG_WAITALL)
 5083		min_ret = iov_iter_count(&msg.msg_iter);
 5084
 5085	ret = sock_recvmsg(sock, &msg, flags);
 5086	if (force_nonblock && ret == -EAGAIN)
 5087		return -EAGAIN;
 5088	if (ret == -ERESTARTSYS)
 5089		ret = -EINTR;
 5090out_free:
 5091	if (req->flags & REQ_F_BUFFER_SELECTED)
 5092		cflags = io_put_recv_kbuf(req);
 5093	if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
 5094		req_set_fail(req);
 5095	__io_req_complete(req, issue_flags, ret, cflags);
 5096	return 0;
 5097}
 5098
 5099static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 5100{
 5101	struct io_accept *accept = &req->accept;
 5102
 5103	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5104		return -EINVAL;
 5105	if (sqe->ioprio || sqe->len || sqe->buf_index)
 5106		return -EINVAL;
 5107
 5108	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
 5109	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 5110	accept->flags = READ_ONCE(sqe->accept_flags);
 5111	accept->nofile = rlimit(RLIMIT_NOFILE);
 5112
 5113	accept->file_slot = READ_ONCE(sqe->file_index);
 5114	if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
 5115				  (accept->flags & SOCK_CLOEXEC)))
 5116		return -EINVAL;
 5117	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
 5118		return -EINVAL;
 5119	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
 5120		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
 5121	return 0;
 5122}
 5123
 5124static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
 5125{
 5126	struct io_accept *accept = &req->accept;
 5127	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 5128	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
 5129	bool fixed = !!accept->file_slot;
 5130	struct file *file;
 5131	int ret, fd;
 5132
 5133	if (req->file->f_flags & O_NONBLOCK)
 5134		req->flags |= REQ_F_NOWAIT;
 5135
 5136	if (!fixed) {
 5137		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
 5138		if (unlikely(fd < 0))
 5139			return fd;
 5140	}
 5141	file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
 5142			 accept->flags);
 5143	if (IS_ERR(file)) {
 5144		if (!fixed)
 5145			put_unused_fd(fd);
 5146		ret = PTR_ERR(file);
 5147		if (ret == -EAGAIN && force_nonblock)
 5148			return -EAGAIN;
 5149		if (ret == -ERESTARTSYS)
 5150			ret = -EINTR;
 5151		req_set_fail(req);
 5152	} else if (!fixed) {
 5153		fd_install(fd, file);
 5154		ret = fd;
 5155	} else {
 5156		ret = io_install_fixed_file(req, file, issue_flags,
 5157					    accept->file_slot - 1);
 5158	}
 5159	__io_req_complete(req, issue_flags, ret, 0);
 5160	return 0;
 5161}
 5162
 5163static int io_connect_prep_async(struct io_kiocb *req)
 5164{
 5165	struct io_async_connect *io = req->async_data;
 5166	struct io_connect *conn = &req->connect;
 5167
 5168	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
 5169}
 5170
 5171static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 5172{
 5173	struct io_connect *conn = &req->connect;
 5174
 5175	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5176		return -EINVAL;
 5177	if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
 5178	    sqe->splice_fd_in)
 5179		return -EINVAL;
 5180
 5181	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
 5182	conn->addr_len =  READ_ONCE(sqe->addr2);
 5183	return 0;
 5184}
 5185
 5186static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
 5187{
 5188	struct io_async_connect __io, *io;
 5189	unsigned file_flags;
 5190	int ret;
 5191	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 5192
 5193	if (req_has_async_data(req)) {
 5194		io = req->async_data;
 5195	} else {
 5196		ret = move_addr_to_kernel(req->connect.addr,
 5197						req->connect.addr_len,
 5198						&__io.address);
 5199		if (ret)
 5200			goto out;
 5201		io = &__io;
 5202	}
 5203
 5204	file_flags = force_nonblock ? O_NONBLOCK : 0;
 5205
 5206	ret = __sys_connect_file(req->file, &io->address,
 5207					req->connect.addr_len, file_flags);
 5208	if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
 5209		if (req_has_async_data(req))
 5210			return -EAGAIN;
 5211		if (io_alloc_async_data(req)) {
 5212			ret = -ENOMEM;
 5213			goto out;
 5214		}
 5215		memcpy(req->async_data, &__io, sizeof(__io));
 5216		return -EAGAIN;
 5217	}
 5218	if (ret == -ERESTARTSYS)
 5219		ret = -EINTR;
 5220out:
 5221	if (ret < 0)
 5222		req_set_fail(req);
 5223	__io_req_complete(req, issue_flags, ret, 0);
 5224	return 0;
 5225}
 5226#else /* !CONFIG_NET */
 5227#define IO_NETOP_FN(op)							\
 5228static int io_##op(struct io_kiocb *req, unsigned int issue_flags)	\
 5229{									\
 5230	return -EOPNOTSUPP;						\
 5231}
 5232
 5233#define IO_NETOP_PREP(op)						\
 5234IO_NETOP_FN(op)								\
 5235static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
 5236{									\
 5237	return -EOPNOTSUPP;						\
 5238}									\
 5239
 5240#define IO_NETOP_PREP_ASYNC(op)						\
 5241IO_NETOP_PREP(op)							\
 5242static int io_##op##_prep_async(struct io_kiocb *req)			\
 5243{									\
 5244	return -EOPNOTSUPP;						\
 5245}
 5246
 5247IO_NETOP_PREP_ASYNC(sendmsg);
 5248IO_NETOP_PREP_ASYNC(recvmsg);
 5249IO_NETOP_PREP_ASYNC(connect);
 5250IO_NETOP_PREP(accept);
 5251IO_NETOP_FN(send);
 5252IO_NETOP_FN(recv);
 5253#endif /* CONFIG_NET */
 5254
 5255struct io_poll_table {
 5256	struct poll_table_struct pt;
 5257	struct io_kiocb *req;
 5258	int nr_entries;
 5259	int error;
 5260};
 5261
 /*
  * Common body of the poll wake callbacks: filter on the requested events,
  * unhook the wait entry, record the event mask in req->result and queue
  * @func through io_req_task_work_add().
  *
  * Returns 0 if @mask is non-zero but matches none of poll->events (the
  * wakeup is ignored), 1 once the request has been queued.
  */
 5262static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 5263			   __poll_t mask, io_req_tw_func_t func)
 5264{
 5265	/* for instances that support it check for an event match first: */
 5266	if (mask && !(mask & poll->events))
 5267		return 0;
 5268
 5269	trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
 5270
 5271	list_del_init(&poll->wait.entry);
 5272
 5273	req->result = mask;
 5274	req->io_task_work.func = func;
 5275
 5276	/*
 5277	 * If this fails, then the task is exiting. When a task exits, the
 5278	 * work gets canceled, so just cancel this request as well instead
 5279	 * of executing it. We can't safely execute it anyway, as we may not
 5280	 * have the state needed for it.
 	 *
 	 * NOTE(review): no result of io_req_task_work_add() is checked here,
 	 * so the failure handling described above is not done in this
 	 * function - confirm it happens inside io_req_task_work_add().
 5281	 */
 5282	io_req_task_work_add(req);
 5283	return 1;
 5284}
 5285
 /*
  * Used by the poll task-work handlers to decide whether a woken poll
  * request really has something to report.
  *
  * If no event was recorded in req->result and the poll was not canceled,
  * the file is polled once more. ctx->completion_lock is then taken and, if
  * there is still nothing to report, poll->wait is put back on poll->head.
  *
  * Returns true if the request was re-armed, false if the caller should go
  * on and complete it. In both cases ctx->completion_lock is held on return
  * and the caller has to drop it.
  */
 5286static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
 5287	__acquires(&req->ctx->completion_lock)
 5288{
 5289	struct io_ring_ctx *ctx = req->ctx;
 5290
 5291	/* req->task == current here, checking PF_EXITING is safe */
 5292	if (unlikely(req->task->flags & PF_EXITING))
 5293		WRITE_ONCE(poll->canceled, true);
 5294
 5295	if (!req->result && !READ_ONCE(poll->canceled)) {
 5296		struct poll_table_struct pt = { ._key = poll->events };
 5297
 5298		req->result = vfs_poll(req->file, &pt) & poll->events;
 5299	}
 5300
 5301	spin_lock(&ctx->completion_lock);
 	/* re-check under the lock before going back to sleep */
 5302	if (!req->result && !READ_ONCE(poll->canceled)) {
 5303		add_wait_queue(poll->head, &poll->wait);
 5304		return true;
 5305	}
 5306
 5307	return false;
 5308}
 5309
 /*
  * Return the secondary ("double") poll entry of @req. The result can be
  * NULL when no second entry was set up; callers check for that.
  */
 5310static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
 5311{
 5312	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
 5313	if (req->opcode == IORING_OP_POLL_ADD)
 5314		return req->async_data;
 5315	return req->apoll->double_poll;
 5316}
 5317
 /*
  * Return the primary poll entry of @req: the one embedded in the request
  * for IORING_OP_POLL_ADD, otherwise the one inside req->apoll.
  */
 5318static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
 5319{
 5320	if (req->opcode == IORING_OP_POLL_ADD)
 5321		return &req->poll;
 5322	return &req->apoll->poll;
 5323}
 5324
 /*
  * Detach the secondary poll entry of @req, if there is one and it is still
  * attached to a waitqueue head. Must be called with ctx->completion_lock
  * held.
  */
 5325static void io_poll_remove_double(struct io_kiocb *req)
 5326	__must_hold(&req->ctx->completion_lock)
 5327{
 5328	struct io_poll_iocb *poll = io_poll_get_double(req);
 5329
 5330	lockdep_assert_held(&req->ctx->completion_lock);
 5331
 5332	if (poll && poll->head) {
 5333		struct wait_queue_head *head = poll->head;
 5334
 5335		spin_lock_irq(&head->lock);
 5336		list_del_init(&poll->wait.entry);
 		/*
 		 * io_poll_double_wake() clears ->private before it drops the
 		 * reference itself, so only put it if that has not happened.
 		 */
 5337		if (poll->wait.private)
 5338			req_ref_put(req);
 5339		poll->head = NULL;
 5340		spin_unlock_irq(&head->lock);
 5341	}
 5342}
 5343
 /*
  * Fill a CQE for poll request @req, carrying the result for @mask or
  * -ECANCELED if the poll was canceled. Must be called with
  * ctx->completion_lock held.
  *
  * Returns true if the request is finished, i.e. the CQE was not flagged
  * IORING_CQE_F_MORE: the poll is one-shot, was canceled, or
  * io_cqring_fill_event() returned false (which also switches the poll to
  * one-shot). Returns false for a multishot poll that stays armed;
  * ctx->cq_extra is bumped for each such CQE.
  */
 5344static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
 5345	__must_hold(&req->ctx->completion_lock)
 5346{
 5347	struct io_ring_ctx *ctx = req->ctx;
 5348	unsigned flags = IORING_CQE_F_MORE;
 5349	int error;
 5350
 5351	if (READ_ONCE(req->poll.canceled)) {
 5352		error = -ECANCELED;
 5353		req->poll.events |= EPOLLONESHOT;
 5354	} else {
 5355		error = mangle_poll(mask);
 5356	}
 5357	if (req->poll.events & EPOLLONESHOT)
 5358		flags = 0;
 5359	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
 5360		req->poll.events |= EPOLLONESHOT;
 5361		flags = 0;
 5362	}
 5363	if (flags & IORING_CQE_F_MORE)
 5364		ctx->cq_extra++;
 5365
 5366	return !(flags & IORING_CQE_F_MORE);
 5367}
 5368
 /*
  * Task-work handler for IORING_OP_POLL_ADD requests (queued from
  * io_poll_wake()). Either re-arms the poll via io_poll_rewait(), or posts
  * the completion. A multishot poll that is not done is put back on its
  * waitqueue; a finished one is unhashed, marked done and put.
  */
 5369static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 5370{
 5371	struct io_ring_ctx *ctx = req->ctx;
 5372	struct io_kiocb *nxt;
 5373
 	/* io_poll_rewait() returns with ->completion_lock held either way */
 5374	if (io_poll_rewait(req, &req->poll)) {
 5375		spin_unlock(&ctx->completion_lock);
 5376	} else {
 5377		bool done;
 5378
 		/* already completed elsewhere, nothing left to post */
 5379		if (req->poll.done) {
 5380			spin_unlock(&ctx->completion_lock);
 5381			return;
 5382		}
 5383		done = __io_poll_complete(req, req->result);
 5384		if (done) {
 5385			io_poll_remove_double(req);
 5386			hash_del(&req->hash_node);
 5387			req->poll.done = true;
 5388		} else {
 			/* multishot: clear the result and wait for the next event */
 5389			req->result = 0;
 5390			add_wait_queue(req->poll.head, &req->poll.wait);
 5391		}
 5392		io_commit_cqring(ctx);
 5393		spin_unlock(&ctx->completion_lock);
 5394		io_cqring_ev_posted(ctx);
 5395
 5396		if (done) {
 5397			nxt = io_put_req_find_next(req);
 5398			if (nxt)
 5399				io_req_task_submit(nxt, locked);
 5400		}
 5401	}
 5402}
 5403
 /*
  * Wake callback of the secondary waitqueue entry of a double poll.
  *
  * Wakeups matching none of the primary entry's events are ignored
  * (returns 0). If the primary entry is not EPOLLONESHOT the call is simply
  * forwarded to the primary entry's wake function. Otherwise this entry is
  * unhooked, the primary entry is taken off its waitqueue if it is still
  * queued and its wake function is invoked, and a request reference is
  * dropped (__io_queue_proc() takes one when it sets up this entry).
  */
 5404static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
 5405			       int sync, void *key)
 5406{
 5407	struct io_kiocb *req = wait->private;
 5408	struct io_poll_iocb *poll = io_poll_get_single(req);
 5409	__poll_t mask = key_to_poll(key);
 5410	unsigned long flags;
 5411
 5412	/* for instances that support it check for an event match first: */
 5413	if (mask && !(mask & poll->events))
 5414		return 0;
 5415	if (!(poll->events & EPOLLONESHOT))
 5416		return poll->wait.func(&poll->wait, mode, sync, key);
 5417
 5418	list_del_init(&wait->entry);
 5419
 5420	if (poll->head) {
 5421		bool done;
 5422
 5423		spin_lock_irqsave(&poll->head->lock, flags);
 		/* "done" means the primary entry was already off its waitqueue */
 5424		done = list_empty(&poll->wait.entry);
 5425		if (!done)
 5426			list_del_init(&poll->wait.entry);
 5427		/* make sure double remove sees this as being gone */
 5428		wait->private = NULL;
 5429		spin_unlock_irqrestore(&poll->head->lock, flags);
 5430		if (!done) {
 5431			/* use wait func handler, so it matches the rq type */
 5432			poll->wait.func(&poll->wait, mode, sync, key);
 5433		}
 5434	}
 5435	req_ref_put(req);
 5436	return 1;
 5437}
 5438
 /*
  * Reset @poll to its initial state (no head, not done, not canceled) and
  * set up its wait entry to call @wake_func. The stored event mask is
  * @events with the IO_POLL_UNMASK bits always added.
  */
 5439static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
 5440			      wait_queue_func_t wake_func)
 5441{
 5442	poll->head = NULL;
 5443	poll->done = false;
 5444	poll->canceled = false;
 5445#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
 5446	/* mask in events that we always want/need */
 5447	poll->events = events | IO_POLL_UNMASK;
 5448	INIT_LIST_HEAD(&poll->wait.entry);
 5449	init_waitqueue_func_entry(&poll->wait, wake_func);
 5450}
 5451
 /*
  * Common body of the poll_table queue callbacks: hook the request onto
  * waitqueue @head.
  *
  * The first call uses @poll itself. If the file calls back again with a
  * different waitqueue head, a second io_poll_iocb is allocated, stored in
  * *@poll_ptr and set up with io_poll_double_wake(); it takes its own
  * request reference. A head that was already seen is ignored and a third
  * distinct head fails with -EINVAL. Errors are reported via pt->error.
  */
 5452static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
 5453			    struct wait_queue_head *head,
 5454			    struct io_poll_iocb **poll_ptr)
 5455{
 5456	struct io_kiocb *req = pt->req;
 5457
 5458	/*
 5459	 * The file being polled uses multiple waitqueues for poll handling
 5460	 * (e.g. one for read, one for write). Setup a separate io_poll_iocb
 5461	 * if this happens.
 5462	 */
 5463	if (unlikely(pt->nr_entries)) {
 5464		struct io_poll_iocb *poll_one = poll;
 5465
 5466		/* double add on the same waitqueue head, ignore */
 5467		if (poll_one->head == head)
 5468			return;
 5469		/* already have a 2nd entry, fail a third attempt */
 5470		if (*poll_ptr) {
 5471			if ((*poll_ptr)->head == head)
 5472				return;
 5473			pt->error = -EINVAL;
 5474			return;
 5475		}
 5476		/*
 5477		 * Can't handle multishot for double wait for now, turn it
 5478		 * into one-shot mode.
 5479		 */
 5480		if (!(poll_one->events & EPOLLONESHOT))
 5481			poll_one->events |= EPOLLONESHOT;
 5482		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
 5483		if (!poll) {
 5484			pt->error = -ENOMEM;
 5485			return;
 5486		}
 5487		io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
 		/* the second entry pins the request with its own reference */
 5488		req_ref_get(req);
 5489		poll->wait.private = req;
 5490
 5491		*poll_ptr = poll;
 		/* for pure poll *poll_ptr is req->async_data, so flag it as set */
 5492		if (req->opcode == IORING_OP_POLL_ADD)
 5493			req->flags |= REQ_F_ASYNC_DATA;
 5494	}
 5495
 5496	pt->nr_entries++;
 5497	poll->head = head;
 5498
 5499	if (poll->events & EPOLLEXCLUSIVE)
 5500		add_wait_queue_exclusive(head, &poll->wait);
 5501	else
 5502		add_wait_queue(head, &poll->wait);
 5503}
 5504
 /*
  * poll_table queue callback for internal (async) poll: queues onto @head
  * using the entries kept in req->apoll. @file is not used.
  */
 5505static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
 5506			       struct poll_table_struct *p)
 5507{
 5508	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
 5509	struct async_poll *apoll = pt->req->apoll;
 5510
 5511	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
 5512}
 5513
 /*
  * Task-work handler for internal (async) poll, queued from
  * io_async_wake(). If io_poll_rewait() re-armed the poll there is nothing
  * more to do. Otherwise the request is unhashed, its double entry removed
  * and the poll marked done; the request is then resubmitted, or failed
  * with -ECANCELED if the poll was canceled.
  */
 5514static void io_async_task_func(struct io_kiocb *req, bool *locked)
 5515{
 5516	struct async_poll *apoll = req->apoll;
 5517	struct io_ring_ctx *ctx = req->ctx;
 5518
 5519	trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
 5520
 	/* io_poll_rewait() returns with ->completion_lock held either way */
 5521	if (io_poll_rewait(req, &apoll->poll)) {
 5522		spin_unlock(&ctx->completion_lock);
 5523		return;
 5524	}
 5525
 5526	hash_del(&req->hash_node);
 5527	io_poll_remove_double(req);
 5528	apoll->poll.done = true;
 5529	spin_unlock(&ctx->completion_lock);
 5530
 5531	if (!READ_ONCE(apoll->poll.canceled))
 5532		io_req_task_submit(req, locked);
 5533	else
 5534		io_req_complete_failed(req, -ECANCELED);
 5535}
 5536
 /*
  * Waitqueue wake callback for the primary entry of an internal (async)
  * poll: traces the wakeup and defers to __io_async_wake() with
  * io_async_task_func() as the task-work handler.
  */
 5537static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 5538			void *key)
 5539{
 5540	struct io_kiocb *req = wait->private;
 5541	struct io_poll_iocb *poll = &req->apoll->poll;
 5542
 5543	trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
 5544					key_to_poll(key));
 5545
 5546	return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
 5547}
 5548
 /*
  * Add @req to ctx->cancel_hash, in the bucket selected by hashing
  * req->user_data. io_poll_find() looks requests up with the same hash.
  */
 5549static void io_poll_req_insert(struct io_kiocb *req)
 5550{
 5551	struct io_ring_ctx *ctx = req->ctx;
 5552	struct hlist_head *list;
 5553
 5554	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
 5555	hlist_add_head(&req->hash_node, list);
 5556}
 5557
 /*
  * Initialise @poll for @req and arm it on req->file through vfs_poll(),
  * using the queue callback the caller stored in ipt->pt._qproc.
  *
  * Returns the mask of events that are already pending and should be
  * handled by the caller right away, or 0. Failures are reported in
  * ipt->error. ctx->completion_lock is held on return and has to be
  * released by the caller.
  */
 5558static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
 5559				      struct io_poll_iocb *poll,
 5560				      struct io_poll_table *ipt, __poll_t mask,
 5561				      wait_queue_func_t wake_func)
 5562	__acquires(&ctx->completion_lock)
 5563{
 5564	struct io_ring_ctx *ctx = req->ctx;
 5565	bool cancel = false;
 5566
 5567	INIT_HLIST_NODE(&req->hash_node);
 5568	io_init_poll_iocb(poll, mask, wake_func);
 5569	poll->file = req->file;
 5570	poll->wait.private = req;
 5571
 5572	ipt->pt._key = mask;
 5573	ipt->req = req;
 5574	ipt->error = 0;
 5575	ipt->nr_entries = 0;
 5576
 5577	mask = vfs_poll(req->file, &ipt->pt) & poll->events;
 	/* the queue callback never added a wait entry: treat it as an error */
 5578	if (unlikely(!ipt->nr_entries) && !ipt->error)
 5579		ipt->error = -EINVAL;
 5580
 5581	spin_lock(&ctx->completion_lock);
 5582	if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
 5583		io_poll_remove_double(req);
 5584	if (likely(poll->head)) {
 5585		spin_lock_irq(&poll->head->lock);
 		/*
 		 * The wait entry is already off the waitqueue (the wake
 		 * handlers unhook it), so report neither events nor an
 		 * error from here; a pending error becomes a cancelation.
 		 */
 5586		if (unlikely(list_empty(&poll->wait.entry))) {
 5587			if (ipt->error)
 5588				cancel = true;
 5589			ipt->error = 0;
 5590			mask = 0;
 5591		}
 5592		if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
 5593			list_del_init(&poll->wait.entry);
 5594		else if (cancel)
 5595			WRITE_ONCE(poll->canceled, true);
 5596		else if (!poll->done) /* actually waiting for an event */
 5597			io_poll_req_insert(req);
 5598		spin_unlock_irq(&poll->head->lock);
 5599	}
 5600
 5601	return mask;
 5602}
 5603
 /* Return values of io_arm_poll_handler() */
 5604enum {
 	/* poll armed, nothing pending yet */
 5605	IO_APOLL_OK,
 	/* poll can't be used for this request, or arming failed */
 5606	IO_APOLL_ABORTED,
 	/* requested events were already pending when arming */
 5607	IO_APOLL_READY
 5608};
 5609
 /*
  * Arm an internal (async) poll on req->file for @req.
  *
  * Returns IO_APOLL_ABORTED if the opcode sets neither pollin nor pollout,
  * the file can't be polled, the request has REQ_F_POLLED set already, the
  * allocation fails or arming reports an error; IO_APOLL_READY if requested
  * events were already pending; IO_APOLL_OK if the poll is now armed.
  */
 5610static int io_arm_poll_handler(struct io_kiocb *req)
 5611{
 5612	const struct io_op_def *def = &io_op_defs[req->opcode];
 5613	struct io_ring_ctx *ctx = req->ctx;
 5614	struct async_poll *apoll;
 5615	struct io_poll_table ipt;
 5616	__poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
 5617
 5618	if (!def->pollin && !def->pollout)
 5619		return IO_APOLL_ABORTED;
 5620	if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
 5621		return IO_APOLL_ABORTED;
 5622
 5623	if (def->pollin) {
 5624		mask |= POLLIN | POLLRDNORM;
 5625
 5626		/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
 5627		if ((req->opcode == IORING_OP_RECVMSG) &&
 5628		    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
 5629			mask &= ~POLLIN;
 5630	} else {
 5631		mask |= POLLOUT | POLLWRNORM;
 5632	}
 5633
 5634	apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
 5635	if (unlikely(!apoll))
 5636		return IO_APOLL_ABORTED;
 5637	apoll->double_poll = NULL;
 5638	req->apoll = apoll;
 5639	req->flags |= REQ_F_POLLED;
 5640	ipt.pt._qproc = io_async_queue_proc;
 5641	io_req_set_refcount(req);
 5642
 	/* returns with ->completion_lock held */
 5643	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
 5644					io_async_wake);
 5645	spin_unlock(&ctx->completion_lock);
 5646	if (ret || ipt.error)
 5647		return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
 5648
 5649	trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
 5650				mask, apoll->poll.events);
 5651	return IO_APOLL_OK;
 5652}
 5653
 /*
  * Take @poll off its waitqueue and unhash @req, marking the poll canceled
  * first if @do_cancel is set. Must be called with ctx->completion_lock
  * held.
  *
  * Returns true if the wait entry was still queued and has been removed
  * here. Returns false if it was already gone, or if @poll has no head, in
  * which case nothing is touched (the request is not unhashed either).
  */
 5654static bool __io_poll_remove_one(struct io_kiocb *req,
 5655				 struct io_poll_iocb *poll, bool do_cancel)
 5656	__must_hold(&req->ctx->completion_lock)
 5657{
 5658	bool do_complete = false;
 5659
 5660	if (!poll->head)
 5661		return false;
 5662	spin_lock_irq(&poll->head->lock);
 5663	if (do_cancel)
 5664		WRITE_ONCE(poll->canceled, true);
 5665	if (!list_empty(&poll->wait.entry)) {
 5666		list_del_init(&poll->wait.entry);
 5667		do_complete = true;
 5668	}
 5669	spin_unlock_irq(&poll->head->lock);
 5670	hash_del(&req->hash_node);
 5671	return do_complete;
 5672}
 5673
 /*
  * Cancel poll request @req: remove both of its poll entries and, if the
  * primary entry was still queued, fill a -ECANCELED CQE, mark the request
  * failed and put it. Must be called with ctx->completion_lock held.
  *
  * Returns true if a completion was posted here. The caller is expected to
  * call io_cqring_ev_posted() in that case, as io_poll_remove_all() does.
  */
 5674static bool io_poll_remove_one(struct io_kiocb *req)
 5675	__must_hold(&req->ctx->completion_lock)
 5676{
 5677	bool do_complete;
 5678
 5679	io_poll_remove_double(req);
 5680	do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
 5681
 5682	if (do_complete) {
 5683		io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
 5684		io_commit_cqring(req->ctx);
 5685		req_set_fail(req);
 5686		io_put_req_deferred(req);
 5687	}
 5688	return do_complete;
 5689}
 5690
 5691/*
 5692 * Returns true if we found and killed one or more poll requests
 5693 */
 5694static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
 5695				      struct task_struct *tsk, bool cancel_all)
 5696{
 5697	struct hlist_node *tmp;
 5698	struct io_kiocb *req;
 5699	int posted = 0, i;
 5700
 5701	spin_lock(&ctx->completion_lock);
 	/* walk every bucket of the cancel hash */
 5702	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
 5703		struct hlist_head *list;
 5704
 5705		list = &ctx->cancel_hash[i];
 		/* _safe variant: io_poll_remove_one() unhashes the entry */
 5706		hlist_for_each_entry_safe(req, tmp, list, hash_node) {
 5707			if (io_match_task(req, tsk, cancel_all))
 5708				posted += io_poll_remove_one(req);
 5709		}
 5710	}
 5711	spin_unlock(&ctx->completion_lock);
 5712
 	/* signal posted CQEs only after the lock has been dropped */
 5713	if (posted)
 5714		io_cqring_ev_posted(ctx);
 5715
 5716	return posted != 0;
 5717}
 5718
 /*
  * Look up a hashed poll request by user_data. With @poll_only set, only
  * IORING_OP_POLL_ADD requests match. Returns the first match in the
  * bucket or NULL. Must be called with ctx->completion_lock held.
  */
 5719static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
 5720				     bool poll_only)
 5721	__must_hold(&ctx->completion_lock)
 5722{
 5723	struct hlist_head *list;
 5724	struct io_kiocb *req;
 5725
 5726	list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
 5727	hlist_for_each_entry(req, list, hash_node) {
 5728		if (sqe_addr != req->user_data)
 5729			continue;
 5730		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
 5731			continue;
 5732		return req;
 5733	}
 5734	return NULL;
 5735}
 5736
 /*
  * Cancel the hashed poll request whose user_data is @sqe_addr. Must be
  * called with ctx->completion_lock held.
  *
  * Returns 0 if the request was canceled here, -ENOENT if none was found,
  * and -EALREADY if one was found but io_poll_remove_one() did not post a
  * completion for it.
  */
 5737static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
 5738			  bool poll_only)
 5739	__must_hold(&ctx->completion_lock)
 5740{
 5741	struct io_kiocb *req;
 5742
 5743	req = io_poll_find(ctx, sqe_addr, poll_only);
 5744	if (!req)
 5745		return -ENOENT;
 5746	if (io_poll_remove_one(req))
 5747		return 0;
 5748
 5749	return -EALREADY;
 5750}
 5751
 /*
  * Read the poll event mask from @sqe and convert it to a __poll_t.
  * EPOLLONESHOT is forced unless @flags contains IORING_POLL_ADD_MULTI.
  * EPOLLEXCLUSIVE and EPOLLONESHOT are carried over in addition to the
  * demangled event bits.
  */
 5752static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
 5753				     unsigned int flags)
 5754{
 5755	u32 events;
 5756
 5757	events = READ_ONCE(sqe->poll32_events);
 5758#ifdef __BIG_ENDIAN
 	/* the 16-bit halves are swapped on big endian */
 5759	events = swahw32(events);
 5760#endif
 5761	if (!(flags & IORING_POLL_ADD_MULTI))
 5762		events |= EPOLLONESHOT;
 5763	return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
 5764}
 5765
 /*
  * Validate and stash the arguments of a poll update/remove request
  * (handled by io_poll_update()).
  *
  * sqe->len carries the update flags, sqe->addr the user_data of the target
  * poll request, sqe->off the new user_data and sqe->poll32_events the new
  * events. Returns 0 or -EINVAL.
  */
 5766static int io_poll_update_prep(struct io_kiocb *req,
 5767			       const struct io_uring_sqe *sqe)
 5768{
 5769	struct io_poll_update *upd = &req->poll_update;
 5770	u32 flags;
 5771
 5772	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5773		return -EINVAL;
 5774	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
 5775		return -EINVAL;
 5776	flags = READ_ONCE(sqe->len);
 5777	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
 5778		      IORING_POLL_ADD_MULTI))
 5779		return -EINVAL;
 5780	/* meaningless without update */
 5781	if (flags == IORING_POLL_ADD_MULTI)
 5782		return -EINVAL;
 5783
 5784	upd->old_user_data = READ_ONCE(sqe->addr);
 5785	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
 5786	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
 5787
 5788	upd->new_user_data = READ_ONCE(sqe->off);
 	/* a new user_data / new events are only valid with the matching flag */
 5789	if (!upd->update_user_data && upd->new_user_data)
 5790		return -EINVAL;
 5791	if (upd->update_events)
 5792		upd->events = io_poll_parse_events(sqe, flags);
 5793	else if (sqe->poll32_events)
 5794		return -EINVAL;
 5795
 5796	return 0;
 5797}
 5798
 /*
  * Waitqueue wake callback for the primary entry of an IORING_OP_POLL_ADD
  * request: defers to __io_async_wake() with io_poll_task_func() as the
  * task-work handler.
  */
 5799static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 5800			void *key)
 5801{
 5802	struct io_kiocb *req = wait->private;
 5803	struct io_poll_iocb *poll = &req->poll;
 5804
 5805	return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
 5806}
 5807
 /*
  * poll_table queue callback for IORING_OP_POLL_ADD: the primary entry is
  * the one embedded in the request, a second entry (if needed) is stored
  * through req->async_data. @file is not used.
  */
 5808static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 5809			       struct poll_table_struct *p)
 5810{
 5811	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
 5812
 5813	__io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
 5814}
 5815
 /*
  * Validate an IORING_OP_POLL_ADD sqe and parse its event mask into
  * req->poll.events. sqe->len carries the flags; only IORING_POLL_ADD_MULTI
  * is accepted. Returns 0 or -EINVAL.
  */
 5816static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 5817{
 5818	struct io_poll_iocb *poll = &req->poll;
 5819	u32 flags;
 5820
 5821	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5822		return -EINVAL;
 5823	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
 5824		return -EINVAL;
 5825	flags = READ_ONCE(sqe->len);
 5826	if (flags & ~IORING_POLL_ADD_MULTI)
 5827		return -EINVAL;
 5828
 5829	io_req_set_refcount(req);
 5830	poll->events = io_poll_parse_events(sqe, flags);
 5831	return 0;
 5832}
 5833
 /*
  * Issue an IORING_OP_POLL_ADD request: arm the poll and, if events are
  * already pending, post the completion immediately. Returns the error
  * recorded while arming (ipt.error), which is 0 on success and is reset to
  * 0 whenever a completion is posted here.
  */
 5834static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 5835{
 5836	struct io_poll_iocb *poll = &req->poll;
 5837	struct io_ring_ctx *ctx = req->ctx;
 5838	struct io_poll_table ipt;
 5839	__poll_t mask;
 	/* only assigned and read when mask != 0 */
 5840	bool done;
 5841
 5842	ipt.pt._qproc = io_poll_queue_proc;
 5843
 	/* returns with ->completion_lock held */
 5844	mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
 5845					io_poll_wake);
 5846
 5847	if (mask) { /* no async, we'd stolen it */
 5848		ipt.error = 0;
 5849		done = __io_poll_complete(req, mask);
 5850		io_commit_cqring(req->ctx);
 5851	}
 5852	spin_unlock(&ctx->completion_lock);
 5853
 5854	if (mask) {
 5855		io_cqring_ev_posted(ctx);
 5856		if (done)
 5857			io_put_req(req);
 5858	}
 5859	return ipt.error;
 5860}
 5861
 /*
  * Issue a poll update/remove request against the IORING_OP_POLL_ADD
  * request whose user_data is req->poll_update.old_user_data.
  *
  * With neither update flag set the target is simply canceled. Otherwise
  * the target is detached from its waitqueue, its events and/or user_data
  * are changed, and it is re-issued through io_poll_add(). The update
  * request itself is completed with 0, -ENOENT (no such poll request) or
  * -EALREADY (the target is already completing). Always returns 0.
  */
 5862static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
 5863{
 5864	struct io_ring_ctx *ctx = req->ctx;
 5865	struct io_kiocb *preq;
 	/* true when the target must not be re-issued from here */
 5866	bool completing;
 5867	int ret;
 5868
 5869	spin_lock(&ctx->completion_lock);
 5870	preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
 5871	if (!preq) {
 5872		ret = -ENOENT;
 5873		goto err;
 5874	}
 5875
 	/* no update requested: this is a plain poll removal */
 5876	if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
 5877		completing = true;
 5878		ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
 5879		goto err;
 5880	}
 5881
 5882	/*
 5883	 * Don't allow racy completion with singleshot, as we cannot safely
 5884	 * update those. For multishot, if we're racing with completion, just
 5885	 * let completion re-add it.
 5886	 */
 5887	completing = !__io_poll_remove_one(preq, &preq->poll, false);
 5888	if (completing && (preq->poll.events & EPOLLONESHOT)) {
 5889		ret = -EALREADY;
 5890		goto err;
 5891	}
 5892	/* we now have a detached poll request. reissue. */
 5893	ret = 0;
 5894err:
 	/* the success paths fall through here as well, with ret == 0 */
 5895	if (ret < 0) {
 5896		spin_unlock(&ctx->completion_lock);
 5897		req_set_fail(req);
 5898		io_req_complete(req, ret);
 5899		return 0;
 5900	}
 5901	/* only mask one event flags, keep behavior flags */
 5902	if (req->poll_update.update_events) {
 5903		preq->poll.events &= ~0xffff;
 5904		preq->poll.events |= req->poll_update.events & 0xffff;
 5905		preq->poll.events |= IO_POLL_UNMASK;
 5906	}
 5907	if (req->poll_update.update_user_data)
 5908		preq->user_data = req->poll_update.new_user_data;
 5909	spin_unlock(&ctx->completion_lock);
 5910
 5911	/* complete update request, we're done with it */
 5912	io_req_complete(req, ret);
 5913
 5914	if (!completing) {
 5915		ret = io_poll_add(preq, issue_flags);
 5916		if (ret < 0) {
 5917			req_set_fail(preq);
 5918			io_req_complete(preq, ret);
 5919		}
 5920	}
 5921	return 0;
 5922}
 5923
 /*
  * Task-work handler queued by io_timeout_fn(): completes the timeout
  * request with -ETIME. The request is marked failed unless it was set up
  * with IORING_TIMEOUT_ETIME_SUCCESS.
  */
 5924static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
 5925{
 5926	struct io_timeout_data *data = req->async_data;
 5927
 5928	if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
 5929		req_set_fail(req);
 5930	io_req_complete_post(req, -ETIME, 0);
 5931}
 5932
 /*
  * hrtimer callback for timeout requests. Under ctx->timeout_lock the
  * request is taken off the timeout list and ctx->cq_timeouts is bumped;
  * completion is then deferred to io_req_task_timeout() through
  * io_req_task_work_add(). The timer is not restarted.
  */
 5933static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 5934{
 5935	struct io_timeout_data *data = container_of(timer,
 5936						struct io_timeout_data, timer);
 5937	struct io_kiocb *req = data->req;
 5938	struct io_ring_ctx *ctx = req->ctx;
 5939	unsigned long flags;
 5940
 5941	spin_lock_irqsave(&ctx->timeout_lock, flags);
 5942	list_del_init(&req->timeout.list);
 	/* separate read and set; done while holding ->timeout_lock */
 5943	atomic_set(&req->ctx->cq_timeouts,
 5944		atomic_read(&req->ctx->cq_timeouts) + 1);
 5945	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
 5946
 5947	req->io_task_work.func = io_req_task_timeout;
 5948	io_req_task_work_add(req);
 5949	return HRTIMER_NORESTART;
 5950}
 5951
 /*
  * Find the timeout request with @user_data on ctx->timeout_list, cancel
  * its timer and unlink it from the list. Must be called with
  * ctx->timeout_lock held.
  *
  * Returns the request, ERR_PTR(-ENOENT) if there is no such timeout, or
  * ERR_PTR(-EALREADY) if hrtimer_try_to_cancel() returned -1 (in which case
  * the request is left on the list).
  */
 5952static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
 5953					   __u64 user_data)
 5954	__must_hold(&ctx->timeout_lock)
 5955{
 5956	struct io_timeout_data *io;
 5957	struct io_kiocb *req;
 5958	bool found = false;
 5959
 5960	list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
 5961		found = user_data == req->user_data;
 5962		if (found)
 5963			break;
 5964	}
 	/* @req is only a valid entry when @found is set */
 5965	if (!found)
 5966		return ERR_PTR(-ENOENT);
 5967
 5968	io = req->async_data;
 5969	if (hrtimer_try_to_cancel(&io->timer) == -1)
 5970		return ERR_PTR(-EALREADY);
 5971	list_del_init(&req->timeout.list);
 5972	return req;
 5973}
 5974
 /*
  * Cancel the timeout request with @user_data: extract it, mark it failed,
  * fill a -ECANCELED CQE for it and put it. Must be called with both
  * ctx->completion_lock and ctx->timeout_lock held.
  *
  * Returns 0 on success, or the error from io_timeout_extract() (-ENOENT or
  * -EALREADY).
  */
 5975static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 5976	__must_hold(&ctx->completion_lock)
 5977	__must_hold(&ctx->timeout_lock)
 5978{
 5979	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
 5980
 5981	if (IS_ERR(req))
 5982		return PTR_ERR(req);
 5983
 5984	req_set_fail(req);
 5985	io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
 5986	io_put_req_deferred(req);
 5987	return 0;
 5988}
 5989
 /*
  * Map the clock selection bits of data->flags to a clockid:
  * IORING_TIMEOUT_BOOTTIME -> CLOCK_BOOTTIME, IORING_TIMEOUT_REALTIME ->
  * CLOCK_REALTIME, none -> CLOCK_MONOTONIC. Any other combination warns
  * once and falls back to CLOCK_MONOTONIC.
  */
 5990static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
 5991{
 5992	switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
 5993	case IORING_TIMEOUT_BOOTTIME:
 5994		return CLOCK_BOOTTIME;
 5995	case IORING_TIMEOUT_REALTIME:
 5996		return CLOCK_REALTIME;
 5997	default:
 5998		/* can't happen, vetted at prep time */
 5999		WARN_ON_ONCE(1);
 6000		fallthrough;
 6001	case 0:
 6002		return CLOCK_MONOTONIC;
 6003	}
 6004}
 6005
 /*
  * Re-arm the linked timeout with @user_data found on ctx->ltimeout_list to
  * expire at @ts (interpreted according to @mode). Must be called with
  * ctx->timeout_lock held.
  *
  * Returns 0 on success, -ENOENT if there is no such linked timeout, or
  * -EALREADY if hrtimer_try_to_cancel() returned -1.
  */
 6006static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 6007				    struct timespec64 *ts, enum hrtimer_mode mode)
 6008	__must_hold(&ctx->timeout_lock)
 6009{
 6010	struct io_timeout_data *io;
 6011	struct io_kiocb *req;
 6012	bool found = false;
 6013
 6014	list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
 6015		found = user_data == req->user_data;
 6016		if (found)
 6017			break;
 6018	}
 	/* @req is only a valid entry when @found is set */
 6019	if (!found)
 6020		return -ENOENT;
 6021
 6022	io = req->async_data;
 6023	if (hrtimer_try_to_cancel(&io->timer) == -1)
 6024		return -EALREADY;
 	/* re-initialise the timer for the new mode and start it again */
 6025	hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
 6026	io->timer.function = io_link_timeout_fn;
 6027	hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
 6028	return 0;
 6029}
 6030
 /*
  * Re-arm the timeout request with @user_data to expire at @ts (interpreted
  * according to @mode). The request is extracted, turned into a pure
  * timeout without a sequence target, appended to the tail of
  * ctx->timeout_list and its timer restarted. Must be called with
  * ctx->timeout_lock held.
  *
  * Returns 0 on success, or the error from io_timeout_extract().
  */
 6031static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 6032			     struct timespec64 *ts, enum hrtimer_mode mode)
 6033	__must_hold(&ctx->timeout_lock)
 6034{
 6035	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
 6036	struct io_timeout_data *data;
 6037
 6038	if (IS_ERR(req))
 6039		return PTR_ERR(req);
 6040
 6041	req->timeout.off = 0; /* noseq */
 6042	data = req->async_data;
 6043	list_add_tail(&req->timeout.list, &ctx->timeout_list);
 6044	hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
 6045	data->timer.function = io_timeout_fn;
 6046	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
 6047	return 0;
 6048}
 6049
 /*
  * Validate and stash the arguments of a timeout remove/update request
  * (handled by io_timeout_remove()).
  *
  * sqe->addr is the user_data of the target timeout and sqe->timeout_flags
  * the flags. If any IORING_TIMEOUT_UPDATE_MASK bit is set, the new timeout
  * is copied from the user pointer in sqe->addr2; otherwise no flags are
  * allowed. Returns 0, -EINVAL or -EFAULT.
  */
 6050static int io_timeout_remove_prep(struct io_kiocb *req,
 6051				  const struct io_uring_sqe *sqe)
 6052{
 6053	struct io_timeout_rem *tr = &req->timeout_rem;
 6054
 6055	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 6056		return -EINVAL;
 6057	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 6058		return -EINVAL;
 6059	if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
 6060		return -EINVAL;
 6061
 6062	tr->ltimeout = false;
 6063	tr->addr = READ_ONCE(sqe->addr);
 6064	tr->flags = READ_ONCE(sqe->timeout_flags);
 6065	if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
 		/* at most one clock may be selected */
 6066		if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
 6067			return -EINVAL;
 6068		if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
 6069			tr->ltimeout = true;
 6070		if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
 6071			return -EINVAL;
 6072		if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
 6073			return -EFAULT;
 6074	} else if (tr->flags) {
 6075		/* timeout removal doesn't support flags */
 6076		return -EINVAL;
 6077	}
 6078
 6079	return 0;
 6080}
 6081
 /*
  * Pick the hrtimer mode for a timeout: absolute if IORING_TIMEOUT_ABS is
  * set in @flags, relative otherwise.
  */
 6082static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
 6083{
 6084	return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
 6085					    : HRTIMER_MODE_REL;
 6086}
 6087
 6088/*
 6089 * Remove or update an existing timeout command
 6090 */
 6091static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
 6092{
 6093	struct io_timeout_rem *tr = &req->timeout_rem;
 6094	struct io_ring_ctx *ctx = req->ctx;
 6095	int ret;
 6096
 	/*
 	 * Without IORING_TIMEOUT_UPDATE this is a removal, which posts a CQE
 	 * for the target and so needs ->completion_lock as well. An update
 	 * only needs ->timeout_lock.
 	 */
 6097	if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
 6098		spin_lock(&ctx->completion_lock);
 6099		spin_lock_irq(&ctx->timeout_lock);
 6100		ret = io_timeout_cancel(ctx, tr->addr);
 6101		spin_unlock_irq(&ctx->timeout_lock);
 6102		spin_unlock(&ctx->completion_lock);
 6103	} else {
 6104		enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
 6105
 6106		spin_lock_irq(&ctx->timeout_lock);
 6107		if (tr->ltimeout)
 6108			ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
 6109		else
 6110			ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
 6111		spin_unlock_irq(&ctx->timeout_lock);
 6112	}
 6113
 	/* the remove/update request itself completes with the result */
 6114	if (ret < 0)
 6115		req_set_fail(req);
 6116	io_req_complete_post(req, ret, 0);
 6117	return 0;
 6118}
 6119
 /*
  * Prepare a timeout request, or a linked timeout when @is_timeout_link is
  * set.
  *
  * sqe->off is the completion count the timeout waits for (must be 0 for a
  * linked timeout), sqe->len must be 1, sqe->timeout_flags selects absolute
  * mode, the clock and IORING_TIMEOUT_ETIME_SUCCESS, and sqe->addr points
  * to the user's timespec. Async data is allocated to hold the timer.
  *
  * Returns 0, -EINVAL, -EFAULT or -ENOMEM.
  */
 6120static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 6121			   bool is_timeout_link)
 6122{
 6123	struct io_timeout_data *data;
 6124	unsigned flags;
 6125	u32 off = READ_ONCE(sqe->off);
 6126
 6127	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 6128		return -EINVAL;
 6129	if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
 6130	    sqe->splice_fd_in)
 6131		return -EINVAL;
 6132	if (off && is_timeout_link)
 6133		return -EINVAL;
 6134	flags = READ_ONCE(sqe->timeout_flags);
 6135	if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
 6136		      IORING_TIMEOUT_ETIME_SUCCESS))
 6137		return -EINVAL;
 6138	/* more than one clock specified is invalid, obviously */
 6139	if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
 6140		return -EINVAL;
 6141
 6142	INIT_LIST_HEAD(&req->timeout.list);
 6143	req->timeout.off = off;
 	/* remember that this ring has seen a timeout with a sequence target */
 6144	if (unlikely(off && !req->ctx->off_timeout_used))
 6145		req->ctx->off_timeout_used = true;
 6146
 6147	if (WARN_ON_ONCE(req_has_async_data(req)))
 6148		return -EFAULT;
 6149	if (io_alloc_async_data(req))
 6150		return -ENOMEM;
 6151
 6152	data = req->async_data;
 6153	data->req = req;
 6154	data->flags = flags;
 6155
 6156	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
 6157		return -EFAULT;
 6158
 6159	data->mode = io_translate_timeout_mode(flags);
 6160	hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
 6161
 6162	if (is_timeout_link) {
 6163		struct io_submit_link *link = &req->ctx->submit_state.link;
 6164
 		/* a linked timeout needs a preceding, non-timeout link member */
 6165		if (!link->head)
 6166			return -EINVAL;
 6167		if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
 6168			return -EINVAL;
 6169		req->timeout.head = link->last;
 6170		link->last->flags |= REQ_F_ARM_LTIMEOUT;
 6171	}
 6172	return 0;
 6173}
 6174
 /*
  * Issue a timeout request: insert it into ctx->timeout_list and start its
  * hrtimer, all under ctx->timeout_lock. Pure timeouts (no sequence) go to
  * the tail of the list; sequenced timeouts are inserted in sorted
  * position. Always returns 0.
  */
 6175static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
 6176{
 6177	struct io_ring_ctx *ctx = req->ctx;
 6178	struct io_timeout_data *data = req->async_data;
 6179	struct list_head *entry;
 6180	u32 tail, off = req->timeout.off;
 6181
 6182	spin_lock_irq(&ctx->timeout_lock);
 6183
 6184	/*
 6185	 * sqe->off holds how many events that need to occur for this
 6186	 * timeout event to be satisfied. If it isn't set, then this is
 6187	 * a pure timeout request, sequence isn't used.
 6188	 */
 6189	if (io_is_timeout_noseq(req)) {
 6190		entry = ctx->timeout_list.prev;
 6191		goto add;
 6192	}
 6193
 	/* CQ tail with completions caused by timeouts themselves left out */
 6194	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
 6195	req->timeout.target_seq = tail + off;
 6196
 6197	/* Update the last seq here in case io_flush_timeouts() hasn't.
 6198	 * This is safe because ->completion_lock is held, and submissions
 6199	 * and completions are never mixed in the same ->completion_lock section.
 	 *
 	 * NOTE(review): only ->timeout_lock is taken in this function;
 	 * whether ->completion_lock is really held here depends on the
 	 * caller - confirm, the wording above may be stale.
 6200	 */
 6201	ctx->cq_last_tm_flush = tail;
 6202
 6203	/*
 6204	 * Insertion sort, ensuring the first entry in the list is always
 6205	 * the one we need first.
 6206	 */
 6207	list_for_each_prev(entry, &ctx->timeout_list) {
 6208		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
 6209						  timeout.list);
 6210
 6211		if (io_is_timeout_noseq(nxt))
 6212			continue;
 6213		/* nxt.seq is behind @tail, otherwise would've been completed */
 6214		if (off >= nxt->timeout.target_seq - tail)
 6215			break;
 6216	}
 6217add:
 	/* insert right after @entry (the list head if the loop ran out) */
 6218	list_add(&req->timeout.list, entry);
 6219	data->timer.function = io_timeout_fn;
 6220	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
 6221	spin_unlock_irq(&ctx->timeout_lock);
 6222	return 0;
 6223}
 6224
 /* Match criteria handed to io_cancel_cb() */
 6225struct io_cancel_data {
 	/* ring the request must belong to */
 6226	struct io_ring_ctx *ctx;
 	/* user_data of the request to cancel */
 6227	u64 user_data;
 6228};
 6229
 /*
  * io-wq cancel match callback: true if the request owning @work belongs to
  * the ring and has the user_data given in @data (a struct io_cancel_data).
  */
 6230static bool io_cancel_cb(struct io_wq_work *work, void *data)
 6231{
 6232	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 6233	struct io_cancel_data *cd = data;
 6234
 6235	return req->ctx == cd->ctx && req->user_data == cd->user_data;
 6236}
 6237
 /*
  * Try to cancel the request with @user_data on ring @ctx in the io-wq of
  * @tctx.
  *
  * Returns 0 if it was canceled, -EALREADY if it is currently running, and
  * -ENOENT if it was not found or @tctx has no io-wq.
  */
 6238static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
 6239			       struct io_ring_ctx *ctx)
 6240{
 6241	struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
 6242	enum io_wq_cancel cancel_ret;
 6243	int ret = 0;
 6244
 6245	if (!tctx || !tctx->io_wq)
 6246		return -ENOENT;
 6247
 6248	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
 	/* translate the io-wq result into an errno */
 6249	switch (cancel_ret) {
 6250	case IO_WQ_CANCEL_OK:
 6251		ret = 0;
 6252		break;
 6253	case IO_WQ_CANCEL_RUNNING:
 6254		ret = -EALREADY;
 6255		break;
 6256	case IO_WQ_CANCEL_NOTFOUND:
 6257		ret = -ENOENT;
 6258		break;
 6259	}
 6260
 6261	return ret;
 6262}
 6263
 /*
  * Try to cancel the request with user_data @sqe_addr on behalf of @req.
  * The io-wq of req->task is tried first, then pending timeouts, then
  * hashed poll requests; the search stops at the first stage that returns
  * something other than -ENOENT.
  *
  * Returns 0 on success, -EALREADY or -ENOENT as reported by the stage that
  * ended the search.
  */
 6264static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
 6265{
 6266	struct io_ring_ctx *ctx = req->ctx;
 6267	int ret;
 6268
 6269	WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
 6270
 6271	ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
 6272	if (ret != -ENOENT)
 6273		return ret;
 6274
 	/* ->completion_lock stays held across both the timeout and poll lookup */
 6275	spin_lock(&ctx->completion_lock);
 6276	spin_lock_irq(&ctx->timeout_lock);
 6277	ret = io_timeout_cancel(ctx, sqe_addr);
 6278	spin_unlock_irq(&ctx->timeout_lock);
 6279	if (ret != -ENOENT)
 6280		goto out;
 6281	ret = io_poll_cancel(ctx, sqe_addr, false);
 6282out:
 6283	spin_unlock(&ctx->completion_lock);
 6284	return ret;
 6285}
 6286
 /*
  * Validate an async cancel sqe and record the user_data of the request to
  * cancel (sqe->addr) in req->cancel.addr. Returns 0 or -EINVAL.
  */
 6287static int io_async_cancel_prep(struct io_kiocb *req,
 6288				const struct io_uring_sqe *sqe)
 6289{
 6290	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 6291		return -EINVAL;
 6292	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 6293		return -EINVAL;
 6294	if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
 6295	    sqe->splice_fd_in)
 6296		return -EINVAL;
 6297
 6298	req->cancel.addr = READ_ONCE(sqe->addr);
 6299	return 0;
 6300}
 6301
 6302static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 6303{
 6304	struct io_ring_ctx *ctx = req->ctx;
 6305	u64 sqe_addr = req->cancel.addr;
 6306	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 6307	struct io_tctx_node *node;
 6308	int ret;
 6309
 6310	ret = io_try_cancel_userdata(req, sqe_addr);
 6311	if (ret != -ENOENT)
 6312		goto done;
 6313
 6314	/* slow path, try all io-wq's */
 6315	io_ring_submit_lock(ctx, needs_lock);
 6316	ret = -ENOENT;
 6317	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
 6318		struct io_uring_task *tctx = node->task->io_uring;
 6319
 6320		ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
 6321		if (ret != -ENOENT)
 6322			break;
 6323	}
 6324	io_ring_submit_unlock(ctx, needs_lock);
 6325done:
 6326	if (ret < 0)
 6327		req_set_fail(req);
 6328	io_req_complete_post(req, ret, 0);
 6329	return 0;
 6330}
 6331
 6332static int io_rsrc_update_prep(struct io_kiocb *req,
 6333				const struct io_uring_sqe *sqe)
 6334{
 6335	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 6336		return -EINVAL;
 6337	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
 6338		return -EINVAL;
 6339
 6340	req->rsrc_update.offset = READ_ONCE(sqe->off);
 6341	req->rsrc_update.nr_args = READ_ONCE(sqe->len);
 6342	if (!req->rsrc_update.nr_args)
 6343		return -EINVAL;
 6344	req->rsrc_update.arg = READ_ONCE(sqe->addr);
 6345	return 0;
 6346}
 6347
 6348static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 6349{
 6350	struct io_ring_ctx *ctx = req->ctx;
 6351	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 6352	struct io_uring_rsrc_update2 up;
 6353	int ret;
 6354
 6355	up.offset = req->rsrc_update.offset;
 6356	up.data = req->rsrc_update.arg;
 6357	up.nr = 0;
 6358	up.tags = 0;
 6359	up.resv = 0;
 6360
 6361	io_ring_submit_lock(ctx, needs_lock);
 6362	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
 6363					&up, req->rsrc_update.nr_args);
 6364	io_ring_submit_unlock(ctx, needs_lock);
 6365
 6366	if (ret < 0)
 6367		req_set_fail(req);
 6368	__io_req_complete(req, issue_flags, ret, 0);
 6369	return 0;
 6370}
 6371
 6372static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 6373{
 6374	switch (req->opcode) {
 6375	case IORING_OP_NOP:
 6376		return 0;
 6377	case IORING_OP_READV:
 6378	case IORING_OP_READ_FIXED:
 6379	case IORING_OP_READ:
 6380		return io_read_prep(req, sqe);
 6381	case IORING_OP_WRITEV:
 6382	case IORING_OP_WRITE_FIXED:
 6383	case IORING_OP_WRITE:
 6384		return io_write_prep(req, sqe);
 6385	case IORING_OP_POLL_ADD:
 6386		return io_poll_add_prep(req, sqe);
 6387	case IORING_OP_POLL_REMOVE:
 6388		return io_poll_update_prep(req, sqe);
 6389	case IORING_OP_FSYNC:
 6390		return io_fsync_prep(req, sqe);
 6391	case IORING_OP_SYNC_FILE_RANGE:
 6392		return io_sfr_prep(req, sqe);
 6393	case IORING_OP_SENDMSG:
 6394	case IORING_OP_SEND:
 6395		return io_sendmsg_prep(req, sqe);
 6396	case IORING_OP_RECVMSG:
 6397	case IORING_OP_RECV:
 6398		return io_recvmsg_prep(req, sqe);
 6399	case IORING_OP_CONNECT:
 6400		return io_connect_prep(req, sqe);
 6401	case IORING_OP_TIMEOUT:
 6402		return io_timeout_prep(req, sqe, false);
 6403	case IORING_OP_TIMEOUT_REMOVE:
 6404		return io_timeout_remove_prep(req, sqe);
 6405	case IORING_OP_ASYNC_CANCEL:
 6406		return io_async_cancel_prep(req, sqe);
 6407	case IORING_OP_LINK_TIMEOUT:
 6408		return io_timeout_prep(req, sqe, true);
 6409	case IORING_OP_ACCEPT:
 6410		return io_accept_prep(req, sqe);
 6411	case IORING_OP_FALLOCATE:
 6412		return io_fallocate_prep(req, sqe);
 6413	case IORING_OP_OPENAT:
 6414		return io_openat_prep(req, sqe);
 6415	case IORING_OP_CLOSE:
 6416		return io_close_prep(req, sqe);
 6417	case IORING_OP_FILES_UPDATE:
 6418		return io_rsrc_update_prep(req, sqe);
 6419	case IORING_OP_STATX:
 6420		return io_statx_prep(req, sqe);
 6421	case IORING_OP_FADVISE:
 6422		return io_fadvise_prep(req, sqe);
 6423	case IORING_OP_MADVISE:
 6424		return io_madvise_prep(req, sqe);
 6425	case IORING_OP_OPENAT2:
 6426		return io_openat2_prep(req, sqe);
 6427	case IORING_OP_EPOLL_CTL:
 6428		return io_epoll_ctl_prep(req, sqe);
 6429	case IORING_OP_SPLICE:
 6430		return io_splice_prep(req, sqe);
 6431	case IORING_OP_PROVIDE_BUFFERS:
 6432		return io_provide_buffers_prep(req, sqe);
 6433	case IORING_OP_REMOVE_BUFFERS:
 6434		return io_remove_buffers_prep(req, sqe);
 6435	case IORING_OP_TEE:
 6436		return io_tee_prep(req, sqe);
 6437	case IORING_OP_SHUTDOWN:
 6438		return io_shutdown_prep(req, sqe);
 6439	case IORING_OP_RENAMEAT:
 6440		return io_renameat_prep(req, sqe);
 6441	case IORING_OP_UNLINKAT:
 6442		return io_unlinkat_prep(req, sqe);
 6443	case IORING_OP_MKDIRAT:
 6444		return io_mkdirat_prep(req, sqe);
 6445	case IORING_OP_SYMLINKAT:
 6446		return io_symlinkat_prep(req, sqe);
 6447	case IORING_OP_LINKAT:
 6448		return io_linkat_prep(req, sqe);
 6449	}
 6450
 6451	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
 6452			req->opcode);
 6453	return -EINVAL;
 6454}
 6455
 6456static int io_req_prep_async(struct io_kiocb *req)
 6457{
 6458	if (!io_op_defs[req->opcode].needs_async_setup)
 6459		return 0;
 6460	if (WARN_ON_ONCE(req_has_async_data(req)))
 6461		return -EFAULT;
 6462	if (io_alloc_async_data(req))
 6463		return -EAGAIN;
 6464
 6465	switch (req->opcode) {
 6466	case IORING_OP_READV:
 6467		return io_rw_prep_async(req, READ);
 6468	case IORING_OP_WRITEV:
 6469		return io_rw_prep_async(req, WRITE);
 6470	case IORING_OP_SENDMSG:
 6471		return io_sendmsg_prep_async(req);
 6472	case IORING_OP_RECVMSG:
 6473		return io_recvmsg_prep_async(req);
 6474	case IORING_OP_CONNECT:
 6475		return io_connect_prep_async(req);
 6476	}
 6477	printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
 6478		    req->opcode);
 6479	return -EFAULT;
 6480}
 6481
 6482static u32 io_get_sequence(struct io_kiocb *req)
 6483{
 6484	u32 seq = req->ctx->cached_sq_head;
 6485
 6486	/* need original cached_sq_head, but it was increased for each req */
 6487	io_for_each_link(req, req)
 6488		seq--;
 6489	return seq;
 6490}
 6491
 6492static __cold void io_drain_req(struct io_kiocb *req)
 6493{
 6494	struct io_ring_ctx *ctx = req->ctx;
 6495	struct io_defer_entry *de;
 6496	int ret;
 6497	u32 seq = io_get_sequence(req);
 6498
 6499	/* Still need defer if there is pending req in defer list. */
 6500	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
 6501queue:
 6502		ctx->drain_active = false;
 6503		io_req_task_queue(req);
 6504		return;
 6505	}
 6506
 6507	ret = io_req_prep_async(req);
 6508	if (ret) {
 6509fail:
 6510		io_req_complete_failed(req, ret);
 6511		return;
 6512	}
 6513	io_prep_async_link(req);
 6514	de = kmalloc(sizeof(*de), GFP_KERNEL);
 6515	if (!de) {
 6516		ret = -ENOMEM;
 6517		goto fail;
 6518	}
 6519
 6520	spin_lock(&ctx->completion_lock);
 6521	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
 6522		spin_unlock(&ctx->completion_lock);
 6523		kfree(de);
 6524		goto queue;
 6525	}
 6526
 6527	trace_io_uring_defer(ctx, req, req->user_data);
 6528	de->req = req;
 6529	de->seq = seq;
 6530	list_add_tail(&de->list, &ctx->defer_list);
 6531	spin_unlock(&ctx->completion_lock);
 6532}
 6533
 6534static void io_clean_op(struct io_kiocb *req)
 6535{
 6536	if (req->flags & REQ_F_BUFFER_SELECTED) {
 6537		kfree(req->kbuf);
 6538		req->kbuf = NULL;
 6539	}
 6540
 6541	if (req->flags & REQ_F_NEED_CLEANUP) {
 6542		switch (req->opcode) {
 6543		case IORING_OP_READV:
 6544		case IORING_OP_READ_FIXED:
 6545		case IORING_OP_READ:
 6546		case IORING_OP_WRITEV:
 6547		case IORING_OP_WRITE_FIXED:
 6548		case IORING_OP_WRITE: {
 6549			struct io_async_rw *io = req->async_data;
 6550
 6551			kfree(io->free_iovec);
 6552			break;
 6553			}
 6554		case IORING_OP_RECVMSG:
 6555		case IORING_OP_SENDMSG: {
 6556			struct io_async_msghdr *io = req->async_data;
 6557
 6558			kfree(io->free_iov);
 6559			break;
 6560			}
 6561		case IORING_OP_SPLICE:
 6562		case IORING_OP_TEE:
 6563			if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
 6564				io_put_file(req->splice.file_in);
 6565			break;
 6566		case IORING_OP_OPENAT:
 6567		case IORING_OP_OPENAT2:
 6568			if (req->open.filename)
 6569				putname(req->open.filename);
 6570			break;
 6571		case IORING_OP_RENAMEAT:
 6572			putname(req->rename.oldpath);
 6573			putname(req->rename.newpath);
 6574			break;
 6575		case IORING_OP_UNLINKAT:
 6576			putname(req->unlink.filename);
 6577			break;
 6578		case IORING_OP_MKDIRAT:
 6579			putname(req->mkdir.filename);
 6580			break;
 6581		case IORING_OP_SYMLINKAT:
 6582			putname(req->symlink.oldpath);
 6583			putname(req->symlink.newpath);
 6584			break;
 6585		case IORING_OP_LINKAT:
 6586			putname(req->hardlink.oldpath);
 6587			putname(req->hardlink.newpath);
 6588			break;
 6589		}
 6590	}
 6591	if ((req->flags & REQ_F_POLLED) && req->apoll) {
 6592		kfree(req->apoll->double_poll);
 6593		kfree(req->apoll);
 6594		req->apoll = NULL;
 6595	}
 6596	if (req->flags & REQ_F_INFLIGHT) {
 6597		struct io_uring_task *tctx = req->task->io_uring;
 6598
 6599		atomic_dec(&tctx->inflight_tracked);
 6600	}
 6601	if (req->flags & REQ_F_CREDS)
 6602		put_cred(req->creds);
 6603	if (req->flags & REQ_F_ASYNC_DATA) {
 6604		kfree(req->async_data);
 6605		req->async_data = NULL;
 6606	}
 6607	req->flags &= ~IO_REQ_CLEAN_FLAGS;
 6608}
 6609
 6610static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 6611{
 6612	const struct cred *creds = NULL;
 6613	int ret;
 6614
 6615	if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
 6616		creds = override_creds(req->creds);
 6617
 6618	if (!io_op_defs[req->opcode].audit_skip)
 6619		audit_uring_entry(req->opcode);
 6620
 6621	switch (req->opcode) {
 6622	case IORING_OP_NOP:
 6623		ret = io_nop(req, issue_flags);
 6624		break;
 6625	case IORING_OP_READV:
 6626	case IORING_OP_READ_FIXED:
 6627	case IORING_OP_READ:
 6628		ret = io_read(req, issue_flags);
 6629		break;
 6630	case IORING_OP_WRITEV:
 6631	case IORING_OP_WRITE_FIXED:
 6632	case IORING_OP_WRITE:
 6633		ret = io_write(req, issue_flags);
 6634		break;
 6635	case IORING_OP_FSYNC:
 6636		ret = io_fsync(req, issue_flags);
 6637		break;
 6638	case IORING_OP_POLL_ADD:
 6639		ret = io_poll_add(req, issue_flags);
 6640		break;
 6641	case IORING_OP_POLL_REMOVE:
 6642		ret = io_poll_update(req, issue_flags);
 6643		break;
 6644	case IORING_OP_SYNC_FILE_RANGE:
 6645		ret = io_sync_file_range(req, issue_flags);
 6646		break;
 6647	case IORING_OP_SENDMSG:
 6648		ret = io_sendmsg(req, issue_flags);
 6649		break;
 6650	case IORING_OP_SEND:
 6651		ret = io_send(req, issue_flags);
 6652		break;
 6653	case IORING_OP_RECVMSG:
 6654		ret = io_recvmsg(req, issue_flags);
 6655		break;
 6656	case IORING_OP_RECV:
 6657		ret = io_recv(req, issue_flags);
 6658		break;
 6659	case IORING_OP_TIMEOUT:
 6660		ret = io_timeout(req, issue_flags);
 6661		break;
 6662	case IORING_OP_TIMEOUT_REMOVE:
 6663		ret = io_timeout_remove(req, issue_flags);
 6664		break;
 6665	case IORING_OP_ACCEPT:
 6666		ret = io_accept(req, issue_flags);
 6667		break;
 6668	case IORING_OP_CONNECT:
 6669		ret = io_connect(req, issue_flags);
 6670		break;
 6671	case IORING_OP_ASYNC_CANCEL:
 6672		ret = io_async_cancel(req, issue_flags);
 6673		break;
 6674	case IORING_OP_FALLOCATE:
 6675		ret = io_fallocate(req, issue_flags);
 6676		break;
 6677	case IORING_OP_OPENAT:
 6678		ret = io_openat(req, issue_flags);
 6679		break;
 6680	case IORING_OP_CLOSE:
 6681		ret = io_close(req, issue_flags);
 6682		break;
 6683	case IORING_OP_FILES_UPDATE:
 6684		ret = io_files_update(req, issue_flags);
 6685		break;
 6686	case IORING_OP_STATX:
 6687		ret = io_statx(req, issue_flags);
 6688		break;
 6689	case IORING_OP_FADVISE:
 6690		ret = io_fadvise(req, issue_flags);
 6691		break;
 6692	case IORING_OP_MADVISE:
 6693		ret = io_madvise(req, issue_flags);
 6694		break;
 6695	case IORING_OP_OPENAT2:
 6696		ret = io_openat2(req, issue_flags);
 6697		break;
 6698	case IORING_OP_EPOLL_CTL:
 6699		ret = io_epoll_ctl(req, issue_flags);
 6700		break;
 6701	case IORING_OP_SPLICE:
 6702		ret = io_splice(req, issue_flags);
 6703		break;
 6704	case IORING_OP_PROVIDE_BUFFERS:
 6705		ret = io_provide_buffers(req, issue_flags);
 6706		break;
 6707	case IORING_OP_REMOVE_BUFFERS:
 6708		ret = io_remove_buffers(req, issue_flags);
 6709		break;
 6710	case IORING_OP_TEE:
 6711		ret = io_tee(req, issue_flags);
 6712		break;
 6713	case IORING_OP_SHUTDOWN:
 6714		ret = io_shutdown(req, issue_flags);
 6715		break;
 6716	case IORING_OP_RENAMEAT:
 6717		ret = io_renameat(req, issue_flags);
 6718		break;
 6719	case IORING_OP_UNLINKAT:
 6720		ret = io_unlinkat(req, issue_flags);
 6721		break;
 6722	case IORING_OP_MKDIRAT:
 6723		ret = io_mkdirat(req, issue_flags);
 6724		break;
 6725	case IORING_OP_SYMLINKAT:
 6726		ret = io_symlinkat(req, issue_flags);
 6727		break;
 6728	case IORING_OP_LINKAT:
 6729		ret = io_linkat(req, issue_flags);
 6730		break;
 6731	default:
 6732		ret = -EINVAL;
 6733		break;
 6734	}
 6735
 6736	if (!io_op_defs[req->opcode].audit_skip)
 6737		audit_uring_exit(!ret, ret);
 6738
 6739	if (creds)
 6740		revert_creds(creds);
 6741	if (ret)
 6742		return ret;
 6743	/* If the op doesn't have a file, we're not polling for it */
 6744	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
 6745		io_iopoll_req_issued(req, issue_flags);
 6746
 6747	return 0;
 6748}
 6749
 6750static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
 6751{
 6752	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 6753
 6754	req = io_put_req_find_next(req);
 6755	return req ? &req->work : NULL;
 6756}
 6757
 6758static void io_wq_submit_work(struct io_wq_work *work)
 6759{
 6760	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 6761	unsigned int issue_flags = IO_URING_F_UNLOCKED;
 6762	bool needs_poll = false;
 6763	struct io_kiocb *timeout;
 6764	int ret = 0;
 6765
 6766	/* one will be dropped by ->io_free_work() after returning to io-wq */
 6767	if (!(req->flags & REQ_F_REFCOUNT))
 6768		__io_req_set_refcount(req, 2);
 6769	else
 6770		req_ref_get(req);
 6771
 6772	timeout = io_prep_linked_timeout(req);
 6773	if (timeout)
 6774		io_queue_linked_timeout(timeout);
 6775
 6776	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
 6777	if (work->flags & IO_WQ_WORK_CANCEL) {
 6778		io_req_task_queue_fail(req, -ECANCELED);
 6779		return;
 6780	}
 6781
 6782	if (req->flags & REQ_F_FORCE_ASYNC) {
 6783		const struct io_op_def *def = &io_op_defs[req->opcode];
 6784		bool opcode_poll = def->pollin || def->pollout;
 6785
 6786		if (opcode_poll && file_can_poll(req->file)) {
 6787			needs_poll = true;
 6788			issue_flags |= IO_URING_F_NONBLOCK;
 6789		}
 6790	}
 6791
 6792	do {
 6793		ret = io_issue_sqe(req, issue_flags);
 6794		if (ret != -EAGAIN)
 6795			break;
 6796		/*
 6797		 * We can get EAGAIN for iopolled IO even though we're
 6798		 * forcing a sync submission from here, since we can't
 6799		 * wait for request slots on the block side.
 6800		 */
 6801		if (!needs_poll) {
 6802			cond_resched();
 6803			continue;
 6804		}
 6805
 6806		if (io_arm_poll_handler(req) == IO_APOLL_OK)
 6807			return;
 6808		/* aborted or ready, in either case retry blocking */
 6809		needs_poll = false;
 6810		issue_flags &= ~IO_URING_F_NONBLOCK;
 6811	} while (1);
 6812
 6813	/* avoid locking problems by failing it from a clean context */
 6814	if (ret)
 6815		io_req_task_queue_fail(req, ret);
 6816}
 6817
 6818static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
 6819						       unsigned i)
 6820{
 6821	return &table->files[i];
 6822}
 6823
 6824static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
 6825					      int index)
 6826{
 6827	struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
 6828
 6829	return (struct file *) (slot->file_ptr & FFS_MASK);
 6830}
 6831
 6832static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
 6833{
 6834	unsigned long file_ptr = (unsigned long) file;
 6835
 6836	file_ptr |= io_file_get_flags(file);
 6837	file_slot->file_ptr = file_ptr;
 6838}
 6839
 6840static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
 6841					     struct io_kiocb *req, int fd)
 6842{
 6843	struct file *file;
 6844	unsigned long file_ptr;
 6845
 6846	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
 6847		return NULL;
 6848	fd = array_index_nospec(fd, ctx->nr_user_files);
 6849	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
 6850	file = (struct file *) (file_ptr & FFS_MASK);
 6851	file_ptr &= ~FFS_MASK;
 6852	/* mask in overlapping REQ_F and FFS bits */
 6853	req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
 6854	io_req_set_rsrc_node(req, ctx);
 6855	return file;
 6856}
 6857
 6858static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
 6859				       struct io_kiocb *req, int fd)
 6860{
 6861	struct file *file = fget(fd);
 6862
 6863	trace_io_uring_file_get(ctx, fd);
 6864
 6865	/* we don't allow fixed io_uring files */
 6866	if (file && unlikely(file->f_op == &io_uring_fops))
 6867		io_req_track_inflight(req);
 6868	return file;
 6869}
 6870
 6871static inline struct file *io_file_get(struct io_ring_ctx *ctx,
 6872				       struct io_kiocb *req, int fd, bool fixed)
 6873{
 6874	if (fixed)
 6875		return io_file_get_fixed(ctx, req, fd);
 6876	else
 6877		return io_file_get_normal(ctx, req, fd);
 6878}
 6879
 6880static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
 6881{
 6882	struct io_kiocb *prev = req->timeout.prev;
 6883	int ret;
 6884
 6885	if (prev) {
 6886		ret = io_try_cancel_userdata(req, prev->user_data);
 6887		io_req_complete_post(req, ret ?: -ETIME, 0);
 6888		io_put_req(prev);
 6889	} else {
 6890		io_req_complete_post(req, -ETIME, 0);
 6891	}
 6892}
 6893
 6894static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 6895{
 6896	struct io_timeout_data *data = container_of(timer,
 6897						struct io_timeout_data, timer);
 6898	struct io_kiocb *prev, *req = data->req;
 6899	struct io_ring_ctx *ctx = req->ctx;
 6900	unsigned long flags;
 6901
 6902	spin_lock_irqsave(&ctx->timeout_lock, flags);
 6903	prev = req->timeout.head;
 6904	req->timeout.head = NULL;
 6905
 6906	/*
 6907	 * We don't expect the list to be empty, that will only happen if we
 6908	 * race with the completion of the linked work.
 6909	 */
 6910	if (prev) {
 6911		io_remove_next_linked(prev);
 6912		if (!req_ref_inc_not_zero(prev))
 6913			prev = NULL;
 6914	}
 6915	list_del(&req->timeout.list);
 6916	req->timeout.prev = prev;
 6917	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
 6918
 6919	req->io_task_work.func = io_req_task_link_timeout;
 6920	io_req_task_work_add(req);
 6921	return HRTIMER_NORESTART;
 6922}
 6923
 6924static void io_queue_linked_timeout(struct io_kiocb *req)
 6925{
 6926	struct io_ring_ctx *ctx = req->ctx;
 6927
 6928	spin_lock_irq(&ctx->timeout_lock);
 6929	/*
 6930	 * If the back reference is NULL, then our linked request finished
 6931	 * before we got a chance to setup the timer
 6932	 */
 6933	if (req->timeout.head) {
 6934		struct io_timeout_data *data = req->async_data;
 6935
 6936		data->timer.function = io_link_timeout_fn;
 6937		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
 6938				data->mode);
 6939		list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
 6940	}
 6941	spin_unlock_irq(&ctx->timeout_lock);
 6942	/* drop submission reference */
 6943	io_put_req(req);
 6944}
 6945
 6946static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
 6947	__must_hold(&req->ctx->uring_lock)
 6948{
 6949	struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
 6950
 6951	switch (io_arm_poll_handler(req)) {
 6952	case IO_APOLL_READY:
 6953		io_req_task_queue(req);
 6954		break;
 6955	case IO_APOLL_ABORTED:
 6956		/*
 6957		 * Queued up for async execution, worker will release
 6958		 * submit reference when the iocb is actually submitted.
 6959		 */
 6960		io_queue_async_work(req, NULL);
 6961		break;
 6962	}
 6963
 6964	if (linked_timeout)
 6965		io_queue_linked_timeout(linked_timeout);
 6966}
 6967
 6968static inline void __io_queue_sqe(struct io_kiocb *req)
 6969	__must_hold(&req->ctx->uring_lock)
 6970{
 6971	struct io_kiocb *linked_timeout;
 6972	int ret;
 6973
 6974	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
 6975
 6976	if (req->flags & REQ_F_COMPLETE_INLINE) {
 6977		io_req_add_compl_list(req);
 6978		return;
 6979	}
 6980	/*
 6981	 * We async punt it if the file wasn't marked NOWAIT, or if the file
 6982	 * doesn't support non-blocking read/write attempts
 6983	 */
 6984	if (likely(!ret)) {
 6985		linked_timeout = io_prep_linked_timeout(req);
 6986		if (linked_timeout)
 6987			io_queue_linked_timeout(linked_timeout);
 6988	} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
 6989		io_queue_sqe_arm_apoll(req);
 6990	} else {
 6991		io_req_complete_failed(req, ret);
 6992	}
 6993}
 6994
 6995static void io_queue_sqe_fallback(struct io_kiocb *req)
 6996	__must_hold(&req->ctx->uring_lock)
 6997{
 6998	if (req->flags & REQ_F_FAIL) {
 6999		io_req_complete_fail_submit(req);
 7000	} else if (unlikely(req->ctx->drain_active)) {
 7001		io_drain_req(req);
 7002	} else {
 7003		int ret = io_req_prep_async(req);
 7004
 7005		if (unlikely(ret))
 7006			io_req_complete_failed(req, ret);
 7007		else
 7008			io_queue_async_work(req, NULL);
 7009	}
 7010}
 7011
 7012static inline void io_queue_sqe(struct io_kiocb *req)
 7013	__must_hold(&req->ctx->uring_lock)
 7014{
 7015	if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
 7016		__io_queue_sqe(req);
 7017	else
 7018		io_queue_sqe_fallback(req);
 7019}
 7020
 7021/*
 7022 * Check SQE restrictions (opcode and flags).
 7023 *
 7024 * Returns 'true' if SQE is allowed, 'false' otherwise.
 7025 */
 7026static inline bool io_check_restriction(struct io_ring_ctx *ctx,
 7027					struct io_kiocb *req,
 7028					unsigned int sqe_flags)
 7029{
 7030	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
 7031		return false;
 7032
 7033	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
 7034	    ctx->restrictions.sqe_flags_required)
 7035		return false;
 7036
 7037	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
 7038			  ctx->restrictions.sqe_flags_required))
 7039		return false;
 7040
 7041	return true;
 7042}
 7043
 7044static void io_init_req_drain(struct io_kiocb *req)
 7045{
 7046	struct io_ring_ctx *ctx = req->ctx;
 7047	struct io_kiocb *head = ctx->submit_state.link.head;
 7048
 7049	ctx->drain_active = true;
 7050	if (head) {
 7051		/*
 7052		 * If we need to drain a request in the middle of a link, drain
 7053		 * the head request and the next request/link after the current
 7054		 * link. Considering sequential execution of links,
 7055		 * IOSQE_IO_DRAIN will be maintained for every request of our
 7056		 * link.
 7057		 */
 7058		head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
 7059		ctx->drain_next = true;
 7060	}
 7061}
 7062
 7063static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 7064		       const struct io_uring_sqe *sqe)
 7065	__must_hold(&ctx->uring_lock)
 7066{
 7067	unsigned int sqe_flags;
 7068	int personality;
 7069	u8 opcode;
 7070
 7071	/* req is partially pre-initialised, see io_preinit_req() */
 7072	req->opcode = opcode = READ_ONCE(sqe->opcode);
 7073	/* same numerical values with corresponding REQ_F_*, safe to copy */
 7074	req->flags = sqe_flags = READ_ONCE(sqe->flags);
 7075	req->user_data = READ_ONCE(sqe->user_data);
 7076	req->file = NULL;
 7077	req->fixed_rsrc_refs = NULL;
 7078	req->task = current;
 7079
 7080	if (unlikely(opcode >= IORING_OP_LAST)) {
 7081		req->opcode = 0;
 7082		return -EINVAL;
 7083	}
 7084	if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
 7085		/* enforce forwards compatibility on users */
 7086		if (sqe_flags & ~SQE_VALID_FLAGS)
 7087			return -EINVAL;
 7088		if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
 7089		    !io_op_defs[opcode].buffer_select)
 7090			return -EOPNOTSUPP;
 7091		if (sqe_flags & IOSQE_IO_DRAIN)
 7092			io_init_req_drain(req);
 7093	}
 7094	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
 7095		if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
 7096			return -EACCES;
 7097		/* knock it to the slow queue path, will be drained there */
 7098		if (ctx->drain_active)
 7099			req->flags |= REQ_F_FORCE_ASYNC;
 7100		/* if there is no link, we're at "next" request and need to drain */
 7101		if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
 7102			ctx->drain_next = false;
 7103			ctx->drain_active = true;
 7104			req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
 7105		}
 7106	}
 7107
 7108	if (io_op_defs[opcode].needs_file) {
 7109		struct io_submit_state *state = &ctx->submit_state;
 7110
 7111		/*
 7112		 * Plug now if we have more than 2 IO left after this, and the
 7113		 * target is potentially a read/write to block based storage.
 7114		 */
 7115		if (state->need_plug && io_op_defs[opcode].plug) {
 7116			state->plug_started = true;
 7117			state->need_plug = false;
 7118			blk_start_plug_nr_ios(&state->plug, state->submit_nr);
 7119		}
 7120
 7121		req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
 7122					(sqe_flags & IOSQE_FIXED_FILE));
 7123		if (unlikely(!req->file))
 7124			return -EBADF;
 7125	}
 7126
 7127	personality = READ_ONCE(sqe->personality);
 7128	if (personality) {
 7129		int ret;
 7130
 7131		req->creds = xa_load(&ctx->personalities, personality);
 7132		if (!req->creds)
 7133			return -EINVAL;
 7134		get_cred(req->creds);
 7135		ret = security_uring_override_creds(req->creds);
 7136		if (ret) {
 7137			put_cred(req->creds);
 7138			return ret;
 7139		}
 7140		req->flags |= REQ_F_CREDS;
 7141	}
 7142
 7143	return io_req_prep(req, sqe);
 7144}
 7145
 7146static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 7147			 const struct io_uring_sqe *sqe)
 7148	__must_hold(&ctx->uring_lock)
 7149{
 7150	struct io_submit_link *link = &ctx->submit_state.link;
 7151	int ret;
 7152
 7153	ret = io_init_req(ctx, req, sqe);
 7154	if (unlikely(ret)) {
 7155		trace_io_uring_req_failed(sqe, ret);
 7156
 7157		/* fail even hard links since we don't submit */
 7158		if (link->head) {
 7159			/*
 7160			 * we can judge a link req is failed or cancelled by if
 7161			 * REQ_F_FAIL is set, but the head is an exception since
 7162			 * it may be set REQ_F_FAIL because of other req's failure
 7163			 * so let's leverage req->result to distinguish if a head
 7164			 * is set REQ_F_FAIL because of its failure or other req's
 7165			 * failure so that we can set the correct ret code for it.
 7166			 * init result here to avoid affecting the normal path.
 7167			 */
 7168			if (!(link->head->flags & REQ_F_FAIL))
 7169				req_fail_link_node(link->head, -ECANCELED);
 7170		} else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
 7171			/*
 7172			 * the current req is a normal req, we should return
 7173			 * error and thus break the submittion loop.
 7174			 */
 7175			io_req_complete_failed(req, ret);
 7176			return ret;
 7177		}
 7178		req_fail_link_node(req, ret);
 7179	}
 7180
 7181	/* don't need @sqe from now on */
 7182	trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
 7183				  req->flags, true,
 7184				  ctx->flags & IORING_SETUP_SQPOLL);
 7185
 7186	/*
 7187	 * If we already have a head request, queue this one for async
 7188	 * submittal once the head completes. If we don't have a head but
 7189	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
 7190	 * submitted sync once the chain is complete. If none of those
 7191	 * conditions are true (normal request), then just queue it.
 7192	 */
 7193	if (link->head) {
 7194		struct io_kiocb *head = link->head;
 7195
 7196		if (!(req->flags & REQ_F_FAIL)) {
 7197			ret = io_req_prep_async(req);
 7198			if (unlikely(ret)) {
 7199				req_fail_link_node(req, ret);
 7200				if (!(head->flags & REQ_F_FAIL))
 7201					req_fail_link_node(head, -ECANCELED);
 7202			}
 7203		}
 7204		trace_io_uring_link(ctx, req, head);
 7205		link->last->link = req;
 7206		link->last = req;
 7207
 7208		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
 7209			return 0;
 7210		/* last request of a link, enqueue the link */
 7211		link->head = NULL;
 7212		req = head;
 7213	} else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
 7214		link->head = req;
 7215		link->last = req;
 7216		return 0;
 7217	}
 7218
 7219	io_queue_sqe(req);
 7220	return 0;
 7221}
 7222
 7223/*
 7224 * Batched submission is done, ensure local IO is flushed out.
 7225 */
 7226static void io_submit_state_end(struct io_ring_ctx *ctx)
 7227{
 7228	struct io_submit_state *state = &ctx->submit_state;
 7229
 7230	if (state->link.head)
 7231		io_queue_sqe(state->link.head);
 7232	/* flush only after queuing links as they can generate completions */
 7233	io_submit_flush_completions(ctx);
 7234	if (state->plug_started)
 7235		blk_finish_plug(&state->plug);
 7236}
 7237
 7238/*
 7239 * Start submission side cache.
 7240 */
 7241static void io_submit_state_start(struct io_submit_state *state,
 7242				  unsigned int max_ios)
 7243{
 7244	state->plug_started = false;
 7245	state->need_plug = max_ios > 2;
 7246	state->submit_nr = max_ios;
 7247	/* set only head, no need to init link_last in advance */
 7248	state->link.head = NULL;
 7249}
 7250
 7251static void io_commit_sqring(struct io_ring_ctx *ctx)
 7252{
 7253	struct io_rings *rings = ctx->rings;
 7254
 7255	/*
 7256	 * Ensure any loads from the SQEs are done at this point,
 7257	 * since once we write the new head, the application could
 7258	 * write new data to them.
 7259	 */
 7260	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
 7261}
 7262
 7263/*
 7264 * Fetch an sqe, if one is available. Note this returns a pointer to memory
 7265 * that is mapped by userspace. This means that care needs to be taken to
 7266 * ensure that reads are stable, as we cannot rely on userspace always
 7267 * being a good citizen. If members of the sqe are validated and then later
 7268 * used, it's important that those reads are done through READ_ONCE() to
 7269 * prevent a re-load down the line.
 7270 */
 7271static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 7272{
 7273	unsigned head, mask = ctx->sq_entries - 1;
 7274	unsigned sq_idx = ctx->cached_sq_head++ & mask;
 7275
 7276	/*
 7277	 * The cached sq head (or cq tail) serves two purposes:
 7278	 *
 7279	 * 1) allows us to batch the cost of updating the user visible
 7280	 *    head updates.
 7281	 * 2) allows the kernel side to track the head on its own, even
 7282	 *    though the application is the one updating it.
 7283	 */
 7284	head = READ_ONCE(ctx->sq_array[sq_idx]);
 7285	if (likely(head < ctx->sq_entries))
 7286		return &ctx->sq_sqes[head];
 7287
 7288	/* drop invalid entries */
 7289	ctx->cq_extra--;
 7290	WRITE_ONCE(ctx->rings->sq_dropped,
 7291		   READ_ONCE(ctx->rings->sq_dropped) + 1);
 7292	return NULL;
 7293}
 7294
/*
 * Consume up to @nr entries from the SQ ring and pass each one to
 * io_submit_sqe(). Must be called with ctx->uring_lock held.
 *
 * Returns the number of SQEs that were consumed and counted as submitted,
 * 0 if the SQ ring had nothing queued, or -EAGAIN if no request could be
 * allocated before anything was submitted.
 */
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
	__must_hold(&ctx->uring_lock)
{
	unsigned int entries = io_sqring_entries(ctx);
	int submitted = 0;

	if (unlikely(!entries))
		return 0;
	/* make sure SQ entry isn't read before tail */
	nr = min3(nr, ctx->sq_entries, entries);
	/* grab @nr task refs up front; the unused part is handed back below */
	io_get_task_refs(nr);

	io_submit_state_start(&ctx->submit_state, nr);
	do {
		const struct io_uring_sqe *sqe;
		struct io_kiocb *req;

		if (unlikely(!io_alloc_req_refill(ctx))) {
			/* only report -EAGAIN if nothing at all was submitted */
			if (!submitted)
				submitted = -EAGAIN;
			break;
		}
		req = io_alloc_req(ctx);
		sqe = io_get_sqe(ctx);
		if (unlikely(!sqe)) {
			/* invalid SQ entry: put the unused request back on the free list */
			wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
			break;
		}
		/* will complete beyond this point, count as submitted */
		submitted++;
		/* a non-zero return from io_submit_sqe() ends the batch early */
		if (io_submit_sqe(ctx, req, sqe))
			break;
	} while (submitted < nr);

	if (unlikely(submitted != nr)) {
		/* return the task refs that were taken but not consumed */
		int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
		int unused = nr - ref_used;

		current->io_uring->cached_refs += unused;
	}

	io_submit_state_end(ctx);
	 /* Commit SQ ring head once we've consumed and submitted all SQEs */
	io_commit_sqring(ctx);

	return submitted;
}
 7342
 7343static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
 7344{
 7345	return READ_ONCE(sqd->state);
 7346}
 7347
 7348static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
 7349{
 7350	/* Tell userspace we may need a wakeup call */
 7351	spin_lock(&ctx->completion_lock);
 7352	WRITE_ONCE(ctx->rings->sq_flags,
 7353		   ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
 7354	spin_unlock(&ctx->completion_lock);
 7355}
 7356
 7357static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
 7358{
 7359	spin_lock(&ctx->completion_lock);
 7360	WRITE_ONCE(ctx->rings->sq_flags,
 7361		   ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
 7362	spin_unlock(&ctx->completion_lock);
 7363}
 7364
/*
 * One pass of the SQ poll thread over a single ring: reap iopoll completions
 * if any are pending and submit queued SQEs.
 *
 * @cap_entries: limit the batch to IORING_SQPOLL_CAP_ENTRIES_VALUE entries.
 *
 * Returns whatever io_submit_sqes() returned, or 0 if nothing was submitted.
 */
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
	unsigned int to_submit;
	int ret = 0;

	to_submit = io_sqring_entries(ctx);
	/* if we're handling multiple rings, cap submit size for fairness */
	if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
		to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;

	if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
		const struct cred *creds = NULL;

		/* run with the ring's credentials if they differ from ours */
		if (ctx->sq_creds != current_cred())
			creds = override_creds(ctx->sq_creds);

		mutex_lock(&ctx->uring_lock);
		if (!wq_list_empty(&ctx->iopoll_list))
			io_do_iopoll(ctx, true);

		/*
		 * Don't submit if refs are dying, good for io_uring_register(),
		 * but also it is relied upon by io_ring_exit_work()
		 */
		if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
		    !(ctx->flags & IORING_SETUP_R_DISABLED))
			ret = io_submit_sqes(ctx, to_submit);
		mutex_unlock(&ctx->uring_lock);

		/* wake anyone sleeping on ->sqo_sq_wait */
		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
			wake_up(&ctx->sqo_sq_wait);
		if (creds)
			revert_creds(creds);
	}

	return ret;
}
 7402
 7403static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
 7404{
 7405	struct io_ring_ctx *ctx;
 7406	unsigned sq_thread_idle = 0;
 7407
 7408	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 7409		sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
 7410	sqd->sq_thread_idle = sq_thread_idle;
 7411}
 7412
/*
 * Handle a pending park request or signal for the SQ poll thread. Called
 * with sqd->lock held; the lock is dropped and re-taken while a park request
 * or a signal is being handled.
 *
 * Returns true if the thread should exit: either get_signal() returned
 * non-zero or IO_SQ_THREAD_SHOULD_STOP is set.
 */
static bool io_sqd_handle_event(struct io_sq_data *sqd)
{
	bool did_sig = false;
	struct ksignal ksig;

	if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
	    signal_pending(current)) {
		/* release the lock so that whoever asked us to park can take it */
		mutex_unlock(&sqd->lock);
		if (signal_pending(current))
			did_sig = get_signal(&ksig);
		cond_resched();
		mutex_lock(&sqd->lock);
	}
	return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
}
 7428
/*
 * Body of the SQ poll thread. @data is the struct io_sq_data the thread
 * serves. The thread loops over every ring on sqd->ctx_list, submitting and
 * polling on their behalf, spins while there is work or until the idle
 * timeout expires, and then sleeps on sqd->wait with IORING_SQ_NEED_WAKEUP
 * set on the rings. It leaves the loop when io_sqd_handle_event() says so,
 * and does not return: it ends in do_exit().
 */
static int io_sq_thread(void *data)
{
	struct io_sq_data *sqd = data;
	struct io_ring_ctx *ctx;
	unsigned long timeout = 0;
	char buf[TASK_COMM_LEN];
	DEFINE_WAIT(wait);

	snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
	set_task_comm(current, buf);

	/* pin to the requested CPU, or allow all online CPUs if none was given */
	if (sqd->sq_cpu != -1)
		set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
	else
		set_cpus_allowed_ptr(current, cpu_online_mask);
	current->flags |= PF_NO_SETAFFINITY;

	audit_alloc_kernel(current);

	mutex_lock(&sqd->lock);
	while (1) {
		bool cap_entries, sqt_spin = false;

		if (io_sqd_events_pending(sqd) || signal_pending(current)) {
			if (io_sqd_handle_event(sqd))
				break;
			timeout = jiffies + sqd->sq_thread_idle;
		}

		/* with more than one ring attached, cap each ring's batch */
		cap_entries = !list_is_singular(&sqd->ctx_list);
		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
			int ret = __io_sq_thread(ctx, cap_entries);

			/* keep spinning if we submitted or still have iopoll work */
			if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
				sqt_spin = true;
		}
		if (io_run_task_work())
			sqt_spin = true;

		/* busy or idle period not over yet: go around again */
		if (sqt_spin || !time_after(jiffies, timeout)) {
			cond_resched();
			if (sqt_spin)
				timeout = jiffies + sqd->sq_thread_idle;
			continue;
		}

		prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
		if (!io_sqd_events_pending(sqd) && !current->task_works) {
			bool needs_sched = true;

			/*
			 * Set the wakeup flag first and only then re-check for
			 * pending work; if any ring has work, skip the sleep.
			 */
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
				io_ring_set_wakeup_flag(ctx);

				if ((ctx->flags & IORING_SETUP_IOPOLL) &&
				    !wq_list_empty(&ctx->iopoll_list)) {
					needs_sched = false;
					break;
				}
				if (io_sqring_entries(ctx)) {
					needs_sched = false;
					break;
				}
			}

			if (needs_sched) {
				/* sleep without holding sqd->lock */
				mutex_unlock(&sqd->lock);
				schedule();
				mutex_lock(&sqd->lock);
			}
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_ring_clear_wakeup_flag(ctx);
		}

		finish_wait(&sqd->wait, &wait);
		timeout = jiffies + sqd->sq_thread_idle;
	}

	/* exit path: cancel outstanding work and mark the thread as gone */
	io_uring_cancel_generic(true, sqd);
	sqd->thread = NULL;
	/* leave the wakeup flag set on every ring now that no thread polls them */
	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
		io_ring_set_wakeup_flag(ctx);
	io_run_task_work();
	mutex_unlock(&sqd->lock);

	audit_free(current);

	/* io_sq_thread_stop() waits on this completion */
	complete(&sqd->exited);
	do_exit(0);
}
 7518
/* Per-waiter state used by io_cqring_wait() while sleeping on ctx->cq_wait. */
struct io_wait_queue {
	/* wait entry queued on ctx->cq_wait, woken through io_wake_function() */
	struct wait_queue_entry wq;
	/* ring this waiter is waiting on */
	struct io_ring_ctx *ctx;
	/* CQ tail value to wait for: CQ head at wait start plus min_events */
	unsigned cq_tail;
	/* value of ctx->cq_timeouts sampled when the wait started */
	unsigned nr_timeouts;
};
 7525
 7526static inline bool io_should_wake(struct io_wait_queue *iowq)
 7527{
 7528	struct io_ring_ctx *ctx = iowq->ctx;
 7529	int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
 7530
 7531	/*
 7532	 * Wake up if we have enough events, or if a timeout occurred since we
 7533	 * started waiting. For timeouts, we always want to return to userspace,
 7534	 * regardless of event count.
 7535	 */
 7536	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
 7537}
 7538
 7539static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
 7540			    int wake_flags, void *key)
 7541{
 7542	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
 7543							wq);
 7544
 7545	/*
 7546	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
 7547	 * the task, and the next invocation will do it.
 7548	 */
 7549	if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
 7550		return autoremove_wake_function(curr, mode, wake_flags, key);
 7551	return -1;
 7552}
 7553
 7554static int io_run_task_work_sig(void)
 7555{
 7556	if (io_run_task_work())
 7557		return 1;
 7558	if (!signal_pending(current))
 7559		return 0;
 7560	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
 7561		return -ERESTARTSYS;
 7562	return -EINTR;
 7563}
 7564
 7565/* when returns >0, the caller should retry */
 7566static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 7567					  struct io_wait_queue *iowq,
 7568					  signed long *timeout)
 7569{
 7570	int ret;
 7571
 7572	/* make sure we run task_work before checking for signals */
 7573	ret = io_run_task_work_sig();
 7574	if (ret || io_should_wake(iowq))
 7575		return ret;
 7576	/* let the caller flush overflows, retry */
 7577	if (test_bit(0, &ctx->check_cq_overflow))
 7578		return 1;
 7579
 7580	*timeout = schedule_timeout(*timeout);
 7581	return !*timeout ? -ETIME : 1;
 7582}
 7583
/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 *
 * @min_events: number of CQ events to wait for
 * @sig, @sigsz: optional user signal mask to install for the duration of
 *               the wait
 * @uts:        optional user timespec bounding the wait
 *
 * Returns 0 if the CQ ring is non-empty on exit; otherwise the result of
 * the wait loop. Returns -EFAULT if @uts cannot be read, or the error from
 * installing @sig.
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
			  const sigset_t __user *sig, size_t sigsz,
			  struct __kernel_timespec __user *uts)
{
	struct io_wait_queue iowq;
	struct io_rings *rings = ctx->rings;
	signed long timeout = MAX_SCHEDULE_TIMEOUT;
	int ret;

	/*
	 * Fast path: flush overflow and return right away if enough events
	 * are already there; keep retrying as long as task_work gets run.
	 */
	do {
		io_cqring_overflow_flush(ctx);
		if (io_cqring_events(ctx) >= min_events)
			return 0;
		if (!io_run_task_work())
			break;
	} while (1);

	if (uts) {
		struct timespec64 ts;

		if (get_timespec64(&ts, uts))
			return -EFAULT;
		timeout = timespec64_to_jiffies(&ts);
	}

	if (sig) {
#ifdef CONFIG_COMPAT
		if (in_compat_syscall())
			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
						      sigsz);
		else
#endif
			ret = set_user_sigmask(sig, sigsz);

		if (ret)
			return ret;
	}

	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
	iowq.wq.private = current;
	INIT_LIST_HEAD(&iowq.wq.entry);
	iowq.ctx = ctx;
	/* snapshot the timeout count so io_should_wake() can detect new ones */
	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
	/* the CQ tail we need to reach, relative to the current CQ head */
	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;

	trace_io_uring_cqring_wait(ctx, min_events);
	do {
		/* if we can't even flush overflow, don't wait for more */
		if (!io_cqring_overflow_flush(ctx)) {
			ret = -EBUSY;
			break;
		}
		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
						TASK_INTERRUPTIBLE);
		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
		finish_wait(&ctx->cq_wait, &iowq.wq);
		cond_resched();
	} while (ret > 0);

	restore_saved_sigmask_unless(ret == -EINTR);

	/* report success whenever there is something in the CQ ring to reap */
	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
 7651
 7652static void io_free_page_table(void **table, size_t size)
 7653{
 7654	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
 7655
 7656	for (i = 0; i < nr_tables; i++)
 7657		kfree(table[i]);
 7658	kfree(table);
 7659}
 7660
 7661static __cold void **io_alloc_page_table(size_t size)
 7662{
 7663	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
 7664	size_t init_size = size;
 7665	void **table;
 7666
 7667	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
 7668	if (!table)
 7669		return NULL;
 7670
 7671	for (i = 0; i < nr_tables; i++) {
 7672		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
 7673
 7674		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
 7675		if (!table[i]) {
 7676			io_free_page_table(table, init_size);
 7677			return NULL;
 7678		}
 7679		size -= this_size;
 7680	}
 7681	return table;
 7682}
 7683
/* Tear down the percpu reference embedded in @ref_node and free the node. */
static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
	percpu_ref_exit(&ref_node->refs);
	kfree(ref_node);
}
 7689
/*
 * Release callback of a resource node's percpu ref (installed by
 * io_rsrc_node_alloc()). Marks the node done, moves every done node at the
 * front of ctx->rsrc_ref_list onto ctx->rsrc_put_llist, and schedules the
 * delayed put work if anything was queued.
 */
static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
	unsigned long flags;
	bool first_add = false;

	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
	node->done = true;

	/* @node is reused below as the list cursor */
	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					    struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (!node->done)
			break;
		list_del(&node->node);
		/* remember whether any llist_add() call returned true */
		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
	}
	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);

	if (first_add)
		mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
}
 7714
 7715static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
 7716{
 7717	struct io_rsrc_node *ref_node;
 7718
 7719	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
 7720	if (!ref_node)
 7721		return NULL;
 7722
 7723	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
 7724			    0, GFP_KERNEL)) {
 7725		kfree(ref_node);
 7726		return NULL;
 7727	}
 7728	INIT_LIST_HEAD(&ref_node->node);
 7729	INIT_LIST_HEAD(&ref_node->rsrc_list);
 7730	ref_node->done = false;
 7731	return ref_node;
 7732}
 7733
/*
 * Retire the current resource node, if @data_to_kill is given, and install
 * the backup node as the current one when there is no current node. Must be
 * called with ctx->uring_lock held. A backup node is expected to be in
 * place already (see io_rsrc_node_switch_start()); its absence triggers a
 * one-time warning.
 */
static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
				struct io_rsrc_data *data_to_kill)
	__must_hold(&ctx->uring_lock)
{
	WARN_ON_ONCE(!ctx->rsrc_backup_node);
	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);

	io_rsrc_refs_drop(ctx);

	if (data_to_kill) {
		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;

		/* tie the old node to the data it guards and queue it for reaping */
		rsrc_node->rsrc_data = data_to_kill;
		spin_lock_irq(&ctx->rsrc_ref_lock);
		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
		spin_unlock_irq(&ctx->rsrc_ref_lock);

		/* this data ref is dropped again in __io_rsrc_put_work() */
		atomic_inc(&data_to_kill->refs);
		percpu_ref_kill(&rsrc_node->refs);
		ctx->rsrc_node = NULL;
	}

	if (!ctx->rsrc_node) {
		ctx->rsrc_node = ctx->rsrc_backup_node;
		ctx->rsrc_backup_node = NULL;
	}
}
 7761
 7762static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
 7763{
 7764	if (ctx->rsrc_backup_node)
 7765		return 0;
 7766	ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
 7767	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
 7768}
 7769
/*
 * Wait until all references to @data are gone. ctx->uring_lock is dropped
 * while waiting and is held again on return.
 *
 * Returns 0 once quiesced, -ENXIO if a quiesce of @data is already in
 * progress, or a negative error from allocating the backup node or from
 * io_run_task_work_sig() when the wait is interrupted.
 */
static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	data->quiesce = true;
	do {
		ret = io_rsrc_node_switch_start(ctx);
		if (ret)
			break;
		io_rsrc_node_switch(ctx, data);

		/* kill initial ref, already quiesced if zero */
		if (atomic_dec_and_test(&data->refs))
			break;
		mutex_unlock(&ctx->uring_lock);
		flush_delayed_work(&ctx->rsrc_put_work);
		ret = wait_for_completion_interruptible(&data->done);
		if (!ret) {
			mutex_lock(&ctx->uring_lock);
			break;
		}

		/* interrupted: re-take the initial ref dropped above and retry */
		atomic_inc(&data->refs);
		/* wait for all works potentially completing data->done */
		flush_delayed_work(&ctx->rsrc_put_work);
		reinit_completion(&data->done);

		/* a negative result (pending signal) ends the loop below */
		ret = io_run_task_work_sig();
		mutex_lock(&ctx->uring_lock);
	} while (ret >= 0);
	data->quiesce = false;

	return ret;
}
 7809
 7810static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
 7811{
 7812	unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
 7813	unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
 7814
 7815	return &data->tags[table_idx][off];
 7816}
 7817
 7818static void io_rsrc_data_free(struct io_rsrc_data *data)
 7819{
 7820	size_t size = data->nr * sizeof(data->tags[0][0]);
 7821
 7822	if (data->tags)
 7823		io_free_page_table((void **)data->tags, size);
 7824	kfree(data);
 7825}
 7826
 7827static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
 7828				     u64 __user *utags, unsigned nr,
 7829				     struct io_rsrc_data **pdata)
 7830{
 7831	struct io_rsrc_data *data;
 7832	int ret = -ENOMEM;
 7833	unsigned i;
 7834
 7835	data = kzalloc(sizeof(*data), GFP_KERNEL);
 7836	if (!data)
 7837		return -ENOMEM;
 7838	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
 7839	if (!data->tags) {
 7840		kfree(data);
 7841		return -ENOMEM;
 7842	}
 7843
 7844	data->nr = nr;
 7845	data->ctx = ctx;
 7846	data->do_put = do_put;
 7847	if (utags) {
 7848		ret = -EFAULT;
 7849		for (i = 0; i < nr; i++) {
 7850			u64 *tag_slot = io_get_tag_slot(data, i);
 7851
 7852			if (copy_from_user(tag_slot, &utags[i],
 7853					   sizeof(*tag_slot)))
 7854				goto fail;
 7855		}
 7856	}
 7857
 7858	atomic_set(&data->refs, 1);
 7859	init_completion(&data->done);
 7860	*pdata = data;
 7861	return 0;
 7862fail:
 7863	io_rsrc_data_free(data);
 7864	return ret;
 7865}
 7866
 7867static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
 7868{
 7869	table->files = kvcalloc(nr_files, sizeof(table->files[0]),
 7870				GFP_KERNEL_ACCOUNT);
 7871	return !!table->files;
 7872}
 7873
/* Free the fixed-file table and reset the pointer so it is not left dangling. */
static void io_free_file_tables(struct io_file_table *table)
{
	kvfree(table->files);
	table->files = NULL;
}
 7879
/*
 * Drop all registered files and free the file table and its resource data.
 * With CONFIG_UNIX the skbs queued on the ring socket's receive queue are
 * freed; otherwise each registered file is fput() directly.
 */
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		struct sock *sock = ctx->ring_sock->sk;
		struct sk_buff *skb;

		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
			kfree_skb(skb);
	}
#else
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file;

		file = io_file_from_index(ctx, i);
		if (file)
			fput(file);
	}
#endif
	io_free_file_tables(&ctx->file_table);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}
 7906
 7907static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 7908{
 7909	int ret;
 7910
 7911	if (!ctx->file_data)
 7912		return -ENXIO;
 7913	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
 7914	if (!ret)
 7915		__io_sqe_files_unregister(ctx);
 7916	return ret;
 7917}
 7918
/*
 * Undo io_sq_thread_park(): drop this caller's park request and release
 * sqd->lock. Must not be called from the SQ poll thread itself.
 */
static void io_sq_thread_unpark(struct io_sq_data *sqd)
	__releases(&sqd->lock)
{
	WARN_ON_ONCE(sqd->thread == current);

	/*
	 * Do the dance but not conditional clear_bit() because it'd race with
	 * other threads incrementing park_pending and setting the bit.
	 */
	clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	/* other park requests are still outstanding: set the bit again */
	if (atomic_dec_return(&sqd->park_pending))
		set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	mutex_unlock(&sqd->lock);
}
 7933
/*
 * Ask the SQ poll thread to park and take sqd->lock. Returns with the lock
 * held; pair with io_sq_thread_unpark(). Must not be called from the SQ poll
 * thread itself.
 */
static void io_sq_thread_park(struct io_sq_data *sqd)
	__acquires(&sqd->lock)
{
	WARN_ON_ONCE(sqd->thread == current);

	atomic_inc(&sqd->park_pending);
	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	mutex_lock(&sqd->lock);
	/* wake the thread, if there is one, so it notices the park request */
	if (sqd->thread)
		wake_up_process(sqd->thread);
}
 7945
/*
 * Tell the SQ poll thread to stop and wait until it has signalled
 * sqd->exited. Must not be called from the SQ poll thread itself, and warns
 * if a stop had already been requested.
 */
static void io_sq_thread_stop(struct io_sq_data *sqd)
{
	WARN_ON_ONCE(sqd->thread == current);
	WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));

	set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
	mutex_lock(&sqd->lock);
	if (sqd->thread)
		wake_up_process(sqd->thread);
	mutex_unlock(&sqd->lock);
	wait_for_completion(&sqd->exited);
}
 7958
 7959static void io_put_sq_data(struct io_sq_data *sqd)
 7960{
 7961	if (refcount_dec_and_test(&sqd->refs)) {
 7962		WARN_ON_ONCE(atomic_read(&sqd->park_pending));
 7963
 7964		io_sq_thread_stop(sqd);
 7965		kfree(sqd);
 7966	}
 7967}
 7968
 7969static void io_sq_thread_finish(struct io_ring_ctx *ctx)
 7970{
 7971	struct io_sq_data *sqd = ctx->sq_data;
 7972
 7973	if (sqd) {
 7974		io_sq_thread_park(sqd);
 7975		list_del_init(&ctx->sqd_list);
 7976		io_sqd_update_thread_idle(sqd);
 7977		io_sq_thread_unpark(sqd);
 7978
 7979		io_put_sq_data(sqd);
 7980		ctx->sq_data = NULL;
 7981	}
 7982}
 7983
 7984static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
 7985{
 7986	struct io_ring_ctx *ctx_attach;
 7987	struct io_sq_data *sqd;
 7988	struct fd f;
 7989
 7990	f = fdget(p->wq_fd);
 7991	if (!f.file)
 7992		return ERR_PTR(-ENXIO);
 7993	if (f.file->f_op != &io_uring_fops) {
 7994		fdput(f);
 7995		return ERR_PTR(-EINVAL);
 7996	}
 7997
 7998	ctx_attach = f.file->private_data;
 7999	sqd = ctx_attach->sq_data;
 8000	if (!sqd) {
 8001		fdput(f);
 8002		return ERR_PTR(-EINVAL);
 8003	}
 8004	if (sqd->task_tgid != current->tgid) {
 8005		fdput(f);
 8006		return ERR_PTR(-EPERM);
 8007	}
 8008
 8009	refcount_inc(&sqd->refs);
 8010	fdput(f);
 8011	return sqd;
 8012}
 8013
 8014static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
 8015					 bool *attached)
 8016{
 8017	struct io_sq_data *sqd;
 8018
 8019	*attached = false;
 8020	if (p->flags & IORING_SETUP_ATTACH_WQ) {
 8021		sqd = io_attach_sq_data(p);
 8022		if (!IS_ERR(sqd)) {
 8023			*attached = true;
 8024			return sqd;
 8025		}
 8026		/* fall through for EPERM case, setup new sqd/task */
 8027		if (PTR_ERR(sqd) != -EPERM)
 8028			return sqd;
 8029	}
 8030
 8031	sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
 8032	if (!sqd)
 8033		return ERR_PTR(-ENOMEM);
 8034
 8035	atomic_set(&sqd->park_pending, 0);
 8036	refcount_set(&sqd->refs, 1);
 8037	INIT_LIST_HEAD(&sqd->ctx_list);
 8038	mutex_init(&sqd->lock);
 8039	init_waitqueue_head(&sqd->wait);
 8040	init_completion(&sqd->exited);
 8041	return sqd;
 8042}
 8043
 8044#if defined(CONFIG_UNIX)
 8045/*
 8046 * Ensure the UNIX gc is aware of our file set, so we are certain that
 8047 * the io_uring can be safely unregistered on process exit, even if we have
 8048 * loops in the file referencing.
 8049 */
 8050static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
 8051{
 8052	struct sock *sk = ctx->ring_sock->sk;
 8053	struct scm_fp_list *fpl;
 8054	struct sk_buff *skb;
 8055	int i, nr_files;
 8056
 8057	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
 8058	if (!fpl)
 8059		return -ENOMEM;
 8060
 8061	skb = alloc_skb(0, GFP_KERNEL);
 8062	if (!skb) {
 8063		kfree(fpl);
 8064		return -ENOMEM;
 8065	}
 8066
 8067	skb->sk = sk;
 8068
 8069	nr_files = 0;
 8070	fpl->user = get_uid(current_user());
 8071	for (i = 0; i < nr; i++) {
 8072		struct file *file = io_file_from_index(ctx, i + offset);
 8073
 8074		if (!file)
 8075			continue;
 8076		fpl->fp[nr_files] = get_file(file);
 8077		unix_inflight(fpl->user, fpl->fp[nr_files]);
 8078		nr_files++;
 8079	}
 8080
 8081	if (nr_files) {
 8082		fpl->max = SCM_MAX_FD;
 8083		fpl->count = nr_files;
 8084		UNIXCB(skb).fp = fpl;
 8085		skb->destructor = unix_destruct_scm;
 8086		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
 8087		skb_queue_head(&sk->sk_receive_queue, skb);
 8088
 8089		for (i = 0; i < nr_files; i++)
 8090			fput(fpl->fp[i]);
 8091	} else {
 8092		kfree_skb(skb);
 8093		kfree(fpl);
 8094	}
 8095
 8096	return 0;
 8097}
 8098
 8099/*
 8100 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
 8101 * causes regular reference counting to break down. We rely on the UNIX
 8102 * garbage collection to take care of this problem for us.
 8103 */
 8104static int io_sqe_files_scm(struct io_ring_ctx *ctx)
 8105{
 8106	unsigned left, total;
 8107	int ret = 0;
 8108
 8109	total = 0;
 8110	left = ctx->nr_user_files;
 8111	while (left) {
 8112		unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
 8113
 8114		ret = __io_sqe_files_scm(ctx, this_files, total);
 8115		if (ret)
 8116			break;
 8117		left -= this_files;
 8118		total += this_files;
 8119	}
 8120
 8121	if (!ret)
 8122		return 0;
 8123
 8124	while (total < ctx->nr_user_files) {
 8125		struct file *file = io_file_from_index(ctx, total);
 8126
 8127		if (file)
 8128			fput(file);
 8129		total++;
 8130	}
 8131
 8132	return ret;
 8133}
 8134#else
/* Without CONFIG_UNIX there is nothing to account; always succeeds. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
	return 0;
}
 8139#endif
 8140
/*
 * Release one registered file described by @prsrc. With CONFIG_UNIX the file
 * is first removed from the skb on the ring socket's receive queue that
 * carries it; without CONFIG_UNIX the file is simply fput().
 */
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	/* private list holding the skbs examined so far */
	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			/* close the gap left by the removed entry */
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
						left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				/* that was the last file in this skb */
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			/* file == NULL marks "found" for the check below */
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	/* put the skbs we pulled off back onto the receive queue */
	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}
 8203
/*
 * Process one retired resource node: for every resource queued on it, post
 * a CQE if the resource carries a non-zero tag, release the resource via the
 * data's ->do_put() callback and free the queue entry. The node is then
 * destroyed and one reference on the resource data is dropped; the last
 * reference completes rsrc_data->done.
 */
static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
	struct io_ring_ctx *ctx = rsrc_data->ctx;
	struct io_rsrc_put *prsrc, *tmp;

	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
		list_del(&prsrc->list);

		if (prsrc->tag) {
			/* the ring lock is only requested for IOPOLL rings */
			bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;

			io_ring_submit_lock(ctx, lock_ring);
			spin_lock(&ctx->completion_lock);
			/* post a CQE with the tag as first argument, res and flags 0 */
			io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
			ctx->cq_extra++;
			io_commit_cqring(ctx);
			spin_unlock(&ctx->completion_lock);
			io_cqring_ev_posted(ctx);
			io_ring_submit_unlock(ctx, lock_ring);
		}

		rsrc_data->do_put(ctx, prsrc);
		kfree(prsrc);
	}

	io_rsrc_node_destroy(ref_node);
	if (atomic_dec_and_test(&rsrc_data->refs))
		complete(&rsrc_data->done);
}
 8234
 8235static void io_rsrc_put_work(struct work_struct *work)
 8236{
 8237	struct io_ring_ctx *ctx;
 8238	struct llist_node *node;
 8239
 8240	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
 8241	node = llist_del_all(&ctx->rsrc_put_llist);
 8242
 8243	while (node) {
 8244		struct io_rsrc_node *ref_node;
 8245		struct llist_node *next = node->next;
 8246
 8247		ref_node = llist_entry(node, struct io_rsrc_node, llist);
 8248		__io_rsrc_put_work(ref_node);
 8249		node = next;
 8250	}
 8251}
 8252
/*
 * Register a set of fixed files with the ring.
 *
 * @arg:     user array of @nr_args file descriptors; -1 leaves a slot empty
 * @nr_args: number of descriptors
 * @tags:    optional user array of @nr_args tags
 *
 * Returns 0 on success. Errors: -EBUSY if files are already registered,
 * -EINVAL for zero descriptors or a tag on an empty slot, -EMFILE if
 * @nr_args exceeds IORING_MAX_FIXED_FILES or RLIMIT_NOFILE, -EFAULT if a
 * descriptor cannot be read, -EBADF for an invalid or io_uring descriptor,
 * or an allocation / accounting error.
 */
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
				 unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	ret = -ENOMEM;
	if (!io_alloc_file_tables(&ctx->file_table, nr_args))
		goto out_free;

	/* ->nr_user_files counts the slots processed so far, for cleanup */
	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto out_fput;
		}
		/* allow sparse sets */
		if (fd == -1) {
			/* an empty slot must not carry a tag */
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto out_fput;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto out_fput;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (file->f_op == &io_uring_fops) {
			fput(file);
			goto out_fput;
		}
		io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
	}

	ret = io_sqe_files_scm(ctx);
	if (ret) {
		__io_sqe_files_unregister(ctx);
		return ret;
	}

	io_rsrc_node_switch(ctx, NULL);
	return ret;
out_fput:
	/* drop the references of the files installed so far */
	for (i = 0; i < ctx->nr_user_files; i++) {
		file = io_file_from_index(ctx, i);
		if (file)
			fput(file);
	}
	io_free_file_tables(&ctx->file_table);
	ctx->nr_user_files = 0;
out_free:
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	return ret;
}
 8334
/*
 * Account a single newly installed fixed file at slot @index. With
 * CONFIG_UNIX the file is added to the skb at the head of the ring socket's
 * receive queue if that skb has room, otherwise a new skb is set up via
 * __io_sqe_files_scm(). Without CONFIG_UNIX this does nothing.
 *
 * Returns 0 on success or the error from __io_sqe_files_scm().
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
				int index)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sock->sk_receive_queue;
	struct sk_buff *skb;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		if (fpl->count < SCM_MAX_FD) {
			/* take the skb off the queue while its fp list is modified */
			__skb_unlink(skb, head);
			spin_unlock_irq(&head->lock);
			fpl->fp[fpl->count] = get_file(file);
			unix_inflight(fpl->user, fpl->fp[fpl->count]);
			fpl->count++;
			spin_lock_irq(&head->lock);
			__skb_queue_head(head, skb);
		} else {
			skb = NULL;
		}
	}
	spin_unlock_irq(&head->lock);

	if (skb) {
		/* merged: the fp list took its own reference via get_file() above */
		fput(file);
		return 0;
	}

	return __io_sqe_files_scm(ctx, 1, index);
#else
	return 0;
#endif
}
 8377
 8378static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 8379				 struct io_rsrc_node *node, void *rsrc)
 8380{
 8381	struct io_rsrc_put *prsrc;
 8382
 8383	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
 8384	if (!prsrc)
 8385		return -ENOMEM;
 8386
 8387	prsrc->tag = *io_get_tag_slot(data, idx);
 8388	prsrc->rsrc = rsrc;
 8389	list_add(&prsrc->list, &node->rsrc_list);
 8390	return 0;
 8391}
 8392
/*
 * Install @file into fixed-file slot @slot_index of the request's ring,
 * replacing (and queueing for removal) any file already there.
 *
 * Returns 0 on success. Errors: -EBADF if @file is an io_uring file, -ENXIO
 * if no file table is registered, -EINVAL if @slot_index is out of range,
 * or an allocation / accounting error. On any failure the reference to
 * @file is dropped with fput().
 */
static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
				 unsigned int issue_flags, u32 slot_index)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
	bool needs_switch = false;
	struct io_fixed_file *file_slot;
	int ret = -EBADF;

	io_ring_submit_lock(ctx, needs_lock);
	if (file->f_op == &io_uring_fops)
		goto err;
	ret = -ENXIO;
	if (!ctx->file_data)
		goto err;
	ret = -EINVAL;
	if (slot_index >= ctx->nr_user_files)
		goto err;

	/* clamp the index under speculation before using it for lookups */
	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
	file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);

	if (file_slot->file_ptr) {
		struct file *old_file;

		ret = io_rsrc_node_switch_start(ctx);
		if (ret)
			goto err;

		/* extract the file pointer from the slot value using FFS_MASK */
		old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
		ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
					    ctx->rsrc_node, old_file);
		if (ret)
			goto err;
		file_slot->file_ptr = 0;
		needs_switch = true;
	}

	/* reset the slot's tag to zero before installing the new file */
	*io_get_tag_slot(ctx->file_data, slot_index) = 0;
	io_fixed_file_set(file_slot, file);
	ret = io_sqe_file_register(ctx, file, slot_index);
	if (ret) {
		file_slot->file_ptr = 0;
		goto err;
	}

	ret = 0;
err:
	/* reached on success too; the switch retires the node holding the old file */
	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->file_data);
	io_ring_submit_unlock(ctx, needs_lock);
	if (ret)
		fput(file);
	return ret;
}
 8448
 8449static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
 8450{
 8451	unsigned int offset = req->close.file_slot - 1;
 8452	struct io_ring_ctx *ctx = req->ctx;
 8453	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 8454	struct io_fixed_file *file_slot;
 8455	struct file *file;
 8456	int ret, i;
 8457
 8458	io_ring_submit_lock(ctx, needs_lock);
 8459	ret = -ENXIO;
 8460	if (unlikely(!ctx->file_data))
 8461		goto out;
 8462	ret = -EINVAL;
 8463	if (offset >= ctx->nr_user_files)
 8464		goto out;
 8465	ret = io_rsrc_node_switch_start(ctx);
 8466	if (ret)
 8467		goto out;
 8468
 8469	i = array_index_nospec(offset, ctx->nr_user_files);
 8470	file_slot = io_fixed_file_slot(&ctx->file_table, i);
 8471	ret = -EBADF;
 8472	if (!file_slot->file_ptr)
 8473		goto out;
 8474
 8475	file = (struct file *)(file_slot->file_ptr & FFS_MASK);
 8476	ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
 8477	if (ret)
 8478		goto out;
 8479
 8480	file_slot->file_ptr = 0;
 8481	io_rsrc_node_switch(ctx, ctx->file_data);
 8482	ret = 0;
 8483out:
 8484	io_ring_submit_unlock(ctx, needs_lock);
 8485	return ret;
 8486}
 8487
 8488static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 8489				 struct io_uring_rsrc_update2 *up,
 8490				 unsigned nr_args)
 8491{
 8492	u64 __user *tags = u64_to_user_ptr(up->tags);
 8493	__s32 __user *fds = u64_to_user_ptr(up->data);
 8494	struct io_rsrc_data *data = ctx->file_data;
 8495	struct io_fixed_file *file_slot;
 8496	struct file *file;
 8497	int fd, i, err = 0;
 8498	unsigned int done;
 8499	bool needs_switch = false;
 8500
 8501	if (!ctx->file_data)
 8502		return -ENXIO;
 8503	if (up->offset + nr_args > ctx->nr_user_files)
 8504		return -EINVAL;
 8505
 8506	for (done = 0; done < nr_args; done++) {
 8507		u64 tag = 0;
 8508
 8509		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
 8510		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
 8511			err = -EFAULT;
 8512			break;
 8513		}
 8514		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
 8515			err = -EINVAL;
 8516			break;
 8517		}
 8518		if (fd == IORING_REGISTER_FILES_SKIP)
 8519			continue;
 8520
 8521		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
 8522		file_slot = io_fixed_file_slot(&ctx->file_table, i);
 8523
 8524		if (file_slot->file_ptr) {
 8525			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
 8526			err = io_queue_rsrc_removal(data, up->offset + done,
 8527						    ctx->rsrc_node, file);
 8528			if (err)
 8529				break;
 8530			file_slot->file_ptr = 0;
 8531			needs_switch = true;
 8532		}
 8533		if (fd != -1) {
 8534			file = fget(fd);
 8535			if (!file) {
 8536				err = -EBADF;
 8537				break;
 8538			}
 8539			/*
 8540			 * Don't allow io_uring instances to be registered. If
 8541			 * UNIX isn't enabled, then this causes a reference
 8542			 * cycle and this instance can never get freed. If UNIX
 8543			 * is enabled we'll handle it just fine, but there's
 8544			 * still no point in allowing a ring fd as it doesn't
 8545			 * support regular read/write anyway.
 8546			 */
 8547			if (file->f_op == &io_uring_fops) {
 8548				fput(file);
 8549				err = -EBADF;
 8550				break;
 8551			}
 8552			*io_get_tag_slot(data, up->offset + done) = tag;
 8553			io_fixed_file_set(file_slot, file);
 8554			err = io_sqe_file_register(ctx, file, i);
 8555			if (err) {
 8556				file_slot->file_ptr = 0;
 8557				fput(file);
 8558				break;
 8559			}
 8560		}
 8561	}
 8562
 8563	if (needs_switch)
 8564		io_rsrc_node_switch(ctx, data);
 8565	return done ? done : err;
 8566}
 8567
 8568static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
 8569					struct task_struct *task)
 8570{
 8571	struct io_wq_hash *hash;
 8572	struct io_wq_data data;
 8573	unsigned int concurrency;
 8574
 8575	mutex_lock(&ctx->uring_lock);
 8576	hash = ctx->hash_map;
 8577	if (!hash) {
 8578		hash = kzalloc(sizeof(*hash), GFP_KERNEL);
 8579		if (!hash) {
 8580			mutex_unlock(&ctx->uring_lock);
 8581			return ERR_PTR(-ENOMEM);
 8582		}
 8583		refcount_set(&hash->refs, 1);
 8584		init_waitqueue_head(&hash->wait);
 8585		ctx->hash_map = hash;
 8586	}
 8587	mutex_unlock(&ctx->uring_lock);
 8588
 8589	data.hash = hash;
 8590	data.task = task;
 8591	data.free_work = io_wq_free_work;
 8592	data.do_work = io_wq_submit_work;
 8593
 8594	/* Do QD, or 4 * CPUS, whatever is smallest */
 8595	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
 8596
 8597	return io_wq_create(concurrency, &data);
 8598}
 8599
 8600static __cold int io_uring_alloc_task_context(struct task_struct *task,
 8601					      struct io_ring_ctx *ctx)
 8602{
 8603	struct io_uring_task *tctx;
 8604	int ret;
 8605
 8606	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
 8607	if (unlikely(!tctx))
 8608		return -ENOMEM;
 8609
 8610	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
 8611	if (unlikely(ret)) {
 8612		kfree(tctx);
 8613		return ret;
 8614	}
 8615
 8616	tctx->io_wq = io_init_wq_offload(ctx, task);
 8617	if (IS_ERR(tctx->io_wq)) {
 8618		ret = PTR_ERR(tctx->io_wq);
 8619		percpu_counter_destroy(&tctx->inflight);
 8620		kfree(tctx);
 8621		return ret;
 8622	}
 8623
 8624	xa_init(&tctx->xa);
 8625	init_waitqueue_head(&tctx->wait);
 8626	atomic_set(&tctx->in_idle, 0);
 8627	atomic_set(&tctx->inflight_tracked, 0);
 8628	task->io_uring = tctx;
 8629	spin_lock_init(&tctx->task_lock);
 8630	INIT_WQ_LIST(&tctx->task_list);
 8631	init_task_work(&tctx->task_work, tctx_task_work);
 8632	return 0;
 8633}
 8634
 8635void __io_uring_free(struct task_struct *tsk)
 8636{
 8637	struct io_uring_task *tctx = tsk->io_uring;
 8638
 8639	WARN_ON_ONCE(!xa_empty(&tctx->xa));
 8640	WARN_ON_ONCE(tctx->io_wq);
 8641	WARN_ON_ONCE(tctx->cached_refs);
 8642
 8643	percpu_counter_destroy(&tctx->inflight);
 8644	kfree(tctx);
 8645	tsk->io_uring = NULL;
 8646}
 8647
/*
 * Set up submission offload for @ctx according to ctx->flags and @p.
 *
 * Without IORING_SETUP_SQPOLL this only validates flags: an
 * IORING_SETUP_ATTACH_WQ request must name an io_uring file in p->wq_fd, and
 * IORING_SETUP_SQ_AFF is rejected.  With IORING_SETUP_SQPOLL the ring is
 * added to the io_sq_data returned by io_get_sq_data() and, unless that
 * helper reported @attached, a new thread running io_sq_thread() is created
 * for it.
 *
 * Returns 0 on success or a negative errno.  Every failure except the early
 * wq_fd validation and the security hook goes through io_sq_thread_finish().
 */
static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
				       struct io_uring_params *p)
{
	int ret;

	/* Retain compatibility with failing for an invalid attach attempt */
	if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
				IORING_SETUP_ATTACH_WQ) {
		struct fd f;

		/* the fd is only validated here, the reference is not kept */
		f = fdget(p->wq_fd);
		if (!f.file)
			return -ENXIO;
		if (f.file->f_op != &io_uring_fops) {
			fdput(f);
			return -EINVAL;
		}
		fdput(f);
	}
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		struct task_struct *tsk;
		struct io_sq_data *sqd;
		bool attached;

		ret = security_uring_sqpoll();
		if (ret)
			return ret;

		sqd = io_get_sq_data(p, &attached);
		if (IS_ERR(sqd)) {
			ret = PTR_ERR(sqd);
			goto err;
		}

		ctx->sq_creds = get_current_cred();
		ctx->sq_data = sqd;
		/* an idle time of 0 from userspace means the default of HZ */
		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
		if (!ctx->sq_thread_idle)
			ctx->sq_thread_idle = HZ;

		/* the ctx list is modified with the SQ thread parked */
		io_sq_thread_park(sqd);
		list_add(&ctx->sqd_list, &sqd->ctx_list);
		io_sqd_update_thread_idle(sqd);
		/* don't attach to a dying SQPOLL thread, would be racy */
		ret = (attached && !sqd->thread) ? -ENXIO : 0;
		io_sq_thread_unpark(sqd);

		if (ret < 0)
			goto err;
		/* nothing more to set up when sharing an existing sqd */
		if (attached)
			return 0;

		if (p->flags & IORING_SETUP_SQ_AFF) {
			int cpu = p->sq_thread_cpu;

			ret = -EINVAL;
			if (cpu >= nr_cpu_ids || !cpu_online(cpu))
				goto err_sqpoll;
			sqd->sq_cpu = cpu;
		} else {
			sqd->sq_cpu = -1;
		}

		sqd->task_pid = current->pid;
		sqd->task_tgid = current->tgid;
		tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
		if (IS_ERR(tsk)) {
			ret = PTR_ERR(tsk);
			goto err_sqpoll;
		}

		sqd->thread = tsk;
		/* the new thread is woken even if its task context failed */
		ret = io_uring_alloc_task_context(tsk, ctx);
		wake_up_new_task(tsk);
		if (ret)
			goto err;
	} else if (p->flags & IORING_SETUP_SQ_AFF) {
		/* Can't have SQ_AFF without SQPOLL */
		ret = -EINVAL;
		goto err;
	}

	return 0;
err_sqpoll:
	/* no thread was started, so signal ->exited on its behalf */
	complete(&ctx->sq_data->exited);
err:
	io_sq_thread_finish(ctx);
	return ret;
}
 8737
 8738static inline void __io_unaccount_mem(struct user_struct *user,
 8739				      unsigned long nr_pages)
 8740{
 8741	atomic_long_sub(nr_pages, &user->locked_vm);
 8742}
 8743
 8744static inline int __io_account_mem(struct user_struct *user,
 8745				   unsigned long nr_pages)
 8746{
 8747	unsigned long page_limit, cur_pages, new_pages;
 8748
 8749	/* Don't allow more pages than we can safely lock */
 8750	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 8751
 8752	do {
 8753		cur_pages = atomic_long_read(&user->locked_vm);
 8754		new_pages = cur_pages + nr_pages;
 8755		if (new_pages > page_limit)
 8756			return -ENOMEM;
 8757	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
 8758					new_pages) != cur_pages);
 8759
 8760	return 0;
 8761}
 8762
 8763static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 8764{
 8765	if (ctx->user)
 8766		__io_unaccount_mem(ctx->user, nr_pages);
 8767
 8768	if (ctx->mm_account)
 8769		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
 8770}
 8771
 8772static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 8773{
 8774	int ret;
 8775
 8776	if (ctx->user) {
 8777		ret = __io_account_mem(ctx->user, nr_pages);
 8778		if (ret)
 8779			return ret;
 8780	}
 8781
 8782	if (ctx->mm_account)
 8783		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
 8784
 8785	return 0;
 8786}
 8787
 8788static void io_mem_free(void *ptr)
 8789{
 8790	struct page *page;
 8791
 8792	if (!ptr)
 8793		return;
 8794
 8795	page = virt_to_head_page(ptr);
 8796	if (put_page_testzero(page))
 8797		free_compound_page(page);
 8798}
 8799
 8800static void *io_mem_alloc(size_t size)
 8801{
 8802	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
 8803				__GFP_NORETRY | __GFP_ACCOUNT;
 8804
 8805	return (void *) __get_free_pages(gfp_flags, get_order(size));
 8806}
 8807
 8808static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
 8809				size_t *sq_offset)
 8810{
 8811	struct io_rings *rings;
 8812	size_t off, sq_array_size;
 8813
 8814	off = struct_size(rings, cqes, cq_entries);
 8815	if (off == SIZE_MAX)
 8816		return SIZE_MAX;
 8817
 8818#ifdef CONFIG_SMP
 8819	off = ALIGN(off, SMP_CACHE_BYTES);
 8820	if (off == 0)
 8821		return SIZE_MAX;
 8822#endif
 8823
 8824	if (sq_offset)
 8825		*sq_offset = off;
 8826
 8827	sq_array_size = array_size(sizeof(u32), sq_entries);
 8828	if (sq_array_size == SIZE_MAX)
 8829		return SIZE_MAX;
 8830
 8831	if (check_add_overflow(off, sq_array_size, &off))
 8832		return SIZE_MAX;
 8833
 8834	return off;
 8835}
 8836
 8837static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
 8838{
 8839	struct io_mapped_ubuf *imu = *slot;
 8840	unsigned int i;
 8841
 8842	if (imu != ctx->dummy_ubuf) {
 8843		for (i = 0; i < imu->nr_bvecs; i++)
 8844			unpin_user_page(imu->bvec[i].bv_page);
 8845		if (imu->acct_pages)
 8846			io_unaccount_mem(ctx, imu->acct_pages);
 8847		kvfree(imu);
 8848	}
 8849	*slot = NULL;
 8850}
 8851
 8852static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
 8853{
 8854	io_buffer_unmap(ctx, &prsrc->buf);
 8855	prsrc->buf = NULL;
 8856}
 8857
 8858static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 8859{
 8860	unsigned int i;
 8861
 8862	for (i = 0; i < ctx->nr_user_bufs; i++)
 8863		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
 8864	kfree(ctx->user_bufs);
 8865	io_rsrc_data_free(ctx->buf_data);
 8866	ctx->user_bufs = NULL;
 8867	ctx->buf_data = NULL;
 8868	ctx->nr_user_bufs = 0;
 8869}
 8870
 8871static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 8872{
 8873	int ret;
 8874
 8875	if (!ctx->buf_data)
 8876		return -ENXIO;
 8877
 8878	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
 8879	if (!ret)
 8880		__io_sqe_buffers_unregister(ctx);
 8881	return ret;
 8882}
 8883
 8884static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
 8885		       void __user *arg, unsigned index)
 8886{
 8887	struct iovec __user *src;
 8888
 8889#ifdef CONFIG_COMPAT
 8890	if (ctx->compat) {
 8891		struct compat_iovec __user *ciovs;
 8892		struct compat_iovec ciov;
 8893
 8894		ciovs = (struct compat_iovec __user *) arg;
 8895		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
 8896			return -EFAULT;
 8897
 8898		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
 8899		dst->iov_len = ciov.iov_len;
 8900		return 0;
 8901	}
 8902#endif
 8903	src = (struct iovec __user *) arg;
 8904	if (copy_from_user(dst, &src[index], sizeof(*dst)))
 8905		return -EFAULT;
 8906	return 0;
 8907}
 8908
 8909/*
 8910 * Not super efficient, but this is just a registration time. And we do cache
 8911 * the last compound head, so generally we'll only do a full search if we don't
 8912 * match that one.
 8913 *
 8914 * We check if the given compound head page has already been accounted, to
 8915 * avoid double accounting it. This allows us to account the full size of the
 8916 * page, not just the constituent pages of a huge page.
 8917 */
 8918static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
 8919				  int nr_pages, struct page *hpage)
 8920{
 8921	int i, j;
 8922
 8923	/* check current page array */
 8924	for (i = 0; i < nr_pages; i++) {
 8925		if (!PageCompound(pages[i]))
 8926			continue;
 8927		if (compound_head(pages[i]) == hpage)
 8928			return true;
 8929	}
 8930
 8931	/* check previously registered pages */
 8932	for (i = 0; i < ctx->nr_user_bufs; i++) {
 8933		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
 8934
 8935		for (j = 0; j < imu->nr_bvecs; j++) {
 8936			if (!PageCompound(imu->bvec[j].bv_page))
 8937				continue;
 8938			if (compound_head(imu->bvec[j].bv_page) == hpage)
 8939				return true;
 8940		}
 8941	}
 8942
 8943	return false;
 8944}
 8945
 8946static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 8947				 int nr_pages, struct io_mapped_ubuf *imu,
 8948				 struct page **last_hpage)
 8949{
 8950	int i, ret;
 8951
 8952	imu->acct_pages = 0;
 8953	for (i = 0; i < nr_pages; i++) {
 8954		if (!PageCompound(pages[i])) {
 8955			imu->acct_pages++;
 8956		} else {
 8957			struct page *hpage;
 8958
 8959			hpage = compound_head(pages[i]);
 8960			if (hpage == *last_hpage)
 8961				continue;
 8962			*last_hpage = hpage;
 8963			if (headpage_already_acct(ctx, pages, i, hpage))
 8964				continue;
 8965			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
 8966		}
 8967	}
 8968
 8969	if (!imu->acct_pages)
 8970		return 0;
 8971
 8972	ret = io_account_mem(ctx, imu->acct_pages);
 8973	if (ret)
 8974		imu->acct_pages = 0;
 8975	return ret;
 8976}
 8977
/*
 * Pin the user memory described by @iov and build an io_mapped_ubuf for it.
 *
 * A NULL iov_base selects the ring's shared dummy buffer.  Otherwise the
 * pages covering the range are pinned for long-term write access, rejected
 * with -EOPNOTSUPP if any of them is file backed (shmem and hugetlbfs are
 * allowed), accounted via io_buffer_account_pin() and described by one
 * bio_vec per page.
 *
 * On success *@pimu points at the new descriptor (or the dummy buffer) and
 * 0 is returned.  On failure *@pimu is NULL, any pinned pages are released
 * and a negative errno is returned.  @last_hpage is the accounting cache
 * passed through to io_buffer_account_pin().
 */
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	unsigned long off, start, end, ubuf;
	size_t size;
	int ret, pret, nr_pages, i;

	if (!iov->iov_base) {
		*pimu = ctx->dummy_ubuf;
		return 0;
	}

	/* number of pages touched by [iov_base, iov_base + iov_len) */
	ubuf = (unsigned long) iov->iov_base;
	end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	*pimu = NULL;
	ret = -ENOMEM;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			struct vm_area_struct *vma = vmas[i];

			if (vma_is_shmem(vma))
				continue;
			if (vma->vm_file &&
			    !is_file_hugepages(vma->vm_file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
	} else {
		/* a short pin count is reported as -EFAULT */
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}

	ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, pret);
		goto done;
	}

	/* only the first bvec starts at a non-zero offset within its page */
	off = ubuf & ~PAGE_MASK;
	size = iov->iov_len;
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		imu->bvec[i].bv_page = pages[i];
		imu->bvec[i].bv_len = vec_len;
		imu->bvec[i].bv_offset = off;
		off = 0;
		size -= vec_len;
	}
	/* store original address for later verification */
	imu->ubuf = ubuf;
	imu->ubuf_end = ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;
done:
	/* imu is handed to the caller on success, the scratch arrays never */
	if (ret)
		kvfree(imu);
	kvfree(pages);
	kvfree(vmas);
	return ret;
}
 9077
 9078static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
 9079{
 9080	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
 9081	return ctx->user_bufs ? 0 : -ENOMEM;
 9082}
 9083
 9084static int io_buffer_validate(struct iovec *iov)
 9085{
 9086	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
 9087
 9088	/*
 9089	 * Don't impose further limits on the size and buffer
 9090	 * constraints here, we'll -EINVAL later when IO is
 9091	 * submitted if they are wrong.
 9092	 */
 9093	if (!iov->iov_base)
 9094		return iov->iov_len ? -EFAULT : 0;
 9095	if (!iov->iov_len)
 9096		return -EFAULT;
 9097
 9098	/* arbitrary limit, but we need something */
 9099	if (iov->iov_len > SZ_1G)
 9100		return -EFAULT;
 9101
 9102	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
 9103		return -EOVERFLOW;
 9104
 9105	return 0;
 9106}
 9107
 9108static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 9109				   unsigned int nr_args, u64 __user *tags)
 9110{
 9111	struct page *last_hpage = NULL;
 9112	struct io_rsrc_data *data;
 9113	int i, ret;
 9114	struct iovec iov;
 9115
 9116	if (ctx->user_bufs)
 9117		return -EBUSY;
 9118	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
 9119		return -EINVAL;
 9120	ret = io_rsrc_node_switch_start(ctx);
 9121	if (ret)
 9122		return ret;
 9123	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
 9124	if (ret)
 9125		return ret;
 9126	ret = io_buffers_map_alloc(ctx, nr_args);
 9127	if (ret) {
 9128		io_rsrc_data_free(data);
 9129		return ret;
 9130	}
 9131
 9132	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
 9133		ret = io_copy_iov(ctx, &iov, arg, i);
 9134		if (ret)
 9135			break;
 9136		ret = io_buffer_validate(&iov);
 9137		if (ret)
 9138			break;
 9139		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
 9140			ret = -EINVAL;
 9141			break;
 9142		}
 9143
 9144		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
 9145					     &last_hpage);
 9146		if (ret)
 9147			break;
 9148	}
 9149
 9150	WARN_ON_ONCE(ctx->buf_data);
 9151
 9152	ctx->buf_data = data;
 9153	if (ret)
 9154		__io_sqe_buffers_unregister(ctx);
 9155	else
 9156		io_rsrc_node_switch(ctx, NULL);
 9157	return ret;
 9158}
 9159
 9160static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 9161				   struct io_uring_rsrc_update2 *up,
 9162				   unsigned int nr_args)
 9163{
 9164	u64 __user *tags = u64_to_user_ptr(up->tags);
 9165	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
 9166	struct page *last_hpage = NULL;
 9167	bool needs_switch = false;
 9168	__u32 done;
 9169	int i, err;
 9170
 9171	if (!ctx->buf_data)
 9172		return -ENXIO;
 9173	if (up->offset + nr_args > ctx->nr_user_bufs)
 9174		return -EINVAL;
 9175
 9176	for (done = 0; done < nr_args; done++) {
 9177		struct io_mapped_ubuf *imu;
 9178		int offset = up->offset + done;
 9179		u64 tag = 0;
 9180
 9181		err = io_copy_iov(ctx, &iov, iovs, done);
 9182		if (err)
 9183			break;
 9184		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
 9185			err = -EFAULT;
 9186			break;
 9187		}
 9188		err = io_buffer_validate(&iov);
 9189		if (err)
 9190			break;
 9191		if (!iov.iov_base && tag) {
 9192			err = -EINVAL;
 9193			break;
 9194		}
 9195		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
 9196		if (err)
 9197			break;
 9198
 9199		i = array_index_nospec(offset, ctx->nr_user_bufs);
 9200		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
 9201			err = io_queue_rsrc_removal(ctx->buf_data, offset,
 9202						    ctx->rsrc_node, ctx->user_bufs[i]);
 9203			if (unlikely(err)) {
 9204				io_buffer_unmap(ctx, &imu);
 9205				break;
 9206			}
 9207			ctx->user_bufs[i] = NULL;
 9208			needs_switch = true;
 9209		}
 9210
 9211		ctx->user_bufs[i] = imu;
 9212		*io_get_tag_slot(ctx->buf_data, offset) = tag;
 9213	}
 9214
 9215	if (needs_switch)
 9216		io_rsrc_node_switch(ctx, ctx->buf_data);
 9217	return done ? done : err;
 9218}
 9219
 9220static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
 9221{
 9222	__s32 __user *fds = arg;
 9223	int fd;
 9224
 9225	if (ctx->cq_ev_fd)
 9226		return -EBUSY;
 9227
 9228	if (copy_from_user(&fd, fds, sizeof(*fds)))
 9229		return -EFAULT;
 9230
 9231	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
 9232	if (IS_ERR(ctx->cq_ev_fd)) {
 9233		int ret = PTR_ERR(ctx->cq_ev_fd);
 9234
 9235		ctx->cq_ev_fd = NULL;
 9236		return ret;
 9237	}
 9238
 9239	return 0;
 9240}
 9241
 9242static int io_eventfd_unregister(struct io_ring_ctx *ctx)
 9243{
 9244	if (ctx->cq_ev_fd) {
 9245		eventfd_ctx_put(ctx->cq_ev_fd);
 9246		ctx->cq_ev_fd = NULL;
 9247		return 0;
 9248	}
 9249
 9250	return -ENXIO;
 9251}
 9252
 9253static void io_destroy_buffers(struct io_ring_ctx *ctx)
 9254{
 9255	struct io_buffer *buf;
 9256	unsigned long index;
 9257
 9258	xa_for_each(&ctx->io_buffers, index, buf) {
 9259		__io_remove_buffers(ctx, buf, index, -1U);
 9260		cond_resched();
 9261	}
 9262}
 9263
 9264static void io_req_caches_free(struct io_ring_ctx *ctx)
 9265{
 9266	struct io_submit_state *state = &ctx->submit_state;
 9267	int nr = 0;
 9268
 9269	mutex_lock(&ctx->uring_lock);
 9270	io_flush_cached_locked_reqs(ctx, state);
 9271
 9272	while (state->free_list.next) {
 9273		struct io_wq_work_node *node;
 9274		struct io_kiocb *req;
 9275
 9276		node = wq_stack_extract(&state->free_list);
 9277		req = container_of(node, struct io_kiocb, comp_list);
 9278		kmem_cache_free(req_cachep, req);
 9279		nr++;
 9280	}
 9281	if (nr)
 9282		percpu_ref_put_many(&ctx->refs, nr);
 9283	mutex_unlock(&ctx->uring_lock);
 9284}
 9285
 9286static void io_wait_rsrc_data(struct io_rsrc_data *data)
 9287{
 9288	if (data && !atomic_dec_and_test(&data->refs))
 9289		wait_for_completion(&data->done);
 9290}
 9291
/*
 * Final teardown of @ctx: stop the SQ thread association, release every
 * registered resource (buffers, files, eventfd, provided buffers, creds),
 * destroy the rsrc nodes, free the ring memory and finally free the
 * context itself.  Called from io_ring_exit_work() once the ring is idle.
 */
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
	io_sq_thread_finish(ctx);

	if (ctx->mm_account) {
		mmdrop(ctx->mm_account);
		ctx->mm_account = NULL;
	}

	io_rsrc_refs_drop(ctx);
	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
	io_wait_rsrc_data(ctx->buf_data);
	io_wait_rsrc_data(ctx->file_data);

	/* the __-prefixed unregister helpers are called under uring_lock */
	mutex_lock(&ctx->uring_lock);
	if (ctx->buf_data)
		__io_sqe_buffers_unregister(ctx);
	if (ctx->file_data)
		__io_sqe_files_unregister(ctx);
	if (ctx->rings)
		__io_cqring_overflow_flush(ctx, true);
	mutex_unlock(&ctx->uring_lock);
	io_eventfd_unregister(ctx);
	io_destroy_buffers(ctx);
	if (ctx->sq_creds)
		put_cred(ctx->sq_creds);

	/* there are no registered resources left, nobody uses it */
	if (ctx->rsrc_node)
		io_rsrc_node_destroy(ctx->rsrc_node);
	if (ctx->rsrc_backup_node)
		io_rsrc_node_destroy(ctx->rsrc_backup_node);
	flush_delayed_work(&ctx->rsrc_put_work);
	flush_delayed_work(&ctx->fallback_work);

	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
	WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
		sock_release(ctx->ring_sock);
	}
#endif
	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));

	io_mem_free(ctx->rings);
	io_mem_free(ctx->sq_sqes);

	percpu_ref_exit(&ctx->refs);
	free_uid(ctx->user);
	/*
	 * NOTE(review): io_req_caches_free() may call percpu_ref_put_many()
	 * on ctx->refs, which was just exited above; presumably the free
	 * list is already empty here because io_ring_exit_work() drained
	 * it. Confirm.
	 */
	io_req_caches_free(ctx);
	if (ctx->hash_map)
		io_wq_put_hash(ctx->hash_map);
	kfree(ctx->cancel_hash);
	kfree(ctx->dummy_ubuf);
	kfree(ctx);
}
 9350
 9351static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 9352{
 9353	struct io_ring_ctx *ctx = file->private_data;
 9354	__poll_t mask = 0;
 9355
 9356	poll_wait(file, &ctx->cq_wait, wait);
 9357	/*
 9358	 * synchronizes with barrier from wq_has_sleeper call in
 9359	 * io_commit_cqring
 9360	 */
 9361	smp_rmb();
 9362	if (!io_sqring_full(ctx))
 9363		mask |= EPOLLOUT | EPOLLWRNORM;
 9364
 9365	/*
 9366	 * Don't flush cqring overflow list here, just do a simple check.
 9367	 * Otherwise there could possible be ABBA deadlock:
 9368	 *      CPU0                    CPU1
 9369	 *      ----                    ----
 9370	 * lock(&ctx->uring_lock);
 9371	 *                              lock(&ep->mtx);
 9372	 *                              lock(&ctx->uring_lock);
 9373	 * lock(&ep->mtx);
 9374	 *
 9375	 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
 9376	 * pushs them to do the flush.
 9377	 */
 9378	if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
 9379		mask |= EPOLLIN | EPOLLRDNORM;
 9380
 9381	return mask;
 9382}
 9383
 9384static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
 9385{
 9386	const struct cred *creds;
 9387
 9388	creds = xa_erase(&ctx->personalities, id);
 9389	if (creds) {
 9390		put_cred(creds);
 9391		return 0;
 9392	}
 9393
 9394	return -EINVAL;
 9395}
 9396
/*
 * Request used by io_ring_exit_work() to make a task drop its tctx node for
 * a ring that is going away.
 */
struct io_tctx_exit {
	/* task_work queued to the target task; runs io_tctx_exit_cb() */
	struct callback_head		task_work;
	/* completed by io_tctx_exit_cb(), waited on by io_ring_exit_work() */
	struct completion		completion;
	/* the ring whose tctx node should be removed */
	struct io_ring_ctx		*ctx;
};
 9402
 9403static __cold void io_tctx_exit_cb(struct callback_head *cb)
 9404{
 9405	struct io_uring_task *tctx = current->io_uring;
 9406	struct io_tctx_exit *work;
 9407
 9408	work = container_of(cb, struct io_tctx_exit, task_work);
 9409	/*
 9410	 * When @in_idle, we're in cancellation and it's racy to remove the
 9411	 * node. It'll be removed by the end of cancellation, just ignore it.
 9412	 */
 9413	if (!atomic_read(&tctx->in_idle))
 9414		io_uring_del_tctx_node((unsigned long)work->ctx);
 9415	complete(&work->completion);
 9416}
 9417
 9418static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
 9419{
 9420	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 9421
 9422	return req->ctx == data;
 9423}
 9424
/*
 * Deferred teardown of a ring, queued by io_ring_ctx_wait_and_kill().
 *
 * Phase 1 repeatedly cancels outstanding requests until ctx->ref_comp is
 * completed.  Phase 2 asks every task still on ctx->tctx_list to drop its
 * node for this ring, via task_work.  Finally the context is freed.
 */
static __cold void io_ring_exit_work(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
	/* after five minutes we warn and slow the retry rate down */
	unsigned long timeout = jiffies + HZ * 60 * 5;
	unsigned long interval = HZ / 20;
	struct io_tctx_exit exit;
	struct io_tctx_node *node;
	int ret;

	/*
	 * If we're doing polled IO and end up having requests being
	 * submitted async (out-of-line), then completions can come in while
	 * we're waiting for refs to drop. We need to reap these manually,
	 * as nobody else will be looking for them.
	 */
	do {
		io_uring_try_cancel_requests(ctx, NULL, true);
		if (ctx->sq_data) {
			struct io_sq_data *sqd = ctx->sq_data;
			struct task_struct *tsk;

			/* sqd->thread is only inspected while it is parked */
			io_sq_thread_park(sqd);
			tsk = sqd->thread;
			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
				io_wq_cancel_cb(tsk->io_uring->io_wq,
						io_cancel_ctx_cb, ctx, true);
			io_sq_thread_unpark(sqd);
		}

		io_req_caches_free(ctx);

		if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
			/* there is little hope left, don't run it too often */
			interval = HZ * 60;
		}
	} while (!wait_for_completion_timeout(&ctx->ref_comp, interval));

	init_completion(&exit.completion);
	init_task_work(&exit.task_work, io_tctx_exit_cb);
	exit.ctx = ctx;
	/*
	 * Some may use context even when all refs and requests have been put,
	 * and they are free to do so while still holding uring_lock or
	 * completion_lock, see io_req_task_submit(). Apart from other work,
	 * this lock/unlock section also waits them to finish.
	 */
	mutex_lock(&ctx->uring_lock);
	while (!list_empty(&ctx->tctx_list)) {
		WARN_ON_ONCE(time_after(jiffies, timeout));

		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
					ctx_node);
		/* don't spin on a single task if cancellation failed */
		list_rotate_left(&ctx->tctx_list);
		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
		if (WARN_ON_ONCE(ret))
			continue;

		/* the lock is dropped while waiting for io_tctx_exit_cb() */
		mutex_unlock(&ctx->uring_lock);
		wait_for_completion(&exit.completion);
		mutex_lock(&ctx->uring_lock);
	}
	mutex_unlock(&ctx->uring_lock);
	/* wait out anyone still holding completion_lock, see above */
	spin_lock(&ctx->completion_lock);
	spin_unlock(&ctx->completion_lock);

	io_ring_ctx_free(ctx);
}
 9493
 9494/* Returns true if we found and killed one or more timeouts */
 9495static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
 9496				    struct task_struct *tsk, bool cancel_all)
 9497{
 9498	struct io_kiocb *req, *tmp;
 9499	int canceled = 0;
 9500
 9501	spin_lock(&ctx->completion_lock);
 9502	spin_lock_irq(&ctx->timeout_lock);
 9503	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
 9504		if (io_match_task(req, tsk, cancel_all)) {
 9505			io_kill_timeout(req, -ECANCELED);
 9506			canceled++;
 9507		}
 9508	}
 9509	spin_unlock_irq(&ctx->timeout_lock);
 9510	if (canceled != 0)
 9511		io_commit_cqring(ctx);
 9512	spin_unlock(&ctx->completion_lock);
 9513	if (canceled != 0)
 9514		io_cqring_ev_posted(ctx);
 9515	return canceled != 0;
 9516}
 9517
 9518static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 9519{
 9520	unsigned long index;
 9521	struct creds *creds;
 9522
 9523	mutex_lock(&ctx->uring_lock);
 9524	percpu_ref_kill(&ctx->refs);
 9525	if (ctx->rings)
 9526		__io_cqring_overflow_flush(ctx, true);
 9527	xa_for_each(&ctx->personalities, index, creds)
 9528		io_unregister_personality(ctx, index);
 9529	mutex_unlock(&ctx->uring_lock);
 9530
 9531	io_kill_timeouts(ctx, NULL, true);
 9532	io_poll_remove_all(ctx, NULL, true);
 9533
 9534	/* if we failed setting up the ctx, we might not have any rings */
 9535	io_iopoll_try_reap_events(ctx);
 9536
 9537	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
 9538	/*
 9539	 * Use system_unbound_wq to avoid spawning tons of event kworkers
 9540	 * if we're exiting a ton of rings at the same time. It just adds
 9541	 * noise and overhead, there's no discernable change in runtime
 9542	 * over using system_wq.
 9543	 */
 9544	queue_work(system_unbound_wq, &ctx->exit_work);
 9545}
 9546
/*
 * ->release() handler of io_uring_fops: detach the ring context from the
 * file and start its teardown. Always returns 0.
 */
static int io_uring_release(struct inode *inode, struct file *file)
{
	struct io_ring_ctx *ctx = file->private_data;

	/* clear the back-pointer before the ctx starts going away */
	file->private_data = NULL;
	io_ring_ctx_wait_and_kill(ctx);
	return 0;
}
 9555
/*
 * Match criteria handed to io_cancel_task_cb() through the opaque @data
 * pointer of io_wq_cancel_cb(); both fields are forwarded unchanged to
 * io_match_task().
 */
struct io_task_cancel {
	/* task argument for io_match_task() */
	struct task_struct *task;
	/* cancel_all argument for io_match_task() */
	bool all;
};
 9560
 9561static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
 9562{
 9563	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 9564	struct io_task_cancel *cancel = data;
 9565	bool ret;
 9566
 9567	if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) {
 9568		struct io_ring_ctx *ctx = req->ctx;
 9569
 9570		/* protect against races with linked timeouts */
 9571		spin_lock(&ctx->completion_lock);
 9572		ret = io_match_task(req, cancel->task, cancel->all);
 9573		spin_unlock(&ctx->completion_lock);
 9574	} else {
 9575		ret = io_match_task(req, cancel->task, cancel->all);
 9576	}
 9577	return ret;
 9578}
 9579
 9580static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
 9581					 struct task_struct *task,
 9582					 bool cancel_all)
 9583{
 9584	struct io_defer_entry *de;
 9585	LIST_HEAD(list);
 9586
 9587	spin_lock(&ctx->completion_lock);
 9588	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
 9589		if (io_match_task(de->req, task, cancel_all)) {
 9590			list_cut_position(&list, &ctx->defer_list, &de->list);
 9591			break;
 9592		}
 9593	}
 9594	spin_unlock(&ctx->completion_lock);
 9595	if (list_empty(&list))
 9596		return false;
 9597
 9598	while (!list_empty(&list)) {
 9599		de = list_first_entry(&list, struct io_defer_entry, list);
 9600		list_del_init(&de->list);
 9601		io_req_complete_failed(de->req, -ECANCELED);
 9602		kfree(de);
 9603	}
 9604	return true;
 9605}
 9606
 9607static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
 9608{
 9609	struct io_tctx_node *node;
 9610	enum io_wq_cancel cret;
 9611	bool ret = false;
 9612
 9613	mutex_lock(&ctx->uring_lock);
 9614	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
 9615		struct io_uring_task *tctx = node->task->io_uring;
 9616
 9617		/*
 9618		 * io_wq will stay alive while we hold uring_lock, because it's
 9619		 * killed after ctx nodes, which requires to take the lock.
 9620		 */
 9621		if (!tctx || !tctx->io_wq)
 9622			continue;
 9623		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
 9624		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
 9625	}
 9626	mutex_unlock(&ctx->uring_lock);
 9627
 9628	return ret;
 9629}
 9630
 9631static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 9632						struct task_struct *task,
 9633						bool cancel_all)
 9634{
 9635	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
 9636	struct io_uring_task *tctx = task ? task->io_uring : NULL;
 9637
 9638	while (1) {
 9639		enum io_wq_cancel cret;
 9640		bool ret = false;
 9641
 9642		if (!task) {
 9643			ret |= io_uring_try_cancel_iowq(ctx);
 9644		} else if (tctx && tctx->io_wq) {
 9645			/*
 9646			 * Cancels requests of all rings, not only @ctx, but
 9647			 * it's fine as the task is in exit/exec.
 9648			 */
 9649			cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
 9650					       &cancel, true);
 9651			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
 9652		}
 9653
 9654		/* SQPOLL thread does its own polling */
 9655		if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
 9656		    (ctx->sq_data && ctx->sq_data->thread == current)) {
 9657			while (!wq_list_empty(&ctx->iopoll_list)) {
 9658				io_iopoll_try_reap_events(ctx);
 9659				ret = true;
 9660			}
 9661		}
 9662
 9663		ret |= io_cancel_defer_files(ctx, task, cancel_all);
 9664		ret |= io_poll_remove_all(ctx, task, cancel_all);
 9665		ret |= io_kill_timeouts(ctx, task, cancel_all);
 9666		if (task)
 9667			ret |= io_run_task_work();
 9668		if (!ret)
 9669			break;
 9670		cond_resched();
 9671	}
 9672}
 9673
 9674static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 9675{
 9676	struct io_uring_task *tctx = current->io_uring;
 9677	struct io_tctx_node *node;
 9678	int ret;
 9679
 9680	if (unlikely(!tctx)) {
 9681		ret = io_uring_alloc_task_context(current, ctx);
 9682		if (unlikely(ret))
 9683			return ret;
 9684
 9685		tctx = current->io_uring;
 9686		if (ctx->iowq_limits_set) {
 9687			unsigned int limits[2] = { ctx->iowq_limits[0],
 9688						   ctx->iowq_limits[1], };
 9689
 9690			ret = io_wq_max_workers(tctx->io_wq, limits);
 9691			if (ret)
 9692				return ret;
 9693		}
 9694	}
 9695	if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
 9696		node = kmalloc(sizeof(*node), GFP_KERNEL);
 9697		if (!node)
 9698			return -ENOMEM;
 9699		node->ctx = ctx;
 9700		node->task = current;
 9701
 9702		ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
 9703					node, GFP_KERNEL));
 9704		if (ret) {
 9705			kfree(node);
 9706			return ret;
 9707		}
 9708
 9709		mutex_lock(&ctx->uring_lock);
 9710		list_add(&node->ctx_node, &ctx->tctx_list);
 9711		mutex_unlock(&ctx->uring_lock);
 9712	}
 9713	tctx->last = ctx;
 9714	return 0;
 9715}
 9716
 9717/*
 9718 * Note that this task has used io_uring. We use it for cancelation purposes.
 9719 */
 9720static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 9721{
 9722	struct io_uring_task *tctx = current->io_uring;
 9723
 9724	if (likely(tctx && tctx->last == ctx))
 9725		return 0;
 9726	return __io_uring_add_tctx_node(ctx);
 9727}
 9728
 9729/*
 9730 * Remove this io_uring_file -> task mapping.
 9731 */
 9732static __cold void io_uring_del_tctx_node(unsigned long index)
 9733{
 9734	struct io_uring_task *tctx = current->io_uring;
 9735	struct io_tctx_node *node;
 9736
 9737	if (!tctx)
 9738		return;
 9739	node = xa_erase(&tctx->xa, index);
 9740	if (!node)
 9741		return;
 9742
 9743	WARN_ON_ONCE(current != node->task);
 9744	WARN_ON_ONCE(list_empty(&node->ctx_node));
 9745
 9746	mutex_lock(&node->ctx->uring_lock);
 9747	list_del(&node->ctx_node);
 9748	mutex_unlock(&node->ctx->uring_lock);
 9749
 9750	if (tctx->last == node->ctx)
 9751		tctx->last = NULL;
 9752	kfree(node);
 9753}
 9754
 9755static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
 9756{
 9757	struct io_wq *wq = tctx->io_wq;
 9758	struct io_tctx_node *node;
 9759	unsigned long index;
 9760
 9761	xa_for_each(&tctx->xa, index, node) {
 9762		io_uring_del_tctx_node(index);
 9763		cond_resched();
 9764	}
 9765	if (wq) {
 9766		/*
 9767		 * Must be after io_uring_del_task_file() (removes nodes under
 9768		 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
 9769		 */
 9770		io_wq_put_and_exit(wq);
 9771		tctx->io_wq = NULL;
 9772	}
 9773}
 9774
 9775static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
 9776{
 9777	if (tracked)
 9778		return atomic_read(&tctx->inflight_tracked);
 9779	return percpu_counter_sum(&tctx->inflight);
 9780}
 9781
 9782static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
 9783{
 9784	struct io_uring_task *tctx = task->io_uring;
 9785	unsigned int refs = tctx->cached_refs;
 9786
 9787	if (refs) {
 9788		tctx->cached_refs = 0;
 9789		percpu_counter_sub(&tctx->inflight, refs);
 9790		put_task_struct_many(task, refs);
 9791	}
 9792}
 9793
 9794/*
 9795 * Find any io_uring ctx that this task has registered or done IO on, and cancel
 9796 * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation.
 9797 */
 9798static __cold void io_uring_cancel_generic(bool cancel_all,
 9799					   struct io_sq_data *sqd)
 9800{
 9801	struct io_uring_task *tctx = current->io_uring;
 9802	struct io_ring_ctx *ctx;
 9803	s64 inflight;
 9804	DEFINE_WAIT(wait);
 9805
 9806	WARN_ON_ONCE(sqd && sqd->thread != current);
 9807
 9808	if (!current->io_uring)
 9809		return;
 9810	if (tctx->io_wq)
 9811		io_wq_exit_start(tctx->io_wq);
 9812
 9813	atomic_inc(&tctx->in_idle);
 9814	do {
 9815		io_uring_drop_tctx_refs(current);
 9816		/* read completions before cancelations */
 9817		inflight = tctx_inflight(tctx, !cancel_all);
 9818		if (!inflight)
 9819			break;
 9820
 9821		if (!sqd) {
 9822			struct io_tctx_node *node;
 9823			unsigned long index;
 9824
 9825			xa_for_each(&tctx->xa, index, node) {
 9826				/* sqpoll task will cancel all its requests */
 9827				if (node->ctx->sq_data)
 9828					continue;
 9829				io_uring_try_cancel_requests(node->ctx, current,
 9830							     cancel_all);
 9831			}
 9832		} else {
 9833			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 9834				io_uring_try_cancel_requests(ctx, current,
 9835							     cancel_all);
 9836		}
 9837
 9838		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
 9839		io_uring_drop_tctx_refs(current);
 9840		/*
 9841		 * If we've seen completions, retry without waiting. This
 9842		 * avoids a race where a completion comes in before we did
 9843		 * prepare_to_wait().
 9844		 */
 9845		if (inflight == tctx_inflight(tctx, !cancel_all))
 9846			schedule();
 9847		finish_wait(&tctx->wait, &wait);
 9848	} while (1);
 9849	atomic_dec(&tctx->in_idle);
 9850
 9851	io_uring_clean_tctx(tctx);
 9852	if (cancel_all) {
 9853		/* for exec all current's requests should be gone, kill tctx */
 9854		__io_uring_free(current);
 9855	}
 9856}
 9857
/*
 * Non-static entry point for cancelling current's io_uring requests.
 * Passes a NULL sqd, i.e. this is never the SQPOLL-thread variant.
 */
void __io_uring_cancel(bool cancel_all)
{
	io_uring_cancel_generic(cancel_all, NULL);
}
 9862
 9863static void *io_uring_validate_mmap_request(struct file *file,
 9864					    loff_t pgoff, size_t sz)
 9865{
 9866	struct io_ring_ctx *ctx = file->private_data;
 9867	loff_t offset = pgoff << PAGE_SHIFT;
 9868	struct page *page;
 9869	void *ptr;
 9870
 9871	switch (offset) {
 9872	case IORING_OFF_SQ_RING:
 9873	case IORING_OFF_CQ_RING:
 9874		ptr = ctx->rings;
 9875		break;
 9876	case IORING_OFF_SQES:
 9877		ptr = ctx->sq_sqes;
 9878		break;
 9879	default:
 9880		return ERR_PTR(-EINVAL);
 9881	}
 9882
 9883	page = virt_to_head_page(ptr);
 9884	if (sz > page_size(page))
 9885		return ERR_PTR(-EINVAL);
 9886
 9887	return ptr;
 9888}
 9889
 9890#ifdef CONFIG_MMU
 9891
 9892static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 9893{
 9894	size_t sz = vma->vm_end - vma->vm_start;
 9895	unsigned long pfn;
 9896	void *ptr;
 9897
 9898	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
 9899	if (IS_ERR(ptr))
 9900		return PTR_ERR(ptr);
 9901
 9902	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
 9903	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
 9904}
 9905
 9906#else /* !CONFIG_MMU */
 9907
 9908static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 9909{
 9910	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
 9911}
 9912
/*
 * ->mmap_capabilities() handler (!CONFIG_MMU): report that the file
 * supports direct, readable and writable mappings.
 */
static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}
 9917
 9918static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
 9919	unsigned long addr, unsigned long len,
 9920	unsigned long pgoff, unsigned long flags)
 9921{
 9922	void *ptr;
 9923
 9924	ptr = io_uring_validate_mmap_request(file, pgoff, len);
 9925	if (IS_ERR(ptr))
 9926		return PTR_ERR(ptr);
 9927
 9928	return (unsigned long) ptr;
 9929}
 9930
 9931#endif /* !CONFIG_MMU */
 9932
 9933static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
 9934{
 9935	DEFINE_WAIT(wait);
 9936
 9937	do {
 9938		if (!io_sqring_full(ctx))
 9939			break;
 9940		prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
 9941
 9942		if (!io_sqring_full(ctx))
 9943			break;
 9944		schedule();
 9945	} while (!signal_pending(current));
 9946
 9947	finish_wait(&ctx->sqo_sq_wait, &wait);
 9948	return 0;
 9949}
 9950
 9951static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
 9952			  struct __kernel_timespec __user **ts,
 9953			  const sigset_t __user **sig)
 9954{
 9955	struct io_uring_getevents_arg arg;
 9956
 9957	/*
 9958	 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
 9959	 * is just a pointer to the sigset_t.
 9960	 */
 9961	if (!(flags & IORING_ENTER_EXT_ARG)) {
 9962		*sig = (const sigset_t __user *) argp;
 9963		*ts = NULL;
 9964		return 0;
 9965	}
 9966
 9967	/*
 9968	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
 9969	 * timespec and sigset_t pointers if good.
 9970	 */
 9971	if (*argsz != sizeof(arg))
 9972		return -EINVAL;
 9973	if (copy_from_user(&arg, argp, sizeof(arg)))
 9974		return -EFAULT;
 9975	*sig = u64_to_user_ptr(arg.sigmask);
 9976	*argsz = arg.sigmask_sz;
 9977	*ts = u64_to_user_ptr(arg.ts);
 9978	return 0;
 9979}
 9980
 9981SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 9982		u32, min_complete, u32, flags, const void __user *, argp,
 9983		size_t, argsz)
 9984{
 9985	struct io_ring_ctx *ctx;
 9986	int submitted = 0;
 9987	struct fd f;
 9988	long ret;
 9989
 9990	io_run_task_work();
 9991
 9992	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
 9993			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
 9994		return -EINVAL;
 9995
 9996	f = fdget(fd);
 9997	if (unlikely(!f.file))
 9998		return -EBADF;
 9999
10000	ret = -EOPNOTSUPP;
10001	if (unlikely(f.file->f_op != &io_uring_fops))
10002		goto out_fput;
10003
10004	ret = -ENXIO;
10005	ctx = f.file->private_data;
10006	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
10007		goto out_fput;
10008
10009	ret = -EBADFD;
10010	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
10011		goto out;
10012
10013	/*
10014	 * For SQ polling, the thread will do all submissions and completions.
10015	 * Just return the requested submit count, and wake the thread if
10016	 * we were asked to.
10017	 */
10018	ret = 0;
10019	if (ctx->flags & IORING_SETUP_SQPOLL) {
10020		io_cqring_overflow_flush(ctx);
10021
10022		if (unlikely(ctx->sq_data->thread == NULL)) {
10023			ret = -EOWNERDEAD;
10024			goto out;
10025		}
10026		if (flags & IORING_ENTER_SQ_WAKEUP)
10027			wake_up(&ctx->sq_data->wait);
10028		if (flags & IORING_ENTER_SQ_WAIT) {
10029			ret = io_sqpoll_wait_sq(ctx);
10030			if (ret)
10031				goto out;
10032		}
10033		submitted = to_submit;
10034	} else if (to_submit) {
10035		ret = io_uring_add_tctx_node(ctx);
10036		if (unlikely(ret))
10037			goto out;
10038		mutex_lock(&ctx->uring_lock);
10039		submitted = io_submit_sqes(ctx, to_submit);
10040		mutex_unlock(&ctx->uring_lock);
10041
10042		if (submitted != to_submit)
10043			goto out;
10044	}
10045	if (flags & IORING_ENTER_GETEVENTS) {
10046		const sigset_t __user *sig;
10047		struct __kernel_timespec __user *ts;
10048
10049		ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
10050		if (unlikely(ret))
10051			goto out;
10052
10053		min_complete = min(min_complete, ctx->cq_entries);
10054
10055		/*
10056		 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
10057		 * space applications don't need to do io completion events
10058		 * polling again, they can rely on io_sq_thread to do polling
10059		 * work, which can reduce cpu usage and uring_lock contention.
10060		 */
10061		if (ctx->flags & IORING_SETUP_IOPOLL &&
10062		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
10063			ret = io_iopoll_check(ctx, min_complete);
10064		} else {
10065			ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
10066		}
10067	}
10068
10069out:
10070	percpu_ref_put(&ctx->refs);
10071out_fput:
10072	fdput(f);
10073	return submitted ? submitted : ret;
10074}
10075
10076#ifdef CONFIG_PROC_FS
10077static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
10078		const struct cred *cred)
10079{
10080	struct user_namespace *uns = seq_user_ns(m);
10081	struct group_info *gi;
10082	kernel_cap_t cap;
10083	unsigned __capi;
10084	int g;
10085
10086	seq_printf(m, "%5d\n", id);
10087	seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
10088	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
10089	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
10090	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
10091	seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
10092	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
10093	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
10094	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
10095	seq_puts(m, "\n\tGroups:\t");
10096	gi = cred->group_info;
10097	for (g = 0; g < gi->ngroups; g++) {
10098		seq_put_decimal_ull(m, g ? " " : "",
10099					from_kgid_munged(uns, gi->gid[g]));
10100	}
10101	seq_puts(m, "\n\tCapEff:\t");
10102	cap = cred->cap_effective;
10103	CAP_FOR_EACH_U32(__capi)
10104		seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
10105	seq_putc(m, '\n');
10106	return 0;
10107}
10108
10109static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
10110					  struct seq_file *m)
10111{
10112	struct io_sq_data *sq = NULL;
10113	struct io_overflow_cqe *ocqe;
10114	struct io_rings *r = ctx->rings;
10115	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
10116	unsigned int sq_head = READ_ONCE(r->sq.head);
10117	unsigned int sq_tail = READ_ONCE(r->sq.tail);
10118	unsigned int cq_head = READ_ONCE(r->cq.head);
10119	unsigned int cq_tail = READ_ONCE(r->cq.tail);
10120	unsigned int sq_entries, cq_entries;
10121	bool has_lock;
10122	unsigned int i;
10123
10124	/*
10125	 * we may get imprecise sqe and cqe info if uring is actively running
10126	 * since we get cached_sq_head and cached_cq_tail without uring_lock
10127	 * and sq_tail and cq_head are changed by userspace. But it's ok since
10128	 * we usually use these info when it is stuck.
10129	 */
10130	seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask);
10131	seq_printf(m, "SqHead:\t%u\n", sq_head);
10132	seq_printf(m, "SqTail:\t%u\n", sq_tail);
10133	seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
10134	seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
10135	seq_printf(m, "CqHead:\t%u\n", cq_head);
10136	seq_printf(m, "CqTail:\t%u\n", cq_tail);
10137	seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
10138	seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
10139	sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
10140	for (i = 0; i < sq_entries; i++) {
10141		unsigned int entry = i + sq_head;
10142		unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
10143		struct io_uring_sqe *sqe;
10144
10145		if (sq_idx > sq_mask)
10146			continue;
10147		sqe = &ctx->sq_sqes[sq_idx];
10148		seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
10149			   sq_idx, sqe->opcode, sqe->fd, sqe->flags,
10150			   sqe->user_data);
10151	}
10152	seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
10153	cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
10154	for (i = 0; i < cq_entries; i++) {
10155		unsigned int entry = i + cq_head;
10156		struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
10157
10158		seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
10159			   entry & cq_mask, cqe->user_data, cqe->res,
10160			   cqe->flags);
10161	}
10162
10163	/*
10164	 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
10165	 * since fdinfo case grabs it in the opposite direction of normal use
10166	 * cases. If we fail to get the lock, we just don't iterate any
10167	 * structures that could be going away outside the io_uring mutex.
10168	 */
10169	has_lock = mutex_trylock(&ctx->uring_lock);
10170
10171	if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
10172		sq = ctx->sq_data;
10173		if (!sq->thread)
10174			sq = NULL;
10175	}
10176
10177	seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
10178	seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
10179	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
10180	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
10181		struct file *f = io_file_from_index(ctx, i);
10182
10183		if (f)
10184			seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
10185		else
10186			seq_printf(m, "%5u: <none>\n", i);
10187	}
10188	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
10189	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
10190		struct io_mapped_ubuf *buf = ctx->user_bufs[i];
10191		unsigned int len = buf->ubuf_end - buf->ubuf;
10192
10193		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
10194	}
10195	if (has_lock && !xa_empty(&ctx->personalities)) {
10196		unsigned long index;
10197		const struct cred *cred;
10198
10199		seq_printf(m, "Personalities:\n");
10200		xa_for_each(&ctx->personalities, index, cred)
10201			io_uring_show_cred(m, index, cred);
10202	}
10203	if (has_lock)
10204		mutex_unlock(&ctx->uring_lock);
10205
10206	seq_puts(m, "PollList:\n");
10207	spin_lock(&ctx->completion_lock);
10208	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
10209		struct hlist_head *list = &ctx->cancel_hash[i];
10210		struct io_kiocb *req;
10211
10212		hlist_for_each_entry(req, list, hash_node)
10213			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
10214					req->task->task_works != NULL);
10215	}
10216
10217	seq_puts(m, "CqOverflowList:\n");
10218	list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
10219		struct io_uring_cqe *cqe = &ocqe->cqe;
10220
10221		seq_printf(m, "  user_data=%llu, res=%d, flags=%x\n",
10222			   cqe->user_data, cqe->res, cqe->flags);
10223
10224	}
10225
10226	spin_unlock(&ctx->completion_lock);
10227}
10228
10229static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
10230{
10231	struct io_ring_ctx *ctx = f->private_data;
10232
10233	if (percpu_ref_tryget(&ctx->refs)) {
10234		__io_uring_show_fdinfo(ctx, m);
10235		percpu_ref_put(&ctx->refs);
10236	}
10237}
10238#endif
10239
/*
 * File operations of the anonymous io_uring file. io_uring_enter() also
 * compares a file's f_op against this table to recognise io_uring fds.
 */
static const struct file_operations io_uring_fops = {
	.release	= io_uring_release,
	.mmap		= io_uring_mmap,
#ifndef CONFIG_MMU
	/* nommu kernels need explicit helpers to "map" the ring memory */
	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
	.poll		= io_uring_poll,
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= io_uring_show_fdinfo,
#endif
};
10252
10253static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
10254					 struct io_uring_params *p)
10255{
10256	struct io_rings *rings;
10257	size_t size, sq_array_offset;
10258
10259	/* make sure these are sane, as we already accounted them */
10260	ctx->sq_entries = p->sq_entries;
10261	ctx->cq_entries = p->cq_entries;
10262
10263	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
10264	if (size == SIZE_MAX)
10265		return -EOVERFLOW;
10266
10267	rings = io_mem_alloc(size);
10268	if (!rings)
10269		return -ENOMEM;
10270
10271	ctx->rings = rings;
10272	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
10273	rings->sq_ring_mask = p->sq_entries - 1;
10274	rings->cq_ring_mask = p->cq_entries - 1;
10275	rings->sq_ring_entries = p->sq_entries;
10276	rings->cq_ring_entries = p->cq_entries;
10277
10278	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
10279	if (size == SIZE_MAX) {
10280		io_mem_free(ctx->rings);
10281		ctx->rings = NULL;
10282		return -EOVERFLOW;
10283	}
10284
10285	ctx->sq_sqes = io_mem_alloc(size);
10286	if (!ctx->sq_sqes) {
10287		io_mem_free(ctx->rings);
10288		ctx->rings = NULL;
10289		return -ENOMEM;
10290	}
10291
10292	return 0;
10293}
10294
10295static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
10296{
10297	int ret, fd;
10298
10299	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
10300	if (fd < 0)
10301		return fd;
10302
10303	ret = io_uring_add_tctx_node(ctx);
10304	if (ret) {
10305		put_unused_fd(fd);
10306		return ret;
10307	}
10308	fd_install(fd, file);
10309	return fd;
10310}
10311
10312/*
10313 * Allocate an anonymous fd, this is what constitutes the application
10314 * visible backing of an io_uring instance. The application mmaps this
10315 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
10316 * we have to tie this fd to a socket for file garbage collection purposes.
10317 */
10318static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
10319{
10320	struct file *file;
10321#if defined(CONFIG_UNIX)
10322	int ret;
10323
10324	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
10325				&ctx->ring_sock);
10326	if (ret)
10327		return ERR_PTR(ret);
10328#endif
10329
10330	file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
10331					 O_RDWR | O_CLOEXEC, NULL);
10332#if defined(CONFIG_UNIX)
10333	if (IS_ERR(file)) {
10334		sock_release(ctx->ring_sock);
10335		ctx->ring_sock = NULL;
10336	} else {
10337		ctx->ring_sock->file = file;
10338	}
10339#endif
10340	return file;
10341}
10342
10343static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
10344				  struct io_uring_params __user *params)
10345{
10346	struct io_ring_ctx *ctx;
10347	struct file *file;
10348	int ret;
10349
10350	if (!entries)
10351		return -EINVAL;
10352	if (entries > IORING_MAX_ENTRIES) {
10353		if (!(p->flags & IORING_SETUP_CLAMP))
10354			return -EINVAL;
10355		entries = IORING_MAX_ENTRIES;
10356	}
10357
10358	/*
10359	 * Use twice as many entries for the CQ ring. It's possible for the
10360	 * application to drive a higher depth than the size of the SQ ring,
10361	 * since the sqes are only used at submission time. This allows for
10362	 * some flexibility in overcommitting a bit. If the application has
10363	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
10364	 * of CQ ring entries manually.
10365	 */
10366	p->sq_entries = roundup_pow_of_two(entries);
10367	if (p->flags & IORING_SETUP_CQSIZE) {
10368		/*
10369		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
10370		 * to a power-of-two, if it isn't already. We do NOT impose
10371		 * any cq vs sq ring sizing.
10372		 */
10373		if (!p->cq_entries)
10374			return -EINVAL;
10375		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
10376			if (!(p->flags & IORING_SETUP_CLAMP))
10377				return -EINVAL;
10378			p->cq_entries = IORING_MAX_CQ_ENTRIES;
10379		}
10380		p->cq_entries = roundup_pow_of_two(p->cq_entries);
10381		if (p->cq_entries < p->sq_entries)
10382			return -EINVAL;
10383	} else {
10384		p->cq_entries = 2 * p->sq_entries;
10385	}
10386
10387	ctx = io_ring_ctx_alloc(p);
10388	if (!ctx)
10389		return -ENOMEM;
10390	ctx->compat = in_compat_syscall();
10391	if (!capable(CAP_IPC_LOCK))
10392		ctx->user = get_uid(current_user());
10393
10394	/*
10395	 * This is just grabbed for accounting purposes. When a process exits,
10396	 * the mm is exited and dropped before the files, hence we need to hang
10397	 * on to this mm purely for the purposes of being able to unaccount
10398	 * memory (locked/pinned vm). It's not used for anything else.
10399	 */
10400	mmgrab(current->mm);
10401	ctx->mm_account = current->mm;
10402
10403	ret = io_allocate_scq_urings(ctx, p);
10404	if (ret)
10405		goto err;
10406
10407	ret = io_sq_offload_create(ctx, p);
10408	if (ret)
10409		goto err;
10410	/* always set a rsrc node */
10411	ret = io_rsrc_node_switch_start(ctx);
10412	if (ret)
10413		goto err;
10414	io_rsrc_node_switch(ctx, NULL);
10415
10416	memset(&p->sq_off, 0, sizeof(p->sq_off));
10417	p->sq_off.head = offsetof(struct io_rings, sq.head);
10418	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
10419	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
10420	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
10421	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
10422	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
10423	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
10424
10425	memset(&p->cq_off, 0, sizeof(p->cq_off));
10426	p->cq_off.head = offsetof(struct io_rings, cq.head);
10427	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
10428	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
10429	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
10430	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
10431	p->cq_off.cqes = offsetof(struct io_rings, cqes);
10432	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
10433
10434	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
10435			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
10436			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
10437			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
10438			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
10439			IORING_FEAT_RSRC_TAGS;
10440
10441	if (copy_to_user(params, p, sizeof(*p))) {
10442		ret = -EFAULT;
10443		goto err;
10444	}
10445
10446	file = io_uring_get_file(ctx);
10447	if (IS_ERR(file)) {
10448		ret = PTR_ERR(file);
10449		goto err;
10450	}
10451
10452	/*
10453	 * Install ring fd as the very last thing, so we don't risk someone
10454	 * having closed it before we finish setup
10455	 */
10456	ret = io_uring_install_fd(ctx, file);
10457	if (ret < 0) {
10458		/* fput will clean it up */
10459		fput(file);
10460		return ret;
10461	}
10462
10463	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
10464	return ret;
10465err:
10466	io_ring_ctx_wait_and_kill(ctx);
10467	return ret;
10468}
10469
10470/*
10471 * Sets up an aio uring context, and returns the fd. Applications asks for a
10472 * ring size, we return the actual sq/cq ring sizes (among other things) in the
10473 * params structure passed in.
10474 */
10475static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
10476{
10477	struct io_uring_params p;
10478	int i;
10479
10480	if (copy_from_user(&p, params, sizeof(p)))
10481		return -EFAULT;
10482	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
10483		if (p.resv[i])
10484			return -EINVAL;
10485	}
10486
10487	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
10488			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
10489			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
10490			IORING_SETUP_R_DISABLED))
10491		return -EINVAL;
10492
10493	return  io_uring_create(entries, &p, params);
10494}
10495
/* io_uring_setup(2) syscall entry point: thin wrapper around io_uring_setup(). */
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}
10501
10502static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
10503			   unsigned nr_args)
10504{
10505	struct io_uring_probe *p;
10506	size_t size;
10507	int i, ret;
10508
10509	size = struct_size(p, ops, nr_args);
10510	if (size == SIZE_MAX)
10511		return -EOVERFLOW;
10512	p = kzalloc(size, GFP_KERNEL);
10513	if (!p)
10514		return -ENOMEM;
10515
10516	ret = -EFAULT;
10517	if (copy_from_user(p, arg, size))
10518		goto out;
10519	ret = -EINVAL;
10520	if (memchr_inv(p, 0, size))
10521		goto out;
10522
10523	p->last_op = IORING_OP_LAST - 1;
10524	if (nr_args > IORING_OP_LAST)
10525		nr_args = IORING_OP_LAST;
10526
10527	for (i = 0; i < nr_args; i++) {
10528		p->ops[i].op = i;
10529		if (!io_op_defs[i].not_supported)
10530			p->ops[i].flags = IO_URING_OP_SUPPORTED;
10531	}
10532	p->ops_len = i;
10533
10534	ret = 0;
10535	if (copy_to_user(arg, p, size))
10536		ret = -EFAULT;
10537out:
10538	kfree(p);
10539	return ret;
10540}
10541
10542static int io_register_personality(struct io_ring_ctx *ctx)
10543{
10544	const struct cred *creds;
10545	u32 id;
10546	int ret;
10547
10548	creds = get_current_cred();
10549
10550	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
10551			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
10552	if (ret < 0) {
10553		put_cred(creds);
10554		return ret;
10555	}
10556	return id;
10557}
10558
10559static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
10560					   void __user *arg, unsigned int nr_args)
10561{
10562	struct io_uring_restriction *res;
10563	size_t size;
10564	int i, ret;
10565
10566	/* Restrictions allowed only if rings started disabled */
10567	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10568		return -EBADFD;
10569
10570	/* We allow only a single restrictions registration */
10571	if (ctx->restrictions.registered)
10572		return -EBUSY;
10573
10574	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
10575		return -EINVAL;
10576
10577	size = array_size(nr_args, sizeof(*res));
10578	if (size == SIZE_MAX)
10579		return -EOVERFLOW;
10580
10581	res = memdup_user(arg, size);
10582	if (IS_ERR(res))
10583		return PTR_ERR(res);
10584
10585	ret = 0;
10586
10587	for (i = 0; i < nr_args; i++) {
10588		switch (res[i].opcode) {
10589		case IORING_RESTRICTION_REGISTER_OP:
10590			if (res[i].register_op >= IORING_REGISTER_LAST) {
10591				ret = -EINVAL;
10592				goto out;
10593			}
10594
10595			__set_bit(res[i].register_op,
10596				  ctx->restrictions.register_op);
10597			break;
10598		case IORING_RESTRICTION_SQE_OP:
10599			if (res[i].sqe_op >= IORING_OP_LAST) {
10600				ret = -EINVAL;
10601				goto out;
10602			}
10603
10604			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
10605			break;
10606		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
10607			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
10608			break;
10609		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
10610			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
10611			break;
10612		default:
10613			ret = -EINVAL;
10614			goto out;
10615		}
10616	}
10617
10618out:
10619	/* Reset all restrictions if an error happened */
10620	if (ret != 0)
10621		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
10622	else
10623		ctx->restrictions.registered = true;
10624
10625	kfree(res);
10626	return ret;
10627}
10628
10629static int io_register_enable_rings(struct io_ring_ctx *ctx)
10630{
10631	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10632		return -EBADFD;
10633
10634	if (ctx->restrictions.registered)
10635		ctx->restricted = 1;
10636
10637	ctx->flags &= ~IORING_SETUP_R_DISABLED;
10638	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
10639		wake_up(&ctx->sq_data->wait);
10640	return 0;
10641}
10642
10643static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
10644				     struct io_uring_rsrc_update2 *up,
10645				     unsigned nr_args)
10646{
10647	__u32 tmp;
10648	int err;
10649
10650	if (up->resv)
10651		return -EINVAL;
10652	if (check_add_overflow(up->offset, nr_args, &tmp))
10653		return -EOVERFLOW;
10654	err = io_rsrc_node_switch_start(ctx);
10655	if (err)
10656		return err;
10657
10658	switch (type) {
10659	case IORING_RSRC_FILE:
10660		return __io_sqe_files_update(ctx, up, nr_args);
10661	case IORING_RSRC_BUFFER:
10662		return __io_sqe_buffers_update(ctx, up, nr_args);
10663	}
10664	return -EINVAL;
10665}
10666
10667static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
10668				    unsigned nr_args)
10669{
10670	struct io_uring_rsrc_update2 up;
10671
10672	if (!nr_args)
10673		return -EINVAL;
10674	memset(&up, 0, sizeof(up));
10675	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
10676		return -EFAULT;
10677	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
10678}
10679
10680static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
10681				   unsigned size, unsigned type)
10682{
10683	struct io_uring_rsrc_update2 up;
10684
10685	if (size != sizeof(up))
10686		return -EINVAL;
10687	if (copy_from_user(&up, arg, sizeof(up)))
10688		return -EFAULT;
10689	if (!up.nr || up.resv)
10690		return -EINVAL;
10691	return __io_register_rsrc_update(ctx, type, &up, up.nr);
10692}
10693
10694static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
10695			    unsigned int size, unsigned int type)
10696{
10697	struct io_uring_rsrc_register rr;
10698
10699	/* keep it extendible */
10700	if (size != sizeof(rr))
10701		return -EINVAL;
10702
10703	memset(&rr, 0, sizeof(rr));
10704	if (copy_from_user(&rr, arg, size))
10705		return -EFAULT;
10706	if (!rr.nr || rr.resv || rr.resv2)
10707		return -EINVAL;
10708
10709	switch (type) {
10710	case IORING_RSRC_FILE:
10711		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
10712					     rr.nr, u64_to_user_ptr(rr.tags));
10713	case IORING_RSRC_BUFFER:
10714		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
10715					       rr.nr, u64_to_user_ptr(rr.tags));
10716	}
10717	return -EINVAL;
10718}
10719
10720static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
10721				       void __user *arg, unsigned len)
10722{
10723	struct io_uring_task *tctx = current->io_uring;
10724	cpumask_var_t new_mask;
10725	int ret;
10726
10727	if (!tctx || !tctx->io_wq)
10728		return -EINVAL;
10729
10730	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
10731		return -ENOMEM;
10732
10733	cpumask_clear(new_mask);
10734	if (len > cpumask_size())
10735		len = cpumask_size();
10736
10737	if (copy_from_user(new_mask, arg, len)) {
10738		free_cpumask_var(new_mask);
10739		return -EFAULT;
10740	}
10741
10742	ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
10743	free_cpumask_var(new_mask);
10744	return ret;
10745}
10746
10747static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
10748{
10749	struct io_uring_task *tctx = current->io_uring;
10750
10751	if (!tctx || !tctx->io_wq)
10752		return -EINVAL;
10753
10754	return io_wq_cpu_affinity(tctx->io_wq, NULL);
10755}
10756
10757static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
10758					       void __user *arg)
10759	__must_hold(&ctx->uring_lock)
10760{
10761	struct io_tctx_node *node;
10762	struct io_uring_task *tctx = NULL;
10763	struct io_sq_data *sqd = NULL;
10764	__u32 new_count[2];
10765	int i, ret;
10766
10767	if (copy_from_user(new_count, arg, sizeof(new_count)))
10768		return -EFAULT;
10769	for (i = 0; i < ARRAY_SIZE(new_count); i++)
10770		if (new_count[i] > INT_MAX)
10771			return -EINVAL;
10772
10773	if (ctx->flags & IORING_SETUP_SQPOLL) {
10774		sqd = ctx->sq_data;
10775		if (sqd) {
10776			/*
10777			 * Observe the correct sqd->lock -> ctx->uring_lock
10778			 * ordering. Fine to drop uring_lock here, we hold
10779			 * a ref to the ctx.
10780			 */
10781			refcount_inc(&sqd->refs);
10782			mutex_unlock(&ctx->uring_lock);
10783			mutex_lock(&sqd->lock);
10784			mutex_lock(&ctx->uring_lock);
10785			if (sqd->thread)
10786				tctx = sqd->thread->io_uring;
10787		}
10788	} else {
10789		tctx = current->io_uring;
10790	}
10791
10792	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
10793
10794	for (i = 0; i < ARRAY_SIZE(new_count); i++)
10795		if (new_count[i])
10796			ctx->iowq_limits[i] = new_count[i];
10797	ctx->iowq_limits_set = true;
10798
10799	if (tctx && tctx->io_wq) {
10800		ret = io_wq_max_workers(tctx->io_wq, new_count);
10801		if (ret)
10802			goto err;
10803	} else {
10804		memset(new_count, 0, sizeof(new_count));
10805	}
10806
10807	if (sqd) {
10808		mutex_unlock(&sqd->lock);
10809		io_put_sq_data(sqd);
10810	}
10811
10812	if (copy_to_user(arg, new_count, sizeof(new_count)))
10813		return -EFAULT;
10814
10815	/* that's it for SQPOLL, only the SQPOLL task creates requests */
10816	if (sqd)
10817		return 0;
10818
10819	/* now propagate the restriction to all registered users */
10820	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
10821		struct io_uring_task *tctx = node->task->io_uring;
10822
10823		if (WARN_ON_ONCE(!tctx->io_wq))
10824			continue;
10825
10826		for (i = 0; i < ARRAY_SIZE(new_count); i++)
10827			new_count[i] = ctx->iowq_limits[i];
10828		/* ignore errors, it always returns zero anyway */
10829		(void)io_wq_max_workers(tctx->io_wq, new_count);
10830	}
10831	return 0;
10832err:
10833	if (sqd) {
10834		mutex_unlock(&sqd->lock);
10835		io_put_sq_data(sqd);
10836	}
10837	return ret;
10838}
10839
10840static bool io_register_op_must_quiesce(int op)
10841{
10842	switch (op) {
10843	case IORING_REGISTER_BUFFERS:
10844	case IORING_UNREGISTER_BUFFERS:
10845	case IORING_REGISTER_FILES:
10846	case IORING_UNREGISTER_FILES:
10847	case IORING_REGISTER_FILES_UPDATE:
10848	case IORING_REGISTER_PROBE:
10849	case IORING_REGISTER_PERSONALITY:
10850	case IORING_UNREGISTER_PERSONALITY:
10851	case IORING_REGISTER_FILES2:
10852	case IORING_REGISTER_FILES_UPDATE2:
10853	case IORING_REGISTER_BUFFERS2:
10854	case IORING_REGISTER_BUFFERS_UPDATE:
10855	case IORING_REGISTER_IOWQ_AFF:
10856	case IORING_UNREGISTER_IOWQ_AFF:
10857	case IORING_REGISTER_IOWQ_MAX_WORKERS:
10858		return false;
10859	default:
10860		return true;
10861	}
10862}
10863
10864static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
10865{
10866	long ret;
10867
10868	percpu_ref_kill(&ctx->refs);
10869
10870	/*
10871	 * Drop uring mutex before waiting for references to exit. If another
10872	 * thread is currently inside io_uring_enter() it might need to grab the
10873	 * uring_lock to make progress. If we hold it here across the drain
10874	 * wait, then we can deadlock. It's safe to drop the mutex here, since
10875	 * no new references will come in after we've killed the percpu ref.
10876	 */
10877	mutex_unlock(&ctx->uring_lock);
10878	do {
10879		ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
10880		if (ret) {
10881			ret = min(0L, ret);
10882			break;
10883		}
10884
10885		ret = io_run_task_work_sig();
10886		io_req_caches_free(ctx);
10887	} while (ret >= 0);
10888	mutex_lock(&ctx->uring_lock);
10889
10890	if (ret)
10891		io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
10892	return ret;
10893}
10894
10895static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
10896			       void __user *arg, unsigned nr_args)
10897	__releases(ctx->uring_lock)
10898	__acquires(ctx->uring_lock)
10899{
10900	int ret;
10901
10902	/*
10903	 * We're inside the ring mutex, if the ref is already dying, then
10904	 * someone else killed the ctx or is already going through
10905	 * io_uring_register().
10906	 */
10907	if (percpu_ref_is_dying(&ctx->refs))
10908		return -ENXIO;
10909
10910	if (ctx->restricted) {
10911		if (opcode >= IORING_REGISTER_LAST)
10912			return -EINVAL;
10913		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
10914		if (!test_bit(opcode, ctx->restrictions.register_op))
10915			return -EACCES;
10916	}
10917
10918	if (io_register_op_must_quiesce(opcode)) {
10919		ret = io_ctx_quiesce(ctx);
10920		if (ret)
10921			return ret;
10922	}
10923
10924	switch (opcode) {
10925	case IORING_REGISTER_BUFFERS:
10926		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
10927		break;
10928	case IORING_UNREGISTER_BUFFERS:
10929		ret = -EINVAL;
10930		if (arg || nr_args)
10931			break;
10932		ret = io_sqe_buffers_unregister(ctx);
10933		break;
10934	case IORING_REGISTER_FILES:
10935		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
10936		break;
10937	case IORING_UNREGISTER_FILES:
10938		ret = -EINVAL;
10939		if (arg || nr_args)
10940			break;
10941		ret = io_sqe_files_unregister(ctx);
10942		break;
10943	case IORING_REGISTER_FILES_UPDATE:
10944		ret = io_register_files_update(ctx, arg, nr_args);
10945		break;
10946	case IORING_REGISTER_EVENTFD:
10947	case IORING_REGISTER_EVENTFD_ASYNC:
10948		ret = -EINVAL;
10949		if (nr_args != 1)
10950			break;
10951		ret = io_eventfd_register(ctx, arg);
10952		if (ret)
10953			break;
10954		if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
10955			ctx->eventfd_async = 1;
10956		else
10957			ctx->eventfd_async = 0;
10958		break;
10959	case IORING_UNREGISTER_EVENTFD:
10960		ret = -EINVAL;
10961		if (arg || nr_args)
10962			break;
10963		ret = io_eventfd_unregister(ctx);
10964		break;
10965	case IORING_REGISTER_PROBE:
10966		ret = -EINVAL;
10967		if (!arg || nr_args > 256)
10968			break;
10969		ret = io_probe(ctx, arg, nr_args);
10970		break;
10971	case IORING_REGISTER_PERSONALITY:
10972		ret = -EINVAL;
10973		if (arg || nr_args)
10974			break;
10975		ret = io_register_personality(ctx);
10976		break;
10977	case IORING_UNREGISTER_PERSONALITY:
10978		ret = -EINVAL;
10979		if (arg)
10980			break;
10981		ret = io_unregister_personality(ctx, nr_args);
10982		break;
10983	case IORING_REGISTER_ENABLE_RINGS:
10984		ret = -EINVAL;
10985		if (arg || nr_args)
10986			break;
10987		ret = io_register_enable_rings(ctx);
10988		break;
10989	case IORING_REGISTER_RESTRICTIONS:
10990		ret = io_register_restrictions(ctx, arg, nr_args);
10991		break;
10992	case IORING_REGISTER_FILES2:
10993		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
10994		break;
10995	case IORING_REGISTER_FILES_UPDATE2:
10996		ret = io_register_rsrc_update(ctx, arg, nr_args,
10997					      IORING_RSRC_FILE);
10998		break;
10999	case IORING_REGISTER_BUFFERS2:
11000		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
11001		break;
11002	case IORING_REGISTER_BUFFERS_UPDATE:
11003		ret = io_register_rsrc_update(ctx, arg, nr_args,
11004					      IORING_RSRC_BUFFER);
11005		break;
11006	case IORING_REGISTER_IOWQ_AFF:
11007		ret = -EINVAL;
11008		if (!arg || !nr_args)
11009			break;
11010		ret = io_register_iowq_aff(ctx, arg, nr_args);
11011		break;
11012	case IORING_UNREGISTER_IOWQ_AFF:
11013		ret = -EINVAL;
11014		if (arg || nr_args)
11015			break;
11016		ret = io_unregister_iowq_aff(ctx);
11017		break;
11018	case IORING_REGISTER_IOWQ_MAX_WORKERS:
11019		ret = -EINVAL;
11020		if (!arg || nr_args != 2)
11021			break;
11022		ret = io_register_iowq_max_workers(ctx, arg);
11023		break;
11024	default:
11025		ret = -EINVAL;
11026		break;
11027	}
11028
11029	if (io_register_op_must_quiesce(opcode)) {
11030		/* bring the ctx back to life */
11031		percpu_ref_reinit(&ctx->refs);
11032		reinit_completion(&ctx->ref_comp);
11033	}
11034	return ret;
11035}
11036
11037SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
11038		void __user *, arg, unsigned int, nr_args)
11039{
11040	struct io_ring_ctx *ctx;
11041	long ret = -EBADF;
11042	struct fd f;
11043
11044	f = fdget(fd);
11045	if (!f.file)
11046		return -EBADF;
11047
11048	ret = -EOPNOTSUPP;
11049	if (f.file->f_op != &io_uring_fops)
11050		goto out_fput;
11051
11052	ctx = f.file->private_data;
11053
11054	io_run_task_work();
11055
11056	mutex_lock(&ctx->uring_lock);
11057	ret = __io_uring_register(ctx, opcode, arg, nr_args);
11058	mutex_unlock(&ctx->uring_lock);
11059	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
11060							ctx->cq_ev_fd != NULL, ret);
11061out_fput:
11062	fdput(f);
11063	return ret;
11064}
11065
11066static int __init io_uring_init(void)
11067{
11068#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
11069	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
11070	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
11071} while (0)
11072
11073#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
11074	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
11075	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
11076	BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
11077	BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
11078	BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
11079	BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
11080	BUILD_BUG_SQE_ELEM(8,  __u64,  off);
11081	BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
11082	BUILD_BUG_SQE_ELEM(16, __u64,  addr);
11083	BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
11084	BUILD_BUG_SQE_ELEM(24, __u32,  len);
11085	BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
11086	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
11087	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
11088	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
11089	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
11090	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
11091	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
11092	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
11093	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
11094	BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
11095	BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
11096	BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
11097	BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
11098	BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
11099	BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
11100	BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
11101	BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
11102	BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
11103	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
11104	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
11105	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
11106
11107	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
11108		     sizeof(struct io_uring_rsrc_update));
11109	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
11110		     sizeof(struct io_uring_rsrc_update2));
11111
11112	/* ->buf_index is u16 */
11113	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
11114
11115	/* should fit into one byte */
11116	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
11117	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
11118	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
11119
11120	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
11121	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
11122
11123	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
11124				SLAB_ACCOUNT);
11125	return 0;
11126};
11127__initcall(io_uring_init);
Configure Feed

Configure Feed