fs/io_uring.c at v5.13 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / io_uring.c
at v5.13 10207 lines 256 kB view raw
wrap content
    1// SPDX-License-Identifier: GPL-2.0
    2/*
    3 * Shared application/kernel submission and completion ring pairs, for
    4 * supporting fast/efficient IO.
    5 *
    6 * A note on the read/write ordering memory barriers that are matched between
    7 * the application and kernel side.
    8 *
    9 * After the application reads the CQ ring tail, it must use an
   10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
   11 * before writing the tail (using smp_load_acquire to read the tail will
   12 * do). It also needs a smp_mb() before updating CQ head (ordering the
   13 * entry load(s) with the head store), pairing with an implicit barrier
   14 * through a control-dependency in io_get_cqring (smp_store_release to
   15 * store head will do). Failure to do so could lead to reading invalid
   16 * CQ entries.
   17 *
   18 * Likewise, the application must use an appropriate smp_wmb() before
   19 * writing the SQ tail (ordering SQ entry stores with the tail store),
   20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
   21 * to store the tail will do). And it needs a barrier ordering the SQ
   22 * head load before writing new SQ entries (smp_load_acquire to read
   23 * head will do).
   24 *
   25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
   26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
   27 * updating the SQ tail; a full memory barrier smp_mb() is needed
   28 * between.
   29 *
   30 * Also see the examples in the liburing library:
   31 *
   32 *	git://git.kernel.dk/liburing
   33 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store to or load from
 * data shared between the kernel and application. This is done both
 * for ordering purposes and to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
   38 *
   39 * Copyright (C) 2018-2019 Jens Axboe
   40 * Copyright (c) 2018-2019 Christoph Hellwig
   41 */
   42#include <linux/kernel.h>
   43#include <linux/init.h>
   44#include <linux/errno.h>
   45#include <linux/syscalls.h>
   46#include <linux/compat.h>
   47#include <net/compat.h>
   48#include <linux/refcount.h>
   49#include <linux/uio.h>
   50#include <linux/bits.h>
   51
   52#include <linux/sched/signal.h>
   53#include <linux/fs.h>
   54#include <linux/file.h>
   55#include <linux/fdtable.h>
   56#include <linux/mm.h>
   57#include <linux/mman.h>
   58#include <linux/percpu.h>
   59#include <linux/slab.h>
   60#include <linux/blkdev.h>
   61#include <linux/bvec.h>
   62#include <linux/net.h>
   63#include <net/sock.h>
   64#include <net/af_unix.h>
   65#include <net/scm.h>
   66#include <linux/anon_inodes.h>
   67#include <linux/sched/mm.h>
   68#include <linux/uaccess.h>
   69#include <linux/nospec.h>
   70#include <linux/sizes.h>
   71#include <linux/hugetlb.h>
   72#include <linux/highmem.h>
   73#include <linux/namei.h>
   74#include <linux/fsnotify.h>
   75#include <linux/fadvise.h>
   76#include <linux/eventpoll.h>
   77#include <linux/splice.h>
   78#include <linux/task_work.h>
   79#include <linux/pagemap.h>
   80#include <linux/io_uring.h>
   81
   82#define CREATE_TRACE_POINTS
   83#include <trace/events/io_uring.h>
   84
   85#include <uapi/linux/io_uring.h>
   86
   87#include "internal.h"
   88#include "io-wq.h"
   89
/*
 * Entry-count limits. The CQ limit is defined as twice the base limit
 * (2 * 32768 = 65536).
 */
#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

/*
 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
 */
#define IORING_FILE_TABLE_SHIFT	9
/* entries per second-level table: 1 << 9 = 512 */
#define IORING_MAX_FILES_TABLE	(1U << IORING_FILE_TABLE_SHIFT)
/* low-bits mask (511) for indexing within one second-level table */
#define IORING_FILE_TABLE_MASK	(IORING_MAX_FILES_TABLE - 1)
/* 64 second-level tables of 512 entries each = 32768 fixed files */
#define IORING_MAX_FIXED_FILES	(64 * IORING_MAX_FILES_TABLE)
/* sum of the three *_LAST enumerator values */
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

/* 1 << 14 = 16384 */
#define IORING_MAX_REG_BUFFERS	(1U << 14)

/* mask of every IOSQE_* flag named below */
#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
				IOSQE_BUFFER_SELECT)
  108
/*
 * A head/tail index pair for one ring. Each index carries the
 * ____cacheline_aligned_in_smp attribute, so the two are placed on
 * separate cachelines on SMP builds.
 */
struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};
  113
/*
 * Ring state shared between the kernel and the application.
 *
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring		sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32			sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32			sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	u32			sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32                     cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32			cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 *
	 * Flexible array member, cacheline aligned on SMP builds.
	 */
	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
};
  191
/*
 * Flag bits with distinct power-of-two values (1 and 2), so they can be
 * OR-ed together.
 * NOTE(review): the call sites are outside this view; the names suggest
 * "non-blocking issue" and "defer completion" - confirm against users.
 */
enum io_uring_cmd_flags {
	IO_URING_F_NONBLOCK		= 1,
	IO_URING_F_COMPLETE_DEFER	= 2,
};
  196
/*
 * Descriptor for one fixed mapped user buffer (io_ring_ctx keeps an array
 * of pointers to these in ->user_bufs).
 */
struct io_mapped_ubuf {
	/* presumably the user address range [ubuf, ubuf_end) - TODO confirm */
	u64		ubuf;
	u64		ubuf_end;
	/* number of entries in bvec[] */
	unsigned int	nr_bvecs;
	unsigned long	acct_pages;
	/* flexible array of nr_bvecs bio_vec entries */
	struct bio_vec	bvec[];
};
  204
  205struct io_ring_ctx;
  206
/*
 * A completion entry together with a list link.
 * NOTE(review): presumably queued on io_ring_ctx->cq_overflow_list when
 * the CQ ring is full - confirm against the overflow path.
 */
struct io_overflow_cqe {
	struct io_uring_cqe cqe;
	struct list_head list;
};
  211
/*
 * One slot of the fixed file table: a file pointer stored as an integer so
 * that FFS_* flag bits can be kept alongside it.
 */
struct io_fixed_file {
	/* file * with additional FFS_* flags */
	unsigned long file_ptr;
};
  216
/*
 * A resource handed to a rsrc_put_fn callback. The union gives the same
 * pointer a generic view (rsrc) and typed views (file, buf).
 */
struct io_rsrc_put {
	/* list link; presumably chained on io_rsrc_node->rsrc_list - confirm */
	struct list_head list;
	u64 tag;
	union {
		void *rsrc;
		struct file *file;
		struct io_mapped_ubuf *buf;
	};
};
  226
/*
 * Fixed file table, organised as an array of pointers to arrays of
 * io_fixed_file slots.
 */
struct io_file_table {
	/* two level table */
	struct io_fixed_file **files;
};
  231
/*
 * A reference-counted resource node. Requests take a reference on ->refs
 * through io_req_set_rsrc_node(), which reads ctx->rsrc_node.
 */
struct io_rsrc_node {
	struct percpu_ref		refs;
	struct list_head		node;
	struct list_head		rsrc_list;
	/* the io_rsrc_data this node is associated with */
	struct io_rsrc_data		*rsrc_data;
	struct llist_node		llist;
	bool				done;
};
  240
/* Callback type that releases one resource described by @prsrc. */
typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);

/*
 * Bookkeeping for one class of registered resources. io_ring_ctx holds one
 * instance for fixed files (->file_data) and one for buffers (->buf_data).
 */
struct io_rsrc_data {
	/* owning ring context */
	struct io_ring_ctx		*ctx;

	u64				*tags;
	/* release callback for resources of this class */
	rsrc_put_fn			*do_put;
	atomic_t			refs;
	struct completion		done;
	bool				quiesce;
};
  252
/*
 * One buffer record: an address, a length and a 16-bit buffer id, linked
 * on a list.
 * NOTE(review): presumably the provided-buffer records reached through
 * io_ring_ctx->io_buffers and io_sr_msg->kbuf - confirm.
 */
struct io_buffer {
	struct list_head list;
	__u64 addr;
	__u32 len;
	__u16 bid;
};
  259
/*
 * Restriction state for a ring (embedded in io_ring_ctx as
 * ->restrictions): one bitmap sized by IORING_REGISTER_LAST, one sized by
 * IORING_OP_LAST, and two SQE flag masks.
 */
struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
};
  267
/*
 * Enumerators 0 and 1.
 * NOTE(review): presumably bit numbers within io_sq_data->state - confirm
 * against the set_bit()/test_bit() users.
 */
enum {
	IO_SQ_THREAD_SHOULD_STOP = 0,
	IO_SQ_THREAD_SHOULD_PARK,
};
  272
/*
 * State for an SQ polling thread. io_ring_ctx points at it through
 * ->sq_data ("if using sq thread polling"), and several contexts can be
 * attached to one instance via ctx_list.
 */
struct io_sq_data {
	refcount_t		refs;
	atomic_t		park_pending;
	struct mutex		lock;

	/* ctx's that are using this sqd */
	struct list_head	ctx_list;

	/* the polling thread's task */
	struct task_struct	*thread;
	struct wait_queue_head	wait;

	unsigned		sq_thread_idle;
	int			sq_cpu;
	pid_t			task_pid;
	pid_t			task_tgid;

	/* presumably holds the IO_SQ_THREAD_* bits - TODO confirm */
	unsigned long		state;
	struct completion	exited;
	struct callback_head	*park_task_work;
};
  293
/*
 * Batch and cache sizes. IO_COMPL_BATCH sizes io_comp_state->reqs[] and
 * IO_REQ_CACHE_SIZE sizes io_submit_state->reqs[]; the users of the other
 * two are outside this view.
 */
#define IO_IOPOLL_BATCH			8
#define IO_COMPL_BATCH			32
#define IO_REQ_CACHE_SIZE		32
#define IO_REQ_ALLOC_BATCH		8
  298
/*
 * Batched completion state (embedded in io_submit_state as ->comp): an
 * array of up to IO_COMPL_BATCH requests plus two free lists, each
 * guarded by a different lock as noted below.
 */
struct io_comp_state {
	struct io_kiocb		*reqs[IO_COMPL_BATCH];
	/* presumably the number of valid entries in reqs[] - TODO confirm */
	unsigned int		nr;
	unsigned int		locked_free_nr;
	/* inline/task_work completion list, under ->uring_lock */
	struct list_head	free_list;
	/* IRQ completion list, under ->completion_lock */
	struct list_head	locked_free_list;
};
  308
/*
 * Head and last request of a chain of linked requests (embedded in
 * io_submit_state as ->link).
 */
struct io_submit_link {
	struct io_kiocb		*head;
	struct io_kiocb		*last;
};
  313
/*
 * Submission-side state, embedded in io_ring_ctx as ->submit_state. It
 * groups a block plug, the current link chain, a request allocation
 * cache, batched completion state and a file reference cache.
 */
struct io_submit_state {
	struct blk_plug		plug;
	struct io_submit_link	link;

	/*
	 * io_kiocb alloc cache
	 */
	void			*reqs[IO_REQ_CACHE_SIZE];
	unsigned int		free_reqs;

	/* presumably set once ->plug has been started - TODO confirm */
	bool			plug_started;

	/*
	 * Batch completion logic
	 */
	struct io_comp_state	comp;

	/*
	 * File reference cache
	 */
	struct file		*file;
	unsigned int		fd;
	unsigned int		file_refs;
	unsigned int		ios_left;
};
  339
/*
 * Per-ring context. Fields are grouped into anonymous structs marked
 * ____cacheline_aligned_in_smp so that each group starts on its own
 * cacheline on SMP builds; do not reorder members without considering
 * that layout.
 */
struct io_ring_ctx {
	struct {
		/* ring lifetime reference; io_ring_ctx_ref_free() is its release */
		struct percpu_ref	refs;
	} ____cacheline_aligned_in_smp;

	struct {
		unsigned int		flags;
		unsigned int		compat: 1;
		unsigned int		drain_next: 1;
		unsigned int		eventfd_async: 1;
		unsigned int		restricted: 1;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32			*sq_array;
		unsigned		cached_sq_head;
		unsigned		sq_entries;
		unsigned		sq_mask;
		unsigned		sq_thread_idle;
		unsigned		cached_sq_dropped;
		unsigned		cached_cq_overflow;
		unsigned long		sq_check_overflow;

		/* hashed buffered write serialization */
		struct io_wq_hash	*hash_map;

		struct list_head	defer_list;
		struct list_head	timeout_list;
		struct list_head	cq_overflow_list;

		struct io_uring_sqe	*sq_sqes;
	} ____cacheline_aligned_in_smp;

	struct {
		struct mutex		uring_lock;
		wait_queue_head_t	wait;
	} ____cacheline_aligned_in_smp;

	struct io_submit_state		submit_state;

	/* the rings shared with the application, see struct io_rings */
	struct io_rings	*rings;

	/* Only used for accounting purposes */
	struct mm_struct	*mm_account;

	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
	struct io_sq_data	*sq_data;	/* if using sq thread polling */

	struct wait_queue_head	sqo_sq_wait;
	struct list_head	sqd_list;

	/*
	 * If used, fixed file set. Writers must ensure that ->refs is dead,
	 * readers must ensure that ->refs is alive as long as the file* is
	 * used. Only updated through io_uring_register(2).
	 */
	struct io_rsrc_data	*file_data;
	struct io_file_table	file_table;
	unsigned		nr_user_files;

	/* if used, fixed mapped user buffers */
	struct io_rsrc_data	*buf_data;
	unsigned		nr_user_bufs;
	struct io_mapped_ubuf	**user_bufs;

	struct user_struct	*user;

	/* completed by io_ring_ctx_ref_free() when ->refs is released */
	struct completion	ref_comp;

#if defined(CONFIG_UNIX)
	/* its ->sk is what io_uring_get_socket() returns */
	struct socket		*ring_sock;
#endif

	struct xarray		io_buffers;

	struct xarray		personalities;
	u32			pers_next;

	struct {
		unsigned		cached_cq_tail;
		unsigned		cq_entries;
		unsigned		cq_mask;
		atomic_t		cq_timeouts;
		unsigned		cq_last_tm_flush;
		unsigned		cq_extra;
		unsigned long		cq_check_overflow;
		struct wait_queue_head	cq_wait;
		struct fasync_struct	*cq_fasync;
		struct eventfd_ctx	*cq_ev_fd;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t		completion_lock;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		struct list_head	iopoll_list;
		struct hlist_head	*cancel_hash;
		unsigned		cancel_hash_bits;
		bool			poll_multi_file;
	} ____cacheline_aligned_in_smp;

	struct delayed_work		rsrc_put_work;
	struct llist_head		rsrc_put_llist;
	struct list_head		rsrc_ref_list;
	spinlock_t			rsrc_ref_lock;
	/* current node; io_req_set_rsrc_node() takes a ref on its ->refs */
	struct io_rsrc_node		*rsrc_node;
	struct io_rsrc_node		*rsrc_backup_node;
	struct io_mapped_ubuf		*dummy_ubuf;

	struct io_restriction		restrictions;

	/* exit task_work */
	struct callback_head		*exit_task_work;

	/* Keep this last, we don't need it for the fast path */
	struct work_struct		exit_work;
	struct list_head		tctx_list;
};
  472
/*
 * io_uring state with one submission-side group and one task_work group
 * of fields.
 * NOTE(review): presumably one instance per task using io_uring - confirm
 * against the allocation site, which is outside this view.
 */
struct io_uring_task {
	/* submission side */
	struct xarray		xa;
	struct wait_queue_head	wait;
	const struct io_ring_ctx *last;
	struct io_wq		*io_wq;
	struct percpu_counter	inflight;
	atomic_t		inflight_tracked;
	atomic_t		in_idle;

	/* task_work queueing state */
	spinlock_t		task_lock;
	struct io_wq_work_list	task_list;
	unsigned long		task_state;
	struct callback_head	task_work;
};
  488
/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 *
 * Poll request state: held in io_kiocb's union as 'poll' and embedded in
 * struct async_poll.
 */
struct io_poll_iocb {
	struct file			*file;
	struct wait_queue_head		*head;
	__poll_t			events;
	bool				done;
	bool				canceled;
	struct wait_queue_entry		wait;
};
  501
/*
 * Per-request data held in io_kiocb's union as 'poll_update'. The two
 * booleans indicate which of the new values (events, user_data) apply.
 */
struct io_poll_update {
	struct file			*file;
	u64				old_user_data;
	u64				new_user_data;
	__poll_t			events;
	bool				update_events;
	bool				update_user_data;
};
  510
/* Per-request data held in io_kiocb's union as 'close'. */
struct io_close {
	struct file			*file;
	int				fd;
};
  515
/*
 * Timer state for timeout requests. io_op_defs uses its size as the
 * async_size of IORING_OP_TIMEOUT and IORING_OP_LINK_TIMEOUT, so it is
 * not part of the io_kiocb union.
 */
struct io_timeout_data {
	/* back-pointer to the owning request */
	struct io_kiocb			*req;
	struct hrtimer			timer;
	struct timespec64		ts;
	enum hrtimer_mode		mode;
};
  522
/* Per-request data held in io_kiocb's union as 'accept'. */
struct io_accept {
	struct file			*file;
	/* user-space pointers (__user), not directly dereferenceable */
	struct sockaddr __user		*addr;
	int __user			*addr_len;
	int				flags;
	unsigned long			nofile;
};
  530
/* Per-request data held in io_kiocb's union as 'sync'. */
struct io_sync {
	struct file			*file;
	loff_t				len;
	loff_t				off;
	int				flags;
	int				mode;
};
  538
/* Per-request data held in io_kiocb's union as 'cancel'. */
struct io_cancel {
	struct file			*file;
	u64				addr;
};
  543
/*
 * Per-request data held in io_kiocb's union as 'timeout'.
 * io_is_timeout_noseq() treats off == 0 as "no sequence".
 */
struct io_timeout {
	struct file			*file;
	u32				off;
	u32				target_seq;
	struct list_head		list;
	/* head of the link, used by linked timeouts only */
	struct io_kiocb			*head;
};
  552
/* Per-request data held in io_kiocb's union as 'timeout_rem'. */
struct io_timeout_rem {
	struct file			*file;
	u64				addr;

	/* timeout update */
	struct timespec64		ts;
	u32				flags;
};
  561
/*
 * Per-request data held in io_kiocb's union as 'rw'. Unlike the other
 * union members it has no explicit file field: the embedded kiocb
 * supplies it.
 */
struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u64				len;
};
  568
/* Per-request data held in io_kiocb's union as 'connect'. */
struct io_connect {
	struct file			*file;
	/* user-space pointer (__user) */
	struct sockaddr __user		*addr;
	int				addr_len;
};
  574
/*
 * Per-request data held in io_kiocb's union as 'sr_msg'. The anonymous
 * union holds one user-space pointer viewed as a compat msghdr, a native
 * msghdr or a plain buffer.
 */
struct io_sr_msg {
	struct file			*file;
	union {
		struct compat_msghdr __user	*umsg_compat;
		struct user_msghdr __user	*umsg;
		void __user			*buf;
	};
	int				msg_flags;
	/* presumably the buffer group id for buffer selection - TODO confirm */
	int				bgid;
	size_t				len;
	struct io_buffer		*kbuf;
};
  587
/* Per-request data held in io_kiocb's union as 'open'. */
struct io_open {
	struct file			*file;
	int				dfd;
	struct filename			*filename;
	struct open_how			how;
	unsigned long			nofile;
};
  595
/* Per-request data held in io_kiocb's union as 'rsrc_update'. */
struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};
  602
/* Per-request data held in io_kiocb's union as 'fadvise'. */
struct io_fadvise {
	struct file			*file;
	u64				offset;
	u32				len;
	u32				advice;
};
  609
/* Per-request data held in io_kiocb's union as 'madvise'. */
struct io_madvise {
	struct file			*file;
	u64				addr;
	u32				len;
	u32				advice;
};
  616
/* Per-request data held in io_kiocb's union as 'epoll'. */
struct io_epoll {
	struct file			*file;
	int				epfd;
	int				op;
	int				fd;
	struct epoll_event		event;
};
  624
/*
 * Per-request data held in io_kiocb's union as 'splice'. file_out sits
 * in the leading file-pointer position shared by all union members.
 */
struct io_splice {
	struct file			*file_out;
	struct file			*file_in;
	loff_t				off_out;
	loff_t				off_in;
	u64				len;
	unsigned int			flags;
};
  633
/* Per-request data held in io_kiocb's union as 'pbuf'. */
struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u16				nbufs;
	__u16				bid;
};
  642
/* Per-request data held in io_kiocb's union as 'statx'. */
struct io_statx {
	struct file			*file;
	int				dfd;
	unsigned int			mask;
	unsigned int			flags;
	/* user-space pointers (__user) */
	const char __user		*filename;
	struct statx __user		*buffer;
};
  651
/* Per-request data held in io_kiocb's union as 'shutdown'. */
struct io_shutdown {
	struct file			*file;
	int				how;
};
  656
/* Per-request data held in io_kiocb's union as 'rename'. */
struct io_rename {
	struct file			*file;
	int				old_dfd;
	int				new_dfd;
	struct filename			*oldpath;
	struct filename			*newpath;
	int				flags;
};
  665
/* Per-request data held in io_kiocb's union as 'unlink'. */
struct io_unlink {
	struct file			*file;
	int				dfd;
	int				flags;
	struct filename			*filename;
};
  672
/*
 * Completion-time view of io_kiocb's union (member 'compl'). It overlays
 * the per-op data, so it may only be used once that data has been
 * cleaned up.
 */
struct io_completion {
	struct file			*file;
	struct list_head		list;
	u32				cflags;
};
  678
/*
 * Async data for connect: io_op_defs uses its size as the async_size of
 * IORING_OP_CONNECT.
 */
struct io_async_connect {
	struct sockaddr_storage		address;
};
  682
/*
 * Async data for sendmsg/recvmsg: io_op_defs uses its size as the
 * async_size of IORING_OP_SENDMSG and IORING_OP_RECVMSG.
 */
struct io_async_msghdr {
	/* inline iovec storage, used when free_iov is NULL */
	struct iovec			fast_iov[UIO_FASTIOV];
	/* points to an allocated iov, if NULL we use fast_iov instead */
	struct iovec			*free_iov;
	struct sockaddr __user		*uaddr;
	struct msghdr			msg;
	struct sockaddr_storage		addr;
};
  691
/*
 * Async data for reads and writes: io_op_defs uses its size as the
 * async_size of the READV/WRITEV, READ_FIXED/WRITE_FIXED and READ/WRITE
 * opcodes.
 */
struct io_async_rw {
	struct iovec			fast_iov[UIO_FASTIOV];
	const struct iovec		*free_iovec;
	struct iov_iter			iter;
	size_t				bytes_done;
	struct wait_page_queue		wpq;
};
  699
/*
 * Bit numbers for io_kiocb->flags. The first six reuse the IOSQE_*_BIT
 * numbers of the user-visible SQE flags; kernel-internal bits start at 8
 * and count up from there, which makes __REQ_F_LAST_BIT equal to 23.
 */
enum {
	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,

	/* first byte is taken by user flags, shift it to not overlap */
	REQ_F_FAIL_LINK_BIT	= 8,
	REQ_F_INFLIGHT_BIT,
	REQ_F_CUR_POS_BIT,
	REQ_F_NOWAIT_BIT,
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_POLLED_BIT,
	REQ_F_BUFFER_SELECTED_BIT,
	REQ_F_LTIMEOUT_ACTIVE_BIT,
	REQ_F_COMPLETE_INLINE_BIT,
	REQ_F_REISSUE_BIT,
	REQ_F_DONT_REISSUE_BIT,
	/* keep async read/write and isreg together and in order */
	REQ_F_ASYNC_READ_BIT,
	REQ_F_ASYNC_WRITE_BIT,
	REQ_F_ISREG_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
};
  729
/*
 * Flag masks for io_kiocb->flags, each built with BIT() from the
 * matching REQ_F_*_BIT number.
 */
enum {
	/* ctx owns file */
	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
	/* IOSQE_BUFFER_SELECT */
	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),

	/* fail rest of links */
	REQ_F_FAIL_LINK		= BIT(REQ_F_FAIL_LINK_BIT),
	/* on inflight list, should be cancelled and waited on exit reliably */
	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
	/* has or had linked timeout */
	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
	/* already went through poll handler */
	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
	/* linked timeout is active, i.e. prepared by link's head */
	REQ_F_LTIMEOUT_ACTIVE	= BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
	/* completion is deferred through io_comp_state */
	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),
	/* caller should reissue async */
	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
	/* don't attempt request reissue, see io_rw_reissue() */
	REQ_F_DONT_REISSUE	= BIT(REQ_F_DONT_REISSUE_BIT),
	/* supports async reads */
	REQ_F_ASYNC_READ	= BIT(REQ_F_ASYNC_READ_BIT),
	/* supports async writes */
	REQ_F_ASYNC_WRITE	= BIT(REQ_F_ASYNC_WRITE_BIT),
	/* regular file */
	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
};
  775
/*
 * Poll state for an internally armed ("async armed") poll, reached
 * through io_kiocb->apoll: one embedded poll entry plus a pointer to an
 * optional second one.
 */
struct async_poll {
	struct io_poll_iocb	poll;
	struct io_poll_iocb	*double_poll;
};
  780
/*
 * A work-list node paired with the function to run for it. io_kiocb
 * overlays it with a plain callback_head in an anonymous union.
 */
struct io_task_work {
	struct io_wq_work_node	node;
	task_work_func_t	func;
};
  785
/*
 * Resource type selectors. __io_register_rsrc_update() takes an
 * 'unsigned type' argument; presumably one of these - TODO confirm.
 */
enum {
	IORING_RSRC_FILE		= 0,
	IORING_RSRC_BUFFER		= 1,
};
  790
/*
 * One in-flight io_uring request.
 *
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'file' in this struct.
 */
struct io_kiocb {
	union {
		struct file		*file;
		struct io_rw		rw;
		struct io_poll_iocb	poll;
		struct io_poll_update	poll_update;
		struct io_accept	accept;
		struct io_sync		sync;
		struct io_cancel	cancel;
		struct io_timeout	timeout;
		struct io_timeout_rem	timeout_rem;
		struct io_connect	connect;
		struct io_sr_msg	sr_msg;
		struct io_open		open;
		struct io_close		close;
		struct io_rsrc_update	rsrc_update;
		struct io_fadvise	fadvise;
		struct io_madvise	madvise;
		struct io_epoll		epoll;
		struct io_splice	splice;
		struct io_provide_buf	pbuf;
		struct io_statx		statx;
		struct io_shutdown	shutdown;
		struct io_rename	rename;
		struct io_unlink	unlink;
		/* use only after cleaning per-op data, see io_clean_op() */
		struct io_completion	compl;
	};

	/* allocated if the opcode needs to store data for async defer */
	void				*async_data;
	u8				opcode;
	/* polled IO has completed */
	u8				iopoll_completed;

	u16				buf_index;
	u32				result;

	/* ring context this request belongs to */
	struct io_ring_ctx		*ctx;
	/* REQ_F_* flag mask */
	unsigned int			flags;
	atomic_t			refs;
	/* compared against the target task in io_match_task() */
	struct task_struct		*task;
	u64				user_data;

	/* next request in a link chain, walked by io_for_each_link() */
	struct io_kiocb			*link;
	/* set once by io_req_set_rsrc_node(); NULL until then */
	struct percpu_ref		*fixed_rsrc_refs;

	/* used with ctx->iopoll_list with reads/writes */
	struct list_head		inflight_entry;
	union {
		struct io_task_work	io_task_work;
		struct callback_head	task_work;
	};
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	struct hlist_node		hash_node;
	struct async_poll		*apoll;
	struct io_wq_work		work;
	/* store used ubuf, so we can prevent reloading */
	struct io_mapped_ubuf		*imu;
};
  857
/*
 * Links one task with one ring context.
 * NOTE(review): presumably chained on io_ring_ctx->tctx_list through
 * ctx_node - confirm.
 */
struct io_tctx_node {
	struct list_head	ctx_node;
	struct task_struct	*task;
	struct io_ring_ctx	*ctx;
};
  863
/*
 * A deferred request together with a sequence number.
 * NOTE(review): presumably queued on io_ring_ctx->defer_list - confirm.
 */
struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};
  869
/*
 * Static per-opcode properties. One entry per IORING_OP_* value lives in
 * the io_op_defs[] table; fields left out of an initializer are zero.
 */
struct io_op_def {
	/* needs req->file assigned */
	unsigned		needs_file : 1;
	/* hash wq insertion if file is a regular file */
	unsigned		hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned		unbound_nonreg_file : 1;
	/* opcode is not supported by this kernel */
	unsigned		not_supported : 1;
	/* set if opcode supports polled "wait" */
	unsigned		pollin : 1;
	unsigned		pollout : 1;
	/* op supports buffer selection */
	unsigned		buffer_select : 1;
	/* do prep async if is going to be punted */
	unsigned		needs_async_setup : 1;
	/* should block plug */
	unsigned		plug : 1;
	/* size of async data needed, if any */
	unsigned short		async_size;
};
  891
  892static const struct io_op_def io_op_defs[] = {
  893	[IORING_OP_NOP] = {},
  894	[IORING_OP_READV] = {
  895		.needs_file		= 1,
  896		.unbound_nonreg_file	= 1,
  897		.pollin			= 1,
  898		.buffer_select		= 1,
  899		.needs_async_setup	= 1,
  900		.plug			= 1,
  901		.async_size		= sizeof(struct io_async_rw),
  902	},
  903	[IORING_OP_WRITEV] = {
  904		.needs_file		= 1,
  905		.hash_reg_file		= 1,
  906		.unbound_nonreg_file	= 1,
  907		.pollout		= 1,
  908		.needs_async_setup	= 1,
  909		.plug			= 1,
  910		.async_size		= sizeof(struct io_async_rw),
  911	},
  912	[IORING_OP_FSYNC] = {
  913		.needs_file		= 1,
  914	},
  915	[IORING_OP_READ_FIXED] = {
  916		.needs_file		= 1,
  917		.unbound_nonreg_file	= 1,
  918		.pollin			= 1,
  919		.plug			= 1,
  920		.async_size		= sizeof(struct io_async_rw),
  921	},
  922	[IORING_OP_WRITE_FIXED] = {
  923		.needs_file		= 1,
  924		.hash_reg_file		= 1,
  925		.unbound_nonreg_file	= 1,
  926		.pollout		= 1,
  927		.plug			= 1,
  928		.async_size		= sizeof(struct io_async_rw),
  929	},
  930	[IORING_OP_POLL_ADD] = {
  931		.needs_file		= 1,
  932		.unbound_nonreg_file	= 1,
  933	},
  934	[IORING_OP_POLL_REMOVE] = {},
  935	[IORING_OP_SYNC_FILE_RANGE] = {
  936		.needs_file		= 1,
  937	},
  938	[IORING_OP_SENDMSG] = {
  939		.needs_file		= 1,
  940		.unbound_nonreg_file	= 1,
  941		.pollout		= 1,
  942		.needs_async_setup	= 1,
  943		.async_size		= sizeof(struct io_async_msghdr),
  944	},
  945	[IORING_OP_RECVMSG] = {
  946		.needs_file		= 1,
  947		.unbound_nonreg_file	= 1,
  948		.pollin			= 1,
  949		.buffer_select		= 1,
  950		.needs_async_setup	= 1,
  951		.async_size		= sizeof(struct io_async_msghdr),
  952	},
  953	[IORING_OP_TIMEOUT] = {
  954		.async_size		= sizeof(struct io_timeout_data),
  955	},
  956	[IORING_OP_TIMEOUT_REMOVE] = {
  957		/* used by timeout updates' prep() */
  958	},
  959	[IORING_OP_ACCEPT] = {
  960		.needs_file		= 1,
  961		.unbound_nonreg_file	= 1,
  962		.pollin			= 1,
  963	},
  964	[IORING_OP_ASYNC_CANCEL] = {},
  965	[IORING_OP_LINK_TIMEOUT] = {
  966		.async_size		= sizeof(struct io_timeout_data),
  967	},
  968	[IORING_OP_CONNECT] = {
  969		.needs_file		= 1,
  970		.unbound_nonreg_file	= 1,
  971		.pollout		= 1,
  972		.needs_async_setup	= 1,
  973		.async_size		= sizeof(struct io_async_connect),
  974	},
  975	[IORING_OP_FALLOCATE] = {
  976		.needs_file		= 1,
  977	},
  978	[IORING_OP_OPENAT] = {},
  979	[IORING_OP_CLOSE] = {},
  980	[IORING_OP_FILES_UPDATE] = {},
  981	[IORING_OP_STATX] = {},
  982	[IORING_OP_READ] = {
  983		.needs_file		= 1,
  984		.unbound_nonreg_file	= 1,
  985		.pollin			= 1,
  986		.buffer_select		= 1,
  987		.plug			= 1,
  988		.async_size		= sizeof(struct io_async_rw),
  989	},
  990	[IORING_OP_WRITE] = {
  991		.needs_file		= 1,
  992		.unbound_nonreg_file	= 1,
  993		.pollout		= 1,
  994		.plug			= 1,
  995		.async_size		= sizeof(struct io_async_rw),
  996	},
  997	[IORING_OP_FADVISE] = {
  998		.needs_file		= 1,
  999	},
 1000	[IORING_OP_MADVISE] = {},
 1001	[IORING_OP_SEND] = {
 1002		.needs_file		= 1,
 1003		.unbound_nonreg_file	= 1,
 1004		.pollout		= 1,
 1005	},
 1006	[IORING_OP_RECV] = {
 1007		.needs_file		= 1,
 1008		.unbound_nonreg_file	= 1,
 1009		.pollin			= 1,
 1010		.buffer_select		= 1,
 1011	},
 1012	[IORING_OP_OPENAT2] = {
 1013	},
 1014	[IORING_OP_EPOLL_CTL] = {
 1015		.unbound_nonreg_file	= 1,
 1016	},
 1017	[IORING_OP_SPLICE] = {
 1018		.needs_file		= 1,
 1019		.hash_reg_file		= 1,
 1020		.unbound_nonreg_file	= 1,
 1021	},
 1022	[IORING_OP_PROVIDE_BUFFERS] = {},
 1023	[IORING_OP_REMOVE_BUFFERS] = {},
 1024	[IORING_OP_TEE] = {
 1025		.needs_file		= 1,
 1026		.hash_reg_file		= 1,
 1027		.unbound_nonreg_file	= 1,
 1028	},
 1029	[IORING_OP_SHUTDOWN] = {
 1030		.needs_file		= 1,
 1031	},
 1032	[IORING_OP_RENAMEAT] = {},
 1033	[IORING_OP_UNLINKAT] = {},
 1034};
 1035
 1036static bool io_disarm_next(struct io_kiocb *req);
 1037static void io_uring_del_task_file(unsigned long index);
 1038static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 1039					 struct task_struct *task,
 1040					 struct files_struct *files);
 1041static void io_uring_cancel_sqpoll(struct io_sq_data *sqd);
 1042static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
 1043
 1044static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
 1045				 long res, unsigned int cflags);
 1046static void io_put_req(struct io_kiocb *req);
 1047static void io_put_req_deferred(struct io_kiocb *req, int nr);
 1048static void io_dismantle_req(struct io_kiocb *req);
 1049static void io_put_task(struct task_struct *task, int nr);
 1050static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 1051static void io_queue_linked_timeout(struct io_kiocb *req);
 1052static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
 1053				     struct io_uring_rsrc_update2 *up,
 1054				     unsigned nr_args);
 1055static void io_clean_op(struct io_kiocb *req);
 1056static struct file *io_file_get(struct io_submit_state *state,
 1057				struct io_kiocb *req, int fd, bool fixed);
 1058static void __io_queue_sqe(struct io_kiocb *req);
 1059static void io_rsrc_put_work(struct work_struct *work);
 1060
 1061static void io_req_task_queue(struct io_kiocb *req);
 1062static void io_submit_flush_completions(struct io_comp_state *cs,
 1063					struct io_ring_ctx *ctx);
 1064static bool io_poll_remove_waitqs(struct io_kiocb *req);
 1065static int io_req_prep_async(struct io_kiocb *req);
 1066
 1067static struct kmem_cache *req_cachep;
 1068
 1069static const struct file_operations io_uring_fops;
 1070
 1071struct sock *io_uring_get_socket(struct file *file)
 1072{
 1073#if defined(CONFIG_UNIX)
 1074	if (file->f_op == &io_uring_fops) {
 1075		struct io_ring_ctx *ctx = file->private_data;
 1076
 1077		return ctx->ring_sock->sk;
 1078	}
 1079#endif
 1080	return NULL;
 1081}
 1082EXPORT_SYMBOL(io_uring_get_socket);
 1083
 1084#define io_for_each_link(pos, head) \
 1085	for (pos = (head); pos; pos = pos->link)
 1086
 1087static inline void io_req_set_rsrc_node(struct io_kiocb *req)
 1088{
 1089	struct io_ring_ctx *ctx = req->ctx;
 1090
 1091	if (!req->fixed_rsrc_refs) {
 1092		req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
 1093		percpu_ref_get(req->fixed_rsrc_refs);
 1094	}
 1095}
 1096
 1097static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
 1098{
 1099	bool got = percpu_ref_tryget(ref);
 1100
 1101	/* already at zero, wait for ->release() */
 1102	if (!got)
 1103		wait_for_completion(compl);
 1104	percpu_ref_resurrect(ref);
 1105	if (got)
 1106		percpu_ref_put(ref);
 1107}
 1108
 1109static bool io_match_task(struct io_kiocb *head,
 1110			  struct task_struct *task,
 1111			  struct files_struct *files)
 1112{
 1113	struct io_kiocb *req;
 1114
 1115	if (task && head->task != task)
 1116		return false;
 1117	if (!files)
 1118		return true;
 1119
 1120	io_for_each_link(req, head) {
 1121		if (req->flags & REQ_F_INFLIGHT)
 1122			return true;
 1123	}
 1124	return false;
 1125}
 1126
 1127static inline void req_set_fail_links(struct io_kiocb *req)
 1128{
 1129	if (req->flags & REQ_F_LINK)
 1130		req->flags |= REQ_F_FAIL_LINK;
 1131}
 1132
 1133static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 1134{
 1135	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 1136
 1137	complete(&ctx->ref_comp);
 1138}
 1139
 1140static inline bool io_is_timeout_noseq(struct io_kiocb *req)
 1141{
 1142	return !req->timeout.off;
 1143}
 1144
 1145static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 1146{
 1147	struct io_ring_ctx *ctx;
 1148	int hash_bits;
 1149
 1150	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 1151	if (!ctx)
 1152		return NULL;
 1153
 1154	/*
 1155	 * Use 5 bits less than the max cq entries, that should give us around
 1156	 * 32 entries per hash list if totally full and uniformly spread.
 1157	 */
 1158	hash_bits = ilog2(p->cq_entries);
 1159	hash_bits -= 5;
 1160	if (hash_bits <= 0)
 1161		hash_bits = 1;
 1162	ctx->cancel_hash_bits = hash_bits;
 1163	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
 1164					GFP_KERNEL);
 1165	if (!ctx->cancel_hash)
 1166		goto err;
 1167	__hash_init(ctx->cancel_hash, 1U << hash_bits);
 1168
 1169	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
 1170	if (!ctx->dummy_ubuf)
 1171		goto err;
 1172	/* set invalid range, so io_import_fixed() fails meeting it */
 1173	ctx->dummy_ubuf->ubuf = -1UL;
 1174
 1175	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 1176			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 1177		goto err;
 1178
 1179	ctx->flags = p->flags;
 1180	init_waitqueue_head(&ctx->sqo_sq_wait);
 1181	INIT_LIST_HEAD(&ctx->sqd_list);
 1182	init_waitqueue_head(&ctx->cq_wait);
 1183	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 1184	init_completion(&ctx->ref_comp);
 1185	xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
 1186	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 1187	mutex_init(&ctx->uring_lock);
 1188	init_waitqueue_head(&ctx->wait);
 1189	spin_lock_init(&ctx->completion_lock);
 1190	INIT_LIST_HEAD(&ctx->iopoll_list);
 1191	INIT_LIST_HEAD(&ctx->defer_list);
 1192	INIT_LIST_HEAD(&ctx->timeout_list);
 1193	spin_lock_init(&ctx->rsrc_ref_lock);
 1194	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 1195	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
 1196	init_llist_head(&ctx->rsrc_put_llist);
 1197	INIT_LIST_HEAD(&ctx->tctx_list);
 1198	INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
 1199	INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
 1200	return ctx;
 1201err:
 1202	kfree(ctx->dummy_ubuf);
 1203	kfree(ctx->cancel_hash);
 1204	kfree(ctx);
 1205	return NULL;
 1206}
 1207
 1208static bool req_need_defer(struct io_kiocb *req, u32 seq)
 1209{
 1210	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
 1211		struct io_ring_ctx *ctx = req->ctx;
 1212
 1213		return seq + ctx->cq_extra != ctx->cached_cq_tail
 1214				+ READ_ONCE(ctx->cached_cq_overflow);
 1215	}
 1216
 1217	return false;
 1218}
 1219
 1220static void io_req_track_inflight(struct io_kiocb *req)
 1221{
 1222	if (!(req->flags & REQ_F_INFLIGHT)) {
 1223		req->flags |= REQ_F_INFLIGHT;
 1224		atomic_inc(&current->io_uring->inflight_tracked);
 1225	}
 1226}
 1227
 1228static void io_prep_async_work(struct io_kiocb *req)
 1229{
 1230	const struct io_op_def *def = &io_op_defs[req->opcode];
 1231	struct io_ring_ctx *ctx = req->ctx;
 1232
 1233	if (!req->work.creds)
 1234		req->work.creds = get_current_cred();
 1235
 1236	req->work.list.next = NULL;
 1237	req->work.flags = 0;
 1238	if (req->flags & REQ_F_FORCE_ASYNC)
 1239		req->work.flags |= IO_WQ_WORK_CONCURRENT;
 1240
 1241	if (req->flags & REQ_F_ISREG) {
 1242		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
 1243			io_wq_hash_work(&req->work, file_inode(req->file));
 1244	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
 1245		if (def->unbound_nonreg_file)
 1246			req->work.flags |= IO_WQ_WORK_UNBOUND;
 1247	}
 1248
 1249	switch (req->opcode) {
 1250	case IORING_OP_SPLICE:
 1251	case IORING_OP_TEE:
 1252		if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
 1253			req->work.flags |= IO_WQ_WORK_UNBOUND;
 1254		break;
 1255	}
 1256}
 1257
 1258static void io_prep_async_link(struct io_kiocb *req)
 1259{
 1260	struct io_kiocb *cur;
 1261
 1262	io_for_each_link(cur, req)
 1263		io_prep_async_work(cur);
 1264}
 1265
 1266static void io_queue_async_work(struct io_kiocb *req)
 1267{
 1268	struct io_ring_ctx *ctx = req->ctx;
 1269	struct io_kiocb *link = io_prep_linked_timeout(req);
 1270	struct io_uring_task *tctx = req->task->io_uring;
 1271
 1272	BUG_ON(!tctx);
 1273	BUG_ON(!tctx->io_wq);
 1274
 1275	/* init ->work of the whole link before punting */
 1276	io_prep_async_link(req);
 1277	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
 1278					&req->work, req->flags);
 1279	io_wq_enqueue(tctx->io_wq, &req->work);
 1280	if (link)
 1281		io_queue_linked_timeout(link);
 1282}
 1283
 1284static void io_kill_timeout(struct io_kiocb *req, int status)
 1285	__must_hold(&req->ctx->completion_lock)
 1286{
 1287	struct io_timeout_data *io = req->async_data;
 1288
 1289	if (hrtimer_try_to_cancel(&io->timer) != -1) {
 1290		atomic_set(&req->ctx->cq_timeouts,
 1291			atomic_read(&req->ctx->cq_timeouts) + 1);
 1292		list_del_init(&req->timeout.list);
 1293		io_cqring_fill_event(req->ctx, req->user_data, status, 0);
 1294		io_put_req_deferred(req, 1);
 1295	}
 1296}
 1297
 1298static void __io_queue_deferred(struct io_ring_ctx *ctx)
 1299{
 1300	do {
 1301		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
 1302						struct io_defer_entry, list);
 1303
 1304		if (req_need_defer(de->req, de->seq))
 1305			break;
 1306		list_del_init(&de->list);
 1307		io_req_task_queue(de->req);
 1308		kfree(de);
 1309	} while (!list_empty(&ctx->defer_list));
 1310}
 1311
 1312static void io_flush_timeouts(struct io_ring_ctx *ctx)
 1313{
 1314	u32 seq;
 1315
 1316	if (list_empty(&ctx->timeout_list))
 1317		return;
 1318
 1319	seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
 1320
 1321	do {
 1322		u32 events_needed, events_got;
 1323		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
 1324						struct io_kiocb, timeout.list);
 1325
 1326		if (io_is_timeout_noseq(req))
 1327			break;
 1328
 1329		/*
 1330		 * Since seq can easily wrap around over time, subtract
 1331		 * the last seq at which timeouts were flushed before comparing.
 1332		 * Assuming not more than 2^31-1 events have happened since,
 1333		 * these subtractions won't have wrapped, so we can check if
 1334		 * target is in [last_seq, current_seq] by comparing the two.
 1335		 */
 1336		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
 1337		events_got = seq - ctx->cq_last_tm_flush;
 1338		if (events_got < events_needed)
 1339			break;
 1340
 1341		list_del_init(&req->timeout.list);
 1342		io_kill_timeout(req, 0);
 1343	} while (!list_empty(&ctx->timeout_list));
 1344
 1345	ctx->cq_last_tm_flush = seq;
 1346}
 1347
 1348static void io_commit_cqring(struct io_ring_ctx *ctx)
 1349{
 1350	io_flush_timeouts(ctx);
 1351
 1352	/* order cqe stores with ring update */
 1353	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
 1354
 1355	if (unlikely(!list_empty(&ctx->defer_list)))
 1356		__io_queue_deferred(ctx);
 1357}
 1358
 1359static inline bool io_sqring_full(struct io_ring_ctx *ctx)
 1360{
 1361	struct io_rings *r = ctx->rings;
 1362
 1363	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
 1364}
 1365
 1366static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 1367{
 1368	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
 1369}
 1370
 1371static inline struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 1372{
 1373	struct io_rings *rings = ctx->rings;
 1374	unsigned tail;
 1375
 1376	/*
 1377	 * writes to the cq entry need to come after reading head; the
 1378	 * control dependency is enough as we're using WRITE_ONCE to
 1379	 * fill the cq entry
 1380	 */
 1381	if (__io_cqring_events(ctx) == rings->cq_ring_entries)
 1382		return NULL;
 1383
 1384	tail = ctx->cached_cq_tail++;
 1385	return &rings->cqes[tail & ctx->cq_mask];
 1386}
 1387
 1388static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
 1389{
 1390	if (likely(!ctx->cq_ev_fd))
 1391		return false;
 1392	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
 1393		return false;
 1394	return !ctx->eventfd_async || io_wq_current_is_worker();
 1395}
 1396
 1397static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 1398{
 1399	/* see waitqueue_active() comment */
 1400	smp_mb();
 1401
 1402	if (waitqueue_active(&ctx->wait))
 1403		wake_up(&ctx->wait);
 1404	if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
 1405		wake_up(&ctx->sq_data->wait);
 1406	if (io_should_trigger_evfd(ctx))
 1407		eventfd_signal(ctx->cq_ev_fd, 1);
 1408	if (waitqueue_active(&ctx->cq_wait)) {
 1409		wake_up_interruptible(&ctx->cq_wait);
 1410		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
 1411	}
 1412}
 1413
 1414static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
 1415{
 1416	/* see waitqueue_active() comment */
 1417	smp_mb();
 1418
 1419	if (ctx->flags & IORING_SETUP_SQPOLL) {
 1420		if (waitqueue_active(&ctx->wait))
 1421			wake_up(&ctx->wait);
 1422	}
 1423	if (io_should_trigger_evfd(ctx))
 1424		eventfd_signal(ctx->cq_ev_fd, 1);
 1425	if (waitqueue_active(&ctx->cq_wait)) {
 1426		wake_up_interruptible(&ctx->cq_wait);
 1427		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
 1428	}
 1429}
 1430
 1431/* Returns true if there are no backlogged entries after the flush */
 1432static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 1433{
 1434	struct io_rings *rings = ctx->rings;
 1435	unsigned long flags;
 1436	bool all_flushed, posted;
 1437
 1438	if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
 1439		return false;
 1440
 1441	posted = false;
 1442	spin_lock_irqsave(&ctx->completion_lock, flags);
 1443	while (!list_empty(&ctx->cq_overflow_list)) {
 1444		struct io_uring_cqe *cqe = io_get_cqring(ctx);
 1445		struct io_overflow_cqe *ocqe;
 1446
 1447		if (!cqe && !force)
 1448			break;
 1449		ocqe = list_first_entry(&ctx->cq_overflow_list,
 1450					struct io_overflow_cqe, list);
 1451		if (cqe)
 1452			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
 1453		else
 1454			WRITE_ONCE(ctx->rings->cq_overflow,
 1455				   ++ctx->cached_cq_overflow);
 1456		posted = true;
 1457		list_del(&ocqe->list);
 1458		kfree(ocqe);
 1459	}
 1460
 1461	all_flushed = list_empty(&ctx->cq_overflow_list);
 1462	if (all_flushed) {
 1463		clear_bit(0, &ctx->sq_check_overflow);
 1464		clear_bit(0, &ctx->cq_check_overflow);
 1465		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
 1466	}
 1467
 1468	if (posted)
 1469		io_commit_cqring(ctx);
 1470	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 1471	if (posted)
 1472		io_cqring_ev_posted(ctx);
 1473	return all_flushed;
 1474}
 1475
 1476static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 1477{
 1478	bool ret = true;
 1479
 1480	if (test_bit(0, &ctx->cq_check_overflow)) {
 1481		/* iopoll syncs against uring_lock, not completion_lock */
 1482		if (ctx->flags & IORING_SETUP_IOPOLL)
 1483			mutex_lock(&ctx->uring_lock);
 1484		ret = __io_cqring_overflow_flush(ctx, force);
 1485		if (ctx->flags & IORING_SETUP_IOPOLL)
 1486			mutex_unlock(&ctx->uring_lock);
 1487	}
 1488
 1489	return ret;
 1490}
 1491
 1492/*
 1493 * Shamelessly stolen from the mm implementation of page reference checking,
 1494 * see commit f958d7b528b1 for details.
 1495 */
 1496#define req_ref_zero_or_close_to_overflow(req)	\
 1497	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
 1498
 1499static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
 1500{
 1501	return atomic_inc_not_zero(&req->refs);
 1502}
 1503
 1504static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs)
 1505{
 1506	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
 1507	return atomic_sub_and_test(refs, &req->refs);
 1508}
 1509
 1510static inline bool req_ref_put_and_test(struct io_kiocb *req)
 1511{
 1512	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
 1513	return atomic_dec_and_test(&req->refs);
 1514}
 1515
 1516static inline void req_ref_put(struct io_kiocb *req)
 1517{
 1518	WARN_ON_ONCE(req_ref_put_and_test(req));
 1519}
 1520
 1521static inline void req_ref_get(struct io_kiocb *req)
 1522{
 1523	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
 1524	atomic_inc(&req->refs);
 1525}
 1526
 1527static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 1528				     long res, unsigned int cflags)
 1529{
 1530	struct io_overflow_cqe *ocqe;
 1531
 1532	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
 1533	if (!ocqe) {
 1534		/*
 1535		 * If we're in ring overflow flush mode, or in task cancel mode,
 1536		 * or cannot allocate an overflow entry, then we need to drop it
 1537		 * on the floor.
 1538		 */
 1539		WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow);
 1540		return false;
 1541	}
 1542	if (list_empty(&ctx->cq_overflow_list)) {
 1543		set_bit(0, &ctx->sq_check_overflow);
 1544		set_bit(0, &ctx->cq_check_overflow);
 1545		ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
 1546	}
 1547	ocqe->cqe.user_data = user_data;
 1548	ocqe->cqe.res = res;
 1549	ocqe->cqe.flags = cflags;
 1550	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
 1551	return true;
 1552}
 1553
 1554static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
 1555					  long res, unsigned int cflags)
 1556{
 1557	struct io_uring_cqe *cqe;
 1558
 1559	trace_io_uring_complete(ctx, user_data, res, cflags);
 1560
 1561	/*
 1562	 * If we can't get a cq entry, userspace overflowed the
 1563	 * submission (by quite a lot). Increment the overflow count in
 1564	 * the ring.
 1565	 */
 1566	cqe = io_get_cqring(ctx);
 1567	if (likely(cqe)) {
 1568		WRITE_ONCE(cqe->user_data, user_data);
 1569		WRITE_ONCE(cqe->res, res);
 1570		WRITE_ONCE(cqe->flags, cflags);
 1571		return true;
 1572	}
 1573	return io_cqring_event_overflow(ctx, user_data, res, cflags);
 1574}
 1575
 1576/* not as hot to bloat with inlining */
 1577static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
 1578					  long res, unsigned int cflags)
 1579{
 1580	return __io_cqring_fill_event(ctx, user_data, res, cflags);
 1581}
 1582
 1583static void io_req_complete_post(struct io_kiocb *req, long res,
 1584				 unsigned int cflags)
 1585{
 1586	struct io_ring_ctx *ctx = req->ctx;
 1587	unsigned long flags;
 1588
 1589	spin_lock_irqsave(&ctx->completion_lock, flags);
 1590	__io_cqring_fill_event(ctx, req->user_data, res, cflags);
 1591	/*
 1592	 * If we're the last reference to this request, add to our locked
 1593	 * free_list cache.
 1594	 */
 1595	if (req_ref_put_and_test(req)) {
 1596		struct io_comp_state *cs = &ctx->submit_state.comp;
 1597
 1598		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
 1599			if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK))
 1600				io_disarm_next(req);
 1601			if (req->link) {
 1602				io_req_task_queue(req->link);
 1603				req->link = NULL;
 1604			}
 1605		}
 1606		io_dismantle_req(req);
 1607		io_put_task(req->task, 1);
 1608		list_add(&req->compl.list, &cs->locked_free_list);
 1609		cs->locked_free_nr++;
 1610	} else {
 1611		if (!percpu_ref_tryget(&ctx->refs))
 1612			req = NULL;
 1613	}
 1614	io_commit_cqring(ctx);
 1615	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 1616
 1617	if (req) {
 1618		io_cqring_ev_posted(ctx);
 1619		percpu_ref_put(&ctx->refs);
 1620	}
 1621}
 1622
 1623static inline bool io_req_needs_clean(struct io_kiocb *req)
 1624{
 1625	return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP |
 1626				REQ_F_POLLED | REQ_F_INFLIGHT);
 1627}
 1628
 1629static void io_req_complete_state(struct io_kiocb *req, long res,
 1630				  unsigned int cflags)
 1631{
 1632	if (io_req_needs_clean(req))
 1633		io_clean_op(req);
 1634	req->result = res;
 1635	req->compl.cflags = cflags;
 1636	req->flags |= REQ_F_COMPLETE_INLINE;
 1637}
 1638
 1639static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
 1640				     long res, unsigned cflags)
 1641{
 1642	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
 1643		io_req_complete_state(req, res, cflags);
 1644	else
 1645		io_req_complete_post(req, res, cflags);
 1646}
 1647
 1648static inline void io_req_complete(struct io_kiocb *req, long res)
 1649{
 1650	__io_req_complete(req, 0, res, 0);
 1651}
 1652
 1653static void io_req_complete_failed(struct io_kiocb *req, long res)
 1654{
 1655	req_set_fail_links(req);
 1656	io_put_req(req);
 1657	io_req_complete_post(req, res, 0);
 1658}
 1659
 1660static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
 1661					struct io_comp_state *cs)
 1662{
 1663	spin_lock_irq(&ctx->completion_lock);
 1664	list_splice_init(&cs->locked_free_list, &cs->free_list);
 1665	cs->locked_free_nr = 0;
 1666	spin_unlock_irq(&ctx->completion_lock);
 1667}
 1668
 1669/* Returns true IFF there are requests in the cache */
 1670static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
 1671{
 1672	struct io_submit_state *state = &ctx->submit_state;
 1673	struct io_comp_state *cs = &state->comp;
 1674	int nr;
 1675
 1676	/*
 1677	 * If we have more than a batch's worth of requests in our IRQ side
 1678	 * locked cache, grab the lock and move them over to our submission
 1679	 * side cache.
 1680	 */
 1681	if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH)
 1682		io_flush_cached_locked_reqs(ctx, cs);
 1683
 1684	nr = state->free_reqs;
 1685	while (!list_empty(&cs->free_list)) {
 1686		struct io_kiocb *req = list_first_entry(&cs->free_list,
 1687						struct io_kiocb, compl.list);
 1688
 1689		list_del(&req->compl.list);
 1690		state->reqs[nr++] = req;
 1691		if (nr == ARRAY_SIZE(state->reqs))
 1692			break;
 1693	}
 1694
 1695	state->free_reqs = nr;
 1696	return nr != 0;
 1697}
 1698
 1699static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 1700{
 1701	struct io_submit_state *state = &ctx->submit_state;
 1702
 1703	BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));
 1704
 1705	if (!state->free_reqs) {
 1706		gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 1707		int ret;
 1708
 1709		if (io_flush_cached_reqs(ctx))
 1710			goto got_req;
 1711
 1712		ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
 1713					    state->reqs);
 1714
 1715		/*
 1716		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
 1717		 * retry single alloc to be on the safe side.
 1718		 */
 1719		if (unlikely(ret <= 0)) {
 1720			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
 1721			if (!state->reqs[0])
 1722				return NULL;
 1723			ret = 1;
 1724		}
 1725		state->free_reqs = ret;
 1726	}
 1727got_req:
 1728	state->free_reqs--;
 1729	return state->reqs[state->free_reqs];
 1730}
 1731
 1732static inline void io_put_file(struct file *file)
 1733{
 1734	if (file)
 1735		fput(file);
 1736}
 1737
 1738static void io_dismantle_req(struct io_kiocb *req)
 1739{
 1740	unsigned int flags = req->flags;
 1741
 1742	if (io_req_needs_clean(req))
 1743		io_clean_op(req);
 1744	if (!(flags & REQ_F_FIXED_FILE))
 1745		io_put_file(req->file);
 1746	if (req->fixed_rsrc_refs)
 1747		percpu_ref_put(req->fixed_rsrc_refs);
 1748	if (req->async_data)
 1749		kfree(req->async_data);
 1750	if (req->work.creds) {
 1751		put_cred(req->work.creds);
 1752		req->work.creds = NULL;
 1753	}
 1754}
 1755
 1756/* must to be called somewhat shortly after putting a request */
 1757static inline void io_put_task(struct task_struct *task, int nr)
 1758{
 1759	struct io_uring_task *tctx = task->io_uring;
 1760
 1761	percpu_counter_sub(&tctx->inflight, nr);
 1762	if (unlikely(atomic_read(&tctx->in_idle)))
 1763		wake_up(&tctx->wait);
 1764	put_task_struct_many(task, nr);
 1765}
 1766
 1767static void __io_free_req(struct io_kiocb *req)
 1768{
 1769	struct io_ring_ctx *ctx = req->ctx;
 1770
 1771	io_dismantle_req(req);
 1772	io_put_task(req->task, 1);
 1773
 1774	kmem_cache_free(req_cachep, req);
 1775	percpu_ref_put(&ctx->refs);
 1776}
 1777
 1778static inline void io_remove_next_linked(struct io_kiocb *req)
 1779{
 1780	struct io_kiocb *nxt = req->link;
 1781
 1782	req->link = nxt->link;
 1783	nxt->link = NULL;
 1784}
 1785
 1786static bool io_kill_linked_timeout(struct io_kiocb *req)
 1787	__must_hold(&req->ctx->completion_lock)
 1788{
 1789	struct io_kiocb *link = req->link;
 1790
 1791	/*
 1792	 * Can happen if a linked timeout fired and link had been like
 1793	 * req -> link t-out -> link t-out [-> ...]
 1794	 */
 1795	if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
 1796		struct io_timeout_data *io = link->async_data;
 1797
 1798		io_remove_next_linked(req);
 1799		link->timeout.head = NULL;
 1800		if (hrtimer_try_to_cancel(&io->timer) != -1) {
 1801			io_cqring_fill_event(link->ctx, link->user_data,
 1802					     -ECANCELED, 0);
 1803			io_put_req_deferred(link, 1);
 1804			return true;
 1805		}
 1806	}
 1807	return false;
 1808}
 1809
 1810static void io_fail_links(struct io_kiocb *req)
 1811	__must_hold(&req->ctx->completion_lock)
 1812{
 1813	struct io_kiocb *nxt, *link = req->link;
 1814
 1815	req->link = NULL;
 1816	while (link) {
 1817		nxt = link->link;
 1818		link->link = NULL;
 1819
 1820		trace_io_uring_fail_link(req, link);
 1821		io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
 1822		io_put_req_deferred(link, 2);
 1823		link = nxt;
 1824	}
 1825}
 1826
 1827static bool io_disarm_next(struct io_kiocb *req)
 1828	__must_hold(&req->ctx->completion_lock)
 1829{
 1830	bool posted = false;
 1831
 1832	if (likely(req->flags & REQ_F_LINK_TIMEOUT))
 1833		posted = io_kill_linked_timeout(req);
 1834	if (unlikely((req->flags & REQ_F_FAIL_LINK) &&
 1835		     !(req->flags & REQ_F_HARDLINK))) {
 1836		posted |= (req->link != NULL);
 1837		io_fail_links(req);
 1838	}
 1839	return posted;
 1840}
 1841
 1842static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
 1843{
 1844	struct io_kiocb *nxt;
 1845
 1846	/*
 1847	 * If LINK is set, we have dependent requests in this chain. If we
 1848	 * didn't fail this request, queue the first one up, moving any other
 1849	 * dependencies to the next request. In case of failure, fail the rest
 1850	 * of the chain.
 1851	 */
 1852	if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) {
 1853		struct io_ring_ctx *ctx = req->ctx;
 1854		unsigned long flags;
 1855		bool posted;
 1856
 1857		spin_lock_irqsave(&ctx->completion_lock, flags);
 1858		posted = io_disarm_next(req);
 1859		if (posted)
 1860			io_commit_cqring(req->ctx);
 1861		spin_unlock_irqrestore(&ctx->completion_lock, flags);
 1862		if (posted)
 1863			io_cqring_ev_posted(ctx);
 1864	}
 1865	nxt = req->link;
 1866	req->link = NULL;
 1867	return nxt;
 1868}
 1869
 1870static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 1871{
 1872	if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
 1873		return NULL;
 1874	return __io_req_find_next(req);
 1875}
 1876
 1877static void ctx_flush_and_put(struct io_ring_ctx *ctx)
 1878{
 1879	if (!ctx)
 1880		return;
 1881	if (ctx->submit_state.comp.nr) {
 1882		mutex_lock(&ctx->uring_lock);
 1883		io_submit_flush_completions(&ctx->submit_state.comp, ctx);
 1884		mutex_unlock(&ctx->uring_lock);
 1885	}
 1886	percpu_ref_put(&ctx->refs);
 1887}
 1888
 1889static bool __tctx_task_work(struct io_uring_task *tctx)
 1890{
 1891	struct io_ring_ctx *ctx = NULL;
 1892	struct io_wq_work_list list;
 1893	struct io_wq_work_node *node;
 1894
 1895	if (wq_list_empty(&tctx->task_list))
 1896		return false;
 1897
 1898	spin_lock_irq(&tctx->task_lock);
 1899	list = tctx->task_list;
 1900	INIT_WQ_LIST(&tctx->task_list);
 1901	spin_unlock_irq(&tctx->task_lock);
 1902
 1903	node = list.first;
 1904	while (node) {
 1905		struct io_wq_work_node *next = node->next;
 1906		struct io_kiocb *req;
 1907
 1908		req = container_of(node, struct io_kiocb, io_task_work.node);
 1909		if (req->ctx != ctx) {
 1910			ctx_flush_and_put(ctx);
 1911			ctx = req->ctx;
 1912			percpu_ref_get(&ctx->refs);
 1913		}
 1914
 1915		req->task_work.func(&req->task_work);
 1916		node = next;
 1917	}
 1918
 1919	ctx_flush_and_put(ctx);
 1920	return list.first != NULL;
 1921}
 1922
 1923static void tctx_task_work(struct callback_head *cb)
 1924{
 1925	struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
 1926
 1927	clear_bit(0, &tctx->task_state);
 1928
 1929	while (__tctx_task_work(tctx))
 1930		cond_resched();
 1931}
 1932
 1933static int io_req_task_work_add(struct io_kiocb *req)
 1934{
 1935	struct task_struct *tsk = req->task;
 1936	struct io_uring_task *tctx = tsk->io_uring;
 1937	enum task_work_notify_mode notify;
 1938	struct io_wq_work_node *node, *prev;
 1939	unsigned long flags;
 1940	int ret = 0;
 1941
 1942	if (unlikely(tsk->flags & PF_EXITING))
 1943		return -ESRCH;
 1944
 1945	WARN_ON_ONCE(!tctx);
 1946
 1947	spin_lock_irqsave(&tctx->task_lock, flags);
 1948	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
 1949	spin_unlock_irqrestore(&tctx->task_lock, flags);
 1950
 1951	/* task_work already pending, we're done */
 1952	if (test_bit(0, &tctx->task_state) ||
 1953	    test_and_set_bit(0, &tctx->task_state))
 1954		return 0;
 1955
 1956	/*
 1957	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
 1958	 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
 1959	 * processing task_work. There's no reliable way to tell if TWA_RESUME
 1960	 * will do the job.
 1961	 */
 1962	notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
 1963
 1964	if (!task_work_add(tsk, &tctx->task_work, notify)) {
 1965		wake_up_process(tsk);
 1966		return 0;
 1967	}
 1968
 1969	/*
 1970	 * Slow path - we failed, find and delete work. if the work is not
 1971	 * in the list, it got run and we're fine.
 1972	 */
 1973	spin_lock_irqsave(&tctx->task_lock, flags);
 1974	wq_list_for_each(node, prev, &tctx->task_list) {
 1975		if (&req->io_task_work.node == node) {
 1976			wq_list_del(&tctx->task_list, node, prev);
 1977			ret = 1;
 1978			break;
 1979		}
 1980	}
 1981	spin_unlock_irqrestore(&tctx->task_lock, flags);
 1982	clear_bit(0, &tctx->task_state);
 1983	return ret;
 1984}
 1985
 1986static bool io_run_task_work_head(struct callback_head **work_head)
 1987{
 1988	struct callback_head *work, *next;
 1989	bool executed = false;
 1990
 1991	do {
 1992		work = xchg(work_head, NULL);
 1993		if (!work)
 1994			break;
 1995
 1996		do {
 1997			next = work->next;
 1998			work->func(work);
 1999			work = next;
 2000			cond_resched();
 2001		} while (work);
 2002		executed = true;
 2003	} while (1);
 2004
 2005	return executed;
 2006}
 2007
 2008static void io_task_work_add_head(struct callback_head **work_head,
 2009				  struct callback_head *task_work)
 2010{
 2011	struct callback_head *head;
 2012
 2013	do {
 2014		head = READ_ONCE(*work_head);
 2015		task_work->next = head;
 2016	} while (cmpxchg(work_head, head, task_work) != head);
 2017}
 2018
 2019static void io_req_task_work_add_fallback(struct io_kiocb *req,
 2020					  task_work_func_t cb)
 2021{
 2022	init_task_work(&req->task_work, cb);
 2023	io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work);
 2024}
 2025
 2026static void io_req_task_cancel(struct callback_head *cb)
 2027{
 2028	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
 2029	struct io_ring_ctx *ctx = req->ctx;
 2030
 2031	/* ctx is guaranteed to stay alive while we hold uring_lock */
 2032	mutex_lock(&ctx->uring_lock);
 2033	io_req_complete_failed(req, req->result);
 2034	mutex_unlock(&ctx->uring_lock);
 2035}
 2036
 2037static void __io_req_task_submit(struct io_kiocb *req)
 2038{
 2039	struct io_ring_ctx *ctx = req->ctx;
 2040
 2041	/* ctx stays valid until unlock, even if we drop all ours ctx->refs */
 2042	mutex_lock(&ctx->uring_lock);
 2043	if (!(current->flags & PF_EXITING) && !current->in_execve)
 2044		__io_queue_sqe(req);
 2045	else
 2046		io_req_complete_failed(req, -EFAULT);
 2047	mutex_unlock(&ctx->uring_lock);
 2048}
 2049
 2050static void io_req_task_submit(struct callback_head *cb)
 2051{
 2052	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
 2053
 2054	__io_req_task_submit(req);
 2055}
 2056
 2057static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
 2058{
 2059	req->result = ret;
 2060	req->task_work.func = io_req_task_cancel;
 2061
 2062	if (unlikely(io_req_task_work_add(req)))
 2063		io_req_task_work_add_fallback(req, io_req_task_cancel);
 2064}
 2065
 2066static void io_req_task_queue(struct io_kiocb *req)
 2067{
 2068	req->task_work.func = io_req_task_submit;
 2069
 2070	if (unlikely(io_req_task_work_add(req)))
 2071		io_req_task_queue_fail(req, -ECANCELED);
 2072}
 2073
 2074static inline void io_queue_next(struct io_kiocb *req)
 2075{
 2076	struct io_kiocb *nxt = io_req_find_next(req);
 2077
 2078	if (nxt)
 2079		io_req_task_queue(nxt);
 2080}
 2081
 2082static void io_free_req(struct io_kiocb *req)
 2083{
 2084	io_queue_next(req);
 2085	__io_free_req(req);
 2086}
 2087
 2088struct req_batch {
 2089	struct task_struct	*task;
 2090	int			task_refs;
 2091	int			ctx_refs;
 2092};
 2093
 2094static inline void io_init_req_batch(struct req_batch *rb)
 2095{
 2096	rb->task_refs = 0;
 2097	rb->ctx_refs = 0;
 2098	rb->task = NULL;
 2099}
 2100
 2101static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
 2102				     struct req_batch *rb)
 2103{
 2104	if (rb->task)
 2105		io_put_task(rb->task, rb->task_refs);
 2106	if (rb->ctx_refs)
 2107		percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
 2108}
 2109
 2110static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
 2111			      struct io_submit_state *state)
 2112{
 2113	io_queue_next(req);
 2114	io_dismantle_req(req);
 2115
 2116	if (req->task != rb->task) {
 2117		if (rb->task)
 2118			io_put_task(rb->task, rb->task_refs);
 2119		rb->task = req->task;
 2120		rb->task_refs = 0;
 2121	}
 2122	rb->task_refs++;
 2123	rb->ctx_refs++;
 2124
 2125	if (state->free_reqs != ARRAY_SIZE(state->reqs))
 2126		state->reqs[state->free_reqs++] = req;
 2127	else
 2128		list_add(&req->compl.list, &state->comp.free_list);
 2129}
 2130
 2131static void io_submit_flush_completions(struct io_comp_state *cs,
 2132					struct io_ring_ctx *ctx)
 2133{
 2134	int i, nr = cs->nr;
 2135	struct io_kiocb *req;
 2136	struct req_batch rb;
 2137
 2138	io_init_req_batch(&rb);
 2139	spin_lock_irq(&ctx->completion_lock);
 2140	for (i = 0; i < nr; i++) {
 2141		req = cs->reqs[i];
 2142		__io_cqring_fill_event(ctx, req->user_data, req->result,
 2143					req->compl.cflags);
 2144	}
 2145	io_commit_cqring(ctx);
 2146	spin_unlock_irq(&ctx->completion_lock);
 2147
 2148	io_cqring_ev_posted(ctx);
 2149	for (i = 0; i < nr; i++) {
 2150		req = cs->reqs[i];
 2151
 2152		/* submission and completion refs */
 2153		if (req_ref_sub_and_test(req, 2))
 2154			io_req_free_batch(&rb, req, &ctx->submit_state);
 2155	}
 2156
 2157	io_req_free_batch_finish(ctx, &rb);
 2158	cs->nr = 0;
 2159}
 2160
 2161/*
 2162 * Drop reference to request, return next in chain (if there is one) if this
 2163 * was the last reference to this request.
 2164 */
 2165static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
 2166{
 2167	struct io_kiocb *nxt = NULL;
 2168
 2169	if (req_ref_put_and_test(req)) {
 2170		nxt = io_req_find_next(req);
 2171		__io_free_req(req);
 2172	}
 2173	return nxt;
 2174}
 2175
 2176static inline void io_put_req(struct io_kiocb *req)
 2177{
 2178	if (req_ref_put_and_test(req))
 2179		io_free_req(req);
 2180}
 2181
 2182static void io_put_req_deferred_cb(struct callback_head *cb)
 2183{
 2184	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
 2185
 2186	io_free_req(req);
 2187}
 2188
 2189static void io_free_req_deferred(struct io_kiocb *req)
 2190{
 2191	req->task_work.func = io_put_req_deferred_cb;
 2192	if (unlikely(io_req_task_work_add(req)))
 2193		io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
 2194}
 2195
 2196static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
 2197{
 2198	if (req_ref_sub_and_test(req, refs))
 2199		io_free_req_deferred(req);
 2200}
 2201
/*
 * Return __io_cqring_events() for @ctx, preceded by a read barrier.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
	/* See comment at the top of this file */
	smp_rmb();
	return __io_cqring_events(ctx);
}
 2208
/*
 * Return the distance between the SQ ring tail in the shared rings and the
 * kernel's cached SQ head (ctx->cached_sq_head).
 */
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	/* make sure SQ entry isn't read before tail */
	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}
 2216
 2217static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
 2218{
 2219	unsigned int cflags;
 2220
 2221	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
 2222	cflags |= IORING_CQE_F_BUFFER;
 2223	req->flags &= ~REQ_F_BUFFER_SELECTED;
 2224	kfree(kbuf);
 2225	return cflags;
 2226}
 2227
 2228static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
 2229{
 2230	struct io_buffer *kbuf;
 2231
 2232	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
 2233	return io_put_kbuf(req, kbuf);
 2234}
 2235
 2236static inline bool io_run_task_work(void)
 2237{
 2238	/*
 2239	 * Not safe to run on exiting task, and the task_work handling will
 2240	 * not add work to such a task.
 2241	 */
 2242	if (unlikely(current->flags & PF_EXITING))
 2243		return false;
 2244	if (current->task_works) {
 2245		__set_current_state(TASK_RUNNING);
 2246		task_work_run();
 2247		return true;
 2248	}
 2249
 2250	return false;
 2251}
 2252
/*
 * Find and free completed poll iocbs
 *
 * Drains the @done list. Each request is either re-queued to async work
 * (result is -EAGAIN and REQ_F_DONT_REISSUE is not set) or gets a CQE
 * posted and its reference dropped. *@nr_events is incremented once per
 * CQE filled. The CQ ring is committed once, after the whole list has been
 * processed.
 */
static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
			       struct list_head *done)
{
	struct req_batch rb;
	struct io_kiocb *req;

	/* order with ->result store in io_complete_rw_iopoll() */
	smp_rmb();

	io_init_req_batch(&rb);
	while (!list_empty(done)) {
		int cflags = 0;

		req = list_first_entry(done, struct io_kiocb, inflight_entry);
		list_del(&req->inflight_entry);

		/* retryable failure: reset completion state and reissue async */
		if (READ_ONCE(req->result) == -EAGAIN &&
		    !(req->flags & REQ_F_DONT_REISSUE)) {
			req->iopoll_completed = 0;
			req_ref_get(req);
			io_queue_async_work(req);
			continue;
		}

		/* hand back the selected buffer and report it in the CQE flags */
		if (req->flags & REQ_F_BUFFER_SELECTED)
			cflags = io_put_rw_kbuf(req);

		__io_cqring_fill_event(ctx, req->user_data, req->result, cflags);
		(*nr_events)++;

		if (req_ref_put_and_test(req))
			io_req_free_batch(&rb, req, &ctx->submit_state);
	}

	io_commit_cqring(ctx);
	io_cqring_ev_posted_iopoll(ctx);
	io_req_free_batch_finish(ctx, &rb);
}
 2294
/*
 * Make one pass over ctx->iopoll_list, polling for completions.
 *
 * Requests already marked completed are moved to a local list and handed
 * to io_iopoll_complete(), which adds to *@nr_events. @min is only used to
 * decide whether ->iopoll() is asked to spin.
 *
 * Returns 0, or the negative value returned by ->iopoll().
 */
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
			long min)
{
	struct io_kiocb *req, *tmp;
	LIST_HEAD(done);
	bool spin;
	int ret;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list, and we're under the requested amount.
	 */
	spin = !ctx->poll_multi_file && *nr_events < min;

	ret = 0;
	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
		struct kiocb *kiocb = &req->rw.kiocb;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed)) {
			list_move_tail(&req->inflight_entry, &done);
			continue;
		}
		if (!list_empty(&done))
			break;

		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
		if (ret < 0)
			break;

		/* iopoll may have completed current req */
		if (READ_ONCE(req->iopoll_completed))
			list_move_tail(&req->inflight_entry, &done);

		/* a non-zero ->iopoll() return turns spinning off for later entries */
		if (ret && spin)
			spin = false;
		ret = 0;
	}

	if (!list_empty(&done))
		io_iopoll_complete(ctx, nr_events, &done);

	return ret;
}
 2343
 2344/*
 2345 * We can't just wait for polled events to come to us, we have to actively
 2346 * find and complete them.
 2347 */
 2348static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
 2349{
 2350	if (!(ctx->flags & IORING_SETUP_IOPOLL))
 2351		return;
 2352
 2353	mutex_lock(&ctx->uring_lock);
 2354	while (!list_empty(&ctx->iopoll_list)) {
 2355		unsigned int nr_events = 0;
 2356
 2357		io_do_iopoll(ctx, &nr_events, 0);
 2358
 2359		/* let it sleep and repeat later if can't complete a request */
 2360		if (nr_events == 0)
 2361			break;
 2362		/*
 2363		 * Ensure we allow local-to-the-cpu processing to take place,
 2364		 * in this case we need to ensure that we reap all events.
 2365		 * Also let task_work, etc. to progress by releasing the mutex
 2366		 */
 2367		if (need_resched()) {
 2368			mutex_unlock(&ctx->uring_lock);
 2369			cond_resched();
 2370			mutex_lock(&ctx->uring_lock);
 2371		}
 2372	}
 2373	mutex_unlock(&ctx->uring_lock);
 2374}
 2375
/*
 * Poll for completions on @ctx until at least @min events have been reaped,
 * an error occurs, the iopoll list stays empty, or a reschedule is needed.
 *
 * Returns 0, or the negative error propagated from io_do_iopoll(). Returns
 * 0 right away, without polling, if the CQ ring already holds events.
 */
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
	unsigned int nr_events = 0;
	int ret = 0;

	/*
	 * We disallow the app entering submit/complete with polling, but we
	 * still need to lock the ring to prevent racing with polled issue
	 * that got punted to a workqueue.
	 */
	mutex_lock(&ctx->uring_lock);
	/*
	 * Don't enter poll loop if we already have events pending.
	 * If we do, we can potentially be spinning for commands that
	 * already triggered a CQE (eg in error).
	 */
	if (test_bit(0, &ctx->cq_check_overflow))
		__io_cqring_overflow_flush(ctx, false);
	if (io_cqring_events(ctx))
		goto out;
	do {
		/*
		 * If a submit got punted to a workqueue, we can have the
		 * application entering polling for a command before it gets
		 * issued. That app will hold the uring_lock for the duration
		 * of the poll right here, so we need to take a breather every
		 * now and then to ensure that the issue has a chance to add
		 * the poll to the issued list. Otherwise we can spin here
		 * forever, while the workqueue is stuck trying to acquire the
		 * very same mutex.
		 */
		if (list_empty(&ctx->iopoll_list)) {
			mutex_unlock(&ctx->uring_lock);
			io_run_task_work();
			mutex_lock(&ctx->uring_lock);

			/* still nothing to poll after the breather: stop */
			if (list_empty(&ctx->iopoll_list))
				break;
		}
		ret = io_do_iopoll(ctx, &nr_events, min);
	} while (!ret && nr_events < min && !need_resched());
out:
	mutex_unlock(&ctx->uring_lock);
	return ret;
}
 2421
 2422static void kiocb_end_write(struct io_kiocb *req)
 2423{
 2424	/*
 2425	 * Tell lockdep we inherited freeze protection from submission
 2426	 * thread.
 2427	 */
 2428	if (req->flags & REQ_F_ISREG) {
 2429		struct super_block *sb = file_inode(req->file)->i_sb;
 2430
 2431		__sb_writers_acquired(sb, SB_FREEZE_WRITE);
 2432		sb_end_write(sb);
 2433	}
 2434}
 2435
 2436#ifdef CONFIG_BLOCK
 2437static bool io_resubmit_prep(struct io_kiocb *req)
 2438{
 2439	struct io_async_rw *rw = req->async_data;
 2440
 2441	if (!rw)
 2442		return !io_req_prep_async(req);
 2443	/* may have left rw->iter inconsistent on -EIOCBQUEUED */
 2444	iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter));
 2445	return true;
 2446}
 2447
 2448static bool io_rw_should_reissue(struct io_kiocb *req)
 2449{
 2450	umode_t mode = file_inode(req->file)->i_mode;
 2451	struct io_ring_ctx *ctx = req->ctx;
 2452
 2453	if (!S_ISBLK(mode) && !S_ISREG(mode))
 2454		return false;
 2455	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
 2456	    !(ctx->flags & IORING_SETUP_IOPOLL)))
 2457		return false;
 2458	/*
 2459	 * If ref is dying, we might be running poll reap from the exit work.
 2460	 * Don't attempt to reissue from that path, just let it fail with
 2461	 * -EAGAIN.
 2462	 */
 2463	if (percpu_ref_is_dying(&ctx->refs))
 2464		return false;
 2465	return true;
 2466}
 2467#else
/* Stub for builds without CONFIG_BLOCK: a resubmit can never be prepared. */
static bool io_resubmit_prep(struct io_kiocb *req)
{
	return false;
}
/* Stub for builds without CONFIG_BLOCK: read/write requests are never reissued. */
static bool io_rw_should_reissue(struct io_kiocb *req)
{
	return false;
}
 2476#endif
 2477
 2478static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
 2479			     unsigned int issue_flags)
 2480{
 2481	int cflags = 0;
 2482
 2483	if (req->rw.kiocb.ki_flags & IOCB_WRITE)
 2484		kiocb_end_write(req);
 2485	if (res != req->result) {
 2486		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
 2487		    io_rw_should_reissue(req)) {
 2488			req->flags |= REQ_F_REISSUE;
 2489			return;
 2490		}
 2491		req_set_fail_links(req);
 2492	}
 2493	if (req->flags & REQ_F_BUFFER_SELECTED)
 2494		cflags = io_put_rw_kbuf(req);
 2495	__io_req_complete(req, issue_flags, res, cflags);
 2496}
 2497
 2498static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 2499{
 2500	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 2501
 2502	__io_complete_rw(req, res, res2, 0);
 2503}
 2504
/*
 * ->ki_complete handler for polled IO.
 *
 * No CQE is posted here: the result is stored in req->result and the
 * request is marked completed, so that a later io_do_iopoll() pass picks it
 * up. On an unexpected result that cannot be retried, the links are marked
 * failed and REQ_F_DONT_REISSUE is set.
 */
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

	if (kiocb->ki_flags & IOCB_WRITE)
		kiocb_end_write(req);
	if (unlikely(res != req->result)) {
		/* only -EAGAIN that is both reissuable and re-preparable is retried */
		if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
		    io_resubmit_prep(req))) {
			req_set_fail_links(req);
			req->flags |= REQ_F_DONT_REISSUE;
		}
	}

	WRITE_ONCE(req->result, res);
	/* order with io_iopoll_complete() checking ->result */
	smp_wmb();
	WRITE_ONCE(req->iopoll_completed, 1);
}
 2524
 2525/*
 2526 * After the iocb has been issued, it's safe to be found on the poll list.
 2527 * Adding the kiocb to the list AFTER submission ensures that we don't
 2528 * find it from a io_do_iopoll() thread before the issuer is done
 2529 * accessing the kiocb cookie.
 2530 */
 2531static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
 2532{
 2533	struct io_ring_ctx *ctx = req->ctx;
 2534
 2535	/*
 2536	 * Track whether we have multiple files in our lists. This will impact
 2537	 * how we do polling eventually, not spinning if we're on potentially
 2538	 * different devices.
 2539	 */
 2540	if (list_empty(&ctx->iopoll_list)) {
 2541		ctx->poll_multi_file = false;
 2542	} else if (!ctx->poll_multi_file) {
 2543		struct io_kiocb *list_req;
 2544
 2545		list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
 2546						inflight_entry);
 2547		if (list_req->file != req->file)
 2548			ctx->poll_multi_file = true;
 2549	}
 2550
 2551	/*
 2552	 * For fast devices, IO may have already completed. If it has, add
 2553	 * it to the front so we find it first.
 2554	 */
 2555	if (READ_ONCE(req->iopoll_completed))
 2556		list_add(&req->inflight_entry, &ctx->iopoll_list);
 2557	else
 2558		list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
 2559
 2560	/*
 2561	 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
 2562	 * task context or in io worker task context. If current task context is
 2563	 * sq thread, we don't need to check whether should wake up sq thread.
 2564	 */
 2565	if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
 2566	    wq_has_sleeper(&ctx->sq_data->wait))
 2567		wake_up(&ctx->sq_data->wait);
 2568}
 2569
 2570static inline void io_state_file_put(struct io_submit_state *state)
 2571{
 2572	if (state->file_refs) {
 2573		fput_many(state->file, state->file_refs);
 2574		state->file_refs = 0;
 2575	}
 2576}
 2577
 2578/*
 2579 * Get as many references to a file as we have IOs left in this submission,
 2580 * assuming most submissions are for one file, or at least that each file
 2581 * has more than one submission.
 2582 */
 2583static struct file *__io_file_get(struct io_submit_state *state, int fd)
 2584{
 2585	if (!state)
 2586		return fget(fd);
 2587
 2588	if (state->file_refs) {
 2589		if (state->fd == fd) {
 2590			state->file_refs--;
 2591			return state->file;
 2592		}
 2593		io_state_file_put(state);
 2594	}
 2595	state->file = fget_many(fd, state->ios_left);
 2596	if (unlikely(!state->file))
 2597		return NULL;
 2598
 2599	state->fd = fd;
 2600	state->file_refs = state->ios_left - 1;
 2601	return state->file;
 2602}
 2603
 2604static bool io_bdev_nowait(struct block_device *bdev)
 2605{
 2606	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
 2607}
 2608
 2609/*
 2610 * If we tracked the file through the SCM inflight mechanism, we could support
 2611 * any file. For now, just ensure that anything potentially problematic is done
 2612 * inline.
 2613 */
 2614static bool __io_file_supports_async(struct file *file, int rw)
 2615{
 2616	umode_t mode = file_inode(file)->i_mode;
 2617
 2618	if (S_ISBLK(mode)) {
 2619		if (IS_ENABLED(CONFIG_BLOCK) &&
 2620		    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
 2621			return true;
 2622		return false;
 2623	}
 2624	if (S_ISCHR(mode) || S_ISSOCK(mode))
 2625		return true;
 2626	if (S_ISREG(mode)) {
 2627		if (IS_ENABLED(CONFIG_BLOCK) &&
 2628		    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
 2629		    file->f_op != &io_uring_fops)
 2630			return true;
 2631		return false;
 2632	}
 2633
 2634	/* any ->read/write should understand O_NONBLOCK */
 2635	if (file->f_flags & O_NONBLOCK)
 2636		return true;
 2637
 2638	if (!(file->f_mode & FMODE_NOWAIT))
 2639		return false;
 2640
 2641	if (rw == READ)
 2642		return file->f_op->read_iter != NULL;
 2643
 2644	return file->f_op->write_iter != NULL;
 2645}
 2646
 2647static bool io_file_supports_async(struct io_kiocb *req, int rw)
 2648{
 2649	if (rw == READ && (req->flags & REQ_F_ASYNC_READ))
 2650		return true;
 2651	else if (rw == WRITE && (req->flags & REQ_F_ASYNC_WRITE))
 2652		return true;
 2653
 2654	return __io_file_supports_async(req->file, rw);
 2655}
 2656
/*
 * Common prep for read/write requests: initialise req->rw.kiocb from @sqe
 * and record the address, length and buffer index.
 *
 * Returns 0 on success, or:
 *   - the error from kiocb_set_rw_flags() or ioprio_check_cap();
 *   - -EOPNOTSUPP on an IOPOLL ring if the IO is not IOCB_DIRECT or the file
 *     has no ->iopoll();
 *   - -EINVAL if IOCB_HIPRI ends up set on a non-IOPOLL ring.
 */
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct kiocb *kiocb = &req->rw.kiocb;
	struct file *file = req->file;
	unsigned ioprio;
	int ret;

	if (!(req->flags & REQ_F_ISREG) && S_ISREG(file_inode(file)->i_mode))
		req->flags |= REQ_F_ISREG;

	kiocb->ki_pos = READ_ONCE(sqe->off);
	/* offset -1 on a non-stream file means "use the current file position" */
	if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
		req->flags |= REQ_F_CUR_POS;
		kiocb->ki_pos = file->f_pos;
	}
	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
	if (unlikely(ret))
		return ret;

	/* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
	if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
		req->flags |= REQ_F_NOWAIT;

	/* an explicit sqe priority is validated; zero means use the current task's */
	ioprio = READ_ONCE(sqe->ioprio);
	if (ioprio) {
		ret = ioprio_check_cap(ioprio);
		if (ret)
			return ret;

		kiocb->ki_ioprio = ioprio;
	} else
		kiocb->ki_ioprio = get_current_ioprio();

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
		    !kiocb->ki_filp->f_op->iopoll)
			return -EOPNOTSUPP;

		kiocb->ki_flags |= IOCB_HIPRI;
		kiocb->ki_complete = io_complete_rw_iopoll;
		req->iopoll_completed = 0;
	} else {
		if (kiocb->ki_flags & IOCB_HIPRI)
			return -EINVAL;
		kiocb->ki_complete = io_complete_rw;
	}

	/* fixed-buffer opcodes: the mapped buffer is resolved at import time */
	if (req->opcode == IORING_OP_READ_FIXED ||
	    req->opcode == IORING_OP_WRITE_FIXED) {
		req->imu = NULL;
		io_req_set_rsrc_node(req);
	}

	req->rw.addr = READ_ONCE(sqe->addr);
	req->rw.len = READ_ONCE(sqe->len);
	req->buf_index = READ_ONCE(sqe->buf_index);
	return 0;
}
 2718
 2719static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 2720{
 2721	switch (ret) {
 2722	case -EIOCBQUEUED:
 2723		break;
 2724	case -ERESTARTSYS:
 2725	case -ERESTARTNOINTR:
 2726	case -ERESTARTNOHAND:
 2727	case -ERESTART_RESTARTBLOCK:
 2728		/*
 2729		 * We can't just restart the syscall, since previously
 2730		 * submitted sqes may already be in progress. Just fail this
 2731		 * IO with EINTR.
 2732		 */
 2733		ret = -EINTR;
 2734		fallthrough;
 2735	default:
 2736		kiocb->ki_complete(kiocb, ret, 0);
 2737	}
 2738}
 2739
/*
 * Finish a read/write issue attempt that returned @ret.
 *
 * Bytes already transferred by earlier attempts (io->bytes_done) are folded
 * into the result, the file position is written back for REQ_F_CUR_POS
 * requests, and the request is completed. If the non-polled completion path
 * flagged the request REQ_F_REISSUE, it is either re-queued to async work or,
 * when it cannot be prepared for resubmission, completed as failed.
 */
static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
		       unsigned int issue_flags)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
	struct io_async_rw *io = req->async_data;
	/* only io_complete_rw's path sets REQ_F_REISSUE; polled IO skips the check */
	bool check_reissue = kiocb->ki_complete == io_complete_rw;

	/* add previously done IO, if any */
	if (io && io->bytes_done > 0) {
		if (ret < 0)
			ret = io->bytes_done;
		else
			ret += io->bytes_done;
	}

	if (req->flags & REQ_F_CUR_POS)
		req->file->f_pos = kiocb->ki_pos;
	/* non-negative results on the non-polled path can pass issue_flags along */
	if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
		__io_complete_rw(req, ret, 0, issue_flags);
	else
		io_rw_done(kiocb, ret);

	if (check_reissue && req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		if (io_resubmit_prep(req)) {
			req_ref_get(req);
			io_queue_async_work(req);
		} else {
			int cflags = 0;

			req_set_fail_links(req);
			if (req->flags & REQ_F_BUFFER_SELECTED)
				cflags = io_put_rw_kbuf(req);
			__io_req_complete(req, issue_flags, ret, cflags);
		}
	}
}
 2777
/*
 * Set up @iter as a bvec iterator covering [req->rw.addr, req->rw.addr +
 * req->rw.len) inside the registered buffer @imu.
 *
 * Returns 0 on success, or -EFAULT if the range overflows or is not fully
 * contained in the mapped region.
 */
static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
			     struct io_mapped_ubuf *imu)
{
	size_t len = req->rw.len;
	u64 buf_end, buf_addr = req->rw.addr;
	size_t offset;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			/* jump straight to the target segment and fix up the counts */
			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}
 2835
 2836static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
 2837{
 2838	struct io_ring_ctx *ctx = req->ctx;
 2839	struct io_mapped_ubuf *imu = req->imu;
 2840	u16 index, buf_index = req->buf_index;
 2841
 2842	if (likely(!imu)) {
 2843		if (unlikely(buf_index >= ctx->nr_user_bufs))
 2844			return -EFAULT;
 2845		index = array_index_nospec(buf_index, ctx->nr_user_bufs);
 2846		imu = READ_ONCE(ctx->user_bufs[index]);
 2847		req->imu = imu;
 2848	}
 2849	return __io_import_fixed(req, rw, iter, imu);
 2850}
 2851
 2852static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
 2853{
 2854	if (needs_lock)
 2855		mutex_unlock(&ctx->uring_lock);
 2856}
 2857
 2858static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
 2859{
 2860	/*
 2861	 * "Normal" inline submissions always hold the uring_lock, since we
 2862	 * grab it from the system call. Same is true for the SQPOLL offload.
 2863	 * The only exception is when we've detached the request and issue it
 2864	 * from an async worker thread, grab the lock for that case.
 2865	 */
 2866	if (needs_lock)
 2867		mutex_lock(&ctx->uring_lock);
 2868}
 2869
 2870static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
 2871					  int bgid, struct io_buffer *kbuf,
 2872					  bool needs_lock)
 2873{
 2874	struct io_buffer *head;
 2875
 2876	if (req->flags & REQ_F_BUFFER_SELECTED)
 2877		return kbuf;
 2878
 2879	io_ring_submit_lock(req->ctx, needs_lock);
 2880
 2881	lockdep_assert_held(&req->ctx->uring_lock);
 2882
 2883	head = xa_load(&req->ctx->io_buffers, bgid);
 2884	if (head) {
 2885		if (!list_empty(&head->list)) {
 2886			kbuf = list_last_entry(&head->list, struct io_buffer,
 2887							list);
 2888			list_del(&kbuf->list);
 2889		} else {
 2890			kbuf = head;
 2891			xa_erase(&req->ctx->io_buffers, bgid);
 2892		}
 2893		if (*len > kbuf->len)
 2894			*len = kbuf->len;
 2895	} else {
 2896		kbuf = ERR_PTR(-ENOBUFS);
 2897	}
 2898
 2899	io_ring_submit_unlock(req->ctx, needs_lock);
 2900
 2901	return kbuf;
 2902}
 2903
 2904static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
 2905					bool needs_lock)
 2906{
 2907	struct io_buffer *kbuf;
 2908	u16 bgid;
 2909
 2910	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
 2911	bgid = req->buf_index;
 2912	kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
 2913	if (IS_ERR(kbuf))
 2914		return kbuf;
 2915	req->rw.addr = (u64) (unsigned long) kbuf;
 2916	req->flags |= REQ_F_BUFFER_SELECTED;
 2917	return u64_to_user_ptr(kbuf->addr);
 2918}
 2919
 2920#ifdef CONFIG_COMPAT
 2921static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
 2922				bool needs_lock)
 2923{
 2924	struct compat_iovec __user *uiov;
 2925	compat_ssize_t clen;
 2926	void __user *buf;
 2927	ssize_t len;
 2928
 2929	uiov = u64_to_user_ptr(req->rw.addr);
 2930	if (!access_ok(uiov, sizeof(*uiov)))
 2931		return -EFAULT;
 2932	if (__get_user(clen, &uiov->iov_len))
 2933		return -EFAULT;
 2934	if (clen < 0)
 2935		return -EINVAL;
 2936
 2937	len = clen;
 2938	buf = io_rw_buffer_select(req, &len, needs_lock);
 2939	if (IS_ERR(buf))
 2940		return PTR_ERR(buf);
 2941	iov[0].iov_base = buf;
 2942	iov[0].iov_len = (compat_size_t) len;
 2943	return 0;
 2944}
 2945#endif
 2946
 2947static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 2948				      bool needs_lock)
 2949{
 2950	struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
 2951	void __user *buf;
 2952	ssize_t len;
 2953
 2954	if (copy_from_user(iov, uiov, sizeof(*uiov)))
 2955		return -EFAULT;
 2956
 2957	len = iov[0].iov_len;
 2958	if (len < 0)
 2959		return -EINVAL;
 2960	buf = io_rw_buffer_select(req, &len, needs_lock);
 2961	if (IS_ERR(buf))
 2962		return PTR_ERR(buf);
 2963	iov[0].iov_base = buf;
 2964	iov[0].iov_len = len;
 2965	return 0;
 2966}
 2967
 2968static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 2969				    bool needs_lock)
 2970{
 2971	if (req->flags & REQ_F_BUFFER_SELECTED) {
 2972		struct io_buffer *kbuf;
 2973
 2974		kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
 2975		iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
 2976		iov[0].iov_len = kbuf->len;
 2977		return 0;
 2978	}
 2979	if (req->rw.len != 1)
 2980		return -EINVAL;
 2981
 2982#ifdef CONFIG_COMPAT
 2983	if (req->ctx->compat)
 2984		return io_compat_import(req, iov, needs_lock);
 2985#endif
 2986
 2987	return __io_iov_buffer_select(req, iov, needs_lock);
 2988}
 2989
/*
 * Build @iter for a read/write request according to its opcode and flags.
 *
 * On entry *@iovec points at caller-provided iovec storage. On return it is
 * NULL for the fixed-buffer, single-range and buffer-select paths; for the
 * generic vectored path it is whatever __import_iovec() leaves there.
 *
 * Returns the result of the import helper used, or -EINVAL if a buffer
 * index was supplied without fixed buffers or buffer selection.
 */
static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
			   struct iov_iter *iter, bool needs_lock)
{
	void __user *buf = u64_to_user_ptr(req->rw.addr);
	size_t sqe_len = req->rw.len;
	u8 opcode = req->opcode;
	ssize_t ret;

	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
		*iovec = NULL;
		return io_import_fixed(req, rw, iter);
	}

	/* buffer index only valid with fixed read/write, or buffer select  */
	if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
		return -EINVAL;

	/* non-vectored opcodes: one contiguous user range */
	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
		if (req->flags & REQ_F_BUFFER_SELECT) {
			buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
			if (IS_ERR(buf))
				return PTR_ERR(buf);
			/* remember the (possibly clamped) length */
			req->rw.len = sqe_len;
		}

		ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
		*iovec = NULL;
		return ret;
	}

	/* vectored opcode with buffer select: exactly one iovec, filled by us */
	if (req->flags & REQ_F_BUFFER_SELECT) {
		ret = io_iov_buffer_select(req, *iovec, needs_lock);
		if (!ret)
			iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
		*iovec = NULL;
		return ret;
	}

	return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
			      req->ctx->compat);
}
 3031
 3032static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
 3033{
 3034	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
 3035}
 3036
 3037/*
 3038 * For files that don't have ->read_iter() and ->write_iter(), handle them
 3039 * by looping over ->read() or ->write() manually.
 3040 */
 3041static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
 3042{
 3043	struct kiocb *kiocb = &req->rw.kiocb;
 3044	struct file *file = req->file;
 3045	ssize_t ret = 0;
 3046
 3047	/*
 3048	 * Don't support polled IO through this interface, and we can't
 3049	 * support non-blocking either. For the latter, this just causes
 3050	 * the kiocb to be handled from an async context.
 3051	 */
 3052	if (kiocb->ki_flags & IOCB_HIPRI)
 3053		return -EOPNOTSUPP;
 3054	if (kiocb->ki_flags & IOCB_NOWAIT)
 3055		return -EAGAIN;
 3056
 3057	while (iov_iter_count(iter)) {
 3058		struct iovec iovec;
 3059		ssize_t nr;
 3060
 3061		if (!iov_iter_is_bvec(iter)) {
 3062			iovec = iov_iter_iovec(iter);
 3063		} else {
 3064			iovec.iov_base = u64_to_user_ptr(req->rw.addr);
 3065			iovec.iov_len = req->rw.len;
 3066		}
 3067
 3068		if (rw == READ) {
 3069			nr = file->f_op->read(file, iovec.iov_base,
 3070					      iovec.iov_len, io_kiocb_ppos(kiocb));
 3071		} else {
 3072			nr = file->f_op->write(file, iovec.iov_base,
 3073					       iovec.iov_len, io_kiocb_ppos(kiocb));
 3074		}
 3075
 3076		if (nr < 0) {
 3077			if (!ret)
 3078				ret = nr;
 3079			break;
 3080		}
 3081		ret += nr;
 3082		if (nr != iovec.iov_len)
 3083			break;
 3084		req->rw.len -= nr;
 3085		req->rw.addr += nr;
 3086		iov_iter_advance(iter, nr);
 3087	}
 3088
 3089	return ret;
 3090}
 3091
/*
 * Copy the read/write iterator state into the request's async data so the
 * request no longer depends on the caller's stack storage.
 *
 * @iovec:    heap-allocated iovec array, or NULL if @fast_iov was used
 * @fast_iov: the caller's inline iovec array
 * @iter:     iterator to snapshot
 */
static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
			  const struct iovec *fast_iov, struct iov_iter *iter)
{
	struct io_async_rw *rw = req->async_data;

	memcpy(&rw->iter, iter, sizeof(*iter));
	rw->free_iovec = iovec;
	rw->bytes_done = 0;
	/* can only be fixed buffers, no need to do anything */
	if (iov_iter_is_bvec(iter))
		return;
	if (!iovec) {
		unsigned iov_off = 0;

		/* repoint the copied iter at our own inline array */
		rw->iter.iov = rw->fast_iov;
		if (iter->iov != fast_iov) {
			/* iter has moved past some segments; keep the same offset */
			iov_off = iter->iov - fast_iov;
			rw->iter.iov += iov_off;
		}
		/* copy the remaining segments unless they already live in rw */
		if (rw->fast_iov != fast_iov)
			memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
			       sizeof(struct iovec) * iter->nr_segs);
	} else {
		/* allocated iovec is recorded in ->free_iovec; flag for cleanup */
		req->flags |= REQ_F_NEED_CLEANUP;
	}
}
 3118
 3119static inline int io_alloc_async_data(struct io_kiocb *req)
 3120{
 3121	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
 3122	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
 3123	return req->async_data == NULL;
 3124}
 3125
 3126static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 3127			     const struct iovec *fast_iov,
 3128			     struct iov_iter *iter, bool force)
 3129{
 3130	if (!force && !io_op_defs[req->opcode].needs_async_setup)
 3131		return 0;
 3132	if (!req->async_data) {
 3133		if (io_alloc_async_data(req)) {
 3134			kfree(iovec);
 3135			return -ENOMEM;
 3136		}
 3137
 3138		io_req_map_rw(req, iovec, fast_iov, iter);
 3139	}
 3140	return 0;
 3141}
 3142
 3143static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
 3144{
 3145	struct io_async_rw *iorw = req->async_data;
 3146	struct iovec *iov = iorw->fast_iov;
 3147	int ret;
 3148
 3149	ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
 3150	if (unlikely(ret < 0))
 3151		return ret;
 3152
 3153	iorw->bytes_done = 0;
 3154	iorw->free_iovec = iov;
 3155	if (iov)
 3156		req->flags |= REQ_F_NEED_CLEANUP;
 3157	return 0;
 3158}
 3159
 3160static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 3161{
 3162	if (unlikely(!(req->file->f_mode & FMODE_READ)))
 3163		return -EBADF;
 3164	return io_prep_rw(req, sqe);
 3165}
 3166
 3167/*
 3168 * This is our waitqueue callback handler, registered through lock_page_async()
 3169 * when we initially tried to do the IO with the iocb armed our waitqueue.
 3170 * This gets called when the page is unlocked, and we generally expect that to
 3171 * happen when the page IO is completed and the page is now uptodate. This will
 3172 * queue a task_work based retry of the operation, attempting to copy the data
 3173 * again. If the latter fails because the page was NOT uptodate, then we will
 3174 * do a thread based blocking retry of the operation. That's the unexpected
 3175 * slow path.
 3176 */
 3177static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 3178			     int sync, void *arg)
 3179{
 3180	struct wait_page_queue *wpq;
 3181	struct io_kiocb *req = wait->private;
 3182	struct wait_page_key *key = arg;
 3183
 3184	wpq = container_of(wait, struct wait_page_queue, wait);
 3185
 3186	if (!wake_page_match(wpq, key))
 3187		return 0;
 3188
 3189	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
 3190	list_del_init(&wait->entry);
 3191
 3192	/* submit ref gets dropped, acquire a new one */
 3193	req_ref_get(req);
 3194	io_req_task_queue(req);
 3195	return 1;
 3196}
 3197
 3198/*
 3199 * This controls whether a given IO request should be armed for async page
 3200 * based retry. If we return false here, the request is handed to the async
 3201 * worker threads for retry. If we're doing buffered reads on a regular file,
 3202 * we prepare a private wait_page_queue entry and retry the operation. This
 3203 * will either succeed because the page is now uptodate and unlocked, or it
 3204 * will register a callback when the page is unlocked at IO completion. Through
 3205 * that callback, io_uring uses task_work to setup a retry of the operation.
 3206 * That retry will attempt the buffered read again. The retry will generally
 3207 * succeed, or in rare cases where it fails, we then fall back to using the
 3208 * async worker threads for a blocking retry.
 3209 */
 3210static bool io_rw_should_retry(struct io_kiocb *req)
 3211{
 3212	struct io_async_rw *rw = req->async_data;
 3213	struct wait_page_queue *wait = &rw->wpq;
 3214	struct kiocb *kiocb = &req->rw.kiocb;
 3215
 3216	/* never retry for NOWAIT, we just complete with -EAGAIN */
 3217	if (req->flags & REQ_F_NOWAIT)
 3218		return false;
 3219
 3220	/* Only for buffered IO */
 3221	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
 3222		return false;
 3223
 3224	/*
 3225	 * just use poll if we can, and don't attempt if the fs doesn't
 3226	 * support callback based unlocks
 3227	 */
 3228	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
 3229		return false;
 3230
 3231	wait->wait.func = io_async_buf_func;
 3232	wait->wait.private = req;
 3233	wait->wait.flags = 0;
 3234	INIT_LIST_HEAD(&wait->wait.entry);
 3235	kiocb->ki_flags |= IOCB_WAITQ;
 3236	kiocb->ki_flags &= ~IOCB_NOWAIT;
 3237	kiocb->ki_waitq = wait;
 3238	return true;
 3239}
 3240
 3241static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
 3242{
 3243	if (req->file->f_op->read_iter)
 3244		return call_read_iter(req->file, &req->rw.kiocb, iter);
 3245	else if (req->file->f_op->read)
 3246		return loop_rw_iter(READ, req, iter);
 3247	else
 3248		return -EINVAL;
 3249}
 3250
/*
 * Issue a read for @req.
 *
 * The iterator comes from req->async_data when async state already exists,
 * otherwise the iovec is imported using on-stack storage. IOCB_NOWAIT is
 * set or cleared to match IO_URING_F_NONBLOCK in @issue_flags before the
 * read is attempted.
 *
 * Returns:
 *   0        - the request was passed to kiocb_done(), or the read returned
 *              -EIOCBQUEUED and nothing more is done here;
 *   -EAGAIN  - async state has been set up and the read should be reissued
 *              (presumably from a blocking context - confirm against the
 *              issue path);
 *   < 0      - error from io_import_iovec(), io_setup_async_rw() or
 *              rw_verify_area().
 */
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct kiocb *kiocb = &req->rw.kiocb;
	struct iov_iter __iter, *iter = &__iter;
	struct io_async_rw *rw = req->async_data;
	ssize_t io_size, ret, ret2;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	if (rw) {
		/* reuse the saved iterator; there is no local iovec to free */
		iter = &rw->iter;
		iovec = NULL;
	} else {
		ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
		if (ret < 0)
			return ret;
	}
	io_size = iov_iter_count(iter);
	req->result = io_size;

	/* Ensure we clear previously set non-block flag */
	if (!force_nonblock)
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	else
		kiocb->ki_flags |= IOCB_NOWAIT;

	/* If the file doesn't support async, just async punt */
	if (force_nonblock && !io_file_supports_async(req, READ)) {
		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
		return ret ?: -EAGAIN;
	}

	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}

	ret = io_iter_do_read(req, iter);

	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
		req->flags &= ~REQ_F_REISSUE;
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
			goto done;
		/* no retry on NONBLOCK nor RWF_NOWAIT */
		if (req->flags & REQ_F_NOWAIT)
			goto done;
		/* some cases will consume bytes even on error returns */
		iov_iter_revert(iter, io_size - iov_iter_count(iter));
		/* count nothing as done when entering the retry loop below */
		ret = 0;
	} else if (ret == -EIOCBQUEUED) {
		/* no kiocb_done() here, only release the iovec */
		goto out_free;
	} else if (ret <= 0 || ret == io_size || !force_nonblock ||
		   (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
		/* read all, failed, already did sync or don't want to retry */
		goto done;
	}

	/*
	 * Either nothing was read (-EAGAIN/reissue) or we got a short
	 * nonblocking read on a regular file: set up async state for a retry.
	 */
	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
	if (ret2)
		return ret2;

	/* cleared so the kfree() at out_free is skipped from here on */
	iovec = NULL;
	rw = req->async_data;
	/* now use our persistent iterator, if we aren't already */
	iter = &rw->iter;

	do {
		/* account for what the previous attempt transferred */
		io_size -= ret;
		rw->bytes_done += ret;
		/* if we can retry, do so with the callbacks armed */
		if (!io_rw_should_retry(req)) {
			kiocb->ki_flags &= ~IOCB_WAITQ;
			return -EAGAIN;
		}

		/*
		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
		 * we get -EIOCBQUEUED, then we'll get a notification when the
		 * desired page gets unlocked. We can also get a partial read
		 * here, and if we do, then just retry at the new offset.
		 */
		ret = io_iter_do_read(req, iter);
		if (ret == -EIOCBQUEUED)
			return 0;
		/* we got some bytes, but not all. retry. */
		kiocb->ki_flags &= ~IOCB_WAITQ;
	} while (ret > 0 && ret < io_size);
done:
	kiocb_done(kiocb, ret, issue_flags);
out_free:
	/* it's faster to check here than delegate to kfree */
	if (iovec)
		kfree(iovec);
	return 0;
}
 3348
 3349static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 3350{
 3351	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
 3352		return -EBADF;
 3353	return io_prep_rw(req, sqe);
 3354}
 3355
/*
 * Issue a write for @req.
 *
 * The iterator comes from req->async_data when async state already exists,
 * otherwise the iovec is imported using on-stack storage. IOCB_NOWAIT is
 * set or cleared to match IO_URING_F_NONBLOCK in @issue_flags.
 *
 * Returns:
 *   0        - the result was handed to kiocb_done();
 *   -EAGAIN  - the copy_iov path ran without error from io_setup_async_rw();
 *              the write should be reissued (presumably from a blocking
 *              context - confirm against the issue path);
 *   < 0      - error from io_import_iovec(), io_setup_async_rw() or
 *              rw_verify_area().
 *
 * Note that the 'done' and 'copy_iov' labels sit inside the two arms of the
 * final if/else, so the gotos above them jump straight into those arms.
 */
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct kiocb *kiocb = &req->rw.kiocb;
	struct iov_iter __iter, *iter = &__iter;
	struct io_async_rw *rw = req->async_data;
	ssize_t ret, ret2, io_size;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	if (rw) {
		/* reuse the saved iterator; there is no local iovec to free */
		iter = &rw->iter;
		iovec = NULL;
	} else {
		ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
		if (ret < 0)
			return ret;
	}
	io_size = iov_iter_count(iter);
	req->result = io_size;

	/* Ensure we clear previously set non-block flag */
	if (!force_nonblock)
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	else
		kiocb->ki_flags |= IOCB_NOWAIT;

	/* If the file doesn't support async, just async punt */
	if (force_nonblock && !io_file_supports_async(req, WRITE))
		goto copy_iov;

	/* file path doesn't support NOWAIT for non-direct_IO */
	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
	    (req->flags & REQ_F_ISREG))
		goto copy_iov;

	ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
	if (unlikely(ret))
		goto out_free;

	/*
	 * Open-code file_start_write here to grab freeze protection,
	 * which will be released by another thread in
	 * io_complete_rw().  Fool lockdep by telling it the lock got
	 * released so that it doesn't complain about the held lock when
	 * we return to userspace.
	 */
	if (req->flags & REQ_F_ISREG) {
		sb_start_write(file_inode(req->file)->i_sb);
		__sb_writers_release(file_inode(req->file)->i_sb,
					SB_FREEZE_WRITE);
	}
	kiocb->ki_flags |= IOCB_WRITE;

	if (req->file->f_op->write_iter)
		ret2 = call_write_iter(req->file, kiocb, iter);
	else if (req->file->f_op->write)
		ret2 = loop_rw_iter(WRITE, req, iter);
	else
		ret2 = -EINVAL;

	/* a pending reissue request is treated like -EAGAIN */
	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		ret2 = -EAGAIN;
	}

	/*
	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
	 * retry them without IOCB_NOWAIT.
	 */
	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
		ret2 = -EAGAIN;
	/* no retry on NONBLOCK nor RWF_NOWAIT */
	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
		goto done;
	if (!force_nonblock || ret2 != -EAGAIN) {
		/* IOPOLL retry should happen for io-wq threads */
		if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
			goto copy_iov;
done:
		kiocb_done(kiocb, ret2, issue_flags);
	} else {
copy_iov:
		/* some cases will consume bytes even on error returns */
		iov_iter_revert(iter, io_size - iov_iter_count(iter));
		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
		return ret ?: -EAGAIN;
	}
out_free:
	/* it's reportedly faster than delegating the null check to kfree() */
	if (iovec)
		kfree(iovec);
	/* 0 after a successful rw_verify_area(), otherwise its error */
	return ret;
}
 3449
 3450static int io_renameat_prep(struct io_kiocb *req,
 3451			    const struct io_uring_sqe *sqe)
 3452{
 3453	struct io_rename *ren = &req->rename;
 3454	const char __user *oldf, *newf;
 3455
 3456	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 3457		return -EBADF;
 3458
 3459	ren->old_dfd = READ_ONCE(sqe->fd);
 3460	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 3461	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 3462	ren->new_dfd = READ_ONCE(sqe->len);
 3463	ren->flags = READ_ONCE(sqe->rename_flags);
 3464
 3465	ren->oldpath = getname(oldf);
 3466	if (IS_ERR(ren->oldpath))
 3467		return PTR_ERR(ren->oldpath);
 3468
 3469	ren->newpath = getname(newf);
 3470	if (IS_ERR(ren->newpath)) {
 3471		putname(ren->oldpath);
 3472		return PTR_ERR(ren->newpath);
 3473	}
 3474
 3475	req->flags |= REQ_F_NEED_CLEANUP;
 3476	return 0;
 3477}
 3478
 3479static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
 3480{
 3481	struct io_rename *ren = &req->rename;
 3482	int ret;
 3483
 3484	if (issue_flags & IO_URING_F_NONBLOCK)
 3485		return -EAGAIN;
 3486
 3487	ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
 3488				ren->newpath, ren->flags);
 3489
 3490	req->flags &= ~REQ_F_NEED_CLEANUP;
 3491	if (ret < 0)
 3492		req_set_fail_links(req);
 3493	io_req_complete(req, ret);
 3494	return 0;
 3495}
 3496
 3497static int io_unlinkat_prep(struct io_kiocb *req,
 3498			    const struct io_uring_sqe *sqe)
 3499{
 3500	struct io_unlink *un = &req->unlink;
 3501	const char __user *fname;
 3502
 3503	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 3504		return -EBADF;
 3505
 3506	un->dfd = READ_ONCE(sqe->fd);
 3507
 3508	un->flags = READ_ONCE(sqe->unlink_flags);
 3509	if (un->flags & ~AT_REMOVEDIR)
 3510		return -EINVAL;
 3511
 3512	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
 3513	un->filename = getname(fname);
 3514	if (IS_ERR(un->filename))
 3515		return PTR_ERR(un->filename);
 3516
 3517	req->flags |= REQ_F_NEED_CLEANUP;
 3518	return 0;
 3519}
 3520
 3521static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
 3522{
 3523	struct io_unlink *un = &req->unlink;
 3524	int ret;
 3525
 3526	if (issue_flags & IO_URING_F_NONBLOCK)
 3527		return -EAGAIN;
 3528
 3529	if (un->flags & AT_REMOVEDIR)
 3530		ret = do_rmdir(un->dfd, un->filename);
 3531	else
 3532		ret = do_unlinkat(un->dfd, un->filename);
 3533
 3534	req->flags &= ~REQ_F_NEED_CLEANUP;
 3535	if (ret < 0)
 3536		req_set_fail_links(req);
 3537	io_req_complete(req, ret);
 3538	return 0;
 3539}
 3540
 3541static int io_shutdown_prep(struct io_kiocb *req,
 3542			    const struct io_uring_sqe *sqe)
 3543{
 3544#if defined(CONFIG_NET)
 3545	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3546		return -EINVAL;
 3547	if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
 3548	    sqe->buf_index)
 3549		return -EINVAL;
 3550
 3551	req->shutdown.how = READ_ONCE(sqe->len);
 3552	return 0;
 3553#else
 3554	return -EOPNOTSUPP;
 3555#endif
 3556}
 3557
 3558static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
 3559{
 3560#if defined(CONFIG_NET)
 3561	struct socket *sock;
 3562	int ret;
 3563
 3564	if (issue_flags & IO_URING_F_NONBLOCK)
 3565		return -EAGAIN;
 3566
 3567	sock = sock_from_file(req->file);
 3568	if (unlikely(!sock))
 3569		return -ENOTSOCK;
 3570
 3571	ret = __sys_shutdown_sock(sock, req->shutdown.how);
 3572	if (ret < 0)
 3573		req_set_fail_links(req);
 3574	io_req_complete(req, ret);
 3575	return 0;
 3576#else
 3577	return -EOPNOTSUPP;
 3578#endif
 3579}
 3580
 3581static int __io_splice_prep(struct io_kiocb *req,
 3582			    const struct io_uring_sqe *sqe)
 3583{
 3584	struct io_splice* sp = &req->splice;
 3585	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
 3586
 3587	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3588		return -EINVAL;
 3589
 3590	sp->file_in = NULL;
 3591	sp->len = READ_ONCE(sqe->len);
 3592	sp->flags = READ_ONCE(sqe->splice_flags);
 3593
 3594	if (unlikely(sp->flags & ~valid_flags))
 3595		return -EINVAL;
 3596
 3597	sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
 3598				  (sp->flags & SPLICE_F_FD_IN_FIXED));
 3599	if (!sp->file_in)
 3600		return -EBADF;
 3601	req->flags |= REQ_F_NEED_CLEANUP;
 3602	return 0;
 3603}
 3604
 3605static int io_tee_prep(struct io_kiocb *req,
 3606		       const struct io_uring_sqe *sqe)
 3607{
 3608	if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
 3609		return -EINVAL;
 3610	return __io_splice_prep(req, sqe);
 3611}
 3612
 3613static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
 3614{
 3615	struct io_splice *sp = &req->splice;
 3616	struct file *in = sp->file_in;
 3617	struct file *out = sp->file_out;
 3618	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 3619	long ret = 0;
 3620
 3621	if (issue_flags & IO_URING_F_NONBLOCK)
 3622		return -EAGAIN;
 3623	if (sp->len)
 3624		ret = do_tee(in, out, sp->len, flags);
 3625
 3626	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
 3627		io_put_file(in);
 3628	req->flags &= ~REQ_F_NEED_CLEANUP;
 3629
 3630	if (ret != sp->len)
 3631		req_set_fail_links(req);
 3632	io_req_complete(req, ret);
 3633	return 0;
 3634}
 3635
 3636static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 3637{
 3638	struct io_splice* sp = &req->splice;
 3639
 3640	sp->off_in = READ_ONCE(sqe->splice_off_in);
 3641	sp->off_out = READ_ONCE(sqe->off);
 3642	return __io_splice_prep(req, sqe);
 3643}
 3644
 3645static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
 3646{
 3647	struct io_splice *sp = &req->splice;
 3648	struct file *in = sp->file_in;
 3649	struct file *out = sp->file_out;
 3650	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 3651	loff_t *poff_in, *poff_out;
 3652	long ret = 0;
 3653
 3654	if (issue_flags & IO_URING_F_NONBLOCK)
 3655		return -EAGAIN;
 3656
 3657	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
 3658	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
 3659
 3660	if (sp->len)
 3661		ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
 3662
 3663	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
 3664		io_put_file(in);
 3665	req->flags &= ~REQ_F_NEED_CLEANUP;
 3666
 3667	if (ret != sp->len)
 3668		req_set_fail_links(req);
 3669	io_req_complete(req, ret);
 3670	return 0;
 3671}
 3672
 3673/*
 3674 * IORING_OP_NOP just posts a completion event, nothing else.
 3675 */
 3676static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 3677{
 3678	struct io_ring_ctx *ctx = req->ctx;
 3679
 3680	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 3681		return -EINVAL;
 3682
 3683	__io_req_complete(req, issue_flags, 0, 0);
 3684	return 0;
 3685}
 3686
 3687static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 3688{
 3689	struct io_ring_ctx *ctx = req->ctx;
 3690
 3691	if (!req->file)
 3692		return -EBADF;
 3693
 3694	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 3695		return -EINVAL;
 3696	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 3697		return -EINVAL;
 3698
 3699	req->sync.flags = READ_ONCE(sqe->fsync_flags);
 3700	if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
 3701		return -EINVAL;
 3702
 3703	req->sync.off = READ_ONCE(sqe->off);
 3704	req->sync.len = READ_ONCE(sqe->len);
 3705	return 0;
 3706}
 3707
 3708static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
 3709{
 3710	loff_t end = req->sync.off + req->sync.len;
 3711	int ret;
 3712
 3713	/* fsync always requires a blocking context */
 3714	if (issue_flags & IO_URING_F_NONBLOCK)
 3715		return -EAGAIN;
 3716
 3717	ret = vfs_fsync_range(req->file, req->sync.off,
 3718				end > 0 ? end : LLONG_MAX,
 3719				req->sync.flags & IORING_FSYNC_DATASYNC);
 3720	if (ret < 0)
 3721		req_set_fail_links(req);
 3722	io_req_complete(req, ret);
 3723	return 0;
 3724}
 3725
 3726static int io_fallocate_prep(struct io_kiocb *req,
 3727			     const struct io_uring_sqe *sqe)
 3728{
 3729	if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
 3730		return -EINVAL;
 3731	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3732		return -EINVAL;
 3733
 3734	req->sync.off = READ_ONCE(sqe->off);
 3735	req->sync.len = READ_ONCE(sqe->addr);
 3736	req->sync.mode = READ_ONCE(sqe->len);
 3737	return 0;
 3738}
 3739
 3740static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
 3741{
 3742	int ret;
 3743
 3744	/* fallocate always requiring blocking context */
 3745	if (issue_flags & IO_URING_F_NONBLOCK)
 3746		return -EAGAIN;
 3747	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
 3748				req->sync.len);
 3749	if (ret < 0)
 3750		req_set_fail_links(req);
 3751	io_req_complete(req, ret);
 3752	return 0;
 3753}
 3754
 3755static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 3756{
 3757	const char __user *fname;
 3758	int ret;
 3759
 3760	if (unlikely(sqe->ioprio || sqe->buf_index))
 3761		return -EINVAL;
 3762	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 3763		return -EBADF;
 3764
 3765	/* open.how should be already initialised */
 3766	if (!(req->open.how.flags & O_PATH) && force_o_largefile())
 3767		req->open.how.flags |= O_LARGEFILE;
 3768
 3769	req->open.dfd = READ_ONCE(sqe->fd);
 3770	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
 3771	req->open.filename = getname(fname);
 3772	if (IS_ERR(req->open.filename)) {
 3773		ret = PTR_ERR(req->open.filename);
 3774		req->open.filename = NULL;
 3775		return ret;
 3776	}
 3777	req->open.nofile = rlimit(RLIMIT_NOFILE);
 3778	req->flags |= REQ_F_NEED_CLEANUP;
 3779	return 0;
 3780}
 3781
 3782static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 3783{
 3784	u64 flags, mode;
 3785
 3786	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3787		return -EINVAL;
 3788	mode = READ_ONCE(sqe->len);
 3789	flags = READ_ONCE(sqe->open_flags);
 3790	req->open.how = build_open_how(flags, mode);
 3791	return __io_openat_prep(req, sqe);
 3792}
 3793
 3794static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 3795{
 3796	struct open_how __user *how;
 3797	size_t len;
 3798	int ret;
 3799
 3800	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3801		return -EINVAL;
 3802	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 3803	len = READ_ONCE(sqe->len);
 3804	if (len < OPEN_HOW_SIZE_VER0)
 3805		return -EINVAL;
 3806
 3807	ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
 3808					len);
 3809	if (ret)
 3810		return ret;
 3811
 3812	return __io_openat_prep(req, sqe);
 3813}
 3814
 3815static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
 3816{
 3817	struct open_flags op;
 3818	struct file *file;
 3819	bool nonblock_set;
 3820	bool resolve_nonblock;
 3821	int ret;
 3822
 3823	ret = build_open_flags(&req->open.how, &op);
 3824	if (ret)
 3825		goto err;
 3826	nonblock_set = op.open_flag & O_NONBLOCK;
 3827	resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
 3828	if (issue_flags & IO_URING_F_NONBLOCK) {
 3829		/*
 3830		 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
 3831		 * it'll always -EAGAIN
 3832		 */
 3833		if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
 3834			return -EAGAIN;
 3835		op.lookup_flags |= LOOKUP_CACHED;
 3836		op.open_flag |= O_NONBLOCK;
 3837	}
 3838
 3839	ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
 3840	if (ret < 0)
 3841		goto err;
 3842
 3843	file = do_filp_open(req->open.dfd, req->open.filename, &op);
 3844	/* only retry if RESOLVE_CACHED wasn't already set by application */
 3845	if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) &&
 3846	    file == ERR_PTR(-EAGAIN)) {
 3847		/*
 3848		 * We could hang on to this 'fd', but seems like marginal
 3849		 * gain for something that is now known to be a slower path.
 3850		 * So just put it, and we'll get a new one when we retry.
 3851		 */
 3852		put_unused_fd(ret);
 3853		return -EAGAIN;
 3854	}
 3855
 3856	if (IS_ERR(file)) {
 3857		put_unused_fd(ret);
 3858		ret = PTR_ERR(file);
 3859	} else {
 3860		if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
 3861			file->f_flags &= ~O_NONBLOCK;
 3862		fsnotify_open(file);
 3863		fd_install(ret, file);
 3864	}
 3865err:
 3866	putname(req->open.filename);
 3867	req->flags &= ~REQ_F_NEED_CLEANUP;
 3868	if (ret < 0)
 3869		req_set_fail_links(req);
 3870	__io_req_complete(req, issue_flags, ret, 0);
 3871	return 0;
 3872}
 3873
 3874static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
 3875{
 3876	return io_openat2(req, issue_flags);
 3877}
 3878
 3879static int io_remove_buffers_prep(struct io_kiocb *req,
 3880				  const struct io_uring_sqe *sqe)
 3881{
 3882	struct io_provide_buf *p = &req->pbuf;
 3883	u64 tmp;
 3884
 3885	if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
 3886		return -EINVAL;
 3887
 3888	tmp = READ_ONCE(sqe->fd);
 3889	if (!tmp || tmp > USHRT_MAX)
 3890		return -EINVAL;
 3891
 3892	memset(p, 0, sizeof(*p));
 3893	p->nbufs = tmp;
 3894	p->bgid = READ_ONCE(sqe->buf_group);
 3895	return 0;
 3896}
 3897
 3898static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
 3899			       int bgid, unsigned nbufs)
 3900{
 3901	unsigned i = 0;
 3902
 3903	/* shouldn't happen */
 3904	if (!nbufs)
 3905		return 0;
 3906
 3907	/* the head kbuf is the list itself */
 3908	while (!list_empty(&buf->list)) {
 3909		struct io_buffer *nxt;
 3910
 3911		nxt = list_first_entry(&buf->list, struct io_buffer, list);
 3912		list_del(&nxt->list);
 3913		kfree(nxt);
 3914		if (++i == nbufs)
 3915			return i;
 3916	}
 3917	i++;
 3918	kfree(buf);
 3919	xa_erase(&ctx->io_buffers, bgid);
 3920
 3921	return i;
 3922}
 3923
 3924static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 3925{
 3926	struct io_provide_buf *p = &req->pbuf;
 3927	struct io_ring_ctx *ctx = req->ctx;
 3928	struct io_buffer *head;
 3929	int ret = 0;
 3930	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 3931
 3932	io_ring_submit_lock(ctx, !force_nonblock);
 3933
 3934	lockdep_assert_held(&ctx->uring_lock);
 3935
 3936	ret = -ENOENT;
 3937	head = xa_load(&ctx->io_buffers, p->bgid);
 3938	if (head)
 3939		ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
 3940	if (ret < 0)
 3941		req_set_fail_links(req);
 3942
 3943	/* complete before unlock, IOPOLL may need the lock */
 3944	__io_req_complete(req, issue_flags, ret, 0);
 3945	io_ring_submit_unlock(ctx, !force_nonblock);
 3946	return 0;
 3947}
 3948
 3949static int io_provide_buffers_prep(struct io_kiocb *req,
 3950				   const struct io_uring_sqe *sqe)
 3951{
 3952	unsigned long size, tmp_check;
 3953	struct io_provide_buf *p = &req->pbuf;
 3954	u64 tmp;
 3955
 3956	if (sqe->ioprio || sqe->rw_flags)
 3957		return -EINVAL;
 3958
 3959	tmp = READ_ONCE(sqe->fd);
 3960	if (!tmp || tmp > USHRT_MAX)
 3961		return -E2BIG;
 3962	p->nbufs = tmp;
 3963	p->addr = READ_ONCE(sqe->addr);
 3964	p->len = READ_ONCE(sqe->len);
 3965
 3966	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
 3967				&size))
 3968		return -EOVERFLOW;
 3969	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
 3970		return -EOVERFLOW;
 3971
 3972	size = (unsigned long)p->len * p->nbufs;
 3973	if (!access_ok(u64_to_user_ptr(p->addr), size))
 3974		return -EFAULT;
 3975
 3976	p->bgid = READ_ONCE(sqe->buf_group);
 3977	tmp = READ_ONCE(sqe->off);
 3978	if (tmp > USHRT_MAX)
 3979		return -E2BIG;
 3980	p->bid = tmp;
 3981	return 0;
 3982}
 3983
 3984static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
 3985{
 3986	struct io_buffer *buf;
 3987	u64 addr = pbuf->addr;
 3988	int i, bid = pbuf->bid;
 3989
 3990	for (i = 0; i < pbuf->nbufs; i++) {
 3991		buf = kmalloc(sizeof(*buf), GFP_KERNEL);
 3992		if (!buf)
 3993			break;
 3994
 3995		buf->addr = addr;
 3996		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
 3997		buf->bid = bid;
 3998		addr += pbuf->len;
 3999		bid++;
 4000		if (!*head) {
 4001			INIT_LIST_HEAD(&buf->list);
 4002			*head = buf;
 4003		} else {
 4004			list_add_tail(&buf->list, &(*head)->list);
 4005		}
 4006	}
 4007
 4008	return i ? i : -ENOMEM;
 4009}
 4010
 4011static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 4012{
 4013	struct io_provide_buf *p = &req->pbuf;
 4014	struct io_ring_ctx *ctx = req->ctx;
 4015	struct io_buffer *head, *list;
 4016	int ret = 0;
 4017	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 4018
 4019	io_ring_submit_lock(ctx, !force_nonblock);
 4020
 4021	lockdep_assert_held(&ctx->uring_lock);
 4022
 4023	list = head = xa_load(&ctx->io_buffers, p->bgid);
 4024
 4025	ret = io_add_buffers(p, &head);
 4026	if (ret >= 0 && !list) {
 4027		ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
 4028		if (ret < 0)
 4029			__io_remove_buffers(ctx, head, p->bgid, -1U);
 4030	}
 4031	if (ret < 0)
 4032		req_set_fail_links(req);
 4033	/* complete before unlock, IOPOLL may need the lock */
 4034	__io_req_complete(req, issue_flags, ret, 0);
 4035	io_ring_submit_unlock(ctx, !force_nonblock);
 4036	return 0;
 4037}
 4038
 4039static int io_epoll_ctl_prep(struct io_kiocb *req,
 4040			     const struct io_uring_sqe *sqe)
 4041{
 4042#if defined(CONFIG_EPOLL)
 4043	if (sqe->ioprio || sqe->buf_index)
 4044		return -EINVAL;
 4045	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4046		return -EINVAL;
 4047
 4048	req->epoll.epfd = READ_ONCE(sqe->fd);
 4049	req->epoll.op = READ_ONCE(sqe->len);
 4050	req->epoll.fd = READ_ONCE(sqe->off);
 4051
 4052	if (ep_op_has_event(req->epoll.op)) {
 4053		struct epoll_event __user *ev;
 4054
 4055		ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4056		if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
 4057			return -EFAULT;
 4058	}
 4059
 4060	return 0;
 4061#else
 4062	return -EOPNOTSUPP;
 4063#endif
 4064}
 4065
 4066static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
 4067{
 4068#if defined(CONFIG_EPOLL)
 4069	struct io_epoll *ie = &req->epoll;
 4070	int ret;
 4071	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 4072
 4073	ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
 4074	if (force_nonblock && ret == -EAGAIN)
 4075		return -EAGAIN;
 4076
 4077	if (ret < 0)
 4078		req_set_fail_links(req);
 4079	__io_req_complete(req, issue_flags, ret, 0);
 4080	return 0;
 4081#else
 4082	return -EOPNOTSUPP;
 4083#endif
 4084}
 4085
 4086static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4087{
 4088#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
 4089	if (sqe->ioprio || sqe->buf_index || sqe->off)
 4090		return -EINVAL;
 4091	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4092		return -EINVAL;
 4093
 4094	req->madvise.addr = READ_ONCE(sqe->addr);
 4095	req->madvise.len = READ_ONCE(sqe->len);
 4096	req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
 4097	return 0;
 4098#else
 4099	return -EOPNOTSUPP;
 4100#endif
 4101}
 4102
 4103static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
 4104{
 4105#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
 4106	struct io_madvise *ma = &req->madvise;
 4107	int ret;
 4108
 4109	if (issue_flags & IO_URING_F_NONBLOCK)
 4110		return -EAGAIN;
 4111
 4112	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
 4113	if (ret < 0)
 4114		req_set_fail_links(req);
 4115	io_req_complete(req, ret);
 4116	return 0;
 4117#else
 4118	return -EOPNOTSUPP;
 4119#endif
 4120}
 4121
 4122static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4123{
 4124	if (sqe->ioprio || sqe->buf_index || sqe->addr)
 4125		return -EINVAL;
 4126	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4127		return -EINVAL;
 4128
 4129	req->fadvise.offset = READ_ONCE(sqe->off);
 4130	req->fadvise.len = READ_ONCE(sqe->len);
 4131	req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
 4132	return 0;
 4133}
 4134
 4135static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
 4136{
 4137	struct io_fadvise *fa = &req->fadvise;
 4138	int ret;
 4139
 4140	if (issue_flags & IO_URING_F_NONBLOCK) {
 4141		switch (fa->advice) {
 4142		case POSIX_FADV_NORMAL:
 4143		case POSIX_FADV_RANDOM:
 4144		case POSIX_FADV_SEQUENTIAL:
 4145			break;
 4146		default:
 4147			return -EAGAIN;
 4148		}
 4149	}
 4150
 4151	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
 4152	if (ret < 0)
 4153		req_set_fail_links(req);
 4154	__io_req_complete(req, issue_flags, ret, 0);
 4155	return 0;
 4156}
 4157
 4158static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4159{
 4160	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4161		return -EINVAL;
 4162	if (sqe->ioprio || sqe->buf_index)
 4163		return -EINVAL;
 4164	if (req->flags & REQ_F_FIXED_FILE)
 4165		return -EBADF;
 4166
 4167	req->statx.dfd = READ_ONCE(sqe->fd);
 4168	req->statx.mask = READ_ONCE(sqe->len);
 4169	req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4170	req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 4171	req->statx.flags = READ_ONCE(sqe->statx_flags);
 4172
 4173	return 0;
 4174}
 4175
 4176static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
 4177{
 4178	struct io_statx *ctx = &req->statx;
 4179	int ret;
 4180
 4181	if (issue_flags & IO_URING_F_NONBLOCK)
 4182		return -EAGAIN;
 4183
 4184	ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
 4185		       ctx->buffer);
 4186
 4187	if (ret < 0)
 4188		req_set_fail_links(req);
 4189	io_req_complete(req, ret);
 4190	return 0;
 4191}
 4192
 4193static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4194{
 4195	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4196		return -EINVAL;
 4197	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
 4198	    sqe->rw_flags || sqe->buf_index)
 4199		return -EINVAL;
 4200	if (req->flags & REQ_F_FIXED_FILE)
 4201		return -EBADF;
 4202
 4203	req->close.fd = READ_ONCE(sqe->fd);
 4204	return 0;
 4205}
 4206
/*
 * Issue handler for a close request.
 *
 * Looks up close->fd in the current task's file table under
 * files->file_lock, detaches it with __close_fd_get_file() and finishes
 * with filp_close(). The outcome is posted as the completion result; a
 * negative result also marks the request as failed.
 *
 * Returns -EAGAIN, without completing, when the file has a ->flush()
 * method and this is a nonblocking issue attempt. Returns 0 in every other
 * case, success or failure.
 */
static int io_close(struct io_kiocb *req, unsigned int issue_flags)
{
	struct files_struct *files = current->files;
	struct io_close *close = &req->close;
	struct fdtable *fdt;
	struct file *file = NULL;
	int ret = -EBADF;

	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	/* descriptor lies outside the table: complete with -EBADF */
	if (close->fd >= fdt->max_fds) {
		spin_unlock(&files->file_lock);
		goto err;
	}
	file = fdt->fd[close->fd];
	/* empty slot, or the descriptor refers to an io_uring file: refuse */
	if (!file || file->f_op == &io_uring_fops) {
		spin_unlock(&files->file_lock);
		/* no reference was taken above, keep the err path from fput() */
		file = NULL;
		goto err;
	}

	/* if the file has a flush method, be safe and punt to async */
	if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
		spin_unlock(&files->file_lock);
		return -EAGAIN;
	}

	ret = __close_fd_get_file(close->fd, &file);
	spin_unlock(&files->file_lock);
	if (ret < 0) {
		/* report -ENOENT to the submitter as -EBADF */
		if (ret == -ENOENT)
			ret = -EBADF;
		goto err;
	}

	/* No ->flush() or already async, safely close from here */
	ret = filp_close(file, current->files);
err:
	if (ret < 0)
		req_set_fail_links(req);
	/*
	 * NOTE(review): presumably drops the reference handed back by
	 * __close_fd_get_file() - confirm against its definition.
	 */
	if (file)
		fput(file);
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}
 4252
 4253static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4254{
 4255	struct io_ring_ctx *ctx = req->ctx;
 4256
 4257	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 4258		return -EINVAL;
 4259	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
 4260		return -EINVAL;
 4261
 4262	req->sync.off = READ_ONCE(sqe->off);
 4263	req->sync.len = READ_ONCE(sqe->len);
 4264	req->sync.flags = READ_ONCE(sqe->sync_range_flags);
 4265	return 0;
 4266}
 4267
 4268static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
 4269{
 4270	int ret;
 4271
 4272	/* sync_file_range always requires a blocking context */
 4273	if (issue_flags & IO_URING_F_NONBLOCK)
 4274		return -EAGAIN;
 4275
 4276	ret = sync_file_range(req->file, req->sync.off, req->sync.len,
 4277				req->sync.flags);
 4278	if (ret < 0)
 4279		req_set_fail_links(req);
 4280	io_req_complete(req, ret);
 4281	return 0;
 4282}
 4283
 4284#if defined(CONFIG_NET)
 4285static int io_setup_async_msg(struct io_kiocb *req,
 4286			      struct io_async_msghdr *kmsg)
 4287{
 4288	struct io_async_msghdr *async_msg = req->async_data;
 4289
 4290	if (async_msg)
 4291		return -EAGAIN;
 4292	if (io_alloc_async_data(req)) {
 4293		kfree(kmsg->free_iov);
 4294		return -ENOMEM;
 4295	}
 4296	async_msg = req->async_data;
 4297	req->flags |= REQ_F_NEED_CLEANUP;
 4298	memcpy(async_msg, kmsg, sizeof(*kmsg));
 4299	async_msg->msg.msg_name = &async_msg->addr;
 4300	/* if were using fast_iov, set it to the new one */
 4301	if (!async_msg->free_iov)
 4302		async_msg->msg.msg_iter.iov = async_msg->fast_iov;
 4303
 4304	return -EAGAIN;
 4305}
 4306
 4307static int io_sendmsg_copy_hdr(struct io_kiocb *req,
 4308			       struct io_async_msghdr *iomsg)
 4309{
 4310	iomsg->msg.msg_name = &iomsg->addr;
 4311	iomsg->free_iov = iomsg->fast_iov;
 4312	return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
 4313				   req->sr_msg.msg_flags, &iomsg->free_iov);
 4314}
 4315
 4316static int io_sendmsg_prep_async(struct io_kiocb *req)
 4317{
 4318	int ret;
 4319
 4320	ret = io_sendmsg_copy_hdr(req, req->async_data);
 4321	if (!ret)
 4322		req->flags |= REQ_F_NEED_CLEANUP;
 4323	return ret;
 4324}
 4325
 4326static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4327{
 4328	struct io_sr_msg *sr = &req->sr_msg;
 4329
 4330	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4331		return -EINVAL;
 4332
 4333	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4334	sr->len = READ_ONCE(sqe->len);
 4335	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
 4336	if (sr->msg_flags & MSG_DONTWAIT)
 4337		req->flags |= REQ_F_NOWAIT;
 4338
 4339#ifdef CONFIG_COMPAT
 4340	if (req->ctx->compat)
 4341		sr->msg_flags |= MSG_CMSG_COMPAT;
 4342#endif
 4343	return 0;
 4344}
 4345
 4346static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 4347{
 4348	struct io_async_msghdr iomsg, *kmsg;
 4349	struct socket *sock;
 4350	unsigned flags;
 4351	int min_ret = 0;
 4352	int ret;
 4353
 4354	sock = sock_from_file(req->file);
 4355	if (unlikely(!sock))
 4356		return -ENOTSOCK;
 4357
 4358	kmsg = req->async_data;
 4359	if (!kmsg) {
 4360		ret = io_sendmsg_copy_hdr(req, &iomsg);
 4361		if (ret)
 4362			return ret;
 4363		kmsg = &iomsg;
 4364	}
 4365
 4366	flags = req->sr_msg.msg_flags;
 4367	if (issue_flags & IO_URING_F_NONBLOCK)
 4368		flags |= MSG_DONTWAIT;
 4369	if (flags & MSG_WAITALL)
 4370		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
 4371
 4372	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
 4373	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
 4374		return io_setup_async_msg(req, kmsg);
 4375	if (ret == -ERESTARTSYS)
 4376		ret = -EINTR;
 4377
 4378	/* fast path, check for non-NULL to avoid function call */
 4379	if (kmsg->free_iov)
 4380		kfree(kmsg->free_iov);
 4381	req->flags &= ~REQ_F_NEED_CLEANUP;
 4382	if (ret < min_ret)
 4383		req_set_fail_links(req);
 4384	__io_req_complete(req, issue_flags, ret, 0);
 4385	return 0;
 4386}
 4387
 4388static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 4389{
 4390	struct io_sr_msg *sr = &req->sr_msg;
 4391	struct msghdr msg;
 4392	struct iovec iov;
 4393	struct socket *sock;
 4394	unsigned flags;
 4395	int min_ret = 0;
 4396	int ret;
 4397
 4398	sock = sock_from_file(req->file);
 4399	if (unlikely(!sock))
 4400		return -ENOTSOCK;
 4401
 4402	ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
 4403	if (unlikely(ret))
 4404		return ret;
 4405
 4406	msg.msg_name = NULL;
 4407	msg.msg_control = NULL;
 4408	msg.msg_controllen = 0;
 4409	msg.msg_namelen = 0;
 4410
 4411	flags = req->sr_msg.msg_flags;
 4412	if (issue_flags & IO_URING_F_NONBLOCK)
 4413		flags |= MSG_DONTWAIT;
 4414	if (flags & MSG_WAITALL)
 4415		min_ret = iov_iter_count(&msg.msg_iter);
 4416
 4417	msg.msg_flags = flags;
 4418	ret = sock_sendmsg(sock, &msg);
 4419	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
 4420		return -EAGAIN;
 4421	if (ret == -ERESTARTSYS)
 4422		ret = -EINTR;
 4423
 4424	if (ret < min_ret)
 4425		req_set_fail_links(req);
 4426	__io_req_complete(req, issue_flags, ret, 0);
 4427	return 0;
 4428}
 4429
 4430static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
 4431				 struct io_async_msghdr *iomsg)
 4432{
 4433	struct io_sr_msg *sr = &req->sr_msg;
 4434	struct iovec __user *uiov;
 4435	size_t iov_len;
 4436	int ret;
 4437
 4438	ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
 4439					&iomsg->uaddr, &uiov, &iov_len);
 4440	if (ret)
 4441		return ret;
 4442
 4443	if (req->flags & REQ_F_BUFFER_SELECT) {
 4444		if (iov_len > 1)
 4445			return -EINVAL;
 4446		if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
 4447			return -EFAULT;
 4448		sr->len = iomsg->fast_iov[0].iov_len;
 4449		iomsg->free_iov = NULL;
 4450	} else {
 4451		iomsg->free_iov = iomsg->fast_iov;
 4452		ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
 4453				     &iomsg->free_iov, &iomsg->msg.msg_iter,
 4454				     false);
 4455		if (ret > 0)
 4456			ret = 0;
 4457	}
 4458
 4459	return ret;
 4460}
 4461
 4462#ifdef CONFIG_COMPAT
 4463static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 4464					struct io_async_msghdr *iomsg)
 4465{
 4466	struct io_sr_msg *sr = &req->sr_msg;
 4467	struct compat_iovec __user *uiov;
 4468	compat_uptr_t ptr;
 4469	compat_size_t len;
 4470	int ret;
 4471
 4472	ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
 4473				  &ptr, &len);
 4474	if (ret)
 4475		return ret;
 4476
 4477	uiov = compat_ptr(ptr);
 4478	if (req->flags & REQ_F_BUFFER_SELECT) {
 4479		compat_ssize_t clen;
 4480
 4481		if (len > 1)
 4482			return -EINVAL;
 4483		if (!access_ok(uiov, sizeof(*uiov)))
 4484			return -EFAULT;
 4485		if (__get_user(clen, &uiov->iov_len))
 4486			return -EFAULT;
 4487		if (clen < 0)
 4488			return -EINVAL;
 4489		sr->len = clen;
 4490		iomsg->free_iov = NULL;
 4491	} else {
 4492		iomsg->free_iov = iomsg->fast_iov;
 4493		ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
 4494				   UIO_FASTIOV, &iomsg->free_iov,
 4495				   &iomsg->msg.msg_iter, true);
 4496		if (ret < 0)
 4497			return ret;
 4498	}
 4499
 4500	return 0;
 4501}
 4502#endif
 4503
 4504static int io_recvmsg_copy_hdr(struct io_kiocb *req,
 4505			       struct io_async_msghdr *iomsg)
 4506{
 4507	iomsg->msg.msg_name = &iomsg->addr;
 4508
 4509#ifdef CONFIG_COMPAT
 4510	if (req->ctx->compat)
 4511		return __io_compat_recvmsg_copy_hdr(req, iomsg);
 4512#endif
 4513
 4514	return __io_recvmsg_copy_hdr(req, iomsg);
 4515}
 4516
 4517static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
 4518					       bool needs_lock)
 4519{
 4520	struct io_sr_msg *sr = &req->sr_msg;
 4521	struct io_buffer *kbuf;
 4522
 4523	kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
 4524	if (IS_ERR(kbuf))
 4525		return kbuf;
 4526
 4527	sr->kbuf = kbuf;
 4528	req->flags |= REQ_F_BUFFER_SELECTED;
 4529	return kbuf;
 4530}
 4531
 4532static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
 4533{
 4534	return io_put_kbuf(req, req->sr_msg.kbuf);
 4535}
 4536
 4537static int io_recvmsg_prep_async(struct io_kiocb *req)
 4538{
 4539	int ret;
 4540
 4541	ret = io_recvmsg_copy_hdr(req, req->async_data);
 4542	if (!ret)
 4543		req->flags |= REQ_F_NEED_CLEANUP;
 4544	return ret;
 4545}
 4546
 4547static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4548{
 4549	struct io_sr_msg *sr = &req->sr_msg;
 4550
 4551	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4552		return -EINVAL;
 4553
 4554	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4555	sr->len = READ_ONCE(sqe->len);
 4556	sr->bgid = READ_ONCE(sqe->buf_group);
 4557	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
 4558	if (sr->msg_flags & MSG_DONTWAIT)
 4559		req->flags |= REQ_F_NOWAIT;
 4560
 4561#ifdef CONFIG_COMPAT
 4562	if (req->ctx->compat)
 4563		sr->msg_flags |= MSG_CMSG_COMPAT;
 4564#endif
 4565	return 0;
 4566}
 4567
 4568static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 4569{
 4570	struct io_async_msghdr iomsg, *kmsg;
 4571	struct socket *sock;
 4572	struct io_buffer *kbuf;
 4573	unsigned flags;
 4574	int min_ret = 0;
 4575	int ret, cflags = 0;
 4576	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 4577
 4578	sock = sock_from_file(req->file);
 4579	if (unlikely(!sock))
 4580		return -ENOTSOCK;
 4581
 4582	kmsg = req->async_data;
 4583	if (!kmsg) {
 4584		ret = io_recvmsg_copy_hdr(req, &iomsg);
 4585		if (ret)
 4586			return ret;
 4587		kmsg = &iomsg;
 4588	}
 4589
 4590	if (req->flags & REQ_F_BUFFER_SELECT) {
 4591		kbuf = io_recv_buffer_select(req, !force_nonblock);
 4592		if (IS_ERR(kbuf))
 4593			return PTR_ERR(kbuf);
 4594		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
 4595		kmsg->fast_iov[0].iov_len = req->sr_msg.len;
 4596		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
 4597				1, req->sr_msg.len);
 4598	}
 4599
 4600	flags = req->sr_msg.msg_flags;
 4601	if (force_nonblock)
 4602		flags |= MSG_DONTWAIT;
 4603	if (flags & MSG_WAITALL)
 4604		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
 4605
 4606	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
 4607					kmsg->uaddr, flags);
 4608	if (force_nonblock && ret == -EAGAIN)
 4609		return io_setup_async_msg(req, kmsg);
 4610	if (ret == -ERESTARTSYS)
 4611		ret = -EINTR;
 4612
 4613	if (req->flags & REQ_F_BUFFER_SELECTED)
 4614		cflags = io_put_recv_kbuf(req);
 4615	/* fast path, check for non-NULL to avoid function call */
 4616	if (kmsg->free_iov)
 4617		kfree(kmsg->free_iov);
 4618	req->flags &= ~REQ_F_NEED_CLEANUP;
 4619	if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
 4620		req_set_fail_links(req);
 4621	__io_req_complete(req, issue_flags, ret, cflags);
 4622	return 0;
 4623}
 4624
 4625static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 4626{
 4627	struct io_buffer *kbuf;
 4628	struct io_sr_msg *sr = &req->sr_msg;
 4629	struct msghdr msg;
 4630	void __user *buf = sr->buf;
 4631	struct socket *sock;
 4632	struct iovec iov;
 4633	unsigned flags;
 4634	int min_ret = 0;
 4635	int ret, cflags = 0;
 4636	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 4637
 4638	sock = sock_from_file(req->file);
 4639	if (unlikely(!sock))
 4640		return -ENOTSOCK;
 4641
 4642	if (req->flags & REQ_F_BUFFER_SELECT) {
 4643		kbuf = io_recv_buffer_select(req, !force_nonblock);
 4644		if (IS_ERR(kbuf))
 4645			return PTR_ERR(kbuf);
 4646		buf = u64_to_user_ptr(kbuf->addr);
 4647	}
 4648
 4649	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
 4650	if (unlikely(ret))
 4651		goto out_free;
 4652
 4653	msg.msg_name = NULL;
 4654	msg.msg_control = NULL;
 4655	msg.msg_controllen = 0;
 4656	msg.msg_namelen = 0;
 4657	msg.msg_iocb = NULL;
 4658	msg.msg_flags = 0;
 4659
 4660	flags = req->sr_msg.msg_flags;
 4661	if (force_nonblock)
 4662		flags |= MSG_DONTWAIT;
 4663	if (flags & MSG_WAITALL)
 4664		min_ret = iov_iter_count(&msg.msg_iter);
 4665
 4666	ret = sock_recvmsg(sock, &msg, flags);
 4667	if (force_nonblock && ret == -EAGAIN)
 4668		return -EAGAIN;
 4669	if (ret == -ERESTARTSYS)
 4670		ret = -EINTR;
 4671out_free:
 4672	if (req->flags & REQ_F_BUFFER_SELECTED)
 4673		cflags = io_put_recv_kbuf(req);
 4674	if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
 4675		req_set_fail_links(req);
 4676	__io_req_complete(req, issue_flags, ret, cflags);
 4677	return 0;
 4678}
 4679
 4680static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4681{
 4682	struct io_accept *accept = &req->accept;
 4683
 4684	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4685		return -EINVAL;
 4686	if (sqe->ioprio || sqe->len || sqe->buf_index)
 4687		return -EINVAL;
 4688
 4689	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4690	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 4691	accept->flags = READ_ONCE(sqe->accept_flags);
 4692	accept->nofile = rlimit(RLIMIT_NOFILE);
 4693	return 0;
 4694}
 4695
 4696static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
 4697{
 4698	struct io_accept *accept = &req->accept;
 4699	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 4700	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
 4701	int ret;
 4702
 4703	if (req->file->f_flags & O_NONBLOCK)
 4704		req->flags |= REQ_F_NOWAIT;
 4705
 4706	ret = __sys_accept4_file(req->file, file_flags, accept->addr,
 4707					accept->addr_len, accept->flags,
 4708					accept->nofile);
 4709	if (ret == -EAGAIN && force_nonblock)
 4710		return -EAGAIN;
 4711	if (ret < 0) {
 4712		if (ret == -ERESTARTSYS)
 4713			ret = -EINTR;
 4714		req_set_fail_links(req);
 4715	}
 4716	__io_req_complete(req, issue_flags, ret, 0);
 4717	return 0;
 4718}
 4719
 4720static int io_connect_prep_async(struct io_kiocb *req)
 4721{
 4722	struct io_async_connect *io = req->async_data;
 4723	struct io_connect *conn = &req->connect;
 4724
 4725	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
 4726}
 4727
 4728static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4729{
 4730	struct io_connect *conn = &req->connect;
 4731
 4732	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4733		return -EINVAL;
 4734	if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
 4735		return -EINVAL;
 4736
 4737	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4738	conn->addr_len =  READ_ONCE(sqe->addr2);
 4739	return 0;
 4740}
 4741
 4742static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
 4743{
 4744	struct io_async_connect __io, *io;
 4745	unsigned file_flags;
 4746	int ret;
 4747	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 4748
 4749	if (req->async_data) {
 4750		io = req->async_data;
 4751	} else {
 4752		ret = move_addr_to_kernel(req->connect.addr,
 4753						req->connect.addr_len,
 4754						&__io.address);
 4755		if (ret)
 4756			goto out;
 4757		io = &__io;
 4758	}
 4759
 4760	file_flags = force_nonblock ? O_NONBLOCK : 0;
 4761
 4762	ret = __sys_connect_file(req->file, &io->address,
 4763					req->connect.addr_len, file_flags);
 4764	if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
 4765		if (req->async_data)
 4766			return -EAGAIN;
 4767		if (io_alloc_async_data(req)) {
 4768			ret = -ENOMEM;
 4769			goto out;
 4770		}
 4771		memcpy(req->async_data, &__io, sizeof(__io));
 4772		return -EAGAIN;
 4773	}
 4774	if (ret == -ERESTARTSYS)
 4775		ret = -EINTR;
 4776out:
 4777	if (ret < 0)
 4778		req_set_fail_links(req);
 4779	__io_req_complete(req, issue_flags, ret, 0);
 4780	return 0;
 4781}
 4782#else /* !CONFIG_NET */
/*
 * Without CONFIG_NET every networking opcode handler is a stub that
 * returns -EOPNOTSUPP. The generator macros stack on each other:
 *
 *   IO_NETOP_FN(op)          emits the issue handler io_<op>()
 *   IO_NETOP_PREP(op)        the above plus io_<op>_prep()
 *   IO_NETOP_PREP_ASYNC(op)  the above plus io_<op>_prep_async()
 */
#define IO_NETOP_FN(op)							\
static int io_##op(struct io_kiocb *req, unsigned int issue_flags)	\
{									\
	return -EOPNOTSUPP;						\
}

#define IO_NETOP_PREP(op)						\
IO_NETOP_FN(op)								\
static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
{									\
	return -EOPNOTSUPP;						\
}									\

#define IO_NETOP_PREP_ASYNC(op)						\
IO_NETOP_PREP(op)							\
static int io_##op##_prep_async(struct io_kiocb *req)			\
{									\
	return -EOPNOTSUPP;						\
}

/* each op gets exactly the set of stubs its CONFIG_NET version defines above */
IO_NETOP_PREP_ASYNC(sendmsg);
IO_NETOP_PREP_ASYNC(recvmsg);
IO_NETOP_PREP_ASYNC(connect);
IO_NETOP_PREP(accept);
IO_NETOP_FN(send);
IO_NETOP_FN(recv);
 4809#endif /* CONFIG_NET */
 4810
/*
 * Wrapper around a poll_table that lets the queueing callbacks recover the
 * request being armed (via container_of() on @pt) and report back a status.
 */
struct io_poll_table {
	/* poll table handed to vfs_poll() while arming */
	struct poll_table_struct pt;
	/* request the wait queue entries are being set up for */
	struct io_kiocb *req;
	/*
	 * Arming status: preset to -EINVAL by __io_arm_poll_handler(), then
	 * set by __io_queue_proc() to 0 once an entry is queued, or to
	 * -EINVAL / -ENOMEM when a second entry cannot be set up.
	 */
	int error;
};
 4816
/*
 * Shared wakeup path for poll wait queue entries: schedule @func as task
 * work for @req.
 *
 * A non-zero @mask that shares no bits with poll->events is ignored and 0
 * is returned. Otherwise the wait entry is unlinked from its queue, @mask
 * is recorded in req->result, @func is queued through
 * io_req_task_work_add() and 1 is returned. If queueing fails, the poll is
 * marked canceled and @func is queued through the fallback path instead.
 */
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
			   __poll_t mask, task_work_func_t func)
{
	int ret;

	/* for instances that support it check for an event match first: */
	if (mask && !(mask & poll->events))
		return 0;

	trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);

	list_del_init(&poll->wait.entry);

	/* hand the triggering events to the task-work handler */
	req->result = mask;
	req->task_work.func = func;

	/*
	 * If this fails, then the task is exiting. When a task exits, the
	 * work gets canceled, so just cancel this request as well instead
	 * of executing it. We can't safely execute it anyway, as we may not
	 * have the state needed for it.
	 */
	ret = io_req_task_work_add(req);
	if (unlikely(ret)) {
		WRITE_ONCE(poll->canceled, true);
		io_req_task_work_add_fallback(req, func);
	}
	return 1;
}
 4846
/*
 * Decide whether a woken poll request really has something to report.
 *
 * If no events were recorded in req->result and the poll was not canceled,
 * the file is polled again for poll->events. Then ctx->completion_lock is
 * taken; if there is still nothing to report and the poll is not canceled,
 * the wait entry is put back on poll->head and true is returned. Otherwise
 * false is returned and the caller should go on to complete the request.
 *
 * Returns with ctx->completion_lock held (interrupts disabled) in BOTH
 * cases; the caller is responsible for unlocking.
 */
static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
	__acquires(&req->ctx->completion_lock)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (!req->result && !READ_ONCE(poll->canceled)) {
		/* only ._key is set, so no queueing callback is supplied here */
		struct poll_table_struct pt = { ._key = poll->events };

		req->result = vfs_poll(req->file, &pt) & poll->events;
	}

	spin_lock_irq(&ctx->completion_lock);
	if (!req->result && !READ_ONCE(poll->canceled)) {
		/* spurious wakeup: go back to waiting */
		add_wait_queue(poll->head, &poll->wait);
		return true;
	}

	return false;
}
 4866
 4867static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
 4868{
 4869	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
 4870	if (req->opcode == IORING_OP_POLL_ADD)
 4871		return req->async_data;
 4872	return req->apoll->double_poll;
 4873}
 4874
 4875static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
 4876{
 4877	if (req->opcode == IORING_OP_POLL_ADD)
 4878		return &req->poll;
 4879	return &req->apoll->poll;
 4880}
 4881
/*
 * Detach the request's second ("double") poll entry, if it has one that is
 * still attached to a wait queue head.
 *
 * Under that head's lock the wait entry is unlinked and poll->head is
 * cleared. If wait.private is still set, the request reference taken when
 * the entry was set up (see __io_queue_proc()) is dropped here; the double
 * wake handler clears wait.private once it has taken over that duty.
 *
 * Caller must hold ctx->completion_lock.
 */
static void io_poll_remove_double(struct io_kiocb *req)
	__must_hold(&req->ctx->completion_lock)
{
	struct io_poll_iocb *poll = io_poll_get_double(req);

	lockdep_assert_held(&req->ctx->completion_lock);

	if (poll && poll->head) {
		/* keep a copy: poll->head is cleared before the unlock below */
		struct wait_queue_head *head = poll->head;

		spin_lock(&head->lock);
		list_del_init(&poll->wait.entry);
		if (poll->wait.private)
			req_ref_put(req);
		poll->head = NULL;
		spin_unlock(&head->lock);
	}
}
 4900
/*
 * Post a completion event for a poll request.
 *
 * The CQE result is -ECANCELED if the poll was canceled (which also forces
 * one-shot behaviour), otherwise the mangled event @mask. Multishot polls
 * post the CQE with IORING_CQE_F_MORE and bump ctx->cq_extra; one-shot
 * polls post it with no flags. If io_cqring_fill_event() reports failure,
 * the poll is torn down and marked done regardless of mode.
 *
 * Returns true when the request is finished, false when it stays armed for
 * further events. Caller must hold ctx->completion_lock.
 */
static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
	__must_hold(&req->ctx->completion_lock)
{
	struct io_ring_ctx *ctx = req->ctx;
	unsigned flags = IORING_CQE_F_MORE;
	int error;

	if (READ_ONCE(req->poll.canceled)) {
		error = -ECANCELED;
		/* a canceled poll never re-arms */
		req->poll.events |= EPOLLONESHOT;
	} else {
		error = mangle_poll(mask);
	}
	if (req->poll.events & EPOLLONESHOT)
		flags = 0;
	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
		/* event could not be filled in: stop the poll altogether */
		io_poll_remove_waitqs(req);
		req->poll.done = true;
		flags = 0;
	}
	if (flags & IORING_CQE_F_MORE)
		ctx->cq_extra++;

	io_commit_cqring(ctx);
	return !(flags & IORING_CQE_F_MORE);
}
 4927
/*
 * Task-work callback that reports events for a request using req->poll.
 *
 * If io_poll_rewait() re-armed the wait (nothing to report), only the lock
 * it took is dropped. Otherwise a completion is posted; a finished poll is
 * removed from the cancel hash, while one that stays armed has its result
 * reset and its wait entry re-queued. After unlocking, waiters on the
 * completion ring are notified, and for a finished poll the request is put
 * and the request returned by io_put_req_find_next(), if any, is submitted.
 */
static void io_poll_task_func(struct callback_head *cb)
{
	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *nxt;

	/* io_poll_rewait() returns with ctx->completion_lock held */
	if (io_poll_rewait(req, &req->poll)) {
		spin_unlock_irq(&ctx->completion_lock);
	} else {
		bool done;

		done = io_poll_complete(req, req->result);
		if (done) {
			hash_del(&req->hash_node);
		} else {
			/* still armed: clear the result and wait for the next event */
			req->result = 0;
			add_wait_queue(req->poll.head, &req->poll.wait);
		}
		spin_unlock_irq(&ctx->completion_lock);
		io_cqring_ev_posted(ctx);

		if (done) {
			nxt = io_put_req_find_next(req);
			if (nxt)
				__io_req_task_submit(nxt);
		}
	}
}
 4956
/*
 * Wake callback for a request's second ("double") poll entry.
 *
 * Returns 0 if the wakeup carries a non-zero mask that does not intersect
 * the primary entry's events. If the primary entry is not one-shot, the
 * wakeup is simply forwarded to the primary entry's wake function and its
 * result returned.
 *
 * Otherwise this entry is unlinked. If the primary entry is attached to a
 * wait queue head, then under that head's lock the primary entry is
 * unlinked too (if still queued) and wait->private is cleared; when the
 * primary entry was still queued, its wake function is invoked on its
 * behalf. Finally the request reference held for this entry is dropped and
 * 1 is returned.
 */
static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
			       int sync, void *key)
{
	struct io_kiocb *req = wait->private;
	struct io_poll_iocb *poll = io_poll_get_single(req);
	__poll_t mask = key_to_poll(key);

	/* for instances that support it check for an event match first: */
	if (mask && !(mask & poll->events))
		return 0;
	if (!(poll->events & EPOLLONESHOT))
		return poll->wait.func(&poll->wait, mode, sync, key);

	list_del_init(&wait->entry);

	/*
	 * NOTE(review): poll was already dereferenced above and
	 * io_poll_get_single() returns the address of an embedded member, so
	 * the "poll &&" half of this test looks redundant.
	 */
	if (poll && poll->head) {
		bool done;

		spin_lock(&poll->head->lock);
		/* an empty entry means the primary wakeup already ran */
		done = list_empty(&poll->wait.entry);
		if (!done)
			list_del_init(&poll->wait.entry);
		/* make sure double remove sees this as being gone */
		wait->private = NULL;
		spin_unlock(&poll->head->lock);
		if (!done) {
			/* use wait func handler, so it matches the rq type */
			poll->wait.func(&poll->wait, mode, sync, key);
		}
	}
	req_ref_put(req);
	return 1;
}
 4990
 4991static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
 4992			      wait_queue_func_t wake_func)
 4993{
 4994	poll->head = NULL;
 4995	poll->done = false;
 4996	poll->canceled = false;
 4997#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
 4998	/* mask in events that we always want/need */
 4999	poll->events = events | IO_POLL_UNMASK;
 5000	INIT_LIST_HEAD(&poll->wait.entry);
 5001	init_waitqueue_func_entry(&poll->wait, wake_func);
 5002}
 5003
/*
 * Attach a poll entry to wait queue @head on behalf of pt->req.
 *
 * On the first call for a request, @poll itself is queued on @head. If
 * @poll is already attached to a head, a second entry is allocated,
 * stored through @poll_ptr and queued instead; it holds its own request
 * reference and uses io_poll_double_wake() as its wake callback.
 *
 * Status is reported in pt->error: 0 once an entry has been queued,
 * -EINVAL if a third entry would be needed, -ENOMEM if the second entry
 * cannot be allocated. A repeated add on the same head returns early and
 * leaves pt->error untouched.
 */
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
			    struct wait_queue_head *head,
			    struct io_poll_iocb **poll_ptr)
{
	struct io_kiocb *req = pt->req;

	/*
	 * If poll->head is already set, it's because the file being polled
	 * uses multiple waitqueues for poll handling (eg one for read, one
	 * for write). Setup a separate io_poll_iocb if this happens.
	 */
	if (unlikely(poll->head)) {
		struct io_poll_iocb *poll_one = poll;

		/* already have a 2nd entry, fail a third attempt */
		if (*poll_ptr) {
			pt->error = -EINVAL;
			return;
		}
		/*
		 * Can't handle multishot for double wait for now, turn it
		 * into one-shot mode.
		 */
		if (!(poll_one->events & EPOLLONESHOT))
			poll_one->events |= EPOLLONESHOT;
		/* double add on the same waitqueue head, ignore */
		if (poll_one->head == head)
			return;
		/* GFP_ATOMIC: this allocation must not sleep */
		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
		if (!poll) {
			pt->error = -ENOMEM;
			return;
		}
		io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
		/* the second entry pins the request while it is queued */
		req_ref_get(req);
		poll->wait.private = req;
		*poll_ptr = poll;
	}

	pt->error = 0;
	poll->head = head;

	if (poll->events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(head, &poll->wait);
	else
		add_wait_queue(head, &poll->wait);
}
 5051
 5052static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
 5053			       struct poll_table_struct *p)
 5054{
 5055	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
 5056	struct async_poll *apoll = pt->req->apoll;
 5057
 5058	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
 5059}
 5060
 5061static void io_async_task_func(struct callback_head *cb)
 5062{
 5063	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
 5064	struct async_poll *apoll = req->apoll;
 5065	struct io_ring_ctx *ctx = req->ctx;
 5066
 5067	trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
 5068
 5069	if (io_poll_rewait(req, &apoll->poll)) {
 5070		spin_unlock_irq(&ctx->completion_lock);
 5071		return;
 5072	}
 5073
 5074	hash_del(&req->hash_node);
 5075	io_poll_remove_double(req);
 5076	spin_unlock_irq(&ctx->completion_lock);
 5077
 5078	if (!READ_ONCE(apoll->poll.canceled))
 5079		__io_req_task_submit(req);
 5080	else
 5081		io_req_complete_failed(req, -ECANCELED);
 5082}
 5083
 5084static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 5085			void *key)
 5086{
 5087	struct io_kiocb *req = wait->private;
 5088	struct io_poll_iocb *poll = &req->apoll->poll;
 5089
 5090	trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
 5091					key_to_poll(key));
 5092
 5093	return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
 5094}
 5095
 5096static void io_poll_req_insert(struct io_kiocb *req)
 5097{
 5098	struct io_ring_ctx *ctx = req->ctx;
 5099	struct hlist_head *list;
 5100
 5101	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
 5102	hlist_add_head(&req->hash_node, list);
 5103}
 5104
/*
 * Initialise @poll for @req and register it on the file's wait queue(s) by
 * calling vfs_poll() with @ipt's poll table.
 *
 * Returns the events that are already pending (restricted to
 * poll->events), or 0 if none are or if the wait entry was found to be off
 * its queue already. ipt->error carries the registration status set by the
 * queueing callback.
 *
 * Returns with ctx->completion_lock held (interrupts disabled); the caller
 * must unlock.
 */
static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
				      struct io_poll_iocb *poll,
				      struct io_poll_table *ipt, __poll_t mask,
				      wait_queue_func_t wake_func)
	__acquires(&ctx->completion_lock)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool cancel = false;

	INIT_HLIST_NODE(&req->hash_node);
	io_init_poll_iocb(poll, mask, wake_func);
	poll->file = req->file;
	poll->wait.private = req;

	ipt->pt._key = mask;
	ipt->req = req;
	/* stays -EINVAL unless the queueing callback overwrites it */
	ipt->error = -EINVAL;

	mask = vfs_poll(req->file, &ipt->pt) & poll->events;

	spin_lock_irq(&ctx->completion_lock);
	if (likely(poll->head)) {
		spin_lock(&poll->head->lock);
		/*
		 * Entry is attached to a head but no longer queued -
		 * presumably a wakeup already removed it. Report nothing to
		 * the caller; if registration had also failed, remember to
		 * cancel.
		 */
		if (unlikely(list_empty(&poll->wait.entry))) {
			if (ipt->error)
				cancel = true;
			ipt->error = 0;
			mask = 0;
		}
		/* event already pending for a one-shot poll, or an error: unhook */
		if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
			list_del_init(&poll->wait.entry);
		else if (cancel)
			WRITE_ONCE(poll->canceled, true);
		else if (!poll->done) /* actually waiting for an event */
			io_poll_req_insert(req);
		spin_unlock(&poll->head->lock);
	}

	return mask;
}
 5145
/*
 * Try to arm an internal one-shot poll on req->file for @req, with
 * io_async_wake() as the wake callback.
 *
 * Returns false, leaving nothing armed, when the request has no pollable
 * file, was already polled once (REQ_F_POLLED), its opcode declares
 * neither pollin nor pollout, the file does not support async operation
 * in that direction, the async_poll allocation fails, or arming reports
 * pending events or an error. Returns true once the poll is armed.
 *
 * NOTE(review): on the late failure path req->apoll stays allocated and
 * REQ_F_POLLED stays set; presumably request cleanup elsewhere frees it -
 * confirm.
 */
static bool io_arm_poll_handler(struct io_kiocb *req)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	struct io_ring_ctx *ctx = req->ctx;
	struct async_poll *apoll;
	struct io_poll_table ipt;
	__poll_t mask, ret;
	int rw;

	if (!req->file || !file_can_poll(req->file))
		return false;
	if (req->flags & REQ_F_POLLED)
		return false;
	if (def->pollin)
		rw = READ;
	else if (def->pollout)
		rw = WRITE;
	else
		return false;
	/* if we can't nonblock try, then no point in arming a poll handler */
	if (!io_file_supports_async(req, rw))
		return false;

	apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
	if (unlikely(!apoll))
		return false;
	apoll->double_poll = NULL;

	req->flags |= REQ_F_POLLED;
	req->apoll = apoll;

	/* internal polls are always one-shot */
	mask = EPOLLONESHOT;
	if (def->pollin)
		mask |= POLLIN | POLLRDNORM;
	if (def->pollout)
		mask |= POLLOUT | POLLWRNORM;

	/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
	if ((req->opcode == IORING_OP_RECVMSG) &&
	    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
		mask &= ~POLLIN;

	mask |= POLLERR | POLLPRI;

	ipt.pt._qproc = io_async_queue_proc;

	/* returns with ctx->completion_lock held */
	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
					io_async_wake);
	if (ret || ipt.error) {
		/* events already pending or arming failed: undo the double entry */
		io_poll_remove_double(req);
		spin_unlock_irq(&ctx->completion_lock);
		return false;
	}
	spin_unlock_irq(&ctx->completion_lock);
	trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
					apoll->poll.events);
	return true;
}
 5204
/*
 * Detach one poll entry of @req from its wait queue and drop the request
 * from the cancel hash.
 *
 * Returns false immediately if @poll is not attached to a head. Otherwise,
 * under the head's lock, @poll is optionally marked canceled (@do_cancel)
 * and unlinked if it is still queued. Returns true only when the entry was
 * still queued, i.e. this caller is the one that removed it.
 *
 * Caller must hold ctx->completion_lock.
 */
static bool __io_poll_remove_one(struct io_kiocb *req,
				 struct io_poll_iocb *poll, bool do_cancel)
	__must_hold(&req->ctx->completion_lock)
{
	bool do_complete = false;

	if (!poll->head)
		return false;
	spin_lock(&poll->head->lock);
	if (do_cancel)
		WRITE_ONCE(poll->canceled, true);
	if (!list_empty(&poll->wait.entry)) {
		list_del_init(&poll->wait.entry);
		do_complete = true;
	}
	spin_unlock(&poll->head->lock);
	hash_del(&req->hash_node);
	return do_complete;
}
 5224
 5225static bool io_poll_remove_waitqs(struct io_kiocb *req)
 5226	__must_hold(&req->ctx->completion_lock)
 5227{
 5228	bool do_complete;
 5229
 5230	io_poll_remove_double(req);
 5231	do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
 5232
 5233	if (req->opcode != IORING_OP_POLL_ADD && do_complete) {
 5234		/* non-poll requests have submit ref still */
 5235		req_ref_put(req);
 5236	}
 5237	return do_complete;
 5238}
 5239
 5240static bool io_poll_remove_one(struct io_kiocb *req)
 5241	__must_hold(&req->ctx->completion_lock)
 5242{
 5243	bool do_complete;
 5244
 5245	do_complete = io_poll_remove_waitqs(req);
 5246	if (do_complete) {
 5247		io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
 5248		io_commit_cqring(req->ctx);
 5249		req_set_fail_links(req);
 5250		io_put_req_deferred(req, 1);
 5251	}
 5252
 5253	return do_complete;
 5254}
 5255
 5256/*
 5257 * Returns true if we found and killed one or more poll requests
 5258 */
 5259static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
 5260			       struct files_struct *files)
 5261{
 5262	struct hlist_node *tmp;
 5263	struct io_kiocb *req;
 5264	int posted = 0, i;
 5265
 5266	spin_lock_irq(&ctx->completion_lock);
 5267	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
 5268		struct hlist_head *list;
 5269
 5270		list = &ctx->cancel_hash[i];
 5271		hlist_for_each_entry_safe(req, tmp, list, hash_node) {
 5272			if (io_match_task(req, tsk, files))
 5273				posted += io_poll_remove_one(req);
 5274		}
 5275	}
 5276	spin_unlock_irq(&ctx->completion_lock);
 5277
 5278	if (posted)
 5279		io_cqring_ev_posted(ctx);
 5280
 5281	return posted != 0;
 5282}
 5283
 5284static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
 5285				     bool poll_only)
 5286	__must_hold(&ctx->completion_lock)
 5287{
 5288	struct hlist_head *list;
 5289	struct io_kiocb *req;
 5290
 5291	list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
 5292	hlist_for_each_entry(req, list, hash_node) {
 5293		if (sqe_addr != req->user_data)
 5294			continue;
 5295		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
 5296			continue;
 5297		return req;
 5298	}
 5299	return NULL;
 5300}
 5301
 5302static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
 5303			  bool poll_only)
 5304	__must_hold(&ctx->completion_lock)
 5305{
 5306	struct io_kiocb *req;
 5307
 5308	req = io_poll_find(ctx, sqe_addr, poll_only);
 5309	if (!req)
 5310		return -ENOENT;
 5311	if (io_poll_remove_one(req))
 5312		return 0;
 5313
 5314	return -EALREADY;
 5315}
 5316
 5317static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
 5318				     unsigned int flags)
 5319{
 5320	u32 events;
 5321
 5322	events = READ_ONCE(sqe->poll32_events);
 5323#ifdef __BIG_ENDIAN
 5324	events = swahw32(events);
 5325#endif
 5326	if (!(flags & IORING_POLL_ADD_MULTI))
 5327		events |= EPOLLONESHOT;
 5328	return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
 5329}
 5330
 5331static int io_poll_update_prep(struct io_kiocb *req,
 5332			       const struct io_uring_sqe *sqe)
 5333{
 5334	struct io_poll_update *upd = &req->poll_update;
 5335	u32 flags;
 5336
 5337	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5338		return -EINVAL;
 5339	if (sqe->ioprio || sqe->buf_index)
 5340		return -EINVAL;
 5341	flags = READ_ONCE(sqe->len);
 5342	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
 5343		      IORING_POLL_ADD_MULTI))
 5344		return -EINVAL;
 5345	/* meaningless without update */
 5346	if (flags == IORING_POLL_ADD_MULTI)
 5347		return -EINVAL;
 5348
 5349	upd->old_user_data = READ_ONCE(sqe->addr);
 5350	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
 5351	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
 5352
 5353	upd->new_user_data = READ_ONCE(sqe->off);
 5354	if (!upd->update_user_data && upd->new_user_data)
 5355		return -EINVAL;
 5356	if (upd->update_events)
 5357		upd->events = io_poll_parse_events(sqe, flags);
 5358	else if (sqe->poll32_events)
 5359		return -EINVAL;
 5360
 5361	return 0;
 5362}
 5363
 5364static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 5365			void *key)
 5366{
 5367	struct io_kiocb *req = wait->private;
 5368	struct io_poll_iocb *poll = &req->poll;
 5369
 5370	return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
 5371}
 5372
 5373static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 5374			       struct poll_table_struct *p)
 5375{
 5376	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
 5377
 5378	__io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
 5379}
 5380
 5381static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 5382{
 5383	struct io_poll_iocb *poll = &req->poll;
 5384	u32 flags;
 5385
 5386	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5387		return -EINVAL;
 5388	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
 5389		return -EINVAL;
 5390	flags = READ_ONCE(sqe->len);
 5391	if (flags & ~IORING_POLL_ADD_MULTI)
 5392		return -EINVAL;
 5393
 5394	poll->events = io_poll_parse_events(sqe, flags);
 5395	return 0;
 5396}
 5397
 5398static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 5399{
 5400	struct io_poll_iocb *poll = &req->poll;
 5401	struct io_ring_ctx *ctx = req->ctx;
 5402	struct io_poll_table ipt;
 5403	__poll_t mask;
 5404
 5405	ipt.pt._qproc = io_poll_queue_proc;
 5406
 5407	mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
 5408					io_poll_wake);
 5409
 5410	if (mask) { /* no async, we'd stolen it */
 5411		ipt.error = 0;
 5412		io_poll_complete(req, mask);
 5413	}
 5414	spin_unlock_irq(&ctx->completion_lock);
 5415
 5416	if (mask) {
 5417		io_cqring_ev_posted(ctx);
 5418		if (poll->events & EPOLLONESHOT)
 5419			io_put_req(req);
 5420	}
 5421	return ipt.error;
 5422}
 5423
/*
 * Issue handler for IORING_OP_POLL_REMOVE: remove or update an existing
 * IORING_OP_POLL_ADD request, identified by req->poll_update.old_user_data.
 *
 * - If neither the events nor the user_data update flag is set, the target
 *   is simply removed.
 * - Otherwise the target is detached from its wait queue, its event mask
 *   and/or user_data are rewritten, and it is re-issued via io_poll_add().
 *
 * The outcome is always reported through the completion of @req
 * (-ENOENT, -EALREADY or 0); the function itself always returns 0.
 */
static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *preq;
	bool completing;
	int ret;

	spin_lock_irq(&ctx->completion_lock);
	/* poll_only == true: only IORING_OP_POLL_ADD requests may be targeted */
	preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
	if (!preq) {
		ret = -ENOENT;
		goto err;
	}

	/* no update flags set: this is a plain poll removal */
	if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
		completing = true;
		ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
		goto err;
	}

	/*
	 * Don't allow racy completion with singleshot, as we cannot safely
	 * update those. For multishot, if we're racing with completion, just
	 * let completion re-add it.
	 */
	completing = !__io_poll_remove_one(preq, &preq->poll, false);
	if (completing && (preq->poll.events & EPOLLONESHOT)) {
		ret = -EALREADY;
		goto err;
	}
	/* we now have a detached poll request. reissue. */
	ret = 0;
err:
	/* reached on success as well; only ret < 0 takes the failure exit */
	if (ret < 0) {
		spin_unlock_irq(&ctx->completion_lock);
		req_set_fail_links(req);
		io_req_complete(req, ret);
		return 0;
	}
	/* only mask one event flags, keep behavior flags */
	if (req->poll_update.update_events) {
		preq->poll.events &= ~0xffff;
		preq->poll.events |= req->poll_update.events & 0xffff;
		preq->poll.events |= IO_POLL_UNMASK;
	}
	if (req->poll_update.update_user_data)
		preq->user_data = req->poll_update.new_user_data;
	spin_unlock_irq(&ctx->completion_lock);

	/* complete update request, we're done with it */
	io_req_complete(req, ret);

	/* re-arm the target unless it was removed or is completing on its own */
	if (!completing) {
		ret = io_poll_add(preq, issue_flags);
		if (ret < 0) {
			req_set_fail_links(preq);
			io_req_complete(preq, ret);
		}
	}
	return 0;
}
 5485
 5486static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 5487{
 5488	struct io_timeout_data *data = container_of(timer,
 5489						struct io_timeout_data, timer);
 5490	struct io_kiocb *req = data->req;
 5491	struct io_ring_ctx *ctx = req->ctx;
 5492	unsigned long flags;
 5493
 5494	spin_lock_irqsave(&ctx->completion_lock, flags);
 5495	list_del_init(&req->timeout.list);
 5496	atomic_set(&req->ctx->cq_timeouts,
 5497		atomic_read(&req->ctx->cq_timeouts) + 1);
 5498
 5499	io_cqring_fill_event(ctx, req->user_data, -ETIME, 0);
 5500	io_commit_cqring(ctx);
 5501	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 5502
 5503	io_cqring_ev_posted(ctx);
 5504	req_set_fail_links(req);
 5505	io_put_req(req);
 5506	return HRTIMER_NORESTART;
 5507}
 5508
 5509static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
 5510					   __u64 user_data)
 5511	__must_hold(&ctx->completion_lock)
 5512{
 5513	struct io_timeout_data *io;
 5514	struct io_kiocb *req;
 5515	bool found = false;
 5516
 5517	list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
 5518		found = user_data == req->user_data;
 5519		if (found)
 5520			break;
 5521	}
 5522	if (!found)
 5523		return ERR_PTR(-ENOENT);
 5524
 5525	io = req->async_data;
 5526	if (hrtimer_try_to_cancel(&io->timer) == -1)
 5527		return ERR_PTR(-EALREADY);
 5528	list_del_init(&req->timeout.list);
 5529	return req;
 5530}
 5531
 5532static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 5533	__must_hold(&ctx->completion_lock)
 5534{
 5535	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
 5536
 5537	if (IS_ERR(req))
 5538		return PTR_ERR(req);
 5539
 5540	req_set_fail_links(req);
 5541	io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
 5542	io_put_req_deferred(req, 1);
 5543	return 0;
 5544}
 5545
 5546static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 5547			     struct timespec64 *ts, enum hrtimer_mode mode)
 5548	__must_hold(&ctx->completion_lock)
 5549{
 5550	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
 5551	struct io_timeout_data *data;
 5552
 5553	if (IS_ERR(req))
 5554		return PTR_ERR(req);
 5555
 5556	req->timeout.off = 0; /* noseq */
 5557	data = req->async_data;
 5558	list_add_tail(&req->timeout.list, &ctx->timeout_list);
 5559	hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
 5560	data->timer.function = io_timeout_fn;
 5561	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
 5562	return 0;
 5563}
 5564
 5565static int io_timeout_remove_prep(struct io_kiocb *req,
 5566				  const struct io_uring_sqe *sqe)
 5567{
 5568	struct io_timeout_rem *tr = &req->timeout_rem;
 5569
 5570	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5571		return -EINVAL;
 5572	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 5573		return -EINVAL;
 5574	if (sqe->ioprio || sqe->buf_index || sqe->len)
 5575		return -EINVAL;
 5576
 5577	tr->addr = READ_ONCE(sqe->addr);
 5578	tr->flags = READ_ONCE(sqe->timeout_flags);
 5579	if (tr->flags & IORING_TIMEOUT_UPDATE) {
 5580		if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
 5581			return -EINVAL;
 5582		if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
 5583			return -EFAULT;
 5584	} else if (tr->flags) {
 5585		/* timeout removal doesn't support flags */
 5586		return -EINVAL;
 5587	}
 5588
 5589	return 0;
 5590}
 5591
 5592static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
 5593{
 5594	return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
 5595					    : HRTIMER_MODE_REL;
 5596}
 5597
 5598/*
 5599 * Remove or update an existing timeout command
 5600 */
 5601static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
 5602{
 5603	struct io_timeout_rem *tr = &req->timeout_rem;
 5604	struct io_ring_ctx *ctx = req->ctx;
 5605	int ret;
 5606
 5607	spin_lock_irq(&ctx->completion_lock);
 5608	if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
 5609		ret = io_timeout_cancel(ctx, tr->addr);
 5610	else
 5611		ret = io_timeout_update(ctx, tr->addr, &tr->ts,
 5612					io_translate_timeout_mode(tr->flags));
 5613
 5614	io_cqring_fill_event(ctx, req->user_data, ret, 0);
 5615	io_commit_cqring(ctx);
 5616	spin_unlock_irq(&ctx->completion_lock);
 5617	io_cqring_ev_posted(ctx);
 5618	if (ret < 0)
 5619		req_set_fail_links(req);
 5620	io_put_req(req);
 5621	return 0;
 5622}
 5623
 5624static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 5625			   bool is_timeout_link)
 5626{
 5627	struct io_timeout_data *data;
 5628	unsigned flags;
 5629	u32 off = READ_ONCE(sqe->off);
 5630
 5631	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5632		return -EINVAL;
 5633	if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
 5634		return -EINVAL;
 5635	if (off && is_timeout_link)
 5636		return -EINVAL;
 5637	flags = READ_ONCE(sqe->timeout_flags);
 5638	if (flags & ~IORING_TIMEOUT_ABS)
 5639		return -EINVAL;
 5640
 5641	req->timeout.off = off;
 5642
 5643	if (!req->async_data && io_alloc_async_data(req))
 5644		return -ENOMEM;
 5645
 5646	data = req->async_data;
 5647	data->req = req;
 5648
 5649	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
 5650		return -EFAULT;
 5651
 5652	data->mode = io_translate_timeout_mode(flags);
 5653	hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
 5654	if (is_timeout_link)
 5655		io_req_track_inflight(req);
 5656	return 0;
 5657}
 5658
 5659static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
 5660{
 5661	struct io_ring_ctx *ctx = req->ctx;
 5662	struct io_timeout_data *data = req->async_data;
 5663	struct list_head *entry;
 5664	u32 tail, off = req->timeout.off;
 5665
 5666	spin_lock_irq(&ctx->completion_lock);
 5667
 5668	/*
 5669	 * sqe->off holds how many events that need to occur for this
 5670	 * timeout event to be satisfied. If it isn't set, then this is
 5671	 * a pure timeout request, sequence isn't used.
 5672	 */
 5673	if (io_is_timeout_noseq(req)) {
 5674		entry = ctx->timeout_list.prev;
 5675		goto add;
 5676	}
 5677
 5678	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
 5679	req->timeout.target_seq = tail + off;
 5680
 5681	/* Update the last seq here in case io_flush_timeouts() hasn't.
 5682	 * This is safe because ->completion_lock is held, and submissions
 5683	 * and completions are never mixed in the same ->completion_lock section.
 5684	 */
 5685	ctx->cq_last_tm_flush = tail;
 5686
 5687	/*
 5688	 * Insertion sort, ensuring the first entry in the list is always
 5689	 * the one we need first.
 5690	 */
 5691	list_for_each_prev(entry, &ctx->timeout_list) {
 5692		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
 5693						  timeout.list);
 5694
 5695		if (io_is_timeout_noseq(nxt))
 5696			continue;
 5697		/* nxt.seq is behind @tail, otherwise would've been completed */
 5698		if (off >= nxt->timeout.target_seq - tail)
 5699			break;
 5700	}
 5701add:
 5702	list_add(&req->timeout.list, entry);
 5703	data->timer.function = io_timeout_fn;
 5704	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
 5705	spin_unlock_irq(&ctx->completion_lock);
 5706	return 0;
 5707}
 5708
/*
 * Match criteria handed to io_cancel_cb() when cancelling io-wq work:
 * a work item matches if it belongs to @ctx and carries @user_data.
 */
struct io_cancel_data {
	struct io_ring_ctx *ctx;
	u64 user_data;
};
 5713
 5714static bool io_cancel_cb(struct io_wq_work *work, void *data)
 5715{
 5716	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 5717	struct io_cancel_data *cd = data;
 5718
 5719	return req->ctx == cd->ctx && req->user_data == cd->user_data;
 5720}
 5721
 5722static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
 5723			       struct io_ring_ctx *ctx)
 5724{
 5725	struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
 5726	enum io_wq_cancel cancel_ret;
 5727	int ret = 0;
 5728
 5729	if (!tctx || !tctx->io_wq)
 5730		return -ENOENT;
 5731
 5732	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
 5733	switch (cancel_ret) {
 5734	case IO_WQ_CANCEL_OK:
 5735		ret = 0;
 5736		break;
 5737	case IO_WQ_CANCEL_RUNNING:
 5738		ret = -EALREADY;
 5739		break;
 5740	case IO_WQ_CANCEL_NOTFOUND:
 5741		ret = -ENOENT;
 5742		break;
 5743	}
 5744
 5745	return ret;
 5746}
 5747
 5748static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
 5749				     struct io_kiocb *req, __u64 sqe_addr,
 5750				     int success_ret)
 5751{
 5752	unsigned long flags;
 5753	int ret;
 5754
 5755	ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
 5756	spin_lock_irqsave(&ctx->completion_lock, flags);
 5757	if (ret != -ENOENT)
 5758		goto done;
 5759	ret = io_timeout_cancel(ctx, sqe_addr);
 5760	if (ret != -ENOENT)
 5761		goto done;
 5762	ret = io_poll_cancel(ctx, sqe_addr, false);
 5763done:
 5764	if (!ret)
 5765		ret = success_ret;
 5766	io_cqring_fill_event(ctx, req->user_data, ret, 0);
 5767	io_commit_cqring(ctx);
 5768	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 5769	io_cqring_ev_posted(ctx);
 5770
 5771	if (ret < 0)
 5772		req_set_fail_links(req);
 5773}
 5774
 5775static int io_async_cancel_prep(struct io_kiocb *req,
 5776				const struct io_uring_sqe *sqe)
 5777{
 5778	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5779		return -EINVAL;
 5780	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 5781		return -EINVAL;
 5782	if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
 5783		return -EINVAL;
 5784
 5785	req->cancel.addr = READ_ONCE(sqe->addr);
 5786	return 0;
 5787}
 5788
 5789static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 5790{
 5791	struct io_ring_ctx *ctx = req->ctx;
 5792	u64 sqe_addr = req->cancel.addr;
 5793	struct io_tctx_node *node;
 5794	int ret;
 5795
 5796	/* tasks should wait for their io-wq threads, so safe w/o sync */
 5797	ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
 5798	spin_lock_irq(&ctx->completion_lock);
 5799	if (ret != -ENOENT)
 5800		goto done;
 5801	ret = io_timeout_cancel(ctx, sqe_addr);
 5802	if (ret != -ENOENT)
 5803		goto done;
 5804	ret = io_poll_cancel(ctx, sqe_addr, false);
 5805	if (ret != -ENOENT)
 5806		goto done;
 5807	spin_unlock_irq(&ctx->completion_lock);
 5808
 5809	/* slow path, try all io-wq's */
 5810	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
 5811	ret = -ENOENT;
 5812	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
 5813		struct io_uring_task *tctx = node->task->io_uring;
 5814
 5815		ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
 5816		if (ret != -ENOENT)
 5817			break;
 5818	}
 5819	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
 5820
 5821	spin_lock_irq(&ctx->completion_lock);
 5822done:
 5823	io_cqring_fill_event(ctx, req->user_data, ret, 0);
 5824	io_commit_cqring(ctx);
 5825	spin_unlock_irq(&ctx->completion_lock);
 5826	io_cqring_ev_posted(ctx);
 5827
 5828	if (ret < 0)
 5829		req_set_fail_links(req);
 5830	io_put_req(req);
 5831	return 0;
 5832}
 5833
 5834static int io_rsrc_update_prep(struct io_kiocb *req,
 5835				const struct io_uring_sqe *sqe)
 5836{
 5837	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 5838		return -EINVAL;
 5839	if (sqe->ioprio || sqe->rw_flags)
 5840		return -EINVAL;
 5841
 5842	req->rsrc_update.offset = READ_ONCE(sqe->off);
 5843	req->rsrc_update.nr_args = READ_ONCE(sqe->len);
 5844	if (!req->rsrc_update.nr_args)
 5845		return -EINVAL;
 5846	req->rsrc_update.arg = READ_ONCE(sqe->addr);
 5847	return 0;
 5848}
 5849
 5850static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 5851{
 5852	struct io_ring_ctx *ctx = req->ctx;
 5853	struct io_uring_rsrc_update2 up;
 5854	int ret;
 5855
 5856	if (issue_flags & IO_URING_F_NONBLOCK)
 5857		return -EAGAIN;
 5858
 5859	up.offset = req->rsrc_update.offset;
 5860	up.data = req->rsrc_update.arg;
 5861	up.nr = 0;
 5862	up.tags = 0;
 5863	up.resv = 0;
 5864
 5865	mutex_lock(&ctx->uring_lock);
 5866	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
 5867					&up, req->rsrc_update.nr_args);
 5868	mutex_unlock(&ctx->uring_lock);
 5869
 5870	if (ret < 0)
 5871		req_set_fail_links(req);
 5872	__io_req_complete(req, issue_flags, ret, 0);
 5873	return 0;
 5874}
 5875
 5876static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 5877{
 5878	switch (req->opcode) {
 5879	case IORING_OP_NOP:
 5880		return 0;
 5881	case IORING_OP_READV:
 5882	case IORING_OP_READ_FIXED:
 5883	case IORING_OP_READ:
 5884		return io_read_prep(req, sqe);
 5885	case IORING_OP_WRITEV:
 5886	case IORING_OP_WRITE_FIXED:
 5887	case IORING_OP_WRITE:
 5888		return io_write_prep(req, sqe);
 5889	case IORING_OP_POLL_ADD:
 5890		return io_poll_add_prep(req, sqe);
 5891	case IORING_OP_POLL_REMOVE:
 5892		return io_poll_update_prep(req, sqe);
 5893	case IORING_OP_FSYNC:
 5894		return io_fsync_prep(req, sqe);
 5895	case IORING_OP_SYNC_FILE_RANGE:
 5896		return io_sfr_prep(req, sqe);
 5897	case IORING_OP_SENDMSG:
 5898	case IORING_OP_SEND:
 5899		return io_sendmsg_prep(req, sqe);
 5900	case IORING_OP_RECVMSG:
 5901	case IORING_OP_RECV:
 5902		return io_recvmsg_prep(req, sqe);
 5903	case IORING_OP_CONNECT:
 5904		return io_connect_prep(req, sqe);
 5905	case IORING_OP_TIMEOUT:
 5906		return io_timeout_prep(req, sqe, false);
 5907	case IORING_OP_TIMEOUT_REMOVE:
 5908		return io_timeout_remove_prep(req, sqe);
 5909	case IORING_OP_ASYNC_CANCEL:
 5910		return io_async_cancel_prep(req, sqe);
 5911	case IORING_OP_LINK_TIMEOUT:
 5912		return io_timeout_prep(req, sqe, true);
 5913	case IORING_OP_ACCEPT:
 5914		return io_accept_prep(req, sqe);
 5915	case IORING_OP_FALLOCATE:
 5916		return io_fallocate_prep(req, sqe);
 5917	case IORING_OP_OPENAT:
 5918		return io_openat_prep(req, sqe);
 5919	case IORING_OP_CLOSE:
 5920		return io_close_prep(req, sqe);
 5921	case IORING_OP_FILES_UPDATE:
 5922		return io_rsrc_update_prep(req, sqe);
 5923	case IORING_OP_STATX:
 5924		return io_statx_prep(req, sqe);
 5925	case IORING_OP_FADVISE:
 5926		return io_fadvise_prep(req, sqe);
 5927	case IORING_OP_MADVISE:
 5928		return io_madvise_prep(req, sqe);
 5929	case IORING_OP_OPENAT2:
 5930		return io_openat2_prep(req, sqe);
 5931	case IORING_OP_EPOLL_CTL:
 5932		return io_epoll_ctl_prep(req, sqe);
 5933	case IORING_OP_SPLICE:
 5934		return io_splice_prep(req, sqe);
 5935	case IORING_OP_PROVIDE_BUFFERS:
 5936		return io_provide_buffers_prep(req, sqe);
 5937	case IORING_OP_REMOVE_BUFFERS:
 5938		return io_remove_buffers_prep(req, sqe);
 5939	case IORING_OP_TEE:
 5940		return io_tee_prep(req, sqe);
 5941	case IORING_OP_SHUTDOWN:
 5942		return io_shutdown_prep(req, sqe);
 5943	case IORING_OP_RENAMEAT:
 5944		return io_renameat_prep(req, sqe);
 5945	case IORING_OP_UNLINKAT:
 5946		return io_unlinkat_prep(req, sqe);
 5947	}
 5948
 5949	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
 5950			req->opcode);
 5951	return -EINVAL;
 5952}
 5953
 5954static int io_req_prep_async(struct io_kiocb *req)
 5955{
 5956	if (!io_op_defs[req->opcode].needs_async_setup)
 5957		return 0;
 5958	if (WARN_ON_ONCE(req->async_data))
 5959		return -EFAULT;
 5960	if (io_alloc_async_data(req))
 5961		return -EAGAIN;
 5962
 5963	switch (req->opcode) {
 5964	case IORING_OP_READV:
 5965		return io_rw_prep_async(req, READ);
 5966	case IORING_OP_WRITEV:
 5967		return io_rw_prep_async(req, WRITE);
 5968	case IORING_OP_SENDMSG:
 5969		return io_sendmsg_prep_async(req);
 5970	case IORING_OP_RECVMSG:
 5971		return io_recvmsg_prep_async(req);
 5972	case IORING_OP_CONNECT:
 5973		return io_connect_prep_async(req);
 5974	}
 5975	printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
 5976		    req->opcode);
 5977	return -EFAULT;
 5978}
 5979
 5980static u32 io_get_sequence(struct io_kiocb *req)
 5981{
 5982	struct io_kiocb *pos;
 5983	struct io_ring_ctx *ctx = req->ctx;
 5984	u32 total_submitted, nr_reqs = 0;
 5985
 5986	io_for_each_link(pos, req)
 5987		nr_reqs++;
 5988
 5989	total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
 5990	return total_submitted - nr_reqs;
 5991}
 5992
/*
 * Decide whether @req must be deferred and, if so, queue it on
 * ctx->defer_list.
 *
 * Returns 0 if the request can be issued right away, -EIOCBQUEUED if it
 * was handed off (either put on the defer list or punted to async work),
 * or a negative error from async prep / allocation failure.
 */
static int io_req_defer(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_defer_entry *de;
	int ret;
	u32 seq;

	/* Still need defer if there is pending req in defer list. */
	if (likely(list_empty_careful(&ctx->defer_list) &&
		!(req->flags & REQ_F_IO_DRAIN)))
		return 0;

	seq = io_get_sequence(req);
	/* Still a chance to pass the sequence check */
	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
		return 0;

	ret = io_req_prep_async(req);
	if (ret)
		return ret;
	io_prep_async_link(req);
	/* allocate before taking the spinlock: GFP_KERNEL may sleep */
	de = kmalloc(sizeof(*de), GFP_KERNEL);
	if (!de)
		return -ENOMEM;

	spin_lock_irq(&ctx->completion_lock);
	/* recheck under the lock; the checks above were lockless */
	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
		spin_unlock_irq(&ctx->completion_lock);
		kfree(de);
		io_queue_async_work(req);
		return -EIOCBQUEUED;
	}

	trace_io_uring_defer(ctx, req, req->user_data);
	de->req = req;
	de->seq = seq;
	list_add_tail(&de->list, &ctx->defer_list);
	spin_unlock_irq(&ctx->completion_lock);
	return -EIOCBQUEUED;
}
 6033
 6034static void io_clean_op(struct io_kiocb *req)
 6035{
 6036	if (req->flags & REQ_F_BUFFER_SELECTED) {
 6037		switch (req->opcode) {
 6038		case IORING_OP_READV:
 6039		case IORING_OP_READ_FIXED:
 6040		case IORING_OP_READ:
 6041			kfree((void *)(unsigned long)req->rw.addr);
 6042			break;
 6043		case IORING_OP_RECVMSG:
 6044		case IORING_OP_RECV:
 6045			kfree(req->sr_msg.kbuf);
 6046			break;
 6047		}
 6048		req->flags &= ~REQ_F_BUFFER_SELECTED;
 6049	}
 6050
 6051	if (req->flags & REQ_F_NEED_CLEANUP) {
 6052		switch (req->opcode) {
 6053		case IORING_OP_READV:
 6054		case IORING_OP_READ_FIXED:
 6055		case IORING_OP_READ:
 6056		case IORING_OP_WRITEV:
 6057		case IORING_OP_WRITE_FIXED:
 6058		case IORING_OP_WRITE: {
 6059			struct io_async_rw *io = req->async_data;
 6060			if (io->free_iovec)
 6061				kfree(io->free_iovec);
 6062			break;
 6063			}
 6064		case IORING_OP_RECVMSG:
 6065		case IORING_OP_SENDMSG: {
 6066			struct io_async_msghdr *io = req->async_data;
 6067
 6068			kfree(io->free_iov);
 6069			break;
 6070			}
 6071		case IORING_OP_SPLICE:
 6072		case IORING_OP_TEE:
 6073			if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
 6074				io_put_file(req->splice.file_in);
 6075			break;
 6076		case IORING_OP_OPENAT:
 6077		case IORING_OP_OPENAT2:
 6078			if (req->open.filename)
 6079				putname(req->open.filename);
 6080			break;
 6081		case IORING_OP_RENAMEAT:
 6082			putname(req->rename.oldpath);
 6083			putname(req->rename.newpath);
 6084			break;
 6085		case IORING_OP_UNLINKAT:
 6086			putname(req->unlink.filename);
 6087			break;
 6088		}
 6089		req->flags &= ~REQ_F_NEED_CLEANUP;
 6090	}
 6091	if ((req->flags & REQ_F_POLLED) && req->apoll) {
 6092		kfree(req->apoll->double_poll);
 6093		kfree(req->apoll);
 6094		req->apoll = NULL;
 6095	}
 6096	if (req->flags & REQ_F_INFLIGHT) {
 6097		struct io_uring_task *tctx = req->task->io_uring;
 6098
 6099		atomic_dec(&tctx->inflight_tracked);
 6100		req->flags &= ~REQ_F_INFLIGHT;
 6101	}
 6102}
 6103
 6104static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 6105{
 6106	struct io_ring_ctx *ctx = req->ctx;
 6107	const struct cred *creds = NULL;
 6108	int ret;
 6109
 6110	if (req->work.creds && req->work.creds != current_cred())
 6111		creds = override_creds(req->work.creds);
 6112
 6113	switch (req->opcode) {
 6114	case IORING_OP_NOP:
 6115		ret = io_nop(req, issue_flags);
 6116		break;
 6117	case IORING_OP_READV:
 6118	case IORING_OP_READ_FIXED:
 6119	case IORING_OP_READ:
 6120		ret = io_read(req, issue_flags);
 6121		break;
 6122	case IORING_OP_WRITEV:
 6123	case IORING_OP_WRITE_FIXED:
 6124	case IORING_OP_WRITE:
 6125		ret = io_write(req, issue_flags);
 6126		break;
 6127	case IORING_OP_FSYNC:
 6128		ret = io_fsync(req, issue_flags);
 6129		break;
 6130	case IORING_OP_POLL_ADD:
 6131		ret = io_poll_add(req, issue_flags);
 6132		break;
 6133	case IORING_OP_POLL_REMOVE:
 6134		ret = io_poll_update(req, issue_flags);
 6135		break;
 6136	case IORING_OP_SYNC_FILE_RANGE:
 6137		ret = io_sync_file_range(req, issue_flags);
 6138		break;
 6139	case IORING_OP_SENDMSG:
 6140		ret = io_sendmsg(req, issue_flags);
 6141		break;
 6142	case IORING_OP_SEND:
 6143		ret = io_send(req, issue_flags);
 6144		break;
 6145	case IORING_OP_RECVMSG:
 6146		ret = io_recvmsg(req, issue_flags);
 6147		break;
 6148	case IORING_OP_RECV:
 6149		ret = io_recv(req, issue_flags);
 6150		break;
 6151	case IORING_OP_TIMEOUT:
 6152		ret = io_timeout(req, issue_flags);
 6153		break;
 6154	case IORING_OP_TIMEOUT_REMOVE:
 6155		ret = io_timeout_remove(req, issue_flags);
 6156		break;
 6157	case IORING_OP_ACCEPT:
 6158		ret = io_accept(req, issue_flags);
 6159		break;
 6160	case IORING_OP_CONNECT:
 6161		ret = io_connect(req, issue_flags);
 6162		break;
 6163	case IORING_OP_ASYNC_CANCEL:
 6164		ret = io_async_cancel(req, issue_flags);
 6165		break;
 6166	case IORING_OP_FALLOCATE:
 6167		ret = io_fallocate(req, issue_flags);
 6168		break;
 6169	case IORING_OP_OPENAT:
 6170		ret = io_openat(req, issue_flags);
 6171		break;
 6172	case IORING_OP_CLOSE:
 6173		ret = io_close(req, issue_flags);
 6174		break;
 6175	case IORING_OP_FILES_UPDATE:
 6176		ret = io_files_update(req, issue_flags);
 6177		break;
 6178	case IORING_OP_STATX:
 6179		ret = io_statx(req, issue_flags);
 6180		break;
 6181	case IORING_OP_FADVISE:
 6182		ret = io_fadvise(req, issue_flags);
 6183		break;
 6184	case IORING_OP_MADVISE:
 6185		ret = io_madvise(req, issue_flags);
 6186		break;
 6187	case IORING_OP_OPENAT2:
 6188		ret = io_openat2(req, issue_flags);
 6189		break;
 6190	case IORING_OP_EPOLL_CTL:
 6191		ret = io_epoll_ctl(req, issue_flags);
 6192		break;
 6193	case IORING_OP_SPLICE:
 6194		ret = io_splice(req, issue_flags);
 6195		break;
 6196	case IORING_OP_PROVIDE_BUFFERS:
 6197		ret = io_provide_buffers(req, issue_flags);
 6198		break;
 6199	case IORING_OP_REMOVE_BUFFERS:
 6200		ret = io_remove_buffers(req, issue_flags);
 6201		break;
 6202	case IORING_OP_TEE:
 6203		ret = io_tee(req, issue_flags);
 6204		break;
 6205	case IORING_OP_SHUTDOWN:
 6206		ret = io_shutdown(req, issue_flags);
 6207		break;
 6208	case IORING_OP_RENAMEAT:
 6209		ret = io_renameat(req, issue_flags);
 6210		break;
 6211	case IORING_OP_UNLINKAT:
 6212		ret = io_unlinkat(req, issue_flags);
 6213		break;
 6214	default:
 6215		ret = -EINVAL;
 6216		break;
 6217	}
 6218
 6219	if (creds)
 6220		revert_creds(creds);
 6221
 6222	if (ret)
 6223		return ret;
 6224
 6225	/* If the op doesn't have a file, we're not polling for it */
 6226	if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
 6227		const bool in_async = io_wq_current_is_worker();
 6228
 6229		/* workqueue context doesn't hold uring_lock, grab it now */
 6230		if (in_async)
 6231			mutex_lock(&ctx->uring_lock);
 6232
 6233		io_iopoll_req_issued(req, in_async);
 6234
 6235		if (in_async)
 6236			mutex_unlock(&ctx->uring_lock);
 6237	}
 6238
 6239	return 0;
 6240}
 6241
 6242static void io_wq_submit_work(struct io_wq_work *work)
 6243{
 6244	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 6245	struct io_kiocb *timeout;
 6246	int ret = 0;
 6247
 6248	timeout = io_prep_linked_timeout(req);
 6249	if (timeout)
 6250		io_queue_linked_timeout(timeout);
 6251
 6252	if (work->flags & IO_WQ_WORK_CANCEL)
 6253		ret = -ECANCELED;
 6254
 6255	if (!ret) {
 6256		do {
 6257			ret = io_issue_sqe(req, 0);
 6258			/*
 6259			 * We can get EAGAIN for polled IO even though we're
 6260			 * forcing a sync submission from here, since we can't
 6261			 * wait for request slots on the block side.
 6262			 */
 6263			if (ret != -EAGAIN)
 6264				break;
 6265			cond_resched();
 6266		} while (1);
 6267	}
 6268
 6269	/* avoid locking problems by failing it from a clean context */
 6270	if (ret) {
 6271		/* io-wq is going to take one down */
 6272		req_ref_get(req);
 6273		io_req_task_queue_fail(req, ret);
 6274	}
 6275}
 6276
 6277#define FFS_ASYNC_READ		0x1UL
 6278#define FFS_ASYNC_WRITE		0x2UL
 6279#ifdef CONFIG_64BIT
 6280#define FFS_ISREG		0x4UL
 6281#else
 6282#define FFS_ISREG		0x0UL
 6283#endif
 6284#define FFS_MASK		~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
 6285
 6286static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
 6287						      unsigned i)
 6288{
 6289	struct io_fixed_file *table_l2;
 6290
 6291	table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT];
 6292	return &table_l2[i & IORING_FILE_TABLE_MASK];
 6293}
 6294
 6295static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
 6296					      int index)
 6297{
 6298	struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
 6299
 6300	return (struct file *) (slot->file_ptr & FFS_MASK);
 6301}
 6302
 6303static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
 6304{
 6305	unsigned long file_ptr = (unsigned long) file;
 6306
 6307	if (__io_file_supports_async(file, READ))
 6308		file_ptr |= FFS_ASYNC_READ;
 6309	if (__io_file_supports_async(file, WRITE))
 6310		file_ptr |= FFS_ASYNC_WRITE;
 6311	if (S_ISREG(file_inode(file)->i_mode))
 6312		file_ptr |= FFS_ISREG;
 6313	file_slot->file_ptr = file_ptr;
 6314}
 6315
 6316static struct file *io_file_get(struct io_submit_state *state,
 6317				struct io_kiocb *req, int fd, bool fixed)
 6318{
 6319	struct io_ring_ctx *ctx = req->ctx;
 6320	struct file *file;
 6321
 6322	if (fixed) {
 6323		unsigned long file_ptr;
 6324
 6325		if (unlikely((unsigned int)fd >= ctx->nr_user_files))
 6326			return NULL;
 6327		fd = array_index_nospec(fd, ctx->nr_user_files);
 6328		file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
 6329		file = (struct file *) (file_ptr & FFS_MASK);
 6330		file_ptr &= ~FFS_MASK;
 6331		/* mask in overlapping REQ_F and FFS bits */
 6332		req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT);
 6333		io_req_set_rsrc_node(req);
 6334	} else {
 6335		trace_io_uring_file_get(ctx, fd);
 6336		file = __io_file_get(state, fd);
 6337
 6338		/* we don't allow fixed io_uring files */
 6339		if (file && unlikely(file->f_op == &io_uring_fops))
 6340			io_req_track_inflight(req);
 6341	}
 6342
 6343	return file;
 6344}
 6345
/*
 * hrtimer callback for a linked timeout. If the request this timeout was
 * guarding (req->timeout.head) is still around, try to cancel it and
 * report -ETIME for a successful cancel; otherwise just complete the
 * timeout request itself with -ETIME. The timer is never restarted.
 */
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
	struct io_timeout_data *data = container_of(timer,
						struct io_timeout_data, timer);
	struct io_kiocb *prev, *req = data->req;
	struct io_ring_ctx *ctx = req->ctx;
	unsigned long flags;

	spin_lock_irqsave(&ctx->completion_lock, flags);
	/* take over the back reference; clearing it marks the timeout as fired */
	prev = req->timeout.head;
	req->timeout.head = NULL;

	/*
	 * We don't expect the list to be empty, that will only happen if we
	 * race with the completion of the linked work.
	 */
	if (prev) {
		io_remove_next_linked(prev);
		/* prev may already be on its way out; only use it if we got a ref */
		if (!req_ref_inc_not_zero(prev))
			prev = NULL;
	}
	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	if (prev) {
		io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
		/* drop the reference taken on prev above, then one on the timeout */
		io_put_req_deferred(prev, 1);
		io_put_req_deferred(req, 1);
	} else {
		io_req_complete_post(req, -ETIME, 0);
	}
	return HRTIMER_NORESTART;
}
 6378
 6379static void io_queue_linked_timeout(struct io_kiocb *req)
 6380{
 6381	struct io_ring_ctx *ctx = req->ctx;
 6382
 6383	spin_lock_irq(&ctx->completion_lock);
 6384	/*
 6385	 * If the back reference is NULL, then our linked request finished
 6386	 * before we got a chance to setup the timer
 6387	 */
 6388	if (req->timeout.head) {
 6389		struct io_timeout_data *data = req->async_data;
 6390
 6391		data->timer.function = io_link_timeout_fn;
 6392		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
 6393				data->mode);
 6394	}
 6395	spin_unlock_irq(&ctx->completion_lock);
 6396	/* drop submission reference */
 6397	io_put_req(req);
 6398}
 6399
 6400static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 6401{
 6402	struct io_kiocb *nxt = req->link;
 6403
 6404	if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
 6405	    nxt->opcode != IORING_OP_LINK_TIMEOUT)
 6406		return NULL;
 6407
 6408	nxt->timeout.head = req;
 6409	nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
 6410	req->flags |= REQ_F_LINK_TIMEOUT;
 6411	return nxt;
 6412}
 6413
/*
 * Issue @req inline, non-blocking, and deal with the outcome:
 *  - success: batch the completion or drop the submission reference;
 *  - -EAGAIN without REQ_F_NOWAIT: arm a poll handler or punt to async work;
 *  - anything else: fail the request with that error.
 * A linked timeout prepared for @req is queued once the issue attempt is done.
 */
static void __io_queue_sqe(struct io_kiocb *req)
{
	struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
	int ret;

	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);

	if (likely(!ret)) {
		/* drop submission reference */
		if (req->flags & REQ_F_COMPLETE_INLINE) {
			struct io_ring_ctx *ctx = req->ctx;
			struct io_comp_state *cs = &ctx->submit_state.comp;

			/* stash for batched completion, flushing once the array is full */
			cs->reqs[cs->nr++] = req;
			if (cs->nr == ARRAY_SIZE(cs->reqs))
				io_submit_flush_completions(cs, ctx);
		} else {
			io_put_req(req);
		}
	} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
		/*
		 * We async punt it if the file wasn't marked NOWAIT, or if the file
		 * doesn't support non-blocking read/write attempts
		 */
		if (!io_arm_poll_handler(req)) {
			/*
			 * Queued up for async execution, worker will release
			 * submit reference when the iocb is actually submitted.
			 */
			io_queue_async_work(req);
		}
	} else {
		io_req_complete_failed(req, ret);
	}
	if (linked_timeout)
		io_queue_linked_timeout(linked_timeout);
}
 6451
 6452static void io_queue_sqe(struct io_kiocb *req)
 6453{
 6454	int ret;
 6455
 6456	ret = io_req_defer(req);
 6457	if (ret) {
 6458		if (ret != -EIOCBQUEUED) {
 6459fail_req:
 6460			io_req_complete_failed(req, ret);
 6461		}
 6462	} else if (req->flags & REQ_F_FORCE_ASYNC) {
 6463		ret = io_req_prep_async(req);
 6464		if (unlikely(ret))
 6465			goto fail_req;
 6466		io_queue_async_work(req);
 6467	} else {
 6468		__io_queue_sqe(req);
 6469	}
 6470}
 6471
 6472/*
 6473 * Check SQE restrictions (opcode and flags).
 6474 *
 6475 * Returns 'true' if SQE is allowed, 'false' otherwise.
 6476 */
 6477static inline bool io_check_restriction(struct io_ring_ctx *ctx,
 6478					struct io_kiocb *req,
 6479					unsigned int sqe_flags)
 6480{
 6481	if (!ctx->restricted)
 6482		return true;
 6483
 6484	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
 6485		return false;
 6486
 6487	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
 6488	    ctx->restrictions.sqe_flags_required)
 6489		return false;
 6490
 6491	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
 6492			  ctx->restrictions.sqe_flags_required))
 6493		return false;
 6494
 6495	return true;
 6496}
 6497
 6498static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 6499		       const struct io_uring_sqe *sqe)
 6500{
 6501	struct io_submit_state *state;
 6502	unsigned int sqe_flags;
 6503	int personality, ret = 0;
 6504
 6505	req->opcode = READ_ONCE(sqe->opcode);
 6506	/* same numerical values with corresponding REQ_F_*, safe to copy */
 6507	req->flags = sqe_flags = READ_ONCE(sqe->flags);
 6508	req->user_data = READ_ONCE(sqe->user_data);
 6509	req->async_data = NULL;
 6510	req->file = NULL;
 6511	req->ctx = ctx;
 6512	req->link = NULL;
 6513	req->fixed_rsrc_refs = NULL;
 6514	/* one is dropped after submission, the other at completion */
 6515	atomic_set(&req->refs, 2);
 6516	req->task = current;
 6517	req->result = 0;
 6518	req->work.creds = NULL;
 6519
 6520	/* enforce forwards compatibility on users */
 6521	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
 6522		return -EINVAL;
 6523	if (unlikely(req->opcode >= IORING_OP_LAST))
 6524		return -EINVAL;
 6525	if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
 6526		return -EACCES;
 6527
 6528	if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
 6529	    !io_op_defs[req->opcode].buffer_select)
 6530		return -EOPNOTSUPP;
 6531
 6532	personality = READ_ONCE(sqe->personality);
 6533	if (personality) {
 6534		req->work.creds = xa_load(&ctx->personalities, personality);
 6535		if (!req->work.creds)
 6536			return -EINVAL;
 6537		get_cred(req->work.creds);
 6538	}
 6539	state = &ctx->submit_state;
 6540
 6541	/*
 6542	 * Plug now if we have more than 1 IO left after this, and the target
 6543	 * is potentially a read/write to block based storage.
 6544	 */
 6545	if (!state->plug_started && state->ios_left > 1 &&
 6546	    io_op_defs[req->opcode].plug) {
 6547		blk_start_plug(&state->plug);
 6548		state->plug_started = true;
 6549	}
 6550
 6551	if (io_op_defs[req->opcode].needs_file) {
 6552		bool fixed = req->flags & REQ_F_FIXED_FILE;
 6553
 6554		req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
 6555		if (unlikely(!req->file))
 6556			ret = -EBADF;
 6557	}
 6558
 6559	state->ios_left--;
 6560	return ret;
 6561}
 6562
/*
 * Initialise and prepare @req from @sqe, then either queue it or attach it
 * to the link chain currently being assembled in ctx->submit_state.link.
 *
 * Returns 0 on success. On an init/prep failure the request is completed
 * with the error, any link chain under construction is failed with
 * -ECANCELED, and the error is returned.
 */
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
			 const struct io_uring_sqe *sqe)
{
	struct io_submit_link *link = &ctx->submit_state.link;
	int ret;

	ret = io_init_req(ctx, req, sqe);
	if (unlikely(ret)) {
fail_req:
		if (link->head) {
			/* fail even hard links since we don't submit */
			link->head->flags |= REQ_F_FAIL_LINK;
			io_req_complete_failed(link->head, -ECANCELED);
			link->head = NULL;
		}
		io_req_complete_failed(req, ret);
		return ret;
	}
	ret = io_req_prep(req, sqe);
	if (unlikely(ret))
		goto fail_req;

	/* don't need @sqe from now on */
	trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
				true, ctx->flags & IORING_SETUP_SQPOLL);

	/*
	 * If we already have a head request, queue this one for async
	 * submittal once the head completes. If we don't have a head but
	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
	 * submitted sync once the chain is complete. If none of those
	 * conditions are true (normal request), then just queue it.
	 */
	if (link->head) {
		struct io_kiocb *head = link->head;

		/*
		 * Taking sequential execution of a link, draining both sides
		 * of the link also fulfils IOSQE_IO_DRAIN semantics for all
		 * requests in the link. So, it drains the head and the
		 * next after the link request. The last one is done via
		 * drain_next flag to persist the effect across calls.
		 */
		if (req->flags & REQ_F_IO_DRAIN) {
			head->flags |= REQ_F_IO_DRAIN;
			ctx->drain_next = 1;
		}
		/* linked requests are not issued inline, so prepare async state now */
		ret = io_req_prep_async(req);
		if (unlikely(ret))
			goto fail_req;
		trace_io_uring_link(ctx, req, head);
		/* append to the tail of the chain */
		link->last->link = req;
		link->last = req;

		/* last request of a link, enqueue the link */
		if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
			io_queue_sqe(head);
			link->head = NULL;
		}
	} else {
		/* carry over a drain requested by the previous link chain */
		if (unlikely(ctx->drain_next)) {
			req->flags |= REQ_F_IO_DRAIN;
			ctx->drain_next = 0;
		}
		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
			/* start a new chain with this request as head and tail */
			link->head = req;
			link->last = req;
		} else {
			io_queue_sqe(req);
		}
	}

	return 0;
}
 6637
 6638/*
 6639 * Batched submission is done, ensure local IO is flushed out.
 6640 */
 6641static void io_submit_state_end(struct io_submit_state *state,
 6642				struct io_ring_ctx *ctx)
 6643{
 6644	if (state->link.head)
 6645		io_queue_sqe(state->link.head);
 6646	if (state->comp.nr)
 6647		io_submit_flush_completions(&state->comp, ctx);
 6648	if (state->plug_started)
 6649		blk_finish_plug(&state->plug);
 6650	io_state_file_put(state);
 6651}
 6652
 6653/*
 6654 * Start submission side cache.
 6655 */
 6656static void io_submit_state_start(struct io_submit_state *state,
 6657				  unsigned int max_ios)
 6658{
 6659	state->plug_started = false;
 6660	state->ios_left = max_ios;
 6661	/* set only head, no need to init link_last in advance */
 6662	state->link.head = NULL;
 6663}
 6664
 6665static void io_commit_sqring(struct io_ring_ctx *ctx)
 6666{
 6667	struct io_rings *rings = ctx->rings;
 6668
 6669	/*
 6670	 * Ensure any loads from the SQEs are done at this point,
 6671	 * since once we write the new head, the application could
 6672	 * write new data to them.
 6673	 */
 6674	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
 6675}
 6676
 6677/*
 6678 * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
 6679 * that is mapped by userspace. This means that care needs to be taken to
 6680 * ensure that reads are stable, as we cannot rely on userspace always
 6681 * being a good citizen. If members of the sqe are validated and then later
 6682 * used, it's important that those reads are done through READ_ONCE() to
 6683 * prevent a re-load down the line.
 6684 */
 6685static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 6686{
 6687	u32 *sq_array = ctx->sq_array;
 6688	unsigned head;
 6689
 6690	/*
 6691	 * The cached sq head (or cq tail) serves two purposes:
 6692	 *
 6693	 * 1) allows us to batch the cost of updating the user visible
 6694	 *    head updates.
 6695	 * 2) allows the kernel side to track the head on its own, even
 6696	 *    though the application is the one updating it.
 6697	 */
 6698	head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]);
 6699	if (likely(head < ctx->sq_entries))
 6700		return &ctx->sq_sqes[head];
 6701
 6702	/* drop invalid entries */
 6703	ctx->cached_sq_dropped++;
 6704	WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
 6705	return NULL;
 6706}
 6707
/*
 * Submit up to @nr SQEs from the ring.
 *
 * References for all @nr requests (ctx->refs, the task's inflight counter
 * and the task usage count) are taken up front; whatever ends up unused is
 * handed back after the loop.
 *
 * Returns the number of SQEs counted as submitted, or -EAGAIN if the ctx
 * references could not be taken or the very first request allocation failed.
 */
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
	int submitted = 0;

	/* make sure SQ entry isn't read before tail */
	nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));

	if (!percpu_ref_tryget_many(&ctx->refs, nr))
		return -EAGAIN;

	percpu_counter_add(&current->io_uring->inflight, nr);
	refcount_add(nr, &current->usage);
	io_submit_state_start(&ctx->submit_state, nr);

	while (submitted < nr) {
		const struct io_uring_sqe *sqe;
		struct io_kiocb *req;

		req = io_alloc_req(ctx);
		if (unlikely(!req)) {
			/* only report -EAGAIN if nothing at all was submitted */
			if (!submitted)
				submitted = -EAGAIN;
			break;
		}
		sqe = io_get_sqe(ctx);
		if (unlikely(!sqe)) {
			/* invalid SQ entry: give the unused request back and stop */
			kmem_cache_free(req_cachep, req);
			break;
		}
		/* will complete beyond this point, count as submitted */
		submitted++;
		if (io_submit_sqe(ctx, req, sqe))
			break;
	}

	if (unlikely(submitted != nr)) {
		/* return the references that were taken but not consumed */
		int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
		struct io_uring_task *tctx = current->io_uring;
		int unused = nr - ref_used;

		percpu_ref_put_many(&ctx->refs, unused);
		percpu_counter_sub(&tctx->inflight, unused);
		put_task_struct_many(current, unused);
	}

	io_submit_state_end(&ctx->submit_state, ctx);
	/* Commit SQ ring head once we've consumed and submitted all SQEs */
	io_commit_sqring(ctx);

	return submitted;
}
 6759
 6760static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
 6761{
 6762	/* Tell userspace we may need a wakeup call */
 6763	spin_lock_irq(&ctx->completion_lock);
 6764	ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
 6765	spin_unlock_irq(&ctx->completion_lock);
 6766}
 6767
 6768static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
 6769{
 6770	spin_lock_irq(&ctx->completion_lock);
 6771	ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
 6772	spin_unlock_irq(&ctx->completion_lock);
 6773}
 6774
 6775static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 6776{
 6777	unsigned int to_submit;
 6778	int ret = 0;
 6779
 6780	to_submit = io_sqring_entries(ctx);
 6781	/* if we're handling multiple rings, cap submit size for fairness */
 6782	if (cap_entries && to_submit > 8)
 6783		to_submit = 8;
 6784
 6785	if (!list_empty(&ctx->iopoll_list) || to_submit) {
 6786		unsigned nr_events = 0;
 6787
 6788		mutex_lock(&ctx->uring_lock);
 6789		if (!list_empty(&ctx->iopoll_list))
 6790			io_do_iopoll(ctx, &nr_events, 0);
 6791
 6792		/*
 6793		 * Don't submit if refs are dying, good for io_uring_register(),
 6794		 * but also it is relied upon by io_ring_exit_work()
 6795		 */
 6796		if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
 6797		    !(ctx->flags & IORING_SETUP_R_DISABLED))
 6798			ret = io_submit_sqes(ctx, to_submit);
 6799		mutex_unlock(&ctx->uring_lock);
 6800	}
 6801
 6802	if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
 6803		wake_up(&ctx->sqo_sq_wait);
 6804
 6805	return ret;
 6806}
 6807
 6808static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
 6809{
 6810	struct io_ring_ctx *ctx;
 6811	unsigned sq_thread_idle = 0;
 6812
 6813	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 6814		sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
 6815	sqd->sq_thread_idle = sq_thread_idle;
 6816}
 6817
/*
 * Main loop of the SQPOLL kernel thread serving every ring on sqd->ctx_list.
 *
 * Holds sqd->lock except while handling signals/parking or sleeping. Each
 * iteration runs __io_sq_thread() for all rings; while there is work, or the
 * idle period has not yet expired, it keeps spinning. Otherwise it sets
 * IORING_SQ_NEED_WAKEUP on the rings and sleeps on sqd->wait. Leaves the
 * loop on IO_SQ_THREAD_SHOULD_STOP or when get_signal() reports a handled
 * signal, then cancels outstanding work and exits the task.
 */
static int io_sq_thread(void *data)
{
	struct io_sq_data *sqd = data;
	struct io_ring_ctx *ctx;
	unsigned long timeout = 0;
	char buf[TASK_COMM_LEN];
	DEFINE_WAIT(wait);

	snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
	set_task_comm(current, buf);

	/* pin to the requested CPU if any, then forbid later affinity changes */
	if (sqd->sq_cpu != -1)
		set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
	else
		set_cpus_allowed_ptr(current, cpu_online_mask);
	current->flags |= PF_NO_SETAFFINITY;

	mutex_lock(&sqd->lock);
	/* a user may have exited before the thread started */
	io_run_task_work_head(&sqd->park_task_work);

	while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) {
		int ret;
		bool cap_entries, sqt_spin, needs_sched;

		/*
		 * Park request or pending signal: release sqd->lock so the
		 * other side can take it, handle the signal, then re-take the
		 * lock and run the queued task work.
		 */
		if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
		    signal_pending(current)) {
			bool did_sig = false;

			mutex_unlock(&sqd->lock);
			if (signal_pending(current)) {
				struct ksignal ksig;

				did_sig = get_signal(&ksig);
			}
			cond_resched();
			mutex_lock(&sqd->lock);
			io_run_task_work();
			io_run_task_work_head(&sqd->park_task_work);
			if (did_sig)
				break;
			timeout = jiffies + sqd->sq_thread_idle;
			continue;
		}
		sqt_spin = false;
		/* with more than one ring attached, cap per-ring submission batches */
		cap_entries = !list_is_singular(&sqd->ctx_list);
		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
			const struct cred *creds = NULL;

			/* submit with the credentials recorded for this ring */
			if (ctx->sq_creds != current_cred())
				creds = override_creds(ctx->sq_creds);
			ret = __io_sq_thread(ctx, cap_entries);
			if (creds)
				revert_creds(creds);
			/* keep spinning if anything was submitted or iopoll is pending */
			if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
				sqt_spin = true;
		}

		/* busy, or still inside the idle window: go around again */
		if (sqt_spin || !time_after(jiffies, timeout)) {
			io_run_task_work();
			cond_resched();
			if (sqt_spin)
				timeout = jiffies + sqd->sq_thread_idle;
			continue;
		}

		prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
		if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
			/* tell userspace a wakeup is needed before re-checking for work */
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_ring_set_wakeup_flag(ctx);

			/* re-check every ring; only sleep if none has pending work */
			needs_sched = true;
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
				if ((ctx->flags & IORING_SETUP_IOPOLL) &&
				    !list_empty_careful(&ctx->iopoll_list)) {
					needs_sched = false;
					break;
				}
				if (io_sqring_entries(ctx)) {
					needs_sched = false;
					break;
				}
			}

			if (needs_sched) {
				/* sleep without holding sqd->lock */
				mutex_unlock(&sqd->lock);
				schedule();
				mutex_lock(&sqd->lock);
			}
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_ring_clear_wakeup_flag(ctx);
		}

		finish_wait(&sqd->wait, &wait);
		io_run_task_work_head(&sqd->park_task_work);
		timeout = jiffies + sqd->sq_thread_idle;
	}

	/* shutting down: cancel outstanding work and detach from the sqd */
	io_uring_cancel_sqpoll(sqd);
	sqd->thread = NULL;
	/* leave the wakeup flag set on every ring now that the thread is gone */
	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
		io_ring_set_wakeup_flag(ctx);
	io_run_task_work();
	io_run_task_work_head(&sqd->park_task_work);
	mutex_unlock(&sqd->lock);

	complete(&sqd->exited);
	do_exit(0);
}
 6927
/* Per-waiter state used by io_cqring_wait() and its wake callback. */
struct io_wait_queue {
	/* wait queue entry; io_wake_function() recovers this struct from it */
	struct wait_queue_entry wq;
	/* ring being waited on */
	struct io_ring_ctx *ctx;
	/* number of CQ events the waiter asked for */
	unsigned to_wait;
	/* value of ctx->cq_timeouts sampled when the wait started */
	unsigned nr_timeouts;
};
 6934
 6935static inline bool io_should_wake(struct io_wait_queue *iowq)
 6936{
 6937	struct io_ring_ctx *ctx = iowq->ctx;
 6938
 6939	/*
 6940	 * Wake up if we have enough events, or if a timeout occurred since we
 6941	 * started waiting. For timeouts, we always want to return to userspace,
 6942	 * regardless of event count.
 6943	 */
 6944	return io_cqring_events(ctx) >= iowq->to_wait ||
 6945			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
 6946}
 6947
 6948static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
 6949			    int wake_flags, void *key)
 6950{
 6951	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
 6952							wq);
 6953
 6954	/*
 6955	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
 6956	 * the task, and the next invocation will do it.
 6957	 */
 6958	if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow))
 6959		return autoremove_wake_function(curr, mode, wake_flags, key);
 6960	return -1;
 6961}
 6962
 6963static int io_run_task_work_sig(void)
 6964{
 6965	if (io_run_task_work())
 6966		return 1;
 6967	if (!signal_pending(current))
 6968		return 0;
 6969	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
 6970		return -ERESTARTSYS;
 6971	return -EINTR;
 6972}
 6973
 6974/* when returns >0, the caller should retry */
 6975static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 6976					  struct io_wait_queue *iowq,
 6977					  signed long *timeout)
 6978{
 6979	int ret;
 6980
 6981	/* make sure we run task_work before checking for signals */
 6982	ret = io_run_task_work_sig();
 6983	if (ret || io_should_wake(iowq))
 6984		return ret;
 6985	/* let the caller flush overflows, retry */
 6986	if (test_bit(0, &ctx->cq_check_overflow))
 6987		return 1;
 6988
 6989	*timeout = schedule_timeout(*timeout);
 6990	return !*timeout ? -ETIME : 1;
 6991}
 6992
 6993/*
 6994 * Wait until events become available, if we don't already have some. The
 6995 * application must reap them itself, as they reside on the shared cq ring.
 6996 */
 6997static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 6998			  const sigset_t __user *sig, size_t sigsz,
 6999			  struct __kernel_timespec __user *uts)
 7000{
 7001	struct io_wait_queue iowq = {
 7002		.wq = {
 7003			.private	= current,
 7004			.func		= io_wake_function,
 7005			.entry		= LIST_HEAD_INIT(iowq.wq.entry),
 7006		},
 7007		.ctx		= ctx,
 7008		.to_wait	= min_events,
 7009	};
 7010	struct io_rings *rings = ctx->rings;
 7011	signed long timeout = MAX_SCHEDULE_TIMEOUT;
 7012	int ret;
 7013
 7014	do {
 7015		io_cqring_overflow_flush(ctx, false);
 7016		if (io_cqring_events(ctx) >= min_events)
 7017			return 0;
 7018		if (!io_run_task_work())
 7019			break;
 7020	} while (1);
 7021
 7022	if (sig) {
 7023#ifdef CONFIG_COMPAT
 7024		if (in_compat_syscall())
 7025			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
 7026						      sigsz);
 7027		else
 7028#endif
 7029			ret = set_user_sigmask(sig, sigsz);
 7030
 7031		if (ret)
 7032			return ret;
 7033	}
 7034
 7035	if (uts) {
 7036		struct timespec64 ts;
 7037
 7038		if (get_timespec64(&ts, uts))
 7039			return -EFAULT;
 7040		timeout = timespec64_to_jiffies(&ts);
 7041	}
 7042
 7043	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
 7044	trace_io_uring_cqring_wait(ctx, min_events);
 7045	do {
 7046		/* if we can't even flush overflow, don't wait for more */
 7047		if (!io_cqring_overflow_flush(ctx, false)) {
 7048			ret = -EBUSY;
 7049			break;
 7050		}
 7051		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
 7052						TASK_INTERRUPTIBLE);
 7053		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
 7054		finish_wait(&ctx->wait, &iowq.wq);
 7055		cond_resched();
 7056	} while (ret > 0);
 7057
 7058	restore_saved_sigmask_unless(ret == -EINTR);
 7059
 7060	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 7061}
 7062
 7063static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
 7064{
 7065	unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
 7066
 7067	for (i = 0; i < nr_tables; i++)
 7068		kfree(table->files[i]);
 7069	kfree(table->files);
 7070	table->files = NULL;
 7071}
 7072
 7073static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
 7074{
 7075	spin_lock_bh(&ctx->rsrc_ref_lock);
 7076}
 7077
 7078static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
 7079{
 7080	spin_unlock_bh(&ctx->rsrc_ref_lock);
 7081}
 7082
 7083static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
 7084{
 7085	percpu_ref_exit(&ref_node->refs);
 7086	kfree(ref_node);
 7087}
 7088
 7089static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 7090				struct io_rsrc_data *data_to_kill)
 7091{
 7092	WARN_ON_ONCE(!ctx->rsrc_backup_node);
 7093	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
 7094
 7095	if (data_to_kill) {
 7096		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
 7097
 7098		rsrc_node->rsrc_data = data_to_kill;
 7099		io_rsrc_ref_lock(ctx);
 7100		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
 7101		io_rsrc_ref_unlock(ctx);
 7102
 7103		atomic_inc(&data_to_kill->refs);
 7104		percpu_ref_kill(&rsrc_node->refs);
 7105		ctx->rsrc_node = NULL;
 7106	}
 7107
 7108	if (!ctx->rsrc_node) {
 7109		ctx->rsrc_node = ctx->rsrc_backup_node;
 7110		ctx->rsrc_backup_node = NULL;
 7111	}
 7112}
 7113
 7114static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
 7115{
 7116	if (ctx->rsrc_backup_node)
 7117		return 0;
 7118	ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
 7119	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
 7120}
 7121
/*
 * Wait until nothing references @data any more.
 *
 * Each round switches to a fresh rsrc node, drops the initial reference on
 * @data and waits (interruptibly) for data->done. If the wait is
 * interrupted, the initial reference is re-taken, ->uring_lock is dropped
 * to run task work and check signals, and the round is retried.
 *
 * Returns 0 when quiesced, -ENXIO if a quiesce is already in progress, or a
 * negative error from node allocation or signal handling.
 */
static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
{
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	data->quiesce = true;
	do {
		ret = io_rsrc_node_switch_start(ctx);
		if (ret)
			break;
		io_rsrc_node_switch(ctx, data);

		/* kill initial ref, already quiesced if zero */
		if (atomic_dec_and_test(&data->refs))
			break;
		flush_delayed_work(&ctx->rsrc_put_work);
		ret = wait_for_completion_interruptible(&data->done);
		if (!ret)
			break;

		/* interrupted: restore the initial ref before retrying */
		atomic_inc(&data->refs);
		/* wait for all works potentially completing data->done */
		flush_delayed_work(&ctx->rsrc_put_work);
		reinit_completion(&data->done);

		/* run task work and check signals without holding ->uring_lock */
		mutex_unlock(&ctx->uring_lock);
		ret = io_run_task_work_sig();
		mutex_lock(&ctx->uring_lock);
	} while (ret >= 0);
	data->quiesce = false;

	return ret;
}
 7158
 7159static void io_rsrc_data_free(struct io_rsrc_data *data)
 7160{
 7161	kvfree(data->tags);
 7162	kfree(data);
 7163}
 7164
 7165static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx,
 7166					       rsrc_put_fn *do_put,
 7167					       unsigned nr)
 7168{
 7169	struct io_rsrc_data *data;
 7170
 7171	data = kzalloc(sizeof(*data), GFP_KERNEL);
 7172	if (!data)
 7173		return NULL;
 7174
 7175	data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL);
 7176	if (!data->tags) {
 7177		kfree(data);
 7178		return NULL;
 7179	}
 7180
 7181	atomic_set(&data->refs, 1);
 7182	data->ctx = ctx;
 7183	data->do_put = do_put;
 7184	init_completion(&data->done);
 7185	return data;
 7186}
 7187
 7188static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 7189{
 7190#if defined(CONFIG_UNIX)
 7191	if (ctx->ring_sock) {
 7192		struct sock *sock = ctx->ring_sock->sk;
 7193		struct sk_buff *skb;
 7194
 7195		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
 7196			kfree_skb(skb);
 7197	}
 7198#else
 7199	int i;
 7200
 7201	for (i = 0; i < ctx->nr_user_files; i++) {
 7202		struct file *file;
 7203
 7204		file = io_file_from_index(ctx, i);
 7205		if (file)
 7206			fput(file);
 7207	}
 7208#endif
 7209	io_free_file_tables(&ctx->file_table, ctx->nr_user_files);
 7210	io_rsrc_data_free(ctx->file_data);
 7211	ctx->file_data = NULL;
 7212	ctx->nr_user_files = 0;
 7213}
 7214
 7215static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 7216{
 7217	int ret;
 7218
 7219	if (!ctx->file_data)
 7220		return -ENXIO;
 7221	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
 7222	if (!ret)
 7223		__io_sqe_files_unregister(ctx);
 7224	return ret;
 7225}
 7226
/*
 * Undo io_sq_thread_park(): drop this caller's park request and release
 * sqd->lock. Must not be called from the SQ thread itself.
 */
static void io_sq_thread_unpark(struct io_sq_data *sqd)
	__releases(&sqd->lock)
{
	WARN_ON_ONCE(sqd->thread == current);

	/*
	 * Do the dance but not conditional clear_bit() because it'd race with
	 * other threads incrementing park_pending and setting the bit.
	 */
	clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	/* other park requests are still pending: keep the thread parked */
	if (atomic_dec_return(&sqd->park_pending))
		set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	mutex_unlock(&sqd->lock);
}
 7241
/*
 * Ask the SQ thread to park and take sqd->lock; returns with the lock held,
 * to be released by io_sq_thread_unpark(). Must not be called from the SQ
 * thread itself.
 */
static void io_sq_thread_park(struct io_sq_data *sqd)
	__acquires(&sqd->lock)
{
	WARN_ON_ONCE(sqd->thread == current);

	/* register the request before setting the bit the thread polls */
	atomic_inc(&sqd->park_pending);
	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	mutex_lock(&sqd->lock);
	/* kick the thread in case it is sleeping */
	if (sqd->thread)
		wake_up_process(sqd->thread);
}
 7253
 7254static void io_sq_thread_stop(struct io_sq_data *sqd)
 7255{
 7256	WARN_ON_ONCE(sqd->thread == current);
 7257	WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
 7258
 7259	set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
 7260	mutex_lock(&sqd->lock);
 7261	if (sqd->thread)
 7262		wake_up_process(sqd->thread);
 7263	mutex_unlock(&sqd->lock);
 7264	wait_for_completion(&sqd->exited);
 7265}
 7266
 7267static void io_put_sq_data(struct io_sq_data *sqd)
 7268{
 7269	if (refcount_dec_and_test(&sqd->refs)) {
 7270		WARN_ON_ONCE(atomic_read(&sqd->park_pending));
 7271
 7272		io_sq_thread_stop(sqd);
 7273		kfree(sqd);
 7274	}
 7275}
 7276
 7277static void io_sq_thread_finish(struct io_ring_ctx *ctx)
 7278{
 7279	struct io_sq_data *sqd = ctx->sq_data;
 7280
 7281	if (sqd) {
 7282		io_sq_thread_park(sqd);
 7283		list_del_init(&ctx->sqd_list);
 7284		io_sqd_update_thread_idle(sqd);
 7285		io_sq_thread_unpark(sqd);
 7286
 7287		io_put_sq_data(sqd);
 7288		ctx->sq_data = NULL;
 7289	}
 7290}
 7291
 7292static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
 7293{
 7294	struct io_ring_ctx *ctx_attach;
 7295	struct io_sq_data *sqd;
 7296	struct fd f;
 7297
 7298	f = fdget(p->wq_fd);
 7299	if (!f.file)
 7300		return ERR_PTR(-ENXIO);
 7301	if (f.file->f_op != &io_uring_fops) {
 7302		fdput(f);
 7303		return ERR_PTR(-EINVAL);
 7304	}
 7305
 7306	ctx_attach = f.file->private_data;
 7307	sqd = ctx_attach->sq_data;
 7308	if (!sqd) {
 7309		fdput(f);
 7310		return ERR_PTR(-EINVAL);
 7311	}
 7312	if (sqd->task_tgid != current->tgid) {
 7313		fdput(f);
 7314		return ERR_PTR(-EPERM);
 7315	}
 7316
 7317	refcount_inc(&sqd->refs);
 7318	fdput(f);
 7319	return sqd;
 7320}
 7321
 7322static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
 7323					 bool *attached)
 7324{
 7325	struct io_sq_data *sqd;
 7326
 7327	*attached = false;
 7328	if (p->flags & IORING_SETUP_ATTACH_WQ) {
 7329		sqd = io_attach_sq_data(p);
 7330		if (!IS_ERR(sqd)) {
 7331			*attached = true;
 7332			return sqd;
 7333		}
 7334		/* fall through for EPERM case, setup new sqd/task */
 7335		if (PTR_ERR(sqd) != -EPERM)
 7336			return sqd;
 7337	}
 7338
 7339	sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
 7340	if (!sqd)
 7341		return ERR_PTR(-ENOMEM);
 7342
 7343	atomic_set(&sqd->park_pending, 0);
 7344	refcount_set(&sqd->refs, 1);
 7345	INIT_LIST_HEAD(&sqd->ctx_list);
 7346	mutex_init(&sqd->lock);
 7347	init_waitqueue_head(&sqd->wait);
 7348	init_completion(&sqd->exited);
 7349	return sqd;
 7350}
 7351
 7352#if defined(CONFIG_UNIX)
 7353/*
 7354 * Ensure the UNIX gc is aware of our file set, so we are certain that
 7355 * the io_uring can be safely unregistered on process exit, even if we have
 7356 * loops in the file referencing.
 7357 */
 7358static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
 7359{
 7360	struct sock *sk = ctx->ring_sock->sk;
 7361	struct scm_fp_list *fpl;
 7362	struct sk_buff *skb;
 7363	int i, nr_files;
 7364
 7365	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
 7366	if (!fpl)
 7367		return -ENOMEM;
 7368
 7369	skb = alloc_skb(0, GFP_KERNEL);
 7370	if (!skb) {
 7371		kfree(fpl);
 7372		return -ENOMEM;
 7373	}
 7374
 7375	skb->sk = sk;
 7376
 7377	nr_files = 0;
 7378	fpl->user = get_uid(current_user());
 7379	for (i = 0; i < nr; i++) {
 7380		struct file *file = io_file_from_index(ctx, i + offset);
 7381
 7382		if (!file)
 7383			continue;
 7384		fpl->fp[nr_files] = get_file(file);
 7385		unix_inflight(fpl->user, fpl->fp[nr_files]);
 7386		nr_files++;
 7387	}
 7388
 7389	if (nr_files) {
 7390		fpl->max = SCM_MAX_FD;
 7391		fpl->count = nr_files;
 7392		UNIXCB(skb).fp = fpl;
 7393		skb->destructor = unix_destruct_scm;
 7394		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
 7395		skb_queue_head(&sk->sk_receive_queue, skb);
 7396
 7397		for (i = 0; i < nr_files; i++)
 7398			fput(fpl->fp[i]);
 7399	} else {
 7400		kfree_skb(skb);
 7401		kfree(fpl);
 7402	}
 7403
 7404	return 0;
 7405}
 7406
 7407/*
 7408 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
 7409 * causes regular reference counting to break down. We rely on the UNIX
 7410 * garbage collection to take care of this problem for us.
 7411 */
 7412static int io_sqe_files_scm(struct io_ring_ctx *ctx)
 7413{
 7414	unsigned left, total;
 7415	int ret = 0;
 7416
 7417	total = 0;
 7418	left = ctx->nr_user_files;
 7419	while (left) {
 7420		unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
 7421
 7422		ret = __io_sqe_files_scm(ctx, this_files, total);
 7423		if (ret)
 7424			break;
 7425		left -= this_files;
 7426		total += this_files;
 7427	}
 7428
 7429	if (!ret)
 7430		return 0;
 7431
 7432	while (total < ctx->nr_user_files) {
 7433		struct file *file = io_file_from_index(ctx, total);
 7434
 7435		if (file)
 7436			fput(file);
 7437		total++;
 7438	}
 7439
 7440	return ret;
 7441}
 7442#else
 7443static int io_sqe_files_scm(struct io_ring_ctx *ctx)
 7444{
 7445	return 0;
 7446}
 7447#endif
 7448
 7449static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
 7450{
 7451	unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
 7452
 7453	table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL);
 7454	if (!table->files)
 7455		return false;
 7456
 7457	for (i = 0; i < nr_tables; i++) {
 7458		unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE);
 7459
 7460		table->files[i] = kcalloc(this_files, sizeof(*table->files[i]),
 7461					GFP_KERNEL);
 7462		if (!table->files[i])
 7463			break;
 7464		nr_files -= this_files;
 7465	}
 7466
 7467	if (i == nr_tables)
 7468		return true;
 7469
 7470	io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE);
 7471	return false;
 7472}
 7473
/*
 * Drop the reference that a registered-file slot held on prsrc->file.
 *
 * With CONFIG_UNIX the file is also recorded in the fp list of one of the
 * skbs sitting on the ring socket's receive queue. That skb is searched for,
 * the file is removed from its fp array (and the skb freed if the array
 * became empty), and only then is the file put. Without CONFIG_UNIX this is
 * a plain fput().
 */
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	/* private list for skbs that were dequeued and must be put back */
	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			/* close the gap left by the removed entry */
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
						left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			/* file == NULL doubles as the "found it" marker below */
			file = NULL;
			break;
		}

		if (!file)
			break;

		/* not in this skb: park it and look at the next one */
		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	/* return every parked skb to the tail of the receive queue */
	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}
 7536
 7537static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 7538{
 7539	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
 7540	struct io_ring_ctx *ctx = rsrc_data->ctx;
 7541	struct io_rsrc_put *prsrc, *tmp;
 7542
 7543	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
 7544		list_del(&prsrc->list);
 7545
 7546		if (prsrc->tag) {
 7547			bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
 7548			unsigned long flags;
 7549
 7550			io_ring_submit_lock(ctx, lock_ring);
 7551			spin_lock_irqsave(&ctx->completion_lock, flags);
 7552			io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
 7553			ctx->cq_extra++;
 7554			io_commit_cqring(ctx);
 7555			spin_unlock_irqrestore(&ctx->completion_lock, flags);
 7556			io_cqring_ev_posted(ctx);
 7557			io_ring_submit_unlock(ctx, lock_ring);
 7558		}
 7559
 7560		rsrc_data->do_put(ctx, prsrc);
 7561		kfree(prsrc);
 7562	}
 7563
 7564	io_rsrc_node_destroy(ref_node);
 7565	if (atomic_dec_and_test(&rsrc_data->refs))
 7566		complete(&rsrc_data->done);
 7567}
 7568
 7569static void io_rsrc_put_work(struct work_struct *work)
 7570{
 7571	struct io_ring_ctx *ctx;
 7572	struct llist_node *node;
 7573
 7574	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
 7575	node = llist_del_all(&ctx->rsrc_put_llist);
 7576
 7577	while (node) {
 7578		struct io_rsrc_node *ref_node;
 7579		struct llist_node *next = node->next;
 7580
 7581		ref_node = llist_entry(node, struct io_rsrc_node, llist);
 7582		__io_rsrc_put_work(ref_node);
 7583		node = next;
 7584	}
 7585}
 7586
 7587static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
 7588{
 7589	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
 7590	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
 7591	bool first_add = false;
 7592
 7593	io_rsrc_ref_lock(ctx);
 7594	node->done = true;
 7595
 7596	while (!list_empty(&ctx->rsrc_ref_list)) {
 7597		node = list_first_entry(&ctx->rsrc_ref_list,
 7598					    struct io_rsrc_node, node);
 7599		/* recycle ref nodes in order */
 7600		if (!node->done)
 7601			break;
 7602		list_del(&node->node);
 7603		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
 7604	}
 7605	io_rsrc_ref_unlock(ctx);
 7606
 7607	if (first_add)
 7608		mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
 7609}
 7610
 7611static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
 7612{
 7613	struct io_rsrc_node *ref_node;
 7614
 7615	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
 7616	if (!ref_node)
 7617		return NULL;
 7618
 7619	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
 7620			    0, GFP_KERNEL)) {
 7621		kfree(ref_node);
 7622		return NULL;
 7623	}
 7624	INIT_LIST_HEAD(&ref_node->node);
 7625	INIT_LIST_HEAD(&ref_node->rsrc_list);
 7626	ref_node->done = false;
 7627	return ref_node;
 7628}
 7629
/*
 * Register a fixed file set with the ring.
 *
 * @arg is a user array of @nr_args s32 file descriptors. @tags, if non-NULL,
 * is a parallel user array of u64 tags. An fd of -1 leaves the slot empty
 * (sparse set) and must not carry a tag.
 *
 * Returns 0 on success. Fails with -EBUSY if file data is already registered,
 * -EINVAL for @nr_args == 0 or a tagged sparse slot, -EMFILE if @nr_args
 * exceeds IORING_MAX_FIXED_FILES, -ENOMEM on allocation failure, -EFAULT if
 * the user arrays can't be read, and -EBADF for an fd that can't be looked
 * up or that refers to an io_uring instance. Errors from
 * io_rsrc_node_switch_start() and io_sqe_files_scm() are passed through.
 */
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
				 unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;
	struct io_rsrc_data *file_data;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;

	file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args);
	if (!file_data)
		return -ENOMEM;
	ctx->file_data = file_data;
	ret = -ENOMEM;
	if (!io_alloc_file_tables(&ctx->file_table, nr_args))
		goto out_free;

	/*
	 * nr_user_files advances together with i, so the out_fput unwind
	 * only visits slots that were fully processed.
	 */
	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto out_fput;
		}
		/* allow sparse sets */
		if (fd == -1) {
			ret = -EINVAL;
			if (unlikely(tag))
				goto out_fput;
			continue;
		}

		file = fget(fd);
		/* ret stays -EBADF for the io_uring fd rejection below too */
		ret = -EBADF;
		if (unlikely(!file))
			goto out_fput;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (file->f_op == &io_uring_fops) {
			fput(file);
			goto out_fput;
		}
		ctx->file_data->tags[i] = tag;
		io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
	}

	ret = io_sqe_files_scm(ctx);
	if (ret) {
		__io_sqe_files_unregister(ctx);
		return ret;
	}

	io_rsrc_node_switch(ctx, NULL);
	return ret;
out_fput:
	/* drop the references of every file installed so far */
	for (i = 0; i < ctx->nr_user_files; i++) {
		file = io_file_from_index(ctx, i);
		if (file)
			fput(file);
	}
	io_free_file_tables(&ctx->file_table, nr_args);
	ctx->nr_user_files = 0;
out_free:
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	return ret;
}
 7714
/*
 * Account a single file, newly installed at table index @index, on the ring
 * socket's receive queue.
 *
 * With CONFIG_UNIX the file is appended to the fp list of the skb at the
 * head of the queue if that list has room; otherwise a new skb is created
 * through __io_sqe_files_scm(). Without CONFIG_UNIX nothing is done.
 *
 * Returns 0 on success or the error from __io_sqe_files_scm().
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
				int index)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sock->sk_receive_queue;
	struct sk_buff *skb;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		if (fpl->count < SCM_MAX_FD) {
			/*
			 * Take the skb off the queue and drop the queue lock
			 * while its fp list is extended, then requeue it at
			 * the head.
			 */
			__skb_unlink(skb, head);
			spin_unlock_irq(&head->lock);
			fpl->fp[fpl->count] = get_file(file);
			unix_inflight(fpl->user, fpl->fp[fpl->count]);
			fpl->count++;
			spin_lock_irq(&head->lock);
			__skb_queue_head(head, skb);
		} else {
			/* head skb is full: skb == NULL selects the fallback */
			skb = NULL;
		}
	}
	spin_unlock_irq(&head->lock);

	if (skb) {
		/* the fp list took its own reference through get_file() */
		fput(file);
		return 0;
	}

	return __io_sqe_files_scm(ctx, 1, index);
#else
	return 0;
#endif
}
 7757
 7758static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 7759				 struct io_rsrc_node *node, void *rsrc)
 7760{
 7761	struct io_rsrc_put *prsrc;
 7762
 7763	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
 7764	if (!prsrc)
 7765		return -ENOMEM;
 7766
 7767	prsrc->tag = data->tags[idx];
 7768	prsrc->rsrc = rsrc;
 7769	list_add(&prsrc->list, &node->rsrc_list);
 7770	return 0;
 7771}
 7772
/*
 * Update @nr_args slots of the registered file table, starting at
 * up->offset.
 *
 * up->data points to a user array of s32 fds and up->tags, if set, to a
 * parallel array of u64 tags. For each entry:
 *   - IORING_REGISTER_FILES_SKIP leaves the slot untouched,
 *   - -1 only clears the slot,
 *   - any other fd replaces the slot's file.
 * A tag may only accompany a real fd.
 *
 * Returns the number of entries processed before the first failure if that
 * number is non-zero; otherwise the error code (-ENXIO without a registered
 * file set, -EINVAL for a range outside the table or a tagged skip/clear
 * entry, -EFAULT, -EBADF, or an error from the helpers called below).
 */
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	struct file *file;
	int fd, i, err = 0;
	unsigned int done;
	bool needs_switch = false;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			/* queue the old file for removal on the current node */
			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
			err = io_queue_rsrc_removal(data, up->offset + done,
						    ctx->rsrc_node, file);
			if (err)
				break;
			file_slot->file_ptr = 0;
			needs_switch = true;
		}
		if (fd != -1) {
			file = fget(fd);
			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered. If
			 * UNIX isn't enabled, then this causes a reference
			 * cycle and this instance can never get freed. If UNIX
			 * is enabled we'll handle it just fine, but there's
			 * still no point in allowing a ring fd as it doesn't
			 * support regular read/write anyway.
			 */
			if (file->f_op == &io_uring_fops) {
				fput(file);
				err = -EBADF;
				break;
			}
			data->tags[up->offset + done] = tag;
			io_fixed_file_set(file_slot, file);
			err = io_sqe_file_register(ctx, file, i);
			if (err) {
				/* undo the install and drop the fget() ref */
				file_slot->file_ptr = 0;
				fput(file);
				break;
			}
		}
	}

	/* at least one removal was queued on the current rsrc node */
	if (needs_switch)
		io_rsrc_node_switch(ctx, data);
	return done ? done : err;
}
 7852
 7853static struct io_wq_work *io_free_work(struct io_wq_work *work)
 7854{
 7855	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 7856
 7857	req = io_put_req_find_next(req);
 7858	return req ? &req->work : NULL;
 7859}
 7860
 7861static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
 7862					struct task_struct *task)
 7863{
 7864	struct io_wq_hash *hash;
 7865	struct io_wq_data data;
 7866	unsigned int concurrency;
 7867
 7868	hash = ctx->hash_map;
 7869	if (!hash) {
 7870		hash = kzalloc(sizeof(*hash), GFP_KERNEL);
 7871		if (!hash)
 7872			return ERR_PTR(-ENOMEM);
 7873		refcount_set(&hash->refs, 1);
 7874		init_waitqueue_head(&hash->wait);
 7875		ctx->hash_map = hash;
 7876	}
 7877
 7878	data.hash = hash;
 7879	data.task = task;
 7880	data.free_work = io_free_work;
 7881	data.do_work = io_wq_submit_work;
 7882
 7883	/* Do QD, or 4 * CPUS, whatever is smallest */
 7884	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
 7885
 7886	return io_wq_create(concurrency, &data);
 7887}
 7888
 7889static int io_uring_alloc_task_context(struct task_struct *task,
 7890				       struct io_ring_ctx *ctx)
 7891{
 7892	struct io_uring_task *tctx;
 7893	int ret;
 7894
 7895	tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
 7896	if (unlikely(!tctx))
 7897		return -ENOMEM;
 7898
 7899	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
 7900	if (unlikely(ret)) {
 7901		kfree(tctx);
 7902		return ret;
 7903	}
 7904
 7905	tctx->io_wq = io_init_wq_offload(ctx, task);
 7906	if (IS_ERR(tctx->io_wq)) {
 7907		ret = PTR_ERR(tctx->io_wq);
 7908		percpu_counter_destroy(&tctx->inflight);
 7909		kfree(tctx);
 7910		return ret;
 7911	}
 7912
 7913	xa_init(&tctx->xa);
 7914	init_waitqueue_head(&tctx->wait);
 7915	tctx->last = NULL;
 7916	atomic_set(&tctx->in_idle, 0);
 7917	atomic_set(&tctx->inflight_tracked, 0);
 7918	task->io_uring = tctx;
 7919	spin_lock_init(&tctx->task_lock);
 7920	INIT_WQ_LIST(&tctx->task_list);
 7921	tctx->task_state = 0;
 7922	init_task_work(&tctx->task_work, tctx_task_work);
 7923	return 0;
 7924}
 7925
 7926void __io_uring_free(struct task_struct *tsk)
 7927{
 7928	struct io_uring_task *tctx = tsk->io_uring;
 7929
 7930	WARN_ON_ONCE(!xa_empty(&tctx->xa));
 7931	WARN_ON_ONCE(tctx->io_wq);
 7932
 7933	percpu_counter_destroy(&tctx->inflight);
 7934	kfree(tctx);
 7935	tsk->io_uring = NULL;
 7936}
 7937
 7938static int io_sq_offload_create(struct io_ring_ctx *ctx,
 7939				struct io_uring_params *p)
 7940{
 7941	int ret;
 7942
 7943	/* Retain compatibility with failing for an invalid attach attempt */
 7944	if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
 7945				IORING_SETUP_ATTACH_WQ) {
 7946		struct fd f;
 7947
 7948		f = fdget(p->wq_fd);
 7949		if (!f.file)
 7950			return -ENXIO;
 7951		fdput(f);
 7952		if (f.file->f_op != &io_uring_fops)
 7953			return -EINVAL;
 7954	}
 7955	if (ctx->flags & IORING_SETUP_SQPOLL) {
 7956		struct task_struct *tsk;
 7957		struct io_sq_data *sqd;
 7958		bool attached;
 7959
 7960		sqd = io_get_sq_data(p, &attached);
 7961		if (IS_ERR(sqd)) {
 7962			ret = PTR_ERR(sqd);
 7963			goto err;
 7964		}
 7965
 7966		ctx->sq_creds = get_current_cred();
 7967		ctx->sq_data = sqd;
 7968		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
 7969		if (!ctx->sq_thread_idle)
 7970			ctx->sq_thread_idle = HZ;
 7971
 7972		io_sq_thread_park(sqd);
 7973		list_add(&ctx->sqd_list, &sqd->ctx_list);
 7974		io_sqd_update_thread_idle(sqd);
 7975		/* don't attach to a dying SQPOLL thread, would be racy */
 7976		ret = (attached && !sqd->thread) ? -ENXIO : 0;
 7977		io_sq_thread_unpark(sqd);
 7978
 7979		if (ret < 0)
 7980			goto err;
 7981		if (attached)
 7982			return 0;
 7983
 7984		if (p->flags & IORING_SETUP_SQ_AFF) {
 7985			int cpu = p->sq_thread_cpu;
 7986
 7987			ret = -EINVAL;
 7988			if (cpu >= nr_cpu_ids || !cpu_online(cpu))
 7989				goto err_sqpoll;
 7990			sqd->sq_cpu = cpu;
 7991		} else {
 7992			sqd->sq_cpu = -1;
 7993		}
 7994
 7995		sqd->task_pid = current->pid;
 7996		sqd->task_tgid = current->tgid;
 7997		tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
 7998		if (IS_ERR(tsk)) {
 7999			ret = PTR_ERR(tsk);
 8000			goto err_sqpoll;
 8001		}
 8002
 8003		sqd->thread = tsk;
 8004		ret = io_uring_alloc_task_context(tsk, ctx);
 8005		wake_up_new_task(tsk);
 8006		if (ret)
 8007			goto err;
 8008	} else if (p->flags & IORING_SETUP_SQ_AFF) {
 8009		/* Can't have SQ_AFF without SQPOLL */
 8010		ret = -EINVAL;
 8011		goto err;
 8012	}
 8013
 8014	return 0;
 8015err_sqpoll:
 8016	complete(&ctx->sq_data->exited);
 8017err:
 8018	io_sq_thread_finish(ctx);
 8019	return ret;
 8020}
 8021
 8022static inline void __io_unaccount_mem(struct user_struct *user,
 8023				      unsigned long nr_pages)
 8024{
 8025	atomic_long_sub(nr_pages, &user->locked_vm);
 8026}
 8027
 8028static inline int __io_account_mem(struct user_struct *user,
 8029				   unsigned long nr_pages)
 8030{
 8031	unsigned long page_limit, cur_pages, new_pages;
 8032
 8033	/* Don't allow more pages than we can safely lock */
 8034	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 8035
 8036	do {
 8037		cur_pages = atomic_long_read(&user->locked_vm);
 8038		new_pages = cur_pages + nr_pages;
 8039		if (new_pages > page_limit)
 8040			return -ENOMEM;
 8041	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
 8042					new_pages) != cur_pages);
 8043
 8044	return 0;
 8045}
 8046
 8047static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 8048{
 8049	if (ctx->user)
 8050		__io_unaccount_mem(ctx->user, nr_pages);
 8051
 8052	if (ctx->mm_account)
 8053		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
 8054}
 8055
 8056static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 8057{
 8058	int ret;
 8059
 8060	if (ctx->user) {
 8061		ret = __io_account_mem(ctx->user, nr_pages);
 8062		if (ret)
 8063			return ret;
 8064	}
 8065
 8066	if (ctx->mm_account)
 8067		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
 8068
 8069	return 0;
 8070}
 8071
 8072static void io_mem_free(void *ptr)
 8073{
 8074	struct page *page;
 8075
 8076	if (!ptr)
 8077		return;
 8078
 8079	page = virt_to_head_page(ptr);
 8080	if (put_page_testzero(page))
 8081		free_compound_page(page);
 8082}
 8083
 8084static void *io_mem_alloc(size_t size)
 8085{
 8086	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
 8087				__GFP_NORETRY | __GFP_ACCOUNT;
 8088
 8089	return (void *) __get_free_pages(gfp_flags, get_order(size));
 8090}
 8091
 8092static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
 8093				size_t *sq_offset)
 8094{
 8095	struct io_rings *rings;
 8096	size_t off, sq_array_size;
 8097
 8098	off = struct_size(rings, cqes, cq_entries);
 8099	if (off == SIZE_MAX)
 8100		return SIZE_MAX;
 8101
 8102#ifdef CONFIG_SMP
 8103	off = ALIGN(off, SMP_CACHE_BYTES);
 8104	if (off == 0)
 8105		return SIZE_MAX;
 8106#endif
 8107
 8108	if (sq_offset)
 8109		*sq_offset = off;
 8110
 8111	sq_array_size = array_size(sizeof(u32), sq_entries);
 8112	if (sq_array_size == SIZE_MAX)
 8113		return SIZE_MAX;
 8114
 8115	if (check_add_overflow(off, sq_array_size, &off))
 8116		return SIZE_MAX;
 8117
 8118	return off;
 8119}
 8120
 8121static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
 8122{
 8123	struct io_mapped_ubuf *imu = *slot;
 8124	unsigned int i;
 8125
 8126	if (imu != ctx->dummy_ubuf) {
 8127		for (i = 0; i < imu->nr_bvecs; i++)
 8128			unpin_user_page(imu->bvec[i].bv_page);
 8129		if (imu->acct_pages)
 8130			io_unaccount_mem(ctx, imu->acct_pages);
 8131		kvfree(imu);
 8132	}
 8133	*slot = NULL;
 8134}
 8135
 8136static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
 8137{
 8138	io_buffer_unmap(ctx, &prsrc->buf);
 8139	prsrc->buf = NULL;
 8140}
 8141
 8142static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 8143{
 8144	unsigned int i;
 8145
 8146	for (i = 0; i < ctx->nr_user_bufs; i++)
 8147		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
 8148	kfree(ctx->user_bufs);
 8149	io_rsrc_data_free(ctx->buf_data);
 8150	ctx->user_bufs = NULL;
 8151	ctx->buf_data = NULL;
 8152	ctx->nr_user_bufs = 0;
 8153}
 8154
 8155static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 8156{
 8157	int ret;
 8158
 8159	if (!ctx->buf_data)
 8160		return -ENXIO;
 8161
 8162	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
 8163	if (!ret)
 8164		__io_sqe_buffers_unregister(ctx);
 8165	return ret;
 8166}
 8167
 8168static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
 8169		       void __user *arg, unsigned index)
 8170{
 8171	struct iovec __user *src;
 8172
 8173#ifdef CONFIG_COMPAT
 8174	if (ctx->compat) {
 8175		struct compat_iovec __user *ciovs;
 8176		struct compat_iovec ciov;
 8177
 8178		ciovs = (struct compat_iovec __user *) arg;
 8179		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
 8180			return -EFAULT;
 8181
 8182		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
 8183		dst->iov_len = ciov.iov_len;
 8184		return 0;
 8185	}
 8186#endif
 8187	src = (struct iovec __user *) arg;
 8188	if (copy_from_user(dst, &src[index], sizeof(*dst)))
 8189		return -EFAULT;
 8190	return 0;
 8191}
 8192
 8193/*
 8194 * Not super efficient, but this is just a registration time. And we do cache
 8195 * the last compound head, so generally we'll only do a full search if we don't
 8196 * match that one.
 8197 *
 8198 * We check if the given compound head page has already been accounted, to
 8199 * avoid double accounting it. This allows us to account the full size of the
 8200 * page, not just the constituent pages of a huge page.
 8201 */
 8202static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
 8203				  int nr_pages, struct page *hpage)
 8204{
 8205	int i, j;
 8206
 8207	/* check current page array */
 8208	for (i = 0; i < nr_pages; i++) {
 8209		if (!PageCompound(pages[i]))
 8210			continue;
 8211		if (compound_head(pages[i]) == hpage)
 8212			return true;
 8213	}
 8214
 8215	/* check previously registered pages */
 8216	for (i = 0; i < ctx->nr_user_bufs; i++) {
 8217		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
 8218
 8219		for (j = 0; j < imu->nr_bvecs; j++) {
 8220			if (!PageCompound(imu->bvec[j].bv_page))
 8221				continue;
 8222			if (compound_head(imu->bvec[j].bv_page) == hpage)
 8223				return true;
 8224		}
 8225	}
 8226
 8227	return false;
 8228}
 8229
 8230static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 8231				 int nr_pages, struct io_mapped_ubuf *imu,
 8232				 struct page **last_hpage)
 8233{
 8234	int i, ret;
 8235
 8236	imu->acct_pages = 0;
 8237	for (i = 0; i < nr_pages; i++) {
 8238		if (!PageCompound(pages[i])) {
 8239			imu->acct_pages++;
 8240		} else {
 8241			struct page *hpage;
 8242
 8243			hpage = compound_head(pages[i]);
 8244			if (hpage == *last_hpage)
 8245				continue;
 8246			*last_hpage = hpage;
 8247			if (headpage_already_acct(ctx, pages, i, hpage))
 8248				continue;
 8249			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
 8250		}
 8251	}
 8252
 8253	if (!imu->acct_pages)
 8254		return 0;
 8255
 8256	ret = io_account_mem(ctx, imu->acct_pages);
 8257	if (ret)
 8258		imu->acct_pages = 0;
 8259	return ret;
 8260}
 8261
/*
 * Pin and map the user buffer described by @iov and store the resulting
 * descriptor in *@pimu.
 *
 * An iovec with a NULL base registers the shared ctx->dummy_ubuf
 * placeholder. Otherwise all pages of the buffer are pinned with
 * FOLL_WRITE | FOLL_LONGTERM, file-backed mappings other than hugetlbfs are
 * rejected, the pages are accounted and one bvec per page is filled in.
 * @last_hpage is passed on to io_buffer_account_pin().
 *
 * Returns 0 on success. On failure *@pimu is NULL and the return value is
 * -ENOMEM, -EOPNOTSUPP (file-backed memory), -EFAULT (not all pages could be
 * pinned), the negative result of pin_user_pages(), or the error from
 * io_buffer_account_pin().
 */
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	unsigned long off, start, end, ubuf;
	size_t size;
	int ret, pret, nr_pages, i;

	if (!iov->iov_base) {
		*pimu = ctx->dummy_ubuf;
		return 0;
	}

	/* number of pages touched by [ubuf, ubuf + iov_len) */
	ubuf = (unsigned long) iov->iov_base;
	end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;

	*pimu = NULL;
	ret = -ENOMEM;

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto done;

	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
			      GFP_KERNEL);
	if (!vmas)
		goto done;

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = 0;
	mmap_read_lock(current->mm);
	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
	if (pret == nr_pages) {
		/* don't support file backed memory */
		for (i = 0; i < nr_pages; i++) {
			struct vm_area_struct *vma = vmas[i];

			if (vma->vm_file &&
			    !is_file_hugepages(vma->vm_file)) {
				ret = -EOPNOTSUPP;
				break;
			}
		}
	} else {
		ret = pret < 0 ? pret : -EFAULT;
	}
	mmap_read_unlock(current->mm);
	if (ret) {
		/*
		 * if we did partial map, or found file backed vmas,
		 * release any pages we did get
		 */
		if (pret > 0)
			unpin_user_pages(pages, pret);
		goto done;
	}

	ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, pret);
		goto done;
	}

	/* only the first bvec starts at a non-zero offset within its page */
	off = ubuf & ~PAGE_MASK;
	size = iov->iov_len;
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		imu->bvec[i].bv_page = pages[i];
		imu->bvec[i].bv_len = vec_len;
		imu->bvec[i].bv_offset = off;
		off = 0;
		size -= vec_len;
	}
	/* store original address for later verification */
	imu->ubuf = ubuf;
	imu->ubuf_end = ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;
done:
	/* imu is handed to the caller on success, scratch arrays never are */
	if (ret)
		kvfree(imu);
	kvfree(pages);
	kvfree(vmas);
	return ret;
}
 8359
 8360static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
 8361{
 8362	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
 8363	return ctx->user_bufs ? 0 : -ENOMEM;
 8364}
 8365
 8366static int io_buffer_validate(struct iovec *iov)
 8367{
 8368	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
 8369
 8370	/*
 8371	 * Don't impose further limits on the size and buffer
 8372	 * constraints here, we'll -EINVAL later when IO is
 8373	 * submitted if they are wrong.
 8374	 */
 8375	if (!iov->iov_base)
 8376		return iov->iov_len ? -EFAULT : 0;
 8377	if (!iov->iov_len)
 8378		return -EFAULT;
 8379
 8380	/* arbitrary limit, but we need something */
 8381	if (iov->iov_len > SZ_1G)
 8382		return -EFAULT;
 8383
 8384	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
 8385		return -EOVERFLOW;
 8386
 8387	return 0;
 8388}
 8389
/*
 * Register a set of fixed buffers with the ring.
 *
 * @arg is a user array of @nr_args iovecs (read through io_copy_iov()).
 * @tags, if non-NULL, is a parallel user array of u64 tags. An iovec with a
 * NULL base registers a placeholder and must not carry a tag.
 *
 * Returns 0 on success. Fails with -EBUSY if buffers are already registered,
 * -EINVAL for @nr_args == 0, @nr_args > IORING_MAX_REG_BUFFERS or a tagged
 * placeholder, -ENOMEM / -EFAULT, or the error from any helper called below.
 * On a failure inside the loop everything registered so far is unregistered
 * again.
 */
static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
				   unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_node_switch_start(ctx);
	if (ret)
		return ret;
	data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args);
	if (!data)
		return -ENOMEM;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	/*
	 * nr_user_bufs advances together with i, so on a failure it counts
	 * exactly the slots that were registered successfully.
	 */
	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		u64 tag = 0;

		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) {
			ret = -EFAULT;
			break;
		}
		ret = io_copy_iov(ctx, &iov, arg, i);
		if (ret)
			break;
		ret = io_buffer_validate(&iov);
		if (ret)
			break;
		if (!iov.iov_base && tag) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
		data->tags[i] = tag;
	}

	WARN_ON_ONCE(ctx->buf_data);

	/* publish data first so the unregister path below frees it as well */
	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	else
		io_rsrc_node_switch(ctx, NULL);
	return ret;
}
 8448
/*
 * Replace @nr_args registered buffers, starting at slot up->offset.
 *
 * up->data points to a user iovec array and up->tags, if set, to a parallel
 * array of u64 tags. Each new buffer is registered first; the buffer it
 * replaces (unless that is the dummy placeholder) is queued for removal on
 * the current rsrc node.
 *
 * Returns the number of entries processed before the first failure if that
 * number is non-zero; otherwise the error code (-ENXIO without registered
 * buffers, -EINVAL for a range outside the table or a tagged placeholder,
 * -EFAULT, or an error from the helpers called below).
 */
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	bool needs_switch = false;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		int offset = up->offset + done;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(offset, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, offset,
						    ctx->rsrc_node, ctx->user_bufs[i]);
			if (unlikely(err)) {
				/* the old buffer stays; drop the new mapping */
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = NULL;
			needs_switch = true;
		}

		ctx->user_bufs[i] = imu;
		ctx->buf_data->tags[offset] = tag;
	}

	/* at least one removal was queued on the current rsrc node */
	if (needs_switch)
		io_rsrc_node_switch(ctx, ctx->buf_data);
	return done ? done : err;
}
 8508
 8509static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
 8510{
 8511	__s32 __user *fds = arg;
 8512	int fd;
 8513
 8514	if (ctx->cq_ev_fd)
 8515		return -EBUSY;
 8516
 8517	if (copy_from_user(&fd, fds, sizeof(*fds)))
 8518		return -EFAULT;
 8519
 8520	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
 8521	if (IS_ERR(ctx->cq_ev_fd)) {
 8522		int ret = PTR_ERR(ctx->cq_ev_fd);
 8523		ctx->cq_ev_fd = NULL;
 8524		return ret;
 8525	}
 8526
 8527	return 0;
 8528}
 8529
 8530static int io_eventfd_unregister(struct io_ring_ctx *ctx)
 8531{
 8532	if (ctx->cq_ev_fd) {
 8533		eventfd_ctx_put(ctx->cq_ev_fd);
 8534		ctx->cq_ev_fd = NULL;
 8535		return 0;
 8536	}
 8537
 8538	return -ENXIO;
 8539}
 8540
 8541static void io_destroy_buffers(struct io_ring_ctx *ctx)
 8542{
 8543	struct io_buffer *buf;
 8544	unsigned long index;
 8545
 8546	xa_for_each(&ctx->io_buffers, index, buf)
 8547		__io_remove_buffers(ctx, buf, index, -1U);
 8548}
 8549
 8550static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
 8551{
 8552	struct io_kiocb *req, *nxt;
 8553
 8554	list_for_each_entry_safe(req, nxt, list, compl.list) {
 8555		if (tsk && req->task != tsk)
 8556			continue;
 8557		list_del(&req->compl.list);
 8558		kmem_cache_free(req_cachep, req);
 8559	}
 8560}
 8561
 8562static void io_req_caches_free(struct io_ring_ctx *ctx)
 8563{
 8564	struct io_submit_state *submit_state = &ctx->submit_state;
 8565	struct io_comp_state *cs = &ctx->submit_state.comp;
 8566
 8567	mutex_lock(&ctx->uring_lock);
 8568
 8569	if (submit_state->free_reqs) {
 8570		kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
 8571				     submit_state->reqs);
 8572		submit_state->free_reqs = 0;
 8573	}
 8574
 8575	io_flush_cached_locked_reqs(ctx, cs);
 8576	io_req_cache_free(&cs->free_list, NULL);
 8577	mutex_unlock(&ctx->uring_lock);
 8578}
 8579
 8580static bool io_wait_rsrc_data(struct io_rsrc_data *data)
 8581{
 8582	if (!data)
 8583		return false;
 8584	if (!atomic_dec_and_test(&data->refs))
 8585		wait_for_completion(&data->done);
 8586	return true;
 8587}
 8588
/*
 * Final teardown of a ring context. Releases everything still attached to
 * @ctx and then frees @ctx itself; called at the end of io_ring_exit_work().
 */
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
	io_sq_thread_finish(ctx);

	/* drop the mm reference that was grabbed for accounting */
	if (ctx->mm_account) {
		mmdrop(ctx->mm_account);
		ctx->mm_account = NULL;
	}

	/*
	 * Under uring_lock: wait for outstanding rsrc data references, then
	 * unregister buffers and files, and flush the CQ overflow list if the
	 * rings were ever allocated.
	 */
	mutex_lock(&ctx->uring_lock);
	if (io_wait_rsrc_data(ctx->buf_data))
		__io_sqe_buffers_unregister(ctx);
	if (io_wait_rsrc_data(ctx->file_data))
		__io_sqe_files_unregister(ctx);
	if (ctx->rings)
		__io_cqring_overflow_flush(ctx, true);
	mutex_unlock(&ctx->uring_lock);
	/* return value ignored: -ENXIO just means no eventfd was attached */
	io_eventfd_unregister(ctx);
	io_destroy_buffers(ctx);
	if (ctx->sq_creds)
		put_cred(ctx->sq_creds);

	/* there are no registered resources left, nobody uses it */
	if (ctx->rsrc_node)
		io_rsrc_node_destroy(ctx->rsrc_node);
	if (ctx->rsrc_backup_node)
		io_rsrc_node_destroy(ctx->rsrc_backup_node);
	flush_delayed_work(&ctx->rsrc_put_work);

	/* after the flush above, both rsrc lists are expected to be empty */
	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
	WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
		sock_release(ctx->ring_sock);
	}
#endif

	/* ring memory allocated by io_allocate_scq_urings() */
	io_mem_free(ctx->rings);
	io_mem_free(ctx->sq_sqes);

	percpu_ref_exit(&ctx->refs);
	free_uid(ctx->user);
	io_req_caches_free(ctx);
	if (ctx->hash_map)
		io_wq_put_hash(ctx->hash_map);
	kfree(ctx->cancel_hash);
	kfree(ctx->dummy_ubuf);
	kfree(ctx);
}
 8640
 8641static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 8642{
 8643	struct io_ring_ctx *ctx = file->private_data;
 8644	__poll_t mask = 0;
 8645
 8646	poll_wait(file, &ctx->cq_wait, wait);
 8647	/*
 8648	 * synchronizes with barrier from wq_has_sleeper call in
 8649	 * io_commit_cqring
 8650	 */
 8651	smp_rmb();
 8652	if (!io_sqring_full(ctx))
 8653		mask |= EPOLLOUT | EPOLLWRNORM;
 8654
 8655	/*
 8656	 * Don't flush cqring overflow list here, just do a simple check.
 8657	 * Otherwise there could possible be ABBA deadlock:
 8658	 *      CPU0                    CPU1
 8659	 *      ----                    ----
 8660	 * lock(&ctx->uring_lock);
 8661	 *                              lock(&ep->mtx);
 8662	 *                              lock(&ctx->uring_lock);
 8663	 * lock(&ep->mtx);
 8664	 *
 8665	 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
 8666	 * pushs them to do the flush.
 8667	 */
 8668	if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
 8669		mask |= EPOLLIN | EPOLLRDNORM;
 8670
 8671	return mask;
 8672}
 8673
 8674static int io_uring_fasync(int fd, struct file *file, int on)
 8675{
 8676	struct io_ring_ctx *ctx = file->private_data;
 8677
 8678	return fasync_helper(fd, file, on, &ctx->cq_fasync);
 8679}
 8680
 8681static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
 8682{
 8683	const struct cred *creds;
 8684
 8685	creds = xa_erase(&ctx->personalities, id);
 8686	if (creds) {
 8687		put_cred(creds);
 8688		return 0;
 8689	}
 8690
 8691	return -EINVAL;
 8692}
 8693
 8694static inline bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
 8695{
 8696	return io_run_task_work_head(&ctx->exit_task_work);
 8697}
 8698
/*
 * On-stack request used by io_ring_exit_work() to ask a task to drop its
 * node for a ring that is going away.
 */
struct io_tctx_exit {
	/* task_work item; its callback is io_tctx_exit_cb() */
	struct callback_head		task_work;
	/* completed by io_tctx_exit_cb(), waited on by io_ring_exit_work() */
	struct completion		completion;
	/* ring being torn down */
	struct io_ring_ctx		*ctx;
};
 8704
 8705static void io_tctx_exit_cb(struct callback_head *cb)
 8706{
 8707	struct io_uring_task *tctx = current->io_uring;
 8708	struct io_tctx_exit *work;
 8709
 8710	work = container_of(cb, struct io_tctx_exit, task_work);
 8711	/*
 8712	 * When @in_idle, we're in cancellation and it's racy to remove the
 8713	 * node. It'll be removed by the end of cancellation, just ignore it.
 8714	 */
 8715	if (!atomic_read(&tctx->in_idle))
 8716		io_uring_del_task_file((unsigned long)work->ctx);
 8717	complete(&work->completion);
 8718}
 8719
 8720static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
 8721{
 8722	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 8723
 8724	return req->ctx == data;
 8725}
 8726
/*
 * Deferred ring teardown, queued by io_ring_ctx_wait_and_kill(). Keeps
 * cancelling requests until ctx->ref_comp completes, then asks every task
 * still linked on ctx->tctx_list to drop its node, and finally frees @ctx.
 */
static void io_ring_exit_work(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
	/* five minutes; only used to trigger the warnings below */
	unsigned long timeout = jiffies + HZ * 60 * 5;
	struct io_tctx_exit exit;
	struct io_tctx_node *node;
	int ret;

	/*
	 * If we're doing polled IO and end up having requests being
	 * submitted async (out-of-line), then completions can come in while
	 * we're waiting for refs to drop. We need to reap these manually,
	 * as nobody else will be looking for them.
	 */
	do {
		io_uring_try_cancel_requests(ctx, NULL, NULL);
		if (ctx->sq_data) {
			struct io_sq_data *sqd = ctx->sq_data;
			struct task_struct *tsk;

			/* cancel this ring's io-wq work on the parked SQ thread */
			io_sq_thread_park(sqd);
			tsk = sqd->thread;
			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
				io_wq_cancel_cb(tsk->io_uring->io_wq,
						io_cancel_ctx_cb, ctx, true);
			io_sq_thread_unpark(sqd);
		}

		WARN_ON_ONCE(time_after(jiffies, timeout));
	} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));

	init_completion(&exit.completion);
	init_task_work(&exit.task_work, io_tctx_exit_cb);
	exit.ctx = ctx;
	/*
	 * Some may use context even when all refs and requests have been put,
	 * and they are free to do so while still holding uring_lock or
	 * completion_lock, see __io_req_task_submit(). Apart from other work,
	 * this lock/unlock section also waits them to finish.
	 */
	mutex_lock(&ctx->uring_lock);
	while (!list_empty(&ctx->tctx_list)) {
		WARN_ON_ONCE(time_after(jiffies, timeout));

		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
					ctx_node);
		/* don't spin on a single task if cancellation failed */
		list_rotate_left(&ctx->tctx_list);
		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
		if (WARN_ON_ONCE(ret))
			continue;
		wake_up_process(node->task);

		/*
		 * Drop uring_lock while waiting: the callback removes the
		 * node through io_uring_del_task_file(), which takes the lock.
		 */
		mutex_unlock(&ctx->uring_lock);
		wait_for_completion(&exit.completion);
		mutex_lock(&ctx->uring_lock);
	}
	mutex_unlock(&ctx->uring_lock);
	/* empty lock/unlock: wait out any current completion_lock holder */
	spin_lock_irq(&ctx->completion_lock);
	spin_unlock_irq(&ctx->completion_lock);

	io_ring_ctx_free(ctx);
}
 8790
 8791/* Returns true if we found and killed one or more timeouts */
 8792static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
 8793			     struct files_struct *files)
 8794{
 8795	struct io_kiocb *req, *tmp;
 8796	int canceled = 0;
 8797
 8798	spin_lock_irq(&ctx->completion_lock);
 8799	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
 8800		if (io_match_task(req, tsk, files)) {
 8801			io_kill_timeout(req, -ECANCELED);
 8802			canceled++;
 8803		}
 8804	}
 8805	if (canceled != 0)
 8806		io_commit_cqring(ctx);
 8807	spin_unlock_irq(&ctx->completion_lock);
 8808	if (canceled != 0)
 8809		io_cqring_ev_posted(ctx);
 8810	return canceled != 0;
 8811}
 8812
 8813static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 8814{
 8815	unsigned long index;
 8816	struct creds *creds;
 8817
 8818	mutex_lock(&ctx->uring_lock);
 8819	percpu_ref_kill(&ctx->refs);
 8820	if (ctx->rings)
 8821		__io_cqring_overflow_flush(ctx, true);
 8822	xa_for_each(&ctx->personalities, index, creds)
 8823		io_unregister_personality(ctx, index);
 8824	mutex_unlock(&ctx->uring_lock);
 8825
 8826	io_kill_timeouts(ctx, NULL, NULL);
 8827	io_poll_remove_all(ctx, NULL, NULL);
 8828
 8829	/* if we failed setting up the ctx, we might not have any rings */
 8830	io_iopoll_try_reap_events(ctx);
 8831
 8832	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
 8833	/*
 8834	 * Use system_unbound_wq to avoid spawning tons of event kworkers
 8835	 * if we're exiting a ton of rings at the same time. It just adds
 8836	 * noise and overhead, there's no discernable change in runtime
 8837	 * over using system_wq.
 8838	 */
 8839	queue_work(system_unbound_wq, &ctx->exit_work);
 8840}
 8841
 8842static int io_uring_release(struct inode *inode, struct file *file)
 8843{
 8844	struct io_ring_ctx *ctx = file->private_data;
 8845
 8846	file->private_data = NULL;
 8847	io_ring_ctx_wait_and_kill(ctx);
 8848	return 0;
 8849}
 8850
/*
 * Match criteria handed to io_cancel_task_cb(); both fields are passed on to
 * io_match_task().
 */
struct io_task_cancel {
	struct task_struct *task;
	/* when non-NULL, linked-timeout requests are matched under completion_lock */
	struct files_struct *files;
};
 8855
 8856static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
 8857{
 8858	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 8859	struct io_task_cancel *cancel = data;
 8860	bool ret;
 8861
 8862	if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
 8863		unsigned long flags;
 8864		struct io_ring_ctx *ctx = req->ctx;
 8865
 8866		/* protect against races with linked timeouts */
 8867		spin_lock_irqsave(&ctx->completion_lock, flags);
 8868		ret = io_match_task(req, cancel->task, cancel->files);
 8869		spin_unlock_irqrestore(&ctx->completion_lock, flags);
 8870	} else {
 8871		ret = io_match_task(req, cancel->task, cancel->files);
 8872	}
 8873	return ret;
 8874}
 8875
 8876static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
 8877				  struct task_struct *task,
 8878				  struct files_struct *files)
 8879{
 8880	struct io_defer_entry *de;
 8881	LIST_HEAD(list);
 8882
 8883	spin_lock_irq(&ctx->completion_lock);
 8884	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
 8885		if (io_match_task(de->req, task, files)) {
 8886			list_cut_position(&list, &ctx->defer_list, &de->list);
 8887			break;
 8888		}
 8889	}
 8890	spin_unlock_irq(&ctx->completion_lock);
 8891	if (list_empty(&list))
 8892		return false;
 8893
 8894	while (!list_empty(&list)) {
 8895		de = list_first_entry(&list, struct io_defer_entry, list);
 8896		list_del_init(&de->list);
 8897		io_req_complete_failed(de->req, -ECANCELED);
 8898		kfree(de);
 8899	}
 8900	return true;
 8901}
 8902
 8903static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
 8904{
 8905	struct io_tctx_node *node;
 8906	enum io_wq_cancel cret;
 8907	bool ret = false;
 8908
 8909	mutex_lock(&ctx->uring_lock);
 8910	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
 8911		struct io_uring_task *tctx = node->task->io_uring;
 8912
 8913		/*
 8914		 * io_wq will stay alive while we hold uring_lock, because it's
 8915		 * killed after ctx nodes, which requires to take the lock.
 8916		 */
 8917		if (!tctx || !tctx->io_wq)
 8918			continue;
 8919		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
 8920		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
 8921	}
 8922	mutex_unlock(&ctx->uring_lock);
 8923
 8924	return ret;
 8925}
 8926
 8927static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 8928					 struct task_struct *task,
 8929					 struct files_struct *files)
 8930{
 8931	struct io_task_cancel cancel = { .task = task, .files = files, };
 8932	struct io_uring_task *tctx = task ? task->io_uring : NULL;
 8933
 8934	while (1) {
 8935		enum io_wq_cancel cret;
 8936		bool ret = false;
 8937
 8938		if (!task) {
 8939			ret |= io_uring_try_cancel_iowq(ctx);
 8940		} else if (tctx && tctx->io_wq) {
 8941			/*
 8942			 * Cancels requests of all rings, not only @ctx, but
 8943			 * it's fine as the task is in exit/exec.
 8944			 */
 8945			cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
 8946					       &cancel, true);
 8947			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
 8948		}
 8949
 8950		/* SQPOLL thread does its own polling */
 8951		if ((!(ctx->flags & IORING_SETUP_SQPOLL) && !files) ||
 8952		    (ctx->sq_data && ctx->sq_data->thread == current)) {
 8953			while (!list_empty_careful(&ctx->iopoll_list)) {
 8954				io_iopoll_try_reap_events(ctx);
 8955				ret = true;
 8956			}
 8957		}
 8958
 8959		ret |= io_cancel_defer_files(ctx, task, files);
 8960		ret |= io_poll_remove_all(ctx, task, files);
 8961		ret |= io_kill_timeouts(ctx, task, files);
 8962		ret |= io_run_task_work();
 8963		ret |= io_run_ctx_fallback(ctx);
 8964		if (!ret)
 8965			break;
 8966		cond_resched();
 8967	}
 8968}
 8969
 8970static int __io_uring_add_task_file(struct io_ring_ctx *ctx)
 8971{
 8972	struct io_uring_task *tctx = current->io_uring;
 8973	struct io_tctx_node *node;
 8974	int ret;
 8975
 8976	if (unlikely(!tctx)) {
 8977		ret = io_uring_alloc_task_context(current, ctx);
 8978		if (unlikely(ret))
 8979			return ret;
 8980		tctx = current->io_uring;
 8981	}
 8982	if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
 8983		node = kmalloc(sizeof(*node), GFP_KERNEL);
 8984		if (!node)
 8985			return -ENOMEM;
 8986		node->ctx = ctx;
 8987		node->task = current;
 8988
 8989		ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
 8990					node, GFP_KERNEL));
 8991		if (ret) {
 8992			kfree(node);
 8993			return ret;
 8994		}
 8995
 8996		mutex_lock(&ctx->uring_lock);
 8997		list_add(&node->ctx_node, &ctx->tctx_list);
 8998		mutex_unlock(&ctx->uring_lock);
 8999	}
 9000	tctx->last = ctx;
 9001	return 0;
 9002}
 9003
 9004/*
 9005 * Note that this task has used io_uring. We use it for cancelation purposes.
 9006 */
 9007static inline int io_uring_add_task_file(struct io_ring_ctx *ctx)
 9008{
 9009	struct io_uring_task *tctx = current->io_uring;
 9010
 9011	if (likely(tctx && tctx->last == ctx))
 9012		return 0;
 9013	return __io_uring_add_task_file(ctx);
 9014}
 9015
 9016/*
 9017 * Remove this io_uring_file -> task mapping.
 9018 */
 9019static void io_uring_del_task_file(unsigned long index)
 9020{
 9021	struct io_uring_task *tctx = current->io_uring;
 9022	struct io_tctx_node *node;
 9023
 9024	if (!tctx)
 9025		return;
 9026	node = xa_erase(&tctx->xa, index);
 9027	if (!node)
 9028		return;
 9029
 9030	WARN_ON_ONCE(current != node->task);
 9031	WARN_ON_ONCE(list_empty(&node->ctx_node));
 9032
 9033	mutex_lock(&node->ctx->uring_lock);
 9034	list_del(&node->ctx_node);
 9035	mutex_unlock(&node->ctx->uring_lock);
 9036
 9037	if (tctx->last == node->ctx)
 9038		tctx->last = NULL;
 9039	kfree(node);
 9040}
 9041
 9042static void io_uring_clean_tctx(struct io_uring_task *tctx)
 9043{
 9044	struct io_wq *wq = tctx->io_wq;
 9045	struct io_tctx_node *node;
 9046	unsigned long index;
 9047
 9048	xa_for_each(&tctx->xa, index, node)
 9049		io_uring_del_task_file(index);
 9050	if (wq) {
 9051		/*
 9052		 * Must be after io_uring_del_task_file() (removes nodes under
 9053		 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
 9054		 */
 9055		tctx->io_wq = NULL;
 9056		io_wq_put_and_exit(wq);
 9057	}
 9058}
 9059
 9060static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
 9061{
 9062	if (tracked)
 9063		return atomic_read(&tctx->inflight_tracked);
 9064	return percpu_counter_sum(&tctx->inflight);
 9065}
 9066
 9067static void io_uring_try_cancel(struct files_struct *files)
 9068{
 9069	struct io_uring_task *tctx = current->io_uring;
 9070	struct io_tctx_node *node;
 9071	unsigned long index;
 9072
 9073	xa_for_each(&tctx->xa, index, node) {
 9074		struct io_ring_ctx *ctx = node->ctx;
 9075
 9076		/* sqpoll task will cancel all its requests */
 9077		if (!ctx->sq_data)
 9078			io_uring_try_cancel_requests(ctx, current, files);
 9079	}
 9080}
 9081
/*
 * Cancel all requests of the current task on every ring attached to @sqd,
 * and wait until the task's inflight counter drops to zero.
 *
 * should only be called by SQPOLL task
 */
static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_ring_ctx *ctx;
	s64 inflight;
	DEFINE_WAIT(wait);

	/* nothing to cancel if the task has no io_uring context */
	if (!current->io_uring)
		return;
	if (tctx->io_wq)
		io_wq_exit_start(tctx->io_wq);

	WARN_ON_ONCE(!sqd || sqd->thread != current);

	/* in_idle stays raised for the whole cancellation loop */
	atomic_inc(&tctx->in_idle);
	do {
		/* read completions before cancelations */
		inflight = tctx_inflight(tctx, false);
		if (!inflight)
			break;
		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
			io_uring_try_cancel_requests(ctx, current, NULL);

		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
		/*
		 * If we've seen completions, retry without waiting. This
		 * avoids a race where a completion comes in before we did
		 * prepare_to_wait().
		 */
		if (inflight == tctx_inflight(tctx, false))
			schedule();
		finish_wait(&tctx->wait, &wait);
	} while (1);
	atomic_dec(&tctx->in_idle);
}
 9118
/*
 * Find any io_uring fd that this task has registered or done IO on, and cancel
 * requests.
 *
 * @files: forwarded to io_uring_try_cancel() as the cancel filter. A non-NULL
 * value also makes the wait loop use the tracked inflight counter instead of
 * the percpu one. When NULL, the task's io_uring context is freed at the end.
 */
void __io_uring_cancel(struct files_struct *files)
{
	struct io_uring_task *tctx = current->io_uring;
	DEFINE_WAIT(wait);
	s64 inflight;

	if (tctx->io_wq)
		io_wq_exit_start(tctx->io_wq);

	/* make sure overflow events are dropped */
	atomic_inc(&tctx->in_idle);
	do {
		/* read completions before cancelations */
		inflight = tctx_inflight(tctx, !!files);
		if (!inflight)
			break;
		io_uring_try_cancel(files);
		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);

		/*
		 * If we've seen completions, retry without waiting. This
		 * avoids a race where a completion comes in before we did
		 * prepare_to_wait().
		 */
		if (inflight == tctx_inflight(tctx, !!files))
			schedule();
		finish_wait(&tctx->wait, &wait);
	} while (1);
	atomic_dec(&tctx->in_idle);

	/* drop all ring nodes and the io-wq now that nothing is inflight */
	io_uring_clean_tctx(tctx);
	if (!files) {
		/* for exec all current's requests should be gone, kill tctx */
		__io_uring_free(current);
	}
}
 9159
 9160static void *io_uring_validate_mmap_request(struct file *file,
 9161					    loff_t pgoff, size_t sz)
 9162{
 9163	struct io_ring_ctx *ctx = file->private_data;
 9164	loff_t offset = pgoff << PAGE_SHIFT;
 9165	struct page *page;
 9166	void *ptr;
 9167
 9168	switch (offset) {
 9169	case IORING_OFF_SQ_RING:
 9170	case IORING_OFF_CQ_RING:
 9171		ptr = ctx->rings;
 9172		break;
 9173	case IORING_OFF_SQES:
 9174		ptr = ctx->sq_sqes;
 9175		break;
 9176	default:
 9177		return ERR_PTR(-EINVAL);
 9178	}
 9179
 9180	page = virt_to_head_page(ptr);
 9181	if (sz > page_size(page))
 9182		return ERR_PTR(-EINVAL);
 9183
 9184	return ptr;
 9185}
 9186
 9187#ifdef CONFIG_MMU
 9188
 9189static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 9190{
 9191	size_t sz = vma->vm_end - vma->vm_start;
 9192	unsigned long pfn;
 9193	void *ptr;
 9194
 9195	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
 9196	if (IS_ERR(ptr))
 9197		return PTR_ERR(ptr);
 9198
 9199	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
 9200	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
 9201}
 9202
 9203#else /* !CONFIG_MMU */
 9204
 9205static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 9206{
 9207	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
 9208}
 9209
 9210static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
 9211{
 9212	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
 9213}
 9214
 9215static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
 9216	unsigned long addr, unsigned long len,
 9217	unsigned long pgoff, unsigned long flags)
 9218{
 9219	void *ptr;
 9220
 9221	ptr = io_uring_validate_mmap_request(file, pgoff, len);
 9222	if (IS_ERR(ptr))
 9223		return PTR_ERR(ptr);
 9224
 9225	return (unsigned long) ptr;
 9226}
 9227
 9228#endif /* !CONFIG_MMU */
 9229
 9230static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
 9231{
 9232	DEFINE_WAIT(wait);
 9233
 9234	do {
 9235		if (!io_sqring_full(ctx))
 9236			break;
 9237		prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
 9238
 9239		if (!io_sqring_full(ctx))
 9240			break;
 9241		schedule();
 9242	} while (!signal_pending(current));
 9243
 9244	finish_wait(&ctx->sqo_sq_wait, &wait);
 9245	return 0;
 9246}
 9247
 9248static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
 9249			  struct __kernel_timespec __user **ts,
 9250			  const sigset_t __user **sig)
 9251{
 9252	struct io_uring_getevents_arg arg;
 9253
 9254	/*
 9255	 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
 9256	 * is just a pointer to the sigset_t.
 9257	 */
 9258	if (!(flags & IORING_ENTER_EXT_ARG)) {
 9259		*sig = (const sigset_t __user *) argp;
 9260		*ts = NULL;
 9261		return 0;
 9262	}
 9263
 9264	/*
 9265	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
 9266	 * timespec and sigset_t pointers if good.
 9267	 */
 9268	if (*argsz != sizeof(arg))
 9269		return -EINVAL;
 9270	if (copy_from_user(&arg, argp, sizeof(arg)))
 9271		return -EFAULT;
 9272	*sig = u64_to_user_ptr(arg.sigmask);
 9273	*argsz = arg.sigmask_sz;
 9274	*ts = u64_to_user_ptr(arg.ts);
 9275	return 0;
 9276}
 9277
 9278SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 9279		u32, min_complete, u32, flags, const void __user *, argp,
 9280		size_t, argsz)
 9281{
 9282	struct io_ring_ctx *ctx;
 9283	int submitted = 0;
 9284	struct fd f;
 9285	long ret;
 9286
 9287	io_run_task_work();
 9288
 9289	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
 9290			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
 9291		return -EINVAL;
 9292
 9293	f = fdget(fd);
 9294	if (unlikely(!f.file))
 9295		return -EBADF;
 9296
 9297	ret = -EOPNOTSUPP;
 9298	if (unlikely(f.file->f_op != &io_uring_fops))
 9299		goto out_fput;
 9300
 9301	ret = -ENXIO;
 9302	ctx = f.file->private_data;
 9303	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
 9304		goto out_fput;
 9305
 9306	ret = -EBADFD;
 9307	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
 9308		goto out;
 9309
 9310	/*
 9311	 * For SQ polling, the thread will do all submissions and completions.
 9312	 * Just return the requested submit count, and wake the thread if
 9313	 * we were asked to.
 9314	 */
 9315	ret = 0;
 9316	if (ctx->flags & IORING_SETUP_SQPOLL) {
 9317		io_cqring_overflow_flush(ctx, false);
 9318
 9319		ret = -EOWNERDEAD;
 9320		if (unlikely(ctx->sq_data->thread == NULL)) {
 9321			goto out;
 9322		}
 9323		if (flags & IORING_ENTER_SQ_WAKEUP)
 9324			wake_up(&ctx->sq_data->wait);
 9325		if (flags & IORING_ENTER_SQ_WAIT) {
 9326			ret = io_sqpoll_wait_sq(ctx);
 9327			if (ret)
 9328				goto out;
 9329		}
 9330		submitted = to_submit;
 9331	} else if (to_submit) {
 9332		ret = io_uring_add_task_file(ctx);
 9333		if (unlikely(ret))
 9334			goto out;
 9335		mutex_lock(&ctx->uring_lock);
 9336		submitted = io_submit_sqes(ctx, to_submit);
 9337		mutex_unlock(&ctx->uring_lock);
 9338
 9339		if (submitted != to_submit)
 9340			goto out;
 9341	}
 9342	if (flags & IORING_ENTER_GETEVENTS) {
 9343		const sigset_t __user *sig;
 9344		struct __kernel_timespec __user *ts;
 9345
 9346		ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
 9347		if (unlikely(ret))
 9348			goto out;
 9349
 9350		min_complete = min(min_complete, ctx->cq_entries);
 9351
 9352		/*
 9353		 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
 9354		 * space applications don't need to do io completion events
 9355		 * polling again, they can rely on io_sq_thread to do polling
 9356		 * work, which can reduce cpu usage and uring_lock contention.
 9357		 */
 9358		if (ctx->flags & IORING_SETUP_IOPOLL &&
 9359		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
 9360			ret = io_iopoll_check(ctx, min_complete);
 9361		} else {
 9362			ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
 9363		}
 9364	}
 9365
 9366out:
 9367	percpu_ref_put(&ctx->refs);
 9368out_fput:
 9369	fdput(f);
 9370	return submitted ? submitted : ret;
 9371}
 9372
 9373#ifdef CONFIG_PROC_FS
 9374static int io_uring_show_cred(struct seq_file *m, unsigned int id,
 9375		const struct cred *cred)
 9376{
 9377	struct user_namespace *uns = seq_user_ns(m);
 9378	struct group_info *gi;
 9379	kernel_cap_t cap;
 9380	unsigned __capi;
 9381	int g;
 9382
 9383	seq_printf(m, "%5d\n", id);
 9384	seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
 9385	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
 9386	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
 9387	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
 9388	seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
 9389	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
 9390	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
 9391	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
 9392	seq_puts(m, "\n\tGroups:\t");
 9393	gi = cred->group_info;
 9394	for (g = 0; g < gi->ngroups; g++) {
 9395		seq_put_decimal_ull(m, g ? " " : "",
 9396					from_kgid_munged(uns, gi->gid[g]));
 9397	}
 9398	seq_puts(m, "\n\tCapEff:\t");
 9399	cap = cred->cap_effective;
 9400	CAP_FOR_EACH_U32(__capi)
 9401		seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
 9402	seq_putc(m, '\n');
 9403	return 0;
 9404}
 9405
 9406static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 9407{
 9408	struct io_sq_data *sq = NULL;
 9409	bool has_lock;
 9410	int i;
 9411
 9412	/*
 9413	 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
 9414	 * since fdinfo case grabs it in the opposite direction of normal use
 9415	 * cases. If we fail to get the lock, we just don't iterate any
 9416	 * structures that could be going away outside the io_uring mutex.
 9417	 */
 9418	has_lock = mutex_trylock(&ctx->uring_lock);
 9419
 9420	if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
 9421		sq = ctx->sq_data;
 9422		if (!sq->thread)
 9423			sq = NULL;
 9424	}
 9425
 9426	seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
 9427	seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
 9428	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
 9429	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
 9430		struct file *f = io_file_from_index(ctx, i);
 9431
 9432		if (f)
 9433			seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
 9434		else
 9435			seq_printf(m, "%5u: <none>\n", i);
 9436	}
 9437	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
 9438	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
 9439		struct io_mapped_ubuf *buf = ctx->user_bufs[i];
 9440		unsigned int len = buf->ubuf_end - buf->ubuf;
 9441
 9442		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
 9443	}
 9444	if (has_lock && !xa_empty(&ctx->personalities)) {
 9445		unsigned long index;
 9446		const struct cred *cred;
 9447
 9448		seq_printf(m, "Personalities:\n");
 9449		xa_for_each(&ctx->personalities, index, cred)
 9450			io_uring_show_cred(m, index, cred);
 9451	}
 9452	seq_printf(m, "PollList:\n");
 9453	spin_lock_irq(&ctx->completion_lock);
 9454	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
 9455		struct hlist_head *list = &ctx->cancel_hash[i];
 9456		struct io_kiocb *req;
 9457
 9458		hlist_for_each_entry(req, list, hash_node)
 9459			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
 9460					req->task->task_works != NULL);
 9461	}
 9462	spin_unlock_irq(&ctx->completion_lock);
 9463	if (has_lock)
 9464		mutex_unlock(&ctx->uring_lock);
 9465}
 9466
 9467static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 9468{
 9469	struct io_ring_ctx *ctx = f->private_data;
 9470
 9471	if (percpu_ref_tryget(&ctx->refs)) {
 9472		__io_uring_show_fdinfo(ctx, m);
 9473		percpu_ref_put(&ctx->refs);
 9474	}
 9475}
 9476#endif
 9477
/*
 * File operations of the anonymous io_uring file. io_uring_enter() also
 * compares f_op against this table to recognise io_uring fds.
 */
static const struct file_operations io_uring_fops = {
	.release	= io_uring_release,
	.mmap		= io_uring_mmap,
#ifndef CONFIG_MMU
	/* !MMU builds need these two extra hooks for mapping the rings */
	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
	.poll		= io_uring_poll,
	.fasync		= io_uring_fasync,
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= io_uring_show_fdinfo,
#endif
};
 9491
 9492static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 9493				  struct io_uring_params *p)
 9494{
 9495	struct io_rings *rings;
 9496	size_t size, sq_array_offset;
 9497
 9498	/* make sure these are sane, as we already accounted them */
 9499	ctx->sq_entries = p->sq_entries;
 9500	ctx->cq_entries = p->cq_entries;
 9501
 9502	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
 9503	if (size == SIZE_MAX)
 9504		return -EOVERFLOW;
 9505
 9506	rings = io_mem_alloc(size);
 9507	if (!rings)
 9508		return -ENOMEM;
 9509
 9510	ctx->rings = rings;
 9511	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
 9512	rings->sq_ring_mask = p->sq_entries - 1;
 9513	rings->cq_ring_mask = p->cq_entries - 1;
 9514	rings->sq_ring_entries = p->sq_entries;
 9515	rings->cq_ring_entries = p->cq_entries;
 9516	ctx->sq_mask = rings->sq_ring_mask;
 9517	ctx->cq_mask = rings->cq_ring_mask;
 9518
 9519	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
 9520	if (size == SIZE_MAX) {
 9521		io_mem_free(ctx->rings);
 9522		ctx->rings = NULL;
 9523		return -EOVERFLOW;
 9524	}
 9525
 9526	ctx->sq_sqes = io_mem_alloc(size);
 9527	if (!ctx->sq_sqes) {
 9528		io_mem_free(ctx->rings);
 9529		ctx->rings = NULL;
 9530		return -ENOMEM;
 9531	}
 9532
 9533	return 0;
 9534}
 9535
 9536static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
 9537{
 9538	int ret, fd;
 9539
 9540	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
 9541	if (fd < 0)
 9542		return fd;
 9543
 9544	ret = io_uring_add_task_file(ctx);
 9545	if (ret) {
 9546		put_unused_fd(fd);
 9547		return ret;
 9548	}
 9549	fd_install(fd, file);
 9550	return fd;
 9551}
 9552
 9553/*
 9554 * Allocate an anonymous fd, this is what constitutes the application
 9555 * visible backing of an io_uring instance. The application mmaps this
 9556 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
 9557 * we have to tie this fd to a socket for file garbage collection purposes.
 9558 */
 9559static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
 9560{
 9561	struct file *file;
 9562#if defined(CONFIG_UNIX)
 9563	int ret;
 9564
 9565	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
 9566				&ctx->ring_sock);
 9567	if (ret)
 9568		return ERR_PTR(ret);
 9569#endif
 9570
 9571	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
 9572					O_RDWR | O_CLOEXEC);
 9573#if defined(CONFIG_UNIX)
 9574	if (IS_ERR(file)) {
 9575		sock_release(ctx->ring_sock);
 9576		ctx->ring_sock = NULL;
 9577	} else {
 9578		ctx->ring_sock->file = file;
 9579	}
 9580#endif
 9581	return file;
 9582}
 9583
 9584static int io_uring_create(unsigned entries, struct io_uring_params *p,
 9585			   struct io_uring_params __user *params)
 9586{
 9587	struct io_ring_ctx *ctx;
 9588	struct file *file;
 9589	int ret;
 9590
 9591	if (!entries)
 9592		return -EINVAL;
 9593	if (entries > IORING_MAX_ENTRIES) {
 9594		if (!(p->flags & IORING_SETUP_CLAMP))
 9595			return -EINVAL;
 9596		entries = IORING_MAX_ENTRIES;
 9597	}
 9598
 9599	/*
 9600	 * Use twice as many entries for the CQ ring. It's possible for the
 9601	 * application to drive a higher depth than the size of the SQ ring,
 9602	 * since the sqes are only used at submission time. This allows for
 9603	 * some flexibility in overcommitting a bit. If the application has
 9604	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
 9605	 * of CQ ring entries manually.
 9606	 */
 9607	p->sq_entries = roundup_pow_of_two(entries);
 9608	if (p->flags & IORING_SETUP_CQSIZE) {
 9609		/*
 9610		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
 9611		 * to a power-of-two, if it isn't already. We do NOT impose
 9612		 * any cq vs sq ring sizing.
 9613		 */
 9614		if (!p->cq_entries)
 9615			return -EINVAL;
 9616		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
 9617			if (!(p->flags & IORING_SETUP_CLAMP))
 9618				return -EINVAL;
 9619			p->cq_entries = IORING_MAX_CQ_ENTRIES;
 9620		}
 9621		p->cq_entries = roundup_pow_of_two(p->cq_entries);
 9622		if (p->cq_entries < p->sq_entries)
 9623			return -EINVAL;
 9624	} else {
 9625		p->cq_entries = 2 * p->sq_entries;
 9626	}
 9627
 9628	ctx = io_ring_ctx_alloc(p);
 9629	if (!ctx)
 9630		return -ENOMEM;
 9631	ctx->compat = in_compat_syscall();
 9632	if (!capable(CAP_IPC_LOCK))
 9633		ctx->user = get_uid(current_user());
 9634
 9635	/*
 9636	 * This is just grabbed for accounting purposes. When a process exits,
 9637	 * the mm is exited and dropped before the files, hence we need to hang
 9638	 * on to this mm purely for the purposes of being able to unaccount
 9639	 * memory (locked/pinned vm). It's not used for anything else.
 9640	 */
 9641	mmgrab(current->mm);
 9642	ctx->mm_account = current->mm;
 9643
 9644	ret = io_allocate_scq_urings(ctx, p);
 9645	if (ret)
 9646		goto err;
 9647
 9648	ret = io_sq_offload_create(ctx, p);
 9649	if (ret)
 9650		goto err;
 9651	/* always set a rsrc node */
 9652	ret = io_rsrc_node_switch_start(ctx);
 9653	if (ret)
 9654		goto err;
 9655	io_rsrc_node_switch(ctx, NULL);
 9656
 9657	memset(&p->sq_off, 0, sizeof(p->sq_off));
 9658	p->sq_off.head = offsetof(struct io_rings, sq.head);
 9659	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
 9660	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
 9661	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
 9662	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
 9663	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
 9664	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
 9665
 9666	memset(&p->cq_off, 0, sizeof(p->cq_off));
 9667	p->cq_off.head = offsetof(struct io_rings, cq.head);
 9668	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
 9669	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
 9670	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
 9671	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
 9672	p->cq_off.cqes = offsetof(struct io_rings, cqes);
 9673	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
 9674
 9675	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
 9676			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
 9677			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
 9678			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
 9679			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
 9680			IORING_FEAT_RSRC_TAGS;
 9681
 9682	if (copy_to_user(params, p, sizeof(*p))) {
 9683		ret = -EFAULT;
 9684		goto err;
 9685	}
 9686
 9687	file = io_uring_get_file(ctx);
 9688	if (IS_ERR(file)) {
 9689		ret = PTR_ERR(file);
 9690		goto err;
 9691	}
 9692
 9693	/*
 9694	 * Install ring fd as the very last thing, so we don't risk someone
 9695	 * having closed it before we finish setup
 9696	 */
 9697	ret = io_uring_install_fd(ctx, file);
 9698	if (ret < 0) {
 9699		/* fput will clean it up */
 9700		fput(file);
 9701		return ret;
 9702	}
 9703
 9704	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
 9705	return ret;
 9706err:
 9707	io_ring_ctx_wait_and_kill(ctx);
 9708	return ret;
 9709}
 9710
 9711/*
 9712 * Sets up an aio uring context, and returns the fd. Applications asks for a
 9713 * ring size, we return the actual sq/cq ring sizes (among other things) in the
 9714 * params structure passed in.
 9715 */
 9716static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 9717{
 9718	struct io_uring_params p;
 9719	int i;
 9720
 9721	if (copy_from_user(&p, params, sizeof(p)))
 9722		return -EFAULT;
 9723	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
 9724		if (p.resv[i])
 9725			return -EINVAL;
 9726	}
 9727
 9728	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
 9729			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
 9730			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
 9731			IORING_SETUP_R_DISABLED))
 9732		return -EINVAL;
 9733
 9734	return  io_uring_create(entries, &p, params);
 9735}
 9736
/* io_uring_setup() system call entry point; forwards to io_uring_setup(). */
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}
 9742
 9743static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
 9744{
 9745	struct io_uring_probe *p;
 9746	size_t size;
 9747	int i, ret;
 9748
 9749	size = struct_size(p, ops, nr_args);
 9750	if (size == SIZE_MAX)
 9751		return -EOVERFLOW;
 9752	p = kzalloc(size, GFP_KERNEL);
 9753	if (!p)
 9754		return -ENOMEM;
 9755
 9756	ret = -EFAULT;
 9757	if (copy_from_user(p, arg, size))
 9758		goto out;
 9759	ret = -EINVAL;
 9760	if (memchr_inv(p, 0, size))
 9761		goto out;
 9762
 9763	p->last_op = IORING_OP_LAST - 1;
 9764	if (nr_args > IORING_OP_LAST)
 9765		nr_args = IORING_OP_LAST;
 9766
 9767	for (i = 0; i < nr_args; i++) {
 9768		p->ops[i].op = i;
 9769		if (!io_op_defs[i].not_supported)
 9770			p->ops[i].flags = IO_URING_OP_SUPPORTED;
 9771	}
 9772	p->ops_len = i;
 9773
 9774	ret = 0;
 9775	if (copy_to_user(arg, p, size))
 9776		ret = -EFAULT;
 9777out:
 9778	kfree(p);
 9779	return ret;
 9780}
 9781
 9782static int io_register_personality(struct io_ring_ctx *ctx)
 9783{
 9784	const struct cred *creds;
 9785	u32 id;
 9786	int ret;
 9787
 9788	creds = get_current_cred();
 9789
 9790	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
 9791			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
 9792	if (!ret)
 9793		return id;
 9794	put_cred(creds);
 9795	return ret;
 9796}
 9797
 9798static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
 9799				    unsigned int nr_args)
 9800{
 9801	struct io_uring_restriction *res;
 9802	size_t size;
 9803	int i, ret;
 9804
 9805	/* Restrictions allowed only if rings started disabled */
 9806	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
 9807		return -EBADFD;
 9808
 9809	/* We allow only a single restrictions registration */
 9810	if (ctx->restrictions.registered)
 9811		return -EBUSY;
 9812
 9813	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
 9814		return -EINVAL;
 9815
 9816	size = array_size(nr_args, sizeof(*res));
 9817	if (size == SIZE_MAX)
 9818		return -EOVERFLOW;
 9819
 9820	res = memdup_user(arg, size);
 9821	if (IS_ERR(res))
 9822		return PTR_ERR(res);
 9823
 9824	ret = 0;
 9825
 9826	for (i = 0; i < nr_args; i++) {
 9827		switch (res[i].opcode) {
 9828		case IORING_RESTRICTION_REGISTER_OP:
 9829			if (res[i].register_op >= IORING_REGISTER_LAST) {
 9830				ret = -EINVAL;
 9831				goto out;
 9832			}
 9833
 9834			__set_bit(res[i].register_op,
 9835				  ctx->restrictions.register_op);
 9836			break;
 9837		case IORING_RESTRICTION_SQE_OP:
 9838			if (res[i].sqe_op >= IORING_OP_LAST) {
 9839				ret = -EINVAL;
 9840				goto out;
 9841			}
 9842
 9843			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
 9844			break;
 9845		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
 9846			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
 9847			break;
 9848		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
 9849			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
 9850			break;
 9851		default:
 9852			ret = -EINVAL;
 9853			goto out;
 9854		}
 9855	}
 9856
 9857out:
 9858	/* Reset all restrictions if an error happened */
 9859	if (ret != 0)
 9860		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
 9861	else
 9862		ctx->restrictions.registered = true;
 9863
 9864	kfree(res);
 9865	return ret;
 9866}
 9867
 9868static int io_register_enable_rings(struct io_ring_ctx *ctx)
 9869{
 9870	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
 9871		return -EBADFD;
 9872
 9873	if (ctx->restrictions.registered)
 9874		ctx->restricted = 1;
 9875
 9876	ctx->flags &= ~IORING_SETUP_R_DISABLED;
 9877	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
 9878		wake_up(&ctx->sq_data->wait);
 9879	return 0;
 9880}
 9881
 9882static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
 9883				     struct io_uring_rsrc_update2 *up,
 9884				     unsigned nr_args)
 9885{
 9886	__u32 tmp;
 9887	int err;
 9888
 9889	if (up->resv)
 9890		return -EINVAL;
 9891	if (check_add_overflow(up->offset, nr_args, &tmp))
 9892		return -EOVERFLOW;
 9893	err = io_rsrc_node_switch_start(ctx);
 9894	if (err)
 9895		return err;
 9896
 9897	switch (type) {
 9898	case IORING_RSRC_FILE:
 9899		return __io_sqe_files_update(ctx, up, nr_args);
 9900	case IORING_RSRC_BUFFER:
 9901		return __io_sqe_buffers_update(ctx, up, nr_args);
 9902	}
 9903	return -EINVAL;
 9904}
 9905
 9906static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
 9907				    unsigned nr_args)
 9908{
 9909	struct io_uring_rsrc_update2 up;
 9910
 9911	if (!nr_args)
 9912		return -EINVAL;
 9913	memset(&up, 0, sizeof(up));
 9914	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
 9915		return -EFAULT;
 9916	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
 9917}
 9918
 9919static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
 9920				   unsigned size, unsigned type)
 9921{
 9922	struct io_uring_rsrc_update2 up;
 9923
 9924	if (size != sizeof(up))
 9925		return -EINVAL;
 9926	if (copy_from_user(&up, arg, sizeof(up)))
 9927		return -EFAULT;
 9928	if (!up.nr || up.resv)
 9929		return -EINVAL;
 9930	return __io_register_rsrc_update(ctx, type, &up, up.nr);
 9931}
 9932
 9933static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 9934			    unsigned int size, unsigned int type)
 9935{
 9936	struct io_uring_rsrc_register rr;
 9937
 9938	/* keep it extendible */
 9939	if (size != sizeof(rr))
 9940		return -EINVAL;
 9941
 9942	memset(&rr, 0, sizeof(rr));
 9943	if (copy_from_user(&rr, arg, size))
 9944		return -EFAULT;
 9945	if (!rr.nr || rr.resv || rr.resv2)
 9946		return -EINVAL;
 9947
 9948	switch (type) {
 9949	case IORING_RSRC_FILE:
 9950		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
 9951					     rr.nr, u64_to_user_ptr(rr.tags));
 9952	case IORING_RSRC_BUFFER:
 9953		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
 9954					       rr.nr, u64_to_user_ptr(rr.tags));
 9955	}
 9956	return -EINVAL;
 9957}
 9958
 9959static bool io_register_op_must_quiesce(int op)
 9960{
 9961	switch (op) {
 9962	case IORING_REGISTER_BUFFERS:
 9963	case IORING_UNREGISTER_BUFFERS:
 9964	case IORING_REGISTER_FILES:
 9965	case IORING_UNREGISTER_FILES:
 9966	case IORING_REGISTER_FILES_UPDATE:
 9967	case IORING_REGISTER_PROBE:
 9968	case IORING_REGISTER_PERSONALITY:
 9969	case IORING_UNREGISTER_PERSONALITY:
 9970	case IORING_REGISTER_FILES2:
 9971	case IORING_REGISTER_FILES_UPDATE2:
 9972	case IORING_REGISTER_BUFFERS2:
 9973	case IORING_REGISTER_BUFFERS_UPDATE:
 9974		return false;
 9975	default:
 9976		return true;
 9977	}
 9978}
 9979
 9980static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 9981			       void __user *arg, unsigned nr_args)
 9982	__releases(ctx->uring_lock)
 9983	__acquires(ctx->uring_lock)
 9984{
 9985	int ret;
 9986
 9987	/*
 9988	 * We're inside the ring mutex, if the ref is already dying, then
 9989	 * someone else killed the ctx or is already going through
 9990	 * io_uring_register().
 9991	 */
 9992	if (percpu_ref_is_dying(&ctx->refs))
 9993		return -ENXIO;
 9994
 9995	if (ctx->restricted) {
 9996		if (opcode >= IORING_REGISTER_LAST)
 9997			return -EINVAL;
 9998		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
 9999		if (!test_bit(opcode, ctx->restrictions.register_op))
10000			return -EACCES;
10001	}
10002
10003	if (io_register_op_must_quiesce(opcode)) {
10004		percpu_ref_kill(&ctx->refs);
10005
10006		/*
10007		 * Drop uring mutex before waiting for references to exit. If
10008		 * another thread is currently inside io_uring_enter() it might
10009		 * need to grab the uring_lock to make progress. If we hold it
10010		 * here across the drain wait, then we can deadlock. It's safe
10011		 * to drop the mutex here, since no new references will come in
10012		 * after we've killed the percpu ref.
10013		 */
10014		mutex_unlock(&ctx->uring_lock);
10015		do {
10016			ret = wait_for_completion_interruptible(&ctx->ref_comp);
10017			if (!ret)
10018				break;
10019			ret = io_run_task_work_sig();
10020			if (ret < 0)
10021				break;
10022		} while (1);
10023		mutex_lock(&ctx->uring_lock);
10024
10025		if (ret) {
10026			io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
10027			return ret;
10028		}
10029	}
10030
10031	switch (opcode) {
10032	case IORING_REGISTER_BUFFERS:
10033		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
10034		break;
10035	case IORING_UNREGISTER_BUFFERS:
10036		ret = -EINVAL;
10037		if (arg || nr_args)
10038			break;
10039		ret = io_sqe_buffers_unregister(ctx);
10040		break;
10041	case IORING_REGISTER_FILES:
10042		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
10043		break;
10044	case IORING_UNREGISTER_FILES:
10045		ret = -EINVAL;
10046		if (arg || nr_args)
10047			break;
10048		ret = io_sqe_files_unregister(ctx);
10049		break;
10050	case IORING_REGISTER_FILES_UPDATE:
10051		ret = io_register_files_update(ctx, arg, nr_args);
10052		break;
10053	case IORING_REGISTER_EVENTFD:
10054	case IORING_REGISTER_EVENTFD_ASYNC:
10055		ret = -EINVAL;
10056		if (nr_args != 1)
10057			break;
10058		ret = io_eventfd_register(ctx, arg);
10059		if (ret)
10060			break;
10061		if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
10062			ctx->eventfd_async = 1;
10063		else
10064			ctx->eventfd_async = 0;
10065		break;
10066	case IORING_UNREGISTER_EVENTFD:
10067		ret = -EINVAL;
10068		if (arg || nr_args)
10069			break;
10070		ret = io_eventfd_unregister(ctx);
10071		break;
10072	case IORING_REGISTER_PROBE:
10073		ret = -EINVAL;
10074		if (!arg || nr_args > 256)
10075			break;
10076		ret = io_probe(ctx, arg, nr_args);
10077		break;
10078	case IORING_REGISTER_PERSONALITY:
10079		ret = -EINVAL;
10080		if (arg || nr_args)
10081			break;
10082		ret = io_register_personality(ctx);
10083		break;
10084	case IORING_UNREGISTER_PERSONALITY:
10085		ret = -EINVAL;
10086		if (arg)
10087			break;
10088		ret = io_unregister_personality(ctx, nr_args);
10089		break;
10090	case IORING_REGISTER_ENABLE_RINGS:
10091		ret = -EINVAL;
10092		if (arg || nr_args)
10093			break;
10094		ret = io_register_enable_rings(ctx);
10095		break;
10096	case IORING_REGISTER_RESTRICTIONS:
10097		ret = io_register_restrictions(ctx, arg, nr_args);
10098		break;
10099	case IORING_REGISTER_FILES2:
10100		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
10101		break;
10102	case IORING_REGISTER_FILES_UPDATE2:
10103		ret = io_register_rsrc_update(ctx, arg, nr_args,
10104					      IORING_RSRC_FILE);
10105		break;
10106	case IORING_REGISTER_BUFFERS2:
10107		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
10108		break;
10109	case IORING_REGISTER_BUFFERS_UPDATE:
10110		ret = io_register_rsrc_update(ctx, arg, nr_args,
10111					      IORING_RSRC_BUFFER);
10112		break;
10113	default:
10114		ret = -EINVAL;
10115		break;
10116	}
10117
10118	if (io_register_op_must_quiesce(opcode)) {
10119		/* bring the ctx back to life */
10120		percpu_ref_reinit(&ctx->refs);
10121		reinit_completion(&ctx->ref_comp);
10122	}
10123	return ret;
10124}
10125
10126SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
10127		void __user *, arg, unsigned int, nr_args)
10128{
10129	struct io_ring_ctx *ctx;
10130	long ret = -EBADF;
10131	struct fd f;
10132
10133	f = fdget(fd);
10134	if (!f.file)
10135		return -EBADF;
10136
10137	ret = -EOPNOTSUPP;
10138	if (f.file->f_op != &io_uring_fops)
10139		goto out_fput;
10140
10141	ctx = f.file->private_data;
10142
10143	io_run_task_work();
10144
10145	mutex_lock(&ctx->uring_lock);
10146	ret = __io_uring_register(ctx, opcode, arg, nr_args);
10147	mutex_unlock(&ctx->uring_lock);
10148	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
10149							ctx->cq_ev_fd != NULL, ret);
10150out_fput:
10151	fdput(f);
10152	return ret;
10153}
10154
10155static int __init io_uring_init(void)
10156{
10157#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
10158	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
10159	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
10160} while (0)
10161
10162#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
10163	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
10164	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
10165	BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
10166	BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
10167	BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
10168	BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
10169	BUILD_BUG_SQE_ELEM(8,  __u64,  off);
10170	BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
10171	BUILD_BUG_SQE_ELEM(16, __u64,  addr);
10172	BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
10173	BUILD_BUG_SQE_ELEM(24, __u32,  len);
10174	BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
10175	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
10176	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
10177	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
10178	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
10179	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
10180	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
10181	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
10182	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
10183	BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
10184	BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
10185	BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
10186	BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
10187	BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
10188	BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
10189	BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
10190	BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
10191	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
10192	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
10193
10194	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
10195		     sizeof(struct io_uring_rsrc_update));
10196	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
10197		     sizeof(struct io_uring_rsrc_update2));
10198	/* should fit into one byte */
10199	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
10200
10201	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
10202	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
10203	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
10204				SLAB_ACCOUNT);
10205	return 0;
10206};
10207__initcall(io_uring_init);