fs/io_uring.c at v5.17-rc7 · tjh.dev/kernel

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / io_uring.c
at v5.17-rc7 11250 lines 285 kB view raw
wrap content
// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes and to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
   42#include <linux/kernel.h>
   43#include <linux/init.h>
   44#include <linux/errno.h>
   45#include <linux/syscalls.h>
   46#include <linux/compat.h>
   47#include <net/compat.h>
   48#include <linux/refcount.h>
   49#include <linux/uio.h>
   50#include <linux/bits.h>
   51
   52#include <linux/sched/signal.h>
   53#include <linux/fs.h>
   54#include <linux/file.h>
   55#include <linux/fdtable.h>
   56#include <linux/mm.h>
   57#include <linux/mman.h>
   58#include <linux/percpu.h>
   59#include <linux/slab.h>
   60#include <linux/blk-mq.h>
   61#include <linux/bvec.h>
   62#include <linux/net.h>
   63#include <net/sock.h>
   64#include <net/af_unix.h>
   65#include <net/scm.h>
   66#include <linux/anon_inodes.h>
   67#include <linux/sched/mm.h>
   68#include <linux/uaccess.h>
   69#include <linux/nospec.h>
   70#include <linux/sizes.h>
   71#include <linux/hugetlb.h>
   72#include <linux/highmem.h>
   73#include <linux/namei.h>
   74#include <linux/fsnotify.h>
   75#include <linux/fadvise.h>
   76#include <linux/eventpoll.h>
   77#include <linux/splice.h>
   78#include <linux/task_work.h>
   79#include <linux/pagemap.h>
   80#include <linux/io_uring.h>
   81#include <linux/tracehook.h>
   82#include <linux/audit.h>
   83#include <linux/security.h>
   84
   85#define CREATE_TRACE_POINTS
   86#include <trace/events/io_uring.h>
   87
   88#include <uapi/linux/io_uring.h>
   89
   90#include "internal.h"
   91#include "io-wq.h"
   92
   /* Ring size limits; the CQ limit is twice the SQ limit. */
#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8

/* only a maximum is defined for the fixed file table */
#define IORING_MAX_FIXED_FILES	(1U << 15)
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

/*
 * Resource tag table geometry, derived from the page size.
 * NOTE(review): presumably one page holds 2^(PAGE_SHIFT - 3) eight-byte
 * (u64) tags - confirm against the tag table allocator.
 */
#define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)

#define IORING_MAX_REG_BUFFERS	(1U << 14)

/* SQE flags grouped into reusable masks */
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)

#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)

/* request flags grouped under the "clean" mask */
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
				REQ_F_ASYNC_DATA)

#define IO_TCTX_REFS_CACHE_NR	(1U << 10)
  119
  120struct io_uring {
  121	u32 head ____cacheline_aligned_in_smp;
  122	u32 tail ____cacheline_aligned_in_smp;
  123};
  124
  125/*
  126 * This data is shared with the application through the mmap at offsets
  127 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
  128 *
  129 * The offsets to the member fields are published through struct
  130 * io_sqring_offsets when calling io_uring_setup.
  131 */
  132struct io_rings {
  133	/*
  134	 * Head and tail offsets into the ring; the offsets need to be
  135	 * masked to get valid indices.
  136	 *
  137	 * The kernel controls head of the sq ring and the tail of the cq ring,
  138	 * and the application controls tail of the sq ring and the head of the
  139	 * cq ring.
  140	 */
  141	struct io_uring		sq, cq;
  142	/*
  143	 * Bitmasks to apply to head and tail offsets (constant, equals
  144	 * ring_entries - 1)
  145	 */
  146	u32			sq_ring_mask, cq_ring_mask;
  147	/* Ring sizes (constant, power of 2) */
  148	u32			sq_ring_entries, cq_ring_entries;
  149	/*
  150	 * Number of invalid entries dropped by the kernel due to
  151	 * invalid index stored in array
  152	 *
  153	 * Written by the kernel, shouldn't be modified by the
  154	 * application (i.e. get number of "new events" by comparing to
  155	 * cached value).
  156	 *
  157	 * After a new SQ head value was read by the application this
  158	 * counter includes all submissions that were dropped reaching
  159	 * the new SQ head (and possibly more).
  160	 */
  161	u32			sq_dropped;
  162	/*
  163	 * Runtime SQ flags
  164	 *
  165	 * Written by the kernel, shouldn't be modified by the
  166	 * application.
  167	 *
  168	 * The application needs a full memory barrier before checking
  169	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
  170	 */
  171	u32			sq_flags;
  172	/*
  173	 * Runtime CQ flags
  174	 *
  175	 * Written by the application, shouldn't be modified by the
  176	 * kernel.
  177	 */
  178	u32			cq_flags;
  179	/*
  180	 * Number of completion events lost because the queue was full;
  181	 * this should be avoided by the application by making sure
  182	 * there are not more requests pending than there is space in
  183	 * the completion queue.
  184	 *
  185	 * Written by the kernel, shouldn't be modified by the
  186	 * application (i.e. get number of "new events" by comparing to
  187	 * cached value).
  188	 *
  189	 * As completion events come in out of order this counter is not
  190	 * ordered with any other data.
  191	 */
  192	u32			cq_overflow;
  193	/*
  194	 * Ring buffer of completion events.
  195	 *
  196	 * The kernel writes completion events fresh every time they are
  197	 * produced, so the application is allowed to modify pending
  198	 * entries.
  199	 */
  200	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
  201};
  202
  /*
 * Internal command flags. IO_URING_F_NONBLOCK is INT_MIN, i.e. the sign
 * bit of an int, so it can be tested with a sign check.
 */
enum io_uring_cmd_flags {
	IO_URING_F_COMPLETE_DEFER	= 1,
	IO_URING_F_UNLOCKED		= 2,
	/* int's last bit, sign checks are usually faster than a bit test */
	IO_URING_F_NONBLOCK		= INT_MIN,
};
  209
  210struct io_mapped_ubuf {
  211	u64		ubuf;
  212	u64		ubuf_end;
  213	unsigned int	nr_bvecs;
  214	unsigned long	acct_pages;
  215	struct bio_vec	bvec[];
  216};
  217
  /* forward declaration: the structs below point at the context before it is defined */
struct io_ring_ctx;
  219
  220struct io_overflow_cqe {
  221	struct io_uring_cqe cqe;
  222	struct list_head list;
  223};
  224
  /* One slot of the fixed file table (see struct io_file_table). */
struct io_fixed_file {
	/* file * with additional FFS_* flags */
	unsigned long file_ptr;
};
  229
  230struct io_rsrc_put {
  231	struct list_head list;
  232	u64 tag;
  233	union {
  234		void *rsrc;
  235		struct file *file;
  236		struct io_mapped_ubuf *buf;
  237	};
  238};
  239
  /* Fixed file table: a pointer to io_fixed_file slots. */
struct io_file_table {
	struct io_fixed_file *files;
};
  243
  244struct io_rsrc_node {
  245	struct percpu_ref		refs;
  246	struct list_head		node;
  247	struct list_head		rsrc_list;
  248	struct io_rsrc_data		*rsrc_data;
  249	struct llist_node		llist;
  250	bool				done;
  251};
  252
  /* Callback type for putting one resource; stored as io_rsrc_data->do_put. */
typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
  254
  255struct io_rsrc_data {
  256	struct io_ring_ctx		*ctx;
  257
  258	u64				**tags;
  259	unsigned int			nr;
  260	rsrc_put_fn			*do_put;
  261	atomic_t			refs;
  262	struct completion		done;
  263	bool				quiesce;
  264};
  265
  266struct io_buffer {
  267	struct list_head list;
  268	__u64 addr;
  269	__u32 len;
  270	__u16 bid;
  271};
  272
  273struct io_restriction {
  274	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
  275	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
  276	u8 sqe_flags_allowed;
  277	u8 sqe_flags_required;
  278	bool registered;
  279};
  280
  /*
 * SQ thread control values.
 * NOTE(review): presumably bit numbers in io_sq_data->state - confirm.
 */
enum {
	IO_SQ_THREAD_SHOULD_STOP = 0,
	IO_SQ_THREAD_SHOULD_PARK,
};
  285
  286struct io_sq_data {
  287	refcount_t		refs;
  288	atomic_t		park_pending;
  289	struct mutex		lock;
  290
  291	/* ctx's that are using this sqd */
  292	struct list_head	ctx_list;
  293
  294	struct task_struct	*thread;
  295	struct wait_queue_head	wait;
  296
  297	unsigned		sq_thread_idle;
  298	int			sq_cpu;
  299	pid_t			task_pid;
  300	pid_t			task_tgid;
  301
  302	unsigned long		state;
  303	struct completion	exited;
  304};
  305
  /* batch and cache sizes for completions and request allocation */
#define IO_COMPL_BATCH			32
#define IO_REQ_CACHE_SIZE		32
#define IO_REQ_ALLOC_BATCH		8
  309
  /* Head and last request of a chain; embedded in io_submit_state as ->link. */
struct io_submit_link {
	struct io_kiocb		*head;
	struct io_kiocb		*last;
};
  314
  315struct io_submit_state {
  316	/* inline/task_work completion list, under ->uring_lock */
  317	struct io_wq_work_node	free_list;
  318	/* batch completion logic */
  319	struct io_wq_work_list	compl_reqs;
  320	struct io_submit_link	link;
  321
  322	bool			plug_started;
  323	bool			need_plug;
  324	bool			flush_cqes;
  325	unsigned short		submit_nr;
  326	struct blk_plug		plug;
  327};
  328
  329struct io_ring_ctx {
  330	/* const or read-mostly hot data */
  331	struct {
  332		struct percpu_ref	refs;
  333
  334		struct io_rings		*rings;
  335		unsigned int		flags;
  336		unsigned int		compat: 1;
  337		unsigned int		drain_next: 1;
  338		unsigned int		eventfd_async: 1;
  339		unsigned int		restricted: 1;
  340		unsigned int		off_timeout_used: 1;
  341		unsigned int		drain_active: 1;
  342		unsigned int		drain_disabled: 1;
  343	} ____cacheline_aligned_in_smp;
  344
  345	/* submission data */
  346	struct {
  347		struct mutex		uring_lock;
  348
  349		/*
  350		 * Ring buffer of indices into array of io_uring_sqe, which is
  351		 * mmapped by the application using the IORING_OFF_SQES offset.
  352		 *
  353		 * This indirection could e.g. be used to assign fixed
  354		 * io_uring_sqe entries to operations and only submit them to
  355		 * the queue when needed.
  356		 *
  357		 * The kernel modifies neither the indices array nor the entries
  358		 * array.
  359		 */
  360		u32			*sq_array;
  361		struct io_uring_sqe	*sq_sqes;
  362		unsigned		cached_sq_head;
  363		unsigned		sq_entries;
  364		struct list_head	defer_list;
  365
  366		/*
  367		 * Fixed resources fast path, should be accessed only under
  368		 * uring_lock, and updated through io_uring_register(2)
  369		 */
  370		struct io_rsrc_node	*rsrc_node;
  371		int			rsrc_cached_refs;
  372		struct io_file_table	file_table;
  373		unsigned		nr_user_files;
  374		unsigned		nr_user_bufs;
  375		struct io_mapped_ubuf	**user_bufs;
  376
  377		struct io_submit_state	submit_state;
  378		struct list_head	timeout_list;
  379		struct list_head	ltimeout_list;
  380		struct list_head	cq_overflow_list;
  381		struct xarray		io_buffers;
  382		struct xarray		personalities;
  383		u32			pers_next;
  384		unsigned		sq_thread_idle;
  385	} ____cacheline_aligned_in_smp;
  386
  387	/* IRQ completion list, under ->completion_lock */
  388	struct io_wq_work_list	locked_free_list;
  389	unsigned int		locked_free_nr;
  390
  391	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
  392	struct io_sq_data	*sq_data;	/* if using sq thread polling */
  393
  394	struct wait_queue_head	sqo_sq_wait;
  395	struct list_head	sqd_list;
  396
  397	unsigned long		check_cq_overflow;
  398
  399	struct {
  400		unsigned		cached_cq_tail;
  401		unsigned		cq_entries;
  402		struct eventfd_ctx	*cq_ev_fd;
  403		struct wait_queue_head	cq_wait;
  404		unsigned		cq_extra;
  405		atomic_t		cq_timeouts;
  406		unsigned		cq_last_tm_flush;
  407	} ____cacheline_aligned_in_smp;
  408
  409	struct {
  410		spinlock_t		completion_lock;
  411
  412		spinlock_t		timeout_lock;
  413
  414		/*
  415		 * ->iopoll_list is protected by the ctx->uring_lock for
  416		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
  417		 * For SQPOLL, only the single threaded io_sq_thread() will
  418		 * manipulate the list, hence no extra locking is needed there.
  419		 */
  420		struct io_wq_work_list	iopoll_list;
  421		struct hlist_head	*cancel_hash;
  422		unsigned		cancel_hash_bits;
  423		bool			poll_multi_queue;
  424	} ____cacheline_aligned_in_smp;
  425
  426	struct io_restriction		restrictions;
  427
  428	/* slow path rsrc auxilary data, used by update/register */
  429	struct {
  430		struct io_rsrc_node		*rsrc_backup_node;
  431		struct io_mapped_ubuf		*dummy_ubuf;
  432		struct io_rsrc_data		*file_data;
  433		struct io_rsrc_data		*buf_data;
  434
  435		struct delayed_work		rsrc_put_work;
  436		struct llist_head		rsrc_put_llist;
  437		struct list_head		rsrc_ref_list;
  438		spinlock_t			rsrc_ref_lock;
  439	};
  440
  441	/* Keep this last, we don't need it for the fast path */
  442	struct {
  443		#if defined(CONFIG_UNIX)
  444			struct socket		*ring_sock;
  445		#endif
  446		/* hashed buffered write serialization */
  447		struct io_wq_hash		*hash_map;
  448
  449		/* Only used for accounting purposes */
  450		struct user_struct		*user;
  451		struct mm_struct		*mm_account;
  452
  453		/* ctx exit and cancelation */
  454		struct llist_head		fallback_llist;
  455		struct delayed_work		fallback_work;
  456		struct work_struct		exit_work;
  457		struct list_head		tctx_list;
  458		struct completion		ref_comp;
  459		u32				iowq_limits[2];
  460		bool				iowq_limits_set;
  461	};
  462};
  463
  464struct io_uring_task {
  465	/* submission side */
  466	int			cached_refs;
  467	struct xarray		xa;
  468	struct wait_queue_head	wait;
  469	const struct io_ring_ctx *last;
  470	struct io_wq		*io_wq;
  471	struct percpu_counter	inflight;
  472	atomic_t		inflight_tracked;
  473	atomic_t		in_idle;
  474
  475	spinlock_t		task_lock;
  476	struct io_wq_work_list	task_list;
  477	struct io_wq_work_list	prior_task_list;
  478	struct callback_head	task_work;
  479	bool			task_running;
  480};
  481
  482/*
  483 * First field must be the file pointer in all the
  484 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
  485 */
  486struct io_poll_iocb {
  487	struct file			*file;
  488	struct wait_queue_head		*head;
  489	__poll_t			events;
  490	struct wait_queue_entry		wait;
  491};
  492
  493struct io_poll_update {
  494	struct file			*file;
  495	u64				old_user_data;
  496	u64				new_user_data;
  497	__poll_t			events;
  498	bool				update_events;
  499	bool				update_user_data;
  500};
  501
  502struct io_close {
  503	struct file			*file;
  504	int				fd;
  505	u32				file_slot;
  506};
  507
  508struct io_timeout_data {
  509	struct io_kiocb			*req;
  510	struct hrtimer			timer;
  511	struct timespec64		ts;
  512	enum hrtimer_mode		mode;
  513	u32				flags;
  514};
  515
  516struct io_accept {
  517	struct file			*file;
  518	struct sockaddr __user		*addr;
  519	int __user			*addr_len;
  520	int				flags;
  521	u32				file_slot;
  522	unsigned long			nofile;
  523};
  524
  525struct io_sync {
  526	struct file			*file;
  527	loff_t				len;
  528	loff_t				off;
  529	int				flags;
  530	int				mode;
  531};
  532
  533struct io_cancel {
  534	struct file			*file;
  535	u64				addr;
  536};
  537
  538struct io_timeout {
  539	struct file			*file;
  540	u32				off;
  541	u32				target_seq;
  542	struct list_head		list;
  543	/* head of the link, used by linked timeouts only */
  544	struct io_kiocb			*head;
  545	/* for linked completions */
  546	struct io_kiocb			*prev;
  547};
  548
  549struct io_timeout_rem {
  550	struct file			*file;
  551	u64				addr;
  552
  553	/* timeout update */
  554	struct timespec64		ts;
  555	u32				flags;
  556	bool				ltimeout;
  557};
  558
  559struct io_rw {
  560	/* NOTE: kiocb has the file as the first member, so don't do it here */
  561	struct kiocb			kiocb;
  562	u64				addr;
  563	u64				len;
  564};
  565
  566struct io_connect {
  567	struct file			*file;
  568	struct sockaddr __user		*addr;
  569	int				addr_len;
  570};
  571
  572struct io_sr_msg {
  573	struct file			*file;
  574	union {
  575		struct compat_msghdr __user	*umsg_compat;
  576		struct user_msghdr __user	*umsg;
  577		void __user			*buf;
  578	};
  579	int				msg_flags;
  580	int				bgid;
  581	size_t				len;
  582};
  583
  584struct io_open {
  585	struct file			*file;
  586	int				dfd;
  587	u32				file_slot;
  588	struct filename			*filename;
  589	struct open_how			how;
  590	unsigned long			nofile;
  591};
  592
  593struct io_rsrc_update {
  594	struct file			*file;
  595	u64				arg;
  596	u32				nr_args;
  597	u32				offset;
  598};
  599
  600struct io_fadvise {
  601	struct file			*file;
  602	u64				offset;
  603	u32				len;
  604	u32				advice;
  605};
  606
  607struct io_madvise {
  608	struct file			*file;
  609	u64				addr;
  610	u32				len;
  611	u32				advice;
  612};
  613
  614struct io_epoll {
  615	struct file			*file;
  616	int				epfd;
  617	int				op;
  618	int				fd;
  619	struct epoll_event		event;
  620};
  621
  622struct io_splice {
  623	struct file			*file_out;
  624	struct file			*file_in;
  625	loff_t				off_out;
  626	loff_t				off_in;
  627	u64				len;
  628	unsigned int			flags;
  629};
  630
  631struct io_provide_buf {
  632	struct file			*file;
  633	__u64				addr;
  634	__u32				len;
  635	__u32				bgid;
  636	__u16				nbufs;
  637	__u16				bid;
  638};
  639
  640struct io_statx {
  641	struct file			*file;
  642	int				dfd;
  643	unsigned int			mask;
  644	unsigned int			flags;
  645	const char __user		*filename;
  646	struct statx __user		*buffer;
  647};
  648
  /* Shutdown request state; member 'shutdown' of the io_kiocb request union. */
struct io_shutdown {
	struct file			*file;
	int				how;
};
  653
  /* Rename request state; member 'rename' of the io_kiocb request union. */
struct io_rename {
	struct file			*file;
	int				old_dfd;
	int				new_dfd;
	struct filename			*oldpath;
	struct filename			*newpath;
	int				flags;
};
  662
  /* Unlink request state; member 'unlink' of the io_kiocb request union. */
struct io_unlink {
	struct file			*file;
	int				dfd;
	int				flags;
	struct filename			*filename;
};
  669
  670struct io_mkdir {
  671	struct file			*file;
  672	int				dfd;
  673	umode_t				mode;
  674	struct filename			*filename;
  675};
  676
  /* Symlink request state; member 'symlink' of the io_kiocb request union. */
struct io_symlink {
	struct file			*file;
	int				new_dfd;
	struct filename			*oldpath;
	struct filename			*newpath;
};
  683
  /* Hard link request state; member 'hardlink' of the io_kiocb request union. */
struct io_hardlink {
	struct file			*file;
	int				old_dfd;
	int				new_dfd;
	struct filename			*oldpath;
	struct filename			*newpath;
	int				flags;
};
  692
  693struct io_async_connect {
  694	struct sockaddr_storage		address;
  695};
  696
  697struct io_async_msghdr {
  698	struct iovec			fast_iov[UIO_FASTIOV];
  699	/* points to an allocated iov, if NULL we use fast_iov instead */
  700	struct iovec			*free_iov;
  701	struct sockaddr __user		*uaddr;
  702	struct msghdr			msg;
  703	struct sockaddr_storage		addr;
  704};
  705
  706struct io_rw_state {
  707	struct iov_iter			iter;
  708	struct iov_iter_state		iter_state;
  709	struct iovec			fast_iov[UIO_FASTIOV];
  710};
  711
  712struct io_async_rw {
  713	struct io_rw_state		s;
  714	const struct iovec		*free_iovec;
  715	size_t				bytes_done;
  716	struct wait_page_queue		wpq;
  717};
  718
  719enum {
  720	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
  721	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
  722	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
  723	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
  724	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
  725	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
  726	REQ_F_CQE_SKIP_BIT	= IOSQE_CQE_SKIP_SUCCESS_BIT,
  727
  728	/* first byte is taken by user flags, shift it to not overlap */
  729	REQ_F_FAIL_BIT		= 8,
  730	REQ_F_INFLIGHT_BIT,
  731	REQ_F_CUR_POS_BIT,
  732	REQ_F_NOWAIT_BIT,
  733	REQ_F_LINK_TIMEOUT_BIT,
  734	REQ_F_NEED_CLEANUP_BIT,
  735	REQ_F_POLLED_BIT,
  736	REQ_F_BUFFER_SELECTED_BIT,
  737	REQ_F_COMPLETE_INLINE_BIT,
  738	REQ_F_REISSUE_BIT,
  739	REQ_F_CREDS_BIT,
  740	REQ_F_REFCOUNT_BIT,
  741	REQ_F_ARM_LTIMEOUT_BIT,
  742	REQ_F_ASYNC_DATA_BIT,
  743	REQ_F_SKIP_LINK_CQES_BIT,
  744	/* keep async read/write and isreg together and in order */
  745	REQ_F_SUPPORT_NOWAIT_BIT,
  746	REQ_F_ISREG_BIT,
  747
  748	/* not a real bit, just to check we're not overflowing the space */
  749	__REQ_F_LAST_BIT,
  750};
  751
  752enum {
  753	/* ctx owns file */
  754	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
  755	/* drain existing IO first */
  756	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
  757	/* linked sqes */
  758	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
  759	/* doesn't sever on completion < 0 */
  760	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
  761	/* IOSQE_ASYNC */
  762	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
  763	/* IOSQE_BUFFER_SELECT */
  764	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
  765	/* IOSQE_CQE_SKIP_SUCCESS */
  766	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),
  767
  768	/* fail rest of links */
  769	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
  770	/* on inflight list, should be cancelled and waited on exit reliably */
  771	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
  772	/* read/write uses file position */
  773	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
  774	/* must not punt to workers */
  775	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
  776	/* has or had linked timeout */
  777	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
  778	/* needs cleanup */
  779	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
  780	/* already went through poll handler */
  781	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
  782	/* buffer already selected */
  783	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
  784	/* completion is deferred through io_comp_state */
  785	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),
  786	/* caller should reissue async */
  787	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
  788	/* supports async reads/writes */
  789	REQ_F_SUPPORT_NOWAIT	= BIT(REQ_F_SUPPORT_NOWAIT_BIT),
  790	/* regular file */
  791	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
  792	/* has creds assigned */
  793	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
  794	/* skip refcounting if not set */
  795	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
  796	/* there is a linked timeout that has to be armed */
  797	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
  798	/* ->async_data allocated */
  799	REQ_F_ASYNC_DATA	= BIT(REQ_F_ASYNC_DATA_BIT),
  800	/* don't post CQEs while failing linked requests */
  801	REQ_F_SKIP_LINK_CQES	= BIT(REQ_F_SKIP_LINK_CQES_BIT),
  802};
  803
  804struct async_poll {
  805	struct io_poll_iocb	poll;
  806	struct io_poll_iocb	*double_poll;
  807};
  808
  /* Task work callback for a request; stored in io_task_work->func. */
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
  810
  811struct io_task_work {
  812	union {
  813		struct io_wq_work_node	node;
  814		struct llist_node	fallback_node;
  815	};
  816	io_req_tw_func_t		func;
  817};
  818
  /* resource type selectors: files and buffers */
enum {
	IORING_RSRC_FILE		= 0,
	IORING_RSRC_BUFFER		= 1,
};
  823
  824/*
  825 * NOTE! Each of the iocb union members has the file pointer
  826 * as the first entry in their struct definition. So you can
  827 * access the file pointer through any of the sub-structs,
  828 * or directly as just 'ki_filp' in this struct.
  829 */
  830struct io_kiocb {
  831	union {
  832		struct file		*file;
  833		struct io_rw		rw;
  834		struct io_poll_iocb	poll;
  835		struct io_poll_update	poll_update;
  836		struct io_accept	accept;
  837		struct io_sync		sync;
  838		struct io_cancel	cancel;
  839		struct io_timeout	timeout;
  840		struct io_timeout_rem	timeout_rem;
  841		struct io_connect	connect;
  842		struct io_sr_msg	sr_msg;
  843		struct io_open		open;
  844		struct io_close		close;
  845		struct io_rsrc_update	rsrc_update;
  846		struct io_fadvise	fadvise;
  847		struct io_madvise	madvise;
  848		struct io_epoll		epoll;
  849		struct io_splice	splice;
  850		struct io_provide_buf	pbuf;
  851		struct io_statx		statx;
  852		struct io_shutdown	shutdown;
  853		struct io_rename	rename;
  854		struct io_unlink	unlink;
  855		struct io_mkdir		mkdir;
  856		struct io_symlink	symlink;
  857		struct io_hardlink	hardlink;
  858	};
  859
  860	u8				opcode;
  861	/* polled IO has completed */
  862	u8				iopoll_completed;
  863	u16				buf_index;
  864	unsigned int			flags;
  865
  866	u64				user_data;
  867	u32				result;
  868	u32				cflags;
  869
  870	struct io_ring_ctx		*ctx;
  871	struct task_struct		*task;
  872
  873	struct percpu_ref		*fixed_rsrc_refs;
  874	/* store used ubuf, so we can prevent reloading */
  875	struct io_mapped_ubuf		*imu;
  876
  877	/* used by request caches, completion batching and iopoll */
  878	struct io_wq_work_node		comp_list;
  879	atomic_t			refs;
  880	struct io_kiocb			*link;
  881	struct io_task_work		io_task_work;
  882	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
  883	struct hlist_node		hash_node;
  884	/* internal polling, see IORING_FEAT_FAST_POLL */
  885	struct async_poll		*apoll;
  886	/* opcode allocated if it needs to store data for async defer */
  887	void				*async_data;
  888	struct io_wq_work		work;
  889	/* custom credentials, valid IFF REQ_F_CREDS is set */
  890	const struct cred		*creds;
  891	/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
  892	struct io_buffer		*kbuf;
  893	atomic_t			poll_refs;
  894};
  895
  896struct io_tctx_node {
  897	struct list_head	ctx_node;
  898	struct task_struct	*task;
  899	struct io_ring_ctx	*ctx;
  900};
  901
  902struct io_defer_entry {
  903	struct list_head	list;
  904	struct io_kiocb		*req;
  905	u32			seq;
  906};
  907
  /*
 * Static per-opcode properties: a set of one-bit flags plus the size of
 * the async data the opcode needs. Instances live in io_op_defs[].
 */
struct io_op_def {
	/* needs req->file assigned */
	unsigned		needs_file : 1;
	/* should block plug */
	unsigned		plug : 1;
	/* hash wq insertion if file is a regular file */
	unsigned		hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned		unbound_nonreg_file : 1;
	/* set if opcode supports polled "wait" */
	unsigned		pollin : 1;
	unsigned		pollout : 1;
	/* op supports buffer selection */
	unsigned		buffer_select : 1;
	/* do prep async if is going to be punted */
	unsigned		needs_async_setup : 1;
	/* opcode is not supported by this kernel */
	unsigned		not_supported : 1;
	/* skip auditing */
	unsigned		audit_skip : 1;
	/* size of async data needed, if any */
	unsigned short		async_size;
};
  931
  932static const struct io_op_def io_op_defs[] = {
  933	[IORING_OP_NOP] = {},
  934	[IORING_OP_READV] = {
  935		.needs_file		= 1,
  936		.unbound_nonreg_file	= 1,
  937		.pollin			= 1,
  938		.buffer_select		= 1,
  939		.needs_async_setup	= 1,
  940		.plug			= 1,
  941		.audit_skip		= 1,
  942		.async_size		= sizeof(struct io_async_rw),
  943	},
  944	[IORING_OP_WRITEV] = {
  945		.needs_file		= 1,
  946		.hash_reg_file		= 1,
  947		.unbound_nonreg_file	= 1,
  948		.pollout		= 1,
  949		.needs_async_setup	= 1,
  950		.plug			= 1,
  951		.audit_skip		= 1,
  952		.async_size		= sizeof(struct io_async_rw),
  953	},
  954	[IORING_OP_FSYNC] = {
  955		.needs_file		= 1,
  956		.audit_skip		= 1,
  957	},
  958	[IORING_OP_READ_FIXED] = {
  959		.needs_file		= 1,
  960		.unbound_nonreg_file	= 1,
  961		.pollin			= 1,
  962		.plug			= 1,
  963		.audit_skip		= 1,
  964		.async_size		= sizeof(struct io_async_rw),
  965	},
  966	[IORING_OP_WRITE_FIXED] = {
  967		.needs_file		= 1,
  968		.hash_reg_file		= 1,
  969		.unbound_nonreg_file	= 1,
  970		.pollout		= 1,
  971		.plug			= 1,
  972		.audit_skip		= 1,
  973		.async_size		= sizeof(struct io_async_rw),
  974	},
  975	[IORING_OP_POLL_ADD] = {
  976		.needs_file		= 1,
  977		.unbound_nonreg_file	= 1,
  978		.audit_skip		= 1,
  979	},
  980	[IORING_OP_POLL_REMOVE] = {
  981		.audit_skip		= 1,
  982	},
  983	[IORING_OP_SYNC_FILE_RANGE] = {
  984		.needs_file		= 1,
  985		.audit_skip		= 1,
  986	},
  987	[IORING_OP_SENDMSG] = {
  988		.needs_file		= 1,
  989		.unbound_nonreg_file	= 1,
  990		.pollout		= 1,
  991		.needs_async_setup	= 1,
  992		.async_size		= sizeof(struct io_async_msghdr),
  993	},
  994	[IORING_OP_RECVMSG] = {
  995		.needs_file		= 1,
  996		.unbound_nonreg_file	= 1,
  997		.pollin			= 1,
  998		.buffer_select		= 1,
  999		.needs_async_setup	= 1,
 1000		.async_size		= sizeof(struct io_async_msghdr),
 1001	},
 1002	[IORING_OP_TIMEOUT] = {
 1003		.audit_skip		= 1,
 1004		.async_size		= sizeof(struct io_timeout_data),
 1005	},
 1006	[IORING_OP_TIMEOUT_REMOVE] = {
 1007		/* used by timeout updates' prep() */
 1008		.audit_skip		= 1,
 1009	},
 1010	[IORING_OP_ACCEPT] = {
 1011		.needs_file		= 1,
 1012		.unbound_nonreg_file	= 1,
 1013		.pollin			= 1,
 1014	},
 1015	[IORING_OP_ASYNC_CANCEL] = {
 1016		.audit_skip		= 1,
 1017	},
 1018	[IORING_OP_LINK_TIMEOUT] = {
 1019		.audit_skip		= 1,
 1020		.async_size		= sizeof(struct io_timeout_data),
 1021	},
 1022	[IORING_OP_CONNECT] = {
 1023		.needs_file		= 1,
 1024		.unbound_nonreg_file	= 1,
 1025		.pollout		= 1,
 1026		.needs_async_setup	= 1,
 1027		.async_size		= sizeof(struct io_async_connect),
 1028	},
 1029	[IORING_OP_FALLOCATE] = {
 1030		.needs_file		= 1,
 1031	},
 1032	[IORING_OP_OPENAT] = {},
 1033	[IORING_OP_CLOSE] = {},
 1034	[IORING_OP_FILES_UPDATE] = {
 1035		.audit_skip		= 1,
 1036	},
 1037	[IORING_OP_STATX] = {
 1038		.audit_skip		= 1,
 1039	},
 1040	[IORING_OP_READ] = {
 1041		.needs_file		= 1,
 1042		.unbound_nonreg_file	= 1,
 1043		.pollin			= 1,
 1044		.buffer_select		= 1,
 1045		.plug			= 1,
 1046		.audit_skip		= 1,
 1047		.async_size		= sizeof(struct io_async_rw),
 1048	},
 1049	[IORING_OP_WRITE] = {
 1050		.needs_file		= 1,
 1051		.hash_reg_file		= 1,
 1052		.unbound_nonreg_file	= 1,
 1053		.pollout		= 1,
 1054		.plug			= 1,
 1055		.audit_skip		= 1,
 1056		.async_size		= sizeof(struct io_async_rw),
 1057	},
 1058	[IORING_OP_FADVISE] = {
 1059		.needs_file		= 1,
 1060		.audit_skip		= 1,
 1061	},
 1062	[IORING_OP_MADVISE] = {},
 1063	[IORING_OP_SEND] = {
 1064		.needs_file		= 1,
 1065		.unbound_nonreg_file	= 1,
 1066		.pollout		= 1,
 1067		.audit_skip		= 1,
 1068	},
 1069	[IORING_OP_RECV] = {
 1070		.needs_file		= 1,
 1071		.unbound_nonreg_file	= 1,
 1072		.pollin			= 1,
 1073		.buffer_select		= 1,
 1074		.audit_skip		= 1,
 1075	},
 1076	[IORING_OP_OPENAT2] = {
 1077	},
 1078	[IORING_OP_EPOLL_CTL] = {
 1079		.unbound_nonreg_file	= 1,
 1080		.audit_skip		= 1,
 1081	},
 1082	[IORING_OP_SPLICE] = {
 1083		.needs_file		= 1,
 1084		.hash_reg_file		= 1,
 1085		.unbound_nonreg_file	= 1,
 1086		.audit_skip		= 1,
 1087	},
 1088	[IORING_OP_PROVIDE_BUFFERS] = {
 1089		.audit_skip		= 1,
 1090	},
 1091	[IORING_OP_REMOVE_BUFFERS] = {
 1092		.audit_skip		= 1,
 1093	},
 1094	[IORING_OP_TEE] = {
 1095		.needs_file		= 1,
 1096		.hash_reg_file		= 1,
 1097		.unbound_nonreg_file	= 1,
 1098		.audit_skip		= 1,
 1099	},
 1100	[IORING_OP_SHUTDOWN] = {
 1101		.needs_file		= 1,
 1102	},
 1103	[IORING_OP_RENAMEAT] = {},
 1104	[IORING_OP_UNLINKAT] = {},
 1105	[IORING_OP_MKDIRAT] = {},
 1106	[IORING_OP_SYMLINKAT] = {},
 1107	[IORING_OP_LINKAT] = {},
 1108};
 1109
/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)

/*
 * Forward declarations of static helpers that are referenced before their
 * definitions appear in this file.
 */
static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);

static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);

static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_ring_ctx *ctx,
				struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static void io_req_task_queue(struct io_kiocb *req);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);

static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
				 unsigned int issue_flags, u32 slot_index);
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);

static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);

/* slab cache that request objects are allocated from, see __io_alloc_req_refill() */
static struct kmem_cache *req_cachep;

/* f_op of io_uring files; compared against file->f_op to recognise them */
static const struct file_operations io_uring_fops;
 1148
 1149struct sock *io_uring_get_socket(struct file *file)
 1150{
 1151#if defined(CONFIG_UNIX)
 1152	if (file->f_op == &io_uring_fops) {
 1153		struct io_ring_ctx *ctx = file->private_data;
 1154
 1155		return ctx->ring_sock->sk;
 1156	}
 1157#endif
 1158	return NULL;
 1159}
 1160EXPORT_SYMBOL(io_uring_get_socket);
 1161
 1162static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
 1163{
 1164	if (!*locked) {
 1165		mutex_lock(&ctx->uring_lock);
 1166		*locked = true;
 1167	}
 1168}
 1169
 1170#define io_for_each_link(pos, head) \
 1171	for (pos = (head); pos; pos = pos->link)
 1172
 1173/*
 1174 * Shamelessly stolen from the mm implementation of page reference checking,
 1175 * see commit f958d7b528b1 for details.
 1176 */
 1177#define req_ref_zero_or_close_to_overflow(req)	\
 1178	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
 1179
 1180static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
 1181{
 1182	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
 1183	return atomic_inc_not_zero(&req->refs);
 1184}
 1185
 1186static inline bool req_ref_put_and_test(struct io_kiocb *req)
 1187{
 1188	if (likely(!(req->flags & REQ_F_REFCOUNT)))
 1189		return true;
 1190
 1191	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
 1192	return atomic_dec_and_test(&req->refs);
 1193}
 1194
 1195static inline void req_ref_get(struct io_kiocb *req)
 1196{
 1197	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
 1198	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
 1199	atomic_inc(&req->refs);
 1200}
 1201
 1202static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
 1203{
 1204	if (!wq_list_empty(&ctx->submit_state.compl_reqs))
 1205		__io_submit_flush_completions(ctx);
 1206}
 1207
 1208static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
 1209{
 1210	if (!(req->flags & REQ_F_REFCOUNT)) {
 1211		req->flags |= REQ_F_REFCOUNT;
 1212		atomic_set(&req->refs, nr);
 1213	}
 1214}
 1215
 1216static inline void io_req_set_refcount(struct io_kiocb *req)
 1217{
 1218	__io_req_set_refcount(req, 1);
 1219}
 1220
/* number of rsrc node references io_rsrc_refs_refill() grabs in one go */
#define IO_RSRC_REF_BATCH	100
 1222
 1223static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
 1224					  struct io_ring_ctx *ctx)
 1225	__must_hold(&ctx->uring_lock)
 1226{
 1227	struct percpu_ref *ref = req->fixed_rsrc_refs;
 1228
 1229	if (ref) {
 1230		if (ref == &ctx->rsrc_node->refs)
 1231			ctx->rsrc_cached_refs++;
 1232		else
 1233			percpu_ref_put(ref);
 1234	}
 1235}
 1236
 1237static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
 1238{
 1239	if (req->fixed_rsrc_refs)
 1240		percpu_ref_put(req->fixed_rsrc_refs);
 1241}
 1242
 1243static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
 1244	__must_hold(&ctx->uring_lock)
 1245{
 1246	if (ctx->rsrc_cached_refs) {
 1247		percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
 1248		ctx->rsrc_cached_refs = 0;
 1249	}
 1250}
 1251
 1252static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
 1253	__must_hold(&ctx->uring_lock)
 1254{
 1255	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
 1256	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
 1257}
 1258
 1259static inline void io_req_set_rsrc_node(struct io_kiocb *req,
 1260					struct io_ring_ctx *ctx)
 1261{
 1262	if (!req->fixed_rsrc_refs) {
 1263		req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
 1264		ctx->rsrc_cached_refs--;
 1265		if (unlikely(ctx->rsrc_cached_refs < 0))
 1266			io_rsrc_refs_refill(ctx);
 1267	}
 1268}
 1269
 1270static unsigned int __io_put_kbuf(struct io_kiocb *req)
 1271{
 1272	struct io_buffer *kbuf = req->kbuf;
 1273	unsigned int cflags;
 1274
 1275	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
 1276	cflags |= IORING_CQE_F_BUFFER;
 1277	req->flags &= ~REQ_F_BUFFER_SELECTED;
 1278	kfree(kbuf);
 1279	req->kbuf = NULL;
 1280	return cflags;
 1281}
 1282
 1283static inline unsigned int io_put_kbuf(struct io_kiocb *req)
 1284{
 1285	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
 1286		return 0;
 1287	return __io_put_kbuf(req);
 1288}
 1289
 1290static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
 1291{
 1292	bool got = percpu_ref_tryget(ref);
 1293
 1294	/* already at zero, wait for ->release() */
 1295	if (!got)
 1296		wait_for_completion(compl);
 1297	percpu_ref_resurrect(ref);
 1298	if (got)
 1299		percpu_ref_put(ref);
 1300}
 1301
 1302static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
 1303			  bool cancel_all)
 1304	__must_hold(&req->ctx->timeout_lock)
 1305{
 1306	struct io_kiocb *req;
 1307
 1308	if (task && head->task != task)
 1309		return false;
 1310	if (cancel_all)
 1311		return true;
 1312
 1313	io_for_each_link(req, head) {
 1314		if (req->flags & REQ_F_INFLIGHT)
 1315			return true;
 1316	}
 1317	return false;
 1318}
 1319
 1320static bool io_match_linked(struct io_kiocb *head)
 1321{
 1322	struct io_kiocb *req;
 1323
 1324	io_for_each_link(req, head) {
 1325		if (req->flags & REQ_F_INFLIGHT)
 1326			return true;
 1327	}
 1328	return false;
 1329}
 1330
 1331/*
 1332 * As io_match_task() but protected against racing with linked timeouts.
 1333 * User must not hold timeout_lock.
 1334 */
 1335static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
 1336			       bool cancel_all)
 1337{
 1338	bool matched;
 1339
 1340	if (task && head->task != task)
 1341		return false;
 1342	if (cancel_all)
 1343		return true;
 1344
 1345	if (head->flags & REQ_F_LINK_TIMEOUT) {
 1346		struct io_ring_ctx *ctx = head->ctx;
 1347
 1348		/* protect against races with linked timeouts */
 1349		spin_lock_irq(&ctx->timeout_lock);
 1350		matched = io_match_linked(head);
 1351		spin_unlock_irq(&ctx->timeout_lock);
 1352	} else {
 1353		matched = io_match_linked(head);
 1354	}
 1355	return matched;
 1356}
 1357
 1358static inline bool req_has_async_data(struct io_kiocb *req)
 1359{
 1360	return req->flags & REQ_F_ASYNC_DATA;
 1361}
 1362
 1363static inline void req_set_fail(struct io_kiocb *req)
 1364{
 1365	req->flags |= REQ_F_FAIL;
 1366	if (req->flags & REQ_F_CQE_SKIP) {
 1367		req->flags &= ~REQ_F_CQE_SKIP;
 1368		req->flags |= REQ_F_SKIP_LINK_CQES;
 1369	}
 1370}
 1371
 1372static inline void req_fail_link_node(struct io_kiocb *req, int res)
 1373{
 1374	req_set_fail(req);
 1375	req->result = res;
 1376}
 1377
 1378static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
 1379{
 1380	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 1381
 1382	complete(&ctx->ref_comp);
 1383}
 1384
 1385static inline bool io_is_timeout_noseq(struct io_kiocb *req)
 1386{
 1387	return !req->timeout.off;
 1388}
 1389
 1390static __cold void io_fallback_req_func(struct work_struct *work)
 1391{
 1392	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
 1393						fallback_work.work);
 1394	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
 1395	struct io_kiocb *req, *tmp;
 1396	bool locked = false;
 1397
 1398	percpu_ref_get(&ctx->refs);
 1399	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
 1400		req->io_task_work.func(req, &locked);
 1401
 1402	if (locked) {
 1403		io_submit_flush_completions(ctx);
 1404		mutex_unlock(&ctx->uring_lock);
 1405	}
 1406	percpu_ref_put(&ctx->refs);
 1407}
 1408
 1409static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 1410{
 1411	struct io_ring_ctx *ctx;
 1412	int hash_bits;
 1413
 1414	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 1415	if (!ctx)
 1416		return NULL;
 1417
 1418	/*
 1419	 * Use 5 bits less than the max cq entries, that should give us around
 1420	 * 32 entries per hash list if totally full and uniformly spread.
 1421	 */
 1422	hash_bits = ilog2(p->cq_entries);
 1423	hash_bits -= 5;
 1424	if (hash_bits <= 0)
 1425		hash_bits = 1;
 1426	ctx->cancel_hash_bits = hash_bits;
 1427	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
 1428					GFP_KERNEL);
 1429	if (!ctx->cancel_hash)
 1430		goto err;
 1431	__hash_init(ctx->cancel_hash, 1U << hash_bits);
 1432
 1433	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
 1434	if (!ctx->dummy_ubuf)
 1435		goto err;
 1436	/* set invalid range, so io_import_fixed() fails meeting it */
 1437	ctx->dummy_ubuf->ubuf = -1UL;
 1438
 1439	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 1440			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 1441		goto err;
 1442
 1443	ctx->flags = p->flags;
 1444	init_waitqueue_head(&ctx->sqo_sq_wait);
 1445	INIT_LIST_HEAD(&ctx->sqd_list);
 1446	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 1447	init_completion(&ctx->ref_comp);
 1448	xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
 1449	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 1450	mutex_init(&ctx->uring_lock);
 1451	init_waitqueue_head(&ctx->cq_wait);
 1452	spin_lock_init(&ctx->completion_lock);
 1453	spin_lock_init(&ctx->timeout_lock);
 1454	INIT_WQ_LIST(&ctx->iopoll_list);
 1455	INIT_LIST_HEAD(&ctx->defer_list);
 1456	INIT_LIST_HEAD(&ctx->timeout_list);
 1457	INIT_LIST_HEAD(&ctx->ltimeout_list);
 1458	spin_lock_init(&ctx->rsrc_ref_lock);
 1459	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 1460	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
 1461	init_llist_head(&ctx->rsrc_put_llist);
 1462	INIT_LIST_HEAD(&ctx->tctx_list);
 1463	ctx->submit_state.free_list.next = NULL;
 1464	INIT_WQ_LIST(&ctx->locked_free_list);
 1465	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
 1466	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
 1467	return ctx;
 1468err:
 1469	kfree(ctx->dummy_ubuf);
 1470	kfree(ctx->cancel_hash);
 1471	kfree(ctx);
 1472	return NULL;
 1473}
 1474
 1475static void io_account_cq_overflow(struct io_ring_ctx *ctx)
 1476{
 1477	struct io_rings *r = ctx->rings;
 1478
 1479	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
 1480	ctx->cq_extra--;
 1481}
 1482
 1483static bool req_need_defer(struct io_kiocb *req, u32 seq)
 1484{
 1485	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
 1486		struct io_ring_ctx *ctx = req->ctx;
 1487
 1488		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
 1489	}
 1490
 1491	return false;
 1492}
 1493
/*
 * Two low-order flag bits and the mask that strips both of them.
 * NOTE(review): presumably these tag entries of the fixed file table (stored
 * in pointer low bits) — the users are outside this part of the file, confirm.
 */
#define FFS_NOWAIT		0x1UL
#define FFS_ISREG		0x2UL
#define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG)
 1497
 1498static inline bool io_req_ffs_set(struct io_kiocb *req)
 1499{
 1500	return req->flags & REQ_F_FIXED_FILE;
 1501}
 1502
 1503static inline void io_req_track_inflight(struct io_kiocb *req)
 1504{
 1505	if (!(req->flags & REQ_F_INFLIGHT)) {
 1506		req->flags |= REQ_F_INFLIGHT;
 1507		atomic_inc(&current->io_uring->inflight_tracked);
 1508	}
 1509}
 1510
 1511static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
 1512{
 1513	if (WARN_ON_ONCE(!req->link))
 1514		return NULL;
 1515
 1516	req->flags &= ~REQ_F_ARM_LTIMEOUT;
 1517	req->flags |= REQ_F_LINK_TIMEOUT;
 1518
 1519	/* linked timeouts should have two refs once prep'ed */
 1520	io_req_set_refcount(req);
 1521	__io_req_set_refcount(req->link, 2);
 1522	return req->link;
 1523}
 1524
 1525static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 1526{
 1527	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
 1528		return NULL;
 1529	return __io_prep_linked_timeout(req);
 1530}
 1531
 1532static void io_prep_async_work(struct io_kiocb *req)
 1533{
 1534	const struct io_op_def *def = &io_op_defs[req->opcode];
 1535	struct io_ring_ctx *ctx = req->ctx;
 1536
 1537	if (!(req->flags & REQ_F_CREDS)) {
 1538		req->flags |= REQ_F_CREDS;
 1539		req->creds = get_current_cred();
 1540	}
 1541
 1542	req->work.list.next = NULL;
 1543	req->work.flags = 0;
 1544	if (req->flags & REQ_F_FORCE_ASYNC)
 1545		req->work.flags |= IO_WQ_WORK_CONCURRENT;
 1546
 1547	if (req->flags & REQ_F_ISREG) {
 1548		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
 1549			io_wq_hash_work(&req->work, file_inode(req->file));
 1550	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
 1551		if (def->unbound_nonreg_file)
 1552			req->work.flags |= IO_WQ_WORK_UNBOUND;
 1553	}
 1554
 1555	switch (req->opcode) {
 1556	case IORING_OP_SPLICE:
 1557	case IORING_OP_TEE:
 1558		if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
 1559			req->work.flags |= IO_WQ_WORK_UNBOUND;
 1560		break;
 1561	}
 1562}
 1563
 1564static void io_prep_async_link(struct io_kiocb *req)
 1565{
 1566	struct io_kiocb *cur;
 1567
 1568	if (req->flags & REQ_F_LINK_TIMEOUT) {
 1569		struct io_ring_ctx *ctx = req->ctx;
 1570
 1571		spin_lock_irq(&ctx->timeout_lock);
 1572		io_for_each_link(cur, req)
 1573			io_prep_async_work(cur);
 1574		spin_unlock_irq(&ctx->timeout_lock);
 1575	} else {
 1576		io_for_each_link(cur, req)
 1577			io_prep_async_work(cur);
 1578	}
 1579}
 1580
 1581static inline void io_req_add_compl_list(struct io_kiocb *req)
 1582{
 1583	struct io_ring_ctx *ctx = req->ctx;
 1584	struct io_submit_state *state = &ctx->submit_state;
 1585
 1586	if (!(req->flags & REQ_F_CQE_SKIP))
 1587		ctx->submit_state.flush_cqes = true;
 1588	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
 1589}
 1590
 1591static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
 1592{
 1593	struct io_ring_ctx *ctx = req->ctx;
 1594	struct io_kiocb *link = io_prep_linked_timeout(req);
 1595	struct io_uring_task *tctx = req->task->io_uring;
 1596
 1597	BUG_ON(!tctx);
 1598	BUG_ON(!tctx->io_wq);
 1599
 1600	/* init ->work of the whole link before punting */
 1601	io_prep_async_link(req);
 1602
 1603	/*
 1604	 * Not expected to happen, but if we do have a bug where this _can_
 1605	 * happen, catch it here and ensure the request is marked as
 1606	 * canceled. That will make io-wq go through the usual work cancel
 1607	 * procedure rather than attempt to run this request (or create a new
 1608	 * worker for it).
 1609	 */
 1610	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
 1611		req->work.flags |= IO_WQ_WORK_CANCEL;
 1612
 1613	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
 1614					&req->work, req->flags);
 1615	io_wq_enqueue(tctx->io_wq, &req->work);
 1616	if (link)
 1617		io_queue_linked_timeout(link);
 1618}
 1619
 1620static void io_kill_timeout(struct io_kiocb *req, int status)
 1621	__must_hold(&req->ctx->completion_lock)
 1622	__must_hold(&req->ctx->timeout_lock)
 1623{
 1624	struct io_timeout_data *io = req->async_data;
 1625
 1626	if (hrtimer_try_to_cancel(&io->timer) != -1) {
 1627		if (status)
 1628			req_set_fail(req);
 1629		atomic_set(&req->ctx->cq_timeouts,
 1630			atomic_read(&req->ctx->cq_timeouts) + 1);
 1631		list_del_init(&req->timeout.list);
 1632		io_fill_cqe_req(req, status, 0);
 1633		io_put_req_deferred(req);
 1634	}
 1635}
 1636
 1637static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 1638{
 1639	while (!list_empty(&ctx->defer_list)) {
 1640		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
 1641						struct io_defer_entry, list);
 1642
 1643		if (req_need_defer(de->req, de->seq))
 1644			break;
 1645		list_del_init(&de->list);
 1646		io_req_task_queue(de->req);
 1647		kfree(de);
 1648	}
 1649}
 1650
/*
 * Complete the sequence-based timeouts whose target sequence has been
 * reached. ctx->timeout_list is consumed from the front; the walk stops at
 * the first entry without a sequence (io_is_timeout_noseq()) or the first one
 * that still needs more events. The sequence used for this pass is saved in
 * ctx->cq_last_tm_flush for the next one. Takes ctx->timeout_lock itself.
 */
static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
	__must_hold(&ctx->completion_lock)
{
	/* cached CQ tail minus the cq_timeouts counter */
	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);

	spin_lock_irq(&ctx->timeout_lock);
	while (!list_empty(&ctx->timeout_list)) {
		u32 events_needed, events_got;
		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
						struct io_kiocb, timeout.list);

		if (io_is_timeout_noseq(req))
			break;

		/*
		 * Since seq can easily wrap around over time, subtract
		 * the last seq at which timeouts were flushed before comparing.
		 * Assuming not more than 2^31-1 events have happened since,
		 * these subtractions won't have wrapped, so we can check if
		 * target is in [last_seq, current_seq] by comparing the two.
		 */
		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
		events_got = seq - ctx->cq_last_tm_flush;
		if (events_got < events_needed)
			break;

		/*
		 * Unlink here so the loop always advances: io_kill_timeout()
		 * leaves the request alone when its timer cannot be cancelled.
		 */
		list_del_init(&req->timeout.list);
		io_kill_timeout(req, 0);
	}
	ctx->cq_last_tm_flush = seq;
	spin_unlock_irq(&ctx->timeout_lock);
}
 1683
 1684static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 1685{
 1686	if (ctx->off_timeout_used)
 1687		io_flush_timeouts(ctx);
 1688	if (ctx->drain_active)
 1689		io_queue_deferred(ctx);
 1690}
 1691
 1692static inline void io_commit_cqring(struct io_ring_ctx *ctx)
 1693{
 1694	if (unlikely(ctx->off_timeout_used || ctx->drain_active))
 1695		__io_commit_cqring_flush(ctx);
 1696	/* order cqe stores with ring update */
 1697	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
 1698}
 1699
 1700static inline bool io_sqring_full(struct io_ring_ctx *ctx)
 1701{
 1702	struct io_rings *r = ctx->rings;
 1703
 1704	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
 1705}
 1706
 1707static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 1708{
 1709	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
 1710}
 1711
 1712static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 1713{
 1714	struct io_rings *rings = ctx->rings;
 1715	unsigned tail, mask = ctx->cq_entries - 1;
 1716
 1717	/*
 1718	 * writes to the cq entry need to come after reading head; the
 1719	 * control dependency is enough as we're using WRITE_ONCE to
 1720	 * fill the cq entry
 1721	 */
 1722	if (__io_cqring_events(ctx) == ctx->cq_entries)
 1723		return NULL;
 1724
 1725	tail = ctx->cached_cq_tail++;
 1726	return &rings->cqes[tail & mask];
 1727}
 1728
 1729static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
 1730{
 1731	if (likely(!ctx->cq_ev_fd))
 1732		return false;
 1733	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
 1734		return false;
 1735	return !ctx->eventfd_async || io_wq_current_is_worker();
 1736}
 1737
 1738/*
 1739 * This should only get called when at least one event has been posted.
 1740 * Some applications rely on the eventfd notification count only changing
 1741 * IFF a new CQE has been added to the CQ ring. There's no depedency on
 1742 * 1:1 relationship between how many times this function is called (and
 1743 * hence the eventfd count) and number of CQEs posted to the CQ ring.
 1744 */
 1745static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 1746{
 1747	/*
 1748	 * wake_up_all() may seem excessive, but io_wake_function() and
 1749	 * io_should_wake() handle the termination of the loop and only
 1750	 * wake as many waiters as we need to.
 1751	 */
 1752	if (wq_has_sleeper(&ctx->cq_wait))
 1753		wake_up_all(&ctx->cq_wait);
 1754	if (io_should_trigger_evfd(ctx))
 1755		eventfd_signal(ctx->cq_ev_fd, 1);
 1756}
 1757
 1758static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
 1759{
 1760	/* see waitqueue_active() comment */
 1761	smp_mb();
 1762
 1763	if (ctx->flags & IORING_SETUP_SQPOLL) {
 1764		if (waitqueue_active(&ctx->cq_wait))
 1765			wake_up_all(&ctx->cq_wait);
 1766	}
 1767	if (io_should_trigger_evfd(ctx))
 1768		eventfd_signal(ctx->cq_ev_fd, 1);
 1769}
 1770
/*
 * Move backlogged completions from ctx->cq_overflow_list into the CQ ring.
 *
 * @force: when false, flushing stops as soon as the ring has no free slot;
 *         when true, entries that do not fit are dropped and accounted via
 *         io_account_cq_overflow().
 *
 * Returns true if there are no backlogged entries after the flush
 */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
	bool all_flushed, posted;

	/* ring is full and we may not drop anything: nothing can be done */
	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
		return false;

	posted = false;
	spin_lock(&ctx->completion_lock);
	while (!list_empty(&ctx->cq_overflow_list)) {
		struct io_uring_cqe *cqe = io_get_cqe(ctx);
		struct io_overflow_cqe *ocqe;

		if (!cqe && !force)
			break;
		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
		if (cqe)
			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
		else
			io_account_cq_overflow(ctx);

		posted = true;
		list_del(&ocqe->list);
		kfree(ocqe);
	}

	all_flushed = list_empty(&ctx->cq_overflow_list);
	if (all_flushed) {
		/* backlog is gone: clear the overflow indications again */
		clear_bit(0, &ctx->check_cq_overflow);
		WRITE_ONCE(ctx->rings->sq_flags,
			   ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
	}

	if (posted)
		io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	/* notify only after completion_lock has been dropped */
	if (posted)
		io_cqring_ev_posted(ctx);
	return all_flushed;
}
 1813
 1814static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 1815{
 1816	bool ret = true;
 1817
 1818	if (test_bit(0, &ctx->check_cq_overflow)) {
 1819		/* iopoll syncs against uring_lock, not completion_lock */
 1820		if (ctx->flags & IORING_SETUP_IOPOLL)
 1821			mutex_lock(&ctx->uring_lock);
 1822		ret = __io_cqring_overflow_flush(ctx, false);
 1823		if (ctx->flags & IORING_SETUP_IOPOLL)
 1824			mutex_unlock(&ctx->uring_lock);
 1825	}
 1826
 1827	return ret;
 1828}
 1829
 1830/* must to be called somewhat shortly after putting a request */
 1831static inline void io_put_task(struct task_struct *task, int nr)
 1832{
 1833	struct io_uring_task *tctx = task->io_uring;
 1834
 1835	if (likely(task == current)) {
 1836		tctx->cached_refs += nr;
 1837	} else {
 1838		percpu_counter_sub(&tctx->inflight, nr);
 1839		if (unlikely(atomic_read(&tctx->in_idle)))
 1840			wake_up(&tctx->wait);
 1841		put_task_struct_many(task, nr);
 1842	}
 1843}
 1844
 1845static void io_task_refs_refill(struct io_uring_task *tctx)
 1846{
 1847	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
 1848
 1849	percpu_counter_add(&tctx->inflight, refill);
 1850	refcount_add(refill, &current->usage);
 1851	tctx->cached_refs += refill;
 1852}
 1853
 1854static inline void io_get_task_refs(int nr)
 1855{
 1856	struct io_uring_task *tctx = current->io_uring;
 1857
 1858	tctx->cached_refs -= nr;
 1859	if (unlikely(tctx->cached_refs < 0))
 1860		io_task_refs_refill(tctx);
 1861}
 1862
 1863static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
 1864{
 1865	struct io_uring_task *tctx = task->io_uring;
 1866	unsigned int refs = tctx->cached_refs;
 1867
 1868	if (refs) {
 1869		tctx->cached_refs = 0;
 1870		percpu_counter_sub(&tctx->inflight, refs);
 1871		put_task_struct_many(task, refs);
 1872	}
 1873}
 1874
 1875static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 1876				     s32 res, u32 cflags)
 1877{
 1878	struct io_overflow_cqe *ocqe;
 1879
 1880	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
 1881	if (!ocqe) {
 1882		/*
 1883		 * If we're in ring overflow flush mode, or in task cancel mode,
 1884		 * or cannot allocate an overflow entry, then we need to drop it
 1885		 * on the floor.
 1886		 */
 1887		io_account_cq_overflow(ctx);
 1888		return false;
 1889	}
 1890	if (list_empty(&ctx->cq_overflow_list)) {
 1891		set_bit(0, &ctx->check_cq_overflow);
 1892		WRITE_ONCE(ctx->rings->sq_flags,
 1893			   ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
 1894
 1895	}
 1896	ocqe->cqe.user_data = user_data;
 1897	ocqe->cqe.res = res;
 1898	ocqe->cqe.flags = cflags;
 1899	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
 1900	return true;
 1901}
 1902
 1903static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
 1904				 s32 res, u32 cflags)
 1905{
 1906	struct io_uring_cqe *cqe;
 1907
 1908	trace_io_uring_complete(ctx, user_data, res, cflags);
 1909
 1910	/*
 1911	 * If we can't get a cq entry, userspace overflowed the
 1912	 * submission (by quite a lot). Increment the overflow count in
 1913	 * the ring.
 1914	 */
 1915	cqe = io_get_cqe(ctx);
 1916	if (likely(cqe)) {
 1917		WRITE_ONCE(cqe->user_data, user_data);
 1918		WRITE_ONCE(cqe->res, res);
 1919		WRITE_ONCE(cqe->flags, cflags);
 1920		return true;
 1921	}
 1922	return io_cqring_event_overflow(ctx, user_data, res, cflags);
 1923}
 1924
 1925static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
 1926{
 1927	if (!(req->flags & REQ_F_CQE_SKIP))
 1928		__io_fill_cqe(req->ctx, req->user_data, res, cflags);
 1929}
 1930
 1931static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
 1932				     s32 res, u32 cflags)
 1933{
 1934	ctx->cq_extra++;
 1935	return __io_fill_cqe(ctx, user_data, res, cflags);
 1936}
 1937
/*
 * Post the completion of @req and drop one reference to it.
 *
 * The CQE is filled unless REQ_F_CQE_SKIP is set. If this was the last
 * reference, the next request of a link chain is queued, the request's
 * resources are released and it is recycled onto ctx->locked_free_list.
 * io_req_complete_post() calls this with ctx->completion_lock held.
 */
static void __io_req_complete_post(struct io_kiocb *req, s32 res,
				   u32 cflags)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (!(req->flags & REQ_F_CQE_SKIP))
		__io_fill_cqe(ctx, req->user_data, res, cflags);
	/*
	 * If we're the last reference to this request, add to our locked
	 * free_list cache.
	 */
	if (req_ref_put_and_test(req)) {
		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
			if (req->flags & IO_DISARM_MASK)
				io_disarm_next(req);
			/* hand the rest of the chain over to task work */
			if (req->link) {
				io_req_task_queue(req->link);
				req->link = NULL;
			}
		}
		io_req_put_rsrc(req, ctx);
		io_dismantle_req(req);
		io_put_task(req->task, 1);
		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
		ctx->locked_free_nr++;
	}
}
 1965
 1966static void io_req_complete_post(struct io_kiocb *req, s32 res,
 1967				 u32 cflags)
 1968{
 1969	struct io_ring_ctx *ctx = req->ctx;
 1970
 1971	spin_lock(&ctx->completion_lock);
 1972	__io_req_complete_post(req, res, cflags);
 1973	io_commit_cqring(ctx);
 1974	spin_unlock(&ctx->completion_lock);
 1975	io_cqring_ev_posted(ctx);
 1976}
 1977
 1978static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
 1979					 u32 cflags)
 1980{
 1981	req->result = res;
 1982	req->cflags = cflags;
 1983	req->flags |= REQ_F_COMPLETE_INLINE;
 1984}
 1985
 1986static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
 1987				     s32 res, u32 cflags)
 1988{
 1989	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
 1990		io_req_complete_state(req, res, cflags);
 1991	else
 1992		io_req_complete_post(req, res, cflags);
 1993}
 1994
 1995static inline void io_req_complete(struct io_kiocb *req, s32 res)
 1996{
 1997	__io_req_complete(req, 0, res, 0);
 1998}
 1999
 2000static void io_req_complete_failed(struct io_kiocb *req, s32 res)
 2001{
 2002	req_set_fail(req);
 2003	io_req_complete_post(req, res, 0);
 2004}
 2005
 2006static void io_req_complete_fail_submit(struct io_kiocb *req)
 2007{
 2008	/*
 2009	 * We don't submit, fail them all, for that replace hardlinks with
 2010	 * normal links. Extra REQ_F_LINK is tolerated.
 2011	 */
 2012	req->flags &= ~REQ_F_HARDLINK;
 2013	req->flags |= REQ_F_LINK;
 2014	io_req_complete_failed(req, req->result);
 2015}
 2016
 2017/*
 2018 * Don't initialise the fields below on every allocation, but do that in
 2019 * advance and keep them valid across allocations.
 2020 */
 2021static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
 2022{
 2023	req->ctx = ctx;
 2024	req->link = NULL;
 2025	req->async_data = NULL;
 2026	/* not necessary, but safer to zero */
 2027	req->result = 0;
 2028}
 2029
 2030static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
 2031					struct io_submit_state *state)
 2032{
 2033	spin_lock(&ctx->completion_lock);
 2034	wq_list_splice(&ctx->locked_free_list, &state->free_list);
 2035	ctx->locked_free_nr = 0;
 2036	spin_unlock(&ctx->completion_lock);
 2037}
 2038
 2039/* Returns true IFF there are requests in the cache */
 2040static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
 2041{
 2042	struct io_submit_state *state = &ctx->submit_state;
 2043
 2044	/*
 2045	 * If we have more than a batch's worth of requests in our IRQ side
 2046	 * locked cache, grab the lock and move them over to our submission
 2047	 * side cache.
 2048	 */
 2049	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
 2050		io_flush_cached_locked_reqs(ctx, state);
 2051	return !!state->free_list.next;
 2052}
 2053
 2054/*
 2055 * A request might get retired back into the request caches even before opcode
 2056 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 2057 * Because of that, io_alloc_req() should be called only under ->uring_lock
 2058 * and with extra caution to not get a request that is still worked on.
 2059 */
 2060static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 2061	__must_hold(&ctx->uring_lock)
 2062{
 2063	struct io_submit_state *state = &ctx->submit_state;
 2064	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 2065	void *reqs[IO_REQ_ALLOC_BATCH];
 2066	struct io_kiocb *req;
 2067	int ret, i;
 2068
 2069	if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
 2070		return true;
 2071
 2072	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
 2073
 2074	/*
 2075	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
 2076	 * retry single alloc to be on the safe side.
 2077	 */
 2078	if (unlikely(ret <= 0)) {
 2079		reqs[0] = kmem_cache_alloc(req_cachep, gfp);
 2080		if (!reqs[0])
 2081			return false;
 2082		ret = 1;
 2083	}
 2084
 2085	percpu_ref_get_many(&ctx->refs, ret);
 2086	for (i = 0; i < ret; i++) {
 2087		req = reqs[i];
 2088
 2089		io_preinit_req(req, ctx);
 2090		wq_stack_add_head(&req->comp_list, &state->free_list);
 2091	}
 2092	return true;
 2093}
 2094
 2095static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
 2096{
 2097	if (unlikely(!ctx->submit_state.free_list.next))
 2098		return __io_alloc_req_refill(ctx);
 2099	return true;
 2100}
 2101
 2102static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 2103{
 2104	struct io_wq_work_node *node;
 2105
 2106	node = wq_stack_extract(&ctx->submit_state.free_list);
 2107	return container_of(node, struct io_kiocb, comp_list);
 2108}
 2109
 2110static inline void io_put_file(struct file *file)
 2111{
 2112	if (file)
 2113		fput(file);
 2114}
 2115
 2116static inline void io_dismantle_req(struct io_kiocb *req)
 2117{
 2118	unsigned int flags = req->flags;
 2119
 2120	if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
 2121		io_clean_op(req);
 2122	if (!(flags & REQ_F_FIXED_FILE))
 2123		io_put_file(req->file);
 2124}
 2125
 2126static __cold void __io_free_req(struct io_kiocb *req)
 2127{
 2128	struct io_ring_ctx *ctx = req->ctx;
 2129
 2130	io_req_put_rsrc(req, ctx);
 2131	io_dismantle_req(req);
 2132	io_put_task(req->task, 1);
 2133
 2134	spin_lock(&ctx->completion_lock);
 2135	wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
 2136	ctx->locked_free_nr++;
 2137	spin_unlock(&ctx->completion_lock);
 2138}
 2139
 2140static inline void io_remove_next_linked(struct io_kiocb *req)
 2141{
 2142	struct io_kiocb *nxt = req->link;
 2143
 2144	req->link = nxt->link;
 2145	nxt->link = NULL;
 2146}
 2147
 2148static bool io_kill_linked_timeout(struct io_kiocb *req)
 2149	__must_hold(&req->ctx->completion_lock)
 2150	__must_hold(&req->ctx->timeout_lock)
 2151{
 2152	struct io_kiocb *link = req->link;
 2153
 2154	if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
 2155		struct io_timeout_data *io = link->async_data;
 2156
 2157		io_remove_next_linked(req);
 2158		link->timeout.head = NULL;
 2159		if (hrtimer_try_to_cancel(&io->timer) != -1) {
 2160			list_del(&link->timeout.list);
 2161			/* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
 2162			io_fill_cqe_req(link, -ECANCELED, 0);
 2163			io_put_req_deferred(link);
 2164			return true;
 2165		}
 2166	}
 2167	return false;
 2168}
 2169
 2170static void io_fail_links(struct io_kiocb *req)
 2171	__must_hold(&req->ctx->completion_lock)
 2172{
 2173	struct io_kiocb *nxt, *link = req->link;
 2174	bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
 2175
 2176	req->link = NULL;
 2177	while (link) {
 2178		long res = -ECANCELED;
 2179
 2180		if (link->flags & REQ_F_FAIL)
 2181			res = link->result;
 2182
 2183		nxt = link->link;
 2184		link->link = NULL;
 2185
 2186		trace_io_uring_fail_link(req, link);
 2187		if (!ignore_cqes) {
 2188			link->flags &= ~REQ_F_CQE_SKIP;
 2189			io_fill_cqe_req(link, res, 0);
 2190		}
 2191		io_put_req_deferred(link);
 2192		link = nxt;
 2193	}
 2194}
 2195
 2196static bool io_disarm_next(struct io_kiocb *req)
 2197	__must_hold(&req->ctx->completion_lock)
 2198{
 2199	bool posted = false;
 2200
 2201	if (req->flags & REQ_F_ARM_LTIMEOUT) {
 2202		struct io_kiocb *link = req->link;
 2203
 2204		req->flags &= ~REQ_F_ARM_LTIMEOUT;
 2205		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
 2206			io_remove_next_linked(req);
 2207			/* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
 2208			io_fill_cqe_req(link, -ECANCELED, 0);
 2209			io_put_req_deferred(link);
 2210			posted = true;
 2211		}
 2212	} else if (req->flags & REQ_F_LINK_TIMEOUT) {
 2213		struct io_ring_ctx *ctx = req->ctx;
 2214
 2215		spin_lock_irq(&ctx->timeout_lock);
 2216		posted = io_kill_linked_timeout(req);
 2217		spin_unlock_irq(&ctx->timeout_lock);
 2218	}
 2219	if (unlikely((req->flags & REQ_F_FAIL) &&
 2220		     !(req->flags & REQ_F_HARDLINK))) {
 2221		posted |= (req->link != NULL);
 2222		io_fail_links(req);
 2223	}
 2224	return posted;
 2225}
 2226
 2227static void __io_req_find_next_prep(struct io_kiocb *req)
 2228{
 2229	struct io_ring_ctx *ctx = req->ctx;
 2230	bool posted;
 2231
 2232	spin_lock(&ctx->completion_lock);
 2233	posted = io_disarm_next(req);
 2234	if (posted)
 2235		io_commit_cqring(ctx);
 2236	spin_unlock(&ctx->completion_lock);
 2237	if (posted)
 2238		io_cqring_ev_posted(ctx);
 2239}
 2240
 2241static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 2242{
 2243	struct io_kiocb *nxt;
 2244
 2245	if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
 2246		return NULL;
 2247	/*
 2248	 * If LINK is set, we have dependent requests in this chain. If we
 2249	 * didn't fail this request, queue the first one up, moving any other
 2250	 * dependencies to the next request. In case of failure, fail the rest
 2251	 * of the chain.
 2252	 */
 2253	if (unlikely(req->flags & IO_DISARM_MASK))
 2254		__io_req_find_next_prep(req);
 2255	nxt = req->link;
 2256	req->link = NULL;
 2257	return nxt;
 2258}
 2259
 2260static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 2261{
 2262	if (!ctx)
 2263		return;
 2264	if (*locked) {
 2265		io_submit_flush_completions(ctx);
 2266		mutex_unlock(&ctx->uring_lock);
 2267		*locked = false;
 2268	}
 2269	percpu_ref_put(&ctx->refs);
 2270}
 2271
 2272static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
 2273{
 2274	io_commit_cqring(ctx);
 2275	spin_unlock(&ctx->completion_lock);
 2276	io_cqring_ev_posted(ctx);
 2277}
 2278
 2279static void handle_prev_tw_list(struct io_wq_work_node *node,
 2280				struct io_ring_ctx **ctx, bool *uring_locked)
 2281{
 2282	if (*ctx && !*uring_locked)
 2283		spin_lock(&(*ctx)->completion_lock);
 2284
 2285	do {
 2286		struct io_wq_work_node *next = node->next;
 2287		struct io_kiocb *req = container_of(node, struct io_kiocb,
 2288						    io_task_work.node);
 2289
 2290		if (req->ctx != *ctx) {
 2291			if (unlikely(!*uring_locked && *ctx))
 2292				ctx_commit_and_unlock(*ctx);
 2293
 2294			ctx_flush_and_put(*ctx, uring_locked);
 2295			*ctx = req->ctx;
 2296			/* if not contended, grab and improve batching */
 2297			*uring_locked = mutex_trylock(&(*ctx)->uring_lock);
 2298			percpu_ref_get(&(*ctx)->refs);
 2299			if (unlikely(!*uring_locked))
 2300				spin_lock(&(*ctx)->completion_lock);
 2301		}
 2302		if (likely(*uring_locked))
 2303			req->io_task_work.func(req, uring_locked);
 2304		else
 2305			__io_req_complete_post(req, req->result, io_put_kbuf(req));
 2306		node = next;
 2307	} while (node);
 2308
 2309	if (unlikely(!*uring_locked))
 2310		ctx_commit_and_unlock(*ctx);
 2311}
 2312
 2313static void handle_tw_list(struct io_wq_work_node *node,
 2314			   struct io_ring_ctx **ctx, bool *locked)
 2315{
 2316	do {
 2317		struct io_wq_work_node *next = node->next;
 2318		struct io_kiocb *req = container_of(node, struct io_kiocb,
 2319						    io_task_work.node);
 2320
 2321		if (req->ctx != *ctx) {
 2322			ctx_flush_and_put(*ctx, locked);
 2323			*ctx = req->ctx;
 2324			/* if not contended, grab and improve batching */
 2325			*locked = mutex_trylock(&(*ctx)->uring_lock);
 2326			percpu_ref_get(&(*ctx)->refs);
 2327		}
 2328		req->io_task_work.func(req, locked);
 2329		node = next;
 2330	} while (node);
 2331}
 2332
 2333static void tctx_task_work(struct callback_head *cb)
 2334{
 2335	bool uring_locked = false;
 2336	struct io_ring_ctx *ctx = NULL;
 2337	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
 2338						  task_work);
 2339
 2340	while (1) {
 2341		struct io_wq_work_node *node1, *node2;
 2342
 2343		if (!tctx->task_list.first &&
 2344		    !tctx->prior_task_list.first && uring_locked)
 2345			io_submit_flush_completions(ctx);
 2346
 2347		spin_lock_irq(&tctx->task_lock);
 2348		node1 = tctx->prior_task_list.first;
 2349		node2 = tctx->task_list.first;
 2350		INIT_WQ_LIST(&tctx->task_list);
 2351		INIT_WQ_LIST(&tctx->prior_task_list);
 2352		if (!node2 && !node1)
 2353			tctx->task_running = false;
 2354		spin_unlock_irq(&tctx->task_lock);
 2355		if (!node2 && !node1)
 2356			break;
 2357
 2358		if (node1)
 2359			handle_prev_tw_list(node1, &ctx, &uring_locked);
 2360
 2361		if (node2)
 2362			handle_tw_list(node2, &ctx, &uring_locked);
 2363		cond_resched();
 2364	}
 2365
 2366	ctx_flush_and_put(ctx, &uring_locked);
 2367
 2368	/* relaxed read is enough as only the task itself sets ->in_idle */
 2369	if (unlikely(atomic_read(&tctx->in_idle)))
 2370		io_uring_drop_tctx_refs(current);
 2371}
 2372
 2373static void io_req_task_work_add(struct io_kiocb *req, bool priority)
 2374{
 2375	struct task_struct *tsk = req->task;
 2376	struct io_uring_task *tctx = tsk->io_uring;
 2377	enum task_work_notify_mode notify;
 2378	struct io_wq_work_node *node;
 2379	unsigned long flags;
 2380	bool running;
 2381
 2382	WARN_ON_ONCE(!tctx);
 2383
 2384	spin_lock_irqsave(&tctx->task_lock, flags);
 2385	if (priority)
 2386		wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
 2387	else
 2388		wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
 2389	running = tctx->task_running;
 2390	if (!running)
 2391		tctx->task_running = true;
 2392	spin_unlock_irqrestore(&tctx->task_lock, flags);
 2393
 2394	/* task_work already pending, we're done */
 2395	if (running)
 2396		return;
 2397
 2398	/*
 2399	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
 2400	 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
 2401	 * processing task_work. There's no reliable way to tell if TWA_RESUME
 2402	 * will do the job.
 2403	 */
 2404	notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
 2405	if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
 2406		if (notify == TWA_NONE)
 2407			wake_up_process(tsk);
 2408		return;
 2409	}
 2410
 2411	spin_lock_irqsave(&tctx->task_lock, flags);
 2412	tctx->task_running = false;
 2413	node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
 2414	spin_unlock_irqrestore(&tctx->task_lock, flags);
 2415
 2416	while (node) {
 2417		req = container_of(node, struct io_kiocb, io_task_work.node);
 2418		node = node->next;
 2419		if (llist_add(&req->io_task_work.fallback_node,
 2420			      &req->ctx->fallback_llist))
 2421			schedule_delayed_work(&req->ctx->fallback_work, 1);
 2422	}
 2423}
 2424
 2425static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
 2426{
 2427	struct io_ring_ctx *ctx = req->ctx;
 2428
 2429	/* not needed for normal modes, but SQPOLL depends on it */
 2430	io_tw_lock(ctx, locked);
 2431	io_req_complete_failed(req, req->result);
 2432}
 2433
 2434static void io_req_task_submit(struct io_kiocb *req, bool *locked)
 2435{
 2436	struct io_ring_ctx *ctx = req->ctx;
 2437
 2438	io_tw_lock(ctx, locked);
 2439	/* req->task == current here, checking PF_EXITING is safe */
 2440	if (likely(!(req->task->flags & PF_EXITING)))
 2441		__io_queue_sqe(req);
 2442	else
 2443		io_req_complete_failed(req, -EFAULT);
 2444}
 2445
 2446static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
 2447{
 2448	req->result = ret;
 2449	req->io_task_work.func = io_req_task_cancel;
 2450	io_req_task_work_add(req, false);
 2451}
 2452
 2453static void io_req_task_queue(struct io_kiocb *req)
 2454{
 2455	req->io_task_work.func = io_req_task_submit;
 2456	io_req_task_work_add(req, false);
 2457}
 2458
 2459static void io_req_task_queue_reissue(struct io_kiocb *req)
 2460{
 2461	req->io_task_work.func = io_queue_async_work;
 2462	io_req_task_work_add(req, false);
 2463}
 2464
 2465static inline void io_queue_next(struct io_kiocb *req)
 2466{
 2467	struct io_kiocb *nxt = io_req_find_next(req);
 2468
 2469	if (nxt)
 2470		io_req_task_queue(nxt);
 2471}
 2472
 2473static void io_free_req(struct io_kiocb *req)
 2474{
 2475	io_queue_next(req);
 2476	__io_free_req(req);
 2477}
 2478
 2479static void io_free_req_work(struct io_kiocb *req, bool *locked)
 2480{
 2481	io_free_req(req);
 2482}
 2483
 2484static void io_free_batch_list(struct io_ring_ctx *ctx,
 2485				struct io_wq_work_node *node)
 2486	__must_hold(&ctx->uring_lock)
 2487{
 2488	struct task_struct *task = NULL;
 2489	int task_refs = 0;
 2490
 2491	do {
 2492		struct io_kiocb *req = container_of(node, struct io_kiocb,
 2493						    comp_list);
 2494
 2495		if (unlikely(req->flags & REQ_F_REFCOUNT)) {
 2496			node = req->comp_list.next;
 2497			if (!req_ref_put_and_test(req))
 2498				continue;
 2499		}
 2500
 2501		io_req_put_rsrc_locked(req, ctx);
 2502		io_queue_next(req);
 2503		io_dismantle_req(req);
 2504
 2505		if (req->task != task) {
 2506			if (task)
 2507				io_put_task(task, task_refs);
 2508			task = req->task;
 2509			task_refs = 0;
 2510		}
 2511		task_refs++;
 2512		node = req->comp_list.next;
 2513		wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
 2514	} while (node);
 2515
 2516	if (task)
 2517		io_put_task(task, task_refs);
 2518}
 2519
 2520static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 2521	__must_hold(&ctx->uring_lock)
 2522{
 2523	struct io_wq_work_node *node, *prev;
 2524	struct io_submit_state *state = &ctx->submit_state;
 2525
 2526	if (state->flush_cqes) {
 2527		spin_lock(&ctx->completion_lock);
 2528		wq_list_for_each(node, prev, &state->compl_reqs) {
 2529			struct io_kiocb *req = container_of(node, struct io_kiocb,
 2530						    comp_list);
 2531
 2532			if (!(req->flags & REQ_F_CQE_SKIP))
 2533				__io_fill_cqe(ctx, req->user_data, req->result,
 2534					      req->cflags);
 2535		}
 2536
 2537		io_commit_cqring(ctx);
 2538		spin_unlock(&ctx->completion_lock);
 2539		io_cqring_ev_posted(ctx);
 2540		state->flush_cqes = false;
 2541	}
 2542
 2543	io_free_batch_list(ctx, state->compl_reqs.first);
 2544	INIT_WQ_LIST(&state->compl_reqs);
 2545}
 2546
 2547/*
 2548 * Drop reference to request, return next in chain (if there is one) if this
 2549 * was the last reference to this request.
 2550 */
 2551static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
 2552{
 2553	struct io_kiocb *nxt = NULL;
 2554
 2555	if (req_ref_put_and_test(req)) {
 2556		nxt = io_req_find_next(req);
 2557		__io_free_req(req);
 2558	}
 2559	return nxt;
 2560}
 2561
 2562static inline void io_put_req(struct io_kiocb *req)
 2563{
 2564	if (req_ref_put_and_test(req))
 2565		io_free_req(req);
 2566}
 2567
 2568static inline void io_put_req_deferred(struct io_kiocb *req)
 2569{
 2570	if (req_ref_put_and_test(req)) {
 2571		req->io_task_work.func = io_free_req_work;
 2572		io_req_task_work_add(req, false);
 2573	}
 2574}
 2575
 2576static unsigned io_cqring_events(struct io_ring_ctx *ctx)
 2577{
 2578	/* See comment at the top of this file */
 2579	smp_rmb();
 2580	return __io_cqring_events(ctx);
 2581}
 2582
 2583static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 2584{
 2585	struct io_rings *rings = ctx->rings;
 2586
 2587	/* make sure SQ entry isn't read before tail */
 2588	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
 2589}
 2590
 2591static inline bool io_run_task_work(void)
 2592{
 2593	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
 2594		__set_current_state(TASK_RUNNING);
 2595		tracehook_notify_signal();
 2596		return true;
 2597	}
 2598
 2599	return false;
 2600}
 2601
 2602static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 2603{
 2604	struct io_wq_work_node *pos, *start, *prev;
 2605	unsigned int poll_flags = BLK_POLL_NOSLEEP;
 2606	DEFINE_IO_COMP_BATCH(iob);
 2607	int nr_events = 0;
 2608
 2609	/*
 2610	 * Only spin for completions if we don't have multiple devices hanging
 2611	 * off our complete list.
 2612	 */
 2613	if (ctx->poll_multi_queue || force_nonspin)
 2614		poll_flags |= BLK_POLL_ONESHOT;
 2615
 2616	wq_list_for_each(pos, start, &ctx->iopoll_list) {
 2617		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
 2618		struct kiocb *kiocb = &req->rw.kiocb;
 2619		int ret;
 2620
 2621		/*
 2622		 * Move completed and retryable entries to our local lists.
 2623		 * If we find a request that requires polling, break out
 2624		 * and complete those lists first, if we have entries there.
 2625		 */
 2626		if (READ_ONCE(req->iopoll_completed))
 2627			break;
 2628
 2629		ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
 2630		if (unlikely(ret < 0))
 2631			return ret;
 2632		else if (ret)
 2633			poll_flags |= BLK_POLL_ONESHOT;
 2634
 2635		/* iopoll may have completed current req */
 2636		if (!rq_list_empty(iob.req_list) ||
 2637		    READ_ONCE(req->iopoll_completed))
 2638			break;
 2639	}
 2640
 2641	if (!rq_list_empty(iob.req_list))
 2642		iob.complete(&iob);
 2643	else if (!pos)
 2644		return 0;
 2645
 2646	prev = start;
 2647	wq_list_for_each_resume(pos, prev) {
 2648		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
 2649
 2650		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
 2651		if (!smp_load_acquire(&req->iopoll_completed))
 2652			break;
 2653		if (unlikely(req->flags & REQ_F_CQE_SKIP))
 2654			continue;
 2655
 2656		__io_fill_cqe(ctx, req->user_data, req->result, io_put_kbuf(req));
 2657		nr_events++;
 2658	}
 2659
 2660	if (unlikely(!nr_events))
 2661		return 0;
 2662
 2663	io_commit_cqring(ctx);
 2664	io_cqring_ev_posted_iopoll(ctx);
 2665	pos = start ? start->next : ctx->iopoll_list.first;
 2666	wq_list_cut(&ctx->iopoll_list, prev, start);
 2667	io_free_batch_list(ctx, pos);
 2668	return nr_events;
 2669}
 2670
 2671/*
 2672 * We can't just wait for polled events to come to us, we have to actively
 2673 * find and complete them.
 2674 */
 2675static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
 2676{
 2677	if (!(ctx->flags & IORING_SETUP_IOPOLL))
 2678		return;
 2679
 2680	mutex_lock(&ctx->uring_lock);
 2681	while (!wq_list_empty(&ctx->iopoll_list)) {
 2682		/* let it sleep and repeat later if can't complete a request */
 2683		if (io_do_iopoll(ctx, true) == 0)
 2684			break;
 2685		/*
 2686		 * Ensure we allow local-to-the-cpu processing to take place,
 2687		 * in this case we need to ensure that we reap all events.
 2688		 * Also let task_work, etc. to progress by releasing the mutex
 2689		 */
 2690		if (need_resched()) {
 2691			mutex_unlock(&ctx->uring_lock);
 2692			cond_resched();
 2693			mutex_lock(&ctx->uring_lock);
 2694		}
 2695	}
 2696	mutex_unlock(&ctx->uring_lock);
 2697}
 2698
 2699static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 2700{
 2701	unsigned int nr_events = 0;
 2702	int ret = 0;
 2703
 2704	/*
 2705	 * We disallow the app entering submit/complete with polling, but we
 2706	 * still need to lock the ring to prevent racing with polled issue
 2707	 * that got punted to a workqueue.
 2708	 */
 2709	mutex_lock(&ctx->uring_lock);
 2710	/*
 2711	 * Don't enter poll loop if we already have events pending.
 2712	 * If we do, we can potentially be spinning for commands that
 2713	 * already triggered a CQE (eg in error).
 2714	 */
 2715	if (test_bit(0, &ctx->check_cq_overflow))
 2716		__io_cqring_overflow_flush(ctx, false);
 2717	if (io_cqring_events(ctx))
 2718		goto out;
 2719	do {
 2720		/*
 2721		 * If a submit got punted to a workqueue, we can have the
 2722		 * application entering polling for a command before it gets
 2723		 * issued. That app will hold the uring_lock for the duration
 2724		 * of the poll right here, so we need to take a breather every
 2725		 * now and then to ensure that the issue has a chance to add
 2726		 * the poll to the issued list. Otherwise we can spin here
 2727		 * forever, while the workqueue is stuck trying to acquire the
 2728		 * very same mutex.
 2729		 */
 2730		if (wq_list_empty(&ctx->iopoll_list)) {
 2731			u32 tail = ctx->cached_cq_tail;
 2732
 2733			mutex_unlock(&ctx->uring_lock);
 2734			io_run_task_work();
 2735			mutex_lock(&ctx->uring_lock);
 2736
 2737			/* some requests don't go through iopoll_list */
 2738			if (tail != ctx->cached_cq_tail ||
 2739			    wq_list_empty(&ctx->iopoll_list))
 2740				break;
 2741		}
 2742		ret = io_do_iopoll(ctx, !min);
 2743		if (ret < 0)
 2744			break;
 2745		nr_events += ret;
 2746		ret = 0;
 2747	} while (nr_events < min && !need_resched());
 2748out:
 2749	mutex_unlock(&ctx->uring_lock);
 2750	return ret;
 2751}
 2752
 2753static void kiocb_end_write(struct io_kiocb *req)
 2754{
 2755	/*
 2756	 * Tell lockdep we inherited freeze protection from submission
 2757	 * thread.
 2758	 */
 2759	if (req->flags & REQ_F_ISREG) {
 2760		struct super_block *sb = file_inode(req->file)->i_sb;
 2761
 2762		__sb_writers_acquired(sb, SB_FREEZE_WRITE);
 2763		sb_end_write(sb);
 2764	}
 2765}
 2766
 2767#ifdef CONFIG_BLOCK
 2768static bool io_resubmit_prep(struct io_kiocb *req)
 2769{
 2770	struct io_async_rw *rw = req->async_data;
 2771
 2772	if (!req_has_async_data(req))
 2773		return !io_req_prep_async(req);
 2774	iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
 2775	return true;
 2776}
 2777
 2778static bool io_rw_should_reissue(struct io_kiocb *req)
 2779{
 2780	umode_t mode = file_inode(req->file)->i_mode;
 2781	struct io_ring_ctx *ctx = req->ctx;
 2782
 2783	if (!S_ISBLK(mode) && !S_ISREG(mode))
 2784		return false;
 2785	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
 2786	    !(ctx->flags & IORING_SETUP_IOPOLL)))
 2787		return false;
 2788	/*
 2789	 * If ref is dying, we might be running poll reap from the exit work.
 2790	 * Don't attempt to reissue from that path, just let it fail with
 2791	 * -EAGAIN.
 2792	 */
 2793	if (percpu_ref_is_dying(&ctx->refs))
 2794		return false;
 2795	/*
 2796	 * Play it safe and assume not safe to re-import and reissue if we're
 2797	 * not in the original thread group (or in task context).
 2798	 */
 2799	if (!same_thread_group(req->task, current) || !in_task())
 2800		return false;
 2801	return true;
 2802}
 2803#else
 2804static bool io_resubmit_prep(struct io_kiocb *req)
 2805{
 2806	return false;
 2807}
 2808static bool io_rw_should_reissue(struct io_kiocb *req)
 2809{
 2810	return false;
 2811}
 2812#endif
 2813
 2814static bool __io_complete_rw_common(struct io_kiocb *req, long res)
 2815{
 2816	if (req->rw.kiocb.ki_flags & IOCB_WRITE)
 2817		kiocb_end_write(req);
 2818	if (unlikely(res != req->result)) {
 2819		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
 2820		    io_rw_should_reissue(req)) {
 2821			req->flags |= REQ_F_REISSUE;
 2822			return true;
 2823		}
 2824		req_set_fail(req);
 2825		req->result = res;
 2826	}
 2827	return false;
 2828}
 2829
 2830static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
 2831{
 2832	unsigned int cflags = io_put_kbuf(req);
 2833	int res = req->result;
 2834
 2835	if (*locked) {
 2836		io_req_complete_state(req, res, cflags);
 2837		io_req_add_compl_list(req);
 2838	} else {
 2839		io_req_complete_post(req, res, cflags);
 2840	}
 2841}
 2842
 2843static void __io_complete_rw(struct io_kiocb *req, long res,
 2844			     unsigned int issue_flags)
 2845{
 2846	if (__io_complete_rw_common(req, res))
 2847		return;
 2848	__io_req_complete(req, issue_flags, req->result, io_put_kbuf(req));
 2849}
 2850
 2851static void io_complete_rw(struct kiocb *kiocb, long res)
 2852{
 2853	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 2854
 2855	if (__io_complete_rw_common(req, res))
 2856		return;
 2857	req->result = res;
 2858	req->io_task_work.func = io_req_task_complete;
 2859	io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
 2860}
 2861
 2862static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
 2863{
 2864	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 2865
 2866	if (kiocb->ki_flags & IOCB_WRITE)
 2867		kiocb_end_write(req);
 2868	if (unlikely(res != req->result)) {
 2869		if (res == -EAGAIN && io_rw_should_reissue(req)) {
 2870			req->flags |= REQ_F_REISSUE;
 2871			return;
 2872		}
 2873		req->result = res;
 2874	}
 2875
 2876	/* order with io_iopoll_complete() checking ->iopoll_completed */
 2877	smp_store_release(&req->iopoll_completed, 1);
 2878}
 2879
 2880/*
 2881 * After the iocb has been issued, it's safe to be found on the poll list.
 2882 * Adding the kiocb to the list AFTER submission ensures that we don't
 2883 * find it from a io_do_iopoll() thread before the issuer is done
 2884 * accessing the kiocb cookie.
 2885 */
 2886static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
 2887{
 2888	struct io_ring_ctx *ctx = req->ctx;
 2889	const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 2890
 2891	/* workqueue context doesn't hold uring_lock, grab it now */
 2892	if (unlikely(needs_lock))
 2893		mutex_lock(&ctx->uring_lock);
 2894
 2895	/*
 2896	 * Track whether we have multiple files in our lists. This will impact
 2897	 * how we do polling eventually, not spinning if we're on potentially
 2898	 * different devices.
 2899	 */
 2900	if (wq_list_empty(&ctx->iopoll_list)) {
 2901		ctx->poll_multi_queue = false;
 2902	} else if (!ctx->poll_multi_queue) {
 2903		struct io_kiocb *list_req;
 2904
 2905		list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
 2906					comp_list);
 2907		if (list_req->file != req->file)
 2908			ctx->poll_multi_queue = true;
 2909	}
 2910
 2911	/*
 2912	 * For fast devices, IO may have already completed. If it has, add
 2913	 * it to the front so we find it first.
 2914	 */
 2915	if (READ_ONCE(req->iopoll_completed))
 2916		wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
 2917	else
 2918		wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
 2919
 2920	if (unlikely(needs_lock)) {
 2921		/*
 2922		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
 2923		 * in sq thread task context or in io worker task context. If
 2924		 * current task context is sq thread, we don't need to check
 2925		 * whether should wake up sq thread.
 2926		 */
 2927		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
 2928		    wq_has_sleeper(&ctx->sq_data->wait))
 2929			wake_up(&ctx->sq_data->wait);
 2930
 2931		mutex_unlock(&ctx->uring_lock);
 2932	}
 2933}
 2934
 2935static bool io_bdev_nowait(struct block_device *bdev)
 2936{
 2937	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
 2938}
 2939
 2940/*
 2941 * If we tracked the file through the SCM inflight mechanism, we could support
 2942 * any file. For now, just ensure that anything potentially problematic is done
 2943 * inline.
 2944 */
 2945static bool __io_file_supports_nowait(struct file *file, umode_t mode)
 2946{
 2947	if (S_ISBLK(mode)) {
 2948		if (IS_ENABLED(CONFIG_BLOCK) &&
 2949		    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
 2950			return true;
 2951		return false;
 2952	}
 2953	if (S_ISSOCK(mode))
 2954		return true;
 2955	if (S_ISREG(mode)) {
 2956		if (IS_ENABLED(CONFIG_BLOCK) &&
 2957		    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
 2958		    file->f_op != &io_uring_fops)
 2959			return true;
 2960		return false;
 2961	}
 2962
 2963	/* any ->read/write should understand O_NONBLOCK */
 2964	if (file->f_flags & O_NONBLOCK)
 2965		return true;
 2966	return file->f_mode & FMODE_NOWAIT;
 2967}
 2968
 2969/*
 2970 * If we tracked the file through the SCM inflight mechanism, we could support
 2971 * any file. For now, just ensure that anything potentially problematic is done
 2972 * inline.
 2973 */
 2974static unsigned int io_file_get_flags(struct file *file)
 2975{
 2976	umode_t mode = file_inode(file)->i_mode;
 2977	unsigned int res = 0;
 2978
 2979	if (S_ISREG(mode))
 2980		res |= FFS_ISREG;
 2981	if (__io_file_supports_nowait(file, mode))
 2982		res |= FFS_NOWAIT;
 2983	return res;
 2984}
 2985
 2986static inline bool io_file_supports_nowait(struct io_kiocb *req)
 2987{
 2988	return req->flags & REQ_F_SUPPORT_NOWAIT;
 2989}
 2990
 2991static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 2992{
 2993	struct io_ring_ctx *ctx = req->ctx;
 2994	struct kiocb *kiocb = &req->rw.kiocb;
 2995	struct file *file = req->file;
 2996	unsigned ioprio;
 2997	int ret;
 2998
 2999	if (!io_req_ffs_set(req))
 3000		req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
 3001
 3002	kiocb->ki_pos = READ_ONCE(sqe->off);
 3003	if (kiocb->ki_pos == -1) {
 3004		if (!(file->f_mode & FMODE_STREAM)) {
 3005			req->flags |= REQ_F_CUR_POS;
 3006			kiocb->ki_pos = file->f_pos;
 3007		} else {
 3008			kiocb->ki_pos = 0;
 3009		}
 3010	}
 3011	kiocb->ki_flags = iocb_flags(file);
 3012	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
 3013	if (unlikely(ret))
 3014		return ret;
 3015
 3016	/*
 3017	 * If the file is marked O_NONBLOCK, still allow retry for it if it
 3018	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
 3019	 * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
 3020	 */
 3021	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
 3022	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
 3023		req->flags |= REQ_F_NOWAIT;
 3024
 3025	if (ctx->flags & IORING_SETUP_IOPOLL) {
 3026		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
 3027			return -EOPNOTSUPP;
 3028
 3029		kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
 3030		kiocb->ki_complete = io_complete_rw_iopoll;
 3031		req->iopoll_completed = 0;
 3032	} else {
 3033		if (kiocb->ki_flags & IOCB_HIPRI)
 3034			return -EINVAL;
 3035		kiocb->ki_complete = io_complete_rw;
 3036	}
 3037
 3038	ioprio = READ_ONCE(sqe->ioprio);
 3039	if (ioprio) {
 3040		ret = ioprio_check_cap(ioprio);
 3041		if (ret)
 3042			return ret;
 3043
 3044		kiocb->ki_ioprio = ioprio;
 3045	} else {
 3046		kiocb->ki_ioprio = get_current_ioprio();
 3047	}
 3048
 3049	req->imu = NULL;
 3050	req->rw.addr = READ_ONCE(sqe->addr);
 3051	req->rw.len = READ_ONCE(sqe->len);
 3052	req->buf_index = READ_ONCE(sqe->buf_index);
 3053	return 0;
 3054}
 3055
 3056static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 3057{
 3058	switch (ret) {
 3059	case -EIOCBQUEUED:
 3060		break;
 3061	case -ERESTARTSYS:
 3062	case -ERESTARTNOINTR:
 3063	case -ERESTARTNOHAND:
 3064	case -ERESTART_RESTARTBLOCK:
 3065		/*
 3066		 * We can't just restart the syscall, since previously
 3067		 * submitted sqes may already be in progress. Just fail this
 3068		 * IO with EINTR.
 3069		 */
 3070		ret = -EINTR;
 3071		fallthrough;
 3072	default:
 3073		kiocb->ki_complete(kiocb, ret);
 3074	}
 3075}
 3076
 3077static void kiocb_done(struct io_kiocb *req, ssize_t ret,
 3078		       unsigned int issue_flags)
 3079{
 3080	struct io_async_rw *io = req->async_data;
 3081
 3082	/* add previously done IO, if any */
 3083	if (req_has_async_data(req) && io->bytes_done > 0) {
 3084		if (ret < 0)
 3085			ret = io->bytes_done;
 3086		else
 3087			ret += io->bytes_done;
 3088	}
 3089
 3090	if (req->flags & REQ_F_CUR_POS)
 3091		req->file->f_pos = req->rw.kiocb.ki_pos;
 3092	if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw))
 3093		__io_complete_rw(req, ret, issue_flags);
 3094	else
 3095		io_rw_done(&req->rw.kiocb, ret);
 3096
 3097	if (req->flags & REQ_F_REISSUE) {
 3098		req->flags &= ~REQ_F_REISSUE;
 3099		if (io_resubmit_prep(req)) {
 3100			io_req_task_queue_reissue(req);
 3101		} else {
 3102			req_set_fail(req);
 3103			req->result = ret;
 3104			req->io_task_work.func = io_req_task_complete;
 3105			io_req_task_work_add(req, false);
 3106		}
 3107	}
 3108}
 3109
 3110static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
 3111			     struct io_mapped_ubuf *imu)
 3112{
 3113	size_t len = req->rw.len;
 3114	u64 buf_end, buf_addr = req->rw.addr;
 3115	size_t offset;
 3116
 3117	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
 3118		return -EFAULT;
 3119	/* not inside the mapped region */
 3120	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
 3121		return -EFAULT;
 3122
 3123	/*
 3124	 * May not be a start of buffer, set size appropriately
 3125	 * and advance us to the beginning.
 3126	 */
 3127	offset = buf_addr - imu->ubuf;
 3128	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
 3129
 3130	if (offset) {
 3131		/*
 3132		 * Don't use iov_iter_advance() here, as it's really slow for
 3133		 * using the latter parts of a big fixed buffer - it iterates
 3134		 * over each segment manually. We can cheat a bit here, because
 3135		 * we know that:
 3136		 *
 3137		 * 1) it's a BVEC iter, we set it up
 3138		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
 3139		 *    first and last bvec
 3140		 *
 3141		 * So just find our index, and adjust the iterator afterwards.
 3142		 * If the offset is within the first bvec (or the whole first
 3143		 * bvec, just use iov_iter_advance(). This makes it easier
 3144		 * since we can just skip the first segment, which may not
 3145		 * be PAGE_SIZE aligned.
 3146		 */
 3147		const struct bio_vec *bvec = imu->bvec;
 3148
 3149		if (offset <= bvec->bv_len) {
 3150			iov_iter_advance(iter, offset);
 3151		} else {
 3152			unsigned long seg_skip;
 3153
 3154			/* skip first vec */
 3155			offset -= bvec->bv_len;
 3156			seg_skip = 1 + (offset >> PAGE_SHIFT);
 3157
 3158			iter->bvec = bvec + seg_skip;
 3159			iter->nr_segs -= seg_skip;
 3160			iter->count -= bvec->bv_len + offset;
 3161			iter->iov_offset = offset & ~PAGE_MASK;
 3162		}
 3163	}
 3164
 3165	return 0;
 3166}
 3167
 3168static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
 3169{
 3170	struct io_mapped_ubuf *imu = req->imu;
 3171	u16 index, buf_index = req->buf_index;
 3172
 3173	if (likely(!imu)) {
 3174		struct io_ring_ctx *ctx = req->ctx;
 3175
 3176		if (unlikely(buf_index >= ctx->nr_user_bufs))
 3177			return -EFAULT;
 3178		io_req_set_rsrc_node(req, ctx);
 3179		index = array_index_nospec(buf_index, ctx->nr_user_bufs);
 3180		imu = READ_ONCE(ctx->user_bufs[index]);
 3181		req->imu = imu;
 3182	}
 3183	return __io_import_fixed(req, rw, iter, imu);
 3184}
 3185
 3186static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
 3187{
 3188	if (needs_lock)
 3189		mutex_unlock(&ctx->uring_lock);
 3190}
 3191
 3192static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
 3193{
 3194	/*
 3195	 * "Normal" inline submissions always hold the uring_lock, since we
 3196	 * grab it from the system call. Same is true for the SQPOLL offload.
 3197	 * The only exception is when we've detached the request and issue it
 3198	 * from an async worker thread, grab the lock for that case.
 3199	 */
 3200	if (needs_lock)
 3201		mutex_lock(&ctx->uring_lock);
 3202}
 3203
 3204static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
 3205					  int bgid, unsigned int issue_flags)
 3206{
 3207	struct io_buffer *kbuf = req->kbuf;
 3208	struct io_buffer *head;
 3209	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 3210
 3211	if (req->flags & REQ_F_BUFFER_SELECTED)
 3212		return kbuf;
 3213
 3214	io_ring_submit_lock(req->ctx, needs_lock);
 3215
 3216	lockdep_assert_held(&req->ctx->uring_lock);
 3217
 3218	head = xa_load(&req->ctx->io_buffers, bgid);
 3219	if (head) {
 3220		if (!list_empty(&head->list)) {
 3221			kbuf = list_last_entry(&head->list, struct io_buffer,
 3222							list);
 3223			list_del(&kbuf->list);
 3224		} else {
 3225			kbuf = head;
 3226			xa_erase(&req->ctx->io_buffers, bgid);
 3227		}
 3228		if (*len > kbuf->len)
 3229			*len = kbuf->len;
 3230		req->flags |= REQ_F_BUFFER_SELECTED;
 3231		req->kbuf = kbuf;
 3232	} else {
 3233		kbuf = ERR_PTR(-ENOBUFS);
 3234	}
 3235
 3236	io_ring_submit_unlock(req->ctx, needs_lock);
 3237	return kbuf;
 3238}
 3239
 3240static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
 3241					unsigned int issue_flags)
 3242{
 3243	struct io_buffer *kbuf;
 3244	u16 bgid;
 3245
 3246	bgid = req->buf_index;
 3247	kbuf = io_buffer_select(req, len, bgid, issue_flags);
 3248	if (IS_ERR(kbuf))
 3249		return kbuf;
 3250	return u64_to_user_ptr(kbuf->addr);
 3251}
 3252
 3253#ifdef CONFIG_COMPAT
 3254static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
 3255				unsigned int issue_flags)
 3256{
 3257	struct compat_iovec __user *uiov;
 3258	compat_ssize_t clen;
 3259	void __user *buf;
 3260	ssize_t len;
 3261
 3262	uiov = u64_to_user_ptr(req->rw.addr);
 3263	if (!access_ok(uiov, sizeof(*uiov)))
 3264		return -EFAULT;
 3265	if (__get_user(clen, &uiov->iov_len))
 3266		return -EFAULT;
 3267	if (clen < 0)
 3268		return -EINVAL;
 3269
 3270	len = clen;
 3271	buf = io_rw_buffer_select(req, &len, issue_flags);
 3272	if (IS_ERR(buf))
 3273		return PTR_ERR(buf);
 3274	iov[0].iov_base = buf;
 3275	iov[0].iov_len = (compat_size_t) len;
 3276	return 0;
 3277}
 3278#endif
 3279
 3280static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 3281				      unsigned int issue_flags)
 3282{
 3283	struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
 3284	void __user *buf;
 3285	ssize_t len;
 3286
 3287	if (copy_from_user(iov, uiov, sizeof(*uiov)))
 3288		return -EFAULT;
 3289
 3290	len = iov[0].iov_len;
 3291	if (len < 0)
 3292		return -EINVAL;
 3293	buf = io_rw_buffer_select(req, &len, issue_flags);
 3294	if (IS_ERR(buf))
 3295		return PTR_ERR(buf);
 3296	iov[0].iov_base = buf;
 3297	iov[0].iov_len = len;
 3298	return 0;
 3299}
 3300
 3301static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 3302				    unsigned int issue_flags)
 3303{
 3304	if (req->flags & REQ_F_BUFFER_SELECTED) {
 3305		struct io_buffer *kbuf = req->kbuf;
 3306
 3307		iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
 3308		iov[0].iov_len = kbuf->len;
 3309		return 0;
 3310	}
 3311	if (req->rw.len != 1)
 3312		return -EINVAL;
 3313
 3314#ifdef CONFIG_COMPAT
 3315	if (req->ctx->compat)
 3316		return io_compat_import(req, iov, issue_flags);
 3317#endif
 3318
 3319	return __io_iov_buffer_select(req, iov, issue_flags);
 3320}
 3321
 3322static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
 3323				       struct io_rw_state *s,
 3324				       unsigned int issue_flags)
 3325{
 3326	struct iov_iter *iter = &s->iter;
 3327	u8 opcode = req->opcode;
 3328	struct iovec *iovec;
 3329	void __user *buf;
 3330	size_t sqe_len;
 3331	ssize_t ret;
 3332
 3333	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
 3334		ret = io_import_fixed(req, rw, iter);
 3335		if (ret)
 3336			return ERR_PTR(ret);
 3337		return NULL;
 3338	}
 3339
 3340	/* buffer index only valid with fixed read/write, or buffer select  */
 3341	if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
 3342		return ERR_PTR(-EINVAL);
 3343
 3344	buf = u64_to_user_ptr(req->rw.addr);
 3345	sqe_len = req->rw.len;
 3346
 3347	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
 3348		if (req->flags & REQ_F_BUFFER_SELECT) {
 3349			buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
 3350			if (IS_ERR(buf))
 3351				return ERR_CAST(buf);
 3352			req->rw.len = sqe_len;
 3353		}
 3354
 3355		ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
 3356		if (ret)
 3357			return ERR_PTR(ret);
 3358		return NULL;
 3359	}
 3360
 3361	iovec = s->fast_iov;
 3362	if (req->flags & REQ_F_BUFFER_SELECT) {
 3363		ret = io_iov_buffer_select(req, iovec, issue_flags);
 3364		if (ret)
 3365			return ERR_PTR(ret);
 3366		iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
 3367		return NULL;
 3368	}
 3369
 3370	ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
 3371			      req->ctx->compat);
 3372	if (unlikely(ret < 0))
 3373		return ERR_PTR(ret);
 3374	return iovec;
 3375}
 3376
 3377static inline int io_import_iovec(int rw, struct io_kiocb *req,
 3378				  struct iovec **iovec, struct io_rw_state *s,
 3379				  unsigned int issue_flags)
 3380{
 3381	*iovec = __io_import_iovec(rw, req, s, issue_flags);
 3382	if (unlikely(IS_ERR(*iovec)))
 3383		return PTR_ERR(*iovec);
 3384
 3385	iov_iter_save_state(&s->iter, &s->iter_state);
 3386	return 0;
 3387}
 3388
 3389static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
 3390{
 3391	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
 3392}
 3393
 3394/*
 3395 * For files that don't have ->read_iter() and ->write_iter(), handle them
 3396 * by looping over ->read() or ->write() manually.
 3397 */
 3398static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
 3399{
 3400	struct kiocb *kiocb = &req->rw.kiocb;
 3401	struct file *file = req->file;
 3402	ssize_t ret = 0;
 3403
 3404	/*
 3405	 * Don't support polled IO through this interface, and we can't
 3406	 * support non-blocking either. For the latter, this just causes
 3407	 * the kiocb to be handled from an async context.
 3408	 */
 3409	if (kiocb->ki_flags & IOCB_HIPRI)
 3410		return -EOPNOTSUPP;
 3411	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
 3412	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
 3413		return -EAGAIN;
 3414
 3415	while (iov_iter_count(iter)) {
 3416		struct iovec iovec;
 3417		ssize_t nr;
 3418
 3419		if (!iov_iter_is_bvec(iter)) {
 3420			iovec = iov_iter_iovec(iter);
 3421		} else {
 3422			iovec.iov_base = u64_to_user_ptr(req->rw.addr);
 3423			iovec.iov_len = req->rw.len;
 3424		}
 3425
 3426		if (rw == READ) {
 3427			nr = file->f_op->read(file, iovec.iov_base,
 3428					      iovec.iov_len, io_kiocb_ppos(kiocb));
 3429		} else {
 3430			nr = file->f_op->write(file, iovec.iov_base,
 3431					       iovec.iov_len, io_kiocb_ppos(kiocb));
 3432		}
 3433
 3434		if (nr < 0) {
 3435			if (!ret)
 3436				ret = nr;
 3437			break;
 3438		}
 3439		if (!iov_iter_is_bvec(iter)) {
 3440			iov_iter_advance(iter, nr);
 3441		} else {
 3442			req->rw.len -= nr;
 3443			req->rw.addr += nr;
 3444		}
 3445		ret += nr;
 3446		if (nr != iovec.iov_len)
 3447			break;
 3448	}
 3449
 3450	return ret;
 3451}
 3452
 3453static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
 3454			  const struct iovec *fast_iov, struct iov_iter *iter)
 3455{
 3456	struct io_async_rw *rw = req->async_data;
 3457
 3458	memcpy(&rw->s.iter, iter, sizeof(*iter));
 3459	rw->free_iovec = iovec;
 3460	rw->bytes_done = 0;
 3461	/* can only be fixed buffers, no need to do anything */
 3462	if (iov_iter_is_bvec(iter))
 3463		return;
 3464	if (!iovec) {
 3465		unsigned iov_off = 0;
 3466
 3467		rw->s.iter.iov = rw->s.fast_iov;
 3468		if (iter->iov != fast_iov) {
 3469			iov_off = iter->iov - fast_iov;
 3470			rw->s.iter.iov += iov_off;
 3471		}
 3472		if (rw->s.fast_iov != fast_iov)
 3473			memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
 3474			       sizeof(struct iovec) * iter->nr_segs);
 3475	} else {
 3476		req->flags |= REQ_F_NEED_CLEANUP;
 3477	}
 3478}
 3479
 3480static inline bool io_alloc_async_data(struct io_kiocb *req)
 3481{
 3482	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
 3483	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
 3484	if (req->async_data) {
 3485		req->flags |= REQ_F_ASYNC_DATA;
 3486		return false;
 3487	}
 3488	return true;
 3489}
 3490
 3491static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 3492			     struct io_rw_state *s, bool force)
 3493{
 3494	if (!force && !io_op_defs[req->opcode].needs_async_setup)
 3495		return 0;
 3496	if (!req_has_async_data(req)) {
 3497		struct io_async_rw *iorw;
 3498
 3499		if (io_alloc_async_data(req)) {
 3500			kfree(iovec);
 3501			return -ENOMEM;
 3502		}
 3503
 3504		io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
 3505		iorw = req->async_data;
 3506		/* we've copied and mapped the iter, ensure state is saved */
 3507		iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
 3508	}
 3509	return 0;
 3510}
 3511
 3512static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
 3513{
 3514	struct io_async_rw *iorw = req->async_data;
 3515	struct iovec *iov;
 3516	int ret;
 3517
 3518	/* submission path, ->uring_lock should already be taken */
 3519	ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
 3520	if (unlikely(ret < 0))
 3521		return ret;
 3522
 3523	iorw->bytes_done = 0;
 3524	iorw->free_iovec = iov;
 3525	if (iov)
 3526		req->flags |= REQ_F_NEED_CLEANUP;
 3527	return 0;
 3528}
 3529
 3530static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 3531{
 3532	if (unlikely(!(req->file->f_mode & FMODE_READ)))
 3533		return -EBADF;
 3534	return io_prep_rw(req, sqe);
 3535}
 3536
 3537/*
 3538 * This is our waitqueue callback handler, registered through __folio_lock_async()
 3539 * when we initially tried to do the IO with the iocb armed our waitqueue.
 3540 * This gets called when the page is unlocked, and we generally expect that to
 3541 * happen when the page IO is completed and the page is now uptodate. This will
 3542 * queue a task_work based retry of the operation, attempting to copy the data
 3543 * again. If the latter fails because the page was NOT uptodate, then we will
 3544 * do a thread based blocking retry of the operation. That's the unexpected
 3545 * slow path.
 3546 */
 3547static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 3548			     int sync, void *arg)
 3549{
 3550	struct wait_page_queue *wpq;
 3551	struct io_kiocb *req = wait->private;
 3552	struct wait_page_key *key = arg;
 3553
 3554	wpq = container_of(wait, struct wait_page_queue, wait);
 3555
 3556	if (!wake_page_match(wpq, key))
 3557		return 0;
 3558
 3559	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
 3560	list_del_init(&wait->entry);
 3561	io_req_task_queue(req);
 3562	return 1;
 3563}
 3564
 3565/*
 3566 * This controls whether a given IO request should be armed for async page
 3567 * based retry. If we return false here, the request is handed to the async
 3568 * worker threads for retry. If we're doing buffered reads on a regular file,
 3569 * we prepare a private wait_page_queue entry and retry the operation. This
 3570 * will either succeed because the page is now uptodate and unlocked, or it
 3571 * will register a callback when the page is unlocked at IO completion. Through
 3572 * that callback, io_uring uses task_work to setup a retry of the operation.
 3573 * That retry will attempt the buffered read again. The retry will generally
 3574 * succeed, or in rare cases where it fails, we then fall back to using the
 3575 * async worker threads for a blocking retry.
 3576 */
 3577static bool io_rw_should_retry(struct io_kiocb *req)
 3578{
 3579	struct io_async_rw *rw = req->async_data;
 3580	struct wait_page_queue *wait = &rw->wpq;
 3581	struct kiocb *kiocb = &req->rw.kiocb;
 3582
 3583	/* never retry for NOWAIT, we just complete with -EAGAIN */
 3584	if (req->flags & REQ_F_NOWAIT)
 3585		return false;
 3586
 3587	/* Only for buffered IO */
 3588	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
 3589		return false;
 3590
 3591	/*
 3592	 * just use poll if we can, and don't attempt if the fs doesn't
 3593	 * support callback based unlocks
 3594	 */
 3595	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
 3596		return false;
 3597
 3598	wait->wait.func = io_async_buf_func;
 3599	wait->wait.private = req;
 3600	wait->wait.flags = 0;
 3601	INIT_LIST_HEAD(&wait->wait.entry);
 3602	kiocb->ki_flags |= IOCB_WAITQ;
 3603	kiocb->ki_flags &= ~IOCB_NOWAIT;
 3604	kiocb->ki_waitq = wait;
 3605	return true;
 3606}
 3607
 3608static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
 3609{
 3610	if (likely(req->file->f_op->read_iter))
 3611		return call_read_iter(req->file, &req->rw.kiocb, iter);
 3612	else if (req->file->f_op->read)
 3613		return loop_rw_iter(READ, req, iter);
 3614	else
 3615		return -EINVAL;
 3616}
 3617
 3618static bool need_read_all(struct io_kiocb *req)
 3619{
 3620	return req->flags & REQ_F_ISREG ||
 3621		S_ISBLK(file_inode(req->file)->i_mode);
 3622}
 3623
 3624static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 3625{
 3626	struct io_rw_state __s, *s = &__s;
 3627	struct iovec *iovec;
 3628	struct kiocb *kiocb = &req->rw.kiocb;
 3629	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 3630	struct io_async_rw *rw;
 3631	ssize_t ret, ret2;
 3632
 3633	if (!req_has_async_data(req)) {
 3634		ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
 3635		if (unlikely(ret < 0))
 3636			return ret;
 3637	} else {
 3638		rw = req->async_data;
 3639		s = &rw->s;
 3640		/*
 3641		 * We come here from an earlier attempt, restore our state to
 3642		 * match in case it doesn't. It's cheap enough that we don't
 3643		 * need to make this conditional.
 3644		 */
 3645		iov_iter_restore(&s->iter, &s->iter_state);
 3646		iovec = NULL;
 3647	}
 3648	req->result = iov_iter_count(&s->iter);
 3649
 3650	if (force_nonblock) {
 3651		/* If the file doesn't support async, just async punt */
 3652		if (unlikely(!io_file_supports_nowait(req))) {
 3653			ret = io_setup_async_rw(req, iovec, s, true);
 3654			return ret ?: -EAGAIN;
 3655		}
 3656		kiocb->ki_flags |= IOCB_NOWAIT;
 3657	} else {
 3658		/* Ensure we clear previously set non-block flag */
 3659		kiocb->ki_flags &= ~IOCB_NOWAIT;
 3660	}
 3661
 3662	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
 3663	if (unlikely(ret)) {
 3664		kfree(iovec);
 3665		return ret;
 3666	}
 3667
 3668	ret = io_iter_do_read(req, &s->iter);
 3669
 3670	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
 3671		req->flags &= ~REQ_F_REISSUE;
 3672		/* IOPOLL retry should happen for io-wq threads */
 3673		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
 3674			goto done;
 3675		/* no retry on NONBLOCK nor RWF_NOWAIT */
 3676		if (req->flags & REQ_F_NOWAIT)
 3677			goto done;
 3678		ret = 0;
 3679	} else if (ret == -EIOCBQUEUED) {
 3680		goto out_free;
 3681	} else if (ret == req->result || ret <= 0 || !force_nonblock ||
 3682		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
 3683		/* read all, failed, already did sync or don't want to retry */
 3684		goto done;
 3685	}
 3686
 3687	/*
 3688	 * Don't depend on the iter state matching what was consumed, or being
 3689	 * untouched in case of error. Restore it and we'll advance it
 3690	 * manually if we need to.
 3691	 */
 3692	iov_iter_restore(&s->iter, &s->iter_state);
 3693
 3694	ret2 = io_setup_async_rw(req, iovec, s, true);
 3695	if (ret2)
 3696		return ret2;
 3697
 3698	iovec = NULL;
 3699	rw = req->async_data;
 3700	s = &rw->s;
 3701	/*
 3702	 * Now use our persistent iterator and state, if we aren't already.
 3703	 * We've restored and mapped the iter to match.
 3704	 */
 3705
 3706	do {
 3707		/*
 3708		 * We end up here because of a partial read, either from
 3709		 * above or inside this loop. Advance the iter by the bytes
 3710		 * that were consumed.
 3711		 */
 3712		iov_iter_advance(&s->iter, ret);
 3713		if (!iov_iter_count(&s->iter))
 3714			break;
 3715		rw->bytes_done += ret;
 3716		iov_iter_save_state(&s->iter, &s->iter_state);
 3717
 3718		/* if we can retry, do so with the callbacks armed */
 3719		if (!io_rw_should_retry(req)) {
 3720			kiocb->ki_flags &= ~IOCB_WAITQ;
 3721			return -EAGAIN;
 3722		}
 3723
 3724		/*
 3725		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
 3726		 * we get -EIOCBQUEUED, then we'll get a notification when the
 3727		 * desired page gets unlocked. We can also get a partial read
 3728		 * here, and if we do, then just retry at the new offset.
 3729		 */
 3730		ret = io_iter_do_read(req, &s->iter);
 3731		if (ret == -EIOCBQUEUED)
 3732			return 0;
 3733		/* we got some bytes, but not all. retry. */
 3734		kiocb->ki_flags &= ~IOCB_WAITQ;
 3735		iov_iter_restore(&s->iter, &s->iter_state);
 3736	} while (ret > 0);
 3737done:
 3738	kiocb_done(req, ret, issue_flags);
 3739out_free:
 3740	/* it's faster to check here then delegate to kfree */
 3741	if (iovec)
 3742		kfree(iovec);
 3743	return 0;
 3744}
 3745
 3746static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 3747{
 3748	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
 3749		return -EBADF;
 3750	req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file));
 3751	return io_prep_rw(req, sqe);
 3752}
 3753
 3754static int io_write(struct io_kiocb *req, unsigned int issue_flags)
 3755{
 3756	struct io_rw_state __s, *s = &__s;
 3757	struct iovec *iovec;
 3758	struct kiocb *kiocb = &req->rw.kiocb;
 3759	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 3760	ssize_t ret, ret2;
 3761
 3762	if (!req_has_async_data(req)) {
 3763		ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
 3764		if (unlikely(ret < 0))
 3765			return ret;
 3766	} else {
 3767		struct io_async_rw *rw = req->async_data;
 3768
 3769		s = &rw->s;
 3770		iov_iter_restore(&s->iter, &s->iter_state);
 3771		iovec = NULL;
 3772	}
 3773	req->result = iov_iter_count(&s->iter);
 3774
 3775	if (force_nonblock) {
 3776		/* If the file doesn't support async, just async punt */
 3777		if (unlikely(!io_file_supports_nowait(req)))
 3778			goto copy_iov;
 3779
 3780		/* file path doesn't support NOWAIT for non-direct_IO */
 3781		if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
 3782		    (req->flags & REQ_F_ISREG))
 3783			goto copy_iov;
 3784
 3785		kiocb->ki_flags |= IOCB_NOWAIT;
 3786	} else {
 3787		/* Ensure we clear previously set non-block flag */
 3788		kiocb->ki_flags &= ~IOCB_NOWAIT;
 3789	}
 3790
 3791	ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
 3792	if (unlikely(ret))
 3793		goto out_free;
 3794
 3795	/*
 3796	 * Open-code file_start_write here to grab freeze protection,
 3797	 * which will be released by another thread in
 3798	 * io_complete_rw().  Fool lockdep by telling it the lock got
 3799	 * released so that it doesn't complain about the held lock when
 3800	 * we return to userspace.
 3801	 */
 3802	if (req->flags & REQ_F_ISREG) {
 3803		sb_start_write(file_inode(req->file)->i_sb);
 3804		__sb_writers_release(file_inode(req->file)->i_sb,
 3805					SB_FREEZE_WRITE);
 3806	}
 3807	kiocb->ki_flags |= IOCB_WRITE;
 3808
 3809	if (likely(req->file->f_op->write_iter))
 3810		ret2 = call_write_iter(req->file, kiocb, &s->iter);
 3811	else if (req->file->f_op->write)
 3812		ret2 = loop_rw_iter(WRITE, req, &s->iter);
 3813	else
 3814		ret2 = -EINVAL;
 3815
 3816	if (req->flags & REQ_F_REISSUE) {
 3817		req->flags &= ~REQ_F_REISSUE;
 3818		ret2 = -EAGAIN;
 3819	}
 3820
 3821	/*
 3822	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
 3823	 * retry them without IOCB_NOWAIT.
 3824	 */
 3825	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
 3826		ret2 = -EAGAIN;
 3827	/* no retry on NONBLOCK nor RWF_NOWAIT */
 3828	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
 3829		goto done;
 3830	if (!force_nonblock || ret2 != -EAGAIN) {
 3831		/* IOPOLL retry should happen for io-wq threads */
 3832		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
 3833			goto copy_iov;
 3834done:
 3835		kiocb_done(req, ret2, issue_flags);
 3836	} else {
 3837copy_iov:
 3838		iov_iter_restore(&s->iter, &s->iter_state);
 3839		ret = io_setup_async_rw(req, iovec, s, false);
 3840		return ret ?: -EAGAIN;
 3841	}
 3842out_free:
 3843	/* it's reportedly faster than delegating the null check to kfree() */
 3844	if (iovec)
 3845		kfree(iovec);
 3846	return ret;
 3847}
 3848
 3849static int io_renameat_prep(struct io_kiocb *req,
 3850			    const struct io_uring_sqe *sqe)
 3851{
 3852	struct io_rename *ren = &req->rename;
 3853	const char __user *oldf, *newf;
 3854
 3855	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3856		return -EINVAL;
 3857	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
 3858		return -EINVAL;
 3859	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 3860		return -EBADF;
 3861
 3862	ren->old_dfd = READ_ONCE(sqe->fd);
 3863	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 3864	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 3865	ren->new_dfd = READ_ONCE(sqe->len);
 3866	ren->flags = READ_ONCE(sqe->rename_flags);
 3867
 3868	ren->oldpath = getname(oldf);
 3869	if (IS_ERR(ren->oldpath))
 3870		return PTR_ERR(ren->oldpath);
 3871
 3872	ren->newpath = getname(newf);
 3873	if (IS_ERR(ren->newpath)) {
 3874		putname(ren->oldpath);
 3875		return PTR_ERR(ren->newpath);
 3876	}
 3877
 3878	req->flags |= REQ_F_NEED_CLEANUP;
 3879	return 0;
 3880}
 3881
 3882static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
 3883{
 3884	struct io_rename *ren = &req->rename;
 3885	int ret;
 3886
 3887	if (issue_flags & IO_URING_F_NONBLOCK)
 3888		return -EAGAIN;
 3889
 3890	ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
 3891				ren->newpath, ren->flags);
 3892
 3893	req->flags &= ~REQ_F_NEED_CLEANUP;
 3894	if (ret < 0)
 3895		req_set_fail(req);
 3896	io_req_complete(req, ret);
 3897	return 0;
 3898}
 3899
 3900static int io_unlinkat_prep(struct io_kiocb *req,
 3901			    const struct io_uring_sqe *sqe)
 3902{
 3903	struct io_unlink *un = &req->unlink;
 3904	const char __user *fname;
 3905
 3906	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3907		return -EINVAL;
 3908	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
 3909	    sqe->splice_fd_in)
 3910		return -EINVAL;
 3911	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 3912		return -EBADF;
 3913
 3914	un->dfd = READ_ONCE(sqe->fd);
 3915
 3916	un->flags = READ_ONCE(sqe->unlink_flags);
 3917	if (un->flags & ~AT_REMOVEDIR)
 3918		return -EINVAL;
 3919
 3920	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
 3921	un->filename = getname(fname);
 3922	if (IS_ERR(un->filename))
 3923		return PTR_ERR(un->filename);
 3924
 3925	req->flags |= REQ_F_NEED_CLEANUP;
 3926	return 0;
 3927}
 3928
 3929static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
 3930{
 3931	struct io_unlink *un = &req->unlink;
 3932	int ret;
 3933
 3934	if (issue_flags & IO_URING_F_NONBLOCK)
 3935		return -EAGAIN;
 3936
 3937	if (un->flags & AT_REMOVEDIR)
 3938		ret = do_rmdir(un->dfd, un->filename);
 3939	else
 3940		ret = do_unlinkat(un->dfd, un->filename);
 3941
 3942	req->flags &= ~REQ_F_NEED_CLEANUP;
 3943	if (ret < 0)
 3944		req_set_fail(req);
 3945	io_req_complete(req, ret);
 3946	return 0;
 3947}
 3948
 3949static int io_mkdirat_prep(struct io_kiocb *req,
 3950			    const struct io_uring_sqe *sqe)
 3951{
 3952	struct io_mkdir *mkd = &req->mkdir;
 3953	const char __user *fname;
 3954
 3955	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3956		return -EINVAL;
 3957	if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
 3958	    sqe->splice_fd_in)
 3959		return -EINVAL;
 3960	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 3961		return -EBADF;
 3962
 3963	mkd->dfd = READ_ONCE(sqe->fd);
 3964	mkd->mode = READ_ONCE(sqe->len);
 3965
 3966	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
 3967	mkd->filename = getname(fname);
 3968	if (IS_ERR(mkd->filename))
 3969		return PTR_ERR(mkd->filename);
 3970
 3971	req->flags |= REQ_F_NEED_CLEANUP;
 3972	return 0;
 3973}
 3974
 3975static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
 3976{
 3977	struct io_mkdir *mkd = &req->mkdir;
 3978	int ret;
 3979
 3980	if (issue_flags & IO_URING_F_NONBLOCK)
 3981		return -EAGAIN;
 3982
 3983	ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
 3984
 3985	req->flags &= ~REQ_F_NEED_CLEANUP;
 3986	if (ret < 0)
 3987		req_set_fail(req);
 3988	io_req_complete(req, ret);
 3989	return 0;
 3990}
 3991
 3992static int io_symlinkat_prep(struct io_kiocb *req,
 3993			    const struct io_uring_sqe *sqe)
 3994{
 3995	struct io_symlink *sl = &req->symlink;
 3996	const char __user *oldpath, *newpath;
 3997
 3998	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 3999		return -EINVAL;
 4000	if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
 4001	    sqe->splice_fd_in)
 4002		return -EINVAL;
 4003	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 4004		return -EBADF;
 4005
 4006	sl->new_dfd = READ_ONCE(sqe->fd);
 4007	oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4008	newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 4009
 4010	sl->oldpath = getname(oldpath);
 4011	if (IS_ERR(sl->oldpath))
 4012		return PTR_ERR(sl->oldpath);
 4013
 4014	sl->newpath = getname(newpath);
 4015	if (IS_ERR(sl->newpath)) {
 4016		putname(sl->oldpath);
 4017		return PTR_ERR(sl->newpath);
 4018	}
 4019
 4020	req->flags |= REQ_F_NEED_CLEANUP;
 4021	return 0;
 4022}
 4023
 4024static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
 4025{
 4026	struct io_symlink *sl = &req->symlink;
 4027	int ret;
 4028
 4029	if (issue_flags & IO_URING_F_NONBLOCK)
 4030		return -EAGAIN;
 4031
 4032	ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
 4033
 4034	req->flags &= ~REQ_F_NEED_CLEANUP;
 4035	if (ret < 0)
 4036		req_set_fail(req);
 4037	io_req_complete(req, ret);
 4038	return 0;
 4039}
 4040
 4041static int io_linkat_prep(struct io_kiocb *req,
 4042			    const struct io_uring_sqe *sqe)
 4043{
 4044	struct io_hardlink *lnk = &req->hardlink;
 4045	const char __user *oldf, *newf;
 4046
 4047	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4048		return -EINVAL;
 4049	if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
 4050		return -EINVAL;
 4051	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 4052		return -EBADF;
 4053
 4054	lnk->old_dfd = READ_ONCE(sqe->fd);
 4055	lnk->new_dfd = READ_ONCE(sqe->len);
 4056	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4057	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 4058	lnk->flags = READ_ONCE(sqe->hardlink_flags);
 4059
 4060	lnk->oldpath = getname(oldf);
 4061	if (IS_ERR(lnk->oldpath))
 4062		return PTR_ERR(lnk->oldpath);
 4063
 4064	lnk->newpath = getname(newf);
 4065	if (IS_ERR(lnk->newpath)) {
 4066		putname(lnk->oldpath);
 4067		return PTR_ERR(lnk->newpath);
 4068	}
 4069
 4070	req->flags |= REQ_F_NEED_CLEANUP;
 4071	return 0;
 4072}
 4073
 4074static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
 4075{
 4076	struct io_hardlink *lnk = &req->hardlink;
 4077	int ret;
 4078
 4079	if (issue_flags & IO_URING_F_NONBLOCK)
 4080		return -EAGAIN;
 4081
 4082	ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
 4083				lnk->newpath, lnk->flags);
 4084
 4085	req->flags &= ~REQ_F_NEED_CLEANUP;
 4086	if (ret < 0)
 4087		req_set_fail(req);
 4088	io_req_complete(req, ret);
 4089	return 0;
 4090}
 4091
 4092static int io_shutdown_prep(struct io_kiocb *req,
 4093			    const struct io_uring_sqe *sqe)
 4094{
 4095#if defined(CONFIG_NET)
 4096	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4097		return -EINVAL;
 4098	if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
 4099		     sqe->buf_index || sqe->splice_fd_in))
 4100		return -EINVAL;
 4101
 4102	req->shutdown.how = READ_ONCE(sqe->len);
 4103	return 0;
 4104#else
 4105	return -EOPNOTSUPP;
 4106#endif
 4107}
 4108
 4109static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
 4110{
 4111#if defined(CONFIG_NET)
 4112	struct socket *sock;
 4113	int ret;
 4114
 4115	if (issue_flags & IO_URING_F_NONBLOCK)
 4116		return -EAGAIN;
 4117
 4118	sock = sock_from_file(req->file);
 4119	if (unlikely(!sock))
 4120		return -ENOTSOCK;
 4121
 4122	ret = __sys_shutdown_sock(sock, req->shutdown.how);
 4123	if (ret < 0)
 4124		req_set_fail(req);
 4125	io_req_complete(req, ret);
 4126	return 0;
 4127#else
 4128	return -EOPNOTSUPP;
 4129#endif
 4130}
 4131
 4132static int __io_splice_prep(struct io_kiocb *req,
 4133			    const struct io_uring_sqe *sqe)
 4134{
 4135	struct io_splice *sp = &req->splice;
 4136	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
 4137
 4138	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4139		return -EINVAL;
 4140
 4141	sp->file_in = NULL;
 4142	sp->len = READ_ONCE(sqe->len);
 4143	sp->flags = READ_ONCE(sqe->splice_flags);
 4144
 4145	if (unlikely(sp->flags & ~valid_flags))
 4146		return -EINVAL;
 4147
 4148	sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
 4149				  (sp->flags & SPLICE_F_FD_IN_FIXED));
 4150	if (!sp->file_in)
 4151		return -EBADF;
 4152	req->flags |= REQ_F_NEED_CLEANUP;
 4153	return 0;
 4154}
 4155
 4156static int io_tee_prep(struct io_kiocb *req,
 4157		       const struct io_uring_sqe *sqe)
 4158{
 4159	if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
 4160		return -EINVAL;
 4161	return __io_splice_prep(req, sqe);
 4162}
 4163
 4164static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
 4165{
 4166	struct io_splice *sp = &req->splice;
 4167	struct file *in = sp->file_in;
 4168	struct file *out = sp->file_out;
 4169	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 4170	long ret = 0;
 4171
 4172	if (issue_flags & IO_URING_F_NONBLOCK)
 4173		return -EAGAIN;
 4174	if (sp->len)
 4175		ret = do_tee(in, out, sp->len, flags);
 4176
 4177	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
 4178		io_put_file(in);
 4179	req->flags &= ~REQ_F_NEED_CLEANUP;
 4180
 4181	if (ret != sp->len)
 4182		req_set_fail(req);
 4183	io_req_complete(req, ret);
 4184	return 0;
 4185}
 4186
 4187static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4188{
 4189	struct io_splice *sp = &req->splice;
 4190
 4191	sp->off_in = READ_ONCE(sqe->splice_off_in);
 4192	sp->off_out = READ_ONCE(sqe->off);
 4193	return __io_splice_prep(req, sqe);
 4194}
 4195
 4196static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
 4197{
 4198	struct io_splice *sp = &req->splice;
 4199	struct file *in = sp->file_in;
 4200	struct file *out = sp->file_out;
 4201	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 4202	loff_t *poff_in, *poff_out;
 4203	long ret = 0;
 4204
 4205	if (issue_flags & IO_URING_F_NONBLOCK)
 4206		return -EAGAIN;
 4207
 4208	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
 4209	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
 4210
 4211	if (sp->len)
 4212		ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
 4213
 4214	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
 4215		io_put_file(in);
 4216	req->flags &= ~REQ_F_NEED_CLEANUP;
 4217
 4218	if (ret != sp->len)
 4219		req_set_fail(req);
 4220	io_req_complete(req, ret);
 4221	return 0;
 4222}
 4223
 4224/*
 4225 * IORING_OP_NOP just posts a completion event, nothing else.
 4226 */
 4227static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 4228{
 4229	struct io_ring_ctx *ctx = req->ctx;
 4230
 4231	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 4232		return -EINVAL;
 4233
 4234	__io_req_complete(req, issue_flags, 0, 0);
 4235	return 0;
 4236}
 4237
 4238static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4239{
 4240	struct io_ring_ctx *ctx = req->ctx;
 4241
 4242	if (!req->file)
 4243		return -EBADF;
 4244
 4245	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 4246		return -EINVAL;
 4247	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
 4248		     sqe->splice_fd_in))
 4249		return -EINVAL;
 4250
 4251	req->sync.flags = READ_ONCE(sqe->fsync_flags);
 4252	if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
 4253		return -EINVAL;
 4254
 4255	req->sync.off = READ_ONCE(sqe->off);
 4256	req->sync.len = READ_ONCE(sqe->len);
 4257	return 0;
 4258}
 4259
 4260static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
 4261{
 4262	loff_t end = req->sync.off + req->sync.len;
 4263	int ret;
 4264
 4265	/* fsync always requires a blocking context */
 4266	if (issue_flags & IO_URING_F_NONBLOCK)
 4267		return -EAGAIN;
 4268
 4269	ret = vfs_fsync_range(req->file, req->sync.off,
 4270				end > 0 ? end : LLONG_MAX,
 4271				req->sync.flags & IORING_FSYNC_DATASYNC);
 4272	if (ret < 0)
 4273		req_set_fail(req);
 4274	io_req_complete(req, ret);
 4275	return 0;
 4276}
 4277
 4278static int io_fallocate_prep(struct io_kiocb *req,
 4279			     const struct io_uring_sqe *sqe)
 4280{
 4281	if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
 4282	    sqe->splice_fd_in)
 4283		return -EINVAL;
 4284	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4285		return -EINVAL;
 4286
 4287	req->sync.off = READ_ONCE(sqe->off);
 4288	req->sync.len = READ_ONCE(sqe->addr);
 4289	req->sync.mode = READ_ONCE(sqe->len);
 4290	return 0;
 4291}
 4292
 4293static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
 4294{
 4295	int ret;
 4296
 4297	/* fallocate always requiring blocking context */
 4298	if (issue_flags & IO_URING_F_NONBLOCK)
 4299		return -EAGAIN;
 4300	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
 4301				req->sync.len);
 4302	if (ret < 0)
 4303		req_set_fail(req);
 4304	io_req_complete(req, ret);
 4305	return 0;
 4306}
 4307
 4308static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4309{
 4310	const char __user *fname;
 4311	int ret;
 4312
 4313	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4314		return -EINVAL;
 4315	if (unlikely(sqe->ioprio || sqe->buf_index))
 4316		return -EINVAL;
 4317	if (unlikely(req->flags & REQ_F_FIXED_FILE))
 4318		return -EBADF;
 4319
 4320	/* open.how should be already initialised */
 4321	if (!(req->open.how.flags & O_PATH) && force_o_largefile())
 4322		req->open.how.flags |= O_LARGEFILE;
 4323
 4324	req->open.dfd = READ_ONCE(sqe->fd);
 4325	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4326	req->open.filename = getname(fname);
 4327	if (IS_ERR(req->open.filename)) {
 4328		ret = PTR_ERR(req->open.filename);
 4329		req->open.filename = NULL;
 4330		return ret;
 4331	}
 4332
 4333	req->open.file_slot = READ_ONCE(sqe->file_index);
 4334	if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
 4335		return -EINVAL;
 4336
 4337	req->open.nofile = rlimit(RLIMIT_NOFILE);
 4338	req->flags |= REQ_F_NEED_CLEANUP;
 4339	return 0;
 4340}
 4341
 4342static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4343{
 4344	u64 mode = READ_ONCE(sqe->len);
 4345	u64 flags = READ_ONCE(sqe->open_flags);
 4346
 4347	req->open.how = build_open_how(flags, mode);
 4348	return __io_openat_prep(req, sqe);
 4349}
 4350
 4351static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4352{
 4353	struct open_how __user *how;
 4354	size_t len;
 4355	int ret;
 4356
 4357	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 4358	len = READ_ONCE(sqe->len);
 4359	if (len < OPEN_HOW_SIZE_VER0)
 4360		return -EINVAL;
 4361
 4362	ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
 4363					len);
 4364	if (ret)
 4365		return ret;
 4366
 4367	return __io_openat_prep(req, sqe);
 4368}
 4369
/*
 * Issue handler for openat2 (io_openat() forwards here as well).
 *
 * Opens req->open.filename relative to req->open.dfd according to
 * req->open.how. The resulting file goes either into a newly reserved
 * regular fd (req->open.file_slot == 0) or into fixed-file slot
 * file_slot - 1.
 *
 * Returns -EAGAIN without completing the request when a nonblocking issue
 * cannot make progress (see below); in every other case the request is
 * completed with the fd / io_install_fixed_file() result or a negative
 * error, the filename is released, REQ_F_NEED_CLEANUP is cleared, and 0 is
 * returned.
 */
static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
	struct open_flags op;
	struct file *file;
	bool resolve_nonblock, nonblock_set;
	bool fixed = !!req->open.file_slot;
	int ret;

	ret = build_open_flags(&req->open.how, &op);
	if (ret)
		goto err;
	/* remember what the caller asked for before the flags are adjusted */
	nonblock_set = op.open_flag & O_NONBLOCK;
	resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
	if (issue_flags & IO_URING_F_NONBLOCK) {
		/*
		 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
		 * it'll always -EAGAIN
		 */
		if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
			return -EAGAIN;
		op.lookup_flags |= LOOKUP_CACHED;
		op.open_flag |= O_NONBLOCK;
	}

	/* a regular fd is only reserved when no fixed slot was requested */
	if (!fixed) {
		ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
		if (ret < 0)
			goto err;
	}

	file = do_filp_open(req->open.dfd, req->open.filename, &op);
	if (IS_ERR(file)) {
		/*
		 * We could hang on to this 'fd' on retrying, but seems like
		 * marginal gain for something that is now known to be a slower
		 * path. So just put it, and we'll get a new one when we retry.
		 */
		if (!fixed)
			put_unused_fd(ret);

		ret = PTR_ERR(file);
		/* only retry if RESOLVE_CACHED wasn't already set by application */
		if (ret == -EAGAIN &&
		    (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
			return -EAGAIN;
		goto err;
	}

	/* undo the O_NONBLOCK added above unless the caller set it itself */
	if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
		file->f_flags &= ~O_NONBLOCK;
	fsnotify_open(file);

	if (!fixed)
		fd_install(ret, file);
	else
		ret = io_install_fixed_file(req, file, issue_flags,
					    req->open.file_slot - 1);
err:
	/* reached on success as well: ret is the completion result either way */
	putname(req->open.filename);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < 0)
		req_set_fail(req);
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}
 4435
 4436static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
 4437{
 4438	return io_openat2(req, issue_flags);
 4439}
 4440
 4441static int io_remove_buffers_prep(struct io_kiocb *req,
 4442				  const struct io_uring_sqe *sqe)
 4443{
 4444	struct io_provide_buf *p = &req->pbuf;
 4445	u64 tmp;
 4446
 4447	if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
 4448	    sqe->splice_fd_in)
 4449		return -EINVAL;
 4450
 4451	tmp = READ_ONCE(sqe->fd);
 4452	if (!tmp || tmp > USHRT_MAX)
 4453		return -EINVAL;
 4454
 4455	memset(p, 0, sizeof(*p));
 4456	p->nbufs = tmp;
 4457	p->bgid = READ_ONCE(sqe->buf_group);
 4458	return 0;
 4459}
 4460
 4461static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
 4462			       int bgid, unsigned nbufs)
 4463{
 4464	unsigned i = 0;
 4465
 4466	/* shouldn't happen */
 4467	if (!nbufs)
 4468		return 0;
 4469
 4470	/* the head kbuf is the list itself */
 4471	while (!list_empty(&buf->list)) {
 4472		struct io_buffer *nxt;
 4473
 4474		nxt = list_first_entry(&buf->list, struct io_buffer, list);
 4475		list_del(&nxt->list);
 4476		kfree(nxt);
 4477		if (++i == nbufs)
 4478			return i;
 4479		cond_resched();
 4480	}
 4481	i++;
 4482	kfree(buf);
 4483	xa_erase(&ctx->io_buffers, bgid);
 4484
 4485	return i;
 4486}
 4487
 4488static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 4489{
 4490	struct io_provide_buf *p = &req->pbuf;
 4491	struct io_ring_ctx *ctx = req->ctx;
 4492	struct io_buffer *head;
 4493	int ret = 0;
 4494	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 4495
 4496	io_ring_submit_lock(ctx, needs_lock);
 4497
 4498	lockdep_assert_held(&ctx->uring_lock);
 4499
 4500	ret = -ENOENT;
 4501	head = xa_load(&ctx->io_buffers, p->bgid);
 4502	if (head)
 4503		ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
 4504	if (ret < 0)
 4505		req_set_fail(req);
 4506
 4507	/* complete before unlock, IOPOLL may need the lock */
 4508	__io_req_complete(req, issue_flags, ret, 0);
 4509	io_ring_submit_unlock(ctx, needs_lock);
 4510	return 0;
 4511}
 4512
 4513static int io_provide_buffers_prep(struct io_kiocb *req,
 4514				   const struct io_uring_sqe *sqe)
 4515{
 4516	unsigned long size, tmp_check;
 4517	struct io_provide_buf *p = &req->pbuf;
 4518	u64 tmp;
 4519
 4520	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
 4521		return -EINVAL;
 4522
 4523	tmp = READ_ONCE(sqe->fd);
 4524	if (!tmp || tmp > USHRT_MAX)
 4525		return -E2BIG;
 4526	p->nbufs = tmp;
 4527	p->addr = READ_ONCE(sqe->addr);
 4528	p->len = READ_ONCE(sqe->len);
 4529
 4530	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
 4531				&size))
 4532		return -EOVERFLOW;
 4533	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
 4534		return -EOVERFLOW;
 4535
 4536	size = (unsigned long)p->len * p->nbufs;
 4537	if (!access_ok(u64_to_user_ptr(p->addr), size))
 4538		return -EFAULT;
 4539
 4540	p->bgid = READ_ONCE(sqe->buf_group);
 4541	tmp = READ_ONCE(sqe->off);
 4542	if (tmp > USHRT_MAX)
 4543		return -E2BIG;
 4544	p->bid = tmp;
 4545	return 0;
 4546}
 4547
 4548static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
 4549{
 4550	struct io_buffer *buf;
 4551	u64 addr = pbuf->addr;
 4552	int i, bid = pbuf->bid;
 4553
 4554	for (i = 0; i < pbuf->nbufs; i++) {
 4555		buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
 4556		if (!buf)
 4557			break;
 4558
 4559		buf->addr = addr;
 4560		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
 4561		buf->bid = bid;
 4562		addr += pbuf->len;
 4563		bid++;
 4564		if (!*head) {
 4565			INIT_LIST_HEAD(&buf->list);
 4566			*head = buf;
 4567		} else {
 4568			list_add_tail(&buf->list, &(*head)->list);
 4569		}
 4570		cond_resched();
 4571	}
 4572
 4573	return i ? i : -ENOMEM;
 4574}
 4575
 4576static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 4577{
 4578	struct io_provide_buf *p = &req->pbuf;
 4579	struct io_ring_ctx *ctx = req->ctx;
 4580	struct io_buffer *head, *list;
 4581	int ret = 0;
 4582	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 4583
 4584	io_ring_submit_lock(ctx, needs_lock);
 4585
 4586	lockdep_assert_held(&ctx->uring_lock);
 4587
 4588	list = head = xa_load(&ctx->io_buffers, p->bgid);
 4589
 4590	ret = io_add_buffers(p, &head);
 4591	if (ret >= 0 && !list) {
 4592		ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
 4593		if (ret < 0)
 4594			__io_remove_buffers(ctx, head, p->bgid, -1U);
 4595	}
 4596	if (ret < 0)
 4597		req_set_fail(req);
 4598	/* complete before unlock, IOPOLL may need the lock */
 4599	__io_req_complete(req, issue_flags, ret, 0);
 4600	io_ring_submit_unlock(ctx, needs_lock);
 4601	return 0;
 4602}
 4603
 4604static int io_epoll_ctl_prep(struct io_kiocb *req,
 4605			     const struct io_uring_sqe *sqe)
 4606{
 4607#if defined(CONFIG_EPOLL)
 4608	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
 4609		return -EINVAL;
 4610	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4611		return -EINVAL;
 4612
 4613	req->epoll.epfd = READ_ONCE(sqe->fd);
 4614	req->epoll.op = READ_ONCE(sqe->len);
 4615	req->epoll.fd = READ_ONCE(sqe->off);
 4616
 4617	if (ep_op_has_event(req->epoll.op)) {
 4618		struct epoll_event __user *ev;
 4619
 4620		ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4621		if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
 4622			return -EFAULT;
 4623	}
 4624
 4625	return 0;
 4626#else
 4627	return -EOPNOTSUPP;
 4628#endif
 4629}
 4630
 4631static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
 4632{
 4633#if defined(CONFIG_EPOLL)
 4634	struct io_epoll *ie = &req->epoll;
 4635	int ret;
 4636	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 4637
 4638	ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
 4639	if (force_nonblock && ret == -EAGAIN)
 4640		return -EAGAIN;
 4641
 4642	if (ret < 0)
 4643		req_set_fail(req);
 4644	__io_req_complete(req, issue_flags, ret, 0);
 4645	return 0;
 4646#else
 4647	return -EOPNOTSUPP;
 4648#endif
 4649}
 4650
 4651static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4652{
 4653#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
 4654	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
 4655		return -EINVAL;
 4656	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4657		return -EINVAL;
 4658
 4659	req->madvise.addr = READ_ONCE(sqe->addr);
 4660	req->madvise.len = READ_ONCE(sqe->len);
 4661	req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
 4662	return 0;
 4663#else
 4664	return -EOPNOTSUPP;
 4665#endif
 4666}
 4667
 4668static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
 4669{
 4670#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
 4671	struct io_madvise *ma = &req->madvise;
 4672	int ret;
 4673
 4674	if (issue_flags & IO_URING_F_NONBLOCK)
 4675		return -EAGAIN;
 4676
 4677	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
 4678	if (ret < 0)
 4679		req_set_fail(req);
 4680	io_req_complete(req, ret);
 4681	return 0;
 4682#else
 4683	return -EOPNOTSUPP;
 4684#endif
 4685}
 4686
 4687static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4688{
 4689	if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
 4690		return -EINVAL;
 4691	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4692		return -EINVAL;
 4693
 4694	req->fadvise.offset = READ_ONCE(sqe->off);
 4695	req->fadvise.len = READ_ONCE(sqe->len);
 4696	req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
 4697	return 0;
 4698}
 4699
 4700static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
 4701{
 4702	struct io_fadvise *fa = &req->fadvise;
 4703	int ret;
 4704
 4705	if (issue_flags & IO_URING_F_NONBLOCK) {
 4706		switch (fa->advice) {
 4707		case POSIX_FADV_NORMAL:
 4708		case POSIX_FADV_RANDOM:
 4709		case POSIX_FADV_SEQUENTIAL:
 4710			break;
 4711		default:
 4712			return -EAGAIN;
 4713		}
 4714	}
 4715
 4716	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
 4717	if (ret < 0)
 4718		req_set_fail(req);
 4719	__io_req_complete(req, issue_flags, ret, 0);
 4720	return 0;
 4721}
 4722
 4723static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4724{
 4725	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4726		return -EINVAL;
 4727	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
 4728		return -EINVAL;
 4729	if (req->flags & REQ_F_FIXED_FILE)
 4730		return -EBADF;
 4731
 4732	req->statx.dfd = READ_ONCE(sqe->fd);
 4733	req->statx.mask = READ_ONCE(sqe->len);
 4734	req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4735	req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 4736	req->statx.flags = READ_ONCE(sqe->statx_flags);
 4737
 4738	return 0;
 4739}
 4740
 4741static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
 4742{
 4743	struct io_statx *ctx = &req->statx;
 4744	int ret;
 4745
 4746	if (issue_flags & IO_URING_F_NONBLOCK)
 4747		return -EAGAIN;
 4748
 4749	ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
 4750		       ctx->buffer);
 4751
 4752	if (ret < 0)
 4753		req_set_fail(req);
 4754	io_req_complete(req, ret);
 4755	return 0;
 4756}
 4757
 4758static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4759{
 4760	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4761		return -EINVAL;
 4762	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
 4763	    sqe->rw_flags || sqe->buf_index)
 4764		return -EINVAL;
 4765	if (req->flags & REQ_F_FIXED_FILE)
 4766		return -EBADF;
 4767
 4768	req->close.fd = READ_ONCE(sqe->fd);
 4769	req->close.file_slot = READ_ONCE(sqe->file_index);
 4770	if (req->close.file_slot && req->close.fd)
 4771		return -EINVAL;
 4772
 4773	return 0;
 4774}
 4775
/*
 * Issue handler for close.
 *
 * With a non-zero req->close.file_slot the work is delegated to
 * io_close_fixed(). Otherwise the regular fd req->close.fd is looked up in
 * current->files under files->file_lock and closed.
 *
 * Completes the request with -EBADF when the fd is out of range, has no
 * file installed, or refers to an io_uring file. Returns -EAGAIN without
 * completing when the file has a ->flush() method and this is a nonblocking
 * issue. Otherwise the request is completed with the result of the close
 * and 0 is returned.
 */
static int io_close(struct io_kiocb *req, unsigned int issue_flags)
{
	struct files_struct *files = current->files;
	struct io_close *close = &req->close;
	struct fdtable *fdt;
	struct file *file = NULL;
	int ret = -EBADF;

	if (req->close.file_slot) {
		ret = io_close_fixed(req, issue_flags);
		goto err;
	}

	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	if (close->fd >= fdt->max_fds) {
		spin_unlock(&files->file_lock);
		goto err;
	}
	file = fdt->fd[close->fd];
	if (!file || file->f_op == &io_uring_fops) {
		spin_unlock(&files->file_lock);
		/* reset so the fput() at err: is skipped for this file */
		file = NULL;
		goto err;
	}

	/* if the file has a flush method, be safe and punt to async */
	if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
		spin_unlock(&files->file_lock);
		return -EAGAIN;
	}

	/* called with file_lock still held; file is rewritten through &file */
	ret = __close_fd_get_file(close->fd, &file);
	spin_unlock(&files->file_lock);
	if (ret < 0) {
		/* -ENOENT is reported to the application as -EBADF */
		if (ret == -ENOENT)
			ret = -EBADF;
		goto err;
	}

	/* No ->flush() or already async, safely close from here */
	ret = filp_close(file, current->files);
err:
	if (ret < 0)
		req_set_fail(req);
	if (file)
		fput(file);
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}
 4826
 4827static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4828{
 4829	struct io_ring_ctx *ctx = req->ctx;
 4830
 4831	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 4832		return -EINVAL;
 4833	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
 4834		     sqe->splice_fd_in))
 4835		return -EINVAL;
 4836
 4837	req->sync.off = READ_ONCE(sqe->off);
 4838	req->sync.len = READ_ONCE(sqe->len);
 4839	req->sync.flags = READ_ONCE(sqe->sync_range_flags);
 4840	return 0;
 4841}
 4842
 4843static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
 4844{
 4845	int ret;
 4846
 4847	/* sync_file_range always requires a blocking context */
 4848	if (issue_flags & IO_URING_F_NONBLOCK)
 4849		return -EAGAIN;
 4850
 4851	ret = sync_file_range(req->file, req->sync.off, req->sync.len,
 4852				req->sync.flags);
 4853	if (ret < 0)
 4854		req_set_fail(req);
 4855	io_req_complete(req, ret);
 4856	return 0;
 4857}
 4858
 4859#if defined(CONFIG_NET)
/*
 * Preserve the message header state in @kmsg so a send/recv that returned
 * -EAGAIN can be retried later.
 *
 * If req->async_data is already set nothing is copied. Otherwise async data
 * is allocated for the request and *@kmsg is copied into it; pointers that
 * referred into @kmsg itself are then redirected to the copy.
 *
 * Returns -EAGAIN on success (the value the sendmsg/recvmsg issue handlers
 * return directly), or -ENOMEM when the allocation fails, in which case
 * kmsg->free_iov is freed here.
 */
static int io_setup_async_msg(struct io_kiocb *req,
			      struct io_async_msghdr *kmsg)
{
	struct io_async_msghdr *async_msg = req->async_data;

	/* state is already held in the request's async data */
	if (async_msg)
		return -EAGAIN;
	if (io_alloc_async_data(req)) {
		kfree(kmsg->free_iov);
		return -ENOMEM;
	}
	async_msg = req->async_data;
	req->flags |= REQ_F_NEED_CLEANUP;
	memcpy(async_msg, kmsg, sizeof(*kmsg));
	/* msg_name must point at the copy's own address storage */
	async_msg->msg.msg_name = &async_msg->addr;
	/* if we're using fast_iov, set it to the new one */
	if (!async_msg->free_iov)
		async_msg->msg.msg_iter.iov = async_msg->fast_iov;

	return -EAGAIN;
}
 4881
 4882static int io_sendmsg_copy_hdr(struct io_kiocb *req,
 4883			       struct io_async_msghdr *iomsg)
 4884{
 4885	iomsg->msg.msg_name = &iomsg->addr;
 4886	iomsg->free_iov = iomsg->fast_iov;
 4887	return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
 4888				   req->sr_msg.msg_flags, &iomsg->free_iov);
 4889}
 4890
 4891static int io_sendmsg_prep_async(struct io_kiocb *req)
 4892{
 4893	int ret;
 4894
 4895	ret = io_sendmsg_copy_hdr(req, req->async_data);
 4896	if (!ret)
 4897		req->flags |= REQ_F_NEED_CLEANUP;
 4898	return ret;
 4899}
 4900
 4901static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 4902{
 4903	struct io_sr_msg *sr = &req->sr_msg;
 4904
 4905	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 4906		return -EINVAL;
 4907
 4908	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 4909	sr->len = READ_ONCE(sqe->len);
 4910	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
 4911	if (sr->msg_flags & MSG_DONTWAIT)
 4912		req->flags |= REQ_F_NOWAIT;
 4913
 4914#ifdef CONFIG_COMPAT
 4915	if (req->ctx->compat)
 4916		sr->msg_flags |= MSG_CMSG_COMPAT;
 4917#endif
 4918	return 0;
 4919}
 4920
 4921static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 4922{
 4923	struct io_async_msghdr iomsg, *kmsg;
 4924	struct socket *sock;
 4925	unsigned flags;
 4926	int min_ret = 0;
 4927	int ret;
 4928
 4929	sock = sock_from_file(req->file);
 4930	if (unlikely(!sock))
 4931		return -ENOTSOCK;
 4932
 4933	if (req_has_async_data(req)) {
 4934		kmsg = req->async_data;
 4935	} else {
 4936		ret = io_sendmsg_copy_hdr(req, &iomsg);
 4937		if (ret)
 4938			return ret;
 4939		kmsg = &iomsg;
 4940	}
 4941
 4942	flags = req->sr_msg.msg_flags;
 4943	if (issue_flags & IO_URING_F_NONBLOCK)
 4944		flags |= MSG_DONTWAIT;
 4945	if (flags & MSG_WAITALL)
 4946		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
 4947
 4948	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
 4949
 4950	if (ret < min_ret) {
 4951		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
 4952			return io_setup_async_msg(req, kmsg);
 4953		if (ret == -ERESTARTSYS)
 4954			ret = -EINTR;
 4955		req_set_fail(req);
 4956	}
 4957	/* fast path, check for non-NULL to avoid function call */
 4958	if (kmsg->free_iov)
 4959		kfree(kmsg->free_iov);
 4960	req->flags &= ~REQ_F_NEED_CLEANUP;
 4961	__io_req_complete(req, issue_flags, ret, 0);
 4962	return 0;
 4963}
 4964
 4965static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 4966{
 4967	struct io_sr_msg *sr = &req->sr_msg;
 4968	struct msghdr msg;
 4969	struct iovec iov;
 4970	struct socket *sock;
 4971	unsigned flags;
 4972	int min_ret = 0;
 4973	int ret;
 4974
 4975	sock = sock_from_file(req->file);
 4976	if (unlikely(!sock))
 4977		return -ENOTSOCK;
 4978
 4979	ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
 4980	if (unlikely(ret))
 4981		return ret;
 4982
 4983	msg.msg_name = NULL;
 4984	msg.msg_control = NULL;
 4985	msg.msg_controllen = 0;
 4986	msg.msg_namelen = 0;
 4987
 4988	flags = req->sr_msg.msg_flags;
 4989	if (issue_flags & IO_URING_F_NONBLOCK)
 4990		flags |= MSG_DONTWAIT;
 4991	if (flags & MSG_WAITALL)
 4992		min_ret = iov_iter_count(&msg.msg_iter);
 4993
 4994	msg.msg_flags = flags;
 4995	ret = sock_sendmsg(sock, &msg);
 4996	if (ret < min_ret) {
 4997		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
 4998			return -EAGAIN;
 4999		if (ret == -ERESTARTSYS)
 5000			ret = -EINTR;
 5001		req_set_fail(req);
 5002	}
 5003	__io_req_complete(req, issue_flags, ret, 0);
 5004	return 0;
 5005}
 5006
 5007static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
 5008				 struct io_async_msghdr *iomsg)
 5009{
 5010	struct io_sr_msg *sr = &req->sr_msg;
 5011	struct iovec __user *uiov;
 5012	size_t iov_len;
 5013	int ret;
 5014
 5015	ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
 5016					&iomsg->uaddr, &uiov, &iov_len);
 5017	if (ret)
 5018		return ret;
 5019
 5020	if (req->flags & REQ_F_BUFFER_SELECT) {
 5021		if (iov_len > 1)
 5022			return -EINVAL;
 5023		if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
 5024			return -EFAULT;
 5025		sr->len = iomsg->fast_iov[0].iov_len;
 5026		iomsg->free_iov = NULL;
 5027	} else {
 5028		iomsg->free_iov = iomsg->fast_iov;
 5029		ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
 5030				     &iomsg->free_iov, &iomsg->msg.msg_iter,
 5031				     false);
 5032		if (ret > 0)
 5033			ret = 0;
 5034	}
 5035
 5036	return ret;
 5037}
 5038
 5039#ifdef CONFIG_COMPAT
 5040static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 5041					struct io_async_msghdr *iomsg)
 5042{
 5043	struct io_sr_msg *sr = &req->sr_msg;
 5044	struct compat_iovec __user *uiov;
 5045	compat_uptr_t ptr;
 5046	compat_size_t len;
 5047	int ret;
 5048
 5049	ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
 5050				  &ptr, &len);
 5051	if (ret)
 5052		return ret;
 5053
 5054	uiov = compat_ptr(ptr);
 5055	if (req->flags & REQ_F_BUFFER_SELECT) {
 5056		compat_ssize_t clen;
 5057
 5058		if (len > 1)
 5059			return -EINVAL;
 5060		if (!access_ok(uiov, sizeof(*uiov)))
 5061			return -EFAULT;
 5062		if (__get_user(clen, &uiov->iov_len))
 5063			return -EFAULT;
 5064		if (clen < 0)
 5065			return -EINVAL;
 5066		sr->len = clen;
 5067		iomsg->free_iov = NULL;
 5068	} else {
 5069		iomsg->free_iov = iomsg->fast_iov;
 5070		ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
 5071				   UIO_FASTIOV, &iomsg->free_iov,
 5072				   &iomsg->msg.msg_iter, true);
 5073		if (ret < 0)
 5074			return ret;
 5075	}
 5076
 5077	return 0;
 5078}
 5079#endif
 5080
 5081static int io_recvmsg_copy_hdr(struct io_kiocb *req,
 5082			       struct io_async_msghdr *iomsg)
 5083{
 5084	iomsg->msg.msg_name = &iomsg->addr;
 5085
 5086#ifdef CONFIG_COMPAT
 5087	if (req->ctx->compat)
 5088		return __io_compat_recvmsg_copy_hdr(req, iomsg);
 5089#endif
 5090
 5091	return __io_recvmsg_copy_hdr(req, iomsg);
 5092}
 5093
 5094static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
 5095					       unsigned int issue_flags)
 5096{
 5097	struct io_sr_msg *sr = &req->sr_msg;
 5098
 5099	return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
 5100}
 5101
 5102static int io_recvmsg_prep_async(struct io_kiocb *req)
 5103{
 5104	int ret;
 5105
 5106	ret = io_recvmsg_copy_hdr(req, req->async_data);
 5107	if (!ret)
 5108		req->flags |= REQ_F_NEED_CLEANUP;
 5109	return ret;
 5110}
 5111
 5112static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 5113{
 5114	struct io_sr_msg *sr = &req->sr_msg;
 5115
 5116	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5117		return -EINVAL;
 5118
 5119	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 5120	sr->len = READ_ONCE(sqe->len);
 5121	sr->bgid = READ_ONCE(sqe->buf_group);
 5122	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
 5123	if (sr->msg_flags & MSG_DONTWAIT)
 5124		req->flags |= REQ_F_NOWAIT;
 5125
 5126#ifdef CONFIG_COMPAT
 5127	if (req->ctx->compat)
 5128		sr->msg_flags |= MSG_CMSG_COMPAT;
 5129#endif
 5130	return 0;
 5131}
 5132
/*
 * Issue handler for recvmsg.
 *
 * Uses the header kept in the request's async data when present, otherwise
 * copies it from userspace into the on-stack iomsg. With
 * REQ_F_BUFFER_SELECT a provided buffer is selected and the iterator is
 * re-initialised to cover exactly that one buffer.
 *
 * Returns -ENOTSOCK when req->file is not a socket, the header-copy or
 * buffer-selection error if either fails, or the io_setup_async_msg()
 * result when a nonblocking attempt got -EAGAIN. Otherwise the request is
 * completed and 0 is returned.
 */
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	struct io_buffer *kbuf;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (req_has_async_data(req)) {
		kmsg = req->async_data;
	} else {
		ret = io_recvmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	if (req->flags & REQ_F_BUFFER_SELECT) {
		kbuf = io_recv_buffer_select(req, issue_flags);
		if (IS_ERR(kbuf))
			return PTR_ERR(kbuf);
		/* receive into the selected buffer through fast_iov[0] */
		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
		kmsg->fast_iov[0].iov_len = req->sr_msg.len;
		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
				1, req->sr_msg.len);
	}

	flags = req->sr_msg.msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;
	/* with MSG_WAITALL anything short of the full length counts as failure */
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
					kmsg->uaddr, flags);
	if (ret < min_ret) {
		/* keep the header state so the retry does not re-copy it */
		if (ret == -EAGAIN && force_nonblock)
			return io_setup_async_msg(req, kmsg);
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		/* a truncated message under MSG_WAITALL also marks the request failed */
		req_set_fail(req);
	}

	/* fast path, check for non-NULL to avoid function call */
	if (kmsg->free_iov)
		kfree(kmsg->free_iov);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	/* the io_put_kbuf() return value is passed as the last completion argument */
	__io_req_complete(req, issue_flags, ret, io_put_kbuf(req));
	return 0;
}
 5190
/*
 * Issue handler for recv: receives into a single buffer, either the user
 * buffer recorded in req->sr_msg.buf or, with REQ_F_BUFFER_SELECT, a
 * provided buffer selected here.
 *
 * Returns -ENOTSOCK when req->file is not a socket, the buffer-selection
 * error if that fails, or -EAGAIN when a nonblocking attempt would block.
 * Otherwise the request is completed and 0 is returned; this includes a
 * failed import_single_range(), which completes the request with that
 * error.
 */
static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_buffer *kbuf;
	struct io_sr_msg *sr = &req->sr_msg;
	struct msghdr msg;
	void __user *buf = sr->buf;
	struct socket *sock;
	struct iovec iov;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (req->flags & REQ_F_BUFFER_SELECT) {
		kbuf = io_recv_buffer_select(req, issue_flags);
		if (IS_ERR(kbuf))
			return PTR_ERR(kbuf);
		/* the selected buffer replaces the address taken from the SQE */
		buf = u64_to_user_ptr(kbuf->addr);
	}

	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
	/* on failure, jump to the completion code below rather than returning */
	if (unlikely(ret))
		goto out_free;

	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;
	msg.msg_iocb = NULL;
	msg.msg_flags = 0;

	flags = req->sr_msg.msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;
	/* with MSG_WAITALL anything short of the full length counts as failure */
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&msg.msg_iter);

	ret = sock_recvmsg(sock, &msg, flags);
	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock)
			return -EAGAIN;
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		/*
		 * The label sits inside this branch so the import-failure path
		 * above shares the req_set_fail() call and then falls through
		 * to the completion.
		 */
out_free:
		req_set_fail(req);
	}
	/* the io_put_kbuf() return value is passed as the last completion argument */
	__io_req_complete(req, issue_flags, ret, io_put_kbuf(req));
	return 0;
}
 5245
 5246static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 5247{
 5248	struct io_accept *accept = &req->accept;
 5249
 5250	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5251		return -EINVAL;
 5252	if (sqe->ioprio || sqe->len || sqe->buf_index)
 5253		return -EINVAL;
 5254
 5255	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
 5256	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 5257	accept->flags = READ_ONCE(sqe->accept_flags);
 5258	accept->nofile = rlimit(RLIMIT_NOFILE);
 5259
 5260	accept->file_slot = READ_ONCE(sqe->file_index);
 5261	if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
 5262				  (accept->flags & SOCK_CLOEXEC)))
 5263		return -EINVAL;
 5264	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
 5265		return -EINVAL;
 5266	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
 5267		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
 5268	return 0;
 5269}
 5270
 5271static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
 5272{
 5273	struct io_accept *accept = &req->accept;
 5274	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 5275	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
 5276	bool fixed = !!accept->file_slot;
 5277	struct file *file;
 5278	int ret, fd;
 5279
 5280	if (req->file->f_flags & O_NONBLOCK)
 5281		req->flags |= REQ_F_NOWAIT;
 5282
 5283	if (!fixed) {
 5284		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
 5285		if (unlikely(fd < 0))
 5286			return fd;
 5287	}
 5288	file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
 5289			 accept->flags);
 5290	if (IS_ERR(file)) {
 5291		if (!fixed)
 5292			put_unused_fd(fd);
 5293		ret = PTR_ERR(file);
 5294		if (ret == -EAGAIN && force_nonblock)
 5295			return -EAGAIN;
 5296		if (ret == -ERESTARTSYS)
 5297			ret = -EINTR;
 5298		req_set_fail(req);
 5299	} else if (!fixed) {
 5300		fd_install(fd, file);
 5301		ret = fd;
 5302	} else {
 5303		ret = io_install_fixed_file(req, file, issue_flags,
 5304					    accept->file_slot - 1);
 5305	}
 5306	__io_req_complete(req, issue_flags, ret, 0);
 5307	return 0;
 5308}
 5309
 5310static int io_connect_prep_async(struct io_kiocb *req)
 5311{
 5312	struct io_async_connect *io = req->async_data;
 5313	struct io_connect *conn = &req->connect;
 5314
 5315	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
 5316}
 5317
 5318static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 5319{
 5320	struct io_connect *conn = &req->connect;
 5321
 5322	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5323		return -EINVAL;
 5324	if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
 5325	    sqe->splice_fd_in)
 5326		return -EINVAL;
 5327
 5328	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
 5329	conn->addr_len =  READ_ONCE(sqe->addr2);
 5330	return 0;
 5331}
 5332
 5333static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
 5334{
 5335	struct io_async_connect __io, *io;
 5336	unsigned file_flags;
 5337	int ret;
 5338	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 5339
 5340	if (req_has_async_data(req)) {
 5341		io = req->async_data;
 5342	} else {
 5343		ret = move_addr_to_kernel(req->connect.addr,
 5344						req->connect.addr_len,
 5345						&__io.address);
 5346		if (ret)
 5347			goto out;
 5348		io = &__io;
 5349	}
 5350
 5351	file_flags = force_nonblock ? O_NONBLOCK : 0;
 5352
 5353	ret = __sys_connect_file(req->file, &io->address,
 5354					req->connect.addr_len, file_flags);
 5355	if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
 5356		if (req_has_async_data(req))
 5357			return -EAGAIN;
 5358		if (io_alloc_async_data(req)) {
 5359			ret = -ENOMEM;
 5360			goto out;
 5361		}
 5362		memcpy(req->async_data, &__io, sizeof(__io));
 5363		return -EAGAIN;
 5364	}
 5365	if (ret == -ERESTARTSYS)
 5366		ret = -EINTR;
 5367out:
 5368	if (ret < 0)
 5369		req_set_fail(req);
 5370	__io_req_complete(req, issue_flags, ret, 0);
 5371	return 0;
 5372}
 5373#else /* !CONFIG_NET */
/*
 * Stubs used when CONFIG_NET is not set: every handler generated by the
 * macros below simply returns -EOPNOTSUPP.
 */

/* generate the issue handler io_<op>() */
#define IO_NETOP_FN(op)							\
static int io_##op(struct io_kiocb *req, unsigned int issue_flags)	\
{									\
	return -EOPNOTSUPP;						\
}

/* generate io_<op>() plus the prep handler io_<op>_prep() */
#define IO_NETOP_PREP(op)						\
IO_NETOP_FN(op)								\
static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
{									\
	return -EOPNOTSUPP;						\
}									\

/* generate io_<op>(), io_<op>_prep() and io_<op>_prep_async() */
#define IO_NETOP_PREP_ASYNC(op)						\
IO_NETOP_PREP(op)							\
static int io_##op##_prep_async(struct io_kiocb *req)			\
{									\
	return -EOPNOTSUPP;						\
}

IO_NETOP_PREP_ASYNC(sendmsg);
IO_NETOP_PREP_ASYNC(recvmsg);
IO_NETOP_PREP_ASYNC(connect);
IO_NETOP_PREP(accept);
IO_NETOP_FN(send);
IO_NETOP_FN(recv);
 5400#endif /* CONFIG_NET */
 5401
/*
 * Poll table wrapper handed to vfs_poll() while a poll request is being
 * armed; the queue callbacks recover it from the embedded poll_table_struct
 * with container_of().
 */
struct io_poll_table {
	struct poll_table_struct pt;
	/* request being armed */
	struct io_kiocb *req;
	/* number of waitqueue entries added so far by __io_queue_proc() */
	int nr_entries;
	/* 0, or a negative error recorded by __io_queue_proc() */
	int error;
};
 5408
/* set in ->poll_refs by io_poll_mark_cancelled() to flag a cancelled request */
#define IO_POLL_CANCEL_FLAG	BIT(31)
/* low bits of ->poll_refs that hold the reference count */
#define IO_POLL_REF_MASK	((1u << 20)-1)
 5411
 5412/*
 5413 * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
 5414 * bump it and acquire ownership. It's disallowed to modify requests while not
 5415 * owning it, that prevents from races for enqueueing task_work's and b/w
 5416 * arming poll and wakeups.
 5417 */
 5418static inline bool io_poll_get_ownership(struct io_kiocb *req)
 5419{
 5420	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
 5421}
 5422
 5423static void io_poll_mark_cancelled(struct io_kiocb *req)
 5424{
 5425	atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
 5426}
 5427
 5428static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
 5429{
 5430	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
 5431	if (req->opcode == IORING_OP_POLL_ADD)
 5432		return req->async_data;
 5433	return req->apoll->double_poll;
 5434}
 5435
 5436static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
 5437{
 5438	if (req->opcode == IORING_OP_POLL_ADD)
 5439		return &req->poll;
 5440	return &req->apoll->poll;
 5441}
 5442
 5443static void io_poll_req_insert(struct io_kiocb *req)
 5444{
 5445	struct io_ring_ctx *ctx = req->ctx;
 5446	struct hlist_head *list;
 5447
 5448	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
 5449	hlist_add_head(&req->hash_node, list);
 5450}
 5451
 5452static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
 5453			      wait_queue_func_t wake_func)
 5454{
 5455	poll->head = NULL;
 5456#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
 5457	/* mask in events that we always want/need */
 5458	poll->events = events | IO_POLL_UNMASK;
 5459	INIT_LIST_HEAD(&poll->wait.entry);
 5460	init_waitqueue_func_entry(&poll->wait, wake_func);
 5461}
 5462
 5463static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
 5464{
 5465	struct wait_queue_head *head = smp_load_acquire(&poll->head);
 5466
 5467	if (head) {
 5468		spin_lock_irq(&head->lock);
 5469		list_del_init(&poll->wait.entry);
 5470		poll->head = NULL;
 5471		spin_unlock_irq(&head->lock);
 5472	}
 5473}
 5474
 5475static void io_poll_remove_entries(struct io_kiocb *req)
 5476{
 5477	struct io_poll_iocb *poll = io_poll_get_single(req);
 5478	struct io_poll_iocb *poll_double = io_poll_get_double(req);
 5479
 5480	/*
 5481	 * While we hold the waitqueue lock and the waitqueue is nonempty,
 5482	 * wake_up_pollfree() will wait for us.  However, taking the waitqueue
 5483	 * lock in the first place can race with the waitqueue being freed.
 5484	 *
 5485	 * We solve this as eventpoll does: by taking advantage of the fact that
 5486	 * all users of wake_up_pollfree() will RCU-delay the actual free.  If
 5487	 * we enter rcu_read_lock() and see that the pointer to the queue is
 5488	 * non-NULL, we can then lock it without the memory being freed out from
 5489	 * under us.
 5490	 *
 5491	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
 5492	 * case the caller deletes the entry from the queue, leaving it empty.
 5493	 * In that case, only RCU prevents the queue memory from being freed.
 5494	 */
 5495	rcu_read_lock();
 5496	io_poll_remove_entry(poll);
 5497	if (poll_double)
 5498		io_poll_remove_entry(poll_double);
 5499	rcu_read_unlock();
 5500}
 5501
/*
 * All poll task_work should go through this. Checks for poll events, manages
 * references, re-waits if needed, etc.
 *
 * Returns:
 *   < 0  on failure (-ECANCELED if the request was cancelled or a multishot
 *        CQE could not be filled),
 *   > 0  when no further action is required, i.e. a spurious wakeup or a
 *        multishot CQE has been posted,
 *   0    when the request is done; the event mask is then in req->result.
 */
static int io_poll_check_events(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_poll_iocb *poll = io_poll_get_single(req);
	int v;

	/* req->task == current here, checking PF_EXITING is safe */
	if (unlikely(req->task->flags & PF_EXITING))
		io_poll_mark_cancelled(req);

	do {
		v = atomic_read(&req->poll_refs);

		/* tw handler should be the owner, and so have some references */
		if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
			return 0;
		if (v & IO_POLL_CANCEL_FLAG)
			return -ECANCELED;

		/* no mask was passed along with the wakeup, poll the file */
		if (!req->result) {
			struct poll_table_struct pt = { ._key = poll->events };

			req->result = vfs_poll(req->file, &pt) & poll->events;
		}

		/* multishot, just fill a CQE and proceed */
		if (req->result && !(poll->events & EPOLLONESHOT)) {
			__poll_t mask = mangle_poll(req->result & poll->events);
			bool filled;

			spin_lock(&ctx->completion_lock);
			filled = io_fill_cqe_aux(ctx, req->user_data, mask,
						 IORING_CQE_F_MORE);
			io_commit_cqring(ctx);
			spin_unlock(&ctx->completion_lock);
			if (unlikely(!filled))
				return -ECANCELED;
			io_cqring_ev_posted(ctx);
		} else if (req->result) {
			/* oneshot with events: the caller completes the request */
			return 0;
		}

		/*
		 * Release all references, retry if someone tried to restart
		 * task_work while we were executing it.
		 */
	} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));

	return 1;
}
 5560
 5561static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 5562{
 5563	struct io_ring_ctx *ctx = req->ctx;
 5564	int ret;
 5565
 5566	ret = io_poll_check_events(req);
 5567	if (ret > 0)
 5568		return;
 5569
 5570	if (!ret) {
 5571		req->result = mangle_poll(req->result & req->poll.events);
 5572	} else {
 5573		req->result = ret;
 5574		req_set_fail(req);
 5575	}
 5576
 5577	io_poll_remove_entries(req);
 5578	spin_lock(&ctx->completion_lock);
 5579	hash_del(&req->hash_node);
 5580	__io_req_complete_post(req, req->result, 0);
 5581	io_commit_cqring(ctx);
 5582	spin_unlock(&ctx->completion_lock);
 5583	io_cqring_ev_posted(ctx);
 5584}
 5585
 5586static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
 5587{
 5588	struct io_ring_ctx *ctx = req->ctx;
 5589	int ret;
 5590
 5591	ret = io_poll_check_events(req);
 5592	if (ret > 0)
 5593		return;
 5594
 5595	io_poll_remove_entries(req);
 5596	spin_lock(&ctx->completion_lock);
 5597	hash_del(&req->hash_node);
 5598	spin_unlock(&ctx->completion_lock);
 5599
 5600	if (!ret)
 5601		io_req_task_submit(req, locked);
 5602	else
 5603		io_req_complete_failed(req, ret);
 5604}
 5605
 5606static void __io_poll_execute(struct io_kiocb *req, int mask)
 5607{
 5608	req->result = mask;
 5609	if (req->opcode == IORING_OP_POLL_ADD)
 5610		req->io_task_work.func = io_poll_task_func;
 5611	else
 5612		req->io_task_work.func = io_apoll_task_func;
 5613
 5614	trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
 5615	io_req_task_work_add(req, false);
 5616}
 5617
 5618static inline void io_poll_execute(struct io_kiocb *req, int res)
 5619{
 5620	if (io_poll_get_ownership(req))
 5621		__io_poll_execute(req, res);
 5622}
 5623
 5624static void io_poll_cancel_req(struct io_kiocb *req)
 5625{
 5626	io_poll_mark_cancelled(req);
 5627	/* kick tw, which should complete the request */
 5628	io_poll_execute(req, 0);
 5629}
 5630
/*
 * Waitqueue callback installed for every io_uring poll entry.
 *
 * @key carries the event mask of the wakeup. POLLFREE means the waitqueue is
 * going away: the request is marked cancelled and the entry is detached
 * right here. For ordinary wakeups, events that do not match the entry are
 * ignored (returns 0); otherwise task_work is queued if ownership of the
 * request could be taken, and 1 is returned.
 */
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
			void *key)
{
	struct io_kiocb *req = wait->private;
	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
						 wait);
	__poll_t mask = key_to_poll(key);

	if (unlikely(mask & POLLFREE)) {
		io_poll_mark_cancelled(req);
		/* we have to kick tw in case it's not already */
		io_poll_execute(req, 0);

		/*
		 * If the waitqueue is being freed early but someone already
		 * holds ownership over the request, we have to tear down the
		 * request as best we can. That means immediately removing the
		 * request from its waitqueue and preventing all further
		 * accesses to the waitqueue via the request.
		 */
		list_del_init(&poll->wait.entry);

		/*
		 * Careful: this *must* be the last step, since as soon
		 * as poll->head is NULL'ed out, the request can be
		 * completed and freed, since io_poll_remove_entry()
		 * will no longer need to take the waitqueue lock.
		 */
		smp_store_release(&poll->head, NULL);
		return 1;
	}

	/* for instances that support it check for an event match first */
	if (mask && !(mask & poll->events))
		return 0;

	if (io_poll_get_ownership(req)) {
		/* optional, saves extra locking for removal in tw handler */
		if (mask && poll->events & EPOLLONESHOT) {
			list_del_init(&poll->wait.entry);
			poll->head = NULL;
		}
		__io_poll_execute(req, mask);
	}
	return 1;
}
 5677
/*
 * Common poll queue callback: add a wait entry for the request in @pt to the
 * waitqueue @head.
 *
 * @poll is the request's primary poll entry, used for the first waitqueue.
 * @poll_ptr is where a second, dynamically allocated entry is stored if the
 * file queues us on a different waitqueue afterwards. A third distinct
 * waitqueue is not supported and sets pt->error to -EINVAL; an allocation
 * failure sets it to -ENOMEM.
 */
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
			    struct wait_queue_head *head,
			    struct io_poll_iocb **poll_ptr)
{
	struct io_kiocb *req = pt->req;

	/*
	 * The file being polled uses multiple waitqueues for poll handling
	 * (e.g. one for read, one for write). Setup a separate io_poll_iocb
	 * if this happens.
	 */
	if (unlikely(pt->nr_entries)) {
		struct io_poll_iocb *first = poll;

		/* double add on the same waitqueue head, ignore */
		if (first->head == head)
			return;
		/* already have a 2nd entry, fail a third attempt */
		if (*poll_ptr) {
			if ((*poll_ptr)->head == head)
				return;
			pt->error = -EINVAL;
			return;
		}

		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
		if (!poll) {
			pt->error = -ENOMEM;
			return;
		}
		/* the second entry polls for the same events, same callback */
		io_init_poll_iocb(poll, first->events, first->wait.func);
		*poll_ptr = poll;
		/* for pure poll the entry lives in ->async_data, flag that */
		if (req->opcode == IORING_OP_POLL_ADD)
			req->flags |= REQ_F_ASYNC_DATA;
	}

	pt->nr_entries++;
	poll->head = head;
	poll->wait.private = req;

	if (poll->events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(head, &poll->wait);
	else
		add_wait_queue(head, &poll->wait);
}
 5723
 5724static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 5725			       struct poll_table_struct *p)
 5726{
 5727	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
 5728
 5729	__io_queue_proc(&pt->req->poll, pt, head,
 5730			(struct io_poll_iocb **) &pt->req->async_data);
 5731}
 5732
 5733static int __io_arm_poll_handler(struct io_kiocb *req,
 5734				 struct io_poll_iocb *poll,
 5735				 struct io_poll_table *ipt, __poll_t mask)
 5736{
 5737	struct io_ring_ctx *ctx = req->ctx;
 5738	int v;
 5739
 5740	INIT_HLIST_NODE(&req->hash_node);
 5741	io_init_poll_iocb(poll, mask, io_poll_wake);
 5742	poll->file = req->file;
 5743	poll->wait.private = req;
 5744
 5745	ipt->pt._key = mask;
 5746	ipt->req = req;
 5747	ipt->error = 0;
 5748	ipt->nr_entries = 0;
 5749
 5750	/*
 5751	 * Take the ownership to delay any tw execution up until we're done
 5752	 * with poll arming. see io_poll_get_ownership().
 5753	 */
 5754	atomic_set(&req->poll_refs, 1);
 5755	mask = vfs_poll(req->file, &ipt->pt) & poll->events;
 5756
 5757	if (mask && (poll->events & EPOLLONESHOT)) {
 5758		io_poll_remove_entries(req);
 5759		/* no one else has access to the req, forget about the ref */
 5760		return mask;
 5761	}
 5762	if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
 5763		io_poll_remove_entries(req);
 5764		if (!ipt->error)
 5765			ipt->error = -EINVAL;
 5766		return 0;
 5767	}
 5768
 5769	spin_lock(&ctx->completion_lock);
 5770	io_poll_req_insert(req);
 5771	spin_unlock(&ctx->completion_lock);
 5772
 5773	if (mask) {
 5774		/* can't multishot if failed, just queue the event we've got */
 5775		if (unlikely(ipt->error || !ipt->nr_entries))
 5776			poll->events |= EPOLLONESHOT;
 5777		__io_poll_execute(req, mask);
 5778		return 0;
 5779	}
 5780
 5781	/*
 5782	 * Release ownership. If someone tried to queue a tw while it was
 5783	 * locked, kick it off for them.
 5784	 */
 5785	v = atomic_dec_return(&req->poll_refs);
 5786	if (unlikely(v & IO_POLL_REF_MASK))
 5787		__io_poll_execute(req, 0);
 5788	return 0;
 5789}
 5790
 5791static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
 5792			       struct poll_table_struct *p)
 5793{
 5794	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
 5795	struct async_poll *apoll = pt->req->apoll;
 5796
 5797	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
 5798}
 5799
/* Return values of io_arm_poll_handler() */
enum {
	/* poll was armed for the request */
	IO_APOLL_OK,
	/* poll could not be used or arming failed */
	IO_APOLL_ABORTED,
	/* events were already pending when arming */
	IO_APOLL_READY
};
 5805
 5806static int io_arm_poll_handler(struct io_kiocb *req)
 5807{
 5808	const struct io_op_def *def = &io_op_defs[req->opcode];
 5809	struct io_ring_ctx *ctx = req->ctx;
 5810	struct async_poll *apoll;
 5811	struct io_poll_table ipt;
 5812	__poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
 5813	int ret;
 5814
 5815	if (!def->pollin && !def->pollout)
 5816		return IO_APOLL_ABORTED;
 5817	if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
 5818		return IO_APOLL_ABORTED;
 5819
 5820	if (def->pollin) {
 5821		mask |= POLLIN | POLLRDNORM;
 5822
 5823		/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
 5824		if ((req->opcode == IORING_OP_RECVMSG) &&
 5825		    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
 5826			mask &= ~POLLIN;
 5827	} else {
 5828		mask |= POLLOUT | POLLWRNORM;
 5829	}
 5830
 5831	apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
 5832	if (unlikely(!apoll))
 5833		return IO_APOLL_ABORTED;
 5834	apoll->double_poll = NULL;
 5835	req->apoll = apoll;
 5836	req->flags |= REQ_F_POLLED;
 5837	ipt.pt._qproc = io_async_queue_proc;
 5838
 5839	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
 5840	if (ret || ipt.error)
 5841		return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
 5842
 5843	trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
 5844				mask, apoll->poll.events);
 5845	return IO_APOLL_OK;
 5846}
 5847
 5848/*
 5849 * Returns true if we found and killed one or more poll requests
 5850 */
 5851static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
 5852				      struct task_struct *tsk, bool cancel_all)
 5853{
 5854	struct hlist_node *tmp;
 5855	struct io_kiocb *req;
 5856	bool found = false;
 5857	int i;
 5858
 5859	spin_lock(&ctx->completion_lock);
 5860	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
 5861		struct hlist_head *list;
 5862
 5863		list = &ctx->cancel_hash[i];
 5864		hlist_for_each_entry_safe(req, tmp, list, hash_node) {
 5865			if (io_match_task_safe(req, tsk, cancel_all)) {
 5866				io_poll_cancel_req(req);
 5867				found = true;
 5868			}
 5869		}
 5870	}
 5871	spin_unlock(&ctx->completion_lock);
 5872	return found;
 5873}
 5874
 5875static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
 5876				     bool poll_only)
 5877	__must_hold(&ctx->completion_lock)
 5878{
 5879	struct hlist_head *list;
 5880	struct io_kiocb *req;
 5881
 5882	list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
 5883	hlist_for_each_entry(req, list, hash_node) {
 5884		if (sqe_addr != req->user_data)
 5885			continue;
 5886		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
 5887			continue;
 5888		return req;
 5889	}
 5890	return NULL;
 5891}
 5892
 5893static bool io_poll_disarm(struct io_kiocb *req)
 5894	__must_hold(&ctx->completion_lock)
 5895{
 5896	if (!io_poll_get_ownership(req))
 5897		return false;
 5898	io_poll_remove_entries(req);
 5899	hash_del(&req->hash_node);
 5900	return true;
 5901}
 5902
 5903static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
 5904			  bool poll_only)
 5905	__must_hold(&ctx->completion_lock)
 5906{
 5907	struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);
 5908
 5909	if (!req)
 5910		return -ENOENT;
 5911	io_poll_cancel_req(req);
 5912	return 0;
 5913}
 5914
 5915static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
 5916				     unsigned int flags)
 5917{
 5918	u32 events;
 5919
 5920	events = READ_ONCE(sqe->poll32_events);
 5921#ifdef __BIG_ENDIAN
 5922	events = swahw32(events);
 5923#endif
 5924	if (!(flags & IORING_POLL_ADD_MULTI))
 5925		events |= EPOLLONESHOT;
 5926	return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
 5927}
 5928
 5929static int io_poll_update_prep(struct io_kiocb *req,
 5930			       const struct io_uring_sqe *sqe)
 5931{
 5932	struct io_poll_update *upd = &req->poll_update;
 5933	u32 flags;
 5934
 5935	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5936		return -EINVAL;
 5937	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
 5938		return -EINVAL;
 5939	flags = READ_ONCE(sqe->len);
 5940	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
 5941		      IORING_POLL_ADD_MULTI))
 5942		return -EINVAL;
 5943	/* meaningless without update */
 5944	if (flags == IORING_POLL_ADD_MULTI)
 5945		return -EINVAL;
 5946
 5947	upd->old_user_data = READ_ONCE(sqe->addr);
 5948	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
 5949	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
 5950
 5951	upd->new_user_data = READ_ONCE(sqe->off);
 5952	if (!upd->update_user_data && upd->new_user_data)
 5953		return -EINVAL;
 5954	if (upd->update_events)
 5955		upd->events = io_poll_parse_events(sqe, flags);
 5956	else if (sqe->poll32_events)
 5957		return -EINVAL;
 5958
 5959	return 0;
 5960}
 5961
 5962static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 5963{
 5964	struct io_poll_iocb *poll = &req->poll;
 5965	u32 flags;
 5966
 5967	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 5968		return -EINVAL;
 5969	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
 5970		return -EINVAL;
 5971	flags = READ_ONCE(sqe->len);
 5972	if (flags & ~IORING_POLL_ADD_MULTI)
 5973		return -EINVAL;
 5974	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
 5975		return -EINVAL;
 5976
 5977	io_req_set_refcount(req);
 5978	poll->events = io_poll_parse_events(sqe, flags);
 5979	return 0;
 5980}
 5981
 5982static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 5983{
 5984	struct io_poll_iocb *poll = &req->poll;
 5985	struct io_poll_table ipt;
 5986	int ret;
 5987
 5988	ipt.pt._qproc = io_poll_queue_proc;
 5989
 5990	ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
 5991	ret = ret ?: ipt.error;
 5992	if (ret)
 5993		__io_req_complete(req, issue_flags, ret, 0);
 5994	return 0;
 5995}
 5996
/*
 * Issue handler for poll update/remove.
 *
 * Finds the IORING_OP_POLL_ADD request whose user_data matches
 * old_user_data and disarms it. With an update requested, the poll request
 * is modified and re-armed through io_poll_add(); otherwise it is completed
 * with -ECANCELED. The update request itself completes with 0, -ENOENT if no
 * such poll request exists, or -EALREADY if it could not be disarmed.
 */
static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *preq;
	int ret2, ret = 0;
	bool locked;

	spin_lock(&ctx->completion_lock);
	preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
	if (!preq || !io_poll_disarm(preq)) {
		spin_unlock(&ctx->completion_lock);
		/* found but owned by someone else vs. not found at all */
		ret = preq ? -EALREADY : -ENOENT;
		goto out;
	}
	spin_unlock(&ctx->completion_lock);

	if (req->poll_update.update_events || req->poll_update.update_user_data) {
		/* only mask one event flags, keep behavior flags */
		if (req->poll_update.update_events) {
			preq->poll.events &= ~0xffff;
			preq->poll.events |= req->poll_update.events & 0xffff;
			preq->poll.events |= IO_POLL_UNMASK;
		}
		if (req->poll_update.update_user_data)
			preq->user_data = req->poll_update.new_user_data;

		/* re-arm the modified poll request */
		ret2 = io_poll_add(preq, issue_flags);
		/* successfully updated, don't complete poll request */
		if (!ret2)
			goto out;
	}

	/* plain removal (or failed re-arm): cancel the poll request */
	req_set_fail(preq);
	preq->result = -ECANCELED;
	locked = !(issue_flags & IO_URING_F_UNLOCKED);
	io_req_task_complete(preq, &locked);
out:
	if (ret < 0)
		req_set_fail(req);
	/* complete update request, we're done with it */
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}
 6040
 6041static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 6042{
 6043	struct io_timeout_data *data = container_of(timer,
 6044						struct io_timeout_data, timer);
 6045	struct io_kiocb *req = data->req;
 6046	struct io_ring_ctx *ctx = req->ctx;
 6047	unsigned long flags;
 6048
 6049	spin_lock_irqsave(&ctx->timeout_lock, flags);
 6050	list_del_init(&req->timeout.list);
 6051	atomic_set(&req->ctx->cq_timeouts,
 6052		atomic_read(&req->ctx->cq_timeouts) + 1);
 6053	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
 6054
 6055	if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
 6056		req_set_fail(req);
 6057
 6058	req->result = -ETIME;
 6059	req->io_task_work.func = io_req_task_complete;
 6060	io_req_task_work_add(req, false);
 6061	return HRTIMER_NORESTART;
 6062}
 6063
 6064static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
 6065					   __u64 user_data)
 6066	__must_hold(&ctx->timeout_lock)
 6067{
 6068	struct io_timeout_data *io;
 6069	struct io_kiocb *req;
 6070	bool found = false;
 6071
 6072	list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
 6073		found = user_data == req->user_data;
 6074		if (found)
 6075			break;
 6076	}
 6077	if (!found)
 6078		return ERR_PTR(-ENOENT);
 6079
 6080	io = req->async_data;
 6081	if (hrtimer_try_to_cancel(&io->timer) == -1)
 6082		return ERR_PTR(-EALREADY);
 6083	list_del_init(&req->timeout.list);
 6084	return req;
 6085}
 6086
 6087static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 6088	__must_hold(&ctx->completion_lock)
 6089	__must_hold(&ctx->timeout_lock)
 6090{
 6091	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
 6092
 6093	if (IS_ERR(req))
 6094		return PTR_ERR(req);
 6095
 6096	req_set_fail(req);
 6097	io_fill_cqe_req(req, -ECANCELED, 0);
 6098	io_put_req_deferred(req);
 6099	return 0;
 6100}
 6101
 6102static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
 6103{
 6104	switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
 6105	case IORING_TIMEOUT_BOOTTIME:
 6106		return CLOCK_BOOTTIME;
 6107	case IORING_TIMEOUT_REALTIME:
 6108		return CLOCK_REALTIME;
 6109	default:
 6110		/* can't happen, vetted at prep time */
 6111		WARN_ON_ONCE(1);
 6112		fallthrough;
 6113	case 0:
 6114		return CLOCK_MONOTONIC;
 6115	}
 6116}
 6117
 6118static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 6119				    struct timespec64 *ts, enum hrtimer_mode mode)
 6120	__must_hold(&ctx->timeout_lock)
 6121{
 6122	struct io_timeout_data *io;
 6123	struct io_kiocb *req;
 6124	bool found = false;
 6125
 6126	list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
 6127		found = user_data == req->user_data;
 6128		if (found)
 6129			break;
 6130	}
 6131	if (!found)
 6132		return -ENOENT;
 6133
 6134	io = req->async_data;
 6135	if (hrtimer_try_to_cancel(&io->timer) == -1)
 6136		return -EALREADY;
 6137	hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
 6138	io->timer.function = io_link_timeout_fn;
 6139	hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
 6140	return 0;
 6141}
 6142
 6143static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
 6144			     struct timespec64 *ts, enum hrtimer_mode mode)
 6145	__must_hold(&ctx->timeout_lock)
 6146{
 6147	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
 6148	struct io_timeout_data *data;
 6149
 6150	if (IS_ERR(req))
 6151		return PTR_ERR(req);
 6152
 6153	req->timeout.off = 0; /* noseq */
 6154	data = req->async_data;
 6155	list_add_tail(&req->timeout.list, &ctx->timeout_list);
 6156	hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
 6157	data->timer.function = io_timeout_fn;
 6158	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
 6159	return 0;
 6160}
 6161
 6162static int io_timeout_remove_prep(struct io_kiocb *req,
 6163				  const struct io_uring_sqe *sqe)
 6164{
 6165	struct io_timeout_rem *tr = &req->timeout_rem;
 6166
 6167	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 6168		return -EINVAL;
 6169	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 6170		return -EINVAL;
 6171	if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
 6172		return -EINVAL;
 6173
 6174	tr->ltimeout = false;
 6175	tr->addr = READ_ONCE(sqe->addr);
 6176	tr->flags = READ_ONCE(sqe->timeout_flags);
 6177	if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
 6178		if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
 6179			return -EINVAL;
 6180		if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
 6181			tr->ltimeout = true;
 6182		if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
 6183			return -EINVAL;
 6184		if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
 6185			return -EFAULT;
 6186		if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
 6187			return -EINVAL;
 6188	} else if (tr->flags) {
 6189		/* timeout removal doesn't support flags */
 6190		return -EINVAL;
 6191	}
 6192
 6193	return 0;
 6194}
 6195
 6196static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
 6197{
 6198	return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
 6199					    : HRTIMER_MODE_REL;
 6200}
 6201
 6202/*
 6203 * Remove or update an existing timeout command
 6204 */
 6205static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
 6206{
 6207	struct io_timeout_rem *tr = &req->timeout_rem;
 6208	struct io_ring_ctx *ctx = req->ctx;
 6209	int ret;
 6210
 6211	if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
 6212		spin_lock(&ctx->completion_lock);
 6213		spin_lock_irq(&ctx->timeout_lock);
 6214		ret = io_timeout_cancel(ctx, tr->addr);
 6215		spin_unlock_irq(&ctx->timeout_lock);
 6216		spin_unlock(&ctx->completion_lock);
 6217	} else {
 6218		enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
 6219
 6220		spin_lock_irq(&ctx->timeout_lock);
 6221		if (tr->ltimeout)
 6222			ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
 6223		else
 6224			ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
 6225		spin_unlock_irq(&ctx->timeout_lock);
 6226	}
 6227
 6228	if (ret < 0)
 6229		req_set_fail(req);
 6230	io_req_complete_post(req, ret, 0);
 6231	return 0;
 6232}
 6233
 6234static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 6235			   bool is_timeout_link)
 6236{
 6237	struct io_timeout_data *data;
 6238	unsigned flags;
 6239	u32 off = READ_ONCE(sqe->off);
 6240
 6241	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 6242		return -EINVAL;
 6243	if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
 6244	    sqe->splice_fd_in)
 6245		return -EINVAL;
 6246	if (off && is_timeout_link)
 6247		return -EINVAL;
 6248	flags = READ_ONCE(sqe->timeout_flags);
 6249	if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
 6250		      IORING_TIMEOUT_ETIME_SUCCESS))
 6251		return -EINVAL;
 6252	/* more than one clock specified is invalid, obviously */
 6253	if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
 6254		return -EINVAL;
 6255
 6256	INIT_LIST_HEAD(&req->timeout.list);
 6257	req->timeout.off = off;
 6258	if (unlikely(off && !req->ctx->off_timeout_used))
 6259		req->ctx->off_timeout_used = true;
 6260
 6261	if (WARN_ON_ONCE(req_has_async_data(req)))
 6262		return -EFAULT;
 6263	if (io_alloc_async_data(req))
 6264		return -ENOMEM;
 6265
 6266	data = req->async_data;
 6267	data->req = req;
 6268	data->flags = flags;
 6269
 6270	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
 6271		return -EFAULT;
 6272
 6273	if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
 6274		return -EINVAL;
 6275
 6276	data->mode = io_translate_timeout_mode(flags);
 6277	hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
 6278
 6279	if (is_timeout_link) {
 6280		struct io_submit_link *link = &req->ctx->submit_state.link;
 6281
 6282		if (!link->head)
 6283			return -EINVAL;
 6284		if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
 6285			return -EINVAL;
 6286		req->timeout.head = link->last;
 6287		link->last->flags |= REQ_F_ARM_LTIMEOUT;
 6288	}
 6289	return 0;
 6290}
 6291
 6292static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
 6293{
 6294	struct io_ring_ctx *ctx = req->ctx;
 6295	struct io_timeout_data *data = req->async_data;
 6296	struct list_head *entry;
 6297	u32 tail, off = req->timeout.off;
 6298
 6299	spin_lock_irq(&ctx->timeout_lock);
 6300
 6301	/*
 6302	 * sqe->off holds how many events that need to occur for this
 6303	 * timeout event to be satisfied. If it isn't set, then this is
 6304	 * a pure timeout request, sequence isn't used.
 6305	 */
 6306	if (io_is_timeout_noseq(req)) {
 6307		entry = ctx->timeout_list.prev;
 6308		goto add;
 6309	}
 6310
 6311	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
 6312	req->timeout.target_seq = tail + off;
 6313
 6314	/* Update the last seq here in case io_flush_timeouts() hasn't.
 6315	 * This is safe because ->completion_lock is held, and submissions
 6316	 * and completions are never mixed in the same ->completion_lock section.
 6317	 */
 6318	ctx->cq_last_tm_flush = tail;
 6319
 6320	/*
 6321	 * Insertion sort, ensuring the first entry in the list is always
 6322	 * the one we need first.
 6323	 */
 6324	list_for_each_prev(entry, &ctx->timeout_list) {
 6325		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
 6326						  timeout.list);
 6327
 6328		if (io_is_timeout_noseq(nxt))
 6329			continue;
 6330		/* nxt.seq is behind @tail, otherwise would've been completed */
 6331		if (off >= nxt->timeout.target_seq - tail)
 6332			break;
 6333	}
 6334add:
 6335	list_add(&req->timeout.list, entry);
 6336	data->timer.function = io_timeout_fn;
 6337	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
 6338	spin_unlock_irq(&ctx->timeout_lock);
 6339	return 0;
 6340}
 6341
 6342struct io_cancel_data {
 6343	struct io_ring_ctx *ctx;
 6344	u64 user_data;
 6345};
 6346
 6347static bool io_cancel_cb(struct io_wq_work *work, void *data)
 6348{
 6349	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 6350	struct io_cancel_data *cd = data;
 6351
 6352	return req->ctx == cd->ctx && req->user_data == cd->user_data;
 6353}
 6354
 6355static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
 6356			       struct io_ring_ctx *ctx)
 6357{
 6358	struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
 6359	enum io_wq_cancel cancel_ret;
 6360	int ret = 0;
 6361
 6362	if (!tctx || !tctx->io_wq)
 6363		return -ENOENT;
 6364
 6365	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
 6366	switch (cancel_ret) {
 6367	case IO_WQ_CANCEL_OK:
 6368		ret = 0;
 6369		break;
 6370	case IO_WQ_CANCEL_RUNNING:
 6371		ret = -EALREADY;
 6372		break;
 6373	case IO_WQ_CANCEL_NOTFOUND:
 6374		ret = -ENOENT;
 6375		break;
 6376	}
 6377
 6378	return ret;
 6379}
 6380
 6381static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
 6382{
 6383	struct io_ring_ctx *ctx = req->ctx;
 6384	int ret;
 6385
 6386	WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
 6387
 6388	ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
 6389	/*
 6390	 * Fall-through even for -EALREADY, as we may have poll armed
 6391	 * that need unarming.
 6392	 */
 6393	if (!ret)
 6394		return 0;
 6395
 6396	spin_lock(&ctx->completion_lock);
 6397	ret = io_poll_cancel(ctx, sqe_addr, false);
 6398	if (ret != -ENOENT)
 6399		goto out;
 6400
 6401	spin_lock_irq(&ctx->timeout_lock);
 6402	ret = io_timeout_cancel(ctx, sqe_addr);
 6403	spin_unlock_irq(&ctx->timeout_lock);
 6404out:
 6405	spin_unlock(&ctx->completion_lock);
 6406	return ret;
 6407}
 6408
 6409static int io_async_cancel_prep(struct io_kiocb *req,
 6410				const struct io_uring_sqe *sqe)
 6411{
 6412	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 6413		return -EINVAL;
 6414	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 6415		return -EINVAL;
 6416	if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
 6417	    sqe->splice_fd_in)
 6418		return -EINVAL;
 6419
 6420	req->cancel.addr = READ_ONCE(sqe->addr);
 6421	return 0;
 6422}
 6423
 6424static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 6425{
 6426	struct io_ring_ctx *ctx = req->ctx;
 6427	u64 sqe_addr = req->cancel.addr;
 6428	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 6429	struct io_tctx_node *node;
 6430	int ret;
 6431
 6432	ret = io_try_cancel_userdata(req, sqe_addr);
 6433	if (ret != -ENOENT)
 6434		goto done;
 6435
 6436	/* slow path, try all io-wq's */
 6437	io_ring_submit_lock(ctx, needs_lock);
 6438	ret = -ENOENT;
 6439	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
 6440		struct io_uring_task *tctx = node->task->io_uring;
 6441
 6442		ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
 6443		if (ret != -ENOENT)
 6444			break;
 6445	}
 6446	io_ring_submit_unlock(ctx, needs_lock);
 6447done:
 6448	if (ret < 0)
 6449		req_set_fail(req);
 6450	io_req_complete_post(req, ret, 0);
 6451	return 0;
 6452}
 6453
 6454static int io_rsrc_update_prep(struct io_kiocb *req,
 6455				const struct io_uring_sqe *sqe)
 6456{
 6457	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
 6458		return -EINVAL;
 6459	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
 6460		return -EINVAL;
 6461
 6462	req->rsrc_update.offset = READ_ONCE(sqe->off);
 6463	req->rsrc_update.nr_args = READ_ONCE(sqe->len);
 6464	if (!req->rsrc_update.nr_args)
 6465		return -EINVAL;
 6466	req->rsrc_update.arg = READ_ONCE(sqe->addr);
 6467	return 0;
 6468}
 6469
 6470static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 6471{
 6472	struct io_ring_ctx *ctx = req->ctx;
 6473	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 6474	struct io_uring_rsrc_update2 up;
 6475	int ret;
 6476
 6477	up.offset = req->rsrc_update.offset;
 6478	up.data = req->rsrc_update.arg;
 6479	up.nr = 0;
 6480	up.tags = 0;
 6481	up.resv = 0;
 6482
 6483	io_ring_submit_lock(ctx, needs_lock);
 6484	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
 6485					&up, req->rsrc_update.nr_args);
 6486	io_ring_submit_unlock(ctx, needs_lock);
 6487
 6488	if (ret < 0)
 6489		req_set_fail(req);
 6490	__io_req_complete(req, issue_flags, ret, 0);
 6491	return 0;
 6492}
 6493
 6494static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 6495{
 6496	switch (req->opcode) {
 6497	case IORING_OP_NOP:
 6498		return 0;
 6499	case IORING_OP_READV:
 6500	case IORING_OP_READ_FIXED:
 6501	case IORING_OP_READ:
 6502		return io_read_prep(req, sqe);
 6503	case IORING_OP_WRITEV:
 6504	case IORING_OP_WRITE_FIXED:
 6505	case IORING_OP_WRITE:
 6506		return io_write_prep(req, sqe);
 6507	case IORING_OP_POLL_ADD:
 6508		return io_poll_add_prep(req, sqe);
 6509	case IORING_OP_POLL_REMOVE:
 6510		return io_poll_update_prep(req, sqe);
 6511	case IORING_OP_FSYNC:
 6512		return io_fsync_prep(req, sqe);
 6513	case IORING_OP_SYNC_FILE_RANGE:
 6514		return io_sfr_prep(req, sqe);
 6515	case IORING_OP_SENDMSG:
 6516	case IORING_OP_SEND:
 6517		return io_sendmsg_prep(req, sqe);
 6518	case IORING_OP_RECVMSG:
 6519	case IORING_OP_RECV:
 6520		return io_recvmsg_prep(req, sqe);
 6521	case IORING_OP_CONNECT:
 6522		return io_connect_prep(req, sqe);
 6523	case IORING_OP_TIMEOUT:
 6524		return io_timeout_prep(req, sqe, false);
 6525	case IORING_OP_TIMEOUT_REMOVE:
 6526		return io_timeout_remove_prep(req, sqe);
 6527	case IORING_OP_ASYNC_CANCEL:
 6528		return io_async_cancel_prep(req, sqe);
 6529	case IORING_OP_LINK_TIMEOUT:
 6530		return io_timeout_prep(req, sqe, true);
 6531	case IORING_OP_ACCEPT:
 6532		return io_accept_prep(req, sqe);
 6533	case IORING_OP_FALLOCATE:
 6534		return io_fallocate_prep(req, sqe);
 6535	case IORING_OP_OPENAT:
 6536		return io_openat_prep(req, sqe);
 6537	case IORING_OP_CLOSE:
 6538		return io_close_prep(req, sqe);
 6539	case IORING_OP_FILES_UPDATE:
 6540		return io_rsrc_update_prep(req, sqe);
 6541	case IORING_OP_STATX:
 6542		return io_statx_prep(req, sqe);
 6543	case IORING_OP_FADVISE:
 6544		return io_fadvise_prep(req, sqe);
 6545	case IORING_OP_MADVISE:
 6546		return io_madvise_prep(req, sqe);
 6547	case IORING_OP_OPENAT2:
 6548		return io_openat2_prep(req, sqe);
 6549	case IORING_OP_EPOLL_CTL:
 6550		return io_epoll_ctl_prep(req, sqe);
 6551	case IORING_OP_SPLICE:
 6552		return io_splice_prep(req, sqe);
 6553	case IORING_OP_PROVIDE_BUFFERS:
 6554		return io_provide_buffers_prep(req, sqe);
 6555	case IORING_OP_REMOVE_BUFFERS:
 6556		return io_remove_buffers_prep(req, sqe);
 6557	case IORING_OP_TEE:
 6558		return io_tee_prep(req, sqe);
 6559	case IORING_OP_SHUTDOWN:
 6560		return io_shutdown_prep(req, sqe);
 6561	case IORING_OP_RENAMEAT:
 6562		return io_renameat_prep(req, sqe);
 6563	case IORING_OP_UNLINKAT:
 6564		return io_unlinkat_prep(req, sqe);
 6565	case IORING_OP_MKDIRAT:
 6566		return io_mkdirat_prep(req, sqe);
 6567	case IORING_OP_SYMLINKAT:
 6568		return io_symlinkat_prep(req, sqe);
 6569	case IORING_OP_LINKAT:
 6570		return io_linkat_prep(req, sqe);
 6571	}
 6572
 6573	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
 6574			req->opcode);
 6575	return -EINVAL;
 6576}
 6577
 6578static int io_req_prep_async(struct io_kiocb *req)
 6579{
 6580	if (!io_op_defs[req->opcode].needs_async_setup)
 6581		return 0;
 6582	if (WARN_ON_ONCE(req_has_async_data(req)))
 6583		return -EFAULT;
 6584	if (io_alloc_async_data(req))
 6585		return -EAGAIN;
 6586
 6587	switch (req->opcode) {
 6588	case IORING_OP_READV:
 6589		return io_rw_prep_async(req, READ);
 6590	case IORING_OP_WRITEV:
 6591		return io_rw_prep_async(req, WRITE);
 6592	case IORING_OP_SENDMSG:
 6593		return io_sendmsg_prep_async(req);
 6594	case IORING_OP_RECVMSG:
 6595		return io_recvmsg_prep_async(req);
 6596	case IORING_OP_CONNECT:
 6597		return io_connect_prep_async(req);
 6598	}
 6599	printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
 6600		    req->opcode);
 6601	return -EFAULT;
 6602}
 6603
 6604static u32 io_get_sequence(struct io_kiocb *req)
 6605{
 6606	u32 seq = req->ctx->cached_sq_head;
 6607
 6608	/* need original cached_sq_head, but it was increased for each req */
 6609	io_for_each_link(req, req)
 6610		seq--;
 6611	return seq;
 6612}
 6613
 6614static __cold void io_drain_req(struct io_kiocb *req)
 6615{
 6616	struct io_ring_ctx *ctx = req->ctx;
 6617	struct io_defer_entry *de;
 6618	int ret;
 6619	u32 seq = io_get_sequence(req);
 6620
 6621	/* Still need defer if there is pending req in defer list. */
 6622	spin_lock(&ctx->completion_lock);
 6623	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
 6624		spin_unlock(&ctx->completion_lock);
 6625queue:
 6626		ctx->drain_active = false;
 6627		io_req_task_queue(req);
 6628		return;
 6629	}
 6630	spin_unlock(&ctx->completion_lock);
 6631
 6632	ret = io_req_prep_async(req);
 6633	if (ret) {
 6634fail:
 6635		io_req_complete_failed(req, ret);
 6636		return;
 6637	}
 6638	io_prep_async_link(req);
 6639	de = kmalloc(sizeof(*de), GFP_KERNEL);
 6640	if (!de) {
 6641		ret = -ENOMEM;
 6642		goto fail;
 6643	}
 6644
 6645	spin_lock(&ctx->completion_lock);
 6646	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
 6647		spin_unlock(&ctx->completion_lock);
 6648		kfree(de);
 6649		goto queue;
 6650	}
 6651
 6652	trace_io_uring_defer(ctx, req, req->user_data);
 6653	de->req = req;
 6654	de->seq = seq;
 6655	list_add_tail(&de->list, &ctx->defer_list);
 6656	spin_unlock(&ctx->completion_lock);
 6657}
 6658
 6659static void io_clean_op(struct io_kiocb *req)
 6660{
 6661	if (req->flags & REQ_F_BUFFER_SELECTED)
 6662		io_put_kbuf(req);
 6663
 6664	if (req->flags & REQ_F_NEED_CLEANUP) {
 6665		switch (req->opcode) {
 6666		case IORING_OP_READV:
 6667		case IORING_OP_READ_FIXED:
 6668		case IORING_OP_READ:
 6669		case IORING_OP_WRITEV:
 6670		case IORING_OP_WRITE_FIXED:
 6671		case IORING_OP_WRITE: {
 6672			struct io_async_rw *io = req->async_data;
 6673
 6674			kfree(io->free_iovec);
 6675			break;
 6676			}
 6677		case IORING_OP_RECVMSG:
 6678		case IORING_OP_SENDMSG: {
 6679			struct io_async_msghdr *io = req->async_data;
 6680
 6681			kfree(io->free_iov);
 6682			break;
 6683			}
 6684		case IORING_OP_SPLICE:
 6685		case IORING_OP_TEE:
 6686			if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
 6687				io_put_file(req->splice.file_in);
 6688			break;
 6689		case IORING_OP_OPENAT:
 6690		case IORING_OP_OPENAT2:
 6691			if (req->open.filename)
 6692				putname(req->open.filename);
 6693			break;
 6694		case IORING_OP_RENAMEAT:
 6695			putname(req->rename.oldpath);
 6696			putname(req->rename.newpath);
 6697			break;
 6698		case IORING_OP_UNLINKAT:
 6699			putname(req->unlink.filename);
 6700			break;
 6701		case IORING_OP_MKDIRAT:
 6702			putname(req->mkdir.filename);
 6703			break;
 6704		case IORING_OP_SYMLINKAT:
 6705			putname(req->symlink.oldpath);
 6706			putname(req->symlink.newpath);
 6707			break;
 6708		case IORING_OP_LINKAT:
 6709			putname(req->hardlink.oldpath);
 6710			putname(req->hardlink.newpath);
 6711			break;
 6712		}
 6713	}
 6714	if ((req->flags & REQ_F_POLLED) && req->apoll) {
 6715		kfree(req->apoll->double_poll);
 6716		kfree(req->apoll);
 6717		req->apoll = NULL;
 6718	}
 6719	if (req->flags & REQ_F_INFLIGHT) {
 6720		struct io_uring_task *tctx = req->task->io_uring;
 6721
 6722		atomic_dec(&tctx->inflight_tracked);
 6723	}
 6724	if (req->flags & REQ_F_CREDS)
 6725		put_cred(req->creds);
 6726	if (req->flags & REQ_F_ASYNC_DATA) {
 6727		kfree(req->async_data);
 6728		req->async_data = NULL;
 6729	}
 6730	req->flags &= ~IO_REQ_CLEAN_FLAGS;
 6731}
 6732
 6733static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 6734{
 6735	const struct cred *creds = NULL;
 6736	int ret;
 6737
 6738	if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
 6739		creds = override_creds(req->creds);
 6740
 6741	if (!io_op_defs[req->opcode].audit_skip)
 6742		audit_uring_entry(req->opcode);
 6743
 6744	switch (req->opcode) {
 6745	case IORING_OP_NOP:
 6746		ret = io_nop(req, issue_flags);
 6747		break;
 6748	case IORING_OP_READV:
 6749	case IORING_OP_READ_FIXED:
 6750	case IORING_OP_READ:
 6751		ret = io_read(req, issue_flags);
 6752		break;
 6753	case IORING_OP_WRITEV:
 6754	case IORING_OP_WRITE_FIXED:
 6755	case IORING_OP_WRITE:
 6756		ret = io_write(req, issue_flags);
 6757		break;
 6758	case IORING_OP_FSYNC:
 6759		ret = io_fsync(req, issue_flags);
 6760		break;
 6761	case IORING_OP_POLL_ADD:
 6762		ret = io_poll_add(req, issue_flags);
 6763		break;
 6764	case IORING_OP_POLL_REMOVE:
 6765		ret = io_poll_update(req, issue_flags);
 6766		break;
 6767	case IORING_OP_SYNC_FILE_RANGE:
 6768		ret = io_sync_file_range(req, issue_flags);
 6769		break;
 6770	case IORING_OP_SENDMSG:
 6771		ret = io_sendmsg(req, issue_flags);
 6772		break;
 6773	case IORING_OP_SEND:
 6774		ret = io_send(req, issue_flags);
 6775		break;
 6776	case IORING_OP_RECVMSG:
 6777		ret = io_recvmsg(req, issue_flags);
 6778		break;
 6779	case IORING_OP_RECV:
 6780		ret = io_recv(req, issue_flags);
 6781		break;
 6782	case IORING_OP_TIMEOUT:
 6783		ret = io_timeout(req, issue_flags);
 6784		break;
 6785	case IORING_OP_TIMEOUT_REMOVE:
 6786		ret = io_timeout_remove(req, issue_flags);
 6787		break;
 6788	case IORING_OP_ACCEPT:
 6789		ret = io_accept(req, issue_flags);
 6790		break;
 6791	case IORING_OP_CONNECT:
 6792		ret = io_connect(req, issue_flags);
 6793		break;
 6794	case IORING_OP_ASYNC_CANCEL:
 6795		ret = io_async_cancel(req, issue_flags);
 6796		break;
 6797	case IORING_OP_FALLOCATE:
 6798		ret = io_fallocate(req, issue_flags);
 6799		break;
 6800	case IORING_OP_OPENAT:
 6801		ret = io_openat(req, issue_flags);
 6802		break;
 6803	case IORING_OP_CLOSE:
 6804		ret = io_close(req, issue_flags);
 6805		break;
 6806	case IORING_OP_FILES_UPDATE:
 6807		ret = io_files_update(req, issue_flags);
 6808		break;
 6809	case IORING_OP_STATX:
 6810		ret = io_statx(req, issue_flags);
 6811		break;
 6812	case IORING_OP_FADVISE:
 6813		ret = io_fadvise(req, issue_flags);
 6814		break;
 6815	case IORING_OP_MADVISE:
 6816		ret = io_madvise(req, issue_flags);
 6817		break;
 6818	case IORING_OP_OPENAT2:
 6819		ret = io_openat2(req, issue_flags);
 6820		break;
 6821	case IORING_OP_EPOLL_CTL:
 6822		ret = io_epoll_ctl(req, issue_flags);
 6823		break;
 6824	case IORING_OP_SPLICE:
 6825		ret = io_splice(req, issue_flags);
 6826		break;
 6827	case IORING_OP_PROVIDE_BUFFERS:
 6828		ret = io_provide_buffers(req, issue_flags);
 6829		break;
 6830	case IORING_OP_REMOVE_BUFFERS:
 6831		ret = io_remove_buffers(req, issue_flags);
 6832		break;
 6833	case IORING_OP_TEE:
 6834		ret = io_tee(req, issue_flags);
 6835		break;
 6836	case IORING_OP_SHUTDOWN:
 6837		ret = io_shutdown(req, issue_flags);
 6838		break;
 6839	case IORING_OP_RENAMEAT:
 6840		ret = io_renameat(req, issue_flags);
 6841		break;
 6842	case IORING_OP_UNLINKAT:
 6843		ret = io_unlinkat(req, issue_flags);
 6844		break;
 6845	case IORING_OP_MKDIRAT:
 6846		ret = io_mkdirat(req, issue_flags);
 6847		break;
 6848	case IORING_OP_SYMLINKAT:
 6849		ret = io_symlinkat(req, issue_flags);
 6850		break;
 6851	case IORING_OP_LINKAT:
 6852		ret = io_linkat(req, issue_flags);
 6853		break;
 6854	default:
 6855		ret = -EINVAL;
 6856		break;
 6857	}
 6858
 6859	if (!io_op_defs[req->opcode].audit_skip)
 6860		audit_uring_exit(!ret, ret);
 6861
 6862	if (creds)
 6863		revert_creds(creds);
 6864	if (ret)
 6865		return ret;
 6866	/* If the op doesn't have a file, we're not polling for it */
 6867	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
 6868		io_iopoll_req_issued(req, issue_flags);
 6869
 6870	return 0;
 6871}
 6872
 6873static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
 6874{
 6875	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 6876
 6877	req = io_put_req_find_next(req);
 6878	return req ? &req->work : NULL;
 6879}
 6880
 6881static void io_wq_submit_work(struct io_wq_work *work)
 6882{
 6883	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 6884	unsigned int issue_flags = IO_URING_F_UNLOCKED;
 6885	bool needs_poll = false;
 6886	struct io_kiocb *timeout;
 6887	int ret = 0;
 6888
 6889	/* one will be dropped by ->io_free_work() after returning to io-wq */
 6890	if (!(req->flags & REQ_F_REFCOUNT))
 6891		__io_req_set_refcount(req, 2);
 6892	else
 6893		req_ref_get(req);
 6894
 6895	timeout = io_prep_linked_timeout(req);
 6896	if (timeout)
 6897		io_queue_linked_timeout(timeout);
 6898
 6899	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
 6900	if (work->flags & IO_WQ_WORK_CANCEL) {
 6901		io_req_task_queue_fail(req, -ECANCELED);
 6902		return;
 6903	}
 6904
 6905	if (req->flags & REQ_F_FORCE_ASYNC) {
 6906		const struct io_op_def *def = &io_op_defs[req->opcode];
 6907		bool opcode_poll = def->pollin || def->pollout;
 6908
 6909		if (opcode_poll && file_can_poll(req->file)) {
 6910			needs_poll = true;
 6911			issue_flags |= IO_URING_F_NONBLOCK;
 6912		}
 6913	}
 6914
 6915	do {
 6916		ret = io_issue_sqe(req, issue_flags);
 6917		if (ret != -EAGAIN)
 6918			break;
 6919		/*
 6920		 * We can get EAGAIN for iopolled IO even though we're
 6921		 * forcing a sync submission from here, since we can't
 6922		 * wait for request slots on the block side.
 6923		 */
 6924		if (!needs_poll) {
 6925			cond_resched();
 6926			continue;
 6927		}
 6928
 6929		if (io_arm_poll_handler(req) == IO_APOLL_OK)
 6930			return;
 6931		/* aborted or ready, in either case retry blocking */
 6932		needs_poll = false;
 6933		issue_flags &= ~IO_URING_F_NONBLOCK;
 6934	} while (1);
 6935
 6936	/* avoid locking problems by failing it from a clean context */
 6937	if (ret)
 6938		io_req_task_queue_fail(req, ret);
 6939}
 6940
 6941static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
 6942						       unsigned i)
 6943{
 6944	return &table->files[i];
 6945}
 6946
 6947static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
 6948					      int index)
 6949{
 6950	struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
 6951
 6952	return (struct file *) (slot->file_ptr & FFS_MASK);
 6953}
 6954
 6955static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
 6956{
 6957	unsigned long file_ptr = (unsigned long) file;
 6958
 6959	file_ptr |= io_file_get_flags(file);
 6960	file_slot->file_ptr = file_ptr;
 6961}
 6962
 6963static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
 6964					     struct io_kiocb *req, int fd)
 6965{
 6966	struct file *file;
 6967	unsigned long file_ptr;
 6968
 6969	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
 6970		return NULL;
 6971	fd = array_index_nospec(fd, ctx->nr_user_files);
 6972	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
 6973	file = (struct file *) (file_ptr & FFS_MASK);
 6974	file_ptr &= ~FFS_MASK;
 6975	/* mask in overlapping REQ_F and FFS bits */
 6976	req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
 6977	io_req_set_rsrc_node(req, ctx);
 6978	return file;
 6979}
 6980
 6981static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
 6982				       struct io_kiocb *req, int fd)
 6983{
 6984	struct file *file = fget(fd);
 6985
 6986	trace_io_uring_file_get(ctx, fd);
 6987
 6988	/* we don't allow fixed io_uring files */
 6989	if (file && unlikely(file->f_op == &io_uring_fops))
 6990		io_req_track_inflight(req);
 6991	return file;
 6992}
 6993
 6994static inline struct file *io_file_get(struct io_ring_ctx *ctx,
 6995				       struct io_kiocb *req, int fd, bool fixed)
 6996{
 6997	if (fixed)
 6998		return io_file_get_fixed(ctx, req, fd);
 6999	else
 7000		return io_file_get_normal(ctx, req, fd);
 7001}
 7002
 7003static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
 7004{
 7005	struct io_kiocb *prev = req->timeout.prev;
 7006	int ret = -ENOENT;
 7007
 7008	if (prev) {
 7009		if (!(req->task->flags & PF_EXITING))
 7010			ret = io_try_cancel_userdata(req, prev->user_data);
 7011		io_req_complete_post(req, ret ?: -ETIME, 0);
 7012		io_put_req(prev);
 7013	} else {
 7014		io_req_complete_post(req, -ETIME, 0);
 7015	}
 7016}
 7017
 7018static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 7019{
 7020	struct io_timeout_data *data = container_of(timer,
 7021						struct io_timeout_data, timer);
 7022	struct io_kiocb *prev, *req = data->req;
 7023	struct io_ring_ctx *ctx = req->ctx;
 7024	unsigned long flags;
 7025
 7026	spin_lock_irqsave(&ctx->timeout_lock, flags);
 7027	prev = req->timeout.head;
 7028	req->timeout.head = NULL;
 7029
 7030	/*
 7031	 * We don't expect the list to be empty, that will only happen if we
 7032	 * race with the completion of the linked work.
 7033	 */
 7034	if (prev) {
 7035		io_remove_next_linked(prev);
 7036		if (!req_ref_inc_not_zero(prev))
 7037			prev = NULL;
 7038	}
 7039	list_del(&req->timeout.list);
 7040	req->timeout.prev = prev;
 7041	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
 7042
 7043	req->io_task_work.func = io_req_task_link_timeout;
 7044	io_req_task_work_add(req, false);
 7045	return HRTIMER_NORESTART;
 7046}
 7047
 7048static void io_queue_linked_timeout(struct io_kiocb *req)
 7049{
 7050	struct io_ring_ctx *ctx = req->ctx;
 7051
 7052	spin_lock_irq(&ctx->timeout_lock);
 7053	/*
 7054	 * If the back reference is NULL, then our linked request finished
 7055	 * before we got a chance to setup the timer
 7056	 */
 7057	if (req->timeout.head) {
 7058		struct io_timeout_data *data = req->async_data;
 7059
 7060		data->timer.function = io_link_timeout_fn;
 7061		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
 7062				data->mode);
 7063		list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
 7064	}
 7065	spin_unlock_irq(&ctx->timeout_lock);
 7066	/* drop submission reference */
 7067	io_put_req(req);
 7068}
 7069
 7070static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
 7071	__must_hold(&req->ctx->uring_lock)
 7072{
 7073	struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
 7074
 7075	switch (io_arm_poll_handler(req)) {
 7076	case IO_APOLL_READY:
 7077		io_req_task_queue(req);
 7078		break;
 7079	case IO_APOLL_ABORTED:
 7080		/*
 7081		 * Queued up for async execution, worker will release
 7082		 * submit reference when the iocb is actually submitted.
 7083		 */
 7084		io_queue_async_work(req, NULL);
 7085		break;
 7086	}
 7087
 7088	if (linked_timeout)
 7089		io_queue_linked_timeout(linked_timeout);
 7090}
 7091
 7092static inline void __io_queue_sqe(struct io_kiocb *req)
 7093	__must_hold(&req->ctx->uring_lock)
 7094{
 7095	struct io_kiocb *linked_timeout;
 7096	int ret;
 7097
 7098	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
 7099
 7100	if (req->flags & REQ_F_COMPLETE_INLINE) {
 7101		io_req_add_compl_list(req);
 7102		return;
 7103	}
 7104	/*
 7105	 * We async punt it if the file wasn't marked NOWAIT, or if the file
 7106	 * doesn't support non-blocking read/write attempts
 7107	 */
 7108	if (likely(!ret)) {
 7109		linked_timeout = io_prep_linked_timeout(req);
 7110		if (linked_timeout)
 7111			io_queue_linked_timeout(linked_timeout);
 7112	} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
 7113		io_queue_sqe_arm_apoll(req);
 7114	} else {
 7115		io_req_complete_failed(req, ret);
 7116	}
 7117}
 7118
 7119static void io_queue_sqe_fallback(struct io_kiocb *req)
 7120	__must_hold(&req->ctx->uring_lock)
 7121{
 7122	if (req->flags & REQ_F_FAIL) {
 7123		io_req_complete_fail_submit(req);
 7124	} else if (unlikely(req->ctx->drain_active)) {
 7125		io_drain_req(req);
 7126	} else {
 7127		int ret = io_req_prep_async(req);
 7128
 7129		if (unlikely(ret))
 7130			io_req_complete_failed(req, ret);
 7131		else
 7132			io_queue_async_work(req, NULL);
 7133	}
 7134}
 7135
 7136static inline void io_queue_sqe(struct io_kiocb *req)
 7137	__must_hold(&req->ctx->uring_lock)
 7138{
 7139	if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
 7140		__io_queue_sqe(req);
 7141	else
 7142		io_queue_sqe_fallback(req);
 7143}
 7144
 7145/*
 7146 * Check SQE restrictions (opcode and flags).
 7147 *
 7148 * Returns 'true' if SQE is allowed, 'false' otherwise.
 7149 */
 7150static inline bool io_check_restriction(struct io_ring_ctx *ctx,
 7151					struct io_kiocb *req,
 7152					unsigned int sqe_flags)
 7153{
 7154	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
 7155		return false;
 7156
 7157	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
 7158	    ctx->restrictions.sqe_flags_required)
 7159		return false;
 7160
 7161	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
 7162			  ctx->restrictions.sqe_flags_required))
 7163		return false;
 7164
 7165	return true;
 7166}
 7167
 7168static void io_init_req_drain(struct io_kiocb *req)
 7169{
 7170	struct io_ring_ctx *ctx = req->ctx;
 7171	struct io_kiocb *head = ctx->submit_state.link.head;
 7172
 7173	ctx->drain_active = true;
 7174	if (head) {
 7175		/*
 7176		 * If we need to drain a request in the middle of a link, drain
 7177		 * the head request and the next request/link after the current
 7178		 * link. Considering sequential execution of links,
 7179		 * REQ_F_IO_DRAIN will be maintained for every request of our
 7180		 * link.
 7181		 */
 7182		head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
 7183		ctx->drain_next = true;
 7184	}
 7185}
 7186
 7187static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 7188		       const struct io_uring_sqe *sqe)
 7189	__must_hold(&ctx->uring_lock)
 7190{
 7191	unsigned int sqe_flags;
 7192	int personality;
 7193	u8 opcode;
 7194
 7195	/* req is partially pre-initialised, see io_preinit_req() */
 7196	req->opcode = opcode = READ_ONCE(sqe->opcode);
 7197	/* same numerical values with corresponding REQ_F_*, safe to copy */
 7198	req->flags = sqe_flags = READ_ONCE(sqe->flags);
 7199	req->user_data = READ_ONCE(sqe->user_data);
 7200	req->file = NULL;
 7201	req->fixed_rsrc_refs = NULL;
 7202	req->task = current;
 7203
 7204	if (unlikely(opcode >= IORING_OP_LAST)) {
 7205		req->opcode = 0;
 7206		return -EINVAL;
 7207	}
 7208	if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
 7209		/* enforce forwards compatibility on users */
 7210		if (sqe_flags & ~SQE_VALID_FLAGS)
 7211			return -EINVAL;
 7212		if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
 7213		    !io_op_defs[opcode].buffer_select)
 7214			return -EOPNOTSUPP;
 7215		if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
 7216			ctx->drain_disabled = true;
 7217		if (sqe_flags & IOSQE_IO_DRAIN) {
 7218			if (ctx->drain_disabled)
 7219				return -EOPNOTSUPP;
 7220			io_init_req_drain(req);
 7221		}
 7222	}
 7223	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
 7224		if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
 7225			return -EACCES;
 7226		/* knock it to the slow queue path, will be drained there */
 7227		if (ctx->drain_active)
 7228			req->flags |= REQ_F_FORCE_ASYNC;
 7229		/* if there is no link, we're at "next" request and need to drain */
 7230		if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
 7231			ctx->drain_next = false;
 7232			ctx->drain_active = true;
 7233			req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
 7234		}
 7235	}
 7236
 7237	if (io_op_defs[opcode].needs_file) {
 7238		struct io_submit_state *state = &ctx->submit_state;
 7239
 7240		/*
 7241		 * Plug now if we have more than 2 IO left after this, and the
 7242		 * target is potentially a read/write to block based storage.
 7243		 */
 7244		if (state->need_plug && io_op_defs[opcode].plug) {
 7245			state->plug_started = true;
 7246			state->need_plug = false;
 7247			blk_start_plug_nr_ios(&state->plug, state->submit_nr);
 7248		}
 7249
 7250		req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
 7251					(sqe_flags & IOSQE_FIXED_FILE));
 7252		if (unlikely(!req->file))
 7253			return -EBADF;
 7254	}
 7255
 7256	personality = READ_ONCE(sqe->personality);
 7257	if (personality) {
 7258		int ret;
 7259
 7260		req->creds = xa_load(&ctx->personalities, personality);
 7261		if (!req->creds)
 7262			return -EINVAL;
 7263		get_cred(req->creds);
 7264		ret = security_uring_override_creds(req->creds);
 7265		if (ret) {
 7266			put_cred(req->creds);
 7267			return ret;
 7268		}
 7269		req->flags |= REQ_F_CREDS;
 7270	}
 7271
 7272	return io_req_prep(req, sqe);
 7273}
 7274
/*
 * Initialise @req from @sqe, then either queue it for execution or attach it
 * to the link chain being assembled in ctx->submit_state.link.
 *
 * Returns 0 once the request has been queued or linked (this includes
 * requests that were marked failed as part of a link). Returns the
 * io_init_req() error only when initialisation fails for a request that is
 * neither appended to an existing chain nor starting a new one.
 */
static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
			 const struct io_uring_sqe *sqe)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_link *link = &ctx->submit_state.link;
	int ret;

	ret = io_init_req(ctx, req, sqe);
	if (unlikely(ret)) {
		trace_io_uring_req_failed(sqe, ret);

		/* fail even hard links since we don't submit */
		if (link->head) {
			/*
			 * Whether a linked request failed or was cancelled can
			 * be judged by REQ_F_FAIL, but the head is an exception:
			 * it may carry REQ_F_FAIL because of another request's
			 * failure. req->result is therefore used to tell apart
			 * a head that failed on its own from one that failed
			 * because of a later request, so that the right return
			 * code can be set for it. The result is initialised
			 * here to avoid touching the normal path.
			 */
			if (!(link->head->flags & REQ_F_FAIL))
				req_fail_link_node(link->head, -ECANCELED);
		} else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
			/*
			 * The current request is a normal (unlinked) one:
			 * complete it with the error and return it, which
			 * breaks the submission loop.
			 */
			io_req_complete_failed(req, ret);
			return ret;
		}
		/* part of a link: record the failure and keep building the chain */
		req_fail_link_node(req, ret);
	}

	/* don't need @sqe from now on */
	trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
				  req->flags, true,
				  ctx->flags & IORING_SETUP_SQPOLL);

	/*
	 * If we already have a head request, queue this one for async
	 * submittal once the head completes. If we don't have a head but
	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
	 * submitted sync once the chain is complete. If none of those
	 * conditions are true (normal request), then just queue it.
	 */
	if (link->head) {
		struct io_kiocb *head = link->head;

		if (!(req->flags & REQ_F_FAIL)) {
			ret = io_req_prep_async(req);
			if (unlikely(ret)) {
				/* async prep failed: fail this request and cancel the head */
				req_fail_link_node(req, ret);
				if (!(head->flags & REQ_F_FAIL))
					req_fail_link_node(head, -ECANCELED);
			}
		}
		trace_io_uring_link(ctx, req, head);
		/* append @req to the tail of the chain */
		link->last->link = req;
		link->last = req;

		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
			return 0;
		/* last request of a link, enqueue the link */
		link->head = NULL;
		req = head;
	} else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
		/* first request of a new chain: remember it, submit later */
		link->head = req;
		link->last = req;
		return 0;
	}

	io_queue_sqe(req);
	return 0;
}
 7351
 7352/*
 7353 * Batched submission is done, ensure local IO is flushed out.
 7354 */
 7355static void io_submit_state_end(struct io_ring_ctx *ctx)
 7356{
 7357	struct io_submit_state *state = &ctx->submit_state;
 7358
 7359	if (state->link.head)
 7360		io_queue_sqe(state->link.head);
 7361	/* flush only after queuing links as they can generate completions */
 7362	io_submit_flush_completions(ctx);
 7363	if (state->plug_started)
 7364		blk_finish_plug(&state->plug);
 7365}
 7366
 7367/*
 7368 * Start submission side cache.
 7369 */
 7370static void io_submit_state_start(struct io_submit_state *state,
 7371				  unsigned int max_ios)
 7372{
 7373	state->plug_started = false;
 7374	state->need_plug = max_ios > 2;
 7375	state->submit_nr = max_ios;
 7376	/* set only head, no need to init link_last in advance */
 7377	state->link.head = NULL;
 7378}
 7379
 7380static void io_commit_sqring(struct io_ring_ctx *ctx)
 7381{
 7382	struct io_rings *rings = ctx->rings;
 7383
 7384	/*
 7385	 * Ensure any loads from the SQEs are done at this point,
 7386	 * since once we write the new head, the application could
 7387	 * write new data to them.
 7388	 */
 7389	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
 7390}
 7391
 7392/*
 7393 * Fetch an sqe, if one is available. Note this returns a pointer to memory
 7394 * that is mapped by userspace. This means that care needs to be taken to
 7395 * ensure that reads are stable, as we cannot rely on userspace always
 7396 * being a good citizen. If members of the sqe are validated and then later
 7397 * used, it's important that those reads are done through READ_ONCE() to
 7398 * prevent a re-load down the line.
 7399 */
 7400static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 7401{
 7402	unsigned head, mask = ctx->sq_entries - 1;
 7403	unsigned sq_idx = ctx->cached_sq_head++ & mask;
 7404
 7405	/*
 7406	 * The cached sq head (or cq tail) serves two purposes:
 7407	 *
 7408	 * 1) allows us to batch the cost of updating the user visible
 7409	 *    head updates.
 7410	 * 2) allows the kernel side to track the head on its own, even
 7411	 *    though the application is the one updating it.
 7412	 */
 7413	head = READ_ONCE(ctx->sq_array[sq_idx]);
 7414	if (likely(head < ctx->sq_entries))
 7415		return &ctx->sq_sqes[head];
 7416
 7417	/* drop invalid entries */
 7418	ctx->cq_extra--;
 7419	WRITE_ONCE(ctx->rings->sq_dropped,
 7420		   READ_ONCE(ctx->rings->sq_dropped) + 1);
 7421	return NULL;
 7422}
 7423
 7424static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 7425	__must_hold(&ctx->uring_lock)
 7426{
 7427	unsigned int entries = io_sqring_entries(ctx);
 7428	int submitted = 0;
 7429
 7430	if (unlikely(!entries))
 7431		return 0;
 7432	/* make sure SQ entry isn't read before tail */
 7433	nr = min3(nr, ctx->sq_entries, entries);
 7434	io_get_task_refs(nr);
 7435
 7436	io_submit_state_start(&ctx->submit_state, nr);
 7437	do {
 7438		const struct io_uring_sqe *sqe;
 7439		struct io_kiocb *req;
 7440
 7441		if (unlikely(!io_alloc_req_refill(ctx))) {
 7442			if (!submitted)
 7443				submitted = -EAGAIN;
 7444			break;
 7445		}
 7446		req = io_alloc_req(ctx);
 7447		sqe = io_get_sqe(ctx);
 7448		if (unlikely(!sqe)) {
 7449			wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
 7450			break;
 7451		}
 7452		/* will complete beyond this point, count as submitted */
 7453		submitted++;
 7454		if (io_submit_sqe(ctx, req, sqe))
 7455			break;
 7456	} while (submitted < nr);
 7457
 7458	if (unlikely(submitted != nr)) {
 7459		int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
 7460		int unused = nr - ref_used;
 7461
 7462		current->io_uring->cached_refs += unused;
 7463	}
 7464
 7465	io_submit_state_end(ctx);
 7466	 /* Commit SQ ring head once we've consumed and submitted all SQEs */
 7467	io_commit_sqring(ctx);
 7468
 7469	return submitted;
 7470}
 7471
 7472static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
 7473{
 7474	return READ_ONCE(sqd->state);
 7475}
 7476
 7477static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
 7478{
 7479	/* Tell userspace we may need a wakeup call */
 7480	spin_lock(&ctx->completion_lock);
 7481	WRITE_ONCE(ctx->rings->sq_flags,
 7482		   ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
 7483	spin_unlock(&ctx->completion_lock);
 7484}
 7485
 7486static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
 7487{
 7488	spin_lock(&ctx->completion_lock);
 7489	WRITE_ONCE(ctx->rings->sq_flags,
 7490		   ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
 7491	spin_unlock(&ctx->completion_lock);
 7492}
 7493
 7494static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 7495{
 7496	unsigned int to_submit;
 7497	int ret = 0;
 7498
 7499	to_submit = io_sqring_entries(ctx);
 7500	/* if we're handling multiple rings, cap submit size for fairness */
 7501	if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
 7502		to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
 7503
 7504	if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
 7505		const struct cred *creds = NULL;
 7506
 7507		if (ctx->sq_creds != current_cred())
 7508			creds = override_creds(ctx->sq_creds);
 7509
 7510		mutex_lock(&ctx->uring_lock);
 7511		if (!wq_list_empty(&ctx->iopoll_list))
 7512			io_do_iopoll(ctx, true);
 7513
 7514		/*
 7515		 * Don't submit if refs are dying, good for io_uring_register(),
 7516		 * but also it is relied upon by io_ring_exit_work()
 7517		 */
 7518		if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
 7519		    !(ctx->flags & IORING_SETUP_R_DISABLED))
 7520			ret = io_submit_sqes(ctx, to_submit);
 7521		mutex_unlock(&ctx->uring_lock);
 7522
 7523		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
 7524			wake_up(&ctx->sqo_sq_wait);
 7525		if (creds)
 7526			revert_creds(creds);
 7527	}
 7528
 7529	return ret;
 7530}
 7531
 7532static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
 7533{
 7534	struct io_ring_ctx *ctx;
 7535	unsigned sq_thread_idle = 0;
 7536
 7537	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 7538		sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
 7539	sqd->sq_thread_idle = sq_thread_idle;
 7540}
 7541
 7542static bool io_sqd_handle_event(struct io_sq_data *sqd)
 7543{
 7544	bool did_sig = false;
 7545	struct ksignal ksig;
 7546
 7547	if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
 7548	    signal_pending(current)) {
 7549		mutex_unlock(&sqd->lock);
 7550		if (signal_pending(current))
 7551			did_sig = get_signal(&ksig);
 7552		cond_resched();
 7553		mutex_lock(&sqd->lock);
 7554	}
 7555	return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
 7556}
 7557
/*
 * Thread function of the SQ poll thread. @data is the struct io_sq_data
 * shared by every ring on sqd->ctx_list.
 *
 * The thread loops with sqd->lock held: it handles park/stop/signal events,
 * runs one submission/poll pass per attached ring, and keeps spinning while
 * work was found or the idle timeout has not expired. Otherwise it sets
 * IORING_SQ_NEED_WAKEUP on every ring and sleeps on sqd->wait.
 *
 * Never returns: on exit it signals sqd->exited and calls do_exit(0).
 */
static int io_sq_thread(void *data)
{
	struct io_sq_data *sqd = data;
	struct io_ring_ctx *ctx;
	unsigned long timeout = 0;
	char buf[TASK_COMM_LEN];
	DEFINE_WAIT(wait);

	/* name the task after the pid recorded in the sqd */
	snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
	set_task_comm(current, buf);

	/* bind to the requested CPU, or allow every online CPU if none was given */
	if (sqd->sq_cpu != -1)
		set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
	else
		set_cpus_allowed_ptr(current, cpu_online_mask);
	current->flags |= PF_NO_SETAFFINITY;

	audit_alloc_kernel(current);

	mutex_lock(&sqd->lock);
	while (1) {
		bool cap_entries, sqt_spin = false;

		if (io_sqd_events_pending(sqd) || signal_pending(current)) {
			/* a true return means we were asked to stop or got a signal */
			if (io_sqd_handle_event(sqd))
				break;
			timeout = jiffies + sqd->sq_thread_idle;
		}

		/* cap per-ring batches only when more than one ring is attached */
		cap_entries = !list_is_singular(&sqd->ctx_list);
		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
			int ret = __io_sq_thread(ctx, cap_entries);

			/* keep spinning if anything was submitted or iopoll work remains */
			if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
				sqt_spin = true;
		}
		if (io_run_task_work())
			sqt_spin = true;

		if (sqt_spin || !time_after(jiffies, timeout)) {
			cond_resched();
			/* only real work pushes the idle deadline out */
			if (sqt_spin)
				timeout = jiffies + sqd->sq_thread_idle;
			continue;
		}

		/* idle period expired: get ready to sleep on sqd->wait */
		prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
		if (!io_sqd_events_pending(sqd) && !current->task_works) {
			bool needs_sched = true;

			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
				io_ring_set_wakeup_flag(ctx);

				/* re-check for work after the flag became visible */
				if ((ctx->flags & IORING_SETUP_IOPOLL) &&
				    !wq_list_empty(&ctx->iopoll_list)) {
					needs_sched = false;
					break;
				}
				if (io_sqring_entries(ctx)) {
					needs_sched = false;
					break;
				}
			}

			if (needs_sched) {
				/* sleep without the lock held */
				mutex_unlock(&sqd->lock);
				schedule();
				mutex_lock(&sqd->lock);
			}
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_ring_clear_wakeup_flag(ctx);
		}

		finish_wait(&sqd->wait, &wait);
		timeout = jiffies + sqd->sq_thread_idle;
	}

	/* exit path: cancel, detach from the sqd and leave the wakeup flag set */
	io_uring_cancel_generic(true, sqd);
	sqd->thread = NULL;
	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
		io_ring_set_wakeup_flag(ctx);
	io_run_task_work();
	mutex_unlock(&sqd->lock);

	audit_free(current);

	complete(&sqd->exited);
	do_exit(0);
}
 7647
/*
 * Per-waiter state used by io_cqring_wait() while sleeping on ctx->cq_wait.
 */
struct io_wait_queue {
	/* wait queue entry; its wake function is io_wake_function() */
	struct wait_queue_entry wq;
	/* ring being waited on */
	struct io_ring_ctx *ctx;
	/* CQ tail value to wait for: CQ head at wait start plus min_events */
	unsigned cq_tail;
	/* snapshot of ctx->cq_timeouts taken when the wait started */
	unsigned nr_timeouts;
};
 7654
 7655static inline bool io_should_wake(struct io_wait_queue *iowq)
 7656{
 7657	struct io_ring_ctx *ctx = iowq->ctx;
 7658	int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
 7659
 7660	/*
 7661	 * Wake up if we have enough events, or if a timeout occurred since we
 7662	 * started waiting. For timeouts, we always want to return to userspace,
 7663	 * regardless of event count.
 7664	 */
 7665	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
 7666}
 7667
 7668static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
 7669			    int wake_flags, void *key)
 7670{
 7671	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
 7672							wq);
 7673
 7674	/*
 7675	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
 7676	 * the task, and the next invocation will do it.
 7677	 */
 7678	if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
 7679		return autoremove_wake_function(curr, mode, wake_flags, key);
 7680	return -1;
 7681}
 7682
 7683static int io_run_task_work_sig(void)
 7684{
 7685	if (io_run_task_work())
 7686		return 1;
 7687	if (!signal_pending(current))
 7688		return 0;
 7689	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
 7690		return -ERESTARTSYS;
 7691	return -EINTR;
 7692}
 7693
 7694/* when returns >0, the caller should retry */
 7695static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 7696					  struct io_wait_queue *iowq,
 7697					  ktime_t timeout)
 7698{
 7699	int ret;
 7700
 7701	/* make sure we run task_work before checking for signals */
 7702	ret = io_run_task_work_sig();
 7703	if (ret || io_should_wake(iowq))
 7704		return ret;
 7705	/* let the caller flush overflows, retry */
 7706	if (test_bit(0, &ctx->check_cq_overflow))
 7707		return 1;
 7708
 7709	if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
 7710		return -ETIME;
 7711	return 1;
 7712}
 7713
 7714/*
 7715 * Wait until events become available, if we don't already have some. The
 7716 * application must reap them itself, as they reside on the shared cq ring.
 7717 */
 7718static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 7719			  const sigset_t __user *sig, size_t sigsz,
 7720			  struct __kernel_timespec __user *uts)
 7721{
 7722	struct io_wait_queue iowq;
 7723	struct io_rings *rings = ctx->rings;
 7724	ktime_t timeout = KTIME_MAX;
 7725	int ret;
 7726
 7727	do {
 7728		io_cqring_overflow_flush(ctx);
 7729		if (io_cqring_events(ctx) >= min_events)
 7730			return 0;
 7731		if (!io_run_task_work())
 7732			break;
 7733	} while (1);
 7734
 7735	if (uts) {
 7736		struct timespec64 ts;
 7737
 7738		if (get_timespec64(&ts, uts))
 7739			return -EFAULT;
 7740		timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
 7741	}
 7742
 7743	if (sig) {
 7744#ifdef CONFIG_COMPAT
 7745		if (in_compat_syscall())
 7746			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
 7747						      sigsz);
 7748		else
 7749#endif
 7750			ret = set_user_sigmask(sig, sigsz);
 7751
 7752		if (ret)
 7753			return ret;
 7754	}
 7755
 7756	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
 7757	iowq.wq.private = current;
 7758	INIT_LIST_HEAD(&iowq.wq.entry);
 7759	iowq.ctx = ctx;
 7760	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
 7761	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
 7762
 7763	trace_io_uring_cqring_wait(ctx, min_events);
 7764	do {
 7765		/* if we can't even flush overflow, don't wait for more */
 7766		if (!io_cqring_overflow_flush(ctx)) {
 7767			ret = -EBUSY;
 7768			break;
 7769		}
 7770		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
 7771						TASK_INTERRUPTIBLE);
 7772		ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
 7773		finish_wait(&ctx->cq_wait, &iowq.wq);
 7774		cond_resched();
 7775	} while (ret > 0);
 7776
 7777	restore_saved_sigmask_unless(ret == -EINTR);
 7778
 7779	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 7780}
 7781
 7782static void io_free_page_table(void **table, size_t size)
 7783{
 7784	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
 7785
 7786	for (i = 0; i < nr_tables; i++)
 7787		kfree(table[i]);
 7788	kfree(table);
 7789}
 7790
 7791static __cold void **io_alloc_page_table(size_t size)
 7792{
 7793	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
 7794	size_t init_size = size;
 7795	void **table;
 7796
 7797	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
 7798	if (!table)
 7799		return NULL;
 7800
 7801	for (i = 0; i < nr_tables; i++) {
 7802		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
 7803
 7804		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
 7805		if (!table[i]) {
 7806			io_free_page_table(table, init_size);
 7807			return NULL;
 7808		}
 7809		size -= this_size;
 7810	}
 7811	return table;
 7812}
 7813
/*
 * Release a resource node: tear down its percpu reference counter, then free
 * the node itself.
 */
static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
	percpu_ref_exit(&ref_node->refs);
	kfree(ref_node);
}
 7819
 7820static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
 7821{
 7822	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
 7823	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
 7824	unsigned long flags;
 7825	bool first_add = false;
 7826	unsigned long delay = HZ;
 7827
 7828	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
 7829	node->done = true;
 7830
 7831	/* if we are mid-quiesce then do not delay */
 7832	if (node->rsrc_data->quiesce)
 7833		delay = 0;
 7834
 7835	while (!list_empty(&ctx->rsrc_ref_list)) {
 7836		node = list_first_entry(&ctx->rsrc_ref_list,
 7837					    struct io_rsrc_node, node);
 7838		/* recycle ref nodes in order */
 7839		if (!node->done)
 7840			break;
 7841		list_del(&node->node);
 7842		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
 7843	}
 7844	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
 7845
 7846	if (first_add)
 7847		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
 7848}
 7849
 7850static struct io_rsrc_node *io_rsrc_node_alloc(void)
 7851{
 7852	struct io_rsrc_node *ref_node;
 7853
 7854	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
 7855	if (!ref_node)
 7856		return NULL;
 7857
 7858	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
 7859			    0, GFP_KERNEL)) {
 7860		kfree(ref_node);
 7861		return NULL;
 7862	}
 7863	INIT_LIST_HEAD(&ref_node->node);
 7864	INIT_LIST_HEAD(&ref_node->rsrc_list);
 7865	ref_node->done = false;
 7866	return ref_node;
 7867}
 7868
 7869static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 7870				struct io_rsrc_data *data_to_kill)
 7871	__must_hold(&ctx->uring_lock)
 7872{
 7873	WARN_ON_ONCE(!ctx->rsrc_backup_node);
 7874	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
 7875
 7876	io_rsrc_refs_drop(ctx);
 7877
 7878	if (data_to_kill) {
 7879		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
 7880
 7881		rsrc_node->rsrc_data = data_to_kill;
 7882		spin_lock_irq(&ctx->rsrc_ref_lock);
 7883		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
 7884		spin_unlock_irq(&ctx->rsrc_ref_lock);
 7885
 7886		atomic_inc(&data_to_kill->refs);
 7887		percpu_ref_kill(&rsrc_node->refs);
 7888		ctx->rsrc_node = NULL;
 7889	}
 7890
 7891	if (!ctx->rsrc_node) {
 7892		ctx->rsrc_node = ctx->rsrc_backup_node;
 7893		ctx->rsrc_backup_node = NULL;
 7894	}
 7895}
 7896
 7897static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
 7898{
 7899	if (ctx->rsrc_backup_node)
 7900		return 0;
 7901	ctx->rsrc_backup_node = io_rsrc_node_alloc();
 7902	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
 7903}
 7904
/*
 * Wait until all references on @data have been dropped.
 *
 * ctx->uring_lock is held on entry and on return, but is dropped while
 * waiting. Returns 0 once @data is quiesced, -ENXIO if a quiesce of @data is
 * already in progress, or a negative error from node allocation or from
 * io_run_task_work_sig() (pending signal).
 */
static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	data->quiesce = true;
	do {
		/* retire the current node on behalf of @data */
		ret = io_rsrc_node_switch_start(ctx);
		if (ret)
			break;
		io_rsrc_node_switch(ctx, data);

		/* kill initial ref, already quiesced if zero */
		if (atomic_dec_and_test(&data->refs))
			break;
		/* wait for the remaining references without holding the lock */
		mutex_unlock(&ctx->uring_lock);
		flush_delayed_work(&ctx->rsrc_put_work);
		ret = wait_for_completion_interruptible(&data->done);
		if (!ret) {
			mutex_lock(&ctx->uring_lock);
			if (atomic_read(&data->refs) > 0) {
				/*
				 * it has been revived by another thread while
				 * we were unlocked
				 */
				mutex_unlock(&ctx->uring_lock);
			} else {
				break;
			}
		}

		/* interrupted or revived: restore the initial ref and retry */
		atomic_inc(&data->refs);
		/* wait for all works potentially completing data->done */
		flush_delayed_work(&ctx->rsrc_put_work);
		reinit_completion(&data->done);

		/* a negative result (pending signal) ends the loop below */
		ret = io_run_task_work_sig();
		mutex_lock(&ctx->uring_lock);
	} while (ret >= 0);
	data->quiesce = false;

	return ret;
}
 7952
 7953static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
 7954{
 7955	unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
 7956	unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
 7957
 7958	return &data->tags[table_idx][off];
 7959}
 7960
 7961static void io_rsrc_data_free(struct io_rsrc_data *data)
 7962{
 7963	size_t size = data->nr * sizeof(data->tags[0][0]);
 7964
 7965	if (data->tags)
 7966		io_free_page_table((void **)data->tags, size);
 7967	kfree(data);
 7968}
 7969
 7970static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
 7971				     u64 __user *utags, unsigned nr,
 7972				     struct io_rsrc_data **pdata)
 7973{
 7974	struct io_rsrc_data *data;
 7975	int ret = -ENOMEM;
 7976	unsigned i;
 7977
 7978	data = kzalloc(sizeof(*data), GFP_KERNEL);
 7979	if (!data)
 7980		return -ENOMEM;
 7981	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
 7982	if (!data->tags) {
 7983		kfree(data);
 7984		return -ENOMEM;
 7985	}
 7986
 7987	data->nr = nr;
 7988	data->ctx = ctx;
 7989	data->do_put = do_put;
 7990	if (utags) {
 7991		ret = -EFAULT;
 7992		for (i = 0; i < nr; i++) {
 7993			u64 *tag_slot = io_get_tag_slot(data, i);
 7994
 7995			if (copy_from_user(tag_slot, &utags[i],
 7996					   sizeof(*tag_slot)))
 7997				goto fail;
 7998		}
 7999	}
 8000
 8001	atomic_set(&data->refs, 1);
 8002	init_completion(&data->done);
 8003	*pdata = data;
 8004	return 0;
 8005fail:
 8006	io_rsrc_data_free(data);
 8007	return ret;
 8008}
 8009
 8010static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
 8011{
 8012	table->files = kvcalloc(nr_files, sizeof(table->files[0]),
 8013				GFP_KERNEL_ACCOUNT);
 8014	return !!table->files;
 8015}
 8016
/*
 * Free the fixed-file table and clear the pointer so the table reads as
 * unallocated afterwards.
 */
static void io_free_file_tables(struct io_file_table *table)
{
	kvfree(table->files);
	table->files = NULL;
}
 8022
 8023static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 8024{
 8025#if defined(CONFIG_UNIX)
 8026	if (ctx->ring_sock) {
 8027		struct sock *sock = ctx->ring_sock->sk;
 8028		struct sk_buff *skb;
 8029
 8030		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
 8031			kfree_skb(skb);
 8032	}
 8033#else
 8034	int i;
 8035
 8036	for (i = 0; i < ctx->nr_user_files; i++) {
 8037		struct file *file;
 8038
 8039		file = io_file_from_index(ctx, i);
 8040		if (file)
 8041			fput(file);
 8042	}
 8043#endif
 8044	io_free_file_tables(&ctx->file_table);
 8045	io_rsrc_data_free(ctx->file_data);
 8046	ctx->file_data = NULL;
 8047	ctx->nr_user_files = 0;
 8048}
 8049
 8050static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 8051{
 8052	int ret;
 8053
 8054	if (!ctx->file_data)
 8055		return -ENXIO;
 8056	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
 8057	if (!ret)
 8058		__io_sqe_files_unregister(ctx);
 8059	return ret;
 8060}
 8061
 8062static void io_sq_thread_unpark(struct io_sq_data *sqd)
 8063	__releases(&sqd->lock)
 8064{
 8065	WARN_ON_ONCE(sqd->thread == current);
 8066
 8067	/*
 8068	 * Do the dance but not conditional clear_bit() because it'd race with
 8069	 * other threads incrementing park_pending and setting the bit.
 8070	 */
 8071	clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
 8072	if (atomic_dec_return(&sqd->park_pending))
 8073		set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
 8074	mutex_unlock(&sqd->lock);
 8075}
 8076
 8077static void io_sq_thread_park(struct io_sq_data *sqd)
 8078	__acquires(&sqd->lock)
 8079{
 8080	WARN_ON_ONCE(sqd->thread == current);
 8081
 8082	atomic_inc(&sqd->park_pending);
 8083	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
 8084	mutex_lock(&sqd->lock);
 8085	if (sqd->thread)
 8086		wake_up_process(sqd->thread);
 8087}
 8088
 8089static void io_sq_thread_stop(struct io_sq_data *sqd)
 8090{
 8091	WARN_ON_ONCE(sqd->thread == current);
 8092	WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
 8093
 8094	set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
 8095	mutex_lock(&sqd->lock);
 8096	if (sqd->thread)
 8097		wake_up_process(sqd->thread);
 8098	mutex_unlock(&sqd->lock);
 8099	wait_for_completion(&sqd->exited);
 8100}
 8101
 8102static void io_put_sq_data(struct io_sq_data *sqd)
 8103{
 8104	if (refcount_dec_and_test(&sqd->refs)) {
 8105		WARN_ON_ONCE(atomic_read(&sqd->park_pending));
 8106
 8107		io_sq_thread_stop(sqd);
 8108		kfree(sqd);
 8109	}
 8110}
 8111
 8112static void io_sq_thread_finish(struct io_ring_ctx *ctx)
 8113{
 8114	struct io_sq_data *sqd = ctx->sq_data;
 8115
 8116	if (sqd) {
 8117		io_sq_thread_park(sqd);
 8118		list_del_init(&ctx->sqd_list);
 8119		io_sqd_update_thread_idle(sqd);
 8120		io_sq_thread_unpark(sqd);
 8121
 8122		io_put_sq_data(sqd);
 8123		ctx->sq_data = NULL;
 8124	}
 8125}
 8126
 8127static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
 8128{
 8129	struct io_ring_ctx *ctx_attach;
 8130	struct io_sq_data *sqd;
 8131	struct fd f;
 8132
 8133	f = fdget(p->wq_fd);
 8134	if (!f.file)
 8135		return ERR_PTR(-ENXIO);
 8136	if (f.file->f_op != &io_uring_fops) {
 8137		fdput(f);
 8138		return ERR_PTR(-EINVAL);
 8139	}
 8140
 8141	ctx_attach = f.file->private_data;
 8142	sqd = ctx_attach->sq_data;
 8143	if (!sqd) {
 8144		fdput(f);
 8145		return ERR_PTR(-EINVAL);
 8146	}
 8147	if (sqd->task_tgid != current->tgid) {
 8148		fdput(f);
 8149		return ERR_PTR(-EPERM);
 8150	}
 8151
 8152	refcount_inc(&sqd->refs);
 8153	fdput(f);
 8154	return sqd;
 8155}
 8156
 8157static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
 8158					 bool *attached)
 8159{
 8160	struct io_sq_data *sqd;
 8161
 8162	*attached = false;
 8163	if (p->flags & IORING_SETUP_ATTACH_WQ) {
 8164		sqd = io_attach_sq_data(p);
 8165		if (!IS_ERR(sqd)) {
 8166			*attached = true;
 8167			return sqd;
 8168		}
 8169		/* fall through for EPERM case, setup new sqd/task */
 8170		if (PTR_ERR(sqd) != -EPERM)
 8171			return sqd;
 8172	}
 8173
 8174	sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
 8175	if (!sqd)
 8176		return ERR_PTR(-ENOMEM);
 8177
 8178	atomic_set(&sqd->park_pending, 0);
 8179	refcount_set(&sqd->refs, 1);
 8180	INIT_LIST_HEAD(&sqd->ctx_list);
 8181	mutex_init(&sqd->lock);
 8182	init_waitqueue_head(&sqd->wait);
 8183	init_completion(&sqd->exited);
 8184	return sqd;
 8185}
 8186
 8187#if defined(CONFIG_UNIX)
 8188/*
 8189 * Ensure the UNIX gc is aware of our file set, so we are certain that
 8190 * the io_uring can be safely unregistered on process exit, even if we have
 8191 * loops in the file referencing.
 8192 */
 8193static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
 8194{
 8195	struct sock *sk = ctx->ring_sock->sk;
 8196	struct scm_fp_list *fpl;
 8197	struct sk_buff *skb;
 8198	int i, nr_files;
 8199
 8200	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
 8201	if (!fpl)
 8202		return -ENOMEM;
 8203
 8204	skb = alloc_skb(0, GFP_KERNEL);
 8205	if (!skb) {
 8206		kfree(fpl);
 8207		return -ENOMEM;
 8208	}
 8209
 8210	skb->sk = sk;
 8211
 8212	nr_files = 0;
 8213	fpl->user = get_uid(current_user());
 8214	for (i = 0; i < nr; i++) {
 8215		struct file *file = io_file_from_index(ctx, i + offset);
 8216
 8217		if (!file)
 8218			continue;
 8219		fpl->fp[nr_files] = get_file(file);
 8220		unix_inflight(fpl->user, fpl->fp[nr_files]);
 8221		nr_files++;
 8222	}
 8223
 8224	if (nr_files) {
 8225		fpl->max = SCM_MAX_FD;
 8226		fpl->count = nr_files;
 8227		UNIXCB(skb).fp = fpl;
 8228		skb->destructor = unix_destruct_scm;
 8229		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
 8230		skb_queue_head(&sk->sk_receive_queue, skb);
 8231
 8232		for (i = 0; i < nr_files; i++)
 8233			fput(fpl->fp[i]);
 8234	} else {
 8235		kfree_skb(skb);
 8236		kfree(fpl);
 8237	}
 8238
 8239	return 0;
 8240}
 8241
 8242/*
 8243 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
 8244 * causes regular reference counting to break down. We rely on the UNIX
 8245 * garbage collection to take care of this problem for us.
 8246 */
 8247static int io_sqe_files_scm(struct io_ring_ctx *ctx)
 8248{
 8249	unsigned left, total;
 8250	int ret = 0;
 8251
 8252	total = 0;
 8253	left = ctx->nr_user_files;
 8254	while (left) {
 8255		unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
 8256
 8257		ret = __io_sqe_files_scm(ctx, this_files, total);
 8258		if (ret)
 8259			break;
 8260		left -= this_files;
 8261		total += this_files;
 8262	}
 8263
 8264	if (!ret)
 8265		return 0;
 8266
 8267	while (total < ctx->nr_user_files) {
 8268		struct file *file = io_file_from_index(ctx, total);
 8269
 8270		if (file)
 8271			fput(file);
 8272		total++;
 8273	}
 8274
 8275	return ret;
 8276}
 8277#else
/* Without CONFIG_UNIX there is no in-flight accounting to do: always succeeds. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
	return 0;
}
 8282#endif
 8283
/*
 * Release the registered file carried by @prsrc.
 *
 * With CONFIG_UNIX the file is looked up in the SCM_RIGHTS lists of the skbs
 * queued on the ring socket's receive queue, removed from the list that
 * holds it, and fput(). Without CONFIG_UNIX it is fput() directly.
 */
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	/* local list collecting the skbs that must go back on the queue */
	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			/* close the gap left by the removed entry */
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
						left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				/* that was the last file in this skb: free it */
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			/* NULL marks the file as found for the check below */
			file = NULL;
			break;
		}

		if (!file)
			break;

		/* not in this skb: keep it aside and look at the next one */
		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	/* append the skbs that were set aside back onto the receive queue */
	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}
 8346
 8347static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
 8348{
 8349	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
 8350	struct io_ring_ctx *ctx = rsrc_data->ctx;
 8351	struct io_rsrc_put *prsrc, *tmp;
 8352
 8353	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
 8354		list_del(&prsrc->list);
 8355
 8356		if (prsrc->tag) {
 8357			bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
 8358
 8359			io_ring_submit_lock(ctx, lock_ring);
 8360			spin_lock(&ctx->completion_lock);
 8361			io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
 8362			io_commit_cqring(ctx);
 8363			spin_unlock(&ctx->completion_lock);
 8364			io_cqring_ev_posted(ctx);
 8365			io_ring_submit_unlock(ctx, lock_ring);
 8366		}
 8367
 8368		rsrc_data->do_put(ctx, prsrc);
 8369		kfree(prsrc);
 8370	}
 8371
 8372	io_rsrc_node_destroy(ref_node);
 8373	if (atomic_dec_and_test(&rsrc_data->refs))
 8374		complete(&rsrc_data->done);
 8375}
 8376
 8377static void io_rsrc_put_work(struct work_struct *work)
 8378{
 8379	struct io_ring_ctx *ctx;
 8380	struct llist_node *node;
 8381
 8382	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
 8383	node = llist_del_all(&ctx->rsrc_put_llist);
 8384
 8385	while (node) {
 8386		struct io_rsrc_node *ref_node;
 8387		struct llist_node *next = node->next;
 8388
 8389		ref_node = llist_entry(node, struct io_rsrc_node, llist);
 8390		__io_rsrc_put_work(ref_node);
 8391		node = next;
 8392	}
 8393}
 8394
 8395static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 8396				 unsigned nr_args, u64 __user *tags)
 8397{
 8398	__s32 __user *fds = (__s32 __user *) arg;
 8399	struct file *file;
 8400	int fd, ret;
 8401	unsigned i;
 8402
 8403	if (ctx->file_data)
 8404		return -EBUSY;
 8405	if (!nr_args)
 8406		return -EINVAL;
 8407	if (nr_args > IORING_MAX_FIXED_FILES)
 8408		return -EMFILE;
 8409	if (nr_args > rlimit(RLIMIT_NOFILE))
 8410		return -EMFILE;
 8411	ret = io_rsrc_node_switch_start(ctx);
 8412	if (ret)
 8413		return ret;
 8414	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
 8415				 &ctx->file_data);
 8416	if (ret)
 8417		return ret;
 8418
 8419	ret = -ENOMEM;
 8420	if (!io_alloc_file_tables(&ctx->file_table, nr_args))
 8421		goto out_free;
 8422
 8423	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
 8424		if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
 8425			ret = -EFAULT;
 8426			goto out_fput;
 8427		}
 8428		/* allow sparse sets */
 8429		if (fd == -1) {
 8430			ret = -EINVAL;
 8431			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
 8432				goto out_fput;
 8433			continue;
 8434		}
 8435
 8436		file = fget(fd);
 8437		ret = -EBADF;
 8438		if (unlikely(!file))
 8439			goto out_fput;
 8440
 8441		/*
 8442		 * Don't allow io_uring instances to be registered. If UNIX
 8443		 * isn't enabled, then this causes a reference cycle and this
 8444		 * instance can never get freed. If UNIX is enabled we'll
 8445		 * handle it just fine, but there's still no point in allowing
 8446		 * a ring fd as it doesn't support regular read/write anyway.
 8447		 */
 8448		if (file->f_op == &io_uring_fops) {
 8449			fput(file);
 8450			goto out_fput;
 8451		}
 8452		io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
 8453	}
 8454
 8455	ret = io_sqe_files_scm(ctx);
 8456	if (ret) {
 8457		__io_sqe_files_unregister(ctx);
 8458		return ret;
 8459	}
 8460
 8461	io_rsrc_node_switch(ctx, NULL);
 8462	return ret;
 8463out_fput:
 8464	for (i = 0; i < ctx->nr_user_files; i++) {
 8465		file = io_file_from_index(ctx, i);
 8466		if (file)
 8467			fput(file);
 8468	}
 8469	io_free_file_tables(&ctx->file_table);
 8470	ctx->nr_user_files = 0;
 8471out_free:
 8472	io_rsrc_data_free(ctx->file_data);
 8473	ctx->file_data = NULL;
 8474	return ret;
 8475}
 8476
 8477static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
 8478				int index)
 8479{
 8480#if defined(CONFIG_UNIX)
 8481	struct sock *sock = ctx->ring_sock->sk;
 8482	struct sk_buff_head *head = &sock->sk_receive_queue;
 8483	struct sk_buff *skb;
 8484
 8485	/*
 8486	 * See if we can merge this file into an existing skb SCM_RIGHTS
 8487	 * file set. If there's no room, fall back to allocating a new skb
 8488	 * and filling it in.
 8489	 */
 8490	spin_lock_irq(&head->lock);
 8491	skb = skb_peek(head);
 8492	if (skb) {
 8493		struct scm_fp_list *fpl = UNIXCB(skb).fp;
 8494
 8495		if (fpl->count < SCM_MAX_FD) {
 8496			__skb_unlink(skb, head);
 8497			spin_unlock_irq(&head->lock);
 8498			fpl->fp[fpl->count] = get_file(file);
 8499			unix_inflight(fpl->user, fpl->fp[fpl->count]);
 8500			fpl->count++;
 8501			spin_lock_irq(&head->lock);
 8502			__skb_queue_head(head, skb);
 8503		} else {
 8504			skb = NULL;
 8505		}
 8506	}
 8507	spin_unlock_irq(&head->lock);
 8508
 8509	if (skb) {
 8510		fput(file);
 8511		return 0;
 8512	}
 8513
 8514	return __io_sqe_files_scm(ctx, 1, index);
 8515#else
 8516	return 0;
 8517#endif
 8518}
 8519
 8520static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 8521				 struct io_rsrc_node *node, void *rsrc)
 8522{
 8523	struct io_rsrc_put *prsrc;
 8524
 8525	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
 8526	if (!prsrc)
 8527		return -ENOMEM;
 8528
 8529	prsrc->tag = *io_get_tag_slot(data, idx);
 8530	prsrc->rsrc = rsrc;
 8531	list_add(&prsrc->list, &node->rsrc_list);
 8532	return 0;
 8533}
 8534
 8535static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
 8536				 unsigned int issue_flags, u32 slot_index)
 8537{
 8538	struct io_ring_ctx *ctx = req->ctx;
 8539	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 8540	bool needs_switch = false;
 8541	struct io_fixed_file *file_slot;
 8542	int ret = -EBADF;
 8543
 8544	io_ring_submit_lock(ctx, needs_lock);
 8545	if (file->f_op == &io_uring_fops)
 8546		goto err;
 8547	ret = -ENXIO;
 8548	if (!ctx->file_data)
 8549		goto err;
 8550	ret = -EINVAL;
 8551	if (slot_index >= ctx->nr_user_files)
 8552		goto err;
 8553
 8554	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
 8555	file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
 8556
 8557	if (file_slot->file_ptr) {
 8558		struct file *old_file;
 8559
 8560		ret = io_rsrc_node_switch_start(ctx);
 8561		if (ret)
 8562			goto err;
 8563
 8564		old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
 8565		ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
 8566					    ctx->rsrc_node, old_file);
 8567		if (ret)
 8568			goto err;
 8569		file_slot->file_ptr = 0;
 8570		needs_switch = true;
 8571	}
 8572
 8573	*io_get_tag_slot(ctx->file_data, slot_index) = 0;
 8574	io_fixed_file_set(file_slot, file);
 8575	ret = io_sqe_file_register(ctx, file, slot_index);
 8576	if (ret) {
 8577		file_slot->file_ptr = 0;
 8578		goto err;
 8579	}
 8580
 8581	ret = 0;
 8582err:
 8583	if (needs_switch)
 8584		io_rsrc_node_switch(ctx, ctx->file_data);
 8585	io_ring_submit_unlock(ctx, needs_lock);
 8586	if (ret)
 8587		fput(file);
 8588	return ret;
 8589}
 8590
 8591static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
 8592{
 8593	unsigned int offset = req->close.file_slot - 1;
 8594	struct io_ring_ctx *ctx = req->ctx;
 8595	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
 8596	struct io_fixed_file *file_slot;
 8597	struct file *file;
 8598	int ret, i;
 8599
 8600	io_ring_submit_lock(ctx, needs_lock);
 8601	ret = -ENXIO;
 8602	if (unlikely(!ctx->file_data))
 8603		goto out;
 8604	ret = -EINVAL;
 8605	if (offset >= ctx->nr_user_files)
 8606		goto out;
 8607	ret = io_rsrc_node_switch_start(ctx);
 8608	if (ret)
 8609		goto out;
 8610
 8611	i = array_index_nospec(offset, ctx->nr_user_files);
 8612	file_slot = io_fixed_file_slot(&ctx->file_table, i);
 8613	ret = -EBADF;
 8614	if (!file_slot->file_ptr)
 8615		goto out;
 8616
 8617	file = (struct file *)(file_slot->file_ptr & FFS_MASK);
 8618	ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
 8619	if (ret)
 8620		goto out;
 8621
 8622	file_slot->file_ptr = 0;
 8623	io_rsrc_node_switch(ctx, ctx->file_data);
 8624	ret = 0;
 8625out:
 8626	io_ring_submit_unlock(ctx, needs_lock);
 8627	return ret;
 8628}
 8629
 8630static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 8631				 struct io_uring_rsrc_update2 *up,
 8632				 unsigned nr_args)
 8633{
 8634	u64 __user *tags = u64_to_user_ptr(up->tags);
 8635	__s32 __user *fds = u64_to_user_ptr(up->data);
 8636	struct io_rsrc_data *data = ctx->file_data;
 8637	struct io_fixed_file *file_slot;
 8638	struct file *file;
 8639	int fd, i, err = 0;
 8640	unsigned int done;
 8641	bool needs_switch = false;
 8642
 8643	if (!ctx->file_data)
 8644		return -ENXIO;
 8645	if (up->offset + nr_args > ctx->nr_user_files)
 8646		return -EINVAL;
 8647
 8648	for (done = 0; done < nr_args; done++) {
 8649		u64 tag = 0;
 8650
 8651		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
 8652		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
 8653			err = -EFAULT;
 8654			break;
 8655		}
 8656		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
 8657			err = -EINVAL;
 8658			break;
 8659		}
 8660		if (fd == IORING_REGISTER_FILES_SKIP)
 8661			continue;
 8662
 8663		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
 8664		file_slot = io_fixed_file_slot(&ctx->file_table, i);
 8665
 8666		if (file_slot->file_ptr) {
 8667			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
 8668			err = io_queue_rsrc_removal(data, up->offset + done,
 8669						    ctx->rsrc_node, file);
 8670			if (err)
 8671				break;
 8672			file_slot->file_ptr = 0;
 8673			needs_switch = true;
 8674		}
 8675		if (fd != -1) {
 8676			file = fget(fd);
 8677			if (!file) {
 8678				err = -EBADF;
 8679				break;
 8680			}
 8681			/*
 8682			 * Don't allow io_uring instances to be registered. If
 8683			 * UNIX isn't enabled, then this causes a reference
 8684			 * cycle and this instance can never get freed. If UNIX
 8685			 * is enabled we'll handle it just fine, but there's
 8686			 * still no point in allowing a ring fd as it doesn't
 8687			 * support regular read/write anyway.
 8688			 */
 8689			if (file->f_op == &io_uring_fops) {
 8690				fput(file);
 8691				err = -EBADF;
 8692				break;
 8693			}
 8694			*io_get_tag_slot(data, up->offset + done) = tag;
 8695			io_fixed_file_set(file_slot, file);
 8696			err = io_sqe_file_register(ctx, file, i);
 8697			if (err) {
 8698				file_slot->file_ptr = 0;
 8699				fput(file);
 8700				break;
 8701			}
 8702		}
 8703	}
 8704
 8705	if (needs_switch)
 8706		io_rsrc_node_switch(ctx, data);
 8707	return done ? done : err;
 8708}
 8709
 8710static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
 8711					struct task_struct *task)
 8712{
 8713	struct io_wq_hash *hash;
 8714	struct io_wq_data data;
 8715	unsigned int concurrency;
 8716
 8717	mutex_lock(&ctx->uring_lock);
 8718	hash = ctx->hash_map;
 8719	if (!hash) {
 8720		hash = kzalloc(sizeof(*hash), GFP_KERNEL);
 8721		if (!hash) {
 8722			mutex_unlock(&ctx->uring_lock);
 8723			return ERR_PTR(-ENOMEM);
 8724		}
 8725		refcount_set(&hash->refs, 1);
 8726		init_waitqueue_head(&hash->wait);
 8727		ctx->hash_map = hash;
 8728	}
 8729	mutex_unlock(&ctx->uring_lock);
 8730
 8731	data.hash = hash;
 8732	data.task = task;
 8733	data.free_work = io_wq_free_work;
 8734	data.do_work = io_wq_submit_work;
 8735
 8736	/* Do QD, or 4 * CPUS, whatever is smallest */
 8737	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
 8738
 8739	return io_wq_create(concurrency, &data);
 8740}
 8741
 8742static __cold int io_uring_alloc_task_context(struct task_struct *task,
 8743					      struct io_ring_ctx *ctx)
 8744{
 8745	struct io_uring_task *tctx;
 8746	int ret;
 8747
 8748	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
 8749	if (unlikely(!tctx))
 8750		return -ENOMEM;
 8751
 8752	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
 8753	if (unlikely(ret)) {
 8754		kfree(tctx);
 8755		return ret;
 8756	}
 8757
 8758	tctx->io_wq = io_init_wq_offload(ctx, task);
 8759	if (IS_ERR(tctx->io_wq)) {
 8760		ret = PTR_ERR(tctx->io_wq);
 8761		percpu_counter_destroy(&tctx->inflight);
 8762		kfree(tctx);
 8763		return ret;
 8764	}
 8765
 8766	xa_init(&tctx->xa);
 8767	init_waitqueue_head(&tctx->wait);
 8768	atomic_set(&tctx->in_idle, 0);
 8769	atomic_set(&tctx->inflight_tracked, 0);
 8770	task->io_uring = tctx;
 8771	spin_lock_init(&tctx->task_lock);
 8772	INIT_WQ_LIST(&tctx->task_list);
 8773	INIT_WQ_LIST(&tctx->prior_task_list);
 8774	init_task_work(&tctx->task_work, tctx_task_work);
 8775	return 0;
 8776}
 8777
 8778void __io_uring_free(struct task_struct *tsk)
 8779{
 8780	struct io_uring_task *tctx = tsk->io_uring;
 8781
 8782	WARN_ON_ONCE(!xa_empty(&tctx->xa));
 8783	WARN_ON_ONCE(tctx->io_wq);
 8784	WARN_ON_ONCE(tctx->cached_refs);
 8785
 8786	percpu_counter_destroy(&tctx->inflight);
 8787	kfree(tctx);
 8788	tsk->io_uring = NULL;
 8789}
 8790
 8791static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
 8792				       struct io_uring_params *p)
 8793{
 8794	int ret;
 8795
 8796	/* Retain compatibility with failing for an invalid attach attempt */
 8797	if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
 8798				IORING_SETUP_ATTACH_WQ) {
 8799		struct fd f;
 8800
 8801		f = fdget(p->wq_fd);
 8802		if (!f.file)
 8803			return -ENXIO;
 8804		if (f.file->f_op != &io_uring_fops) {
 8805			fdput(f);
 8806			return -EINVAL;
 8807		}
 8808		fdput(f);
 8809	}
 8810	if (ctx->flags & IORING_SETUP_SQPOLL) {
 8811		struct task_struct *tsk;
 8812		struct io_sq_data *sqd;
 8813		bool attached;
 8814
 8815		ret = security_uring_sqpoll();
 8816		if (ret)
 8817			return ret;
 8818
 8819		sqd = io_get_sq_data(p, &attached);
 8820		if (IS_ERR(sqd)) {
 8821			ret = PTR_ERR(sqd);
 8822			goto err;
 8823		}
 8824
 8825		ctx->sq_creds = get_current_cred();
 8826		ctx->sq_data = sqd;
 8827		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
 8828		if (!ctx->sq_thread_idle)
 8829			ctx->sq_thread_idle = HZ;
 8830
 8831		io_sq_thread_park(sqd);
 8832		list_add(&ctx->sqd_list, &sqd->ctx_list);
 8833		io_sqd_update_thread_idle(sqd);
 8834		/* don't attach to a dying SQPOLL thread, would be racy */
 8835		ret = (attached && !sqd->thread) ? -ENXIO : 0;
 8836		io_sq_thread_unpark(sqd);
 8837
 8838		if (ret < 0)
 8839			goto err;
 8840		if (attached)
 8841			return 0;
 8842
 8843		if (p->flags & IORING_SETUP_SQ_AFF) {
 8844			int cpu = p->sq_thread_cpu;
 8845
 8846			ret = -EINVAL;
 8847			if (cpu >= nr_cpu_ids || !cpu_online(cpu))
 8848				goto err_sqpoll;
 8849			sqd->sq_cpu = cpu;
 8850		} else {
 8851			sqd->sq_cpu = -1;
 8852		}
 8853
 8854		sqd->task_pid = current->pid;
 8855		sqd->task_tgid = current->tgid;
 8856		tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
 8857		if (IS_ERR(tsk)) {
 8858			ret = PTR_ERR(tsk);
 8859			goto err_sqpoll;
 8860		}
 8861
 8862		sqd->thread = tsk;
 8863		ret = io_uring_alloc_task_context(tsk, ctx);
 8864		wake_up_new_task(tsk);
 8865		if (ret)
 8866			goto err;
 8867	} else if (p->flags & IORING_SETUP_SQ_AFF) {
 8868		/* Can't have SQ_AFF without SQPOLL */
 8869		ret = -EINVAL;
 8870		goto err;
 8871	}
 8872
 8873	return 0;
 8874err_sqpoll:
 8875	complete(&ctx->sq_data->exited);
 8876err:
 8877	io_sq_thread_finish(ctx);
 8878	return ret;
 8879}
 8880
 8881static inline void __io_unaccount_mem(struct user_struct *user,
 8882				      unsigned long nr_pages)
 8883{
 8884	atomic_long_sub(nr_pages, &user->locked_vm);
 8885}
 8886
 8887static inline int __io_account_mem(struct user_struct *user,
 8888				   unsigned long nr_pages)
 8889{
 8890	unsigned long page_limit, cur_pages, new_pages;
 8891
 8892	/* Don't allow more pages than we can safely lock */
 8893	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 8894
 8895	do {
 8896		cur_pages = atomic_long_read(&user->locked_vm);
 8897		new_pages = cur_pages + nr_pages;
 8898		if (new_pages > page_limit)
 8899			return -ENOMEM;
 8900	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
 8901					new_pages) != cur_pages);
 8902
 8903	return 0;
 8904}
 8905
 8906static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 8907{
 8908	if (ctx->user)
 8909		__io_unaccount_mem(ctx->user, nr_pages);
 8910
 8911	if (ctx->mm_account)
 8912		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
 8913}
 8914
 8915static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 8916{
 8917	int ret;
 8918
 8919	if (ctx->user) {
 8920		ret = __io_account_mem(ctx->user, nr_pages);
 8921		if (ret)
 8922			return ret;
 8923	}
 8924
 8925	if (ctx->mm_account)
 8926		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
 8927
 8928	return 0;
 8929}
 8930
 8931static void io_mem_free(void *ptr)
 8932{
 8933	struct page *page;
 8934
 8935	if (!ptr)
 8936		return;
 8937
 8938	page = virt_to_head_page(ptr);
 8939	if (put_page_testzero(page))
 8940		free_compound_page(page);
 8941}
 8942
 8943static void *io_mem_alloc(size_t size)
 8944{
 8945	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
 8946
 8947	return (void *) __get_free_pages(gfp, get_order(size));
 8948}
 8949
 8950static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
 8951				size_t *sq_offset)
 8952{
 8953	struct io_rings *rings;
 8954	size_t off, sq_array_size;
 8955
 8956	off = struct_size(rings, cqes, cq_entries);
 8957	if (off == SIZE_MAX)
 8958		return SIZE_MAX;
 8959
 8960#ifdef CONFIG_SMP
 8961	off = ALIGN(off, SMP_CACHE_BYTES);
 8962	if (off == 0)
 8963		return SIZE_MAX;
 8964#endif
 8965
 8966	if (sq_offset)
 8967		*sq_offset = off;
 8968
 8969	sq_array_size = array_size(sizeof(u32), sq_entries);
 8970	if (sq_array_size == SIZE_MAX)
 8971		return SIZE_MAX;
 8972
 8973	if (check_add_overflow(off, sq_array_size, &off))
 8974		return SIZE_MAX;
 8975
 8976	return off;
 8977}
 8978
 8979static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
 8980{
 8981	struct io_mapped_ubuf *imu = *slot;
 8982	unsigned int i;
 8983
 8984	if (imu != ctx->dummy_ubuf) {
 8985		for (i = 0; i < imu->nr_bvecs; i++)
 8986			unpin_user_page(imu->bvec[i].bv_page);
 8987		if (imu->acct_pages)
 8988			io_unaccount_mem(ctx, imu->acct_pages);
 8989		kvfree(imu);
 8990	}
 8991	*slot = NULL;
 8992}
 8993
 8994static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
 8995{
 8996	io_buffer_unmap(ctx, &prsrc->buf);
 8997	prsrc->buf = NULL;
 8998}
 8999
 9000static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 9001{
 9002	unsigned int i;
 9003
 9004	for (i = 0; i < ctx->nr_user_bufs; i++)
 9005		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
 9006	kfree(ctx->user_bufs);
 9007	io_rsrc_data_free(ctx->buf_data);
 9008	ctx->user_bufs = NULL;
 9009	ctx->buf_data = NULL;
 9010	ctx->nr_user_bufs = 0;
 9011}
 9012
 9013static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 9014{
 9015	int ret;
 9016
 9017	if (!ctx->buf_data)
 9018		return -ENXIO;
 9019
 9020	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
 9021	if (!ret)
 9022		__io_sqe_buffers_unregister(ctx);
 9023	return ret;
 9024}
 9025
 9026static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
 9027		       void __user *arg, unsigned index)
 9028{
 9029	struct iovec __user *src;
 9030
 9031#ifdef CONFIG_COMPAT
 9032	if (ctx->compat) {
 9033		struct compat_iovec __user *ciovs;
 9034		struct compat_iovec ciov;
 9035
 9036		ciovs = (struct compat_iovec __user *) arg;
 9037		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
 9038			return -EFAULT;
 9039
 9040		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
 9041		dst->iov_len = ciov.iov_len;
 9042		return 0;
 9043	}
 9044#endif
 9045	src = (struct iovec __user *) arg;
 9046	if (copy_from_user(dst, &src[index], sizeof(*dst)))
 9047		return -EFAULT;
 9048	return 0;
 9049}
 9050
 9051/*
 9052 * Not super efficient, but this is just a registration time. And we do cache
 9053 * the last compound head, so generally we'll only do a full search if we don't
 9054 * match that one.
 9055 *
 9056 * We check if the given compound head page has already been accounted, to
 9057 * avoid double accounting it. This allows us to account the full size of the
 9058 * page, not just the constituent pages of a huge page.
 9059 */
 9060static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
 9061				  int nr_pages, struct page *hpage)
 9062{
 9063	int i, j;
 9064
 9065	/* check current page array */
 9066	for (i = 0; i < nr_pages; i++) {
 9067		if (!PageCompound(pages[i]))
 9068			continue;
 9069		if (compound_head(pages[i]) == hpage)
 9070			return true;
 9071	}
 9072
 9073	/* check previously registered pages */
 9074	for (i = 0; i < ctx->nr_user_bufs; i++) {
 9075		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
 9076
 9077		for (j = 0; j < imu->nr_bvecs; j++) {
 9078			if (!PageCompound(imu->bvec[j].bv_page))
 9079				continue;
 9080			if (compound_head(imu->bvec[j].bv_page) == hpage)
 9081				return true;
 9082		}
 9083	}
 9084
 9085	return false;
 9086}
 9087
 9088static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 9089				 int nr_pages, struct io_mapped_ubuf *imu,
 9090				 struct page **last_hpage)
 9091{
 9092	int i, ret;
 9093
 9094	imu->acct_pages = 0;
 9095	for (i = 0; i < nr_pages; i++) {
 9096		if (!PageCompound(pages[i])) {
 9097			imu->acct_pages++;
 9098		} else {
 9099			struct page *hpage;
 9100
 9101			hpage = compound_head(pages[i]);
 9102			if (hpage == *last_hpage)
 9103				continue;
 9104			*last_hpage = hpage;
 9105			if (headpage_already_acct(ctx, pages, i, hpage))
 9106				continue;
 9107			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
 9108		}
 9109	}
 9110
 9111	if (!imu->acct_pages)
 9112		return 0;
 9113
 9114	ret = io_account_mem(ctx, imu->acct_pages);
 9115	if (ret)
 9116		imu->acct_pages = 0;
 9117	return ret;
 9118}
 9119
 9120static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 9121				  struct io_mapped_ubuf **pimu,
 9122				  struct page **last_hpage)
 9123{
 9124	struct io_mapped_ubuf *imu = NULL;
 9125	struct vm_area_struct **vmas = NULL;
 9126	struct page **pages = NULL;
 9127	unsigned long off, start, end, ubuf;
 9128	size_t size;
 9129	int ret, pret, nr_pages, i;
 9130
 9131	if (!iov->iov_base) {
 9132		*pimu = ctx->dummy_ubuf;
 9133		return 0;
 9134	}
 9135
 9136	ubuf = (unsigned long) iov->iov_base;
 9137	end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 9138	start = ubuf >> PAGE_SHIFT;
 9139	nr_pages = end - start;
 9140
 9141	*pimu = NULL;
 9142	ret = -ENOMEM;
 9143
 9144	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
 9145	if (!pages)
 9146		goto done;
 9147
 9148	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
 9149			      GFP_KERNEL);
 9150	if (!vmas)
 9151		goto done;
 9152
 9153	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
 9154	if (!imu)
 9155		goto done;
 9156
 9157	ret = 0;
 9158	mmap_read_lock(current->mm);
 9159	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
 9160			      pages, vmas);
 9161	if (pret == nr_pages) {
 9162		/* don't support file backed memory */
 9163		for (i = 0; i < nr_pages; i++) {
 9164			struct vm_area_struct *vma = vmas[i];
 9165
 9166			if (vma_is_shmem(vma))
 9167				continue;
 9168			if (vma->vm_file &&
 9169			    !is_file_hugepages(vma->vm_file)) {
 9170				ret = -EOPNOTSUPP;
 9171				break;
 9172			}
 9173		}
 9174	} else {
 9175		ret = pret < 0 ? pret : -EFAULT;
 9176	}
 9177	mmap_read_unlock(current->mm);
 9178	if (ret) {
 9179		/*
 9180		 * if we did partial map, or found file backed vmas,
 9181		 * release any pages we did get
 9182		 */
 9183		if (pret > 0)
 9184			unpin_user_pages(pages, pret);
 9185		goto done;
 9186	}
 9187
 9188	ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
 9189	if (ret) {
 9190		unpin_user_pages(pages, pret);
 9191		goto done;
 9192	}
 9193
 9194	off = ubuf & ~PAGE_MASK;
 9195	size = iov->iov_len;
 9196	for (i = 0; i < nr_pages; i++) {
 9197		size_t vec_len;
 9198
 9199		vec_len = min_t(size_t, size, PAGE_SIZE - off);
 9200		imu->bvec[i].bv_page = pages[i];
 9201		imu->bvec[i].bv_len = vec_len;
 9202		imu->bvec[i].bv_offset = off;
 9203		off = 0;
 9204		size -= vec_len;
 9205	}
 9206	/* store original address for later verification */
 9207	imu->ubuf = ubuf;
 9208	imu->ubuf_end = ubuf + iov->iov_len;
 9209	imu->nr_bvecs = nr_pages;
 9210	*pimu = imu;
 9211	ret = 0;
 9212done:
 9213	if (ret)
 9214		kvfree(imu);
 9215	kvfree(pages);
 9216	kvfree(vmas);
 9217	return ret;
 9218}
 9219
 9220static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
 9221{
 9222	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
 9223	return ctx->user_bufs ? 0 : -ENOMEM;
 9224}
 9225
 9226static int io_buffer_validate(struct iovec *iov)
 9227{
 9228	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
 9229
 9230	/*
 9231	 * Don't impose further limits on the size and buffer
 9232	 * constraints here, we'll -EINVAL later when IO is
 9233	 * submitted if they are wrong.
 9234	 */
 9235	if (!iov->iov_base)
 9236		return iov->iov_len ? -EFAULT : 0;
 9237	if (!iov->iov_len)
 9238		return -EFAULT;
 9239
 9240	/* arbitrary limit, but we need something */
 9241	if (iov->iov_len > SZ_1G)
 9242		return -EFAULT;
 9243
 9244	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
 9245		return -EOVERFLOW;
 9246
 9247	return 0;
 9248}
 9249
 9250static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 9251				   unsigned int nr_args, u64 __user *tags)
 9252{
 9253	struct page *last_hpage = NULL;
 9254	struct io_rsrc_data *data;
 9255	int i, ret;
 9256	struct iovec iov;
 9257
 9258	if (ctx->user_bufs)
 9259		return -EBUSY;
 9260	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
 9261		return -EINVAL;
 9262	ret = io_rsrc_node_switch_start(ctx);
 9263	if (ret)
 9264		return ret;
 9265	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
 9266	if (ret)
 9267		return ret;
 9268	ret = io_buffers_map_alloc(ctx, nr_args);
 9269	if (ret) {
 9270		io_rsrc_data_free(data);
 9271		return ret;
 9272	}
 9273
 9274	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
 9275		ret = io_copy_iov(ctx, &iov, arg, i);
 9276		if (ret)
 9277			break;
 9278		ret = io_buffer_validate(&iov);
 9279		if (ret)
 9280			break;
 9281		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
 9282			ret = -EINVAL;
 9283			break;
 9284		}
 9285
 9286		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
 9287					     &last_hpage);
 9288		if (ret)
 9289			break;
 9290	}
 9291
 9292	WARN_ON_ONCE(ctx->buf_data);
 9293
 9294	ctx->buf_data = data;
 9295	if (ret)
 9296		__io_sqe_buffers_unregister(ctx);
 9297	else
 9298		io_rsrc_node_switch(ctx, NULL);
 9299	return ret;
 9300}
 9301
 9302static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 9303				   struct io_uring_rsrc_update2 *up,
 9304				   unsigned int nr_args)
 9305{
 9306	u64 __user *tags = u64_to_user_ptr(up->tags);
 9307	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
 9308	struct page *last_hpage = NULL;
 9309	bool needs_switch = false;
 9310	__u32 done;
 9311	int i, err;
 9312
 9313	if (!ctx->buf_data)
 9314		return -ENXIO;
 9315	if (up->offset + nr_args > ctx->nr_user_bufs)
 9316		return -EINVAL;
 9317
 9318	for (done = 0; done < nr_args; done++) {
 9319		struct io_mapped_ubuf *imu;
 9320		int offset = up->offset + done;
 9321		u64 tag = 0;
 9322
 9323		err = io_copy_iov(ctx, &iov, iovs, done);
 9324		if (err)
 9325			break;
 9326		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
 9327			err = -EFAULT;
 9328			break;
 9329		}
 9330		err = io_buffer_validate(&iov);
 9331		if (err)
 9332			break;
 9333		if (!iov.iov_base && tag) {
 9334			err = -EINVAL;
 9335			break;
 9336		}
 9337		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
 9338		if (err)
 9339			break;
 9340
 9341		i = array_index_nospec(offset, ctx->nr_user_bufs);
 9342		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
 9343			err = io_queue_rsrc_removal(ctx->buf_data, offset,
 9344						    ctx->rsrc_node, ctx->user_bufs[i]);
 9345			if (unlikely(err)) {
 9346				io_buffer_unmap(ctx, &imu);
 9347				break;
 9348			}
 9349			ctx->user_bufs[i] = NULL;
 9350			needs_switch = true;
 9351		}
 9352
 9353		ctx->user_bufs[i] = imu;
 9354		*io_get_tag_slot(ctx->buf_data, offset) = tag;
 9355	}
 9356
 9357	if (needs_switch)
 9358		io_rsrc_node_switch(ctx, ctx->buf_data);
 9359	return done ? done : err;
 9360}
 9361
 9362static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
 9363{
 9364	__s32 __user *fds = arg;
 9365	int fd;
 9366
 9367	if (ctx->cq_ev_fd)
 9368		return -EBUSY;
 9369
 9370	if (copy_from_user(&fd, fds, sizeof(*fds)))
 9371		return -EFAULT;
 9372
 9373	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
 9374	if (IS_ERR(ctx->cq_ev_fd)) {
 9375		int ret = PTR_ERR(ctx->cq_ev_fd);
 9376
 9377		ctx->cq_ev_fd = NULL;
 9378		return ret;
 9379	}
 9380
 9381	return 0;
 9382}
 9383
 9384static int io_eventfd_unregister(struct io_ring_ctx *ctx)
 9385{
 9386	if (ctx->cq_ev_fd) {
 9387		eventfd_ctx_put(ctx->cq_ev_fd);
 9388		ctx->cq_ev_fd = NULL;
 9389		return 0;
 9390	}
 9391
 9392	return -ENXIO;
 9393}
 9394
 9395static void io_destroy_buffers(struct io_ring_ctx *ctx)
 9396{
 9397	struct io_buffer *buf;
 9398	unsigned long index;
 9399
 9400	xa_for_each(&ctx->io_buffers, index, buf)
 9401		__io_remove_buffers(ctx, buf, index, -1U);
 9402}
 9403
 9404static void io_req_caches_free(struct io_ring_ctx *ctx)
 9405{
 9406	struct io_submit_state *state = &ctx->submit_state;
 9407	int nr = 0;
 9408
 9409	mutex_lock(&ctx->uring_lock);
 9410	io_flush_cached_locked_reqs(ctx, state);
 9411
 9412	while (state->free_list.next) {
 9413		struct io_wq_work_node *node;
 9414		struct io_kiocb *req;
 9415
 9416		node = wq_stack_extract(&state->free_list);
 9417		req = container_of(node, struct io_kiocb, comp_list);
 9418		kmem_cache_free(req_cachep, req);
 9419		nr++;
 9420	}
 9421	if (nr)
 9422		percpu_ref_put_many(&ctx->refs, nr);
 9423	mutex_unlock(&ctx->uring_lock);
 9424}
 9425
 9426static void io_wait_rsrc_data(struct io_rsrc_data *data)
 9427{
 9428	if (data && !atomic_dec_and_test(&data->refs))
 9429		wait_for_completion(&data->done);
 9430}
 9431
/*
 * Final teardown of a ring context: stop the SQ thread, release registered
 * buffers/files, the eventfd, provided buffers, credentials, resource nodes,
 * the ring socket (CONFIG_UNIX), the ring memory, and finally @ctx itself.
 *
 * The statement order below matters; in particular the rsrc data is waited
 * for before uring_lock is taken (see the comment at the wait).
 */
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
	io_sq_thread_finish(ctx);

	if (ctx->mm_account) {
		mmdrop(ctx->mm_account);
		ctx->mm_account = NULL;
	}

	io_rsrc_refs_drop(ctx);
	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
	io_wait_rsrc_data(ctx->buf_data);
	io_wait_rsrc_data(ctx->file_data);

	/* unregister whatever is still registered, under the ring lock */
	mutex_lock(&ctx->uring_lock);
	if (ctx->buf_data)
		__io_sqe_buffers_unregister(ctx);
	if (ctx->file_data)
		__io_sqe_files_unregister(ctx);
	if (ctx->rings)
		__io_cqring_overflow_flush(ctx, true);
	mutex_unlock(&ctx->uring_lock);
	io_eventfd_unregister(ctx);
	io_destroy_buffers(ctx);
	if (ctx->sq_creds)
		put_cred(ctx->sq_creds);

	/* there are no registered resources left, nobody uses it */
	if (ctx->rsrc_node)
		io_rsrc_node_destroy(ctx->rsrc_node);
	if (ctx->rsrc_backup_node)
		io_rsrc_node_destroy(ctx->rsrc_backup_node);
	flush_delayed_work(&ctx->rsrc_put_work);
	flush_delayed_work(&ctx->fallback_work);

	/* after the flushes nothing should be left queued */
	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
	WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
		sock_release(ctx->ring_sock);
	}
#endif
	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));

	io_mem_free(ctx->rings);
	io_mem_free(ctx->sq_sqes);

	percpu_ref_exit(&ctx->refs);
	free_uid(ctx->user);
	io_req_caches_free(ctx);
	if (ctx->hash_map)
		io_wq_put_hash(ctx->hash_map);
	kfree(ctx->cancel_hash);
	kfree(ctx->dummy_ubuf);
	kfree(ctx);
}
 9490
 9491static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 9492{
 9493	struct io_ring_ctx *ctx = file->private_data;
 9494	__poll_t mask = 0;
 9495
 9496	poll_wait(file, &ctx->cq_wait, wait);
 9497	/*
 9498	 * synchronizes with barrier from wq_has_sleeper call in
 9499	 * io_commit_cqring
 9500	 */
 9501	smp_rmb();
 9502	if (!io_sqring_full(ctx))
 9503		mask |= EPOLLOUT | EPOLLWRNORM;
 9504
 9505	/*
 9506	 * Don't flush cqring overflow list here, just do a simple check.
 9507	 * Otherwise there could possible be ABBA deadlock:
 9508	 *      CPU0                    CPU1
 9509	 *      ----                    ----
 9510	 * lock(&ctx->uring_lock);
 9511	 *                              lock(&ep->mtx);
 9512	 *                              lock(&ctx->uring_lock);
 9513	 * lock(&ep->mtx);
 9514	 *
 9515	 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
 9516	 * pushs them to do the flush.
 9517	 */
 9518	if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
 9519		mask |= EPOLLIN | EPOLLRDNORM;
 9520
 9521	return mask;
 9522}
 9523
 9524static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
 9525{
 9526	const struct cred *creds;
 9527
 9528	creds = xa_erase(&ctx->personalities, id);
 9529	if (creds) {
 9530		put_cred(creds);
 9531		return 0;
 9532	}
 9533
 9534	return -EINVAL;
 9535}
 9536
/*
 * Context handed to io_tctx_exit_cb() through its embedded task_work.
 */
struct io_tctx_exit {
	/* callback head; io_tctx_exit_cb() recovers this struct from it */
	struct callback_head		task_work;
	/* completed by io_tctx_exit_cb() when it has run */
	struct completion		completion;
	/* ring whose tctx node the callback removes */
	struct io_ring_ctx		*ctx;
};
 9542
 9543static __cold void io_tctx_exit_cb(struct callback_head *cb)
 9544{
 9545	struct io_uring_task *tctx = current->io_uring;
 9546	struct io_tctx_exit *work;
 9547
 9548	work = container_of(cb, struct io_tctx_exit, task_work);
 9549	/*
 9550	 * When @in_idle, we're in cancellation and it's racy to remove the
 9551	 * node. It'll be removed by the end of cancellation, just ignore it.
 9552	 */
 9553	if (!atomic_read(&tctx->in_idle))
 9554		io_uring_del_tctx_node((unsigned long)work->ctx);
 9555	complete(&work->completion);
 9556}
 9557
 9558static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
 9559{
 9560	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 9561
 9562	return req->ctx == data;
 9563}
 9564
 9565static __cold void io_ring_exit_work(struct work_struct *work)
 9566{
 9567	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
 9568	unsigned long timeout = jiffies + HZ * 60 * 5;
 9569	unsigned long interval = HZ / 20;
 9570	struct io_tctx_exit exit;
 9571	struct io_tctx_node *node;
 9572	int ret;
 9573
 9574	/*
 9575	 * If we're doing polled IO and end up having requests being
 9576	 * submitted async (out-of-line), then completions can come in while
 9577	 * we're waiting for refs to drop. We need to reap these manually,
 9578	 * as nobody else will be looking for them.
 9579	 */
 9580	do {
 9581		io_uring_try_cancel_requests(ctx, NULL, true);
 9582		if (ctx->sq_data) {
 9583			struct io_sq_data *sqd = ctx->sq_data;
 9584			struct task_struct *tsk;
 9585
 9586			io_sq_thread_park(sqd);
 9587			tsk = sqd->thread;
 9588			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
 9589				io_wq_cancel_cb(tsk->io_uring->io_wq,
 9590						io_cancel_ctx_cb, ctx, true);
 9591			io_sq_thread_unpark(sqd);
 9592		}
 9593
 9594		io_req_caches_free(ctx);
 9595
 9596		if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
 9597			/* there is little hope left, don't run it too often */
 9598			interval = HZ * 60;
 9599		}
 9600	} while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
 9601
 9602	init_completion(&exit.completion);
 9603	init_task_work(&exit.task_work, io_tctx_exit_cb);
 9604	exit.ctx = ctx;
 9605	/*
 9606	 * Some may use context even when all refs and requests have been put,
 9607	 * and they are free to do so while still holding uring_lock or
 9608	 * completion_lock, see io_req_task_submit(). Apart from other work,
 9609	 * this lock/unlock section also waits them to finish.
 9610	 */
 9611	mutex_lock(&ctx->uring_lock);
 9612	while (!list_empty(&ctx->tctx_list)) {
 9613		WARN_ON_ONCE(time_after(jiffies, timeout));
 9614
 9615		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
 9616					ctx_node);
 9617		/* don't spin on a single task if cancellation failed */
 9618		list_rotate_left(&ctx->tctx_list);
 9619		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
 9620		if (WARN_ON_ONCE(ret))
 9621			continue;
 9622
 9623		mutex_unlock(&ctx->uring_lock);
 9624		wait_for_completion(&exit.completion);
 9625		mutex_lock(&ctx->uring_lock);
 9626	}
 9627	mutex_unlock(&ctx->uring_lock);
 9628	spin_lock(&ctx->completion_lock);
 9629	spin_unlock(&ctx->completion_lock);
 9630
 9631	io_ring_ctx_free(ctx);
 9632}
 9633
 9634/* Returns true if we found and killed one or more timeouts */
 9635static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
 9636				    struct task_struct *tsk, bool cancel_all)
 9637{
 9638	struct io_kiocb *req, *tmp;
 9639	int canceled = 0;
 9640
 9641	spin_lock(&ctx->completion_lock);
 9642	spin_lock_irq(&ctx->timeout_lock);
 9643	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
 9644		if (io_match_task(req, tsk, cancel_all)) {
 9645			io_kill_timeout(req, -ECANCELED);
 9646			canceled++;
 9647		}
 9648	}
 9649	spin_unlock_irq(&ctx->timeout_lock);
 9650	if (canceled != 0)
 9651		io_commit_cqring(ctx);
 9652	spin_unlock(&ctx->completion_lock);
 9653	if (canceled != 0)
 9654		io_cqring_ev_posted(ctx);
 9655	return canceled != 0;
 9656}
 9657
 9658static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 9659{
 9660	unsigned long index;
 9661	struct creds *creds;
 9662
 9663	mutex_lock(&ctx->uring_lock);
 9664	percpu_ref_kill(&ctx->refs);
 9665	if (ctx->rings)
 9666		__io_cqring_overflow_flush(ctx, true);
 9667	xa_for_each(&ctx->personalities, index, creds)
 9668		io_unregister_personality(ctx, index);
 9669	mutex_unlock(&ctx->uring_lock);
 9670
 9671	io_kill_timeouts(ctx, NULL, true);
 9672	io_poll_remove_all(ctx, NULL, true);
 9673
 9674	/* if we failed setting up the ctx, we might not have any rings */
 9675	io_iopoll_try_reap_events(ctx);
 9676
 9677	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
 9678	/*
 9679	 * Use system_unbound_wq to avoid spawning tons of event kworkers
 9680	 * if we're exiting a ton of rings at the same time. It just adds
 9681	 * noise and overhead, there's no discernable change in runtime
 9682	 * over using system_wq.
 9683	 */
 9684	queue_work(system_unbound_wq, &ctx->exit_work);
 9685}
 9686
 9687static int io_uring_release(struct inode *inode, struct file *file)
 9688{
 9689	struct io_ring_ctx *ctx = file->private_data;
 9690
 9691	file->private_data = NULL;
 9692	io_ring_ctx_wait_and_kill(ctx);
 9693	return 0;
 9694}
 9695
 9696struct io_task_cancel {
 9697	struct task_struct *task;
 9698	bool all;
 9699};
 9700
 9701static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
 9702{
 9703	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 9704	struct io_task_cancel *cancel = data;
 9705
 9706	return io_match_task_safe(req, cancel->task, cancel->all);
 9707}
 9708
 9709static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
 9710					 struct task_struct *task,
 9711					 bool cancel_all)
 9712{
 9713	struct io_defer_entry *de;
 9714	LIST_HEAD(list);
 9715
 9716	spin_lock(&ctx->completion_lock);
 9717	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
 9718		if (io_match_task_safe(de->req, task, cancel_all)) {
 9719			list_cut_position(&list, &ctx->defer_list, &de->list);
 9720			break;
 9721		}
 9722	}
 9723	spin_unlock(&ctx->completion_lock);
 9724	if (list_empty(&list))
 9725		return false;
 9726
 9727	while (!list_empty(&list)) {
 9728		de = list_first_entry(&list, struct io_defer_entry, list);
 9729		list_del_init(&de->list);
 9730		io_req_complete_failed(de->req, -ECANCELED);
 9731		kfree(de);
 9732	}
 9733	return true;
 9734}
 9735
 9736static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
 9737{
 9738	struct io_tctx_node *node;
 9739	enum io_wq_cancel cret;
 9740	bool ret = false;
 9741
 9742	mutex_lock(&ctx->uring_lock);
 9743	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
 9744		struct io_uring_task *tctx = node->task->io_uring;
 9745
 9746		/*
 9747		 * io_wq will stay alive while we hold uring_lock, because it's
 9748		 * killed after ctx nodes, which requires to take the lock.
 9749		 */
 9750		if (!tctx || !tctx->io_wq)
 9751			continue;
 9752		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
 9753		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
 9754	}
 9755	mutex_unlock(&ctx->uring_lock);
 9756
 9757	return ret;
 9758}
 9759
 9760static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 9761						struct task_struct *task,
 9762						bool cancel_all)
 9763{
 9764	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
 9765	struct io_uring_task *tctx = task ? task->io_uring : NULL;
 9766
 9767	while (1) {
 9768		enum io_wq_cancel cret;
 9769		bool ret = false;
 9770
 9771		if (!task) {
 9772			ret |= io_uring_try_cancel_iowq(ctx);
 9773		} else if (tctx && tctx->io_wq) {
 9774			/*
 9775			 * Cancels requests of all rings, not only @ctx, but
 9776			 * it's fine as the task is in exit/exec.
 9777			 */
 9778			cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
 9779					       &cancel, true);
 9780			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
 9781		}
 9782
 9783		/* SQPOLL thread does its own polling */
 9784		if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
 9785		    (ctx->sq_data && ctx->sq_data->thread == current)) {
 9786			while (!wq_list_empty(&ctx->iopoll_list)) {
 9787				io_iopoll_try_reap_events(ctx);
 9788				ret = true;
 9789			}
 9790		}
 9791
 9792		ret |= io_cancel_defer_files(ctx, task, cancel_all);
 9793		ret |= io_poll_remove_all(ctx, task, cancel_all);
 9794		ret |= io_kill_timeouts(ctx, task, cancel_all);
 9795		if (task)
 9796			ret |= io_run_task_work();
 9797		if (!ret)
 9798			break;
 9799		cond_resched();
 9800	}
 9801}
 9802
 9803static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 9804{
 9805	struct io_uring_task *tctx = current->io_uring;
 9806	struct io_tctx_node *node;
 9807	int ret;
 9808
 9809	if (unlikely(!tctx)) {
 9810		ret = io_uring_alloc_task_context(current, ctx);
 9811		if (unlikely(ret))
 9812			return ret;
 9813
 9814		tctx = current->io_uring;
 9815		if (ctx->iowq_limits_set) {
 9816			unsigned int limits[2] = { ctx->iowq_limits[0],
 9817						   ctx->iowq_limits[1], };
 9818
 9819			ret = io_wq_max_workers(tctx->io_wq, limits);
 9820			if (ret)
 9821				return ret;
 9822		}
 9823	}
 9824	if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
 9825		node = kmalloc(sizeof(*node), GFP_KERNEL);
 9826		if (!node)
 9827			return -ENOMEM;
 9828		node->ctx = ctx;
 9829		node->task = current;
 9830
 9831		ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
 9832					node, GFP_KERNEL));
 9833		if (ret) {
 9834			kfree(node);
 9835			return ret;
 9836		}
 9837
 9838		mutex_lock(&ctx->uring_lock);
 9839		list_add(&node->ctx_node, &ctx->tctx_list);
 9840		mutex_unlock(&ctx->uring_lock);
 9841	}
 9842	tctx->last = ctx;
 9843	return 0;
 9844}
 9845
 9846/*
 9847 * Note that this task has used io_uring. We use it for cancelation purposes.
 9848 */
 9849static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 9850{
 9851	struct io_uring_task *tctx = current->io_uring;
 9852
 9853	if (likely(tctx && tctx->last == ctx))
 9854		return 0;
 9855	return __io_uring_add_tctx_node(ctx);
 9856}
 9857
 9858/*
 9859 * Remove this io_uring_file -> task mapping.
 9860 */
 9861static __cold void io_uring_del_tctx_node(unsigned long index)
 9862{
 9863	struct io_uring_task *tctx = current->io_uring;
 9864	struct io_tctx_node *node;
 9865
 9866	if (!tctx)
 9867		return;
 9868	node = xa_erase(&tctx->xa, index);
 9869	if (!node)
 9870		return;
 9871
 9872	WARN_ON_ONCE(current != node->task);
 9873	WARN_ON_ONCE(list_empty(&node->ctx_node));
 9874
 9875	mutex_lock(&node->ctx->uring_lock);
 9876	list_del(&node->ctx_node);
 9877	mutex_unlock(&node->ctx->uring_lock);
 9878
 9879	if (tctx->last == node->ctx)
 9880		tctx->last = NULL;
 9881	kfree(node);
 9882}
 9883
 9884static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
 9885{
 9886	struct io_wq *wq = tctx->io_wq;
 9887	struct io_tctx_node *node;
 9888	unsigned long index;
 9889
 9890	xa_for_each(&tctx->xa, index, node) {
 9891		io_uring_del_tctx_node(index);
 9892		cond_resched();
 9893	}
 9894	if (wq) {
 9895		/*
 9896		 * Must be after io_uring_del_tctx_node() (removes nodes under
 9897		 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
 9898		 */
 9899		io_wq_put_and_exit(wq);
 9900		tctx->io_wq = NULL;
 9901	}
 9902}
 9903
 9904static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
 9905{
 9906	if (tracked)
 9907		return atomic_read(&tctx->inflight_tracked);
 9908	return percpu_counter_sum(&tctx->inflight);
 9909}
 9910
 9911/*
 9912 * Find any io_uring ctx that this task has registered or done IO on, and cancel
 9913 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
 9914 */
 9915static __cold void io_uring_cancel_generic(bool cancel_all,
 9916					   struct io_sq_data *sqd)
 9917{
 9918	struct io_uring_task *tctx = current->io_uring;
 9919	struct io_ring_ctx *ctx;
 9920	s64 inflight;
 9921	DEFINE_WAIT(wait);
 9922
 9923	WARN_ON_ONCE(sqd && sqd->thread != current);
 9924
 9925	if (!current->io_uring)
 9926		return;
 9927	if (tctx->io_wq)
 9928		io_wq_exit_start(tctx->io_wq);
 9929
 9930	atomic_inc(&tctx->in_idle);
 9931	do {
 9932		io_uring_drop_tctx_refs(current);
 9933		/* read completions before cancelations */
 9934		inflight = tctx_inflight(tctx, !cancel_all);
 9935		if (!inflight)
 9936			break;
 9937
 9938		if (!sqd) {
 9939			struct io_tctx_node *node;
 9940			unsigned long index;
 9941
 9942			xa_for_each(&tctx->xa, index, node) {
 9943				/* sqpoll task will cancel all its requests */
 9944				if (node->ctx->sq_data)
 9945					continue;
 9946				io_uring_try_cancel_requests(node->ctx, current,
 9947							     cancel_all);
 9948			}
 9949		} else {
 9950			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 9951				io_uring_try_cancel_requests(ctx, current,
 9952							     cancel_all);
 9953		}
 9954
 9955		prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
 9956		io_run_task_work();
 9957		io_uring_drop_tctx_refs(current);
 9958
 9959		/*
 9960		 * If we've seen completions, retry without waiting. This
 9961		 * avoids a race where a completion comes in before we did
 9962		 * prepare_to_wait().
 9963		 */
 9964		if (inflight == tctx_inflight(tctx, !cancel_all))
 9965			schedule();
 9966		finish_wait(&tctx->wait, &wait);
 9967	} while (1);
 9968
 9969	io_uring_clean_tctx(tctx);
 9970	if (cancel_all) {
 9971		/*
 9972		 * We shouldn't run task_works after cancel, so just leave
 9973		 * ->in_idle set for normal exit.
 9974		 */
 9975		atomic_dec(&tctx->in_idle);
 9976		/* for exec all current's requests should be gone, kill tctx */
 9977		__io_uring_free(current);
 9978	}
 9979}
 9980
 9981void __io_uring_cancel(bool cancel_all)
 9982{
 9983	io_uring_cancel_generic(cancel_all, NULL);
 9984}
 9985
 9986static void *io_uring_validate_mmap_request(struct file *file,
 9987					    loff_t pgoff, size_t sz)
 9988{
 9989	struct io_ring_ctx *ctx = file->private_data;
 9990	loff_t offset = pgoff << PAGE_SHIFT;
 9991	struct page *page;
 9992	void *ptr;
 9993
 9994	switch (offset) {
 9995	case IORING_OFF_SQ_RING:
 9996	case IORING_OFF_CQ_RING:
 9997		ptr = ctx->rings;
 9998		break;
 9999	case IORING_OFF_SQES:
10000		ptr = ctx->sq_sqes;
10001		break;
10002	default:
10003		return ERR_PTR(-EINVAL);
10004	}
10005
10006	page = virt_to_head_page(ptr);
10007	if (sz > page_size(page))
10008		return ERR_PTR(-EINVAL);
10009
10010	return ptr;
10011}
10012
10013#ifdef CONFIG_MMU
10014
10015static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
10016{
10017	size_t sz = vma->vm_end - vma->vm_start;
10018	unsigned long pfn;
10019	void *ptr;
10020
10021	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
10022	if (IS_ERR(ptr))
10023		return PTR_ERR(ptr);
10024
10025	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
10026	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
10027}
10028
10029#else /* !CONFIG_MMU */
10030
10031static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
10032{
10033	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
10034}
10035
10036static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
10037{
10038	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
10039}
10040
/*
 * ->get_unmapped_area() (!MMU): the mapping address is simply the kernel
 * address of the requested ring region, or the error code from validation.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *region = io_uring_validate_mmap_request(file, pgoff, len);

	return IS_ERR(region) ? PTR_ERR(region) : (unsigned long) region;
}
10053
10054#endif /* !CONFIG_MMU */
10055
10056static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
10057{
10058	DEFINE_WAIT(wait);
10059
10060	do {
10061		if (!io_sqring_full(ctx))
10062			break;
10063		prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
10064
10065		if (!io_sqring_full(ctx))
10066			break;
10067		schedule();
10068	} while (!signal_pending(current));
10069
10070	finish_wait(&ctx->sqo_sq_wait, &wait);
10071	return 0;
10072}
10073
10074static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
10075			  struct __kernel_timespec __user **ts,
10076			  const sigset_t __user **sig)
10077{
10078	struct io_uring_getevents_arg arg;
10079
10080	/*
10081	 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
10082	 * is just a pointer to the sigset_t.
10083	 */
10084	if (!(flags & IORING_ENTER_EXT_ARG)) {
10085		*sig = (const sigset_t __user *) argp;
10086		*ts = NULL;
10087		return 0;
10088	}
10089
10090	/*
10091	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
10092	 * timespec and sigset_t pointers if good.
10093	 */
10094	if (*argsz != sizeof(arg))
10095		return -EINVAL;
10096	if (copy_from_user(&arg, argp, sizeof(arg)))
10097		return -EFAULT;
10098	*sig = u64_to_user_ptr(arg.sigmask);
10099	*argsz = arg.sigmask_sz;
10100	*ts = u64_to_user_ptr(arg.ts);
10101	return 0;
10102}
10103
10104SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
10105		u32, min_complete, u32, flags, const void __user *, argp,
10106		size_t, argsz)
10107{
10108	struct io_ring_ctx *ctx;
10109	int submitted = 0;
10110	struct fd f;
10111	long ret;
10112
10113	io_run_task_work();
10114
10115	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
10116			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
10117		return -EINVAL;
10118
10119	f = fdget(fd);
10120	if (unlikely(!f.file))
10121		return -EBADF;
10122
10123	ret = -EOPNOTSUPP;
10124	if (unlikely(f.file->f_op != &io_uring_fops))
10125		goto out_fput;
10126
10127	ret = -ENXIO;
10128	ctx = f.file->private_data;
10129	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
10130		goto out_fput;
10131
10132	ret = -EBADFD;
10133	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
10134		goto out;
10135
10136	/*
10137	 * For SQ polling, the thread will do all submissions and completions.
10138	 * Just return the requested submit count, and wake the thread if
10139	 * we were asked to.
10140	 */
10141	ret = 0;
10142	if (ctx->flags & IORING_SETUP_SQPOLL) {
10143		io_cqring_overflow_flush(ctx);
10144
10145		if (unlikely(ctx->sq_data->thread == NULL)) {
10146			ret = -EOWNERDEAD;
10147			goto out;
10148		}
10149		if (flags & IORING_ENTER_SQ_WAKEUP)
10150			wake_up(&ctx->sq_data->wait);
10151		if (flags & IORING_ENTER_SQ_WAIT) {
10152			ret = io_sqpoll_wait_sq(ctx);
10153			if (ret)
10154				goto out;
10155		}
10156		submitted = to_submit;
10157	} else if (to_submit) {
10158		ret = io_uring_add_tctx_node(ctx);
10159		if (unlikely(ret))
10160			goto out;
10161		mutex_lock(&ctx->uring_lock);
10162		submitted = io_submit_sqes(ctx, to_submit);
10163		mutex_unlock(&ctx->uring_lock);
10164
10165		if (submitted != to_submit)
10166			goto out;
10167	}
10168	if (flags & IORING_ENTER_GETEVENTS) {
10169		const sigset_t __user *sig;
10170		struct __kernel_timespec __user *ts;
10171
10172		ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
10173		if (unlikely(ret))
10174			goto out;
10175
10176		min_complete = min(min_complete, ctx->cq_entries);
10177
10178		/*
10179		 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
10180		 * space applications don't need to do io completion events
10181		 * polling again, they can rely on io_sq_thread to do polling
10182		 * work, which can reduce cpu usage and uring_lock contention.
10183		 */
10184		if (ctx->flags & IORING_SETUP_IOPOLL &&
10185		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
10186			ret = io_iopoll_check(ctx, min_complete);
10187		} else {
10188			ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
10189		}
10190	}
10191
10192out:
10193	percpu_ref_put(&ctx->refs);
10194out_fput:
10195	fdput(f);
10196	return submitted ? submitted : ret;
10197}
10198
10199#ifdef CONFIG_PROC_FS
10200static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
10201		const struct cred *cred)
10202{
10203	struct user_namespace *uns = seq_user_ns(m);
10204	struct group_info *gi;
10205	kernel_cap_t cap;
10206	unsigned __capi;
10207	int g;
10208
10209	seq_printf(m, "%5d\n", id);
10210	seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
10211	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
10212	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
10213	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
10214	seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
10215	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
10216	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
10217	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
10218	seq_puts(m, "\n\tGroups:\t");
10219	gi = cred->group_info;
10220	for (g = 0; g < gi->ngroups; g++) {
10221		seq_put_decimal_ull(m, g ? " " : "",
10222					from_kgid_munged(uns, gi->gid[g]));
10223	}
10224	seq_puts(m, "\n\tCapEff:\t");
10225	cap = cred->cap_effective;
10226	CAP_FOR_EACH_U32(__capi)
10227		seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
10228	seq_putc(m, '\n');
10229	return 0;
10230}
10231
10232static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
10233					  struct seq_file *m)
10234{
10235	struct io_sq_data *sq = NULL;
10236	struct io_overflow_cqe *ocqe;
10237	struct io_rings *r = ctx->rings;
10238	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
10239	unsigned int sq_head = READ_ONCE(r->sq.head);
10240	unsigned int sq_tail = READ_ONCE(r->sq.tail);
10241	unsigned int cq_head = READ_ONCE(r->cq.head);
10242	unsigned int cq_tail = READ_ONCE(r->cq.tail);
10243	unsigned int sq_entries, cq_entries;
10244	bool has_lock;
10245	unsigned int i;
10246
10247	/*
10248	 * we may get imprecise sqe and cqe info if uring is actively running
10249	 * since we get cached_sq_head and cached_cq_tail without uring_lock
10250	 * and sq_tail and cq_head are changed by userspace. But it's ok since
10251	 * we usually use these info when it is stuck.
10252	 */
10253	seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
10254	seq_printf(m, "SqHead:\t%u\n", sq_head);
10255	seq_printf(m, "SqTail:\t%u\n", sq_tail);
10256	seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
10257	seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
10258	seq_printf(m, "CqHead:\t%u\n", cq_head);
10259	seq_printf(m, "CqTail:\t%u\n", cq_tail);
10260	seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
10261	seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
10262	sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
10263	for (i = 0; i < sq_entries; i++) {
10264		unsigned int entry = i + sq_head;
10265		unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
10266		struct io_uring_sqe *sqe;
10267
10268		if (sq_idx > sq_mask)
10269			continue;
10270		sqe = &ctx->sq_sqes[sq_idx];
10271		seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
10272			   sq_idx, sqe->opcode, sqe->fd, sqe->flags,
10273			   sqe->user_data);
10274	}
10275	seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
10276	cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
10277	for (i = 0; i < cq_entries; i++) {
10278		unsigned int entry = i + cq_head;
10279		struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
10280
10281		seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
10282			   entry & cq_mask, cqe->user_data, cqe->res,
10283			   cqe->flags);
10284	}
10285
10286	/*
10287	 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
10288	 * since fdinfo case grabs it in the opposite direction of normal use
10289	 * cases. If we fail to get the lock, we just don't iterate any
10290	 * structures that could be going away outside the io_uring mutex.
10291	 */
10292	has_lock = mutex_trylock(&ctx->uring_lock);
10293
10294	if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
10295		sq = ctx->sq_data;
10296		if (!sq->thread)
10297			sq = NULL;
10298	}
10299
10300	seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
10301	seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
10302	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
10303	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
10304		struct file *f = io_file_from_index(ctx, i);
10305
10306		if (f)
10307			seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
10308		else
10309			seq_printf(m, "%5u: <none>\n", i);
10310	}
10311	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
10312	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
10313		struct io_mapped_ubuf *buf = ctx->user_bufs[i];
10314		unsigned int len = buf->ubuf_end - buf->ubuf;
10315
10316		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
10317	}
10318	if (has_lock && !xa_empty(&ctx->personalities)) {
10319		unsigned long index;
10320		const struct cred *cred;
10321
10322		seq_printf(m, "Personalities:\n");
10323		xa_for_each(&ctx->personalities, index, cred)
10324			io_uring_show_cred(m, index, cred);
10325	}
10326	if (has_lock)
10327		mutex_unlock(&ctx->uring_lock);
10328
10329	seq_puts(m, "PollList:\n");
10330	spin_lock(&ctx->completion_lock);
10331	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
10332		struct hlist_head *list = &ctx->cancel_hash[i];
10333		struct io_kiocb *req;
10334
10335		hlist_for_each_entry(req, list, hash_node)
10336			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
10337					req->task->task_works != NULL);
10338	}
10339
10340	seq_puts(m, "CqOverflowList:\n");
10341	list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
10342		struct io_uring_cqe *cqe = &ocqe->cqe;
10343
10344		seq_printf(m, "  user_data=%llu, res=%d, flags=%x\n",
10345			   cqe->user_data, cqe->res, cqe->flags);
10346
10347	}
10348
10349	spin_unlock(&ctx->completion_lock);
10350}
10351
10352static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
10353{
10354	struct io_ring_ctx *ctx = f->private_data;
10355
10356	if (percpu_ref_tryget(&ctx->refs)) {
10357		__io_uring_show_fdinfo(ctx, m);
10358		percpu_ref_put(&ctx->refs);
10359	}
10360}
10361#endif
10362
10363static const struct file_operations io_uring_fops = {
10364	.release	= io_uring_release,
10365	.mmap		= io_uring_mmap,
10366#ifndef CONFIG_MMU
10367	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
10368	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
10369#endif
10370	.poll		= io_uring_poll,
10371#ifdef CONFIG_PROC_FS
10372	.show_fdinfo	= io_uring_show_fdinfo,
10373#endif
10374};
10375
10376static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
10377					 struct io_uring_params *p)
10378{
10379	struct io_rings *rings;
10380	size_t size, sq_array_offset;
10381
10382	/* make sure these are sane, as we already accounted them */
10383	ctx->sq_entries = p->sq_entries;
10384	ctx->cq_entries = p->cq_entries;
10385
10386	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
10387	if (size == SIZE_MAX)
10388		return -EOVERFLOW;
10389
10390	rings = io_mem_alloc(size);
10391	if (!rings)
10392		return -ENOMEM;
10393
10394	ctx->rings = rings;
10395	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
10396	rings->sq_ring_mask = p->sq_entries - 1;
10397	rings->cq_ring_mask = p->cq_entries - 1;
10398	rings->sq_ring_entries = p->sq_entries;
10399	rings->cq_ring_entries = p->cq_entries;
10400
10401	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
10402	if (size == SIZE_MAX) {
10403		io_mem_free(ctx->rings);
10404		ctx->rings = NULL;
10405		return -EOVERFLOW;
10406	}
10407
10408	ctx->sq_sqes = io_mem_alloc(size);
10409	if (!ctx->sq_sqes) {
10410		io_mem_free(ctx->rings);
10411		ctx->rings = NULL;
10412		return -ENOMEM;
10413	}
10414
10415	return 0;
10416}
10417
10418static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
10419{
10420	int ret, fd;
10421
10422	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
10423	if (fd < 0)
10424		return fd;
10425
10426	ret = io_uring_add_tctx_node(ctx);
10427	if (ret) {
10428		put_unused_fd(fd);
10429		return ret;
10430	}
10431	fd_install(fd, file);
10432	return fd;
10433}
10434
10435/*
10436 * Allocate an anonymous fd, this is what constitutes the application
10437 * visible backing of an io_uring instance. The application mmaps this
10438 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
10439 * we have to tie this fd to a socket for file garbage collection purposes.
10440 */
10441static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
10442{
10443	struct file *file;
10444#if defined(CONFIG_UNIX)
10445	int ret;
10446
10447	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
10448				&ctx->ring_sock);
10449	if (ret)
10450		return ERR_PTR(ret);
10451#endif
10452
10453	file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
10454					 O_RDWR | O_CLOEXEC, NULL);
10455#if defined(CONFIG_UNIX)
10456	if (IS_ERR(file)) {
10457		sock_release(ctx->ring_sock);
10458		ctx->ring_sock = NULL;
10459	} else {
10460		ctx->ring_sock->file = file;
10461	}
10462#endif
10463	return file;
10464}
10465
10466static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
10467				  struct io_uring_params __user *params)
10468{
10469	struct io_ring_ctx *ctx;
10470	struct file *file;
10471	int ret;
10472
10473	if (!entries)
10474		return -EINVAL;
10475	if (entries > IORING_MAX_ENTRIES) {
10476		if (!(p->flags & IORING_SETUP_CLAMP))
10477			return -EINVAL;
10478		entries = IORING_MAX_ENTRIES;
10479	}
10480
10481	/*
10482	 * Use twice as many entries for the CQ ring. It's possible for the
10483	 * application to drive a higher depth than the size of the SQ ring,
10484	 * since the sqes are only used at submission time. This allows for
10485	 * some flexibility in overcommitting a bit. If the application has
10486	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
10487	 * of CQ ring entries manually.
10488	 */
10489	p->sq_entries = roundup_pow_of_two(entries);
10490	if (p->flags & IORING_SETUP_CQSIZE) {
10491		/*
10492		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
10493		 * to a power-of-two, if it isn't already. We do NOT impose
10494		 * any cq vs sq ring sizing.
10495		 */
10496		if (!p->cq_entries)
10497			return -EINVAL;
10498		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
10499			if (!(p->flags & IORING_SETUP_CLAMP))
10500				return -EINVAL;
10501			p->cq_entries = IORING_MAX_CQ_ENTRIES;
10502		}
10503		p->cq_entries = roundup_pow_of_two(p->cq_entries);
10504		if (p->cq_entries < p->sq_entries)
10505			return -EINVAL;
10506	} else {
10507		p->cq_entries = 2 * p->sq_entries;
10508	}
10509
10510	ctx = io_ring_ctx_alloc(p);
10511	if (!ctx)
10512		return -ENOMEM;
10513	ctx->compat = in_compat_syscall();
10514	if (!capable(CAP_IPC_LOCK))
10515		ctx->user = get_uid(current_user());
10516
10517	/*
10518	 * This is just grabbed for accounting purposes. When a process exits,
10519	 * the mm is exited and dropped before the files, hence we need to hang
10520	 * on to this mm purely for the purposes of being able to unaccount
10521	 * memory (locked/pinned vm). It's not used for anything else.
10522	 */
10523	mmgrab(current->mm);
10524	ctx->mm_account = current->mm;
10525
10526	ret = io_allocate_scq_urings(ctx, p);
10527	if (ret)
10528		goto err;
10529
10530	ret = io_sq_offload_create(ctx, p);
10531	if (ret)
10532		goto err;
10533	/* always set a rsrc node */
10534	ret = io_rsrc_node_switch_start(ctx);
10535	if (ret)
10536		goto err;
10537	io_rsrc_node_switch(ctx, NULL);
10538
10539	memset(&p->sq_off, 0, sizeof(p->sq_off));
10540	p->sq_off.head = offsetof(struct io_rings, sq.head);
10541	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
10542	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
10543	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
10544	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
10545	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
10546	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
10547
10548	memset(&p->cq_off, 0, sizeof(p->cq_off));
10549	p->cq_off.head = offsetof(struct io_rings, cq.head);
10550	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
10551	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
10552	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
10553	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
10554	p->cq_off.cqes = offsetof(struct io_rings, cqes);
10555	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
10556
10557	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
10558			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
10559			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
10560			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
10561			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
10562			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP;
10563
10564	if (copy_to_user(params, p, sizeof(*p))) {
10565		ret = -EFAULT;
10566		goto err;
10567	}
10568
10569	file = io_uring_get_file(ctx);
10570	if (IS_ERR(file)) {
10571		ret = PTR_ERR(file);
10572		goto err;
10573	}
10574
10575	/*
10576	 * Install ring fd as the very last thing, so we don't risk someone
10577	 * having closed it before we finish setup
10578	 */
10579	ret = io_uring_install_fd(ctx, file);
10580	if (ret < 0) {
10581		/* fput will clean it up */
10582		fput(file);
10583		return ret;
10584	}
10585
10586	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
10587	return ret;
10588err:
10589	io_ring_ctx_wait_and_kill(ctx);
10590	return ret;
10591}
10592
10593/*
10594 * Sets up an aio uring context, and returns the fd. Applications asks for a
10595 * ring size, we return the actual sq/cq ring sizes (among other things) in the
10596 * params structure passed in.
10597 */
10598static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
10599{
10600	struct io_uring_params p;
10601	int i;
10602
10603	if (copy_from_user(&p, params, sizeof(p)))
10604		return -EFAULT;
10605	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
10606		if (p.resv[i])
10607			return -EINVAL;
10608	}
10609
10610	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
10611			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
10612			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
10613			IORING_SETUP_R_DISABLED))
10614		return -EINVAL;
10615
10616	return  io_uring_create(entries, &p, params);
10617}
10618
/*
 * io_uring_setup() system call entry point. Passes @entries and the user
 * pointer @params unchanged to the static io_uring_setup() helper and
 * returns its result.
 */
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	return io_uring_setup(entries, params);
}
10624
10625static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
10626			   unsigned nr_args)
10627{
10628	struct io_uring_probe *p;
10629	size_t size;
10630	int i, ret;
10631
10632	size = struct_size(p, ops, nr_args);
10633	if (size == SIZE_MAX)
10634		return -EOVERFLOW;
10635	p = kzalloc(size, GFP_KERNEL);
10636	if (!p)
10637		return -ENOMEM;
10638
10639	ret = -EFAULT;
10640	if (copy_from_user(p, arg, size))
10641		goto out;
10642	ret = -EINVAL;
10643	if (memchr_inv(p, 0, size))
10644		goto out;
10645
10646	p->last_op = IORING_OP_LAST - 1;
10647	if (nr_args > IORING_OP_LAST)
10648		nr_args = IORING_OP_LAST;
10649
10650	for (i = 0; i < nr_args; i++) {
10651		p->ops[i].op = i;
10652		if (!io_op_defs[i].not_supported)
10653			p->ops[i].flags = IO_URING_OP_SUPPORTED;
10654	}
10655	p->ops_len = i;
10656
10657	ret = 0;
10658	if (copy_to_user(arg, p, size))
10659		ret = -EFAULT;
10660out:
10661	kfree(p);
10662	return ret;
10663}
10664
10665static int io_register_personality(struct io_ring_ctx *ctx)
10666{
10667	const struct cred *creds;
10668	u32 id;
10669	int ret;
10670
10671	creds = get_current_cred();
10672
10673	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
10674			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
10675	if (ret < 0) {
10676		put_cred(creds);
10677		return ret;
10678	}
10679	return id;
10680}
10681
10682static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
10683					   void __user *arg, unsigned int nr_args)
10684{
10685	struct io_uring_restriction *res;
10686	size_t size;
10687	int i, ret;
10688
10689	/* Restrictions allowed only if rings started disabled */
10690	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10691		return -EBADFD;
10692
10693	/* We allow only a single restrictions registration */
10694	if (ctx->restrictions.registered)
10695		return -EBUSY;
10696
10697	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
10698		return -EINVAL;
10699
10700	size = array_size(nr_args, sizeof(*res));
10701	if (size == SIZE_MAX)
10702		return -EOVERFLOW;
10703
10704	res = memdup_user(arg, size);
10705	if (IS_ERR(res))
10706		return PTR_ERR(res);
10707
10708	ret = 0;
10709
10710	for (i = 0; i < nr_args; i++) {
10711		switch (res[i].opcode) {
10712		case IORING_RESTRICTION_REGISTER_OP:
10713			if (res[i].register_op >= IORING_REGISTER_LAST) {
10714				ret = -EINVAL;
10715				goto out;
10716			}
10717
10718			__set_bit(res[i].register_op,
10719				  ctx->restrictions.register_op);
10720			break;
10721		case IORING_RESTRICTION_SQE_OP:
10722			if (res[i].sqe_op >= IORING_OP_LAST) {
10723				ret = -EINVAL;
10724				goto out;
10725			}
10726
10727			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
10728			break;
10729		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
10730			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
10731			break;
10732		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
10733			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
10734			break;
10735		default:
10736			ret = -EINVAL;
10737			goto out;
10738		}
10739	}
10740
10741out:
10742	/* Reset all restrictions if an error happened */
10743	if (ret != 0)
10744		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
10745	else
10746		ctx->restrictions.registered = true;
10747
10748	kfree(res);
10749	return ret;
10750}
10751
10752static int io_register_enable_rings(struct io_ring_ctx *ctx)
10753{
10754	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10755		return -EBADFD;
10756
10757	if (ctx->restrictions.registered)
10758		ctx->restricted = 1;
10759
10760	ctx->flags &= ~IORING_SETUP_R_DISABLED;
10761	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
10762		wake_up(&ctx->sq_data->wait);
10763	return 0;
10764}
10765
10766static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
10767				     struct io_uring_rsrc_update2 *up,
10768				     unsigned nr_args)
10769{
10770	__u32 tmp;
10771	int err;
10772
10773	if (up->resv)
10774		return -EINVAL;
10775	if (check_add_overflow(up->offset, nr_args, &tmp))
10776		return -EOVERFLOW;
10777	err = io_rsrc_node_switch_start(ctx);
10778	if (err)
10779		return err;
10780
10781	switch (type) {
10782	case IORING_RSRC_FILE:
10783		return __io_sqe_files_update(ctx, up, nr_args);
10784	case IORING_RSRC_BUFFER:
10785		return __io_sqe_buffers_update(ctx, up, nr_args);
10786	}
10787	return -EINVAL;
10788}
10789
10790static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
10791				    unsigned nr_args)
10792{
10793	struct io_uring_rsrc_update2 up;
10794
10795	if (!nr_args)
10796		return -EINVAL;
10797	memset(&up, 0, sizeof(up));
10798	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
10799		return -EFAULT;
10800	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
10801}
10802
10803static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
10804				   unsigned size, unsigned type)
10805{
10806	struct io_uring_rsrc_update2 up;
10807
10808	if (size != sizeof(up))
10809		return -EINVAL;
10810	if (copy_from_user(&up, arg, sizeof(up)))
10811		return -EFAULT;
10812	if (!up.nr || up.resv)
10813		return -EINVAL;
10814	return __io_register_rsrc_update(ctx, type, &up, up.nr);
10815}
10816
10817static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
10818			    unsigned int size, unsigned int type)
10819{
10820	struct io_uring_rsrc_register rr;
10821
10822	/* keep it extendible */
10823	if (size != sizeof(rr))
10824		return -EINVAL;
10825
10826	memset(&rr, 0, sizeof(rr));
10827	if (copy_from_user(&rr, arg, size))
10828		return -EFAULT;
10829	if (!rr.nr || rr.resv || rr.resv2)
10830		return -EINVAL;
10831
10832	switch (type) {
10833	case IORING_RSRC_FILE:
10834		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
10835					     rr.nr, u64_to_user_ptr(rr.tags));
10836	case IORING_RSRC_BUFFER:
10837		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
10838					       rr.nr, u64_to_user_ptr(rr.tags));
10839	}
10840	return -EINVAL;
10841}
10842
10843static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
10844				       void __user *arg, unsigned len)
10845{
10846	struct io_uring_task *tctx = current->io_uring;
10847	cpumask_var_t new_mask;
10848	int ret;
10849
10850	if (!tctx || !tctx->io_wq)
10851		return -EINVAL;
10852
10853	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
10854		return -ENOMEM;
10855
10856	cpumask_clear(new_mask);
10857	if (len > cpumask_size())
10858		len = cpumask_size();
10859
10860	if (copy_from_user(new_mask, arg, len)) {
10861		free_cpumask_var(new_mask);
10862		return -EFAULT;
10863	}
10864
10865	ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
10866	free_cpumask_var(new_mask);
10867	return ret;
10868}
10869
10870static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
10871{
10872	struct io_uring_task *tctx = current->io_uring;
10873
10874	if (!tctx || !tctx->io_wq)
10875		return -EINVAL;
10876
10877	return io_wq_cpu_affinity(tctx->io_wq, NULL);
10878}
10879
/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS handler.
 *
 * @arg points to two __u32 values. Each value must not exceed INT_MAX; every
 * non-zero value replaces the matching entry of ctx->iowq_limits. The array
 * is then handed to io_wq_max_workers() for the relevant io_wq (the SQPOLL
 * thread's for an SQPOLL ring, otherwise the calling task's) and copied back
 * to @arg afterwards; if there was no io_wq to apply it to, zeroes are
 * copied back instead. NOTE(review): io_wq_max_workers() presumably writes
 * the previous limits into the array - confirm against io-wq.
 *
 * For a ring without SQ poll data the stored limits are finally applied to
 * the io_wq of every task on ctx->tctx_list.
 *
 * Must be called with ctx->uring_lock held. For an SQPOLL ring the lock is
 * dropped and retaken so that sqd->lock can be acquired first.
 *
 * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL for an
 * out-of-range value, or the error returned by io_wq_max_workers().
 */
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	/* a zero entry leaves the stored limit for that slot untouched */
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		/* no io_wq to query: report zeroes back to userspace */
		memset(new_count, 0, sizeof(new_count));
	}

	/* release sqd->lock and the extra sqd reference taken above */
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		/* note: shadows the function-level tctx */
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		/* reload each time, new_count is handed to the callee by pointer */
		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	/* same sqd cleanup as on the success path */
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}
10962
10963static bool io_register_op_must_quiesce(int op)
10964{
10965	switch (op) {
10966	case IORING_REGISTER_BUFFERS:
10967	case IORING_UNREGISTER_BUFFERS:
10968	case IORING_REGISTER_FILES:
10969	case IORING_UNREGISTER_FILES:
10970	case IORING_REGISTER_FILES_UPDATE:
10971	case IORING_REGISTER_PROBE:
10972	case IORING_REGISTER_PERSONALITY:
10973	case IORING_UNREGISTER_PERSONALITY:
10974	case IORING_REGISTER_FILES2:
10975	case IORING_REGISTER_FILES_UPDATE2:
10976	case IORING_REGISTER_BUFFERS2:
10977	case IORING_REGISTER_BUFFERS_UPDATE:
10978	case IORING_REGISTER_IOWQ_AFF:
10979	case IORING_UNREGISTER_IOWQ_AFF:
10980	case IORING_REGISTER_IOWQ_MAX_WORKERS:
10981		return false;
10982	default:
10983		return true;
10984	}
10985}
10986
10987static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
10988{
10989	long ret;
10990
10991	percpu_ref_kill(&ctx->refs);
10992
10993	/*
10994	 * Drop uring mutex before waiting for references to exit. If another
10995	 * thread is currently inside io_uring_enter() it might need to grab the
10996	 * uring_lock to make progress. If we hold it here across the drain
10997	 * wait, then we can deadlock. It's safe to drop the mutex here, since
10998	 * no new references will come in after we've killed the percpu ref.
10999	 */
11000	mutex_unlock(&ctx->uring_lock);
11001	do {
11002		ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
11003		if (ret) {
11004			ret = min(0L, ret);
11005			break;
11006		}
11007
11008		ret = io_run_task_work_sig();
11009		io_req_caches_free(ctx);
11010	} while (ret >= 0);
11011	mutex_lock(&ctx->uring_lock);
11012
11013	if (ret)
11014		io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
11015	return ret;
11016}
11017
11018static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
11019			       void __user *arg, unsigned nr_args)
11020	__releases(ctx->uring_lock)
11021	__acquires(ctx->uring_lock)
11022{
11023	int ret;
11024
11025	/*
11026	 * We're inside the ring mutex, if the ref is already dying, then
11027	 * someone else killed the ctx or is already going through
11028	 * io_uring_register().
11029	 */
11030	if (percpu_ref_is_dying(&ctx->refs))
11031		return -ENXIO;
11032
11033	if (ctx->restricted) {
11034		if (opcode >= IORING_REGISTER_LAST)
11035			return -EINVAL;
11036		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
11037		if (!test_bit(opcode, ctx->restrictions.register_op))
11038			return -EACCES;
11039	}
11040
11041	if (io_register_op_must_quiesce(opcode)) {
11042		ret = io_ctx_quiesce(ctx);
11043		if (ret)
11044			return ret;
11045	}
11046
11047	switch (opcode) {
11048	case IORING_REGISTER_BUFFERS:
11049		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
11050		break;
11051	case IORING_UNREGISTER_BUFFERS:
11052		ret = -EINVAL;
11053		if (arg || nr_args)
11054			break;
11055		ret = io_sqe_buffers_unregister(ctx);
11056		break;
11057	case IORING_REGISTER_FILES:
11058		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
11059		break;
11060	case IORING_UNREGISTER_FILES:
11061		ret = -EINVAL;
11062		if (arg || nr_args)
11063			break;
11064		ret = io_sqe_files_unregister(ctx);
11065		break;
11066	case IORING_REGISTER_FILES_UPDATE:
11067		ret = io_register_files_update(ctx, arg, nr_args);
11068		break;
11069	case IORING_REGISTER_EVENTFD:
11070	case IORING_REGISTER_EVENTFD_ASYNC:
11071		ret = -EINVAL;
11072		if (nr_args != 1)
11073			break;
11074		ret = io_eventfd_register(ctx, arg);
11075		if (ret)
11076			break;
11077		if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
11078			ctx->eventfd_async = 1;
11079		else
11080			ctx->eventfd_async = 0;
11081		break;
11082	case IORING_UNREGISTER_EVENTFD:
11083		ret = -EINVAL;
11084		if (arg || nr_args)
11085			break;
11086		ret = io_eventfd_unregister(ctx);
11087		break;
11088	case IORING_REGISTER_PROBE:
11089		ret = -EINVAL;
11090		if (!arg || nr_args > 256)
11091			break;
11092		ret = io_probe(ctx, arg, nr_args);
11093		break;
11094	case IORING_REGISTER_PERSONALITY:
11095		ret = -EINVAL;
11096		if (arg || nr_args)
11097			break;
11098		ret = io_register_personality(ctx);
11099		break;
11100	case IORING_UNREGISTER_PERSONALITY:
11101		ret = -EINVAL;
11102		if (arg)
11103			break;
11104		ret = io_unregister_personality(ctx, nr_args);
11105		break;
11106	case IORING_REGISTER_ENABLE_RINGS:
11107		ret = -EINVAL;
11108		if (arg || nr_args)
11109			break;
11110		ret = io_register_enable_rings(ctx);
11111		break;
11112	case IORING_REGISTER_RESTRICTIONS:
11113		ret = io_register_restrictions(ctx, arg, nr_args);
11114		break;
11115	case IORING_REGISTER_FILES2:
11116		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
11117		break;
11118	case IORING_REGISTER_FILES_UPDATE2:
11119		ret = io_register_rsrc_update(ctx, arg, nr_args,
11120					      IORING_RSRC_FILE);
11121		break;
11122	case IORING_REGISTER_BUFFERS2:
11123		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
11124		break;
11125	case IORING_REGISTER_BUFFERS_UPDATE:
11126		ret = io_register_rsrc_update(ctx, arg, nr_args,
11127					      IORING_RSRC_BUFFER);
11128		break;
11129	case IORING_REGISTER_IOWQ_AFF:
11130		ret = -EINVAL;
11131		if (!arg || !nr_args)
11132			break;
11133		ret = io_register_iowq_aff(ctx, arg, nr_args);
11134		break;
11135	case IORING_UNREGISTER_IOWQ_AFF:
11136		ret = -EINVAL;
11137		if (arg || nr_args)
11138			break;
11139		ret = io_unregister_iowq_aff(ctx);
11140		break;
11141	case IORING_REGISTER_IOWQ_MAX_WORKERS:
11142		ret = -EINVAL;
11143		if (!arg || nr_args != 2)
11144			break;
11145		ret = io_register_iowq_max_workers(ctx, arg);
11146		break;
11147	default:
11148		ret = -EINVAL;
11149		break;
11150	}
11151
11152	if (io_register_op_must_quiesce(opcode)) {
11153		/* bring the ctx back to life */
11154		percpu_ref_reinit(&ctx->refs);
11155		reinit_completion(&ctx->ref_comp);
11156	}
11157	return ret;
11158}
11159
11160SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
11161		void __user *, arg, unsigned int, nr_args)
11162{
11163	struct io_ring_ctx *ctx;
11164	long ret = -EBADF;
11165	struct fd f;
11166
11167	f = fdget(fd);
11168	if (!f.file)
11169		return -EBADF;
11170
11171	ret = -EOPNOTSUPP;
11172	if (f.file->f_op != &io_uring_fops)
11173		goto out_fput;
11174
11175	ctx = f.file->private_data;
11176
11177	io_run_task_work();
11178
11179	mutex_lock(&ctx->uring_lock);
11180	ret = __io_uring_register(ctx, opcode, arg, nr_args);
11181	mutex_unlock(&ctx->uring_lock);
11182	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
11183							ctx->cq_ev_fd != NULL, ret);
11184out_fput:
11185	fdput(f);
11186	return ret;
11187}
11188
11189static int __init io_uring_init(void)
11190{
11191#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
11192	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
11193	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
11194} while (0)
11195
11196#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
11197	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
11198	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
11199	BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
11200	BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
11201	BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
11202	BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
11203	BUILD_BUG_SQE_ELEM(8,  __u64,  off);
11204	BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
11205	BUILD_BUG_SQE_ELEM(16, __u64,  addr);
11206	BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
11207	BUILD_BUG_SQE_ELEM(24, __u32,  len);
11208	BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
11209	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
11210	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
11211	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
11212	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
11213	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
11214	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
11215	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
11216	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
11217	BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
11218	BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
11219	BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
11220	BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
11221	BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
11222	BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
11223	BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
11224	BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
11225	BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
11226	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
11227	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
11228	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
11229
11230	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
11231		     sizeof(struct io_uring_rsrc_update));
11232	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
11233		     sizeof(struct io_uring_rsrc_update2));
11234
11235	/* ->buf_index is u16 */
11236	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
11237
11238	/* should fit into one byte */
11239	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
11240	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
11241	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
11242
11243	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
11244	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
11245
11246	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
11247				SLAB_ACCOUNT);
11248	return 0;
11249};
11250__initcall(io_uring_init);
Configure Feed

Configure Feed