// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = memdup_user(arg, size);
	if (IS_ERR(p))
		return PTR_ERR(p);
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}
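
/*
 * Userspace-side sketch (illustrative, not part of this file):
 * IORING_REGISTER_PROBE fills a caller-supplied, zero-initialised
 * struct io_uring_probe with the set of supported opcodes. A minimal
 * raw-syscall example, assuming a ring fd from io_uring_setup() and the
 * usual libc/uapi headers:
 *
 *	unsigned nr = IORING_OP_LAST;
 *	size_t len = sizeof(struct io_uring_probe) +
 *		     nr * sizeof(struct io_uring_probe_op);
 *	struct io_uring_probe *probe = calloc(1, len);
 *
 *	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
 *		    probe, nr) == 0) {
 *		for (unsigned i = 0; i < probe->ops_len; i++)
 *			if (probe->ops[i].flags & IO_URING_OP_SUPPORTED)
 *				printf("op %u supported\n", probe->ops[i].op);
 *	}
 *	free(probe);
 */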

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			      XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}
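
/*
 * Userspace-side sketch (illustrative, not part of this file): a successful
 * IORING_REGISTER_PERSONALITY call returns the allocated id, which can later
 * be placed in sqe->personality so that the request runs with the credentials
 * that were current at registration time:
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *	if (id >= 0)
 *		sqe->personality = id;	// use the registered creds for this SQE
 *	...
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_UNREGISTER_PERSONALITY, NULL, id);
 */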

static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
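
/*
 * Userspace-side sketch (illustrative, not part of this file): restrictions
 * can only be installed while the ring is still disabled, i.e. it was created
 * with IORING_SETUP_R_DISABLED, and they take effect once the ring is enabled:
 *
 *	struct io_uring_restriction res[2] = {};
 *
 *	res[0].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[0].sqe_op = IORING_OP_READ;			// allow reads only
 *	res[1].opcode = IORING_RESTRICTION_REGISTER_OP;
 *	res[1].register_op = IORING_REGISTER_BUFFERS;	// and buffer registration
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESTRICTIONS, res, 2);
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */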

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	mutex_lock(&ctx->tctx_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	mutex_unlock(&ctx->tctx_lock);
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}
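
/*
 * Userspace-side sketch (illustrative, not part of this file): the argument
 * is a two element __u32 array, one limit for bounded and one for unbounded
 * io-wq workers. A zero entry leaves that limit untouched, and the prior
 * limits are reported back in the array on return:
 *
 *	__u32 counts[2] = { 8, 0 };	// cap bounded workers at 8
 *
 *	if (syscall(__NR_io_uring_register, ring_fd,
 *		    IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2) == 0)
 *		printf("old limits: %u bounded, %u unbounded\n",
 *		       counts[0], counts[1]);
 */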

static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}
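
/*
 * Userspace-side sketch (illustrative, not part of this file): selects the
 * clock the ring uses for completion-wait timeouts; only CLOCK_MONOTONIC and
 * CLOCK_BOOTTIME are accepted, and the reserved fields must be zero:
 *
 *	struct io_uring_clock_register reg = {
 *		.clockid = CLOCK_BOOTTIME,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_CLOCK, &reg, 0);
 */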

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx->user, &r->sq_region);
	io_free_region(ctx->user, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)

static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ctx_config config;
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	unsigned i, tail, old_head;
	struct io_uring_params *p = &config.p;
	struct io_rings_layout *rl = &config.layout;
	int ret;

	memset(&config, 0, sizeof(config));

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(p, arg, sizeof(*p)))
		return -EFAULT;
	if (p->flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p->flags |= (ctx->flags & COPY_FLAGS);

	ret = io_prepare_config(&config);
	if (unlikely(ret))
		return ret;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->rings_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret)
		return ret;

	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);

	if (copy_to_user(arg, p, sizeof(*p))) {
		io_register_free_rings(ctx, &n);
		return -EFAULT;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->sq_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p->sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p->sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p->cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p->cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);

	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}
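
/*
 * Userspace-side sketch (illustrative, not part of this file): resizing is
 * only allowed on IORING_SETUP_DEFER_TASKRUN rings. The caller passes a
 * struct io_uring_params with the desired entry counts; only the CQSIZE and
 * CLAMP flags may be set, and the kernel writes the resulting ring layout
 * back into the same struct:
 *
 *	struct io_uring_params p = {
 *		.sq_entries = 256,
 *		.cq_entries = 512,
 *		.flags = IORING_SETUP_CQSIZE,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESIZE_RINGS, &p, 1);
 *
 * On success userspace is expected to switch to the new SQ/CQ mappings
 * (or the user-provided memory with IORING_SETUP_NO_MMAP) before submitting
 * again.
 */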

static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	struct io_mapped_region region = {};
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx->user, &region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&region);
		ctx->cq_wait_size = rd.size;
	}

	io_region_publish(ctx, &region, &ctx->param_region);
	return 0;
}
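
/*
 * Userspace-side sketch (illustrative, not part of this file): registers a
 * memory region with the ring, optionally as the wait-argument region, which
 * is only accepted while the ring is still disabled:
 *
 *	struct io_uring_region_desc rd = {
 *		.user_addr = (__u64)(uintptr_t)buf,	// page-aligned buffer
 *		.size = 4096,
 *		.flags = IORING_MEM_REGION_TYPE_USER,
 *	};
 *	struct io_uring_mem_region_reg reg = {
 *		.region_uptr = (__u64)(uintptr_t)&rd,
 *		.flags = IORING_MEM_REGION_REG_WAIT_ARG,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_MEM_REGION, &reg, 1);
 */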

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_CTRL:
		ret = io_zcrx_ctrl(ctx, arg, nr_args);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}
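
/*
 * Userspace-side sketch (illustrative, not part of this file): once a ring fd
 * has been registered with IORING_REGISTER_RING_FDS, later register calls can
 * pass the registered index instead of the real fd by OR-ing
 * IORING_REGISTER_USE_REGISTERED_RING into the opcode:
 *
 *	struct io_uring_rsrc_update upd = {
 *		.offset	= -1U,		// let the kernel pick a free slot
 *		.data	= ring_fd,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RING_FDS, &upd, 1);
 *
 * On success upd.offset holds the registered index, which can then be passed
 * as the 'fd' argument of later io_uring_register() calls together with
 * IORING_REGISTER_USE_REGISTERED_RING OR-ed into the opcode.
 */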

static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (!arg || nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;
	/* no flags supported */
	if (sqe.flags)
		return -EINVAL;
	if (sqe.opcode != IORING_OP_MSG_RING)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING:
		return io_uring_register_send_msg_ring(arg, nr_args);
	case IORING_REGISTER_QUERY:
		return io_query(arg, nr_args);
	}
	return -EINVAL;
}
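
/*
 * Userspace-side sketch (illustrative, not part of this file): blind opcodes
 * are invoked with fd == -1, e.g. synchronously posting a MSG_RING CQE to
 * another task's ring without owning a ring at all:
 *
 *	struct io_uring_sqe sqe = {
 *		.opcode	= IORING_OP_MSG_RING,
 *		.fd	= target_ring_fd,
 *		.len	= 0x10,			// becomes cqe->res on the target
 *		.off	= 0xcafe,		// becomes cqe->user_data on the target
 *	};
 *
 *	syscall(__NR_io_uring_register, -1,
 *		IORING_REGISTER_SEND_MSG_RING, &sqe, 1);
 */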

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}