Merge branch 'for-6.12/io_uring' into for-6.12/io_uring-discard

+3

include/linux/io_uring_types.h

··· 239 239 struct io_rings *rings; 240 240 struct percpu_ref refs; 241 241 242 + clockid_t clockid; 243 + enum tk_offsets clock_offset; 244 + 242 245 enum task_work_notify_mode notify_method; 243 246 unsigned sq_thread_idle; 244 247 } ____cacheline_aligned_in_smp;

+28 -1

include/uapi/linux/io_uring.h

··· 440 440 * IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv 441 441 * IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct 442 442 * them from sends. 443 + * IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get 444 + * more completions. In other words, the buffer is being 445 + * partially consumed, and will be used by the kernel for 446 + * more completions. This is only set for buffers used via 447 + * the incremental buffer consumption, as provided by 448 + * a ring buffer setup with IOU_PBUF_RING_INC. For any 449 + * other provided buffer type, all completions with a 450 + * buffer passed back is automatically returned to the 451 + * application. 443 452 */ 444 453 #define IORING_CQE_F_BUFFER (1U << 0) 445 454 #define IORING_CQE_F_MORE (1U << 1) 446 455 #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) 447 456 #define IORING_CQE_F_NOTIF (1U << 3) 457 + #define IORING_CQE_F_BUF_MORE (1U << 4) 448 458 449 459 #define IORING_CQE_BUFFER_SHIFT 16 450 460 ··· 517 507 #define IORING_ENTER_SQ_WAIT (1U << 2) 518 508 #define IORING_ENTER_EXT_ARG (1U << 3) 519 509 #define IORING_ENTER_REGISTERED_RING (1U << 4) 510 + #define IORING_ENTER_ABS_TIMER (1U << 5) 520 511 521 512 /* 522 513 * Passed in for io_uring_setup(2). Copied back with updated info on success ··· 553 542 #define IORING_FEAT_LINKED_FILE (1U << 12) 554 543 #define IORING_FEAT_REG_REG_RING (1U << 13) 555 544 #define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) 545 + #define IORING_FEAT_MIN_TIMEOUT (1U << 15) 556 546 557 547 /* 558 548 * io_uring_register(2) opcodes and arguments ··· 606 594 /* set/clear busy poll settings */ 607 595 IORING_REGISTER_NAPI = 27, 608 596 IORING_UNREGISTER_NAPI = 28, 597 + 598 + IORING_REGISTER_CLOCK = 29, 609 599 610 600 /* this goes last */ 611 601 IORING_REGISTER_LAST, ··· 689 675 __u32 resv2[3]; 690 676 }; 691 677 678 + struct io_uring_clock_register { 679 + __u32 clockid; 680 + __u32 __resv[3]; 681 + }; 682 + 692 683 struct io_uring_buf { 693 684 __u64 addr; 694 685 __u32 len; ··· 726 707 * mmap(2) with the offset set as: 727 708 * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) 728 709 * to get a virtual mapping for the ring. 710 + * IOU_PBUF_RING_INC: If set, buffers consumed from this buffer ring can be 711 + * consumed incrementally. Normally one (or more) buffers 712 + * are fully consumed. With incremental consumptions, it's 713 + * feasible to register big ranges of buffers, and each 714 + * use of it will consume only as much as it needs. This 715 + * requires that both the kernel and application keep 716 + * track of where the current read/recv index is at. 729 717 */ 730 718 enum io_uring_register_pbuf_ring_flags { 731 719 IOU_PBUF_RING_MMAP = 1, 720 + IOU_PBUF_RING_INC = 2, 732 721 }; 733 722 734 723 /* argument for IORING_(UN)REGISTER_PBUF_RING */ ··· 785 758 struct io_uring_getevents_arg { 786 759 __u64 sigmask; 787 760 __u32 sigmask_sz; 788 - __u32 pad; 761 + __u32 min_wait_usec; 789 762 __u64 ts; 790 763 }; 791 764

+13

init/Kconfig

··· 1687 1687 applications to submit and complete IO through submission and 1688 1688 completion rings that are shared between the kernel and application. 1689 1689 1690 + config GCOV_PROFILE_URING 1691 + bool "Enable GCOV profiling on the io_uring subsystem" 1692 + depends on GCOV_KERNEL 1693 + help 1694 + Enable GCOV profiling on the io_uring subsystem, to facilitate 1695 + code coverage testing. 1696 + 1697 + If unsure, say N. 1698 + 1699 + Note that this will have a negative impact on the performance of 1700 + the io_uring subsystem, hence this should only be enabled for 1701 + specific test purposes. 1702 + 1690 1703 config ADVISE_SYSCALLS 1691 1704 bool "Enable madvise/fadvise syscalls" if EXPERT 1692 1705 default y

+4

io_uring/Makefile

··· 2 2 # 3 3 # Makefile for io_uring 4 4 5 + ifdef CONFIG_GCOV_PROFILE_URING 6 + GCOV_PROFILE := y 7 + endif 8 + 5 9 obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ 6 10 tctx.o filetable.o rw.o net.o poll.o \ 7 11 eventfd.o uring_cmd.o openclose.o \

+7 -6

io_uring/eventfd.c

··· 15 15 struct eventfd_ctx *cq_ev_fd; 16 16 unsigned int eventfd_async: 1; 17 17 struct rcu_head rcu; 18 - atomic_t refs; 18 + refcount_t refs; 19 19 atomic_t ops; 20 20 }; 21 21 ··· 37 37 38 38 eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); 39 39 40 - if (atomic_dec_and_test(&ev_fd->refs)) 40 + if (refcount_dec_and_test(&ev_fd->refs)) 41 41 io_eventfd_free(rcu); 42 42 } 43 43 ··· 63 63 */ 64 64 if (unlikely(!ev_fd)) 65 65 return; 66 - if (!atomic_inc_not_zero(&ev_fd->refs)) 66 + if (!refcount_inc_not_zero(&ev_fd->refs)) 67 67 return; 68 68 if (ev_fd->eventfd_async && !io_wq_current_is_worker()) 69 69 goto out; ··· 77 77 } 78 78 } 79 79 out: 80 - if (atomic_dec_and_test(&ev_fd->refs)) 80 + if (refcount_dec_and_test(&ev_fd->refs)) 81 81 call_rcu(&ev_fd->rcu, io_eventfd_free); 82 82 } 83 83 ··· 126 126 ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); 127 127 if (IS_ERR(ev_fd->cq_ev_fd)) { 128 128 int ret = PTR_ERR(ev_fd->cq_ev_fd); 129 + 129 130 kfree(ev_fd); 130 131 return ret; 131 132 } ··· 137 136 138 137 ev_fd->eventfd_async = eventfd_async; 139 138 ctx->has_evfd = true; 140 - atomic_set(&ev_fd->refs, 1); 139 + refcount_set(&ev_fd->refs, 1); 141 140 atomic_set(&ev_fd->ops, 0); 142 141 rcu_assign_pointer(ctx->io_ev_fd, ev_fd); 143 142 return 0; ··· 152 151 if (ev_fd) { 153 152 ctx->has_evfd = false; 154 153 rcu_assign_pointer(ctx->io_ev_fd, NULL); 155 - if (atomic_dec_and_test(&ev_fd->refs)) 154 + if (refcount_dec_and_test(&ev_fd->refs)) 156 155 call_rcu(&ev_fd->rcu, io_eventfd_free); 157 156 return 0; 158 157 }

+13 -1

io_uring/fdinfo.c

··· 221 221 cqe->user_data, cqe->res, cqe->flags); 222 222 223 223 } 224 - 225 224 spin_unlock(&ctx->completion_lock); 225 + 226 + #ifdef CONFIG_NET_RX_BUSY_POLL 227 + if (ctx->napi_enabled) { 228 + seq_puts(m, "NAPI:\tenabled\n"); 229 + seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt); 230 + if (ctx->napi_prefer_busy_poll) 231 + seq_puts(m, "napi_prefer_busy_poll:\ttrue\n"); 232 + else 233 + seq_puts(m, "napi_prefer_busy_poll:\tfalse\n"); 234 + } else { 235 + seq_puts(m, "NAPI:\tdisabled\n"); 236 + } 237 + #endif 226 238 } 227 239 #endif

+19 -6

io_uring/io-wq.c

··· 13 13 #include <linux/slab.h> 14 14 #include <linux/rculist_nulls.h> 15 15 #include <linux/cpu.h> 16 + #include <linux/cpuset.h> 16 17 #include <linux/task_work.h> 17 18 #include <linux/audit.h> 18 19 #include <linux/mmu_context.h> ··· 1168 1167 1169 1168 if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL)) 1170 1169 goto err; 1171 - cpumask_copy(wq->cpu_mask, cpu_possible_mask); 1170 + cpuset_cpus_allowed(data->task, wq->cpu_mask); 1172 1171 wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; 1173 1172 wq->acct[IO_WQ_ACCT_UNBOUND].max_workers = 1174 1173 task_rlimit(current, RLIMIT_NPROC); ··· 1323 1322 1324 1323 int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask) 1325 1324 { 1325 + cpumask_var_t allowed_mask; 1326 + int ret = 0; 1327 + 1326 1328 if (!tctx || !tctx->io_wq) 1327 1329 return -EINVAL; 1328 1330 1331 + if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL)) 1332 + return -ENOMEM; 1333 + 1329 1334 rcu_read_lock(); 1330 - if (mask) 1331 - cpumask_copy(tctx->io_wq->cpu_mask, mask); 1332 - else 1333 - cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask); 1335 + cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask); 1336 + if (mask) { 1337 + if (cpumask_subset(mask, allowed_mask)) 1338 + cpumask_copy(tctx->io_wq->cpu_mask, mask); 1339 + else 1340 + ret = -EINVAL; 1341 + } else { 1342 + cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask); 1343 + } 1334 1344 rcu_read_unlock(); 1335 1345 1336 - return 0; 1346 + free_cpumask_var(allowed_mask); 1347 + return ret; 1337 1348 } 1338 1349 1339 1350 /*

+159 -53

io_uring/io_uring.c

··· 904 904 lockdep_assert_held(&req->ctx->uring_lock); 905 905 906 906 req_set_fail(req); 907 - io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); 907 + io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED)); 908 908 if (def->fail) 909 909 def->fail(req); 910 910 io_req_complete_defer(req); ··· 2350 2350 return percpu_counter_read_positive(&tctx->inflight); 2351 2351 } 2352 2352 2353 - /* when returns >0, the caller should retry */ 2354 - static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 2355 - struct io_wait_queue *iowq) 2353 + static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) 2356 2354 { 2357 - int ret; 2355 + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); 2358 2356 2357 + WRITE_ONCE(iowq->hit_timeout, 1); 2358 + iowq->min_timeout = 0; 2359 + wake_up_process(iowq->wq.private); 2360 + return HRTIMER_NORESTART; 2361 + } 2362 + 2363 + /* 2364 + * Doing min_timeout portion. If we saw any timeouts, events, or have work, 2365 + * wake up. If not, and we have a normal timeout, switch to that and keep 2366 + * sleeping. 2367 + */ 2368 + static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) 2369 + { 2370 + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); 2371 + struct io_ring_ctx *ctx = iowq->ctx; 2372 + 2373 + /* no general timeout, or shorter (or equal), we are done */ 2374 + if (iowq->timeout == KTIME_MAX || 2375 + ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) 2376 + goto out_wake; 2377 + /* work we may need to run, wake function will see if we need to wake */ 2378 + if (io_has_work(ctx)) 2379 + goto out_wake; 2380 + /* got events since we started waiting, min timeout is done */ 2381 + if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) 2382 + goto out_wake; 2383 + /* if we have any events and min timeout expired, we're done */ 2384 + if (io_cqring_events(ctx)) 2385 + goto out_wake; 2386 + 2387 + /* 2388 + * If using deferred task_work running and application is waiting on 2389 + * more than one request, ensure we reset it now where we are switching 2390 + * to normal sleeps. Any request completion post min_wait should wake 2391 + * the task and return. 2392 + */ 2393 + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 2394 + atomic_set(&ctx->cq_wait_nr, 1); 2395 + smp_mb(); 2396 + if (!llist_empty(&ctx->work_llist)) 2397 + goto out_wake; 2398 + } 2399 + 2400 + iowq->t.function = io_cqring_timer_wakeup; 2401 + hrtimer_set_expires(timer, iowq->timeout); 2402 + return HRTIMER_RESTART; 2403 + out_wake: 2404 + return io_cqring_timer_wakeup(timer); 2405 + } 2406 + 2407 + static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, 2408 + clockid_t clock_id, ktime_t start_time) 2409 + { 2410 + ktime_t timeout; 2411 + 2412 + hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS); 2413 + if (iowq->min_timeout) { 2414 + timeout = ktime_add_ns(iowq->min_timeout, start_time); 2415 + iowq->t.function = io_cqring_min_timer_wakeup; 2416 + } else { 2417 + timeout = iowq->timeout; 2418 + iowq->t.function = io_cqring_timer_wakeup; 2419 + } 2420 + 2421 + hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); 2422 + hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); 2423 + 2424 + if (!READ_ONCE(iowq->hit_timeout)) 2425 + schedule(); 2426 + 2427 + hrtimer_cancel(&iowq->t); 2428 + destroy_hrtimer_on_stack(&iowq->t); 2429 + __set_current_state(TASK_RUNNING); 2430 + 2431 + return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; 2432 + } 2433 + 2434 + static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, 2435 + struct io_wait_queue *iowq, 2436 + ktime_t start_time) 2437 + { 2438 + int ret = 0; 2439 + 2440 + /* 2441 + * Mark us as being in io_wait if we have pending requests, so cpufreq 2442 + * can take into account that the task is waiting for IO - turns out 2443 + * to be important for low QD IO. 2444 + */ 2445 + if (current_pending_io()) 2446 + current->in_iowait = 1; 2447 + if (iowq->timeout != KTIME_MAX || iowq->min_timeout) 2448 + ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); 2449 + else 2450 + schedule(); 2451 + current->in_iowait = 0; 2452 + return ret; 2453 + } 2454 + 2455 + /* If this returns > 0, the caller should retry */ 2456 + static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 2457 + struct io_wait_queue *iowq, 2458 + ktime_t start_time) 2459 + { 2359 2460 if (unlikely(READ_ONCE(ctx->check_cq))) 2360 2461 return 1; 2361 2462 if (unlikely(!llist_empty(&ctx->work_llist))) ··· 2468 2367 if (unlikely(io_should_wake(iowq))) 2469 2368 return 0; 2470 2369 2471 - /* 2472 - * Mark us as being in io_wait if we have pending requests, so cpufreq 2473 - * can take into account that the task is waiting for IO - turns out 2474 - * to be important for low QD IO. 2475 - */ 2476 - if (current_pending_io()) 2477 - current->in_iowait = 1; 2478 - ret = 0; 2479 - if (iowq->timeout == KTIME_MAX) 2480 - schedule(); 2481 - else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) 2482 - ret = -ETIME; 2483 - current->in_iowait = 0; 2484 - return ret; 2370 + return __io_cqring_wait_schedule(ctx, iowq, start_time); 2485 2371 } 2372 + 2373 + struct ext_arg { 2374 + size_t argsz; 2375 + struct __kernel_timespec __user *ts; 2376 + const sigset_t __user *sig; 2377 + ktime_t min_time; 2378 + }; 2486 2379 2487 2380 /* 2488 2381 * Wait until events become available, if we don't already have some. The 2489 2382 * application must reap them itself, as they reside on the shared cq ring. 2490 2383 */ 2491 - static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, 2492 - const sigset_t __user *sig, size_t sigsz, 2493 - struct __kernel_timespec __user *uts) 2384 + static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, 2385 + struct ext_arg *ext_arg) 2494 2386 { 2495 2387 struct io_wait_queue iowq; 2496 2388 struct io_rings *rings = ctx->rings; 2389 + ktime_t start_time; 2497 2390 int ret; 2498 2391 2499 2392 if (!io_allowed_run_tw(ctx)) ··· 2505 2410 iowq.wq.private = current; 2506 2411 INIT_LIST_HEAD(&iowq.wq.entry); 2507 2412 iowq.ctx = ctx; 2508 - iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 2509 2413 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; 2414 + iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); 2415 + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 2416 + iowq.hit_timeout = 0; 2417 + iowq.min_timeout = ext_arg->min_time; 2510 2418 iowq.timeout = KTIME_MAX; 2419 + start_time = io_get_time(ctx); 2511 2420 2512 - if (uts) { 2421 + if (ext_arg->ts) { 2513 2422 struct timespec64 ts; 2514 - ktime_t dt; 2515 2423 2516 - if (get_timespec64(&ts, uts)) 2424 + if (get_timespec64(&ts, ext_arg->ts)) 2517 2425 return -EFAULT; 2518 2426 2519 - dt = timespec64_to_ktime(ts); 2520 - iowq.timeout = ktime_add(dt, ktime_get()); 2521 - io_napi_adjust_timeout(ctx, &iowq, dt); 2427 + iowq.timeout = timespec64_to_ktime(ts); 2428 + if (!(flags & IORING_ENTER_ABS_TIMER)) 2429 + iowq.timeout = ktime_add(iowq.timeout, start_time); 2522 2430 } 2523 2431 2524 - if (sig) { 2432 + if (ext_arg->sig) { 2525 2433 #ifdef CONFIG_COMPAT 2526 2434 if (in_compat_syscall()) 2527 - ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, 2528 - sigsz); 2435 + ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, 2436 + ext_arg->argsz); 2529 2437 else 2530 2438 #endif 2531 - ret = set_user_sigmask(sig, sigsz); 2439 + ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); 2532 2440 2533 2441 if (ret) 2534 2442 return ret; ··· 2541 2443 2542 2444 trace_io_uring_cqring_wait(ctx, min_events); 2543 2445 do { 2544 - int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); 2545 2446 unsigned long check_cq; 2447 + int nr_wait; 2448 + 2449 + /* if min timeout has been hit, don't reset wait count */ 2450 + if (!iowq.hit_timeout) 2451 + nr_wait = (int) iowq.cq_tail - 2452 + READ_ONCE(ctx->rings->cq.tail); 2453 + else 2454 + nr_wait = 1; 2546 2455 2547 2456 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 2548 2457 atomic_set(&ctx->cq_wait_nr, nr_wait); ··· 2559 2454 TASK_INTERRUPTIBLE); 2560 2455 } 2561 2456 2562 - ret = io_cqring_wait_schedule(ctx, &iowq); 2457 + ret = io_cqring_wait_schedule(ctx, &iowq, start_time); 2563 2458 __set_current_state(TASK_RUNNING); 2564 2459 atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); 2565 2460 ··· 3217 3112 return 0; 3218 3113 } 3219 3114 3220 - static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, 3221 - struct __kernel_timespec __user **ts, 3222 - const sigset_t __user **sig) 3115 + static int io_get_ext_arg(unsigned flags, const void __user *argp, 3116 + struct ext_arg *ext_arg) 3223 3117 { 3224 3118 struct io_uring_getevents_arg arg; 3225 3119 ··· 3227 3123 * is just a pointer to the sigset_t. 3228 3124 */ 3229 3125 if (!(flags & IORING_ENTER_EXT_ARG)) { 3230 - *sig = (const sigset_t __user *) argp; 3231 - *ts = NULL; 3126 + ext_arg->sig = (const sigset_t __user *) argp; 3127 + ext_arg->ts = NULL; 3232 3128 return 0; 3233 3129 } 3234 3130 ··· 3236 3132 * EXT_ARG is set - ensure we agree on the size of it and copy in our 3237 3133 * timespec and sigset_t pointers if good. 3238 3134 */ 3239 - if (*argsz != sizeof(arg)) 3135 + if (ext_arg->argsz != sizeof(arg)) 3240 3136 return -EINVAL; 3241 3137 if (copy_from_user(&arg, argp, sizeof(arg))) 3242 3138 return -EFAULT; 3243 - if (arg.pad) 3244 - return -EINVAL; 3245 - *sig = u64_to_user_ptr(arg.sigmask); 3246 - *argsz = arg.sigmask_sz; 3247 - *ts = u64_to_user_ptr(arg.ts); 3139 + ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC; 3140 + ext_arg->sig = u64_to_user_ptr(arg.sigmask); 3141 + ext_arg->argsz = arg.sigmask_sz; 3142 + ext_arg->ts = u64_to_user_ptr(arg.ts); 3248 3143 return 0; 3249 3144 } 3250 3145 ··· 3257 3154 3258 3155 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 3259 3156 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | 3260 - IORING_ENTER_REGISTERED_RING))) 3157 + IORING_ENTER_REGISTERED_RING | 3158 + IORING_ENTER_ABS_TIMER))) 3261 3159 return -EINVAL; 3262 3160 3263 3161 /* ··· 3349 3245 } 3350 3246 mutex_unlock(&ctx->uring_lock); 3351 3247 } else { 3352 - const sigset_t __user *sig; 3353 - struct __kernel_timespec __user *ts; 3248 + struct ext_arg ext_arg = { .argsz = argsz }; 3354 3249 3355 - ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 3250 + ret2 = io_get_ext_arg(flags, argp, &ext_arg); 3356 3251 if (likely(!ret2)) { 3357 3252 min_complete = min(min_complete, 3358 3253 ctx->cq_entries); 3359 - ret2 = io_cqring_wait(ctx, min_complete, sig, 3360 - argsz, ts); 3254 + ret2 = io_cqring_wait(ctx, min_complete, flags, 3255 + &ext_arg); 3361 3256 } 3362 3257 } 3363 3258 ··· 3527 3424 if (!ctx) 3528 3425 return -ENOMEM; 3529 3426 3427 + ctx->clockid = CLOCK_MONOTONIC; 3428 + ctx->clock_offset = 0; 3429 + 3530 3430 if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && 3531 3431 !(ctx->flags & IORING_SETUP_IOPOLL) && 3532 3432 !(ctx->flags & IORING_SETUP_SQPOLL)) ··· 3641 3535 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | 3642 3536 IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | 3643 3537 IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | 3644 - IORING_FEAT_RECVSEND_BUNDLE; 3538 + IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT; 3645 3539 3646 3540 if (copy_to_user(params, p, sizeof(*p))) { 3647 3541 ret = -EFAULT;

+12

io_uring/io_uring.h

··· 39 39 struct wait_queue_entry wq; 40 40 struct io_ring_ctx *ctx; 41 41 unsigned cq_tail; 42 + unsigned cq_min_tail; 42 43 unsigned nr_timeouts; 44 + int hit_timeout; 45 + ktime_t min_timeout; 43 46 ktime_t timeout; 47 + struct hrtimer t; 44 48 45 49 #ifdef CONFIG_NET_RX_BUSY_POLL 46 50 ktime_t napi_busy_poll_dt; ··· 439 435 return true; 440 436 } 441 437 return false; 438 + } 439 + 440 + static inline ktime_t io_get_time(struct io_ring_ctx *ctx) 441 + { 442 + if (ctx->clockid == CLOCK_MONOTONIC) 443 + return ktime_get(); 444 + 445 + return ktime_get_with_offset(ctx->clock_offset); 442 446 } 443 447 444 448 enum {

+53 -43

io_uring/kbuf.c

··· 70 70 return true; 71 71 } 72 72 73 - void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) 73 + void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) 74 74 { 75 75 /* 76 76 * We can add this buffer back to two lists: ··· 88 88 struct io_ring_ctx *ctx = req->ctx; 89 89 90 90 spin_lock(&ctx->completion_lock); 91 - __io_put_kbuf_list(req, &ctx->io_buffers_comp); 91 + __io_put_kbuf_list(req, len, &ctx->io_buffers_comp); 92 92 spin_unlock(&ctx->completion_lock); 93 93 } else { 94 94 lockdep_assert_held(&req->ctx->uring_lock); 95 95 96 - __io_put_kbuf_list(req, &req->ctx->io_buffers_cache); 96 + __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache); 97 97 } 98 98 } 99 99 ··· 132 132 return 1; 133 133 } 134 134 135 - static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br, 136 - __u16 head, __u16 mask) 137 - { 138 - return &br->bufs[head & mask]; 139 - } 140 - 141 135 static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, 142 136 struct io_buffer_list *bl, 143 137 unsigned int issue_flags) ··· 165 171 * the transfer completes (or if we get -EAGAIN and must poll of 166 172 * retry). 167 173 */ 168 - req->flags &= ~REQ_F_BUFFERS_COMMIT; 174 + io_kbuf_commit(req, bl, *len, 1); 169 175 req->buf_list = NULL; 170 - bl->head++; 171 176 } 172 177 return u64_to_user_ptr(buf->addr); 173 178 } ··· 182 189 183 190 bl = io_buffer_get_list(ctx, req->buf_index); 184 191 if (likely(bl)) { 185 - if (bl->is_buf_ring) 192 + if (bl->flags & IOBL_BUF_RING) 186 193 ret = io_ring_buffer_select(req, len, bl, issue_flags); 187 194 else 188 195 ret = io_provided_buffer_select(req, len, bl); ··· 212 219 buf = io_ring_head_to_buf(br, head, bl->mask); 213 220 if (arg->max_len) { 214 221 u32 len = READ_ONCE(buf->len); 215 - size_t needed; 216 222 217 223 if (unlikely(!len)) 218 224 return -ENOBUFS; 219 - needed = (arg->max_len + len - 1) / len; 220 - needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); 221 - if (nr_avail > needed) 222 - nr_avail = needed; 225 + /* 226 + * Limit incremental buffers to 1 segment. No point trying 227 + * to peek ahead and map more than we need, when the buffers 228 + * themselves should be large when setup with 229 + * IOU_PBUF_RING_INC. 230 + */ 231 + if (bl->flags & IOBL_INC) { 232 + nr_avail = 1; 233 + } else { 234 + size_t needed; 235 + 236 + needed = (arg->max_len + len - 1) / len; 237 + needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); 238 + if (nr_avail > needed) 239 + nr_avail = needed; 240 + } 223 241 } 224 242 225 243 /* ··· 255 251 256 252 req->buf_index = buf->bid; 257 253 do { 258 - /* truncate end piece, if needed */ 259 - if (buf->len > arg->max_len) 260 - buf->len = arg->max_len; 254 + u32 len = buf->len; 255 + 256 + /* truncate end piece, if needed, for non partial buffers */ 257 + if (len > arg->max_len) { 258 + len = arg->max_len; 259 + if (!(bl->flags & IOBL_INC)) 260 + buf->len = len; 261 + } 261 262 262 263 iov->iov_base = u64_to_user_ptr(buf->addr); 263 - iov->iov_len = buf->len; 264 + iov->iov_len = len; 264 265 iov++; 265 266 266 - arg->out_len += buf->len; 267 - arg->max_len -= buf->len; 267 + arg->out_len += len; 268 + arg->max_len -= len; 268 269 if (!arg->max_len) 269 270 break; 270 271 ··· 296 287 if (unlikely(!bl)) 297 288 goto out_unlock; 298 289 299 - if (bl->is_buf_ring) { 290 + if (bl->flags & IOBL_BUF_RING) { 300 291 ret = io_ring_buffers_peek(req, arg, bl); 301 292 /* 302 293 * Don't recycle these buffers if we need to go through poll. ··· 306 297 * committed them, they cannot be put back in the queue. 307 298 */ 308 299 if (ret > 0) { 309 - req->flags |= REQ_F_BL_NO_RECYCLE; 310 - req->buf_list->head += ret; 300 + req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; 301 + io_kbuf_commit(req, bl, arg->out_len, ret); 311 302 } 312 303 } else { 313 304 ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); ··· 329 320 if (unlikely(!bl)) 330 321 return -ENOENT; 331 322 332 - if (bl->is_buf_ring) { 323 + if (bl->flags & IOBL_BUF_RING) { 333 324 ret = io_ring_buffers_peek(req, arg, bl); 334 325 if (ret > 0) 335 326 req->flags |= REQ_F_BUFFERS_COMMIT; ··· 349 340 if (!nbufs) 350 341 return 0; 351 342 352 - if (bl->is_buf_ring) { 343 + if (bl->flags & IOBL_BUF_RING) { 353 344 i = bl->buf_ring->tail - bl->head; 354 345 if (bl->buf_nr_pages) { 355 346 int j; 356 347 357 - if (!bl->is_mmap) { 348 + if (!(bl->flags & IOBL_MMAP)) { 358 349 for (j = 0; j < bl->buf_nr_pages; j++) 359 350 unpin_user_page(bl->buf_pages[j]); 360 351 } 361 352 io_pages_unmap(bl->buf_ring, &bl->buf_pages, 362 - &bl->buf_nr_pages, bl->is_mmap); 363 - bl->is_mmap = 0; 353 + &bl->buf_nr_pages, bl->flags & IOBL_MMAP); 354 + bl->flags &= ~IOBL_MMAP; 364 355 } 365 356 /* make sure it's seen as empty */ 366 357 INIT_LIST_HEAD(&bl->buf_list); 367 - bl->is_buf_ring = 0; 358 + bl->flags &= ~IOBL_BUF_RING; 368 359 return i; 369 360 } 370 361 ··· 451 442 if (bl) { 452 443 ret = -EINVAL; 453 444 /* can't use provide/remove buffers command on mapped buffers */ 454 - if (!bl->is_buf_ring) 445 + if (!(bl->flags & IOBL_BUF_RING)) 455 446 ret = __io_remove_buffers(ctx, bl, p->nbufs); 456 447 } 457 448 io_ring_submit_unlock(ctx, issue_flags); ··· 598 589 } 599 590 } 600 591 /* can't add buffers via this command for a mapped buffer ring */ 601 - if (bl->is_buf_ring) { 592 + if (bl->flags & IOBL_BUF_RING) { 602 593 ret = -EINVAL; 603 594 goto err; 604 595 } ··· 650 641 bl->buf_pages = pages; 651 642 bl->buf_nr_pages = nr_pages; 652 643 bl->buf_ring = br; 653 - bl->is_buf_ring = 1; 654 - bl->is_mmap = 0; 644 + bl->flags |= IOBL_BUF_RING; 645 + bl->flags &= ~IOBL_MMAP; 655 646 return 0; 656 647 error_unpin: 657 648 unpin_user_pages(pages, nr_pages); ··· 674 665 return -ENOMEM; 675 666 } 676 667 677 - bl->is_buf_ring = 1; 678 - bl->is_mmap = 1; 668 + bl->flags |= (IOBL_BUF_RING | IOBL_MMAP); 679 669 return 0; 680 670 } 681 671 ··· 691 683 692 684 if (reg.resv[0] || reg.resv[1] || reg.resv[2]) 693 685 return -EINVAL; 694 - if (reg.flags & ~IOU_PBUF_RING_MMAP) 686 + if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) 695 687 return -EINVAL; 696 688 if (!(reg.flags & IOU_PBUF_RING_MMAP)) { 697 689 if (!reg.ring_addr) ··· 713 705 bl = io_buffer_get_list(ctx, reg.bgid); 714 706 if (bl) { 715 707 /* if mapped buffer ring OR classic exists, don't allow */ 716 - if (bl->is_buf_ring || !list_empty(&bl->buf_list)) 708 + if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list)) 717 709 return -EEXIST; 718 710 } else { 719 711 free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); ··· 729 721 if (!ret) { 730 722 bl->nr_entries = reg.ring_entries; 731 723 bl->mask = reg.ring_entries - 1; 724 + if (reg.flags & IOU_PBUF_RING_INC) 725 + bl->flags |= IOBL_INC; 732 726 733 727 io_buffer_add_list(ctx, bl, reg.bgid); 734 728 return 0; ··· 757 747 bl = io_buffer_get_list(ctx, reg.bgid); 758 748 if (!bl) 759 749 return -ENOENT; 760 - if (!bl->is_buf_ring) 750 + if (!(bl->flags & IOBL_BUF_RING)) 761 751 return -EINVAL; 762 752 763 753 xa_erase(&ctx->io_bl_xa, bl->bgid); ··· 781 771 bl = io_buffer_get_list(ctx, buf_status.buf_group); 782 772 if (!bl) 783 773 return -ENOENT; 784 - if (!bl->is_buf_ring) 774 + if (!(bl->flags & IOBL_BUF_RING)) 785 775 return -EINVAL; 786 776 787 777 buf_status.head = bl->head; ··· 812 802 bl = xa_load(&ctx->io_bl_xa, bgid); 813 803 /* must be a mmap'able buffer ring and have pages */ 814 804 ret = false; 815 - if (bl && bl->is_mmap) 805 + if (bl && bl->flags & IOBL_MMAP) 816 806 ret = atomic_inc_not_zero(&bl->refs); 817 807 rcu_read_unlock(); 818 808

+67 -27

io_uring/kbuf.h

··· 4 4 5 5 #include <uapi/linux/io_uring.h> 6 6 7 + enum { 8 + /* ring mapped provided buffers */ 9 + IOBL_BUF_RING = 1, 10 + /* ring mapped provided buffers, but mmap'ed by application */ 11 + IOBL_MMAP = 2, 12 + /* buffers are consumed incrementally rather than always fully */ 13 + IOBL_INC = 4, 14 + 15 + }; 16 + 7 17 struct io_buffer_list { 8 18 /* 9 19 * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, ··· 35 25 __u16 head; 36 26 __u16 mask; 37 27 38 - atomic_t refs; 28 + __u16 flags; 39 29 40 - /* ring mapped provided buffers */ 41 - __u8 is_buf_ring; 42 - /* ring mapped provided buffers, but mmap'ed by application */ 43 - __u8 is_mmap; 30 + atomic_t refs; 44 31 }; 45 32 46 33 struct io_buffer { ··· 59 52 struct iovec *iovs; 60 53 size_t out_len; 61 54 size_t max_len; 62 - int nr_iovs; 63 - int mode; 55 + unsigned short nr_iovs; 56 + unsigned short mode; 64 57 }; 65 58 66 59 void __user *io_buffer_select(struct io_kiocb *req, size_t *len, ··· 80 73 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); 81 74 int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); 82 75 83 - void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); 76 + void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); 84 77 85 78 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); 86 79 ··· 124 117 return false; 125 118 } 126 119 127 - static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr) 120 + /* Mapped buffer ring, return io_uring_buf from head */ 121 + #define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] 122 + 123 + static inline bool io_kbuf_commit(struct io_kiocb *req, 124 + struct io_buffer_list *bl, int len, int nr) 125 + { 126 + if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) 127 + return true; 128 + 129 + req->flags &= ~REQ_F_BUFFERS_COMMIT; 130 + 131 + if (unlikely(len < 0)) 132 + return true; 133 + 134 + if (bl->flags & IOBL_INC) { 135 + struct io_uring_buf *buf; 136 + 137 + buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); 138 + if (WARN_ON_ONCE(len > buf->len)) 139 + len = buf->len; 140 + buf->len -= len; 141 + if (buf->len) { 142 + buf->addr += len; 143 + return false; 144 + } 145 + } 146 + 147 + bl->head += nr; 148 + return true; 149 + } 150 + 151 + static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) 128 152 { 129 153 struct io_buffer_list *bl = req->buf_list; 154 + bool ret = true; 130 155 131 156 if (bl) { 132 - if (req->flags & REQ_F_BUFFERS_COMMIT) { 133 - bl->head += nr; 134 - req->flags &= ~REQ_F_BUFFERS_COMMIT; 135 - } 157 + ret = io_kbuf_commit(req, bl, len, nr); 136 158 req->buf_index = bl->bgid; 137 159 } 138 160 req->flags &= ~REQ_F_BUFFER_RING; 161 + return ret; 139 162 } 140 163 141 - static inline void __io_put_kbuf_list(struct io_kiocb *req, 164 + static inline void __io_put_kbuf_list(struct io_kiocb *req, int len, 142 165 struct list_head *list) 143 166 { 144 167 if (req->flags & REQ_F_BUFFER_RING) { 145 - __io_put_kbuf_ring(req, 1); 168 + __io_put_kbuf_ring(req, len, 1); 146 169 } else { 147 170 req->buf_index = req->kbuf->bgid; 148 171 list_add(&req->kbuf->list, list); ··· 187 150 if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) 188 151 return; 189 152 190 - __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); 153 + /* len == 0 is fine here, non-ring will always drop all of it */ 154 + __io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp); 191 155 } 192 156 193 - static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs, 194 - unsigned issue_flags) 157 + static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, 158 + int nbufs, unsigned issue_flags) 195 159 { 196 160 unsigned int ret; 197 161 ··· 200 162 return 0; 201 163 202 164 ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); 203 - if (req->flags & REQ_F_BUFFER_RING) 204 - __io_put_kbuf_ring(req, nbufs); 205 - else 206 - __io_put_kbuf(req, issue_flags); 165 + if (req->flags & REQ_F_BUFFER_RING) { 166 + if (!__io_put_kbuf_ring(req, len, nbufs)) 167 + ret |= IORING_CQE_F_BUF_MORE; 168 + } else { 169 + __io_put_kbuf(req, len, issue_flags); 170 + } 207 171 return ret; 208 172 } 209 173 210 - static inline unsigned int io_put_kbuf(struct io_kiocb *req, 174 + static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, 211 175 unsigned issue_flags) 212 176 { 213 - return __io_put_kbufs(req, 1, issue_flags); 177 + return __io_put_kbufs(req, len, 1, issue_flags); 214 178 } 215 179 216 - static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs, 217 - unsigned issue_flags) 180 + static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len, 181 + int nbufs, unsigned issue_flags) 218 182 { 219 - return __io_put_kbufs(req, nbufs, issue_flags); 183 + return __io_put_kbufs(req, len, nbufs, issue_flags); 220 184 } 221 185 #endif

+11 -24

io_uring/napi.c

··· 270 270 } 271 271 272 272 /* 273 - * __io_napi_adjust_timeout() - adjust busy loop timeout 274 - * @ctx: pointer to io-uring context structure 275 - * @iowq: pointer to io wait queue 276 - * @ts: pointer to timespec or NULL 277 - * 278 - * Adjust the busy loop timeout according to timespec and busy poll timeout. 279 - * If the specified NAPI timeout is bigger than the wait timeout, then adjust 280 - * the NAPI timeout accordingly. 281 - */ 282 - void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, 283 - ktime_t to_wait) 284 - { 285 - ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); 286 - 287 - if (to_wait) 288 - poll_dt = min(poll_dt, to_wait); 289 - 290 - iowq->napi_busy_poll_dt = poll_dt; 291 - } 292 - 293 - /* 294 273 * __io_napi_busy_loop() - execute busy poll loop 295 274 * @ctx: pointer to io-uring context structure 296 275 * @iowq: pointer to io wait queue ··· 278 299 */ 279 300 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) 280 301 { 281 - iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); 302 + if (ctx->flags & IORING_SETUP_SQPOLL) 303 + return; 282 304 283 - if (!(ctx->flags & IORING_SETUP_SQPOLL)) 284 - io_napi_blocking_busy_loop(ctx, iowq); 305 + iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); 306 + if (iowq->timeout != KTIME_MAX) { 307 + ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx)); 308 + 309 + iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt); 310 + } 311 + 312 + iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); 313 + io_napi_blocking_busy_loop(ctx, iowq); 285 314 } 286 315 287 316 /*

-16

io_uring/napi.h

··· 17 17 18 18 void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); 19 19 20 - void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, 21 - struct io_wait_queue *iowq, ktime_t to_wait); 22 20 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); 23 21 int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); 24 22 25 23 static inline bool io_napi(struct io_ring_ctx *ctx) 26 24 { 27 25 return !list_empty(&ctx->napi_list); 28 - } 29 - 30 - static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, 31 - struct io_wait_queue *iowq, 32 - ktime_t to_wait) 33 - { 34 - if (!io_napi(ctx)) 35 - return; 36 - __io_napi_adjust_timeout(ctx, iowq, to_wait); 37 26 } 38 27 39 28 static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, ··· 73 84 return false; 74 85 } 75 86 static inline void io_napi_add(struct io_kiocb *req) 76 - { 77 - } 78 - static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, 79 - struct io_wait_queue *iowq, 80 - ktime_t to_wait) 81 87 { 82 88 } 83 89 static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,

+17 -10

io_uring/net.c

··· 434 434 sr->buf_group = req->buf_index; 435 435 req->buf_list = NULL; 436 436 } 437 - if (req->flags & REQ_F_BUFFER_SELECT && sr->len) 438 - return -EINVAL; 439 437 440 438 #ifdef CONFIG_COMPAT 441 439 if (req->ctx->compat) ··· 497 499 unsigned int cflags; 498 500 499 501 if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { 500 - cflags = io_put_kbuf(req, issue_flags); 502 + cflags = io_put_kbuf(req, *ret, issue_flags); 501 503 goto finish; 502 504 } 503 505 504 - cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags); 506 + cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags); 505 507 506 508 if (bundle_finished || req->flags & REQ_F_BL_EMPTY) 507 509 goto finish; ··· 597 599 if (io_do_buffer_select(req)) { 598 600 struct buf_sel_arg arg = { 599 601 .iovs = &kmsg->fast_iov, 600 - .max_len = INT_MAX, 602 + .max_len = min_not_zero(sr->len, INT_MAX), 601 603 .nr_iovs = 1, 602 604 }; 603 605 ··· 616 618 if (unlikely(ret < 0)) 617 619 return ret; 618 620 619 - sr->len = arg.out_len; 620 - iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret, 621 - arg.out_len); 622 621 if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { 623 622 kmsg->free_iov_nr = ret; 624 623 kmsg->free_iov = arg.iovs; 625 624 req->flags |= REQ_F_NEED_CLEANUP; 625 + } 626 + sr->len = arg.out_len; 627 + 628 + if (ret == 1) { 629 + sr->buf = arg.iovs[0].iov_base; 630 + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, 631 + &kmsg->msg.msg_iter); 632 + if (unlikely(ret)) 633 + return ret; 634 + } else { 635 + iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, 636 + arg.iovs, ret, arg.out_len); 626 637 } 627 638 } 628 639 ··· 842 835 cflags |= IORING_CQE_F_SOCK_NONEMPTY; 843 836 844 837 if (sr->flags & IORING_RECVSEND_BUNDLE) { 845 - cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), 838 + cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), 846 839 issue_flags); 847 840 /* bundle with no more immediate buffers, we're done */ 848 841 if (req->flags & REQ_F_BL_EMPTY) 849 842 goto finish; 850 843 } else { 851 - cflags |= io_put_kbuf(req, issue_flags); 844 + cflags |= io_put_kbuf(req, *ret, issue_flags); 852 845 } 853 846 854 847 /*

+31

io_uring/register.c

··· 335 335 return ret; 336 336 } 337 337 338 + static int io_register_clock(struct io_ring_ctx *ctx, 339 + struct io_uring_clock_register __user *arg) 340 + { 341 + struct io_uring_clock_register reg; 342 + 343 + if (copy_from_user(&reg, arg, sizeof(reg))) 344 + return -EFAULT; 345 + if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv))) 346 + return -EINVAL; 347 + 348 + switch (reg.clockid) { 349 + case CLOCK_MONOTONIC: 350 + ctx->clock_offset = 0; 351 + break; 352 + case CLOCK_BOOTTIME: 353 + ctx->clock_offset = TK_OFFS_BOOT; 354 + break; 355 + default: 356 + return -EINVAL; 357 + } 358 + 359 + ctx->clockid = reg.clockid; 360 + return 0; 361 + } 362 + 338 363 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, 339 364 void __user *arg, unsigned nr_args) 340 365 __releases(ctx->uring_lock) ··· 535 510 if (nr_args != 1) 536 511 break; 537 512 ret = io_unregister_napi(ctx, arg); 513 + break; 514 + case IORING_REGISTER_CLOCK: 515 + ret = -EINVAL; 516 + if (!arg || nr_args) 517 + break; 518 + ret = io_register_clock(ctx, arg); 538 519 break; 539 520 default: 540 521 ret = -EINVAL;

+108 -41

io_uring/rsrc.c

··· 855 855 return ret; 856 856 } 857 857 858 + static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, 859 + struct io_imu_folio_data *data, int nr_folios) 860 + { 861 + struct page **page_array = *pages, **new_array = NULL; 862 + int nr_pages_left = *nr_pages, i, j; 863 + 864 + /* Store head pages only*/ 865 + new_array = kvmalloc_array(nr_folios, sizeof(struct page *), 866 + GFP_KERNEL); 867 + if (!new_array) 868 + return false; 869 + 870 + new_array[0] = compound_head(page_array[0]); 871 + /* 872 + * The pages are bound to the folio, it doesn't 873 + * actually unpin them but drops all but one reference, 874 + * which is usually put down by io_buffer_unmap(). 875 + * Note, needs a better helper. 876 + */ 877 + if (data->nr_pages_head > 1) 878 + unpin_user_pages(&page_array[1], data->nr_pages_head - 1); 879 + 880 + j = data->nr_pages_head; 881 + nr_pages_left -= data->nr_pages_head; 882 + for (i = 1; i < nr_folios; i++) { 883 + unsigned int nr_unpin; 884 + 885 + new_array[i] = page_array[j]; 886 + nr_unpin = min_t(unsigned int, nr_pages_left - 1, 887 + data->nr_pages_mid - 1); 888 + if (nr_unpin) 889 + unpin_user_pages(&page_array[j+1], nr_unpin); 890 + j += data->nr_pages_mid; 891 + nr_pages_left -= data->nr_pages_mid; 892 + } 893 + kvfree(page_array); 894 + *pages = new_array; 895 + *nr_pages = nr_folios; 896 + return true; 897 + } 898 + 899 + static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, 900 + struct io_imu_folio_data *data) 901 + { 902 + struct page **page_array = *pages; 903 + struct folio *folio = page_folio(page_array[0]); 904 + unsigned int count = 1, nr_folios = 1; 905 + int i; 906 + 907 + if (*nr_pages <= 1) 908 + return false; 909 + 910 + data->nr_pages_mid = folio_nr_pages(folio); 911 + if (data->nr_pages_mid == 1) 912 + return false; 913 + 914 + data->folio_shift = folio_shift(folio); 915 + /* 916 + * Check if pages are contiguous inside a folio, and all folios have 917 + * the same page count except for the head and tail. 918 + */ 919 + for (i = 1; i < *nr_pages; i++) { 920 + if (page_folio(page_array[i]) == folio && 921 + page_array[i] == page_array[i-1] + 1) { 922 + count++; 923 + continue; 924 + } 925 + 926 + if (nr_folios == 1) { 927 + if (folio_page_idx(folio, page_array[i-1]) != 928 + data->nr_pages_mid - 1) 929 + return false; 930 + 931 + data->nr_pages_head = count; 932 + } else if (count != data->nr_pages_mid) { 933 + return false; 934 + } 935 + 936 + folio = page_folio(page_array[i]); 937 + if (folio_size(folio) != (1UL << data->folio_shift) || 938 + folio_page_idx(folio, page_array[i]) != 0) 939 + return false; 940 + 941 + count = 1; 942 + nr_folios++; 943 + } 944 + if (nr_folios == 1) 945 + data->nr_pages_head = count; 946 + 947 + return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios); 948 + } 949 + 858 950 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, 859 951 struct io_mapped_ubuf **pimu, 860 952 struct page **last_hpage) ··· 956 864 unsigned long off; 957 865 size_t size; 958 866 int ret, nr_pages, i; 959 - struct folio *folio = NULL; 867 + struct io_imu_folio_data data; 868 + bool coalesced; 960 869 961 870 *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; 962 871 if (!iov->iov_base) ··· 972 879 goto done; 973 880 } 974 881 975 - /* If it's a huge page, try to coalesce them into a single bvec entry */ 976 - if (nr_pages > 1) { 977 - folio = page_folio(pages[0]); 978 - for (i = 1; i < nr_pages; i++) { 979 - /* 980 - * Pages must be consecutive and on the same folio for 981 - * this to work 982 - */ 983 - if (page_folio(pages[i]) != folio || 984 - pages[i] != pages[i - 1] + 1) { 985 - folio = NULL; 986 - break; 987 - } 988 - } 989 - if (folio) { 990 - /* 991 - * The pages are bound to the folio, it doesn't 992 - * actually unpin them but drops all but one reference, 993 - * which is usually put down by io_buffer_unmap(). 994 - * Note, needs a better helper. 995 - */ 996 - unpin_user_pages(&pages[1], nr_pages - 1); 997 - nr_pages = 1; 998 - } 999 - } 882 + /* If it's huge page(s), try to coalesce them into fewer bvec entries */ 883 + coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data); 1000 884 1001 885 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); 1002 886 if (!imu) ··· 985 915 goto done; 986 916 } 987 917 988 - off = (unsigned long) iov->iov_base & ~PAGE_MASK; 989 918 size = iov->iov_len; 990 919 /* store original address for later verification */ 991 920 imu->ubuf = (unsigned long) iov->iov_base; 992 921 imu->ubuf_end = imu->ubuf + iov->iov_len; 993 922 imu->nr_bvecs = nr_pages; 923 + imu->folio_shift = PAGE_SHIFT; 924 + imu->folio_mask = PAGE_MASK; 925 + if (coalesced) { 926 + imu->folio_shift = data.folio_shift; 927 + imu->folio_mask = ~((1UL << data.folio_shift) - 1); 928 + } 929 + off = (unsigned long) iov->iov_base & ~imu->folio_mask; 994 930 *pimu = imu; 995 931 ret = 0; 996 932 997 - if (folio) { 998 - bvec_set_page(&imu->bvec[0], pages[0], size, off); 999 - goto done; 1000 - } 1001 933 for (i = 0; i < nr_pages; i++) { 1002 934 size_t vec_len; 1003 935 1004 - vec_len = min_t(size_t, size, PAGE_SIZE - off); 936 + vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off); 1005 937 bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); 1006 938 off = 0; 1007 939 size -= vec_len; ··· 1114 1042 * we know that: 1115 1043 * 1116 1044 * 1) it's a BVEC iter, we set it up 1117 - * 2) all bvecs are PAGE_SIZE in size, except potentially the 1045 + * 2) all bvecs are the same in size, except potentially the 1118 1046 * first and last bvec 1119 1047 * 1120 1048 * So just find our index, and adjust the iterator afterwards. 1121 1049 * If the offset is within the first bvec (or the whole first 1122 1050 * bvec, just use iov_iter_advance(). This makes it easier 1123 1051 * since we can just skip the first segment, which may not 1124 - * be PAGE_SIZE aligned. 1052 + * be folio_size aligned. 1125 1053 */ 1126 1054 const struct bio_vec *bvec = imu->bvec; 1127 1055 1128 1056 if (offset < bvec->bv_len) { 1129 - /* 1130 - * Note, huge pages buffers consists of one large 1131 - * bvec entry and should always go this way. The other 1132 - * branch doesn't expect non PAGE_SIZE'd chunks. 1133 - */ 1134 1057 iter->bvec = bvec; 1135 1058 iter->count -= offset; 1136 1059 iter->iov_offset = offset; ··· 1134 1067 1135 1068 /* skip first vec */ 1136 1069 offset -= bvec->bv_len; 1137 - seg_skip = 1 + (offset >> PAGE_SHIFT); 1070 + seg_skip = 1 + (offset >> imu->folio_shift); 1138 1071 1139 1072 iter->bvec = bvec + seg_skip; 1140 1073 iter->nr_segs -= seg_skip; 1141 1074 iter->count -= bvec->bv_len + offset; 1142 - iter->iov_offset = offset & ~PAGE_MASK; 1075 + iter->iov_offset = offset & ~imu->folio_mask; 1143 1076 } 1144 1077 } 1145 1078

+10 -2

io_uring/rsrc.h

··· 22 22 }; 23 23 }; 24 24 25 - typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); 26 - 27 25 struct io_rsrc_data { 28 26 struct io_ring_ctx *ctx; 29 27 ··· 44 46 u64 ubuf; 45 47 u64 ubuf_end; 46 48 unsigned int nr_bvecs; 49 + unsigned int folio_shift; 47 50 unsigned long acct_pages; 51 + unsigned long folio_mask; 48 52 struct bio_vec bvec[] __counted_by(nr_bvecs); 53 + }; 54 + 55 + struct io_imu_folio_data { 56 + /* Head folio can be partially included in the fixed buf */ 57 + unsigned int nr_pages_head; 58 + /* For non-head/tail folios, has to be fully included */ 59 + unsigned int nr_pages_mid; 60 + unsigned int folio_shift; 49 61 }; 50 62 51 63 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);

+13 -6

io_uring/rw.c

··· 467 467 static bool __io_complete_rw_common(struct io_kiocb *req, long res) 468 468 { 469 469 if (unlikely(res != req->cqe.res)) { 470 - if ((res == -EAGAIN || res == -EOPNOTSUPP) && 471 - io_rw_should_reissue(req)) { 470 + if (res == -EAGAIN && io_rw_should_reissue(req)) { 472 471 /* 473 472 * Reissue will start accounting again, finish the 474 473 * current cycle. ··· 510 511 io_req_io_end(req); 511 512 512 513 if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) 513 - req->cqe.flags |= io_put_kbuf(req, 0); 514 + req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); 514 515 515 516 io_req_rw_cleanup(req, 0); 516 517 io_req_task_complete(req, ts); ··· 592 593 */ 593 594 io_req_io_end(req); 594 595 io_req_set_res(req, final_ret, 595 - io_put_kbuf(req, issue_flags)); 596 + io_put_kbuf(req, ret, issue_flags)); 596 597 io_req_rw_cleanup(req, issue_flags); 597 598 return IOU_OK; 598 599 } ··· 854 855 855 856 ret = io_iter_do_read(rw, &io->iter); 856 857 858 + /* 859 + * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT 860 + * issue, even though they should be returning -EAGAIN. To be safe, 861 + * retry from blocking context for either. 862 + */ 863 + if (ret == -EOPNOTSUPP && force_nonblock) 864 + ret = -EAGAIN; 865 + 857 866 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { 858 867 req->flags &= ~REQ_F_REISSUE; 859 868 /* If we can poll, just do that. */ ··· 982 975 * Put our buffer and post a CQE. If we fail to post a CQE, then 983 976 * jump to the termination path. This request is then done. 984 977 */ 985 - cflags = io_put_kbuf(req, issue_flags); 978 + cflags = io_put_kbuf(req, ret, issue_flags); 986 979 rw->len = 0; /* similarly to above, reset len to 0 */ 987 980 988 981 if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { ··· 1174 1167 if (!smp_load_acquire(&req->iopoll_completed)) 1175 1168 break; 1176 1169 nr_events++; 1177 - req->cqe.flags = io_put_kbuf(req, 0); 1170 + req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0); 1178 1171 if (req->opcode != IORING_OP_URING_CMD) 1179 1172 io_req_rw_cleanup(req, 0); 1180 1173 }

+5 -2

io_uring/sqpoll.c

··· 10 10 #include <linux/slab.h> 11 11 #include <linux/audit.h> 12 12 #include <linux/security.h> 13 + #include <linux/cpuset.h> 13 14 #include <linux/io_uring.h> 14 15 15 16 #include <uapi/linux/io_uring.h> ··· 177 176 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) 178 177 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; 179 178 180 - if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { 179 + if (to_submit || !wq_list_empty(&ctx->iopoll_list)) { 181 180 const struct cred *creds = NULL; 182 181 183 182 if (ctx->sq_creds != current_cred()) ··· 461 460 return 0; 462 461 463 462 if (p->flags & IORING_SETUP_SQ_AFF) { 463 + struct cpumask allowed_mask; 464 464 int cpu = p->sq_thread_cpu; 465 465 466 466 ret = -EINVAL; 467 - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) 467 + cpuset_cpus_allowed(current, &allowed_mask); 468 + if (!cpumask_test_cpu(cpu, &allowed_mask)) 468 469 goto err_sqpoll; 469 470 sqd->sq_cpu = cpu; 470 471 } else {