Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-6.6/io_uring-2023-08-28' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:
"Fairly quiet round in terms of features, mostly just improvements all
over the map for existing code. In detail:

- Initial support for socket operations through io_uring. The latter
half of this will likely land with the 6.7 kernel, which will then
allow things like get/setsockopt (Breno)

- Cleanup of the cancel code, and then adding support for canceling
requests with the opcode as the key (me)

- Improvements for the io-wq locking (me)

- Fix affinity setting for SQPOLL based io-wq (me)

- Remove the io_uring userspace code. These were added initially as
copies from liburing, but all of them have since bitrotted and are
way out of date at this point. Rather than attempt to keep them in
sync, just get rid of them. People will have liburing available
anyway for these examples. (Pavel)

- Series improving the CQ/SQ ring caching (Pavel)

- Misc fixes and cleanups (Pavel, Yue, me)"

* tag 'for-6.6/io_uring-2023-08-28' of git://git.kernel.dk/linux: (47 commits)
io_uring: move iopoll ctx fields around
io_uring: move multishot cqe cache in ctx
io_uring: separate task_work/waiting cache line
io_uring: banish non-hot data to end of io_ring_ctx
io_uring: move non aligned field to the end
io_uring: add option to remove SQ indirection
io_uring: compact SQ/CQ heads/tails
io_uring: force inline io_fill_cqe_req
io_uring: merge iopoll and normal completion paths
io_uring: reorder cqring_flush and wakeups
io_uring: optimise extra io_get_cqe null check
io_uring: refactor __io_get_cqe()
io_uring: simplify big_cqe handling
io_uring: cqe init hardening
io_uring: improve cqe !tracing hot path
io_uring/rsrc: Annotate struct io_mapped_ubuf with __counted_by
io_uring/sqpoll: fix io-wq affinity when IORING_SETUP_SQPOLL is used
io_uring: simplify io_run_task_work_sig return
io_uring/rsrc: keep one global dummy_ubuf
io_uring: never overflow io_aux_cqe
...

+442 -1777
-1
MAINTAINERS
··· 10966 10966 F: include/trace/events/io_uring.h 10967 10967 F: include/uapi/linux/io_uring.h 10968 10968 F: io_uring/ 10969 - F: tools/io_uring/ 10970 10969 10971 10970 IPMI SUBSYSTEM 10972 10971 M: Corey Minyard <minyard@acm.org>
+6
include/linux/io_uring.h
··· 81 81 if (tsk->io_uring) 82 82 __io_uring_free(tsk); 83 83 } 84 + int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); 84 85 #else 85 86 static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 86 87 struct iov_iter *iter, void *ioucmd) ··· 116 115 static inline const char *io_uring_get_opcode(u8 opcode) 117 116 { 118 117 return ""; 118 + } 119 + static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, 120 + unsigned int issue_flags) 121 + { 122 + return -EOPNOTSUPP; 119 123 } 120 124 #endif 121 125
+65 -64
include/linux/io_uring_types.h
··· 69 69 }; 70 70 71 71 struct io_uring { 72 - u32 head ____cacheline_aligned_in_smp; 73 - u32 tail ____cacheline_aligned_in_smp; 72 + u32 head; 73 + u32 tail; 74 74 }; 75 75 76 76 /* ··· 176 176 unsigned short submit_nr; 177 177 unsigned int cqes_count; 178 178 struct blk_plug plug; 179 - struct io_uring_cqe cqes[16]; 180 179 }; 181 180 182 181 struct io_ev_fd { ··· 204 205 unsigned int has_evfd: 1; 205 206 /* all CQEs should be posted only by the submitter task */ 206 207 unsigned int task_complete: 1; 208 + unsigned int lockless_cq: 1; 207 209 unsigned int syscall_iopoll: 1; 208 210 unsigned int poll_activated: 1; 209 211 unsigned int drain_disabled: 1; 210 212 unsigned int compat: 1; 211 213 214 + struct task_struct *submitter_task; 215 + struct io_rings *rings; 216 + struct percpu_ref refs; 217 + 212 218 enum task_work_notify_mode notify_method; 213 - 214 - /* 215 - * If IORING_SETUP_NO_MMAP is used, then the below holds 216 - * the gup'ed pages for the two rings, and the sqes. 217 - */ 218 - unsigned short n_ring_pages; 219 - unsigned short n_sqe_pages; 220 - struct page **ring_pages; 221 - struct page **sqe_pages; 222 - 223 - struct io_rings *rings; 224 - struct task_struct *submitter_task; 225 - struct percpu_ref refs; 226 219 } ____cacheline_aligned_in_smp; 227 220 228 221 /* submission data */ ··· 252 261 253 262 struct io_buffer_list *io_bl; 254 263 struct xarray io_bl_xa; 255 - struct list_head io_buffers_cache; 256 264 257 265 struct io_hash_table cancel_table_locked; 258 - struct list_head cq_overflow_list; 259 266 struct io_alloc_cache apoll_cache; 260 267 struct io_alloc_cache netmsg_cache; 268 + 269 + /* 270 + * ->iopoll_list is protected by the ctx->uring_lock for 271 + * io_uring instances that don't use IORING_SETUP_SQPOLL. 272 + * For SQPOLL, only the single threaded io_sq_thread() will 273 + * manipulate the list, hence no extra locking is needed there. 
274 + */ 275 + struct io_wq_work_list iopoll_list; 276 + bool poll_multi_queue; 261 277 } ____cacheline_aligned_in_smp; 262 - 263 - /* IRQ completion list, under ->completion_lock */ 264 - struct io_wq_work_list locked_free_list; 265 - unsigned int locked_free_nr; 266 - 267 - const struct cred *sq_creds; /* cred used for __io_sq_thread() */ 268 - struct io_sq_data *sq_data; /* if using sq thread polling */ 269 - 270 - struct wait_queue_head sqo_sq_wait; 271 - struct list_head sqd_list; 272 - 273 - unsigned long check_cq; 274 - 275 - unsigned int file_alloc_start; 276 - unsigned int file_alloc_end; 277 - 278 - struct xarray personalities; 279 - u32 pers_next; 280 278 281 279 struct { 282 280 /* ··· 278 298 unsigned cached_cq_tail; 279 299 unsigned cq_entries; 280 300 struct io_ev_fd __rcu *io_ev_fd; 281 - struct wait_queue_head cq_wait; 282 301 unsigned cq_extra; 283 302 } ____cacheline_aligned_in_smp; 284 303 304 + /* 305 + * task_work and async notification delivery cacheline. Expected to 306 + * regularly bounce b/w CPUs. 307 + */ 285 308 struct { 286 - spinlock_t completion_lock; 287 - 288 - bool poll_multi_queue; 289 - atomic_t cq_wait_nr; 290 - 291 - /* 292 - * ->iopoll_list is protected by the ctx->uring_lock for 293 - * io_uring instances that don't use IORING_SETUP_SQPOLL. 294 - * For SQPOLL, only the single threaded io_sq_thread() will 295 - * manipulate the list, hence no extra locking is needed there. 
296 - */ 297 - struct io_wq_work_list iopoll_list; 298 - struct io_hash_table cancel_table; 299 - 300 309 struct llist_head work_llist; 301 - 302 - struct list_head io_buffers_comp; 310 + unsigned long check_cq; 311 + atomic_t cq_wait_nr; 312 + atomic_t cq_timeouts; 313 + struct wait_queue_head cq_wait; 303 314 } ____cacheline_aligned_in_smp; 304 315 305 316 /* timeouts */ 306 317 struct { 307 318 spinlock_t timeout_lock; 308 - atomic_t cq_timeouts; 309 319 struct list_head timeout_list; 310 320 struct list_head ltimeout_list; 311 321 unsigned cq_last_tm_flush; 312 322 } ____cacheline_aligned_in_smp; 323 + 324 + struct io_uring_cqe completion_cqes[16]; 325 + 326 + spinlock_t completion_lock; 327 + 328 + /* IRQ completion list, under ->completion_lock */ 329 + struct io_wq_work_list locked_free_list; 330 + unsigned int locked_free_nr; 331 + 332 + struct list_head io_buffers_comp; 333 + struct list_head cq_overflow_list; 334 + struct io_hash_table cancel_table; 335 + 336 + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ 337 + struct io_sq_data *sq_data; /* if using sq thread polling */ 338 + 339 + struct wait_queue_head sqo_sq_wait; 340 + struct list_head sqd_list; 341 + 342 + unsigned int file_alloc_start; 343 + unsigned int file_alloc_end; 344 + 345 + struct xarray personalities; 346 + u32 pers_next; 347 + 348 + struct list_head io_buffers_cache; 313 349 314 350 /* Keep this last, we don't need it for the fast path */ 315 351 struct wait_queue_head poll_wq; ··· 370 374 unsigned sq_thread_idle; 371 375 /* protected by ->completion_lock */ 372 376 unsigned evfd_last_cq_tail; 377 + 378 + /* 379 + * If IORING_SETUP_NO_MMAP is used, then the below holds 380 + * the gup'ed pages for the two rings, and the sqes. 
381 + */ 382 + unsigned short n_ring_pages; 383 + unsigned short n_sqe_pages; 384 + struct page **ring_pages; 385 + struct page **sqe_pages; 373 386 }; 374 387 375 388 struct io_tw_state { ··· 414 409 REQ_F_SINGLE_POLL_BIT, 415 410 REQ_F_DOUBLE_POLL_BIT, 416 411 REQ_F_PARTIAL_IO_BIT, 417 - REQ_F_CQE32_INIT_BIT, 418 412 REQ_F_APOLL_MULTISHOT_BIT, 419 413 REQ_F_CLEAR_POLLIN_BIT, 420 414 REQ_F_HASH_LOCKED_BIT, ··· 483 479 REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), 484 480 /* fast poll multishot mode */ 485 481 REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), 486 - /* ->extra1 and ->extra2 are initialised */ 487 - REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), 488 482 /* recvmsg special flag, clear EPOLLIN */ 489 483 REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), 490 484 /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ ··· 581 579 struct io_task_work io_task_work; 582 580 unsigned nr_tw; 583 581 /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ 584 - union { 585 - struct hlist_node hash_node; 586 - struct { 587 - u64 extra1; 588 - u64 extra2; 589 - }; 590 - }; 582 + struct hlist_node hash_node; 591 583 /* internal polling, see IORING_FEAT_FAST_POLL */ 592 584 struct async_poll *apoll; 593 585 /* opcode allocated if it needs to store data for async defer */ ··· 591 595 /* custom credentials, valid IFF REQ_F_CREDS is set */ 592 596 const struct cred *creds; 593 597 struct io_wq_work work; 598 + 599 + struct { 600 + u64 extra1; 601 + u64 extra2; 602 + } big_cqe; 594 603 }; 595 604 596 605 struct io_overflow_cqe {
+20 -1
include/uapi/linux/io_uring.h
··· 185 185 */ 186 186 #define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15) 187 187 188 + /* 189 + * Removes indirection through the SQ index array. 190 + */ 191 + #define IORING_SETUP_NO_SQARRAY (1U << 16) 192 + 188 193 enum io_uring_op { 189 194 IORING_OP_NOP, 190 195 IORING_OP_READV, ··· 304 299 * request 'user_data' 305 300 * IORING_ASYNC_CANCEL_ANY Match any request 306 301 * IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor 302 + * IORING_ASYNC_CANCEL_USERDATA Match on user_data, default for no other key 303 + * IORING_ASYNC_CANCEL_OP Match request based on opcode 307 304 */ 308 305 #define IORING_ASYNC_CANCEL_ALL (1U << 0) 309 306 #define IORING_ASYNC_CANCEL_FD (1U << 1) 310 307 #define IORING_ASYNC_CANCEL_ANY (1U << 2) 311 308 #define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3) 309 + #define IORING_ASYNC_CANCEL_USERDATA (1U << 4) 310 + #define IORING_ASYNC_CANCEL_OP (1U << 5) 312 311 313 312 /* 314 313 * send/sendmsg and recv/recvmsg flags (sqe->ioprio) ··· 706 697 __s32 fd; 707 698 __u32 flags; 708 699 struct __kernel_timespec timeout; 709 - __u64 pad[4]; 700 + __u8 opcode; 701 + __u8 pad[7]; 702 + __u64 pad2[3]; 710 703 }; 711 704 712 705 /* ··· 726 715 __u32 controllen; 727 716 __u32 payloadlen; 728 717 __u32 flags; 718 + }; 719 + 720 + /* 721 + * Argument for IORING_OP_URING_CMD when file is a socket 722 + */ 723 + enum { 724 + SOCKET_URING_OP_SIOCINQ = 0, 725 + SOCKET_URING_OP_SIOCOUTQ, 729 726 }; 730 727 731 728 #ifdef __cplusplus
+54 -22
io_uring/cancel.c
··· 22 22 u64 addr; 23 23 u32 flags; 24 24 s32 fd; 25 + u8 opcode; 25 26 }; 26 27 27 28 #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ 28 - IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED) 29 + IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \ 30 + IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP) 31 + 32 + /* 33 + * Returns true if the request matches the criteria outlined by 'cd'. 34 + */ 35 + bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) 36 + { 37 + bool match_user_data = cd->flags & IORING_ASYNC_CANCEL_USERDATA; 38 + 39 + if (req->ctx != cd->ctx) 40 + return false; 41 + 42 + if (!(cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP))) 43 + match_user_data = true; 44 + 45 + if (cd->flags & IORING_ASYNC_CANCEL_ANY) 46 + goto check_seq; 47 + if (cd->flags & IORING_ASYNC_CANCEL_FD) { 48 + if (req->file != cd->file) 49 + return false; 50 + } 51 + if (cd->flags & IORING_ASYNC_CANCEL_OP) { 52 + if (req->opcode != cd->opcode) 53 + return false; 54 + } 55 + if (match_user_data && req->cqe.user_data != cd->data) 56 + return false; 57 + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { 58 + check_seq: 59 + if (cd->seq == req->work.cancel_seq) 60 + return false; 61 + req->work.cancel_seq = cd->seq; 62 + } 63 + 64 + return true; 65 + } 29 66 30 67 static bool io_cancel_cb(struct io_wq_work *work, void *data) 31 68 { 32 69 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 33 70 struct io_cancel_data *cd = data; 34 71 35 - if (req->ctx != cd->ctx) 36 - return false; 37 - if (cd->flags & IORING_ASYNC_CANCEL_ANY) { 38 - ; 39 - } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { 40 - if (req->file != cd->file) 41 - return false; 42 - } else { 43 - if (req->cqe.user_data != cd->data) 44 - return false; 45 - } 46 - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { 47 - if (cd->seq == req->work.cancel_seq) 48 - return false; 49 - req->work.cancel_seq = cd->seq; 50 - } 51 - 
return true; 72 + return io_cancel_req_match(req, cd); 52 73 } 53 74 54 75 static int io_async_cancel_one(struct io_uring_task *tctx, ··· 132 111 133 112 if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) 134 113 return -EINVAL; 135 - if (sqe->off || sqe->len || sqe->splice_fd_in) 114 + if (sqe->off || sqe->splice_fd_in) 136 115 return -EINVAL; 137 116 138 117 cancel->addr = READ_ONCE(sqe->addr); ··· 143 122 if (cancel->flags & IORING_ASYNC_CANCEL_ANY) 144 123 return -EINVAL; 145 124 cancel->fd = READ_ONCE(sqe->fd); 125 + } 126 + if (cancel->flags & IORING_ASYNC_CANCEL_OP) { 127 + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) 128 + return -EINVAL; 129 + cancel->opcode = READ_ONCE(sqe->len); 146 130 } 147 131 148 132 return 0; ··· 195 169 .ctx = req->ctx, 196 170 .data = cancel->addr, 197 171 .flags = cancel->flags, 172 + .opcode = cancel->opcode, 198 173 .seq = atomic_inc_return(&req->ctx->cancel_seq), 199 174 }; 200 175 struct io_uring_task *tctx = req->task->io_uring; ··· 265 238 struct io_uring_sync_cancel_reg sc; 266 239 struct fd f = { }; 267 240 DEFINE_WAIT(wait); 268 - int ret; 241 + int ret, i; 269 242 270 243 if (copy_from_user(&sc, arg, sizeof(sc))) 271 244 return -EFAULT; 272 245 if (sc.flags & ~CANCEL_FLAGS) 273 246 return -EINVAL; 274 - if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3]) 275 - return -EINVAL; 247 + for (i = 0; i < ARRAY_SIZE(sc.pad); i++) 248 + if (sc.pad[i]) 249 + return -EINVAL; 250 + for (i = 0; i < ARRAY_SIZE(sc.pad2); i++) 251 + if (sc.pad2[i]) 252 + return -EINVAL; 276 253 277 254 cd.data = sc.addr; 278 255 cd.flags = sc.flags; 256 + cd.opcode = sc.opcode; 279 257 280 258 /* we can grab a normal file descriptor upfront */ 281 259 if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
+2 -1
io_uring/cancel.h
··· 8 8 u64 data; 9 9 struct file *file; 10 10 }; 11 + u8 opcode; 11 12 u32 flags; 12 13 int seq; 13 14 }; 14 - 15 15 16 16 int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 17 17 int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); ··· 21 21 void init_hash_table(struct io_hash_table *table, unsigned size); 22 22 23 23 int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); 24 + bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
+6 -12
io_uring/fdinfo.c
··· 46 46 return 0; 47 47 } 48 48 49 - static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, 50 - struct seq_file *m) 49 + /* 50 + * Caller holds a reference to the file already, we don't need to do 51 + * anything else to get an extra reference. 52 + */ 53 + __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) 51 54 { 55 + struct io_ring_ctx *ctx = f->private_data; 52 56 struct io_sq_data *sq = NULL; 53 57 struct io_overflow_cqe *ocqe; 54 58 struct io_rings *r = ctx->rings; ··· 206 202 } 207 203 208 204 spin_unlock(&ctx->completion_lock); 209 - } 210 - 211 - __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) 212 - { 213 - struct io_ring_ctx *ctx = f->private_data; 214 - 215 - if (percpu_ref_tryget(&ctx->refs)) { 216 - __io_uring_show_fdinfo(ctx, m); 217 - percpu_ref_put(&ctx->refs); 218 - } 219 205 } 220 206 #endif
+47 -23
io_uring/io-wq.c
··· 232 232 do_exit(0); 233 233 } 234 234 235 - static inline bool io_acct_run_queue(struct io_wq_acct *acct) 235 + static inline bool __io_acct_run_queue(struct io_wq_acct *acct) 236 236 { 237 - bool ret = false; 237 + return !test_bit(IO_ACCT_STALLED_BIT, &acct->flags) && 238 + !wq_list_empty(&acct->work_list); 239 + } 238 240 241 + /* 242 + * If there's work to do, returns true with acct->lock acquired. If not, 243 + * returns false with no lock held. 244 + */ 245 + static inline bool io_acct_run_queue(struct io_wq_acct *acct) 246 + __acquires(&acct->lock) 247 + { 239 248 raw_spin_lock(&acct->lock); 240 - if (!wq_list_empty(&acct->work_list) && 241 - !test_bit(IO_ACCT_STALLED_BIT, &acct->flags)) 242 - ret = true; 243 - raw_spin_unlock(&acct->lock); 249 + if (__io_acct_run_queue(acct)) 250 + return true; 244 251 245 - return ret; 252 + raw_spin_unlock(&acct->lock); 253 + return false; 246 254 } 247 255 248 256 /* ··· 276 268 io_worker_release(worker); 277 269 continue; 278 270 } 279 - if (wake_up_process(worker->task)) { 280 - io_worker_release(worker); 281 - return true; 282 - } 271 + /* 272 + * If the worker is already running, it's either already 273 + * starting work or finishing work. In either case, if it does 274 + * go to sleep, we'll kick off a new task for this work anyway. 
275 + */ 276 + wake_up_process(worker->task); 283 277 io_worker_release(worker); 278 + return true; 284 279 } 285 280 286 281 return false; ··· 408 397 if (!io_acct_run_queue(acct)) 409 398 return; 410 399 400 + raw_spin_unlock(&acct->lock); 411 401 atomic_inc(&acct->nr_running); 412 402 atomic_inc(&wq->worker_refs); 413 403 io_queue_worker_create(worker, acct, create_worker_cb); ··· 533 521 raw_spin_unlock(&worker->lock); 534 522 } 535 523 536 - static void io_worker_handle_work(struct io_worker *worker) 524 + /* 525 + * Called with acct->lock held, drops it before returning 526 + */ 527 + static void io_worker_handle_work(struct io_wq_acct *acct, 528 + struct io_worker *worker) 529 + __releases(&acct->lock) 537 530 { 538 - struct io_wq_acct *acct = io_wq_get_acct(worker); 539 531 struct io_wq *wq = worker->wq; 540 532 bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); 541 533 ··· 553 537 * can't make progress, any work completion or insertion will 554 538 * clear the stalled flag. 555 539 */ 556 - raw_spin_lock(&acct->lock); 557 540 work = io_get_next_work(acct, worker); 558 541 raw_spin_unlock(&acct->lock); 559 542 if (work) { ··· 606 591 wake_up(&wq->hash->wait); 607 592 } 608 593 } while (work); 594 + 595 + if (!__io_acct_run_queue(acct)) 596 + break; 597 + raw_spin_lock(&acct->lock); 609 598 } while (1); 610 599 } 611 600 ··· 630 611 long ret; 631 612 632 613 set_current_state(TASK_INTERRUPTIBLE); 614 + 615 + /* 616 + * If we have work to do, io_acct_run_queue() returns with 617 + * the acct->lock held. If not, it will drop it. 
618 + */ 633 619 while (io_acct_run_queue(acct)) 634 - io_worker_handle_work(worker); 620 + io_worker_handle_work(acct, worker); 635 621 636 622 raw_spin_lock(&wq->lock); 637 623 /* ··· 669 645 } 670 646 } 671 647 672 - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) 673 - io_worker_handle_work(worker); 648 + if (test_bit(IO_WQ_BIT_EXIT, &wq->state) && io_acct_run_queue(acct)) 649 + io_worker_handle_work(acct, worker); 674 650 675 651 io_worker_exit(worker); 676 652 return 0; ··· 933 909 clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); 934 910 raw_spin_unlock(&acct->lock); 935 911 936 - raw_spin_lock(&wq->lock); 937 912 rcu_read_lock(); 938 913 do_create = !io_wq_activate_free_worker(wq, acct); 939 914 rcu_read_unlock(); 940 - 941 - raw_spin_unlock(&wq->lock); 942 915 943 916 if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || 944 917 !atomic_read(&acct->nr_running))) { ··· 1306 1285 return __io_wq_cpu_online(wq, cpu, false); 1307 1286 } 1308 1287 1309 - int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) 1288 + int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask) 1310 1289 { 1290 + if (!tctx || !tctx->io_wq) 1291 + return -EINVAL; 1292 + 1311 1293 rcu_read_lock(); 1312 1294 if (mask) 1313 - cpumask_copy(wq->cpu_mask, mask); 1295 + cpumask_copy(tctx->io_wq->cpu_mask, mask); 1314 1296 else 1315 - cpumask_copy(wq->cpu_mask, cpu_possible_mask); 1297 + cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask); 1316 1298 rcu_read_unlock(); 1317 1299 1318 1300 return 0;
+1 -1
io_uring/io-wq.h
··· 50 50 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); 51 51 void io_wq_hash_work(struct io_wq_work *work, void *val); 52 52 53 - int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); 53 + int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask); 54 54 int io_wq_max_workers(struct io_wq *wq, int *new_count); 55 55 56 56 static inline bool io_wq_is_hashed(struct io_wq_work *work)
+125 -104
io_uring/io_uring.c
··· 147 147 bool cancel_all); 148 148 149 149 static void io_queue_sqe(struct io_kiocb *req); 150 - static void io_move_task_work_from_local(struct io_ring_ctx *ctx); 151 - static void __io_submit_flush_completions(struct io_ring_ctx *ctx); 152 150 153 151 struct kmem_cache *req_cachep; 154 152 ··· 227 229 static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) 228 230 { 229 231 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 230 - kasan_poison_object_data(req_cachep, req); 231 232 } 232 233 233 234 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) ··· 289 292 goto err; 290 293 if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) 291 294 goto err; 292 - 293 - ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); 294 - if (!ctx->dummy_ubuf) 295 - goto err; 296 - /* set invalid range, so io_import_fixed() fails meeting it */ 297 - ctx->dummy_ubuf->ubuf = -1UL; 298 - 299 295 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 300 296 0, GFP_KERNEL)) 301 297 goto err; ··· 327 337 INIT_WQ_LIST(&ctx->submit_state.compl_reqs); 328 338 return ctx; 329 339 err: 330 - kfree(ctx->dummy_ubuf); 331 340 kfree(ctx->cancel_table.hbs); 332 341 kfree(ctx->cancel_table_locked.hbs); 333 342 kfree(ctx->io_bl); ··· 615 626 616 627 static inline void __io_cq_lock(struct io_ring_ctx *ctx) 617 628 { 618 - if (!ctx->task_complete) 629 + if (!ctx->lockless_cq) 619 630 spin_lock(&ctx->completion_lock); 620 631 } 621 632 ··· 628 639 static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) 629 640 { 630 641 io_commit_cqring(ctx); 631 - 632 - if (ctx->task_complete) { 633 - /* 634 - * ->task_complete implies that only current might be waiting 635 - * for CQEs, and obviously, we currently don't. No one is 636 - * waiting, wakeups are futile, skip them. 
637 - */ 638 - io_commit_cqring_flush(ctx); 639 - } else { 640 - spin_unlock(&ctx->completion_lock); 641 - io_commit_cqring_flush(ctx); 642 - io_cqring_wake(ctx); 642 + if (!ctx->task_complete) { 643 + if (!ctx->lockless_cq) 644 + spin_unlock(&ctx->completion_lock); 645 + /* IOPOLL rings only need to wake up if it's also SQPOLL */ 646 + if (!ctx->syscall_iopoll) 647 + io_cqring_wake(ctx); 643 648 } 649 + io_commit_cqring_flush(ctx); 644 650 } 645 651 646 652 static void io_cq_unlock_post(struct io_ring_ctx *ctx) ··· 643 659 { 644 660 io_commit_cqring(ctx); 645 661 spin_unlock(&ctx->completion_lock); 646 - io_commit_cqring_flush(ctx); 647 662 io_cqring_wake(ctx); 663 + io_commit_cqring_flush(ctx); 648 664 } 649 665 650 666 /* Returns true if there are no backlogged entries after the flush */ ··· 677 693 678 694 io_cq_lock(ctx); 679 695 while (!list_empty(&ctx->cq_overflow_list)) { 680 - struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true); 696 + struct io_uring_cqe *cqe; 681 697 struct io_overflow_cqe *ocqe; 682 698 683 - if (!cqe) 699 + if (!io_get_cqe_overflow(ctx, &cqe, true)) 684 700 break; 685 701 ocqe = list_first_entry(&ctx->cq_overflow_list, 686 702 struct io_overflow_cqe, list); ··· 799 815 return true; 800 816 } 801 817 802 - bool io_req_cqe_overflow(struct io_kiocb *req) 818 + void io_req_cqe_overflow(struct io_kiocb *req) 803 819 { 804 - if (!(req->flags & REQ_F_CQE32_INIT)) { 805 - req->extra1 = 0; 806 - req->extra2 = 0; 807 - } 808 - return io_cqring_event_overflow(req->ctx, req->cqe.user_data, 809 - req->cqe.res, req->cqe.flags, 810 - req->extra1, req->extra2); 820 + io_cqring_event_overflow(req->ctx, req->cqe.user_data, 821 + req->cqe.res, req->cqe.flags, 822 + req->big_cqe.extra1, req->big_cqe.extra2); 823 + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); 811 824 } 812 825 813 826 /* ··· 812 831 * control dependency is enough as we're using WRITE_ONCE to 813 832 * fill the cq entry 814 833 */ 815 - struct io_uring_cqe *__io_get_cqe(struct 
io_ring_ctx *ctx, bool overflow) 834 + bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) 816 835 { 817 836 struct io_rings *rings = ctx->rings; 818 837 unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); ··· 824 843 * Force overflow the completion. 825 844 */ 826 845 if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) 827 - return NULL; 846 + return false; 828 847 829 848 /* userspace may cheat modifying the tail, be safe and do min */ 830 849 queued = min(__io_cqring_events(ctx), ctx->cq_entries); ··· 832 851 /* we need a contiguous range, limit based on the current array offset */ 833 852 len = min(free, ctx->cq_entries - off); 834 853 if (!len) 835 - return NULL; 854 + return false; 836 855 837 856 if (ctx->flags & IORING_SETUP_CQE32) { 838 857 off <<= 1; ··· 841 860 842 861 ctx->cqe_cached = &rings->cqes[off]; 843 862 ctx->cqe_sentinel = ctx->cqe_cached + len; 844 - 845 - ctx->cached_cq_tail++; 846 - ctx->cqe_cached++; 847 - if (ctx->flags & IORING_SETUP_CQE32) 848 - ctx->cqe_cached++; 849 - return &rings->cqes[off]; 863 + return true; 850 864 } 851 865 852 866 static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, ··· 856 880 * submission (by quite a lot). Increment the overflow count in 857 881 * the ring. 
858 882 */ 859 - cqe = io_get_cqe(ctx); 860 - if (likely(cqe)) { 883 + if (likely(io_get_cqe(ctx, &cqe))) { 861 884 trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); 862 885 863 886 WRITE_ONCE(cqe->user_data, user_data); ··· 880 905 881 906 lockdep_assert_held(&ctx->uring_lock); 882 907 for (i = 0; i < state->cqes_count; i++) { 883 - struct io_uring_cqe *cqe = &state->cqes[i]; 908 + struct io_uring_cqe *cqe = &ctx->completion_cqes[i]; 884 909 885 910 if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { 886 911 if (ctx->task_complete) { ··· 916 941 return __io_post_aux_cqe(ctx, user_data, res, cflags, true); 917 942 } 918 943 919 - bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, 920 - bool allow_overflow) 944 + /* 945 + * A helper for multishot requests posting additional CQEs. 946 + * Should only be used from a task_work including IO_URING_F_MULTISHOT. 947 + */ 948 + bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) 921 949 { 922 950 struct io_ring_ctx *ctx = req->ctx; 923 951 u64 user_data = req->cqe.user_data; 924 952 struct io_uring_cqe *cqe; 925 953 926 954 if (!defer) 927 - return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); 955 + return __io_post_aux_cqe(ctx, user_data, res, cflags, false); 928 956 929 957 lockdep_assert_held(&ctx->uring_lock); 930 958 931 - if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->submit_state.cqes)) { 959 + if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) { 932 960 __io_cq_lock(ctx); 933 961 __io_flush_post_cqes(ctx); 934 962 /* no need to flush - flush is deferred */ ··· 942 964 * however it's main job is to prevent unbounded posted completions, 943 965 * and in that it works just as well. 
944 966 */ 945 - if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) 967 + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) 946 968 return false; 947 969 948 - cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++]; 970 + cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++]; 949 971 cqe->user_data = user_data; 950 972 cqe->res = res; 951 973 cqe->flags = cflags; ··· 958 980 struct io_rsrc_node *rsrc_node = NULL; 959 981 960 982 io_cq_lock(ctx); 961 - if (!(req->flags & REQ_F_CQE_SKIP)) 962 - io_fill_cqe_req(ctx, req); 983 + if (!(req->flags & REQ_F_CQE_SKIP)) { 984 + if (!io_fill_cqe_req(ctx, req)) 985 + io_req_cqe_overflow(req); 986 + } 963 987 964 988 /* 965 989 * If we're the last reference to this request, add to our locked ··· 979 999 io_put_kbuf_comp(req); 980 1000 if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) 981 1001 io_clean_op(req); 982 - if (!(req->flags & REQ_F_FIXED_FILE)) 983 - io_put_file(req->file); 1002 + io_put_file(req); 984 1003 985 1004 rsrc_node = req->rsrc_node; 986 1005 /* ··· 1041 1062 req->link = NULL; 1042 1063 req->async_data = NULL; 1043 1064 /* not necessary, but safer to zero */ 1044 - req->cqe.res = 0; 1065 + memset(&req->cqe, 0, sizeof(req->cqe)); 1066 + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); 1045 1067 } 1046 1068 1047 1069 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, ··· 1487 1507 io_req_task_queue(nxt); 1488 1508 } 1489 1509 1490 - void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) 1510 + static void io_free_batch_list(struct io_ring_ctx *ctx, 1511 + struct io_wq_work_node *node) 1491 1512 __must_hold(&ctx->uring_lock) 1492 1513 { 1493 1514 do { ··· 1515 1534 if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) 1516 1535 io_clean_op(req); 1517 1536 } 1518 - if (!(req->flags & REQ_F_FIXED_FILE)) 1519 - io_put_file(req->file); 1537 + io_put_file(req); 1520 1538 1521 1539 io_req_put_rsrc_locked(req, ctx); 1522 1540 ··· 1525 1545 } while 
(node); 1526 1546 } 1527 1547 1528 - static void __io_submit_flush_completions(struct io_ring_ctx *ctx) 1548 + void __io_submit_flush_completions(struct io_ring_ctx *ctx) 1529 1549 __must_hold(&ctx->uring_lock) 1530 1550 { 1531 1551 struct io_submit_state *state = &ctx->submit_state; ··· 1540 1560 comp_list); 1541 1561 1542 1562 if (!(req->flags & REQ_F_CQE_SKIP) && 1543 - unlikely(!__io_fill_cqe_req(ctx, req))) { 1563 + unlikely(!io_fill_cqe_req(ctx, req))) { 1544 1564 if (ctx->task_complete) { 1545 1565 spin_lock(&ctx->completion_lock); 1546 1566 io_req_cqe_overflow(req); ··· 1596 1616 static int io_iopoll_check(struct io_ring_ctx *ctx, long min) 1597 1617 { 1598 1618 unsigned int nr_events = 0; 1599 - int ret = 0; 1600 1619 unsigned long check_cq; 1601 1620 1602 1621 if (!io_allowed_run_tw(ctx)) ··· 1621 1642 return 0; 1622 1643 1623 1644 do { 1645 + int ret = 0; 1646 + 1624 1647 /* 1625 1648 * If a submit got punted to a workqueue, we can have the 1626 1649 * application entering polling for a command before it gets ··· 1651 1670 break; 1652 1671 } 1653 1672 ret = io_do_iopoll(ctx, !min); 1654 - if (ret < 0) 1655 - break; 1656 - nr_events += ret; 1657 - ret = 0; 1658 - } while (nr_events < min && !need_resched()); 1673 + if (unlikely(ret < 0)) 1674 + return ret; 1659 1675 1660 - return ret; 1676 + if (task_sigpending(current)) 1677 + return -EINTR; 1678 + if (need_resched()) 1679 + break; 1680 + 1681 + nr_events += ret; 1682 + } while (nr_events < min); 1683 + 1684 + return 0; 1661 1685 } 1662 1686 1663 1687 void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) ··· 2347 2361 */ 2348 2362 static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) 2349 2363 { 2350 - unsigned head, mask = ctx->sq_entries - 1; 2351 - unsigned sq_idx = ctx->cached_sq_head++ & mask; 2364 + unsigned mask = ctx->sq_entries - 1; 2365 + unsigned head = ctx->cached_sq_head++ & mask; 2366 + 2367 + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) { 2368 + 
head = READ_ONCE(ctx->sq_array[head]); 2369 + if (unlikely(head >= ctx->sq_entries)) { 2370 + /* drop invalid entries */ 2371 + spin_lock(&ctx->completion_lock); 2372 + ctx->cq_extra--; 2373 + spin_unlock(&ctx->completion_lock); 2374 + WRITE_ONCE(ctx->rings->sq_dropped, 2375 + READ_ONCE(ctx->rings->sq_dropped) + 1); 2376 + return false; 2377 + } 2378 + } 2352 2379 2353 2380 /* 2354 2381 * The cached sq head (or cq tail) serves two purposes: ··· 2371 2372 * 2) allows the kernel side to track the head on its own, even 2372 2373 * though the application is the one updating it. 2373 2374 */ 2374 - head = READ_ONCE(ctx->sq_array[sq_idx]); 2375 - if (likely(head < ctx->sq_entries)) { 2376 - /* double index for 128-byte SQEs, twice as long */ 2377 - if (ctx->flags & IORING_SETUP_SQE128) 2378 - head <<= 1; 2379 - *sqe = &ctx->sq_sqes[head]; 2380 - return true; 2381 - } 2382 2375 2383 - /* drop invalid entries */ 2384 - ctx->cq_extra--; 2385 - WRITE_ONCE(ctx->rings->sq_dropped, 2386 - READ_ONCE(ctx->rings->sq_dropped) + 1); 2387 - return false; 2376 + /* double index for 128-byte SQEs, twice as long */ 2377 + if (ctx->flags & IORING_SETUP_SQE128) 2378 + head <<= 1; 2379 + *sqe = &ctx->sq_sqes[head]; 2380 + return true; 2388 2381 } 2389 2382 2390 2383 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) ··· 2475 2484 if (!llist_empty(&ctx->work_llist)) { 2476 2485 __set_current_state(TASK_RUNNING); 2477 2486 if (io_run_local_work(ctx) > 0) 2478 - return 1; 2487 + return 0; 2479 2488 } 2480 2489 if (io_run_task_work() > 0) 2481 - return 1; 2490 + return 0; 2482 2491 if (task_sigpending(current)) 2483 2492 return -EINTR; 2484 2493 return 0; ··· 2752 2761 return SIZE_MAX; 2753 2762 #endif 2754 2763 2764 + if (ctx->flags & IORING_SETUP_NO_SQARRAY) { 2765 + if (sq_offset) 2766 + *sq_offset = SIZE_MAX; 2767 + return off; 2768 + } 2769 + 2755 2770 if (sq_offset) 2756 2771 *sq_offset = off; 2757 2772 ··· 2900 2903 io_wq_put_hash(ctx->hash_map); 2901 2904 
kfree(ctx->cancel_table.hbs); 2902 2905 kfree(ctx->cancel_table_locked.hbs); 2903 - kfree(ctx->dummy_ubuf); 2904 2906 kfree(ctx->io_bl); 2905 2907 xa_destroy(&ctx->io_bl_xa); 2906 2908 kfree(ctx); ··· 3729 3733 return PTR_ERR(rings); 3730 3734 3731 3735 ctx->rings = rings; 3732 - ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 3736 + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 3737 + ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 3733 3738 rings->sq_ring_mask = p->sq_entries - 1; 3734 3739 rings->cq_ring_mask = p->cq_entries - 1; 3735 3740 rings->sq_ring_entries = p->sq_entries; ··· 3859 3862 !(ctx->flags & IORING_SETUP_SQPOLL)) 3860 3863 ctx->task_complete = true; 3861 3864 3865 + if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) 3866 + ctx->lockless_cq = true; 3867 + 3862 3868 /* 3863 3869 * lazy poll_wq activation relies on ->task_complete for synchronisation 3864 3870 * purposes, see io_activate_pollwq() ··· 3941 3941 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); 3942 3942 p->sq_off.flags = offsetof(struct io_rings, sq_flags); 3943 3943 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); 3944 - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; 3944 + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 3945 + p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; 3945 3946 p->sq_off.resv1 = 0; 3946 3947 if (!(ctx->flags & IORING_SETUP_NO_MMAP)) 3947 3948 p->sq_off.user_addr = 0; ··· 4031 4030 IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | 4032 4031 IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | 4033 4032 IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | 4034 - IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY)) 4033 + IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | 4034 + IORING_SETUP_NO_SQARRAY)) 4035 4035 return -EINVAL; 4036 4036 4037 4037 return io_uring_create(entries, &p, params); ··· 4195 4193 return 0; 4196 4194 } 4197 4195 4196 + static __cold int 
__io_register_iowq_aff(struct io_ring_ctx *ctx, 4197 + cpumask_var_t new_mask) 4198 + { 4199 + int ret; 4200 + 4201 + if (!(ctx->flags & IORING_SETUP_SQPOLL)) { 4202 + ret = io_wq_cpu_affinity(current->io_uring, new_mask); 4203 + } else { 4204 + mutex_unlock(&ctx->uring_lock); 4205 + ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); 4206 + mutex_lock(&ctx->uring_lock); 4207 + } 4208 + 4209 + return ret; 4210 + } 4211 + 4198 4212 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, 4199 4213 void __user *arg, unsigned len) 4200 4214 { 4201 - struct io_uring_task *tctx = current->io_uring; 4202 4215 cpumask_var_t new_mask; 4203 4216 int ret; 4204 - 4205 - if (!tctx || !tctx->io_wq) 4206 - return -EINVAL; 4207 4217 4208 4218 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4209 4219 return -ENOMEM; ··· 4237 4223 return -EFAULT; 4238 4224 } 4239 4225 4240 - ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); 4226 + ret = __io_register_iowq_aff(ctx, new_mask); 4241 4227 free_cpumask_var(new_mask); 4242 4228 return ret; 4243 4229 } 4244 4230 4245 4231 static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) 4246 4232 { 4247 - struct io_uring_task *tctx = current->io_uring; 4248 - 4249 - if (!tctx || !tctx->io_wq) 4250 - return -EINVAL; 4251 - 4252 - return io_wq_cpu_affinity(tctx->io_wq, NULL); 4233 + return __io_register_iowq_aff(ctx, NULL); 4253 4234 } 4254 4235 4255 4236 static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, ··· 4620 4611 4621 4612 io_uring_optable_init(); 4622 4613 4623 - req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | 4624 - SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); 4614 + /* 4615 + * Allow user copy in the per-command field, which starts after the 4616 + * file in io_kiocb and until the opcode field. The openat2 handling 4617 + * requires copying in user memory into the io_kiocb object in that 4618 + * range, and HARDENED_USERCOPY will complain if we haven't 4619 + * correctly annotated this range. 
4620 + */ 4621 + req_cachep = kmem_cache_create_usercopy("io_kiocb", 4622 + sizeof(struct io_kiocb), 0, 4623 + SLAB_HWCACHE_ALIGN | SLAB_PANIC | 4624 + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, 4625 + offsetof(struct io_kiocb, cmd.data), 4626 + sizeof_field(struct io_kiocb, cmd.data), NULL); 4627 + 4625 4628 return 0; 4626 4629 }; 4627 4630 __initcall(io_uring_init);
+30 -49
io_uring/io_uring.h
··· 38 38 IOU_STOP_MULTISHOT = -ECANCELED, 39 39 }; 40 40 41 - struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); 42 - bool io_req_cqe_overflow(struct io_kiocb *req); 41 + bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); 42 + void io_req_cqe_overflow(struct io_kiocb *req); 43 43 int io_run_task_work_sig(struct io_ring_ctx *ctx); 44 44 void io_req_defer_failed(struct io_kiocb *req, s32 res); 45 45 void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); 46 46 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); 47 - bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, 48 - bool allow_overflow); 47 + bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags); 49 48 void __io_commit_cqring_flush(struct io_ring_ctx *ctx); 50 49 51 50 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); ··· 72 73 int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); 73 74 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); 74 75 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); 75 - void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); 76 + void __io_submit_flush_completions(struct io_ring_ctx *ctx); 76 77 int io_req_prep_async(struct io_kiocb *req); 77 78 78 79 struct io_wq_work *io_wq_free_work(struct io_wq_work *work); ··· 109 110 #define io_for_each_link(pos, head) \ 110 111 for (pos = (head); pos; pos = pos->link) 111 112 112 - static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, 113 - bool overflow) 113 + static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, 114 + struct io_uring_cqe **ret, 115 + bool overflow) 114 116 { 115 117 io_lockdep_assert_cq_locked(ctx); 116 118 117 - if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { 118 - struct io_uring_cqe *cqe = ctx->cqe_cached; 119 - 120 - ctx->cached_cq_tail++; 121 - 
ctx->cqe_cached++; 122 - if (ctx->flags & IORING_SETUP_CQE32) 123 - ctx->cqe_cached++; 124 - return cqe; 119 + if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { 120 + if (unlikely(!io_cqe_cache_refill(ctx, overflow))) 121 + return false; 125 122 } 126 - 127 - return __io_get_cqe(ctx, overflow); 123 + *ret = ctx->cqe_cached; 124 + ctx->cached_cq_tail++; 125 + ctx->cqe_cached++; 126 + if (ctx->flags & IORING_SETUP_CQE32) 127 + ctx->cqe_cached++; 128 + return true; 128 129 } 129 130 130 - static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) 131 + static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) 131 132 { 132 - return io_get_cqe_overflow(ctx, false); 133 + return io_get_cqe_overflow(ctx, ret, false); 133 134 } 134 135 135 - static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, 136 - struct io_kiocb *req) 136 + static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, 137 + struct io_kiocb *req) 137 138 { 138 139 struct io_uring_cqe *cqe; 139 140 ··· 142 143 * submission (by quite a lot). Increment the overflow count in 143 144 * the ring. 144 145 */ 145 - cqe = io_get_cqe(ctx); 146 - if (unlikely(!cqe)) 146 + if (unlikely(!io_get_cqe(ctx, &cqe))) 147 147 return false; 148 148 149 - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, 150 - req->cqe.res, req->cqe.flags, 151 - (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, 152 - (req->flags & REQ_F_CQE32_INIT) ? 
req->extra2 : 0); 149 + if (trace_io_uring_complete_enabled()) 150 + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, 151 + req->cqe.res, req->cqe.flags, 152 + req->big_cqe.extra1, req->big_cqe.extra2); 153 153 154 154 memcpy(cqe, &req->cqe, sizeof(*cqe)); 155 - 156 155 if (ctx->flags & IORING_SETUP_CQE32) { 157 - u64 extra1 = 0, extra2 = 0; 158 - 159 - if (req->flags & REQ_F_CQE32_INIT) { 160 - extra1 = req->extra1; 161 - extra2 = req->extra2; 162 - } 163 - 164 - WRITE_ONCE(cqe->big_cqe[0], extra1); 165 - WRITE_ONCE(cqe->big_cqe[1], extra2); 156 + memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); 157 + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); 166 158 } 167 159 return true; 168 - } 169 - 170 - static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, 171 - struct io_kiocb *req) 172 - { 173 - if (likely(__io_fill_cqe_req(ctx, req))) 174 - return true; 175 - return io_req_cqe_overflow(req); 176 160 } 177 161 178 162 static inline void req_set_fail(struct io_kiocb *req) ··· 178 196 return req->flags & REQ_F_ASYNC_DATA; 179 197 } 180 198 181 - static inline void io_put_file(struct file *file) 199 + static inline void io_put_file(struct io_kiocb *req) 182 200 { 183 - if (file) 184 - fput(file); 201 + if (!(req->flags & REQ_F_FIXED_FILE) && req->file) 202 + fput(req->file); 185 203 } 186 204 187 205 static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, ··· 336 354 struct io_kiocb *req; 337 355 338 356 req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list); 339 - kasan_unpoison_object_data(req_cachep, req); 340 357 wq_stack_extract(&ctx->submit_state.free_list); 341 358 return req; 342 359 }
+4 -4
io_uring/net.c
··· 641 641 } 642 642 643 643 if (!mshot_finished) { 644 - if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, 645 - *ret, cflags | IORING_CQE_F_MORE, true)) { 644 + if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, 645 + *ret, cflags | IORING_CQE_F_MORE)) { 646 646 io_recv_prep_retry(req); 647 647 /* Known not-empty or unknown state, retry */ 648 648 if (cflags & IORING_CQE_F_SOCK_NONEMPTY || ··· 1366 1366 1367 1367 if (ret < 0) 1368 1368 return ret; 1369 - if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, ret, 1370 - IORING_CQE_F_MORE, true)) 1369 + if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, 1370 + ret, IORING_CQE_F_MORE)) 1371 1371 goto retry; 1372 1372 1373 1373 return -ECANCELED;
+9 -12
io_uring/poll.c
··· 300 300 __poll_t mask = mangle_poll(req->cqe.res & 301 301 req->apoll_events); 302 302 303 - if (!io_aux_cqe(req, ts->locked, mask, 304 - IORING_CQE_F_MORE, false)) { 303 + if (!io_fill_cqe_req_aux(req, ts->locked, mask, 304 + IORING_CQE_F_MORE)) { 305 305 io_req_set_res(req, mask, 0); 306 306 return IOU_POLL_REMOVE_POLL_USE_RES; 307 307 } ··· 824 824 825 825 spin_lock(&hb->lock); 826 826 hlist_for_each_entry(req, &hb->list, hash_node) { 827 - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && 828 - req->file != cd->file) 829 - continue; 830 - if (cd->seq == req->work.cancel_seq) 831 - continue; 832 - req->work.cancel_seq = cd->seq; 833 - *out_bucket = hb; 834 - return req; 827 + if (io_cancel_req_match(req, cd)) { 828 + *out_bucket = hb; 829 + return req; 830 + } 835 831 } 836 832 spin_unlock(&hb->lock); 837 833 } ··· 851 855 struct io_hash_bucket *bucket; 852 856 struct io_kiocb *req; 853 857 854 - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) 858 + if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP | 859 + IORING_ASYNC_CANCEL_ANY)) 855 860 req = io_poll_file_find(ctx, cd, table, &bucket); 856 861 else 857 862 req = io_poll_find(ctx, false, cd, table, &bucket); ··· 969 972 int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) 970 973 { 971 974 struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update); 972 - struct io_cancel_data cd = { .data = poll_update->old_user_data, }; 973 975 struct io_ring_ctx *ctx = req->ctx; 976 + struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, }; 974 977 struct io_hash_bucket *bucket; 975 978 struct io_kiocb *preq; 976 979 int ret2, ret = 0;
+10 -4
io_uring/rsrc.c
··· 33 33 #define IORING_MAX_FIXED_FILES (1U << 20) 34 34 #define IORING_MAX_REG_BUFFERS (1U << 14) 35 35 36 + static const struct io_mapped_ubuf dummy_ubuf = { 37 + /* set invalid range, so io_import_fixed() fails meeting it */ 38 + .ubuf = -1UL, 39 + .ubuf_end = 0, 40 + }; 41 + 36 42 int __io_account_mem(struct user_struct *user, unsigned long nr_pages) 37 43 { 38 44 unsigned long page_limit, cur_pages, new_pages; ··· 138 132 struct io_mapped_ubuf *imu = *slot; 139 133 unsigned int i; 140 134 141 - if (imu != ctx->dummy_ubuf) { 135 + if (imu != &dummy_ubuf) { 142 136 for (i = 0; i < imu->nr_bvecs; i++) 143 137 unpin_user_page(imu->bvec[i].bv_page); 144 138 if (imu->acct_pages) ··· 465 459 break; 466 460 467 461 i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); 468 - if (ctx->user_bufs[i] != ctx->dummy_ubuf) { 462 + if (ctx->user_bufs[i] != &dummy_ubuf) { 469 463 err = io_queue_rsrc_removal(ctx->buf_data, i, 470 464 ctx->user_bufs[i]); 471 465 if (unlikely(err)) { 472 466 io_buffer_unmap(ctx, &imu); 473 467 break; 474 468 } 475 - ctx->user_bufs[i] = ctx->dummy_ubuf; 469 + ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf; 476 470 } 477 471 478 472 ctx->user_bufs[i] = imu; ··· 1083 1077 int ret, nr_pages, i; 1084 1078 struct folio *folio = NULL; 1085 1079 1086 - *pimu = ctx->dummy_ubuf; 1080 + *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; 1087 1081 if (!iov->iov_base) 1088 1082 return 0; 1089 1083
+1 -2
io_uring/rsrc.h
··· 54 54 u64 ubuf_end; 55 55 unsigned int nr_bvecs; 56 56 unsigned long acct_pages; 57 - struct bio_vec bvec[]; 57 + struct bio_vec bvec[] __counted_by(nr_bvecs); 58 58 }; 59 59 60 - void io_rsrc_put_tw(struct callback_head *cb); 61 60 void io_rsrc_node_ref_zero(struct io_rsrc_node *node); 62 61 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); 63 62 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
+5 -19
io_uring/rw.c
··· 989 989 return ret; 990 990 } 991 991 992 - static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) 993 - { 994 - io_commit_cqring_flush(ctx); 995 - if (ctx->flags & IORING_SETUP_SQPOLL) 996 - io_cqring_wake(ctx); 997 - } 998 - 999 992 void io_rw_fail(struct io_kiocb *req) 1000 993 { 1001 994 int res; ··· 1059 1066 if (!smp_load_acquire(&req->iopoll_completed)) 1060 1067 break; 1061 1068 nr_events++; 1062 - if (unlikely(req->flags & REQ_F_CQE_SKIP)) 1063 - continue; 1064 - 1065 1069 req->cqe.flags = io_put_kbuf(req, 0); 1066 - if (unlikely(!__io_fill_cqe_req(ctx, req))) { 1067 - spin_lock(&ctx->completion_lock); 1068 - io_req_cqe_overflow(req); 1069 - spin_unlock(&ctx->completion_lock); 1070 - } 1071 1070 } 1072 - 1073 1071 if (unlikely(!nr_events)) 1074 1072 return 0; 1075 1073 1076 - io_commit_cqring(ctx); 1077 - io_cqring_ev_posted_iopoll(ctx); 1078 1074 pos = start ? start->next : ctx->iopoll_list.first; 1079 1075 wq_list_cut(&ctx->iopoll_list, prev, start); 1080 - io_free_batch_list(ctx, pos); 1076 + 1077 + if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs))) 1078 + return 0; 1079 + ctx->submit_state.compl_reqs.first = pos; 1080 + __io_submit_flush_completions(ctx); 1081 1081 return nr_events; 1082 1082 }
+2 -2
io_uring/splice.c
··· 68 68 ret = do_tee(in, out, sp->len, flags); 69 69 70 70 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 71 - io_put_file(in); 71 + fput(in); 72 72 done: 73 73 if (ret != sp->len) 74 74 req_set_fail(req); ··· 112 112 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); 113 113 114 114 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 115 - io_put_file(in); 115 + fput(in); 116 116 done: 117 117 if (ret != sp->len) 118 118 req_set_fail(req);
+15
io_uring/sqpoll.c
··· 421 421 io_sq_thread_finish(ctx); 422 422 return ret; 423 423 } 424 + 425 + __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, 426 + cpumask_var_t mask) 427 + { 428 + struct io_sq_data *sqd = ctx->sq_data; 429 + int ret = -EINVAL; 430 + 431 + if (sqd) { 432 + io_sq_thread_park(sqd); 433 + ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask); 434 + io_sq_thread_unpark(sqd); 435 + } 436 + 437 + return ret; 438 + }
+1
io_uring/sqpoll.h
··· 27 27 void io_sq_thread_unpark(struct io_sq_data *sqd); 28 28 void io_put_sq_data(struct io_sq_data *sqd); 29 29 void io_sqpoll_wait_sq(struct io_ring_ctx *ctx); 30 + int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);
+7 -13
io_uring/timeout.c
··· 73 73 74 74 if (!io_timeout_finish(timeout, data)) { 75 75 bool filled; 76 - filled = io_aux_cqe(req, ts->locked, -ETIME, IORING_CQE_F_MORE, 77 - false); 76 + filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME, 77 + IORING_CQE_F_MORE); 78 78 if (filled) { 79 79 /* re-arm timer */ 80 80 spin_lock_irq(&ctx->timeout_lock); ··· 268 268 list_for_each_entry(timeout, &ctx->timeout_list, list) { 269 269 struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); 270 270 271 - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && 272 - cd->data != tmp->cqe.user_data) 273 - continue; 274 - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { 275 - if (cd->seq == tmp->work.cancel_seq) 276 - continue; 277 - tmp->work.cancel_seq = cd->seq; 271 + if (io_cancel_req_match(tmp, cd)) { 272 + req = tmp; 273 + break; 278 274 } 279 - req = tmp; 280 - break; 281 275 } 282 276 if (!req) 283 277 return ERR_PTR(-ENOENT); ··· 403 409 struct timespec64 *ts, enum hrtimer_mode mode) 404 410 __must_hold(&ctx->timeout_lock) 405 411 { 406 - struct io_cancel_data cd = { .data = user_data, }; 412 + struct io_cancel_data cd = { .ctx = ctx, .data = user_data, }; 407 413 struct io_kiocb *req = io_timeout_extract(ctx, &cd); 408 414 struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); 409 415 struct io_timeout_data *data; ··· 467 473 int ret; 468 474 469 475 if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { 470 - struct io_cancel_data cd = { .data = tr->addr, }; 476 + struct io_cancel_data cd = { .ctx = ctx, .data = tr->addr, }; 471 477 472 478 spin_lock(&ctx->completion_lock); 473 479 ret = io_timeout_cancel(ctx, &cd);
+30 -3
io_uring/uring_cmd.c
··· 7 7 #include <linux/nospec.h> 8 8 9 9 #include <uapi/linux/io_uring.h> 10 + #include <uapi/asm-generic/ioctls.h> 10 11 11 12 #include "io_uring.h" 12 13 #include "rsrc.h" ··· 43 42 static inline void io_req_set_cqe32_extra(struct io_kiocb *req, 44 43 u64 extra1, u64 extra2) 45 44 { 46 - req->extra1 = extra1; 47 - req->extra2 = extra2; 48 - req->flags |= REQ_F_CQE32_INIT; 45 + req->big_cqe.extra1 = extra1; 46 + req->big_cqe.extra2 = extra2; 49 47 } 50 48 51 49 /* ··· 164 164 return io_import_fixed(rw, iter, req->imu, ubuf, len); 165 165 } 166 166 EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); 167 + 168 + int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) 169 + { 170 + struct socket *sock = cmd->file->private_data; 171 + struct sock *sk = sock->sk; 172 + struct proto *prot = READ_ONCE(sk->sk_prot); 173 + int ret, arg = 0; 174 + 175 + if (!prot || !prot->ioctl) 176 + return -EOPNOTSUPP; 177 + 178 + switch (cmd->sqe->cmd_op) { 179 + case SOCKET_URING_OP_SIOCINQ: 180 + ret = prot->ioctl(sk, SIOCINQ, &arg); 181 + if (ret) 182 + return ret; 183 + return arg; 184 + case SOCKET_URING_OP_SIOCOUTQ: 185 + ret = prot->ioctl(sk, SIOCOUTQ, &arg); 186 + if (ret) 187 + return ret; 188 + return arg; 189 + default: 190 + return -EOPNOTSUPP; 191 + } 192 + } 193 + EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
+2
net/socket.c
··· 88 88 #include <linux/xattr.h> 89 89 #include <linux/nospec.h> 90 90 #include <linux/indirect_call_wrapper.h> 91 + #include <linux/io_uring.h> 91 92 92 93 #include <linux/uaccess.h> 93 94 #include <asm/unistd.h> ··· 161 160 #ifdef CONFIG_COMPAT 162 161 .compat_ioctl = compat_sock_ioctl, 163 162 #endif 163 + .uring_cmd = io_uring_cmd_sock, 164 164 .mmap = sock_mmap, 165 165 .release = sock_close, 166 166 .fasync = sock_fasync,
-18
tools/io_uring/Makefile
··· 1 - # SPDX-License-Identifier: GPL-2.0 2 - # Makefile for io_uring test tools 3 - CFLAGS += -Wall -Wextra -g -D_GNU_SOURCE 4 - LDLIBS += -lpthread 5 - 6 - all: io_uring-cp io_uring-bench 7 - %: %.c 8 - $(CC) $(CFLAGS) -o $@ $^ 9 - 10 - io_uring-bench: syscall.o io_uring-bench.o 11 - $(CC) $(CFLAGS) -o $@ $^ $(LDLIBS) 12 - 13 - io_uring-cp: setup.o syscall.o queue.o 14 - 15 - clean: 16 - $(RM) io_uring-cp io_uring-bench *.o 17 - 18 - .PHONY: all clean
-29
tools/io_uring/README
··· 1 - This directory includes a few programs that demonstrate how to use io_uring 2 - in an application. The examples are: 3 - 4 - io_uring-cp 5 - A very basic io_uring implementation of cp(1). It takes two 6 - arguments, copies the first argument to the second. This example 7 - is part of liburing, and hence uses the simplified liburing API 8 - for setting up an io_uring instance, submitting IO, completing IO, 9 - etc. The support functions in queue.c and setup.c are straight 10 - out of liburing. 11 - 12 - io_uring-bench 13 - Benchmark program that does random reads on a number of files. This 14 - app demonstrates the various features of io_uring, like fixed files, 15 - fixed buffers, and polled IO. There are options in the program to 16 - control which features to use. Arguments is the file (or files) that 17 - io_uring-bench should operate on. This uses the raw io_uring 18 - interface. 19 - 20 - liburing can be cloned with git here: 21 - 22 - git://git.kernel.dk/liburing 23 - 24 - and contains a number of unit tests as well for testing io_uring. It also 25 - comes with man pages for the three system calls. 26 - 27 - Fio includes an io_uring engine, you can clone fio here: 28 - 29 - git://git.kernel.dk/fio
-16
tools/io_uring/barrier.h
··· 1 - #ifndef LIBURING_BARRIER_H 2 - #define LIBURING_BARRIER_H 3 - 4 - #if defined(__x86_64) || defined(__i386__) 5 - #define read_barrier() __asm__ __volatile__("":::"memory") 6 - #define write_barrier() __asm__ __volatile__("":::"memory") 7 - #else 8 - /* 9 - * Add arch appropriate definitions. Be safe and use full barriers for 10 - * archs we don't have support for. 11 - */ 12 - #define read_barrier() __sync_synchronize() 13 - #define write_barrier() __sync_synchronize() 14 - #endif 15 - 16 - #endif
-592
tools/io_uring/io_uring-bench.c
··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - /* 3 - * Simple benchmark program that uses the various features of io_uring 4 - * to provide fast random access to a device/file. It has various 5 - * options that are control how we use io_uring, see the OPTIONS section 6 - * below. This uses the raw io_uring interface. 7 - * 8 - * Copyright (C) 2018-2019 Jens Axboe 9 - */ 10 - #include <stdio.h> 11 - #include <errno.h> 12 - #include <assert.h> 13 - #include <stdlib.h> 14 - #include <stddef.h> 15 - #include <signal.h> 16 - #include <inttypes.h> 17 - 18 - #include <sys/types.h> 19 - #include <sys/stat.h> 20 - #include <sys/ioctl.h> 21 - #include <sys/syscall.h> 22 - #include <sys/resource.h> 23 - #include <sys/mman.h> 24 - #include <sys/uio.h> 25 - #include <linux/fs.h> 26 - #include <fcntl.h> 27 - #include <unistd.h> 28 - #include <string.h> 29 - #include <pthread.h> 30 - #include <sched.h> 31 - 32 - #include "liburing.h" 33 - #include "barrier.h" 34 - 35 - #define min(a, b) ((a < b) ? (a) : (b)) 36 - 37 - struct io_sq_ring { 38 - unsigned *head; 39 - unsigned *tail; 40 - unsigned *ring_mask; 41 - unsigned *ring_entries; 42 - unsigned *flags; 43 - unsigned *array; 44 - }; 45 - 46 - struct io_cq_ring { 47 - unsigned *head; 48 - unsigned *tail; 49 - unsigned *ring_mask; 50 - unsigned *ring_entries; 51 - struct io_uring_cqe *cqes; 52 - }; 53 - 54 - #define DEPTH 128 55 - 56 - #define BATCH_SUBMIT 32 57 - #define BATCH_COMPLETE 32 58 - 59 - #define BS 4096 60 - 61 - #define MAX_FDS 16 62 - 63 - static unsigned sq_ring_mask, cq_ring_mask; 64 - 65 - struct file { 66 - unsigned long max_blocks; 67 - unsigned pending_ios; 68 - int real_fd; 69 - int fixed_fd; 70 - }; 71 - 72 - struct submitter { 73 - pthread_t thread; 74 - int ring_fd; 75 - struct drand48_data rand; 76 - struct io_sq_ring sq_ring; 77 - struct io_uring_sqe *sqes; 78 - struct iovec iovecs[DEPTH]; 79 - struct io_cq_ring cq_ring; 80 - int inflight; 81 - unsigned long reaps; 82 - unsigned long done; 83 - unsigned 
long calls; 84 - volatile int finish; 85 - 86 - __s32 *fds; 87 - 88 - struct file files[MAX_FDS]; 89 - unsigned nr_files; 90 - unsigned cur_file; 91 - }; 92 - 93 - static struct submitter submitters[1]; 94 - static volatile int finish; 95 - 96 - /* 97 - * OPTIONS: Set these to test the various features of io_uring. 98 - */ 99 - static int polled = 1; /* use IO polling */ 100 - static int fixedbufs = 1; /* use fixed user buffers */ 101 - static int register_files = 1; /* use fixed files */ 102 - static int buffered = 0; /* use buffered IO, not O_DIRECT */ 103 - static int sq_thread_poll = 0; /* use kernel submission/poller thread */ 104 - static int sq_thread_cpu = -1; /* pin above thread to this CPU */ 105 - static int do_nop = 0; /* no-op SQ ring commands */ 106 - 107 - static int io_uring_register_buffers(struct submitter *s) 108 - { 109 - if (do_nop) 110 - return 0; 111 - 112 - return io_uring_register(s->ring_fd, IORING_REGISTER_BUFFERS, s->iovecs, 113 - DEPTH); 114 - } 115 - 116 - static int io_uring_register_files(struct submitter *s) 117 - { 118 - unsigned i; 119 - 120 - if (do_nop) 121 - return 0; 122 - 123 - s->fds = calloc(s->nr_files, sizeof(__s32)); 124 - for (i = 0; i < s->nr_files; i++) { 125 - s->fds[i] = s->files[i].real_fd; 126 - s->files[i].fixed_fd = i; 127 - } 128 - 129 - return io_uring_register(s->ring_fd, IORING_REGISTER_FILES, s->fds, 130 - s->nr_files); 131 - } 132 - 133 - static int lk_gettid(void) 134 - { 135 - return syscall(__NR_gettid); 136 - } 137 - 138 - static unsigned file_depth(struct submitter *s) 139 - { 140 - return (DEPTH + s->nr_files - 1) / s->nr_files; 141 - } 142 - 143 - static void init_io(struct submitter *s, unsigned index) 144 - { 145 - struct io_uring_sqe *sqe = &s->sqes[index]; 146 - unsigned long offset; 147 - struct file *f; 148 - long r; 149 - 150 - if (do_nop) { 151 - sqe->opcode = IORING_OP_NOP; 152 - return; 153 - } 154 - 155 - if (s->nr_files == 1) { 156 - f = &s->files[0]; 157 - } else { 158 - f = 
&s->files[s->cur_file]; 159 - if (f->pending_ios >= file_depth(s)) { 160 - s->cur_file++; 161 - if (s->cur_file == s->nr_files) 162 - s->cur_file = 0; 163 - f = &s->files[s->cur_file]; 164 - } 165 - } 166 - f->pending_ios++; 167 - 168 - lrand48_r(&s->rand, &r); 169 - offset = (r % (f->max_blocks - 1)) * BS; 170 - 171 - if (register_files) { 172 - sqe->flags = IOSQE_FIXED_FILE; 173 - sqe->fd = f->fixed_fd; 174 - } else { 175 - sqe->flags = 0; 176 - sqe->fd = f->real_fd; 177 - } 178 - if (fixedbufs) { 179 - sqe->opcode = IORING_OP_READ_FIXED; 180 - sqe->addr = (unsigned long) s->iovecs[index].iov_base; 181 - sqe->len = BS; 182 - sqe->buf_index = index; 183 - } else { 184 - sqe->opcode = IORING_OP_READV; 185 - sqe->addr = (unsigned long) &s->iovecs[index]; 186 - sqe->len = 1; 187 - sqe->buf_index = 0; 188 - } 189 - sqe->ioprio = 0; 190 - sqe->off = offset; 191 - sqe->user_data = (unsigned long) f; 192 - } 193 - 194 - static int prep_more_ios(struct submitter *s, unsigned max_ios) 195 - { 196 - struct io_sq_ring *ring = &s->sq_ring; 197 - unsigned index, tail, next_tail, prepped = 0; 198 - 199 - next_tail = tail = *ring->tail; 200 - do { 201 - next_tail++; 202 - read_barrier(); 203 - if (next_tail == *ring->head) 204 - break; 205 - 206 - index = tail & sq_ring_mask; 207 - init_io(s, index); 208 - ring->array[index] = index; 209 - prepped++; 210 - tail = next_tail; 211 - } while (prepped < max_ios); 212 - 213 - if (*ring->tail != tail) { 214 - /* order tail store with writes to sqes above */ 215 - write_barrier(); 216 - *ring->tail = tail; 217 - write_barrier(); 218 - } 219 - return prepped; 220 - } 221 - 222 - static int get_file_size(struct file *f) 223 - { 224 - struct stat st; 225 - 226 - if (fstat(f->real_fd, &st) < 0) 227 - return -1; 228 - if (S_ISBLK(st.st_mode)) { 229 - unsigned long long bytes; 230 - 231 - if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0) 232 - return -1; 233 - 234 - f->max_blocks = bytes / BS; 235 - return 0; 236 - } else if 
(S_ISREG(st.st_mode)) { 237 - f->max_blocks = st.st_size / BS; 238 - return 0; 239 - } 240 - 241 - return -1; 242 - } 243 - 244 - static int reap_events(struct submitter *s) 245 - { 246 - struct io_cq_ring *ring = &s->cq_ring; 247 - struct io_uring_cqe *cqe; 248 - unsigned head, reaped = 0; 249 - 250 - head = *ring->head; 251 - do { 252 - struct file *f; 253 - 254 - read_barrier(); 255 - if (head == *ring->tail) 256 - break; 257 - cqe = &ring->cqes[head & cq_ring_mask]; 258 - if (!do_nop) { 259 - f = (struct file *) (uintptr_t) cqe->user_data; 260 - f->pending_ios--; 261 - if (cqe->res != BS) { 262 - printf("io: unexpected ret=%d\n", cqe->res); 263 - if (polled && cqe->res == -EOPNOTSUPP) 264 - printf("Your filesystem doesn't support poll\n"); 265 - return -1; 266 - } 267 - } 268 - reaped++; 269 - head++; 270 - } while (1); 271 - 272 - s->inflight -= reaped; 273 - *ring->head = head; 274 - write_barrier(); 275 - return reaped; 276 - } 277 - 278 - static void *submitter_fn(void *data) 279 - { 280 - struct submitter *s = data; 281 - struct io_sq_ring *ring = &s->sq_ring; 282 - int ret, prepped; 283 - 284 - printf("submitter=%d\n", lk_gettid()); 285 - 286 - srand48_r(pthread_self(), &s->rand); 287 - 288 - prepped = 0; 289 - do { 290 - int to_wait, to_submit, this_reap, to_prep; 291 - 292 - if (!prepped && s->inflight < DEPTH) { 293 - to_prep = min(DEPTH - s->inflight, BATCH_SUBMIT); 294 - prepped = prep_more_ios(s, to_prep); 295 - } 296 - s->inflight += prepped; 297 - submit_more: 298 - to_submit = prepped; 299 - submit: 300 - if (to_submit && (s->inflight + to_submit <= DEPTH)) 301 - to_wait = 0; 302 - else 303 - to_wait = min(s->inflight + to_submit, BATCH_COMPLETE); 304 - 305 - /* 306 - * Only need to call io_uring_enter if we're not using SQ thread 307 - * poll, or if IORING_SQ_NEED_WAKEUP is set. 
308 - */ 309 - if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) { 310 - unsigned flags = 0; 311 - 312 - if (to_wait) 313 - flags = IORING_ENTER_GETEVENTS; 314 - if ((*ring->flags & IORING_SQ_NEED_WAKEUP)) 315 - flags |= IORING_ENTER_SQ_WAKEUP; 316 - ret = io_uring_enter(s->ring_fd, to_submit, to_wait, 317 - flags, NULL); 318 - s->calls++; 319 - } 320 - 321 - /* 322 - * For non SQ thread poll, we already got the events we needed 323 - * through the io_uring_enter() above. For SQ thread poll, we 324 - * need to loop here until we find enough events. 325 - */ 326 - this_reap = 0; 327 - do { 328 - int r; 329 - r = reap_events(s); 330 - if (r == -1) { 331 - s->finish = 1; 332 - break; 333 - } else if (r > 0) 334 - this_reap += r; 335 - } while (sq_thread_poll && this_reap < to_wait); 336 - s->reaps += this_reap; 337 - 338 - if (ret >= 0) { 339 - if (!ret) { 340 - to_submit = 0; 341 - if (s->inflight) 342 - goto submit; 343 - continue; 344 - } else if (ret < to_submit) { 345 - int diff = to_submit - ret; 346 - 347 - s->done += ret; 348 - prepped -= diff; 349 - goto submit_more; 350 - } 351 - s->done += ret; 352 - prepped = 0; 353 - continue; 354 - } else if (ret < 0) { 355 - if (errno == EAGAIN) { 356 - if (s->finish) 357 - break; 358 - if (this_reap) 359 - goto submit; 360 - to_submit = 0; 361 - goto submit; 362 - } 363 - printf("io_submit: %s\n", strerror(errno)); 364 - break; 365 - } 366 - } while (!s->finish); 367 - 368 - finish = 1; 369 - return NULL; 370 - } 371 - 372 - static void sig_int(int sig) 373 - { 374 - printf("Exiting on signal %d\n", sig); 375 - submitters[0].finish = 1; 376 - finish = 1; 377 - } 378 - 379 - static void arm_sig_int(void) 380 - { 381 - struct sigaction act; 382 - 383 - memset(&act, 0, sizeof(act)); 384 - act.sa_handler = sig_int; 385 - act.sa_flags = SA_RESTART; 386 - sigaction(SIGINT, &act, NULL); 387 - } 388 - 389 - static int setup_ring(struct submitter *s) 390 - { 391 - struct io_sq_ring *sring = &s->sq_ring; 392 - 
struct io_cq_ring *cring = &s->cq_ring; 393 - struct io_uring_params p; 394 - int ret, fd; 395 - void *ptr; 396 - 397 - memset(&p, 0, sizeof(p)); 398 - 399 - if (polled && !do_nop) 400 - p.flags |= IORING_SETUP_IOPOLL; 401 - if (sq_thread_poll) { 402 - p.flags |= IORING_SETUP_SQPOLL; 403 - if (sq_thread_cpu != -1) { 404 - p.flags |= IORING_SETUP_SQ_AFF; 405 - p.sq_thread_cpu = sq_thread_cpu; 406 - } 407 - } 408 - 409 - fd = io_uring_setup(DEPTH, &p); 410 - if (fd < 0) { 411 - perror("io_uring_setup"); 412 - return 1; 413 - } 414 - s->ring_fd = fd; 415 - 416 - if (fixedbufs) { 417 - ret = io_uring_register_buffers(s); 418 - if (ret < 0) { 419 - perror("io_uring_register_buffers"); 420 - return 1; 421 - } 422 - } 423 - 424 - if (register_files) { 425 - ret = io_uring_register_files(s); 426 - if (ret < 0) { 427 - perror("io_uring_register_files"); 428 - return 1; 429 - } 430 - } 431 - 432 - ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32), 433 - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 434 - IORING_OFF_SQ_RING); 435 - printf("sq_ring ptr = 0x%p\n", ptr); 436 - sring->head = ptr + p.sq_off.head; 437 - sring->tail = ptr + p.sq_off.tail; 438 - sring->ring_mask = ptr + p.sq_off.ring_mask; 439 - sring->ring_entries = ptr + p.sq_off.ring_entries; 440 - sring->flags = ptr + p.sq_off.flags; 441 - sring->array = ptr + p.sq_off.array; 442 - sq_ring_mask = *sring->ring_mask; 443 - 444 - s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe), 445 - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 446 - IORING_OFF_SQES); 447 - printf("sqes ptr = 0x%p\n", s->sqes); 448 - 449 - ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe), 450 - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 451 - IORING_OFF_CQ_RING); 452 - printf("cq_ring ptr = 0x%p\n", ptr); 453 - cring->head = ptr + p.cq_off.head; 454 - cring->tail = ptr + p.cq_off.tail; 455 - cring->ring_mask = ptr + p.cq_off.ring_mask; 456 - cring->ring_entries = ptr + 
p.cq_off.ring_entries; 457 - cring->cqes = ptr + p.cq_off.cqes; 458 - cq_ring_mask = *cring->ring_mask; 459 - return 0; 460 - } 461 - 462 - static void file_depths(char *buf) 463 - { 464 - struct submitter *s = &submitters[0]; 465 - unsigned i; 466 - char *p; 467 - 468 - buf[0] = '\0'; 469 - p = buf; 470 - for (i = 0; i < s->nr_files; i++) { 471 - struct file *f = &s->files[i]; 472 - 473 - if (i + 1 == s->nr_files) 474 - p += sprintf(p, "%d", f->pending_ios); 475 - else 476 - p += sprintf(p, "%d, ", f->pending_ios); 477 - } 478 - } 479 - 480 - int main(int argc, char *argv[]) 481 - { 482 - struct submitter *s = &submitters[0]; 483 - unsigned long done, calls, reap; 484 - int err, i, flags, fd; 485 - char *fdepths; 486 - void *ret; 487 - 488 - if (!do_nop && argc < 2) { 489 - printf("%s: filename\n", argv[0]); 490 - return 1; 491 - } 492 - 493 - flags = O_RDONLY | O_NOATIME; 494 - if (!buffered) 495 - flags |= O_DIRECT; 496 - 497 - i = 1; 498 - while (!do_nop && i < argc) { 499 - struct file *f; 500 - 501 - if (s->nr_files == MAX_FDS) { 502 - printf("Max number of files (%d) reached\n", MAX_FDS); 503 - break; 504 - } 505 - fd = open(argv[i], flags); 506 - if (fd < 0) { 507 - perror("open"); 508 - return 1; 509 - } 510 - 511 - f = &s->files[s->nr_files]; 512 - f->real_fd = fd; 513 - if (get_file_size(f)) { 514 - printf("failed getting size of device/file\n"); 515 - return 1; 516 - } 517 - if (f->max_blocks <= 1) { 518 - printf("Zero file/device size?\n"); 519 - return 1; 520 - } 521 - f->max_blocks--; 522 - 523 - printf("Added file %s\n", argv[i]); 524 - s->nr_files++; 525 - i++; 526 - } 527 - 528 - if (fixedbufs) { 529 - struct rlimit rlim; 530 - 531 - rlim.rlim_cur = RLIM_INFINITY; 532 - rlim.rlim_max = RLIM_INFINITY; 533 - if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { 534 - perror("setrlimit"); 535 - return 1; 536 - } 537 - } 538 - 539 - arm_sig_int(); 540 - 541 - for (i = 0; i < DEPTH; i++) { 542 - void *buf; 543 - 544 - if (posix_memalign(&buf, BS, BS)) { 545 - 
printf("failed alloc\n"); 546 - return 1; 547 - } 548 - s->iovecs[i].iov_base = buf; 549 - s->iovecs[i].iov_len = BS; 550 - } 551 - 552 - err = setup_ring(s); 553 - if (err) { 554 - printf("ring setup failed: %s, %d\n", strerror(errno), err); 555 - return 1; 556 - } 557 - printf("polled=%d, fixedbufs=%d, buffered=%d", polled, fixedbufs, buffered); 558 - printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", DEPTH, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries); 559 - 560 - pthread_create(&s->thread, NULL, submitter_fn, s); 561 - 562 - fdepths = malloc(8 * s->nr_files); 563 - reap = calls = done = 0; 564 - do { 565 - unsigned long this_done = 0; 566 - unsigned long this_reap = 0; 567 - unsigned long this_call = 0; 568 - unsigned long rpc = 0, ipc = 0; 569 - 570 - sleep(1); 571 - this_done += s->done; 572 - this_call += s->calls; 573 - this_reap += s->reaps; 574 - if (this_call - calls) { 575 - rpc = (this_done - done) / (this_call - calls); 576 - ipc = (this_reap - reap) / (this_call - calls); 577 - } else 578 - rpc = ipc = -1; 579 - file_depths(fdepths); 580 - printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s)\n", 581 - this_done - done, rpc, ipc, s->inflight, 582 - fdepths); 583 - done = this_done; 584 - calls = this_call; 585 - reap = this_reap; 586 - } while (!finish); 587 - 588 - pthread_join(s->thread, &ret); 589 - close(s->ring_fd); 590 - free(fdepths); 591 - return 0; 592 - }
-283
tools/io_uring/io_uring-cp.c
··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - /* 3 - * Simple test program that demonstrates a file copy through io_uring. This 4 - * uses the API exposed by liburing. 5 - * 6 - * Copyright (C) 2018-2019 Jens Axboe 7 - */ 8 - #include <stdio.h> 9 - #include <fcntl.h> 10 - #include <string.h> 11 - #include <stdlib.h> 12 - #include <unistd.h> 13 - #include <assert.h> 14 - #include <errno.h> 15 - #include <inttypes.h> 16 - #include <sys/types.h> 17 - #include <sys/stat.h> 18 - #include <sys/ioctl.h> 19 - 20 - #include "liburing.h" 21 - 22 - #define QD 64 23 - #define BS (32*1024) 24 - 25 - static int infd, outfd; 26 - 27 - struct io_data { 28 - int read; 29 - off_t first_offset, offset; 30 - size_t first_len; 31 - struct iovec iov; 32 - }; 33 - 34 - static int setup_context(unsigned entries, struct io_uring *ring) 35 - { 36 - int ret; 37 - 38 - ret = io_uring_queue_init(entries, ring, 0); 39 - if (ret < 0) { 40 - fprintf(stderr, "queue_init: %s\n", strerror(-ret)); 41 - return -1; 42 - } 43 - 44 - return 0; 45 - } 46 - 47 - static int get_file_size(int fd, off_t *size) 48 - { 49 - struct stat st; 50 - 51 - if (fstat(fd, &st) < 0) 52 - return -1; 53 - if (S_ISREG(st.st_mode)) { 54 - *size = st.st_size; 55 - return 0; 56 - } else if (S_ISBLK(st.st_mode)) { 57 - unsigned long long bytes; 58 - 59 - if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) 60 - return -1; 61 - 62 - *size = bytes; 63 - return 0; 64 - } 65 - 66 - return -1; 67 - } 68 - 69 - static void queue_prepped(struct io_uring *ring, struct io_data *data) 70 - { 71 - struct io_uring_sqe *sqe; 72 - 73 - sqe = io_uring_get_sqe(ring); 74 - assert(sqe); 75 - 76 - if (data->read) 77 - io_uring_prep_readv(sqe, infd, &data->iov, 1, data->offset); 78 - else 79 - io_uring_prep_writev(sqe, outfd, &data->iov, 1, data->offset); 80 - 81 - io_uring_sqe_set_data(sqe, data); 82 - } 83 - 84 - static int queue_read(struct io_uring *ring, off_t size, off_t offset) 85 - { 86 - struct io_uring_sqe *sqe; 87 - struct io_data *data; 88 - 89 
- data = malloc(size + sizeof(*data)); 90 - if (!data) 91 - return 1; 92 - 93 - sqe = io_uring_get_sqe(ring); 94 - if (!sqe) { 95 - free(data); 96 - return 1; 97 - } 98 - 99 - data->read = 1; 100 - data->offset = data->first_offset = offset; 101 - 102 - data->iov.iov_base = data + 1; 103 - data->iov.iov_len = size; 104 - data->first_len = size; 105 - 106 - io_uring_prep_readv(sqe, infd, &data->iov, 1, offset); 107 - io_uring_sqe_set_data(sqe, data); 108 - return 0; 109 - } 110 - 111 - static void queue_write(struct io_uring *ring, struct io_data *data) 112 - { 113 - data->read = 0; 114 - data->offset = data->first_offset; 115 - 116 - data->iov.iov_base = data + 1; 117 - data->iov.iov_len = data->first_len; 118 - 119 - queue_prepped(ring, data); 120 - io_uring_submit(ring); 121 - } 122 - 123 - static int copy_file(struct io_uring *ring, off_t insize) 124 - { 125 - unsigned long reads, writes; 126 - struct io_uring_cqe *cqe; 127 - off_t write_left, offset; 128 - int ret; 129 - 130 - write_left = insize; 131 - writes = reads = offset = 0; 132 - 133 - while (insize || write_left) { 134 - int had_reads, got_comp; 135 - 136 - /* 137 - * Queue up as many reads as we can 138 - */ 139 - had_reads = reads; 140 - while (insize) { 141 - off_t this_size = insize; 142 - 143 - if (reads + writes >= QD) 144 - break; 145 - if (this_size > BS) 146 - this_size = BS; 147 - else if (!this_size) 148 - break; 149 - 150 - if (queue_read(ring, this_size, offset)) 151 - break; 152 - 153 - insize -= this_size; 154 - offset += this_size; 155 - reads++; 156 - } 157 - 158 - if (had_reads != reads) { 159 - ret = io_uring_submit(ring); 160 - if (ret < 0) { 161 - fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret)); 162 - break; 163 - } 164 - } 165 - 166 - /* 167 - * Queue is full at this point. Find at least one completion. 
168 - */ 169 - got_comp = 0; 170 - while (write_left) { 171 - struct io_data *data; 172 - 173 - if (!got_comp) { 174 - ret = io_uring_wait_cqe(ring, &cqe); 175 - got_comp = 1; 176 - } else { 177 - ret = io_uring_peek_cqe(ring, &cqe); 178 - if (ret == -EAGAIN) { 179 - cqe = NULL; 180 - ret = 0; 181 - } 182 - } 183 - if (ret < 0) { 184 - fprintf(stderr, "io_uring_peek_cqe: %s\n", 185 - strerror(-ret)); 186 - return 1; 187 - } 188 - if (!cqe) 189 - break; 190 - 191 - data = io_uring_cqe_get_data(cqe); 192 - if (cqe->res < 0) { 193 - if (cqe->res == -EAGAIN) { 194 - queue_prepped(ring, data); 195 - io_uring_cqe_seen(ring, cqe); 196 - continue; 197 - } 198 - fprintf(stderr, "cqe failed: %s\n", 199 - strerror(-cqe->res)); 200 - return 1; 201 - } else if (cqe->res != data->iov.iov_len) { 202 - /* Short read/write, adjust and requeue */ 203 - data->iov.iov_base += cqe->res; 204 - data->iov.iov_len -= cqe->res; 205 - data->offset += cqe->res; 206 - queue_prepped(ring, data); 207 - io_uring_cqe_seen(ring, cqe); 208 - continue; 209 - } 210 - 211 - /* 212 - * All done. if write, nothing else to do. if read, 213 - * queue up corresponding write. 
214 - */ 215 - if (data->read) { 216 - queue_write(ring, data); 217 - write_left -= data->first_len; 218 - reads--; 219 - writes++; 220 - } else { 221 - free(data); 222 - writes--; 223 - } 224 - io_uring_cqe_seen(ring, cqe); 225 - } 226 - } 227 - 228 - /* wait out pending writes */ 229 - while (writes) { 230 - struct io_data *data; 231 - 232 - ret = io_uring_wait_cqe(ring, &cqe); 233 - if (ret) { 234 - fprintf(stderr, "wait_cqe=%d\n", ret); 235 - return 1; 236 - } 237 - if (cqe->res < 0) { 238 - fprintf(stderr, "write res=%d\n", cqe->res); 239 - return 1; 240 - } 241 - data = io_uring_cqe_get_data(cqe); 242 - free(data); 243 - writes--; 244 - io_uring_cqe_seen(ring, cqe); 245 - } 246 - 247 - return 0; 248 - } 249 - 250 - int main(int argc, char *argv[]) 251 - { 252 - struct io_uring ring; 253 - off_t insize; 254 - int ret; 255 - 256 - if (argc < 3) { 257 - printf("%s: infile outfile\n", argv[0]); 258 - return 1; 259 - } 260 - 261 - infd = open(argv[1], O_RDONLY); 262 - if (infd < 0) { 263 - perror("open infile"); 264 - return 1; 265 - } 266 - outfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644); 267 - if (outfd < 0) { 268 - perror("open outfile"); 269 - return 1; 270 - } 271 - 272 - if (setup_context(QD, &ring)) 273 - return 1; 274 - if (get_file_size(infd, &insize)) 275 - return 1; 276 - 277 - ret = copy_file(&ring, insize); 278 - 279 - close(infd); 280 - close(outfd); 281 - io_uring_queue_exit(&ring); 282 - return ret; 283 - }
-187
tools/io_uring/liburing.h
··· 1 - #ifndef LIB_URING_H 2 - #define LIB_URING_H 3 - 4 - #ifdef __cplusplus 5 - extern "C" { 6 - #endif 7 - 8 - #include <sys/uio.h> 9 - #include <signal.h> 10 - #include <string.h> 11 - #include "../../include/uapi/linux/io_uring.h" 12 - #include <inttypes.h> 13 - #include <linux/swab.h> 14 - #include "barrier.h" 15 - 16 - /* 17 - * Library interface to io_uring 18 - */ 19 - struct io_uring_sq { 20 - unsigned *khead; 21 - unsigned *ktail; 22 - unsigned *kring_mask; 23 - unsigned *kring_entries; 24 - unsigned *kflags; 25 - unsigned *kdropped; 26 - unsigned *array; 27 - struct io_uring_sqe *sqes; 28 - 29 - unsigned sqe_head; 30 - unsigned sqe_tail; 31 - 32 - size_t ring_sz; 33 - }; 34 - 35 - struct io_uring_cq { 36 - unsigned *khead; 37 - unsigned *ktail; 38 - unsigned *kring_mask; 39 - unsigned *kring_entries; 40 - unsigned *koverflow; 41 - struct io_uring_cqe *cqes; 42 - 43 - size_t ring_sz; 44 - }; 45 - 46 - struct io_uring { 47 - struct io_uring_sq sq; 48 - struct io_uring_cq cq; 49 - int ring_fd; 50 - }; 51 - 52 - /* 53 - * System calls 54 - */ 55 - extern int io_uring_setup(unsigned entries, struct io_uring_params *p); 56 - extern int io_uring_enter(int fd, unsigned to_submit, 57 - unsigned min_complete, unsigned flags, sigset_t *sig); 58 - extern int io_uring_register(int fd, unsigned int opcode, void *arg, 59 - unsigned int nr_args); 60 - 61 - /* 62 - * Library interface 63 - */ 64 - extern int io_uring_queue_init(unsigned entries, struct io_uring *ring, 65 - unsigned flags); 66 - extern int io_uring_queue_mmap(int fd, struct io_uring_params *p, 67 - struct io_uring *ring); 68 - extern void io_uring_queue_exit(struct io_uring *ring); 69 - extern int io_uring_peek_cqe(struct io_uring *ring, 70 - struct io_uring_cqe **cqe_ptr); 71 - extern int io_uring_wait_cqe(struct io_uring *ring, 72 - struct io_uring_cqe **cqe_ptr); 73 - extern int io_uring_submit(struct io_uring *ring); 74 - extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring); 75 - 76 - 
/* 77 - * Must be called after io_uring_{peek,wait}_cqe() after the cqe has 78 - * been processed by the application. 79 - */ 80 - static inline void io_uring_cqe_seen(struct io_uring *ring, 81 - struct io_uring_cqe *cqe) 82 - { 83 - if (cqe) { 84 - struct io_uring_cq *cq = &ring->cq; 85 - 86 - (*cq->khead)++; 87 - /* 88 - * Ensure that the kernel sees our new head, the kernel has 89 - * the matching read barrier. 90 - */ 91 - write_barrier(); 92 - } 93 - } 94 - 95 - /* 96 - * Command prep helpers 97 - */ 98 - static inline void io_uring_sqe_set_data(struct io_uring_sqe *sqe, void *data) 99 - { 100 - sqe->user_data = (unsigned long) data; 101 - } 102 - 103 - static inline void *io_uring_cqe_get_data(struct io_uring_cqe *cqe) 104 - { 105 - return (void *) (uintptr_t) cqe->user_data; 106 - } 107 - 108 - static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, 109 - const void *addr, unsigned len, 110 - off_t offset) 111 - { 112 - memset(sqe, 0, sizeof(*sqe)); 113 - sqe->opcode = op; 114 - sqe->fd = fd; 115 - sqe->off = offset; 116 - sqe->addr = (unsigned long) addr; 117 - sqe->len = len; 118 - } 119 - 120 - static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd, 121 - const struct iovec *iovecs, 122 - unsigned nr_vecs, off_t offset) 123 - { 124 - io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset); 125 - } 126 - 127 - static inline void io_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd, 128 - void *buf, unsigned nbytes, 129 - off_t offset) 130 - { 131 - io_uring_prep_rw(IORING_OP_READ_FIXED, sqe, fd, buf, nbytes, offset); 132 - } 133 - 134 - static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd, 135 - const struct iovec *iovecs, 136 - unsigned nr_vecs, off_t offset) 137 - { 138 - io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset); 139 - } 140 - 141 - static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, 142 - const void *buf, unsigned nbytes, 
143 - off_t offset) 144 - { 145 - io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset); 146 - } 147 - 148 - static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd, 149 - unsigned poll_mask) 150 - { 151 - memset(sqe, 0, sizeof(*sqe)); 152 - sqe->opcode = IORING_OP_POLL_ADD; 153 - sqe->fd = fd; 154 - #if __BYTE_ORDER == __BIG_ENDIAN 155 - poll_mask = __swahw32(poll_mask); 156 - #endif 157 - sqe->poll_events = poll_mask; 158 - } 159 - 160 - static inline void io_uring_prep_poll_remove(struct io_uring_sqe *sqe, 161 - void *user_data) 162 - { 163 - memset(sqe, 0, sizeof(*sqe)); 164 - sqe->opcode = IORING_OP_POLL_REMOVE; 165 - sqe->addr = (unsigned long) user_data; 166 - } 167 - 168 - static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd, 169 - unsigned fsync_flags) 170 - { 171 - memset(sqe, 0, sizeof(*sqe)); 172 - sqe->opcode = IORING_OP_FSYNC; 173 - sqe->fd = fd; 174 - sqe->fsync_flags = fsync_flags; 175 - } 176 - 177 - static inline void io_uring_prep_nop(struct io_uring_sqe *sqe) 178 - { 179 - memset(sqe, 0, sizeof(*sqe)); 180 - sqe->opcode = IORING_OP_NOP; 181 - } 182 - 183 - #ifdef __cplusplus 184 - } 185 - #endif 186 - 187 - #endif
-156
tools/io_uring/queue.c
··· 1 - #include <sys/types.h> 2 - #include <sys/stat.h> 3 - #include <sys/mman.h> 4 - #include <unistd.h> 5 - #include <errno.h> 6 - #include <string.h> 7 - 8 - #include "liburing.h" 9 - #include "barrier.h" 10 - 11 - static int __io_uring_get_cqe(struct io_uring *ring, 12 - struct io_uring_cqe **cqe_ptr, int wait) 13 - { 14 - struct io_uring_cq *cq = &ring->cq; 15 - const unsigned mask = *cq->kring_mask; 16 - unsigned head; 17 - int ret; 18 - 19 - *cqe_ptr = NULL; 20 - head = *cq->khead; 21 - do { 22 - /* 23 - * It's necessary to use a read_barrier() before reading 24 - * the CQ tail, since the kernel updates it locklessly. The 25 - * kernel has the matching store barrier for the update. The 26 - * kernel also ensures that previous stores to CQEs are ordered 27 - * with the tail update. 28 - */ 29 - read_barrier(); 30 - if (head != *cq->ktail) { 31 - *cqe_ptr = &cq->cqes[head & mask]; 32 - break; 33 - } 34 - if (!wait) 35 - break; 36 - ret = io_uring_enter(ring->ring_fd, 0, 1, 37 - IORING_ENTER_GETEVENTS, NULL); 38 - if (ret < 0) 39 - return -errno; 40 - } while (1); 41 - 42 - return 0; 43 - } 44 - 45 - /* 46 - * Return an IO completion, if one is readily available. Returns 0 with 47 - * cqe_ptr filled in on success, -errno on failure. 48 - */ 49 - int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) 50 - { 51 - return __io_uring_get_cqe(ring, cqe_ptr, 0); 52 - } 53 - 54 - /* 55 - * Return an IO completion, waiting for it if necessary. Returns 0 with 56 - * cqe_ptr filled in on success, -errno on failure. 57 - */ 58 - int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) 59 - { 60 - return __io_uring_get_cqe(ring, cqe_ptr, 1); 61 - } 62 - 63 - /* 64 - * Submit sqes acquired from io_uring_get_sqe() to the kernel. 
65 - * 66 - * Returns number of sqes submitted 67 - */ 68 - int io_uring_submit(struct io_uring *ring) 69 - { 70 - struct io_uring_sq *sq = &ring->sq; 71 - const unsigned mask = *sq->kring_mask; 72 - unsigned ktail, ktail_next, submitted, to_submit; 73 - int ret; 74 - 75 - /* 76 - * If we have pending IO in the kring, submit it first. We need a 77 - * read barrier here to match the kernels store barrier when updating 78 - * the SQ head. 79 - */ 80 - read_barrier(); 81 - if (*sq->khead != *sq->ktail) { 82 - submitted = *sq->kring_entries; 83 - goto submit; 84 - } 85 - 86 - if (sq->sqe_head == sq->sqe_tail) 87 - return 0; 88 - 89 - /* 90 - * Fill in sqes that we have queued up, adding them to the kernel ring 91 - */ 92 - submitted = 0; 93 - ktail = ktail_next = *sq->ktail; 94 - to_submit = sq->sqe_tail - sq->sqe_head; 95 - while (to_submit--) { 96 - ktail_next++; 97 - read_barrier(); 98 - 99 - sq->array[ktail & mask] = sq->sqe_head & mask; 100 - ktail = ktail_next; 101 - 102 - sq->sqe_head++; 103 - submitted++; 104 - } 105 - 106 - if (!submitted) 107 - return 0; 108 - 109 - if (*sq->ktail != ktail) { 110 - /* 111 - * First write barrier ensures that the SQE stores are updated 112 - * with the tail update. This is needed so that the kernel 113 - * will never see a tail update without the preceeding sQE 114 - * stores being done. 115 - */ 116 - write_barrier(); 117 - *sq->ktail = ktail; 118 - /* 119 - * The kernel has the matching read barrier for reading the 120 - * SQ tail. 121 - */ 122 - write_barrier(); 123 - } 124 - 125 - submit: 126 - ret = io_uring_enter(ring->ring_fd, submitted, 0, 127 - IORING_ENTER_GETEVENTS, NULL); 128 - if (ret < 0) 129 - return -errno; 130 - 131 - return ret; 132 - } 133 - 134 - /* 135 - * Return an sqe to fill. Application must later call io_uring_submit() 136 - * when it's ready to tell the kernel about it. The caller may call this 137 - * function multiple times before calling io_uring_submit(). 
138 - * 139 - * Returns a vacant sqe, or NULL if we're full. 140 - */ 141 - struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring) 142 - { 143 - struct io_uring_sq *sq = &ring->sq; 144 - unsigned next = sq->sqe_tail + 1; 145 - struct io_uring_sqe *sqe; 146 - 147 - /* 148 - * All sqes are used 149 - */ 150 - if (next - sq->sqe_head > *sq->kring_entries) 151 - return NULL; 152 - 153 - sqe = &sq->sqes[sq->sqe_tail & *sq->kring_mask]; 154 - sq->sqe_tail = next; 155 - return sqe; 156 - }
-107
tools/io_uring/setup.c
··· 1 - #include <sys/types.h> 2 - #include <sys/stat.h> 3 - #include <sys/mman.h> 4 - #include <unistd.h> 5 - #include <errno.h> 6 - #include <string.h> 7 - 8 - #include "liburing.h" 9 - 10 - static int io_uring_mmap(int fd, struct io_uring_params *p, 11 - struct io_uring_sq *sq, struct io_uring_cq *cq) 12 - { 13 - size_t size; 14 - void *ptr; 15 - int ret; 16 - 17 - sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned); 18 - ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE, 19 - MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); 20 - if (ptr == MAP_FAILED) 21 - return -errno; 22 - sq->khead = ptr + p->sq_off.head; 23 - sq->ktail = ptr + p->sq_off.tail; 24 - sq->kring_mask = ptr + p->sq_off.ring_mask; 25 - sq->kring_entries = ptr + p->sq_off.ring_entries; 26 - sq->kflags = ptr + p->sq_off.flags; 27 - sq->kdropped = ptr + p->sq_off.dropped; 28 - sq->array = ptr + p->sq_off.array; 29 - 30 - size = p->sq_entries * sizeof(struct io_uring_sqe); 31 - sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, 32 - MAP_SHARED | MAP_POPULATE, fd, 33 - IORING_OFF_SQES); 34 - if (sq->sqes == MAP_FAILED) { 35 - ret = -errno; 36 - err: 37 - munmap(sq->khead, sq->ring_sz); 38 - return ret; 39 - } 40 - 41 - cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe); 42 - ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE, 43 - MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING); 44 - if (ptr == MAP_FAILED) { 45 - ret = -errno; 46 - munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe)); 47 - goto err; 48 - } 49 - cq->khead = ptr + p->cq_off.head; 50 - cq->ktail = ptr + p->cq_off.tail; 51 - cq->kring_mask = ptr + p->cq_off.ring_mask; 52 - cq->kring_entries = ptr + p->cq_off.ring_entries; 53 - cq->koverflow = ptr + p->cq_off.overflow; 54 - cq->cqes = ptr + p->cq_off.cqes; 55 - return 0; 56 - } 57 - 58 - /* 59 - * For users that want to specify sq_thread_cpu or sq_thread_idle, this 60 - * interface is a convenient helper for mmap()ing the rings. 
61 - * Returns -1 on error, or zero on success. On success, 'ring' 62 - * contains the necessary information to read/write to the rings. 63 - */ 64 - int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring) 65 - { 66 - int ret; 67 - 68 - memset(ring, 0, sizeof(*ring)); 69 - ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq); 70 - if (!ret) 71 - ring->ring_fd = fd; 72 - return ret; 73 - } 74 - 75 - /* 76 - * Returns -1 on error, or zero on success. On success, 'ring' 77 - * contains the necessary information to read/write to the rings. 78 - */ 79 - int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) 80 - { 81 - struct io_uring_params p; 82 - int fd, ret; 83 - 84 - memset(&p, 0, sizeof(p)); 85 - p.flags = flags; 86 - 87 - fd = io_uring_setup(entries, &p); 88 - if (fd < 0) 89 - return fd; 90 - 91 - ret = io_uring_queue_mmap(fd, &p, ring); 92 - if (ret) 93 - close(fd); 94 - 95 - return ret; 96 - } 97 - 98 - void io_uring_queue_exit(struct io_uring *ring) 99 - { 100 - struct io_uring_sq *sq = &ring->sq; 101 - struct io_uring_cq *cq = &ring->cq; 102 - 103 - munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe)); 104 - munmap(sq->khead, sq->ring_sz); 105 - munmap(cq->khead, cq->ring_sz); 106 - close(ring->ring_fd); 107 - }
-52
tools/io_uring/syscall.c
··· 1 - /* 2 - * Will go away once libc support is there 3 - */ 4 - #include <unistd.h> 5 - #include <sys/syscall.h> 6 - #include <sys/uio.h> 7 - #include <signal.h> 8 - #include "liburing.h" 9 - 10 - #ifdef __alpha__ 11 - /* 12 - * alpha is the only exception, all other architectures 13 - * have common numbers for new system calls. 14 - */ 15 - # ifndef __NR_io_uring_setup 16 - # define __NR_io_uring_setup 535 17 - # endif 18 - # ifndef __NR_io_uring_enter 19 - # define __NR_io_uring_enter 536 20 - # endif 21 - # ifndef __NR_io_uring_register 22 - # define __NR_io_uring_register 537 23 - # endif 24 - #else /* !__alpha__ */ 25 - # ifndef __NR_io_uring_setup 26 - # define __NR_io_uring_setup 425 27 - # endif 28 - # ifndef __NR_io_uring_enter 29 - # define __NR_io_uring_enter 426 30 - # endif 31 - # ifndef __NR_io_uring_register 32 - # define __NR_io_uring_register 427 33 - # endif 34 - #endif 35 - 36 - int io_uring_register(int fd, unsigned int opcode, void *arg, 37 - unsigned int nr_args) 38 - { 39 - return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args); 40 - } 41 - 42 - int io_uring_setup(unsigned int entries, struct io_uring_params *p) 43 - { 44 - return syscall(__NR_io_uring_setup, entries, p); 45 - } 46 - 47 - int io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete, 48 - unsigned int flags, sigset_t *sig) 49 - { 50 - return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, 51 - flags, sig, _NSIG / 8); 52 - }