Merge tag 'for-6.6/io_uring-2023-08-28' of git://git.kernel.dk/linux

-1

MAINTAINERS

··· 10966 10966 F: include/trace/events/io_uring.h 10967 10967 F: include/uapi/linux/io_uring.h 10968 10968 F: io_uring/ 10969 - F: tools/io_uring/ 10970 10969 10971 10970 IPMI SUBSYSTEM 10972 10971 M: Corey Minyard <minyard@acm.org>

+6

include/linux/io_uring.h

··· 81 81 if (tsk->io_uring) 82 82 __io_uring_free(tsk); 83 83 } 84 + int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); 84 85 #else 85 86 static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 86 87 struct iov_iter *iter, void *ioucmd) ··· 116 115 static inline const char *io_uring_get_opcode(u8 opcode) 117 116 { 118 117 return ""; 118 + } 119 + static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, 120 + unsigned int issue_flags) 121 + { 122 + return -EOPNOTSUPP; 119 123 } 120 124 #endif 121 125

+65 -64

include/linux/io_uring_types.h

··· 69 69 }; 70 70 71 71 struct io_uring { 72 - u32 head ____cacheline_aligned_in_smp; 73 - u32 tail ____cacheline_aligned_in_smp; 72 + u32 head; 73 + u32 tail; 74 74 }; 75 75 76 76 /* ··· 176 176 unsigned short submit_nr; 177 177 unsigned int cqes_count; 178 178 struct blk_plug plug; 179 - struct io_uring_cqe cqes[16]; 180 179 }; 181 180 182 181 struct io_ev_fd { ··· 204 205 unsigned int has_evfd: 1; 205 206 /* all CQEs should be posted only by the submitter task */ 206 207 unsigned int task_complete: 1; 208 + unsigned int lockless_cq: 1; 207 209 unsigned int syscall_iopoll: 1; 208 210 unsigned int poll_activated: 1; 209 211 unsigned int drain_disabled: 1; 210 212 unsigned int compat: 1; 211 213 214 + struct task_struct *submitter_task; 215 + struct io_rings *rings; 216 + struct percpu_ref refs; 217 + 212 218 enum task_work_notify_mode notify_method; 213 - 214 - /* 215 - * If IORING_SETUP_NO_MMAP is used, then the below holds 216 - * the gup'ed pages for the two rings, and the sqes. 217 - */ 218 - unsigned short n_ring_pages; 219 - unsigned short n_sqe_pages; 220 - struct page **ring_pages; 221 - struct page **sqe_pages; 222 - 223 - struct io_rings *rings; 224 - struct task_struct *submitter_task; 225 - struct percpu_ref refs; 226 219 } ____cacheline_aligned_in_smp; 227 220 228 221 /* submission data */ ··· 252 261 253 262 struct io_buffer_list *io_bl; 254 263 struct xarray io_bl_xa; 255 - struct list_head io_buffers_cache; 256 264 257 265 struct io_hash_table cancel_table_locked; 258 - struct list_head cq_overflow_list; 259 266 struct io_alloc_cache apoll_cache; 260 267 struct io_alloc_cache netmsg_cache; 268 + 269 + /* 270 + * ->iopoll_list is protected by the ctx->uring_lock for 271 + * io_uring instances that don't use IORING_SETUP_SQPOLL. 272 + * For SQPOLL, only the single threaded io_sq_thread() will 273 + * manipulate the list, hence no extra locking is needed there. 274 + */ 275 + struct io_wq_work_list iopoll_list; 276 + bool poll_multi_queue; 261 277 } ____cacheline_aligned_in_smp; 262 - 263 - /* IRQ completion list, under ->completion_lock */ 264 - struct io_wq_work_list locked_free_list; 265 - unsigned int locked_free_nr; 266 - 267 - const struct cred *sq_creds; /* cred used for __io_sq_thread() */ 268 - struct io_sq_data *sq_data; /* if using sq thread polling */ 269 - 270 - struct wait_queue_head sqo_sq_wait; 271 - struct list_head sqd_list; 272 - 273 - unsigned long check_cq; 274 - 275 - unsigned int file_alloc_start; 276 - unsigned int file_alloc_end; 277 - 278 - struct xarray personalities; 279 - u32 pers_next; 280 278 281 279 struct { 282 280 /* ··· 278 298 unsigned cached_cq_tail; 279 299 unsigned cq_entries; 280 300 struct io_ev_fd __rcu *io_ev_fd; 281 - struct wait_queue_head cq_wait; 282 301 unsigned cq_extra; 283 302 } ____cacheline_aligned_in_smp; 284 303 304 + /* 305 + * task_work and async notification delivery cacheline. Expected to 306 + * regularly bounce b/w CPUs. 307 + */ 285 308 struct { 286 - spinlock_t completion_lock; 287 - 288 - bool poll_multi_queue; 289 - atomic_t cq_wait_nr; 290 - 291 - /* 292 - * ->iopoll_list is protected by the ctx->uring_lock for 293 - * io_uring instances that don't use IORING_SETUP_SQPOLL. 294 - * For SQPOLL, only the single threaded io_sq_thread() will 295 - * manipulate the list, hence no extra locking is needed there. 296 - */ 297 - struct io_wq_work_list iopoll_list; 298 - struct io_hash_table cancel_table; 299 - 300 309 struct llist_head work_llist; 301 - 302 - struct list_head io_buffers_comp; 310 + unsigned long check_cq; 311 + atomic_t cq_wait_nr; 312 + atomic_t cq_timeouts; 313 + struct wait_queue_head cq_wait; 303 314 } ____cacheline_aligned_in_smp; 304 315 305 316 /* timeouts */ 306 317 struct { 307 318 spinlock_t timeout_lock; 308 - atomic_t cq_timeouts; 309 319 struct list_head timeout_list; 310 320 struct list_head ltimeout_list; 311 321 unsigned cq_last_tm_flush; 312 322 } ____cacheline_aligned_in_smp; 323 + 324 + struct io_uring_cqe completion_cqes[16]; 325 + 326 + spinlock_t completion_lock; 327 + 328 + /* IRQ completion list, under ->completion_lock */ 329 + struct io_wq_work_list locked_free_list; 330 + unsigned int locked_free_nr; 331 + 332 + struct list_head io_buffers_comp; 333 + struct list_head cq_overflow_list; 334 + struct io_hash_table cancel_table; 335 + 336 + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ 337 + struct io_sq_data *sq_data; /* if using sq thread polling */ 338 + 339 + struct wait_queue_head sqo_sq_wait; 340 + struct list_head sqd_list; 341 + 342 + unsigned int file_alloc_start; 343 + unsigned int file_alloc_end; 344 + 345 + struct xarray personalities; 346 + u32 pers_next; 347 + 348 + struct list_head io_buffers_cache; 313 349 314 350 /* Keep this last, we don't need it for the fast path */ 315 351 struct wait_queue_head poll_wq; ··· 370 374 unsigned sq_thread_idle; 371 375 /* protected by ->completion_lock */ 372 376 unsigned evfd_last_cq_tail; 377 + 378 + /* 379 + * If IORING_SETUP_NO_MMAP is used, then the below holds 380 + * the gup'ed pages for the two rings, and the sqes. 381 + */ 382 + unsigned short n_ring_pages; 383 + unsigned short n_sqe_pages; 384 + struct page **ring_pages; 385 + struct page **sqe_pages; 373 386 }; 374 387 375 388 struct io_tw_state { ··· 414 409 REQ_F_SINGLE_POLL_BIT, 415 410 REQ_F_DOUBLE_POLL_BIT, 416 411 REQ_F_PARTIAL_IO_BIT, 417 - REQ_F_CQE32_INIT_BIT, 418 412 REQ_F_APOLL_MULTISHOT_BIT, 419 413 REQ_F_CLEAR_POLLIN_BIT, 420 414 REQ_F_HASH_LOCKED_BIT, ··· 483 479 REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), 484 480 /* fast poll multishot mode */ 485 481 REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), 486 - /* ->extra1 and ->extra2 are initialised */ 487 - REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), 488 482 /* recvmsg special flag, clear EPOLLIN */ 489 483 REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), 490 484 /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ ··· 581 579 struct io_task_work io_task_work; 582 580 unsigned nr_tw; 583 581 /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ 584 - union { 585 - struct hlist_node hash_node; 586 - struct { 587 - u64 extra1; 588 - u64 extra2; 589 - }; 590 - }; 582 + struct hlist_node hash_node; 591 583 /* internal polling, see IORING_FEAT_FAST_POLL */ 592 584 struct async_poll *apoll; 593 585 /* opcode allocated if it needs to store data for async defer */ ··· 591 595 /* custom credentials, valid IFF REQ_F_CREDS is set */ 592 596 const struct cred *creds; 593 597 struct io_wq_work work; 598 + 599 + struct { 600 + u64 extra1; 601 + u64 extra2; 602 + } big_cqe; 594 603 }; 595 604 596 605 struct io_overflow_cqe {

+20 -1

include/uapi/linux/io_uring.h

··· 185 185 */ 186 186 #define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15) 187 187 188 + /* 189 + * Removes indirection through the SQ index array. 190 + */ 191 + #define IORING_SETUP_NO_SQARRAY (1U << 16) 192 + 188 193 enum io_uring_op { 189 194 IORING_OP_NOP, 190 195 IORING_OP_READV, ··· 304 299 * request 'user_data' 305 300 * IORING_ASYNC_CANCEL_ANY Match any request 306 301 * IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor 302 + * IORING_ASYNC_CANCEL_USERDATA Match on user_data, default for no other key 303 + * IORING_ASYNC_CANCEL_OP Match request based on opcode 307 304 */ 308 305 #define IORING_ASYNC_CANCEL_ALL (1U << 0) 309 306 #define IORING_ASYNC_CANCEL_FD (1U << 1) 310 307 #define IORING_ASYNC_CANCEL_ANY (1U << 2) 311 308 #define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3) 309 + #define IORING_ASYNC_CANCEL_USERDATA (1U << 4) 310 + #define IORING_ASYNC_CANCEL_OP (1U << 5) 312 311 313 312 /* 314 313 * send/sendmsg and recv/recvmsg flags (sqe->ioprio) ··· 706 697 __s32 fd; 707 698 __u32 flags; 708 699 struct __kernel_timespec timeout; 709 - __u64 pad[4]; 700 + __u8 opcode; 701 + __u8 pad[7]; 702 + __u64 pad2[3]; 710 703 }; 711 704 712 705 /* ··· 726 715 __u32 controllen; 727 716 __u32 payloadlen; 728 717 __u32 flags; 718 + }; 719 + 720 + /* 721 + * Argument for IORING_OP_URING_CMD when file is a socket 722 + */ 723 + enum { 724 + SOCKET_URING_OP_SIOCINQ = 0, 725 + SOCKET_URING_OP_SIOCOUTQ, 729 726 }; 730 727 731 728 #ifdef __cplusplus

+54 -22

io_uring/cancel.c

··· 22 22 u64 addr; 23 23 u32 flags; 24 24 s32 fd; 25 + u8 opcode; 25 26 }; 26 27 27 28 #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ 28 - IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED) 29 + IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \ 30 + IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP) 31 + 32 + /* 33 + * Returns true if the request matches the criteria outlined by 'cd'. 34 + */ 35 + bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) 36 + { 37 + bool match_user_data = cd->flags & IORING_ASYNC_CANCEL_USERDATA; 38 + 39 + if (req->ctx != cd->ctx) 40 + return false; 41 + 42 + if (!(cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP))) 43 + match_user_data = true; 44 + 45 + if (cd->flags & IORING_ASYNC_CANCEL_ANY) 46 + goto check_seq; 47 + if (cd->flags & IORING_ASYNC_CANCEL_FD) { 48 + if (req->file != cd->file) 49 + return false; 50 + } 51 + if (cd->flags & IORING_ASYNC_CANCEL_OP) { 52 + if (req->opcode != cd->opcode) 53 + return false; 54 + } 55 + if (match_user_data && req->cqe.user_data != cd->data) 56 + return false; 57 + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { 58 + check_seq: 59 + if (cd->seq == req->work.cancel_seq) 60 + return false; 61 + req->work.cancel_seq = cd->seq; 62 + } 63 + 64 + return true; 65 + } 29 66 30 67 static bool io_cancel_cb(struct io_wq_work *work, void *data) 31 68 { 32 69 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 33 70 struct io_cancel_data *cd = data; 34 71 35 - if (req->ctx != cd->ctx) 36 - return false; 37 - if (cd->flags & IORING_ASYNC_CANCEL_ANY) { 38 - ; 39 - } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { 40 - if (req->file != cd->file) 41 - return false; 42 - } else { 43 - if (req->cqe.user_data != cd->data) 44 - return false; 45 - } 46 - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { 47 - if (cd->seq == req->work.cancel_seq) 48 - return false; 49 - req->work.cancel_seq = cd->seq; 50 - } 51 - return true; 72 + return io_cancel_req_match(req, cd); 52 73 } 53 74 54 75 static int io_async_cancel_one(struct io_uring_task *tctx, ··· 132 111 133 112 if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) 134 113 return -EINVAL; 135 - if (sqe->off || sqe->len || sqe->splice_fd_in) 114 + if (sqe->off || sqe->splice_fd_in) 136 115 return -EINVAL; 137 116 138 117 cancel->addr = READ_ONCE(sqe->addr); ··· 143 122 if (cancel->flags & IORING_ASYNC_CANCEL_ANY) 144 123 return -EINVAL; 145 124 cancel->fd = READ_ONCE(sqe->fd); 125 + } 126 + if (cancel->flags & IORING_ASYNC_CANCEL_OP) { 127 + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) 128 + return -EINVAL; 129 + cancel->opcode = READ_ONCE(sqe->len); 146 130 } 147 131 148 132 return 0; ··· 195 169 .ctx = req->ctx, 196 170 .data = cancel->addr, 197 171 .flags = cancel->flags, 172 + .opcode = cancel->opcode, 198 173 .seq = atomic_inc_return(&req->ctx->cancel_seq), 199 174 }; 200 175 struct io_uring_task *tctx = req->task->io_uring; ··· 265 238 struct io_uring_sync_cancel_reg sc; 266 239 struct fd f = { }; 267 240 DEFINE_WAIT(wait); 268 - int ret; 241 + int ret, i; 269 242 270 243 if (copy_from_user(&sc, arg, sizeof(sc))) 271 244 return -EFAULT; 272 245 if (sc.flags & ~CANCEL_FLAGS) 273 246 return -EINVAL; 274 - if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3]) 275 - return -EINVAL; 247 + for (i = 0; i < ARRAY_SIZE(sc.pad); i++) 248 + if (sc.pad[i]) 249 + return -EINVAL; 250 + for (i = 0; i < ARRAY_SIZE(sc.pad2); i++) 251 + if (sc.pad2[i]) 252 + return -EINVAL; 276 253 277 254 cd.data = sc.addr; 278 255 cd.flags = sc.flags; 256 + cd.opcode = sc.opcode; 279 257 280 258 /* we can grab a normal file descriptor upfront */ 281 259 if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&

+2 -1

io_uring/cancel.h

··· 8 8 u64 data; 9 9 struct file *file; 10 10 }; 11 + u8 opcode; 11 12 u32 flags; 12 13 int seq; 13 14 }; 14 - 15 15 16 16 int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 17 17 int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); ··· 21 21 void init_hash_table(struct io_hash_table *table, unsigned size); 22 22 23 23 int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); 24 + bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);

+6 -12

io_uring/fdinfo.c

··· 46 46 return 0; 47 47 } 48 48 49 - static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, 50 - struct seq_file *m) 49 + /* 50 + * Caller holds a reference to the file already, we don't need to do 51 + * anything else to get an extra reference. 52 + */ 53 + __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) 51 54 { 55 + struct io_ring_ctx *ctx = f->private_data; 52 56 struct io_sq_data *sq = NULL; 53 57 struct io_overflow_cqe *ocqe; 54 58 struct io_rings *r = ctx->rings; ··· 206 202 } 207 203 208 204 spin_unlock(&ctx->completion_lock); 209 - } 210 - 211 - __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) 212 - { 213 - struct io_ring_ctx *ctx = f->private_data; 214 - 215 - if (percpu_ref_tryget(&ctx->refs)) { 216 - __io_uring_show_fdinfo(ctx, m); 217 - percpu_ref_put(&ctx->refs); 218 - } 219 205 } 220 206 #endif

+47 -23

io_uring/io-wq.c

··· 232 232 do_exit(0); 233 233 } 234 234 235 - static inline bool io_acct_run_queue(struct io_wq_acct *acct) 235 + static inline bool __io_acct_run_queue(struct io_wq_acct *acct) 236 236 { 237 - bool ret = false; 237 + return !test_bit(IO_ACCT_STALLED_BIT, &acct->flags) && 238 + !wq_list_empty(&acct->work_list); 239 + } 238 240 241 + /* 242 + * If there's work to do, returns true with acct->lock acquired. If not, 243 + * returns false with no lock held. 244 + */ 245 + static inline bool io_acct_run_queue(struct io_wq_acct *acct) 246 + __acquires(&acct->lock) 247 + { 239 248 raw_spin_lock(&acct->lock); 240 - if (!wq_list_empty(&acct->work_list) && 241 - !test_bit(IO_ACCT_STALLED_BIT, &acct->flags)) 242 - ret = true; 243 - raw_spin_unlock(&acct->lock); 249 + if (__io_acct_run_queue(acct)) 250 + return true; 244 251 245 - return ret; 252 + raw_spin_unlock(&acct->lock); 253 + return false; 246 254 } 247 255 248 256 /* ··· 276 268 io_worker_release(worker); 277 269 continue; 278 270 } 279 - if (wake_up_process(worker->task)) { 280 - io_worker_release(worker); 281 - return true; 282 - } 271 + /* 272 + * If the worker is already running, it's either already 273 + * starting work or finishing work. In either case, if it does 274 + * to go sleep, we'll kick off a new task for this work anyway. 275 + */ 276 + wake_up_process(worker->task); 283 277 io_worker_release(worker); 278 + return true; 284 279 } 285 280 286 281 return false; ··· 408 397 if (!io_acct_run_queue(acct)) 409 398 return; 410 399 400 + raw_spin_unlock(&acct->lock); 411 401 atomic_inc(&acct->nr_running); 412 402 atomic_inc(&wq->worker_refs); 413 403 io_queue_worker_create(worker, acct, create_worker_cb); ··· 533 521 raw_spin_unlock(&worker->lock); 534 522 } 535 523 536 - static void io_worker_handle_work(struct io_worker *worker) 524 + /* 525 + * Called with acct->lock held, drops it before returning 526 + */ 527 + static void io_worker_handle_work(struct io_wq_acct *acct, 528 + struct io_worker *worker) 529 + __releases(&acct->lock) 537 530 { 538 - struct io_wq_acct *acct = io_wq_get_acct(worker); 539 531 struct io_wq *wq = worker->wq; 540 532 bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); 541 533 ··· 553 537 * can't make progress, any work completion or insertion will 554 538 * clear the stalled flag. 555 539 */ 556 - raw_spin_lock(&acct->lock); 557 540 work = io_get_next_work(acct, worker); 558 541 raw_spin_unlock(&acct->lock); 559 542 if (work) { ··· 606 591 wake_up(&wq->hash->wait); 607 592 } 608 593 } while (work); 594 + 595 + if (!__io_acct_run_queue(acct)) 596 + break; 597 + raw_spin_lock(&acct->lock); 609 598 } while (1); 610 599 } 611 600 ··· 630 611 long ret; 631 612 632 613 set_current_state(TASK_INTERRUPTIBLE); 614 + 615 + /* 616 + * If we have work to do, io_acct_run_queue() returns with 617 + * the acct->lock held. If not, it will drop it. 618 + */ 633 619 while (io_acct_run_queue(acct)) 634 - io_worker_handle_work(worker); 620 + io_worker_handle_work(acct, worker); 635 621 636 622 raw_spin_lock(&wq->lock); 637 623 /* ··· 669 645 } 670 646 } 671 647 672 - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) 673 - io_worker_handle_work(worker); 648 + if (test_bit(IO_WQ_BIT_EXIT, &wq->state) && io_acct_run_queue(acct)) 649 + io_worker_handle_work(acct, worker); 674 650 675 651 io_worker_exit(worker); 676 652 return 0; ··· 933 909 clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); 934 910 raw_spin_unlock(&acct->lock); 935 911 936 - raw_spin_lock(&wq->lock); 937 912 rcu_read_lock(); 938 913 do_create = !io_wq_activate_free_worker(wq, acct); 939 914 rcu_read_unlock(); 940 - 941 - raw_spin_unlock(&wq->lock); 942 915 943 916 if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || 944 917 !atomic_read(&acct->nr_running))) { ··· 1306 1285 return __io_wq_cpu_online(wq, cpu, false); 1307 1286 } 1308 1287 1309 - int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) 1288 + int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask) 1310 1289 { 1290 + if (!tctx || !tctx->io_wq) 1291 + return -EINVAL; 1292 + 1311 1293 rcu_read_lock(); 1312 1294 if (mask) 1313 - cpumask_copy(wq->cpu_mask, mask); 1295 + cpumask_copy(tctx->io_wq->cpu_mask, mask); 1314 1296 else 1315 - cpumask_copy(wq->cpu_mask, cpu_possible_mask); 1297 + cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask); 1316 1298 rcu_read_unlock(); 1317 1299 1318 1300 return 0;

+1 -1

io_uring/io-wq.h

··· 50 50 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); 51 51 void io_wq_hash_work(struct io_wq_work *work, void *val); 52 52 53 - int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); 53 + int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask); 54 54 int io_wq_max_workers(struct io_wq *wq, int *new_count); 55 55 56 56 static inline bool io_wq_is_hashed(struct io_wq_work *work)

+125 -104

io_uring/io_uring.c

··· 147 147 bool cancel_all); 148 148 149 149 static void io_queue_sqe(struct io_kiocb *req); 150 - static void io_move_task_work_from_local(struct io_ring_ctx *ctx); 151 - static void __io_submit_flush_completions(struct io_ring_ctx *ctx); 152 150 153 151 struct kmem_cache *req_cachep; 154 152 ··· 227 229 static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) 228 230 { 229 231 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 230 - kasan_poison_object_data(req_cachep, req); 231 232 } 232 233 233 234 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) ··· 289 292 goto err; 290 293 if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) 291 294 goto err; 292 - 293 - ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); 294 - if (!ctx->dummy_ubuf) 295 - goto err; 296 - /* set invalid range, so io_import_fixed() fails meeting it */ 297 - ctx->dummy_ubuf->ubuf = -1UL; 298 - 299 295 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 300 296 0, GFP_KERNEL)) 301 297 goto err; ··· 327 337 INIT_WQ_LIST(&ctx->submit_state.compl_reqs); 328 338 return ctx; 329 339 err: 330 - kfree(ctx->dummy_ubuf); 331 340 kfree(ctx->cancel_table.hbs); 332 341 kfree(ctx->cancel_table_locked.hbs); 333 342 kfree(ctx->io_bl); ··· 615 626 616 627 static inline void __io_cq_lock(struct io_ring_ctx *ctx) 617 628 { 618 - if (!ctx->task_complete) 629 + if (!ctx->lockless_cq) 619 630 spin_lock(&ctx->completion_lock); 620 631 } 621 632 ··· 628 639 static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) 629 640 { 630 641 io_commit_cqring(ctx); 631 - 632 - if (ctx->task_complete) { 633 - /* 634 - * ->task_complete implies that only current might be waiting 635 - * for CQEs, and obviously, we currently don't. No one is 636 - * waiting, wakeups are futile, skip them. 637 - */ 638 - io_commit_cqring_flush(ctx); 639 - } else { 640 - spin_unlock(&ctx->completion_lock); 641 - io_commit_cqring_flush(ctx); 642 - io_cqring_wake(ctx); 642 + if (!ctx->task_complete) { 643 + if (!ctx->lockless_cq) 644 + spin_unlock(&ctx->completion_lock); 645 + /* IOPOLL rings only need to wake up if it's also SQPOLL */ 646 + if (!ctx->syscall_iopoll) 647 + io_cqring_wake(ctx); 643 648 } 649 + io_commit_cqring_flush(ctx); 644 650 } 645 651 646 652 static void io_cq_unlock_post(struct io_ring_ctx *ctx) ··· 643 659 { 644 660 io_commit_cqring(ctx); 645 661 spin_unlock(&ctx->completion_lock); 646 - io_commit_cqring_flush(ctx); 647 662 io_cqring_wake(ctx); 663 + io_commit_cqring_flush(ctx); 648 664 } 649 665 650 666 /* Returns true if there are no backlogged entries after the flush */ ··· 677 693 678 694 io_cq_lock(ctx); 679 695 while (!list_empty(&ctx->cq_overflow_list)) { 680 - struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true); 696 + struct io_uring_cqe *cqe; 681 697 struct io_overflow_cqe *ocqe; 682 698 683 - if (!cqe) 699 + if (!io_get_cqe_overflow(ctx, &cqe, true)) 684 700 break; 685 701 ocqe = list_first_entry(&ctx->cq_overflow_list, 686 702 struct io_overflow_cqe, list); ··· 799 815 return true; 800 816 } 801 817 802 - bool io_req_cqe_overflow(struct io_kiocb *req) 818 + void io_req_cqe_overflow(struct io_kiocb *req) 803 819 { 804 - if (!(req->flags & REQ_F_CQE32_INIT)) { 805 - req->extra1 = 0; 806 - req->extra2 = 0; 807 - } 808 - return io_cqring_event_overflow(req->ctx, req->cqe.user_data, 809 - req->cqe.res, req->cqe.flags, 810 - req->extra1, req->extra2); 820 + io_cqring_event_overflow(req->ctx, req->cqe.user_data, 821 + req->cqe.res, req->cqe.flags, 822 + req->big_cqe.extra1, req->big_cqe.extra2); 823 + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); 811 824 } 812 825 813 826 /* ··· 812 831 * control dependency is enough as we're using WRITE_ONCE to 813 832 * fill the cq entry 814 833 */ 815 - struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) 834 + bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) 816 835 { 817 836 struct io_rings *rings = ctx->rings; 818 837 unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); ··· 824 843 * Force overflow the completion. 825 844 */ 826 845 if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) 827 - return NULL; 846 + return false; 828 847 829 848 /* userspace may cheat modifying the tail, be safe and do min */ 830 849 queued = min(__io_cqring_events(ctx), ctx->cq_entries); ··· 832 851 /* we need a contiguous range, limit based on the current array offset */ 833 852 len = min(free, ctx->cq_entries - off); 834 853 if (!len) 835 - return NULL; 854 + return false; 836 855 837 856 if (ctx->flags & IORING_SETUP_CQE32) { 838 857 off <<= 1; ··· 841 860 842 861 ctx->cqe_cached = &rings->cqes[off]; 843 862 ctx->cqe_sentinel = ctx->cqe_cached + len; 844 - 845 - ctx->cached_cq_tail++; 846 - ctx->cqe_cached++; 847 - if (ctx->flags & IORING_SETUP_CQE32) 848 - ctx->cqe_cached++; 849 - return &rings->cqes[off]; 863 + return true; 850 864 } 851 865 852 866 static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, ··· 856 880 * submission (by quite a lot). Increment the overflow count in 857 881 * the ring. 858 882 */ 859 - cqe = io_get_cqe(ctx); 860 - if (likely(cqe)) { 883 + if (likely(io_get_cqe(ctx, &cqe))) { 861 884 trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); 862 885 863 886 WRITE_ONCE(cqe->user_data, user_data); ··· 880 905 881 906 lockdep_assert_held(&ctx->uring_lock); 882 907 for (i = 0; i < state->cqes_count; i++) { 883 - struct io_uring_cqe *cqe = &state->cqes[i]; 908 + struct io_uring_cqe *cqe = &ctx->completion_cqes[i]; 884 909 885 910 if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { 886 911 if (ctx->task_complete) { ··· 916 941 return __io_post_aux_cqe(ctx, user_data, res, cflags, true); 917 942 } 918 943 919 - bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, 920 - bool allow_overflow) 944 + /* 945 + * A helper for multishot requests posting additional CQEs. 946 + * Should only be used from a task_work including IO_URING_F_MULTISHOT. 947 + */ 948 + bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) 921 949 { 922 950 struct io_ring_ctx *ctx = req->ctx; 923 951 u64 user_data = req->cqe.user_data; 924 952 struct io_uring_cqe *cqe; 925 953 926 954 if (!defer) 927 - return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); 955 + return __io_post_aux_cqe(ctx, user_data, res, cflags, false); 928 956 929 957 lockdep_assert_held(&ctx->uring_lock); 930 958 931 - if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->submit_state.cqes)) { 959 + if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) { 932 960 __io_cq_lock(ctx); 933 961 __io_flush_post_cqes(ctx); 934 962 /* no need to flush - flush is deferred */ ··· 942 964 * however it's main job is to prevent unbounded posted completions, 943 965 * and in that it works just as well. 944 966 */ 945 - if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) 967 + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) 946 968 return false; 947 969 948 - cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++]; 970 + cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++]; 949 971 cqe->user_data = user_data; 950 972 cqe->res = res; 951 973 cqe->flags = cflags; ··· 958 980 struct io_rsrc_node *rsrc_node = NULL; 959 981 960 982 io_cq_lock(ctx); 961 - if (!(req->flags & REQ_F_CQE_SKIP)) 962 - io_fill_cqe_req(ctx, req); 983 + if (!(req->flags & REQ_F_CQE_SKIP)) { 984 + if (!io_fill_cqe_req(ctx, req)) 985 + io_req_cqe_overflow(req); 986 + } 963 987 964 988 /* 965 989 * If we're the last reference to this request, add to our locked ··· 979 999 io_put_kbuf_comp(req); 980 1000 if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) 981 1001 io_clean_op(req); 982 - if (!(req->flags & REQ_F_FIXED_FILE)) 983 - io_put_file(req->file); 1002 + io_put_file(req); 984 1003 985 1004 rsrc_node = req->rsrc_node; 986 1005 /* ··· 1041 1062 req->link = NULL; 1042 1063 req->async_data = NULL; 1043 1064 /* not necessary, but safer to zero */ 1044 - req->cqe.res = 0; 1065 + memset(&req->cqe, 0, sizeof(req->cqe)); 1066 + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); 1045 1067 } 1046 1068 1047 1069 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, ··· 1487 1507 io_req_task_queue(nxt); 1488 1508 } 1489 1509 1490 - void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) 1510 + static void io_free_batch_list(struct io_ring_ctx *ctx, 1511 + struct io_wq_work_node *node) 1491 1512 __must_hold(&ctx->uring_lock) 1492 1513 { 1493 1514 do { ··· 1515 1534 if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) 1516 1535 io_clean_op(req); 1517 1536 } 1518 - if (!(req->flags & REQ_F_FIXED_FILE)) 1519 - io_put_file(req->file); 1537 + io_put_file(req); 1520 1538 1521 1539 io_req_put_rsrc_locked(req, ctx); 1522 1540 ··· 1525 1545 } while (node); 1526 1546 } 1527 1547 1528 - static void __io_submit_flush_completions(struct io_ring_ctx *ctx) 1548 + void __io_submit_flush_completions(struct io_ring_ctx *ctx) 1529 1549 __must_hold(&ctx->uring_lock) 1530 1550 { 1531 1551 struct io_submit_state *state = &ctx->submit_state; ··· 1540 1560 comp_list); 1541 1561 1542 1562 if (!(req->flags & REQ_F_CQE_SKIP) && 1543 - unlikely(!__io_fill_cqe_req(ctx, req))) { 1563 + unlikely(!io_fill_cqe_req(ctx, req))) { 1544 1564 if (ctx->task_complete) { 1545 1565 spin_lock(&ctx->completion_lock); 1546 1566 io_req_cqe_overflow(req); ··· 1596 1616 static int io_iopoll_check(struct io_ring_ctx *ctx, long min) 1597 1617 { 1598 1618 unsigned int nr_events = 0; 1599 - int ret = 0; 1600 1619 unsigned long check_cq; 1601 1620 1602 1621 if (!io_allowed_run_tw(ctx)) ··· 1621 1642 return 0; 1622 1643 1623 1644 do { 1645 + int ret = 0; 1646 + 1624 1647 /* 1625 1648 * If a submit got punted to a workqueue, we can have the 1626 1649 * application entering polling for a command before it gets ··· 1651 1670 break; 1652 1671 } 1653 1672 ret = io_do_iopoll(ctx, !min); 1654 - if (ret < 0) 1655 - break; 1656 - nr_events += ret; 1657 - ret = 0; 1658 - } while (nr_events < min && !need_resched()); 1673 + if (unlikely(ret < 0)) 1674 + return ret; 1659 1675 1660 - return ret; 1676 + if (task_sigpending(current)) 1677 + return -EINTR; 1678 + if (need_resched()) 1679 + break; 1680 + 1681 + nr_events += ret; 1682 + } while (nr_events < min); 1683 + 1684 + return 0; 1661 1685 } 1662 1686 1663 1687 void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) ··· 2347 2361 */ 2348 2362 static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) 2349 2363 { 2350 - unsigned head, mask = ctx->sq_entries - 1; 2351 - unsigned sq_idx = ctx->cached_sq_head++ & mask; 2364 + unsigned mask = ctx->sq_entries - 1; 2365 + unsigned head = ctx->cached_sq_head++ & mask; 2366 + 2367 + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) { 2368 + head = READ_ONCE(ctx->sq_array[head]); 2369 + if (unlikely(head >= ctx->sq_entries)) { 2370 + /* drop invalid entries */ 2371 + spin_lock(&ctx->completion_lock); 2372 + ctx->cq_extra--; 2373 + spin_unlock(&ctx->completion_lock); 2374 + WRITE_ONCE(ctx->rings->sq_dropped, 2375 + READ_ONCE(ctx->rings->sq_dropped) + 1); 2376 + return false; 2377 + } 2378 + } 2352 2379 2353 2380 /* 2354 2381 * The cached sq head (or cq tail) serves two purposes: ··· 2371 2372 * 2) allows the kernel side to track the head on its own, even 2372 2373 * though the application is the one updating it. 2373 2374 */ 2374 - head = READ_ONCE(ctx->sq_array[sq_idx]); 2375 - if (likely(head < ctx->sq_entries)) { 2376 - /* double index for 128-byte SQEs, twice as long */ 2377 - if (ctx->flags & IORING_SETUP_SQE128) 2378 - head <<= 1; 2379 - *sqe = &ctx->sq_sqes[head]; 2380 - return true; 2381 - } 2382 2375 2383 - /* drop invalid entries */ 2384 - ctx->cq_extra--; 2385 - WRITE_ONCE(ctx->rings->sq_dropped, 2386 - READ_ONCE(ctx->rings->sq_dropped) + 1); 2387 - return false; 2376 + /* double index for 128-byte SQEs, twice as long */ 2377 + if (ctx->flags & IORING_SETUP_SQE128) 2378 + head <<= 1; 2379 + *sqe = &ctx->sq_sqes[head]; 2380 + return true; 2388 2381 } 2389 2382 2390 2383 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) ··· 2475 2484 if (!llist_empty(&ctx->work_llist)) { 2476 2485 __set_current_state(TASK_RUNNING); 2477 2486 if (io_run_local_work(ctx) > 0) 2478 - return 1; 2487 + return 0; 2479 2488 } 2480 2489 if (io_run_task_work() > 0) 2481 - return 1; 2490 + return 0; 2482 2491 if (task_sigpending(current)) 2483 2492 return -EINTR; 2484 2493 return 0; ··· 2752 2761 return SIZE_MAX; 2753 2762 #endif 2754 2763 2764 + if (ctx->flags & IORING_SETUP_NO_SQARRAY) { 2765 + if (sq_offset) 2766 + *sq_offset = SIZE_MAX; 2767 + return off; 2768 + } 2769 + 2755 2770 if (sq_offset) 2756 2771 *sq_offset = off; 2757 2772 ··· 2900 2903 io_wq_put_hash(ctx->hash_map); 2901 2904 kfree(ctx->cancel_table.hbs); 2902 2905 kfree(ctx->cancel_table_locked.hbs); 2903 - kfree(ctx->dummy_ubuf); 2904 2906 kfree(ctx->io_bl); 2905 2907 xa_destroy(&ctx->io_bl_xa); 2906 2908 kfree(ctx); ··· 3729 3733 return PTR_ERR(rings); 3730 3734 3731 3735 ctx->rings = rings; 3732 - ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 3736 + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 3737 + ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 3733 3738 rings->sq_ring_mask = p->sq_entries - 1; 3734 3739 rings->cq_ring_mask = p->cq_entries - 1; 3735 3740 rings->sq_ring_entries = p->sq_entries; ··· 3859 3862 !(ctx->flags & IORING_SETUP_SQPOLL)) 3860 3863 ctx->task_complete = true; 3861 3864 3865 + if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) 3866 + ctx->lockless_cq = true; 3867 + 3862 3868 /* 3863 3869 * lazy poll_wq activation relies on ->task_complete for synchronisation 3864 3870 * purposes, see io_activate_pollwq() ··· 3941 3941 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); 3942 3942 p->sq_off.flags = offsetof(struct io_rings, sq_flags); 3943 3943 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); 3944 - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; 3944 + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 3945 + p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; 3945 3946 p->sq_off.resv1 = 0; 3946 3947 if (!(ctx->flags & IORING_SETUP_NO_MMAP)) 3947 3948 p->sq_off.user_addr = 0; ··· 4031 4030 IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | 4032 4031 IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | 4033 4032 IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | 4034 - IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY)) 4033 + IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | 4034 + IORING_SETUP_NO_SQARRAY)) 4035 4035 return -EINVAL; 4036 4036 4037 4037 return io_uring_create(entries, &p, params); ··· 4195 4193 return 0; 4196 4194 } 4197 4195 4196 + static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx, 4197 + cpumask_var_t new_mask) 4198 + { 4199 + int ret; 4200 + 4201 + if (!(ctx->flags & IORING_SETUP_SQPOLL)) { 4202 + ret = io_wq_cpu_affinity(current->io_uring, new_mask); 4203 + } else { 4204 + mutex_unlock(&ctx->uring_lock); 4205 + ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); 4206 + mutex_lock(&ctx->uring_lock); 4207 + } 4208 + 4209 + return ret; 4210 + } 4211 + 4198 4212 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, 4199 4213 void __user *arg, unsigned len) 4200 4214 { 4201 - struct io_uring_task *tctx = current->io_uring; 4202 4215 cpumask_var_t new_mask; 4203 4216 int ret; 4204 - 4205 - if (!tctx || !tctx->io_wq) 4206 - return -EINVAL; 4207 4217 4208 4218 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4209 4219 return -ENOMEM; ··· 4237 4223 return -EFAULT; 4238 4224 } 4239 4225 4240 - ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); 4226 + ret = __io_register_iowq_aff(ctx, new_mask); 4241 4227 free_cpumask_var(new_mask); 4242 4228 return ret; 4243 4229 } 4244 4230 4245 4231 static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) 4246 4232 { 4247 - struct io_uring_task *tctx = current->io_uring; 4248 - 4249 - if (!tctx || !tctx->io_wq) 4250 - return -EINVAL; 4251 - 4252 - return io_wq_cpu_affinity(tctx->io_wq, NULL); 4233 + return __io_register_iowq_aff(ctx, NULL); 4253 4234 } 4254 4235 4255 4236 static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, ··· 4620 4611 4621 4612 io_uring_optable_init(); 4622 4613 4623 - req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | 4624 - SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); 4614 + /* 4615 + * Allow user copy in the per-command field, which starts after the 4616 + * file in io_kiocb and until the opcode field. The openat2 handling 4617 + * requires copying in user memory into the io_kiocb object in that 4618 + * range, and HARDENED_USERCOPY will complain if we haven't 4619 + * correctly annotated this range. 4620 + */ 4621 + req_cachep = kmem_cache_create_usercopy("io_kiocb", 4622 + sizeof(struct io_kiocb), 0, 4623 + SLAB_HWCACHE_ALIGN | SLAB_PANIC | 4624 + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, 4625 + offsetof(struct io_kiocb, cmd.data), 4626 + sizeof_field(struct io_kiocb, cmd.data), NULL); 4627 + 4625 4628 return 0; 4626 4629 }; 4627 4630 __initcall(io_uring_init);

+30 -49

io_uring/io_uring.h

··· 38 38 IOU_STOP_MULTISHOT = -ECANCELED, 39 39 }; 40 40 41 - struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); 42 - bool io_req_cqe_overflow(struct io_kiocb *req); 41 + bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); 42 + void io_req_cqe_overflow(struct io_kiocb *req); 43 43 int io_run_task_work_sig(struct io_ring_ctx *ctx); 44 44 void io_req_defer_failed(struct io_kiocb *req, s32 res); 45 45 void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); 46 46 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); 47 - bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, 48 - bool allow_overflow); 47 + bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags); 49 48 void __io_commit_cqring_flush(struct io_ring_ctx *ctx); 50 49 51 50 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); ··· 72 73 int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); 73 74 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); 74 75 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); 75 - void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); 76 + void __io_submit_flush_completions(struct io_ring_ctx *ctx); 76 77 int io_req_prep_async(struct io_kiocb *req); 77 78 78 79 struct io_wq_work *io_wq_free_work(struct io_wq_work *work); ··· 109 110 #define io_for_each_link(pos, head) \ 110 111 for (pos = (head); pos; pos = pos->link) 111 112 112 - static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, 113 - bool overflow) 113 + static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, 114 + struct io_uring_cqe **ret, 115 + bool overflow) 114 116 { 115 117 io_lockdep_assert_cq_locked(ctx); 116 118 117 - if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { 118 - struct io_uring_cqe *cqe = ctx->cqe_cached; 119 - 120 - ctx->cached_cq_tail++; 121 - ctx->cqe_cached++; 122 - if (ctx->flags & IORING_SETUP_CQE32) 123 - ctx->cqe_cached++; 124 - return cqe; 119 + if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { 120 + if (unlikely(!io_cqe_cache_refill(ctx, overflow))) 121 + return false; 125 122 } 126 - 127 - return __io_get_cqe(ctx, overflow); 123 + *ret = ctx->cqe_cached; 124 + ctx->cached_cq_tail++; 125 + ctx->cqe_cached++; 126 + if (ctx->flags & IORING_SETUP_CQE32) 127 + ctx->cqe_cached++; 128 + return true; 128 129 } 129 130 130 - static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) 131 + static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) 131 132 { 132 - return io_get_cqe_overflow(ctx, false); 133 + return io_get_cqe_overflow(ctx, ret, false); 133 134 } 134 135 135 - static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, 136 - struct io_kiocb *req) 136 + static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, 137 + struct io_kiocb *req) 137 138 { 138 139 struct io_uring_cqe *cqe; 139 140 ··· 142 143 * submission (by quite a lot). Increment the overflow count in 143 144 * the ring. 144 145 */ 145 - cqe = io_get_cqe(ctx); 146 - if (unlikely(!cqe)) 146 + if (unlikely(!io_get_cqe(ctx, &cqe))) 147 147 return false; 148 148 149 - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, 150 - req->cqe.res, req->cqe.flags, 151 - (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, 152 - (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); 149 + if (trace_io_uring_complete_enabled()) 150 + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, 151 + req->cqe.res, req->cqe.flags, 152 + req->big_cqe.extra1, req->big_cqe.extra2); 153 153 154 154 memcpy(cqe, &req->cqe, sizeof(*cqe)); 155 - 156 155 if (ctx->flags & IORING_SETUP_CQE32) { 157 - u64 extra1 = 0, extra2 = 0; 158 - 159 - if (req->flags & REQ_F_CQE32_INIT) { 160 - extra1 = req->extra1; 161 - extra2 = req->extra2; 162 - } 163 - 164 - WRITE_ONCE(cqe->big_cqe[0], extra1); 165 - WRITE_ONCE(cqe->big_cqe[1], extra2); 156 + memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); 157 + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); 166 158 } 167 159 return true; 168 - } 169 - 170 - static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, 171 - struct io_kiocb *req) 172 - { 173 - if (likely(__io_fill_cqe_req(ctx, req))) 174 - return true; 175 - return io_req_cqe_overflow(req); 176 160 } 177 161 178 162 static inline void req_set_fail(struct io_kiocb *req) ··· 178 196 return req->flags & REQ_F_ASYNC_DATA; 179 197 } 180 198 181 - static inline void io_put_file(struct file *file) 199 + static inline void io_put_file(struct io_kiocb *req) 182 200 { 183 - if (file) 184 - fput(file); 201 + if (!(req->flags & REQ_F_FIXED_FILE) && req->file) 202 + fput(req->file); 185 203 } 186 204 187 205 static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, ··· 336 354 struct io_kiocb *req; 337 355 338 356 req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list); 339 - kasan_unpoison_object_data(req_cachep, req); 340 357 wq_stack_extract(&ctx->submit_state.free_list); 341 358 return req; 342 359 }

+4 -4

io_uring/net.c

··· 641 641 } 642 642 643 643 if (!mshot_finished) { 644 - if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, 645 - *ret, cflags | IORING_CQE_F_MORE, true)) { 644 + if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, 645 + *ret, cflags | IORING_CQE_F_MORE)) { 646 646 io_recv_prep_retry(req); 647 647 /* Known not-empty or unknown state, retry */ 648 648 if (cflags & IORING_CQE_F_SOCK_NONEMPTY || ··· 1366 1366 1367 1367 if (ret < 0) 1368 1368 return ret; 1369 - if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, ret, 1370 - IORING_CQE_F_MORE, true)) 1369 + if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, 1370 + ret, IORING_CQE_F_MORE)) 1371 1371 goto retry; 1372 1372 1373 1373 return -ECANCELED;

+9 -12

io_uring/poll.c

··· 300 300 __poll_t mask = mangle_poll(req->cqe.res & 301 301 req->apoll_events); 302 302 303 - if (!io_aux_cqe(req, ts->locked, mask, 304 - IORING_CQE_F_MORE, false)) { 303 + if (!io_fill_cqe_req_aux(req, ts->locked, mask, 304 + IORING_CQE_F_MORE)) { 305 305 io_req_set_res(req, mask, 0); 306 306 return IOU_POLL_REMOVE_POLL_USE_RES; 307 307 } ··· 824 824 825 825 spin_lock(&hb->lock); 826 826 hlist_for_each_entry(req, &hb->list, hash_node) { 827 - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && 828 - req->file != cd->file) 829 - continue; 830 - if (cd->seq == req->work.cancel_seq) 831 - continue; 832 - req->work.cancel_seq = cd->seq; 833 - *out_bucket = hb; 834 - return req; 827 + if (io_cancel_req_match(req, cd)) { 828 + *out_bucket = hb; 829 + return req; 830 + } 835 831 } 836 832 spin_unlock(&hb->lock); 837 833 } ··· 851 855 struct io_hash_bucket *bucket; 852 856 struct io_kiocb *req; 853 857 854 - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) 858 + if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP | 859 + IORING_ASYNC_CANCEL_ANY)) 855 860 req = io_poll_file_find(ctx, cd, table, &bucket); 856 861 else 857 862 req = io_poll_find(ctx, false, cd, table, &bucket); ··· 969 972 int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) 970 973 { 971 974 struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update); 972 - struct io_cancel_data cd = { .data = poll_update->old_user_data, }; 973 975 struct io_ring_ctx *ctx = req->ctx; 976 + struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, }; 974 977 struct io_hash_bucket *bucket; 975 978 struct io_kiocb *preq; 976 979 int ret2, ret = 0;

+10 -4

io_uring/rsrc.c

··· 33 33 #define IORING_MAX_FIXED_FILES (1U << 20) 34 34 #define IORING_MAX_REG_BUFFERS (1U << 14) 35 35 36 + static const struct io_mapped_ubuf dummy_ubuf = { 37 + /* set invalid range, so io_import_fixed() fails meeting it */ 38 + .ubuf = -1UL, 39 + .ubuf_end = 0, 40 + }; 41 + 36 42 int __io_account_mem(struct user_struct *user, unsigned long nr_pages) 37 43 { 38 44 unsigned long page_limit, cur_pages, new_pages; ··· 138 132 struct io_mapped_ubuf *imu = *slot; 139 133 unsigned int i; 140 134 141 - if (imu != ctx->dummy_ubuf) { 135 + if (imu != &dummy_ubuf) { 142 136 for (i = 0; i < imu->nr_bvecs; i++) 143 137 unpin_user_page(imu->bvec[i].bv_page); 144 138 if (imu->acct_pages) ··· 465 459 break; 466 460 467 461 i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); 468 - if (ctx->user_bufs[i] != ctx->dummy_ubuf) { 462 + if (ctx->user_bufs[i] != &dummy_ubuf) { 469 463 err = io_queue_rsrc_removal(ctx->buf_data, i, 470 464 ctx->user_bufs[i]); 471 465 if (unlikely(err)) { 472 466 io_buffer_unmap(ctx, &imu); 473 467 break; 474 468 } 475 - ctx->user_bufs[i] = ctx->dummy_ubuf; 469 + ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf; 476 470 } 477 471 478 472 ctx->user_bufs[i] = imu; ··· 1083 1077 int ret, nr_pages, i; 1084 1078 struct folio *folio = NULL; 1085 1079 1086 - *pimu = ctx->dummy_ubuf; 1080 + *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; 1087 1081 if (!iov->iov_base) 1088 1082 return 0; 1089 1083

+1 -2

io_uring/rsrc.h

··· 54 54 u64 ubuf_end; 55 55 unsigned int nr_bvecs; 56 56 unsigned long acct_pages; 57 - struct bio_vec bvec[]; 57 + struct bio_vec bvec[] __counted_by(nr_bvecs); 58 58 }; 59 59 60 - void io_rsrc_put_tw(struct callback_head *cb); 61 60 void io_rsrc_node_ref_zero(struct io_rsrc_node *node); 62 61 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); 63 62 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);

+5 -19

io_uring/rw.c

··· 989 989 return ret; 990 990 } 991 991 992 - static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) 993 - { 994 - io_commit_cqring_flush(ctx); 995 - if (ctx->flags & IORING_SETUP_SQPOLL) 996 - io_cqring_wake(ctx); 997 - } 998 - 999 992 void io_rw_fail(struct io_kiocb *req) 1000 993 { 1001 994 int res; ··· 1059 1066 if (!smp_load_acquire(&req->iopoll_completed)) 1060 1067 break; 1061 1068 nr_events++; 1062 - if (unlikely(req->flags & REQ_F_CQE_SKIP)) 1063 - continue; 1064 - 1065 1069 req->cqe.flags = io_put_kbuf(req, 0); 1066 - if (unlikely(!__io_fill_cqe_req(ctx, req))) { 1067 - spin_lock(&ctx->completion_lock); 1068 - io_req_cqe_overflow(req); 1069 - spin_unlock(&ctx->completion_lock); 1070 - } 1071 1070 } 1072 - 1073 1071 if (unlikely(!nr_events)) 1074 1072 return 0; 1075 1073 1076 - io_commit_cqring(ctx); 1077 - io_cqring_ev_posted_iopoll(ctx); 1078 1074 pos = start ? start->next : ctx->iopoll_list.first; 1079 1075 wq_list_cut(&ctx->iopoll_list, prev, start); 1080 - io_free_batch_list(ctx, pos); 1076 + 1077 + if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs))) 1078 + return 0; 1079 + ctx->submit_state.compl_reqs.first = pos; 1080 + __io_submit_flush_completions(ctx); 1081 1081 return nr_events; 1082 1082 }

+2 -2

io_uring/splice.c

··· 68 68 ret = do_tee(in, out, sp->len, flags); 69 69 70 70 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 71 - io_put_file(in); 71 + fput(in); 72 72 done: 73 73 if (ret != sp->len) 74 74 req_set_fail(req); ··· 112 112 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); 113 113 114 114 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 115 - io_put_file(in); 115 + fput(in); 116 116 done: 117 117 if (ret != sp->len) 118 118 req_set_fail(req);

+15

io_uring/sqpoll.c

··· 421 421 io_sq_thread_finish(ctx); 422 422 return ret; 423 423 } 424 + 425 + __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, 426 + cpumask_var_t mask) 427 + { 428 + struct io_sq_data *sqd = ctx->sq_data; 429 + int ret = -EINVAL; 430 + 431 + if (sqd) { 432 + io_sq_thread_park(sqd); 433 + ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask); 434 + io_sq_thread_unpark(sqd); 435 + } 436 + 437 + return ret; 438 + }

+1

io_uring/sqpoll.h

··· 27 27 void io_sq_thread_unpark(struct io_sq_data *sqd); 28 28 void io_put_sq_data(struct io_sq_data *sqd); 29 29 void io_sqpoll_wait_sq(struct io_ring_ctx *ctx); 30 + int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);

+7 -13

io_uring/timeout.c

··· 73 73 74 74 if (!io_timeout_finish(timeout, data)) { 75 75 bool filled; 76 - filled = io_aux_cqe(req, ts->locked, -ETIME, IORING_CQE_F_MORE, 77 - false); 76 + filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME, 77 + IORING_CQE_F_MORE); 78 78 if (filled) { 79 79 /* re-arm timer */ 80 80 spin_lock_irq(&ctx->timeout_lock); ··· 268 268 list_for_each_entry(timeout, &ctx->timeout_list, list) { 269 269 struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); 270 270 271 - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && 272 - cd->data != tmp->cqe.user_data) 273 - continue; 274 - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { 275 - if (cd->seq == tmp->work.cancel_seq) 276 - continue; 277 - tmp->work.cancel_seq = cd->seq; 271 + if (io_cancel_req_match(tmp, cd)) { 272 + req = tmp; 273 + break; 278 274 } 279 - req = tmp; 280 - break; 281 275 } 282 276 if (!req) 283 277 return ERR_PTR(-ENOENT); ··· 403 409 struct timespec64 *ts, enum hrtimer_mode mode) 404 410 __must_hold(&ctx->timeout_lock) 405 411 { 406 - struct io_cancel_data cd = { .data = user_data, }; 412 + struct io_cancel_data cd = { .ctx = ctx, .data = user_data, }; 407 413 struct io_kiocb *req = io_timeout_extract(ctx, &cd); 408 414 struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); 409 415 struct io_timeout_data *data; ··· 467 473 int ret; 468 474 469 475 if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { 470 - struct io_cancel_data cd = { .data = tr->addr, }; 476 + struct io_cancel_data cd = { .ctx = ctx, .data = tr->addr, }; 471 477 472 478 spin_lock(&ctx->completion_lock); 473 479 ret = io_timeout_cancel(ctx, &cd);

+30 -3

io_uring/uring_cmd.c

··· 7 7 #include <linux/nospec.h> 8 8 9 9 #include <uapi/linux/io_uring.h> 10 + #include <uapi/asm-generic/ioctls.h> 10 11 11 12 #include "io_uring.h" 12 13 #include "rsrc.h" ··· 43 42 static inline void io_req_set_cqe32_extra(struct io_kiocb *req, 44 43 u64 extra1, u64 extra2) 45 44 { 46 - req->extra1 = extra1; 47 - req->extra2 = extra2; 48 - req->flags |= REQ_F_CQE32_INIT; 45 + req->big_cqe.extra1 = extra1; 46 + req->big_cqe.extra2 = extra2; 49 47 } 50 48 51 49 /* ··· 164 164 return io_import_fixed(rw, iter, req->imu, ubuf, len); 165 165 } 166 166 EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); 167 + 168 + int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) 169 + { 170 + struct socket *sock = cmd->file->private_data; 171 + struct sock *sk = sock->sk; 172 + struct proto *prot = READ_ONCE(sk->sk_prot); 173 + int ret, arg = 0; 174 + 175 + if (!prot || !prot->ioctl) 176 + return -EOPNOTSUPP; 177 + 178 + switch (cmd->sqe->cmd_op) { 179 + case SOCKET_URING_OP_SIOCINQ: 180 + ret = prot->ioctl(sk, SIOCINQ, &arg); 181 + if (ret) 182 + return ret; 183 + return arg; 184 + case SOCKET_URING_OP_SIOCOUTQ: 185 + ret = prot->ioctl(sk, SIOCOUTQ, &arg); 186 + if (ret) 187 + return ret; 188 + return arg; 189 + default: 190 + return -EOPNOTSUPP; 191 + } 192 + } 193 + EXPORT_SYMBOL_GPL(io_uring_cmd_sock);

+2

net/socket.c

··· 88 88 #include <linux/xattr.h> 89 89 #include <linux/nospec.h> 90 90 #include <linux/indirect_call_wrapper.h> 91 + #include <linux/io_uring.h> 91 92 92 93 #include <linux/uaccess.h> 93 94 #include <asm/unistd.h> ··· 161 160 #ifdef CONFIG_COMPAT 162 161 .compat_ioctl = compat_sock_ioctl, 163 162 #endif 163 + .uring_cmd = io_uring_cmd_sock, 164 164 .mmap = sock_mmap, 165 165 .release = sock_close, 166 166 .fasync = sock_fasync,

-18

tools/io_uring/Makefile

··· 1 - # SPDX-License-Identifier: GPL-2.0 2 - # Makefile for io_uring test tools 3 - CFLAGS += -Wall -Wextra -g -D_GNU_SOURCE 4 - LDLIBS += -lpthread 5 - 6 - all: io_uring-cp io_uring-bench 7 - %: %.c 8 - $(CC) $(CFLAGS) -o $@ $^ 9 - 10 - io_uring-bench: syscall.o io_uring-bench.o 11 - $(CC) $(CFLAGS) -o $@ $^ $(LDLIBS) 12 - 13 - io_uring-cp: setup.o syscall.o queue.o 14 - 15 - clean: 16 - $(RM) io_uring-cp io_uring-bench *.o 17 - 18 - .PHONY: all clean

-29

tools/io_uring/README

··· 1 - This directory includes a few programs that demonstrate how to use io_uring 2 - in an application. The examples are: 3 - 4 - io_uring-cp 5 - A very basic io_uring implementation of cp(1). It takes two 6 - arguments, copies the first argument to the second. This example 7 - is part of liburing, and hence uses the simplified liburing API 8 - for setting up an io_uring instance, submitting IO, completing IO, 9 - etc. The support functions in queue.c and setup.c are straight 10 - out of liburing. 11 - 12 - io_uring-bench 13 - Benchmark program that does random reads on a number of files. This 14 - app demonstrates the various features of io_uring, like fixed files, 15 - fixed buffers, and polled IO. There are options in the program to 16 - control which features to use. Arguments is the file (or files) that 17 - io_uring-bench should operate on. This uses the raw io_uring 18 - interface. 19 - 20 - liburing can be cloned with git here: 21 - 22 - git://git.kernel.dk/liburing 23 - 24 - and contains a number of unit tests as well for testing io_uring. It also 25 - comes with man pages for the three system calls. 26 - 27 - Fio includes an io_uring engine, you can clone fio here: 28 - 29 - git://git.kernel.dk/fio

-16

tools/io_uring/barrier.h

··· 1 - #ifndef LIBURING_BARRIER_H 2 - #define LIBURING_BARRIER_H 3 - 4 - #if defined(__x86_64) || defined(__i386__) 5 - #define read_barrier() __asm__ __volatile__("":::"memory") 6 - #define write_barrier() __asm__ __volatile__("":::"memory") 7 - #else 8 - /* 9 - * Add arch appropriate definitions. Be safe and use full barriers for 10 - * archs we don't have support for. 11 - */ 12 - #define read_barrier() __sync_synchronize() 13 - #define write_barrier() __sync_synchronize() 14 - #endif 15 - 16 - #endif

-592

tools/io_uring/io_uring-bench.c

··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - /* 3 - * Simple benchmark program that uses the various features of io_uring 4 - * to provide fast random access to a device/file. It has various 5 - * options that are control how we use io_uring, see the OPTIONS section 6 - * below. This uses the raw io_uring interface. 7 - * 8 - * Copyright (C) 2018-2019 Jens Axboe 9 - */ 10 - #include <stdio.h> 11 - #include <errno.h> 12 - #include <assert.h> 13 - #include <stdlib.h> 14 - #include <stddef.h> 15 - #include <signal.h> 16 - #include <inttypes.h> 17 - 18 - #include <sys/types.h> 19 - #include <sys/stat.h> 20 - #include <sys/ioctl.h> 21 - #include <sys/syscall.h> 22 - #include <sys/resource.h> 23 - #include <sys/mman.h> 24 - #include <sys/uio.h> 25 - #include <linux/fs.h> 26 - #include <fcntl.h> 27 - #include <unistd.h> 28 - #include <string.h> 29 - #include <pthread.h> 30 - #include <sched.h> 31 - 32 - #include "liburing.h" 33 - #include "barrier.h" 34 - 35 - #define min(a, b) ((a < b) ? (a) : (b)) 36 - 37 - struct io_sq_ring { 38 - unsigned *head; 39 - unsigned *tail; 40 - unsigned *ring_mask; 41 - unsigned *ring_entries; 42 - unsigned *flags; 43 - unsigned *array; 44 - }; 45 - 46 - struct io_cq_ring { 47 - unsigned *head; 48 - unsigned *tail; 49 - unsigned *ring_mask; 50 - unsigned *ring_entries; 51 - struct io_uring_cqe *cqes; 52 - }; 53 - 54 - #define DEPTH 128 55 - 56 - #define BATCH_SUBMIT 32 57 - #define BATCH_COMPLETE 32 58 - 59 - #define BS 4096 60 - 61 - #define MAX_FDS 16 62 - 63 - static unsigned sq_ring_mask, cq_ring_mask; 64 - 65 - struct file { 66 - unsigned long max_blocks; 67 - unsigned pending_ios; 68 - int real_fd; 69 - int fixed_fd; 70 - }; 71 - 72 - struct submitter { 73 - pthread_t thread; 74 - int ring_fd; 75 - struct drand48_data rand; 76 - struct io_sq_ring sq_ring; 77 - struct io_uring_sqe *sqes; 78 - struct iovec iovecs[DEPTH]; 79 - struct io_cq_ring cq_ring; 80 - int inflight; 81 - unsigned long reaps; 82 - unsigned long done; 83 - unsigned long calls; 84 - volatile int finish; 85 - 86 - __s32 *fds; 87 - 88 - struct file files[MAX_FDS]; 89 - unsigned nr_files; 90 - unsigned cur_file; 91 - }; 92 - 93 - static struct submitter submitters[1]; 94 - static volatile int finish; 95 - 96 - /* 97 - * OPTIONS: Set these to test the various features of io_uring. 98 - */ 99 - static int polled = 1; /* use IO polling */ 100 - static int fixedbufs = 1; /* use fixed user buffers */ 101 - static int register_files = 1; /* use fixed files */ 102 - static int buffered = 0; /* use buffered IO, not O_DIRECT */ 103 - static int sq_thread_poll = 0; /* use kernel submission/poller thread */ 104 - static int sq_thread_cpu = -1; /* pin above thread to this CPU */ 105 - static int do_nop = 0; /* no-op SQ ring commands */ 106 - 107 - static int io_uring_register_buffers(struct submitter *s) 108 - { 109 - if (do_nop) 110 - return 0; 111 - 112 - return io_uring_register(s->ring_fd, IORING_REGISTER_BUFFERS, s->iovecs, 113 - DEPTH); 114 - } 115 - 116 - static int io_uring_register_files(struct submitter *s) 117 - { 118 - unsigned i; 119 - 120 - if (do_nop) 121 - return 0; 122 - 123 - s->fds = calloc(s->nr_files, sizeof(__s32)); 124 - for (i = 0; i < s->nr_files; i++) { 125 - s->fds[i] = s->files[i].real_fd; 126 - s->files[i].fixed_fd = i; 127 - } 128 - 129 - return io_uring_register(s->ring_fd, IORING_REGISTER_FILES, s->fds, 130 - s->nr_files); 131 - } 132 - 133 - static int lk_gettid(void) 134 - { 135 - return syscall(__NR_gettid); 136 - } 137 - 138 - static unsigned file_depth(struct submitter *s) 139 - { 140 - return (DEPTH + s->nr_files - 1) / s->nr_files; 141 - } 142 - 143 - static void init_io(struct submitter *s, unsigned index) 144 - { 145 - struct io_uring_sqe *sqe = &s->sqes[index]; 146 - unsigned long offset; 147 - struct file *f; 148 - long r; 149 - 150 - if (do_nop) { 151 - sqe->opcode = IORING_OP_NOP; 152 - return; 153 - } 154 - 155 - if (s->nr_files == 1) { 156 - f = &s->files[0]; 157 - } else { 158 - f = &s->files[s->cur_file]; 159 - if (f->pending_ios >= file_depth(s)) { 160 - s->cur_file++; 161 - if (s->cur_file == s->nr_files) 162 - s->cur_file = 0; 163 - f = &s->files[s->cur_file]; 164 - } 165 - } 166 - f->pending_ios++; 167 - 168 - lrand48_r(&s->rand, &r); 169 - offset = (r % (f->max_blocks - 1)) * BS; 170 - 171 - if (register_files) { 172 - sqe->flags = IOSQE_FIXED_FILE; 173 - sqe->fd = f->fixed_fd; 174 - } else { 175 - sqe->flags = 0; 176 - sqe->fd = f->real_fd; 177 - } 178 - if (fixedbufs) { 179 - sqe->opcode = IORING_OP_READ_FIXED; 180 - sqe->addr = (unsigned long) s->iovecs[index].iov_base; 181 - sqe->len = BS; 182 - sqe->buf_index = index; 183 - } else { 184 - sqe->opcode = IORING_OP_READV; 185 - sqe->addr = (unsigned long) &s->iovecs[index]; 186 - sqe->len = 1; 187 - sqe->buf_index = 0; 188 - } 189 - sqe->ioprio = 0; 190 - sqe->off = offset; 191 - sqe->user_data = (unsigned long) f; 192 - } 193 - 194 - static int prep_more_ios(struct submitter *s, unsigned max_ios) 195 - { 196 - struct io_sq_ring *ring = &s->sq_ring; 197 - unsigned index, tail, next_tail, prepped = 0; 198 - 199 - next_tail = tail = *ring->tail; 200 - do { 201 - next_tail++; 202 - read_barrier(); 203 - if (next_tail == *ring->head) 204 - break; 205 - 206 - index = tail & sq_ring_mask; 207 - init_io(s, index); 208 - ring->array[index] = index; 209 - prepped++; 210 - tail = next_tail; 211 - } while (prepped < max_ios); 212 - 213 - if (*ring->tail != tail) { 214 - /* order tail store with writes to sqes above */ 215 - write_barrier(); 216 - *ring->tail = tail; 217 - write_barrier(); 218 - } 219 - return prepped; 220 - } 221 - 222 - static int get_file_size(struct file *f) 223 - { 224 - struct stat st; 225 - 226 - if (fstat(f->real_fd, &st) < 0) 227 - return -1; 228 - if (S_ISBLK(st.st_mode)) { 229 - unsigned long long bytes; 230 - 231 - if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0) 232 - return -1; 233 - 234 - f->max_blocks = bytes / BS; 235 - return 0; 236 - } else if (S_ISREG(st.st_mode)) { 237 - f->max_blocks = st.st_size / BS; 238 - return 0; 239 - } 240 - 241 - return -1; 242 - } 243 - 244 - static int reap_events(struct submitter *s) 245 - { 246 - struct io_cq_ring *ring = &s->cq_ring; 247 - struct io_uring_cqe *cqe; 248 - unsigned head, reaped = 0; 249 - 250 - head = *ring->head; 251 - do { 252 - struct file *f; 253 - 254 - read_barrier(); 255 - if (head == *ring->tail) 256 - break; 257 - cqe = &ring->cqes[head & cq_ring_mask]; 258 - if (!do_nop) { 259 - f = (struct file *) (uintptr_t) cqe->user_data; 260 - f->pending_ios--; 261 - if (cqe->res != BS) { 262 - printf("io: unexpected ret=%d\n", cqe->res); 263 - if (polled && cqe->res == -EOPNOTSUPP) 264 - printf("Your filesystem doesn't support poll\n"); 265 - return -1; 266 - } 267 - } 268 - reaped++; 269 - head++; 270 - } while (1); 271 - 272 - s->inflight -= reaped; 273 - *ring->head = head; 274 - write_barrier(); 275 - return reaped; 276 - } 277 - 278 - static void *submitter_fn(void *data) 279 - { 280 - struct submitter *s = data; 281 - struct io_sq_ring *ring = &s->sq_ring; 282 - int ret, prepped; 283 - 284 - printf("submitter=%d\n", lk_gettid()); 285 - 286 - srand48_r(pthread_self(), &s->rand); 287 - 288 - prepped = 0; 289 - do { 290 - int to_wait, to_submit, this_reap, to_prep; 291 - 292 - if (!prepped && s->inflight < DEPTH) { 293 - to_prep = min(DEPTH - s->inflight, BATCH_SUBMIT); 294 - prepped = prep_more_ios(s, to_prep); 295 - } 296 - s->inflight += prepped; 297 - submit_more: 298 - to_submit = prepped; 299 - submit: 300 - if (to_submit && (s->inflight + to_submit <= DEPTH)) 301 - to_wait = 0; 302 - else 303 - to_wait = min(s->inflight + to_submit, BATCH_COMPLETE); 304 - 305 - /* 306 - * Only need to call io_uring_enter if we're not using SQ thread 307 - * poll, or if IORING_SQ_NEED_WAKEUP is set. 308 - */ 309 - if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) { 310 - unsigned flags = 0; 311 - 312 - if (to_wait) 313 - flags = IORING_ENTER_GETEVENTS; 314 - if ((*ring->flags & IORING_SQ_NEED_WAKEUP)) 315 - flags |= IORING_ENTER_SQ_WAKEUP; 316 - ret = io_uring_enter(s->ring_fd, to_submit, to_wait, 317 - flags, NULL); 318 - s->calls++; 319 - } 320 - 321 - /* 322 - * For non SQ thread poll, we already got the events we needed 323 - * through the io_uring_enter() above. For SQ thread poll, we 324 - * need to loop here until we find enough events. 325 - */ 326 - this_reap = 0; 327 - do { 328 - int r; 329 - r = reap_events(s); 330 - if (r == -1) { 331 - s->finish = 1; 332 - break; 333 - } else if (r > 0) 334 - this_reap += r; 335 - } while (sq_thread_poll && this_reap < to_wait); 336 - s->reaps += this_reap; 337 - 338 - if (ret >= 0) { 339 - if (!ret) { 340 - to_submit = 0; 341 - if (s->inflight) 342 - goto submit; 343 - continue; 344 - } else if (ret < to_submit) { 345 - int diff = to_submit - ret; 346 - 347 - s->done += ret; 348 - prepped -= diff; 349 - goto submit_more; 350 - } 351 - s->done += ret; 352 - prepped = 0; 353 - continue; 354 - } else if (ret < 0) { 355 - if (errno == EAGAIN) { 356 - if (s->finish) 357 - break; 358 - if (this_reap) 359 - goto submit; 360 - to_submit = 0; 361 - goto submit; 362 - } 363 - printf("io_submit: %s\n", strerror(errno)); 364 - break; 365 - } 366 - } while (!s->finish); 367 - 368 - finish = 1; 369 - return NULL; 370 - } 371 - 372 - static void sig_int(int sig) 373 - { 374 - printf("Exiting on signal %d\n", sig); 375 - submitters[0].finish = 1; 376 - finish = 1; 377 - } 378 - 379 - static void arm_sig_int(void) 380 - { 381 - struct sigaction act; 382 - 383 - memset(&act, 0, sizeof(act)); 384 - act.sa_handler = sig_int; 385 - act.sa_flags = SA_RESTART; 386 - sigaction(SIGINT, &act, NULL); 387 - } 388 - 389 - static int setup_ring(struct submitter *s) 390 - { 391 - struct io_sq_ring *sring = &s->sq_ring; 392 - struct io_cq_ring *cring = &s->cq_ring; 393 - struct io_uring_params p; 394 - int ret, fd; 395 - void *ptr; 396 - 397 - memset(&p, 0, sizeof(p)); 398 - 399 - if (polled && !do_nop) 400 - p.flags |= IORING_SETUP_IOPOLL; 401 - if (sq_thread_poll) { 402 - p.flags |= IORING_SETUP_SQPOLL; 403 - if (sq_thread_cpu != -1) { 404 - p.flags |= IORING_SETUP_SQ_AFF; 405 - p.sq_thread_cpu = sq_thread_cpu; 406 - } 407 - } 408 - 409 - fd = io_uring_setup(DEPTH, &p); 410 - if (fd < 0) { 411 - perror("io_uring_setup"); 412 - return 1; 413 - } 414 - s->ring_fd = fd; 415 - 416 - if (fixedbufs) { 417 - ret = io_uring_register_buffers(s); 418 - if (ret < 0) { 419 - perror("io_uring_register_buffers"); 420 - return 1; 421 - } 422 - } 423 - 424 - if (register_files) { 425 - ret = io_uring_register_files(s); 426 - if (ret < 0) { 427 - perror("io_uring_register_files"); 428 - return 1; 429 - } 430 - } 431 - 432 - ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32), 433 - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 434 - IORING_OFF_SQ_RING); 435 - printf("sq_ring ptr = 0x%p\n", ptr); 436 - sring->head = ptr + p.sq_off.head; 437 - sring->tail = ptr + p.sq_off.tail; 438 - sring->ring_mask = ptr + p.sq_off.ring_mask; 439 - sring->ring_entries = ptr + p.sq_off.ring_entries; 440 - sring->flags = ptr + p.sq_off.flags; 441 - sring->array = ptr + p.sq_off.array; 442 - sq_ring_mask = *sring->ring_mask; 443 - 444 - s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe), 445 - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 446 - IORING_OFF_SQES); 447 - printf("sqes ptr = 0x%p\n", s->sqes); 448 - 449 - ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe), 450 - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 451 - IORING_OFF_CQ_RING); 452 - printf("cq_ring ptr = 0x%p\n", ptr); 453 - cring->head = ptr + p.cq_off.head; 454 - cring->tail = ptr + p.cq_off.tail; 455 - cring->ring_mask = ptr + p.cq_off.ring_mask; 456 - cring->ring_entries = ptr + p.cq_off.ring_entries; 457 - cring->cqes = ptr + p.cq_off.cqes; 458 - cq_ring_mask = *cring->ring_mask; 459 - return 0; 460 - } 461 - 462 - static void file_depths(char *buf) 463 - { 464 - struct submitter *s = &submitters[0]; 465 - unsigned i; 466 - char *p; 467 - 468 - buf[0] = '\0'; 469 - p = buf; 470 - for (i = 0; i < s->nr_files; i++) { 471 - struct file *f = &s->files[i]; 472 - 473 - if (i + 1 == s->nr_files) 474 - p += sprintf(p, "%d", f->pending_ios); 475 - else 476 - p += sprintf(p, "%d, ", f->pending_ios); 477 - } 478 - } 479 - 480 - int main(int argc, char *argv[]) 481 - { 482 - struct submitter *s = &submitters[0]; 483 - unsigned long done, calls, reap; 484 - int err, i, flags, fd; 485 - char *fdepths; 486 - void *ret; 487 - 488 - if (!do_nop && argc < 2) { 489 - printf("%s: filename\n", argv[0]); 490 - return 1; 491 - } 492 - 493 - flags = O_RDONLY | O_NOATIME; 494 - if (!buffered) 495 - flags |= O_DIRECT; 496 - 497 - i = 1; 498 - while (!do_nop && i < argc) { 499 - struct file *f; 500 - 501 - if (s->nr_files == MAX_FDS) { 502 - printf("Max number of files (%d) reached\n", MAX_FDS); 503 - break; 504 - } 505 - fd = open(argv[i], flags); 506 - if (fd < 0) { 507 - perror("open"); 508 - return 1; 509 - } 510 - 511 - f = &s->files[s->nr_files]; 512 - f->real_fd = fd; 513 - if (get_file_size(f)) { 514 - printf("failed getting size of device/file\n"); 515 - return 1; 516 - } 517 - if (f->max_blocks <= 1) { 518 - printf("Zero file/device size?\n"); 519 - return 1; 520 - } 521 - f->max_blocks--; 522 - 523 - printf("Added file %s\n", argv[i]); 524 - s->nr_files++; 525 - i++; 526 - } 527 - 528 - if (fixedbufs) { 529 - struct rlimit rlim; 530 - 531 - rlim.rlim_cur = RLIM_INFINITY; 532 - rlim.rlim_max = RLIM_INFINITY; 533 - if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { 534 - perror("setrlimit"); 535 - return 1; 536 - } 537 - } 538 - 539 - arm_sig_int(); 540 - 541 - for (i = 0; i < DEPTH; i++) { 542 - void *buf; 543 - 544 - if (posix_memalign(&buf, BS, BS)) { 545 - printf("failed alloc\n"); 546 - return 1; 547 - } 548 - s->iovecs[i].iov_base = buf; 549 - s->iovecs[i].iov_len = BS; 550 - } 551 - 552 - err = setup_ring(s); 553 - if (err) { 554 - printf("ring setup failed: %s, %d\n", strerror(errno), err); 555 - return 1; 556 - } 557 - printf("polled=%d, fixedbufs=%d, buffered=%d", polled, fixedbufs, buffered); 558 - printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", DEPTH, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries); 559 - 560 - pthread_create(&s->thread, NULL, submitter_fn, s); 561 - 562 - fdepths = malloc(8 * s->nr_files); 563 - reap = calls = done = 0; 564 - do { 565 - unsigned long this_done = 0; 566 - unsigned long this_reap = 0; 567 - unsigned long this_call = 0; 568 - unsigned long rpc = 0, ipc = 0; 569 - 570 - sleep(1); 571 - this_done += s->done; 572 - this_call += s->calls; 573 - this_reap += s->reaps; 574 - if (this_call - calls) { 575 - rpc = (this_done - done) / (this_call - calls); 576 - ipc = (this_reap - reap) / (this_call - calls); 577 - } else 578 - rpc = ipc = -1; 579 - file_depths(fdepths); 580 - printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s)\n", 581 - this_done - done, rpc, ipc, s->inflight, 582 - fdepths); 583 - done = this_done; 584 - calls = this_call; 585 - reap = this_reap; 586 - } while (!finish); 587 - 588 - pthread_join(s->thread, &ret); 589 - close(s->ring_fd); 590 - free(fdepths); 591 - return 0; 592 - }

-283

tools/io_uring/io_uring-cp.c

··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - /* 3 - * Simple test program that demonstrates a file copy through io_uring. This 4 - * uses the API exposed by liburing. 5 - * 6 - * Copyright (C) 2018-2019 Jens Axboe 7 - */ 8 - #include <stdio.h> 9 - #include <fcntl.h> 10 - #include <string.h> 11 - #include <stdlib.h> 12 - #include <unistd.h> 13 - #include <assert.h> 14 - #include <errno.h> 15 - #include <inttypes.h> 16 - #include <sys/types.h> 17 - #include <sys/stat.h> 18 - #include <sys/ioctl.h> 19 - 20 - #include "liburing.h" 21 - 22 - #define QD 64 23 - #define BS (32*1024) 24 - 25 - static int infd, outfd; 26 - 27 - struct io_data { 28 - int read; 29 - off_t first_offset, offset; 30 - size_t first_len; 31 - struct iovec iov; 32 - }; 33 - 34 - static int setup_context(unsigned entries, struct io_uring *ring) 35 - { 36 - int ret; 37 - 38 - ret = io_uring_queue_init(entries, ring, 0); 39 - if (ret < 0) { 40 - fprintf(stderr, "queue_init: %s\n", strerror(-ret)); 41 - return -1; 42 - } 43 - 44 - return 0; 45 - } 46 - 47 - static int get_file_size(int fd, off_t *size) 48 - { 49 - struct stat st; 50 - 51 - if (fstat(fd, &st) < 0) 52 - return -1; 53 - if (S_ISREG(st.st_mode)) { 54 - *size = st.st_size; 55 - return 0; 56 - } else if (S_ISBLK(st.st_mode)) { 57 - unsigned long long bytes; 58 - 59 - if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) 60 - return -1; 61 - 62 - *size = bytes; 63 - return 0; 64 - } 65 - 66 - return -1; 67 - } 68 - 69 - static void queue_prepped(struct io_uring *ring, struct io_data *data) 70 - { 71 - struct io_uring_sqe *sqe; 72 - 73 - sqe = io_uring_get_sqe(ring); 74 - assert(sqe); 75 - 76 - if (data->read) 77 - io_uring_prep_readv(sqe, infd, &data->iov, 1, data->offset); 78 - else 79 - io_uring_prep_writev(sqe, outfd, &data->iov, 1, data->offset); 80 - 81 - io_uring_sqe_set_data(sqe, data); 82 - } 83 - 84 - static int queue_read(struct io_uring *ring, off_t size, off_t offset) 85 - { 86 - struct io_uring_sqe *sqe; 87 - struct io_data *data; 88 - 89 - data = malloc(size + sizeof(*data)); 90 - if (!data) 91 - return 1; 92 - 93 - sqe = io_uring_get_sqe(ring); 94 - if (!sqe) { 95 - free(data); 96 - return 1; 97 - } 98 - 99 - data->read = 1; 100 - data->offset = data->first_offset = offset; 101 - 102 - data->iov.iov_base = data + 1; 103 - data->iov.iov_len = size; 104 - data->first_len = size; 105 - 106 - io_uring_prep_readv(sqe, infd, &data->iov, 1, offset); 107 - io_uring_sqe_set_data(sqe, data); 108 - return 0; 109 - } 110 - 111 - static void queue_write(struct io_uring *ring, struct io_data *data) 112 - { 113 - data->read = 0; 114 - data->offset = data->first_offset; 115 - 116 - data->iov.iov_base = data + 1; 117 - data->iov.iov_len = data->first_len; 118 - 119 - queue_prepped(ring, data); 120 - io_uring_submit(ring); 121 - } 122 - 123 - static int copy_file(struct io_uring *ring, off_t insize) 124 - { 125 - unsigned long reads, writes; 126 - struct io_uring_cqe *cqe; 127 - off_t write_left, offset; 128 - int ret; 129 - 130 - write_left = insize; 131 - writes = reads = offset = 0; 132 - 133 - while (insize || write_left) { 134 - int had_reads, got_comp; 135 - 136 - /* 137 - * Queue up as many reads as we can 138 - */ 139 - had_reads = reads; 140 - while (insize) { 141 - off_t this_size = insize; 142 - 143 - if (reads + writes >= QD) 144 - break; 145 - if (this_size > BS) 146 - this_size = BS; 147 - else if (!this_size) 148 - break; 149 - 150 - if (queue_read(ring, this_size, offset)) 151 - break; 152 - 153 - insize -= this_size; 154 - offset += this_size; 155 - reads++; 156 - } 157 - 158 - if (had_reads != reads) { 159 - ret = io_uring_submit(ring); 160 - if (ret < 0) { 161 - fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret)); 162 - break; 163 - } 164 - } 165 - 166 - /* 167 - * Queue is full at this point. Find at least one completion. 168 - */ 169 - got_comp = 0; 170 - while (write_left) { 171 - struct io_data *data; 172 - 173 - if (!got_comp) { 174 - ret = io_uring_wait_cqe(ring, &cqe); 175 - got_comp = 1; 176 - } else { 177 - ret = io_uring_peek_cqe(ring, &cqe); 178 - if (ret == -EAGAIN) { 179 - cqe = NULL; 180 - ret = 0; 181 - } 182 - } 183 - if (ret < 0) { 184 - fprintf(stderr, "io_uring_peek_cqe: %s\n", 185 - strerror(-ret)); 186 - return 1; 187 - } 188 - if (!cqe) 189 - break; 190 - 191 - data = io_uring_cqe_get_data(cqe); 192 - if (cqe->res < 0) { 193 - if (cqe->res == -EAGAIN) { 194 - queue_prepped(ring, data); 195 - io_uring_cqe_seen(ring, cqe); 196 - continue; 197 - } 198 - fprintf(stderr, "cqe failed: %s\n", 199 - strerror(-cqe->res)); 200 - return 1; 201 - } else if (cqe->res != data->iov.iov_len) { 202 - /* Short read/write, adjust and requeue */ 203 - data->iov.iov_base += cqe->res; 204 - data->iov.iov_len -= cqe->res; 205 - data->offset += cqe->res; 206 - queue_prepped(ring, data); 207 - io_uring_cqe_seen(ring, cqe); 208 - continue; 209 - } 210 - 211 - /* 212 - * All done. if write, nothing else to do. if read, 213 - * queue up corresponding write. 214 - */ 215 - if (data->read) { 216 - queue_write(ring, data); 217 - write_left -= data->first_len; 218 - reads--; 219 - writes++; 220 - } else { 221 - free(data); 222 - writes--; 223 - } 224 - io_uring_cqe_seen(ring, cqe); 225 - } 226 - } 227 - 228 - /* wait out pending writes */ 229 - while (writes) { 230 - struct io_data *data; 231 - 232 - ret = io_uring_wait_cqe(ring, &cqe); 233 - if (ret) { 234 - fprintf(stderr, "wait_cqe=%d\n", ret); 235 - return 1; 236 - } 237 - if (cqe->res < 0) { 238 - fprintf(stderr, "write res=%d\n", cqe->res); 239 - return 1; 240 - } 241 - data = io_uring_cqe_get_data(cqe); 242 - free(data); 243 - writes--; 244 - io_uring_cqe_seen(ring, cqe); 245 - } 246 - 247 - return 0; 248 - } 249 - 250 - int main(int argc, char *argv[]) 251 - { 252 - struct io_uring ring; 253 - off_t insize; 254 - int ret; 255 - 256 - if (argc < 3) { 257 - printf("%s: infile outfile\n", argv[0]); 258 - return 1; 259 - } 260 - 261 - infd = open(argv[1], O_RDONLY); 262 - if (infd < 0) { 263 - perror("open infile"); 264 - return 1; 265 - } 266 - outfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644); 267 - if (outfd < 0) { 268 - perror("open outfile"); 269 - return 1; 270 - } 271 - 272 - if (setup_context(QD, &ring)) 273 - return 1; 274 - if (get_file_size(infd, &insize)) 275 - return 1; 276 - 277 - ret = copy_file(&ring, insize); 278 - 279 - close(infd); 280 - close(outfd); 281 - io_uring_queue_exit(&ring); 282 - return ret; 283 - }

-187

tools/io_uring/liburing.h

··· 1 - #ifndef LIB_URING_H 2 - #define LIB_URING_H 3 - 4 - #ifdef __cplusplus 5 - extern "C" { 6 - #endif 7 - 8 - #include <sys/uio.h> 9 - #include <signal.h> 10 - #include <string.h> 11 - #include "../../include/uapi/linux/io_uring.h" 12 - #include <inttypes.h> 13 - #include <linux/swab.h> 14 - #include "barrier.h" 15 - 16 - /* 17 - * Library interface to io_uring 18 - */ 19 - struct io_uring_sq { 20 - unsigned *khead; 21 - unsigned *ktail; 22 - unsigned *kring_mask; 23 - unsigned *kring_entries; 24 - unsigned *kflags; 25 - unsigned *kdropped; 26 - unsigned *array; 27 - struct io_uring_sqe *sqes; 28 - 29 - unsigned sqe_head; 30 - unsigned sqe_tail; 31 - 32 - size_t ring_sz; 33 - }; 34 - 35 - struct io_uring_cq { 36 - unsigned *khead; 37 - unsigned *ktail; 38 - unsigned *kring_mask; 39 - unsigned *kring_entries; 40 - unsigned *koverflow; 41 - struct io_uring_cqe *cqes; 42 - 43 - size_t ring_sz; 44 - }; 45 - 46 - struct io_uring { 47 - struct io_uring_sq sq; 48 - struct io_uring_cq cq; 49 - int ring_fd; 50 - }; 51 - 52 - /* 53 - * System calls 54 - */ 55 - extern int io_uring_setup(unsigned entries, struct io_uring_params *p); 56 - extern int io_uring_enter(int fd, unsigned to_submit, 57 - unsigned min_complete, unsigned flags, sigset_t *sig); 58 - extern int io_uring_register(int fd, unsigned int opcode, void *arg, 59 - unsigned int nr_args); 60 - 61 - /* 62 - * Library interface 63 - */ 64 - extern int io_uring_queue_init(unsigned entries, struct io_uring *ring, 65 - unsigned flags); 66 - extern int io_uring_queue_mmap(int fd, struct io_uring_params *p, 67 - struct io_uring *ring); 68 - extern void io_uring_queue_exit(struct io_uring *ring); 69 - extern int io_uring_peek_cqe(struct io_uring *ring, 70 - struct io_uring_cqe **cqe_ptr); 71 - extern int io_uring_wait_cqe(struct io_uring *ring, 72 - struct io_uring_cqe **cqe_ptr); 73 - extern int io_uring_submit(struct io_uring *ring); 74 - extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring); 75 - 76 - /* 77 - * Must be called after io_uring_{peek,wait}_cqe() after the cqe has 78 - * been processed by the application. 79 - */ 80 - static inline void io_uring_cqe_seen(struct io_uring *ring, 81 - struct io_uring_cqe *cqe) 82 - { 83 - if (cqe) { 84 - struct io_uring_cq *cq = &ring->cq; 85 - 86 - (*cq->khead)++; 87 - /* 88 - * Ensure that the kernel sees our new head, the kernel has 89 - * the matching read barrier. 90 - */ 91 - write_barrier(); 92 - } 93 - } 94 - 95 - /* 96 - * Command prep helpers 97 - */ 98 - static inline void io_uring_sqe_set_data(struct io_uring_sqe *sqe, void *data) 99 - { 100 - sqe->user_data = (unsigned long) data; 101 - } 102 - 103 - static inline void *io_uring_cqe_get_data(struct io_uring_cqe *cqe) 104 - { 105 - return (void *) (uintptr_t) cqe->user_data; 106 - } 107 - 108 - static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, 109 - const void *addr, unsigned len, 110 - off_t offset) 111 - { 112 - memset(sqe, 0, sizeof(*sqe)); 113 - sqe->opcode = op; 114 - sqe->fd = fd; 115 - sqe->off = offset; 116 - sqe->addr = (unsigned long) addr; 117 - sqe->len = len; 118 - } 119 - 120 - static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd, 121 - const struct iovec *iovecs, 122 - unsigned nr_vecs, off_t offset) 123 - { 124 - io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset); 125 - } 126 - 127 - static inline void io_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd, 128 - void *buf, unsigned nbytes, 129 - off_t offset) 130 - { 131 - io_uring_prep_rw(IORING_OP_READ_FIXED, sqe, fd, buf, nbytes, offset); 132 - } 133 - 134 - static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd, 135 - const struct iovec *iovecs, 136 - unsigned nr_vecs, off_t offset) 137 - { 138 - io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset); 139 - } 140 - 141 - static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, 142 - const void *buf, unsigned nbytes, 143 - off_t offset) 144 - { 145 - io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset); 146 - } 147 - 148 - static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd, 149 - unsigned poll_mask) 150 - { 151 - memset(sqe, 0, sizeof(*sqe)); 152 - sqe->opcode = IORING_OP_POLL_ADD; 153 - sqe->fd = fd; 154 - #if __BYTE_ORDER == __BIG_ENDIAN 155 - poll_mask = __swahw32(poll_mask); 156 - #endif 157 - sqe->poll_events = poll_mask; 158 - } 159 - 160 - static inline void io_uring_prep_poll_remove(struct io_uring_sqe *sqe, 161 - void *user_data) 162 - { 163 - memset(sqe, 0, sizeof(*sqe)); 164 - sqe->opcode = IORING_OP_POLL_REMOVE; 165 - sqe->addr = (unsigned long) user_data; 166 - } 167 - 168 - static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd, 169 - unsigned fsync_flags) 170 - { 171 - memset(sqe, 0, sizeof(*sqe)); 172 - sqe->opcode = IORING_OP_FSYNC; 173 - sqe->fd = fd; 174 - sqe->fsync_flags = fsync_flags; 175 - } 176 - 177 - static inline void io_uring_prep_nop(struct io_uring_sqe *sqe) 178 - { 179 - memset(sqe, 0, sizeof(*sqe)); 180 - sqe->opcode = IORING_OP_NOP; 181 - } 182 - 183 - #ifdef __cplusplus 184 - } 185 - #endif 186 - 187 - #endif

-156

tools/io_uring/queue.c

··· 1 - #include <sys/types.h> 2 - #include <sys/stat.h> 3 - #include <sys/mman.h> 4 - #include <unistd.h> 5 - #include <errno.h> 6 - #include <string.h> 7 - 8 - #include "liburing.h" 9 - #include "barrier.h" 10 - 11 - static int __io_uring_get_cqe(struct io_uring *ring, 12 - struct io_uring_cqe **cqe_ptr, int wait) 13 - { 14 - struct io_uring_cq *cq = &ring->cq; 15 - const unsigned mask = *cq->kring_mask; 16 - unsigned head; 17 - int ret; 18 - 19 - *cqe_ptr = NULL; 20 - head = *cq->khead; 21 - do { 22 - /* 23 - * It's necessary to use a read_barrier() before reading 24 - * the CQ tail, since the kernel updates it locklessly. The 25 - * kernel has the matching store barrier for the update. The 26 - * kernel also ensures that previous stores to CQEs are ordered 27 - * with the tail update. 28 - */ 29 - read_barrier(); 30 - if (head != *cq->ktail) { 31 - *cqe_ptr = &cq->cqes[head & mask]; 32 - break; 33 - } 34 - if (!wait) 35 - break; 36 - ret = io_uring_enter(ring->ring_fd, 0, 1, 37 - IORING_ENTER_GETEVENTS, NULL); 38 - if (ret < 0) 39 - return -errno; 40 - } while (1); 41 - 42 - return 0; 43 - } 44 - 45 - /* 46 - * Return an IO completion, if one is readily available. Returns 0 with 47 - * cqe_ptr filled in on success, -errno on failure. 48 - */ 49 - int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) 50 - { 51 - return __io_uring_get_cqe(ring, cqe_ptr, 0); 52 - } 53 - 54 - /* 55 - * Return an IO completion, waiting for it if necessary. Returns 0 with 56 - * cqe_ptr filled in on success, -errno on failure. 57 - */ 58 - int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) 59 - { 60 - return __io_uring_get_cqe(ring, cqe_ptr, 1); 61 - } 62 - 63 - /* 64 - * Submit sqes acquired from io_uring_get_sqe() to the kernel. 65 - * 66 - * Returns number of sqes submitted 67 - */ 68 - int io_uring_submit(struct io_uring *ring) 69 - { 70 - struct io_uring_sq *sq = &ring->sq; 71 - const unsigned mask = *sq->kring_mask; 72 - unsigned ktail, ktail_next, submitted, to_submit; 73 - int ret; 74 - 75 - /* 76 - * If we have pending IO in the kring, submit it first. We need a 77 - * read barrier here to match the kernels store barrier when updating 78 - * the SQ head. 79 - */ 80 - read_barrier(); 81 - if (*sq->khead != *sq->ktail) { 82 - submitted = *sq->kring_entries; 83 - goto submit; 84 - } 85 - 86 - if (sq->sqe_head == sq->sqe_tail) 87 - return 0; 88 - 89 - /* 90 - * Fill in sqes that we have queued up, adding them to the kernel ring 91 - */ 92 - submitted = 0; 93 - ktail = ktail_next = *sq->ktail; 94 - to_submit = sq->sqe_tail - sq->sqe_head; 95 - while (to_submit--) { 96 - ktail_next++; 97 - read_barrier(); 98 - 99 - sq->array[ktail & mask] = sq->sqe_head & mask; 100 - ktail = ktail_next; 101 - 102 - sq->sqe_head++; 103 - submitted++; 104 - } 105 - 106 - if (!submitted) 107 - return 0; 108 - 109 - if (*sq->ktail != ktail) { 110 - /* 111 - * First write barrier ensures that the SQE stores are updated 112 - * with the tail update. This is needed so that the kernel 113 - * will never see a tail update without the preceeding sQE 114 - * stores being done. 115 - */ 116 - write_barrier(); 117 - *sq->ktail = ktail; 118 - /* 119 - * The kernel has the matching read barrier for reading the 120 - * SQ tail. 121 - */ 122 - write_barrier(); 123 - } 124 - 125 - submit: 126 - ret = io_uring_enter(ring->ring_fd, submitted, 0, 127 - IORING_ENTER_GETEVENTS, NULL); 128 - if (ret < 0) 129 - return -errno; 130 - 131 - return ret; 132 - } 133 - 134 - /* 135 - * Return an sqe to fill. Application must later call io_uring_submit() 136 - * when it's ready to tell the kernel about it. The caller may call this 137 - * function multiple times before calling io_uring_submit(). 138 - * 139 - * Returns a vacant sqe, or NULL if we're full. 140 - */ 141 - struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring) 142 - { 143 - struct io_uring_sq *sq = &ring->sq; 144 - unsigned next = sq->sqe_tail + 1; 145 - struct io_uring_sqe *sqe; 146 - 147 - /* 148 - * All sqes are used 149 - */ 150 - if (next - sq->sqe_head > *sq->kring_entries) 151 - return NULL; 152 - 153 - sqe = &sq->sqes[sq->sqe_tail & *sq->kring_mask]; 154 - sq->sqe_tail = next; 155 - return sqe; 156 - }

-107

tools/io_uring/setup.c

··· 1 - #include <sys/types.h> 2 - #include <sys/stat.h> 3 - #include <sys/mman.h> 4 - #include <unistd.h> 5 - #include <errno.h> 6 - #include <string.h> 7 - 8 - #include "liburing.h" 9 - 10 - static int io_uring_mmap(int fd, struct io_uring_params *p, 11 - struct io_uring_sq *sq, struct io_uring_cq *cq) 12 - { 13 - size_t size; 14 - void *ptr; 15 - int ret; 16 - 17 - sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned); 18 - ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE, 19 - MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); 20 - if (ptr == MAP_FAILED) 21 - return -errno; 22 - sq->khead = ptr + p->sq_off.head; 23 - sq->ktail = ptr + p->sq_off.tail; 24 - sq->kring_mask = ptr + p->sq_off.ring_mask; 25 - sq->kring_entries = ptr + p->sq_off.ring_entries; 26 - sq->kflags = ptr + p->sq_off.flags; 27 - sq->kdropped = ptr + p->sq_off.dropped; 28 - sq->array = ptr + p->sq_off.array; 29 - 30 - size = p->sq_entries * sizeof(struct io_uring_sqe); 31 - sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, 32 - MAP_SHARED | MAP_POPULATE, fd, 33 - IORING_OFF_SQES); 34 - if (sq->sqes == MAP_FAILED) { 35 - ret = -errno; 36 - err: 37 - munmap(sq->khead, sq->ring_sz); 38 - return ret; 39 - } 40 - 41 - cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe); 42 - ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE, 43 - MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING); 44 - if (ptr == MAP_FAILED) { 45 - ret = -errno; 46 - munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe)); 47 - goto err; 48 - } 49 - cq->khead = ptr + p->cq_off.head; 50 - cq->ktail = ptr + p->cq_off.tail; 51 - cq->kring_mask = ptr + p->cq_off.ring_mask; 52 - cq->kring_entries = ptr + p->cq_off.ring_entries; 53 - cq->koverflow = ptr + p->cq_off.overflow; 54 - cq->cqes = ptr + p->cq_off.cqes; 55 - return 0; 56 - } 57 - 58 - /* 59 - * For users that want to specify sq_thread_cpu or sq_thread_idle, this 60 - * interface is a convenient helper for mmap()ing the rings. 61 - * Returns -1 on error, or zero on success. On success, 'ring' 62 - * contains the necessary information to read/write to the rings. 63 - */ 64 - int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring) 65 - { 66 - int ret; 67 - 68 - memset(ring, 0, sizeof(*ring)); 69 - ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq); 70 - if (!ret) 71 - ring->ring_fd = fd; 72 - return ret; 73 - } 74 - 75 - /* 76 - * Returns -1 on error, or zero on success. On success, 'ring' 77 - * contains the necessary information to read/write to the rings. 78 - */ 79 - int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) 80 - { 81 - struct io_uring_params p; 82 - int fd, ret; 83 - 84 - memset(&p, 0, sizeof(p)); 85 - p.flags = flags; 86 - 87 - fd = io_uring_setup(entries, &p); 88 - if (fd < 0) 89 - return fd; 90 - 91 - ret = io_uring_queue_mmap(fd, &p, ring); 92 - if (ret) 93 - close(fd); 94 - 95 - return ret; 96 - } 97 - 98 - void io_uring_queue_exit(struct io_uring *ring) 99 - { 100 - struct io_uring_sq *sq = &ring->sq; 101 - struct io_uring_cq *cq = &ring->cq; 102 - 103 - munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe)); 104 - munmap(sq->khead, sq->ring_sz); 105 - munmap(cq->khead, cq->ring_sz); 106 - close(ring->ring_fd); 107 - }

-52

tools/io_uring/syscall.c

··· 1 - /* 2 - * Will go away once libc support is there 3 - */ 4 - #include <unistd.h> 5 - #include <sys/syscall.h> 6 - #include <sys/uio.h> 7 - #include <signal.h> 8 - #include "liburing.h" 9 - 10 - #ifdef __alpha__ 11 - /* 12 - * alpha is the only exception, all other architectures 13 - * have common numbers for new system calls. 14 - */ 15 - # ifndef __NR_io_uring_setup 16 - # define __NR_io_uring_setup 535 17 - # endif 18 - # ifndef __NR_io_uring_enter 19 - # define __NR_io_uring_enter 536 20 - # endif 21 - # ifndef __NR_io_uring_register 22 - # define __NR_io_uring_register 537 23 - # endif 24 - #else /* !__alpha__ */ 25 - # ifndef __NR_io_uring_setup 26 - # define __NR_io_uring_setup 425 27 - # endif 28 - # ifndef __NR_io_uring_enter 29 - # define __NR_io_uring_enter 426 30 - # endif 31 - # ifndef __NR_io_uring_register 32 - # define __NR_io_uring_register 427 33 - # endif 34 - #endif 35 - 36 - int io_uring_register(int fd, unsigned int opcode, void *arg, 37 - unsigned int nr_args) 38 - { 39 - return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args); 40 - } 41 - 42 - int io_uring_setup(unsigned int entries, struct io_uring_params *p) 43 - { 44 - return syscall(__NR_io_uring_setup, entries, p); 45 - } 46 - 47 - int io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete, 48 - unsigned int flags, sigset_t *sig) 49 - { 50 - return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, 51 - flags, sig, _NSIG / 8); 52 - }