/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
/*
 * Header file for the io_uring interface.
 *
 * Copyright (C) 2019 Jens Axboe
 * Copyright (C) 2019 Christoph Hellwig
 */
#ifndef LINUX_IO_URING_H
#define LINUX_IO_URING_H

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/time_types.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * IO submission data structure (Submission Queue Entry)
 */
struct io_uring_sqe {
	__u8	opcode;		/* type of operation for this sqe */
	__u8	flags;		/* IOSQE_ flags */
	__u16	ioprio;		/* ioprio for the request */
	__s32	fd;		/* file descriptor to do IO on */
	union {
		__u64	off;	/* offset into file */
		__u64	addr2;
		struct {
			__u32	cmd_op;
			__u32	__pad1;
		};
	};
	union {
		__u64	addr;	/* pointer to buffer or iovecs */
		__u64	splice_off_in;
	};
	__u32	len;		/* buffer size or number of iovecs */
	union {
		__kernel_rwf_t	rw_flags;
		__u32		fsync_flags;
		__u16		poll_events;	/* compatibility */
		__u32		poll32_events;	/* word-reversed for BE */
		__u32		sync_range_flags;
		__u32		msg_flags;
		__u32		timeout_flags;
		__u32		accept_flags;
		__u32		cancel_flags;
		__u32		open_flags;
		__u32		statx_flags;
		__u32		fadvise_advice;
		__u32		splice_flags;
		__u32		rename_flags;
		__u32		unlink_flags;
		__u32		hardlink_flags;
		__u32		xattr_flags;
		__u32		msg_ring_flags;
	};
	__u64	user_data;	/* data to be passed back at completion time */
	/* pack this to avoid bogus arm OABI complaints */
	union {
		/* index into fixed buffers, if used */
		__u16	buf_index;
		/* for grouped buffer selection */
		__u16	buf_group;
	} __attribute__((packed));
	/* personality to use, if used */
	__u16	personality;
	union {
		__s32	splice_fd_in;
		__u32	file_index;
		struct {
			__u16	addr_len;
			__u16	__pad3[1];
		};
	};
	union {
		struct {
			__u64	addr3;
			__u64	__pad2[1];
		};
		/*
		 * If the ring is initialized with IORING_SETUP_SQE128, then
		 * this field is used for 80 bytes of arbitrary command data
		 */
		__u8	cmd[0];
	};
};

/*
 * If sqe->file_index is set to this for opcodes that instantiate a new
 * direct descriptor (like openat/openat2/accept), then io_uring will allocate
 * an available direct descriptor instead of having the application pass one
 * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE
 * if the space is full.
 */
#define IORING_FILE_INDEX_ALLOC		(~0U)

/* Bit positions of the sqe->flags values defined below */
enum {
	IOSQE_FIXED_FILE_BIT,
	IOSQE_IO_DRAIN_BIT,
	IOSQE_IO_LINK_BIT,
	IOSQE_IO_HARDLINK_BIT,
	IOSQE_ASYNC_BIT,
	IOSQE_BUFFER_SELECT_BIT,
	IOSQE_CQE_SKIP_SUCCESS_BIT,
};

/*
 * sqe->flags
 */
/* use fixed fileset */
#define IOSQE_FIXED_FILE	(1U << IOSQE_FIXED_FILE_BIT)
/* issue after inflight IO */
#define IOSQE_IO_DRAIN		(1U << IOSQE_IO_DRAIN_BIT)
/* links next sqe */
#define IOSQE_IO_LINK		(1U << IOSQE_IO_LINK_BIT)
/* like LINK, but stronger */
#define IOSQE_IO_HARDLINK	(1U << IOSQE_IO_HARDLINK_BIT)
/* always go async */
#define IOSQE_ASYNC		(1U << IOSQE_ASYNC_BIT)
/* select buffer from sqe->buf_group */
#define IOSQE_BUFFER_SELECT	(1U << IOSQE_BUFFER_SELECT_BIT)
/* don't post CQE if request succeeded */
#define IOSQE_CQE_SKIP_SUCCESS	(1U << IOSQE_CQE_SKIP_SUCCESS_BIT)

/*
 * io_uring_setup() flags
 */
#define IORING_SETUP_IOPOLL	(1U << 0)	/* io_context is polled */
#define IORING_SETUP_SQPOLL	(1U << 1)	/* SQ poll thread */
#define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
#define IORING_SETUP_CQSIZE	(1U << 3)	/* app defines CQ size */
#define IORING_SETUP_CLAMP	(1U << 4)	/* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
#define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
#define IORING_SETUP_SUBMIT_ALL	(1U << 7)	/* continue submit on error */
/*
 * Cooperative task running. When requests complete, they often require
 * forcing the submitter to transition to the kernel to complete. If this
 * flag is set, work will be done when the task transitions anyway, rather
 * than force an inter-processor interrupt reschedule. This avoids interrupting
 * a task running in userspace, and saves an IPI.
 */
#define IORING_SETUP_COOP_TASKRUN	(1U << 8)
/*
 * If COOP_TASKRUN is set, get notified if task work is available for
 * running and a kernel transition would be needed to run it. This sets
 * IORING_SQ_TASKRUN in the sq ring flags. Not valid without COOP_TASKRUN.
 */
#define IORING_SETUP_TASKRUN_FLAG	(1U << 9)
#define IORING_SETUP_SQE128		(1U << 10) /* SQEs are 128 byte */
#define IORING_SETUP_CQE32		(1U << 11) /* CQEs are 32 byte */
/*
 * Only one task is allowed to submit requests
 */
#define IORING_SETUP_SINGLE_ISSUER	(1U << 12)

/* Values for sqe->opcode */
enum io_uring_op {
	IORING_OP_NOP,
	IORING_OP_READV,
	IORING_OP_WRITEV,
	IORING_OP_FSYNC,
	IORING_OP_READ_FIXED,
	IORING_OP_WRITE_FIXED,
	IORING_OP_POLL_ADD,
	IORING_OP_POLL_REMOVE,
	IORING_OP_SYNC_FILE_RANGE,
	IORING_OP_SENDMSG,
	IORING_OP_RECVMSG,
	IORING_OP_TIMEOUT,
	IORING_OP_TIMEOUT_REMOVE,
	IORING_OP_ACCEPT,
	IORING_OP_ASYNC_CANCEL,
	IORING_OP_LINK_TIMEOUT,
	IORING_OP_CONNECT,
	IORING_OP_FALLOCATE,
	IORING_OP_OPENAT,
	IORING_OP_CLOSE,
	IORING_OP_FILES_UPDATE,
	IORING_OP_STATX,
	IORING_OP_READ,
	IORING_OP_WRITE,
	IORING_OP_FADVISE,
	IORING_OP_MADVISE,
	IORING_OP_SEND,
	IORING_OP_RECV,
	IORING_OP_OPENAT2,
	IORING_OP_EPOLL_CTL,
	IORING_OP_SPLICE,
	IORING_OP_PROVIDE_BUFFERS,
	IORING_OP_REMOVE_BUFFERS,
	IORING_OP_TEE,
	IORING_OP_SHUTDOWN,
	IORING_OP_RENAMEAT,
	IORING_OP_UNLINKAT,
	IORING_OP_MKDIRAT,
	IORING_OP_SYMLINKAT,
	IORING_OP_LINKAT,
	IORING_OP_MSG_RING,
	IORING_OP_FSETXATTR,
	IORING_OP_SETXATTR,
	IORING_OP_FGETXATTR,
	IORING_OP_GETXATTR,
	IORING_OP_SOCKET,
	IORING_OP_URING_CMD,
	IORING_OP_SEND_ZC,

	/* this goes last, obviously */
	IORING_OP_LAST,
};

/*
 * sqe->fsync_flags
 */
#define IORING_FSYNC_DATASYNC	(1U << 0)

/*
 * sqe->timeout_flags
 */
#define IORING_TIMEOUT_ABS		(1U << 0)
#define IORING_TIMEOUT_UPDATE		(1U << 1)
#define IORING_TIMEOUT_BOOTTIME		(1U << 2)
#define IORING_TIMEOUT_REALTIME		(1U << 3)
#define IORING_LINK_TIMEOUT_UPDATE	(1U << 4)
#define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
#define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
 * sqe->splice_flags
 * extends splice(2) flags
 */
#define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */

/*
 * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
 * command flags for POLL_ADD are stored in sqe->len.
 *
 * IORING_POLL_ADD_MULTI	Multishot poll. Sets IORING_CQE_F_MORE if
 *				the poll handler will continue to report
 *				CQEs on behalf of the same SQE.
 *
 * IORING_POLL_UPDATE_EVENTS,
 * IORING_POLL_UPDATE_USER_DATA
 *				Update existing poll request, matching
 *				sqe->addr as the old user_data field.
 *
 * IORING_POLL_ADD_LEVEL	Level triggered poll.
 */
#define IORING_POLL_ADD_MULTI		(1U << 0)
#define IORING_POLL_UPDATE_EVENTS	(1U << 1)
#define IORING_POLL_UPDATE_USER_DATA	(1U << 2)
#define IORING_POLL_ADD_LEVEL		(1U << 3)

/*
 * ASYNC_CANCEL flags.
 *
 * IORING_ASYNC_CANCEL_ALL	Cancel all requests that match the given key
 * IORING_ASYNC_CANCEL_FD	Key off 'fd' for cancelation rather than the
 *				request 'user_data'
 * IORING_ASYNC_CANCEL_ANY	Match any request
 * IORING_ASYNC_CANCEL_FD_FIXED	'fd' passed in is a fixed descriptor
 */
#define IORING_ASYNC_CANCEL_ALL		(1U << 0)
#define IORING_ASYNC_CANCEL_FD		(1U << 1)
#define IORING_ASYNC_CANCEL_ANY		(1U << 2)
#define IORING_ASYNC_CANCEL_FD_FIXED	(1U << 3)

/*
 * send/sendmsg and recv/recvmsg flags (sqe->ioprio)
 *
 * IORING_RECVSEND_POLL_FIRST	If set, instead of first attempting to send
 *				or receive and arm poll if that yields an
 *				-EAGAIN result, arm poll upfront and skip
 *				the initial transfer attempt.
 *
 * IORING_RECV_MULTISHOT	Multishot recv. Sets IORING_CQE_F_MORE if
 *				the handler will continue to report
 *				CQEs on behalf of the same SQE.
 *
 * IORING_RECVSEND_FIXED_BUF	Use registered buffers, the index is stored in
 *				the buf_index field.
 */
#define IORING_RECVSEND_POLL_FIRST	(1U << 0)
#define IORING_RECV_MULTISHOT		(1U << 1)
#define IORING_RECVSEND_FIXED_BUF	(1U << 2)

/*
 * accept flags stored in sqe->ioprio
 */
#define IORING_ACCEPT_MULTISHOT	(1U << 0)

/*
 * IORING_OP_MSG_RING command types, stored in sqe->addr
 */
enum {
	IORING_MSG_DATA,	/* pass sqe->len as 'res' and off as user_data */
	IORING_MSG_SEND_FD,	/* send a registered fd to another ring */
};

/*
 * IORING_OP_MSG_RING flags (sqe->msg_ring_flags)
 *
 * IORING_MSG_RING_CQE_SKIP	Don't post a CQE to the target ring. Not
 *				applicable for IORING_MSG_DATA, obviously.
 */
#define IORING_MSG_RING_CQE_SKIP	(1U << 0)

/*
 * IO completion data structure (Completion Queue Entry)
 */
struct io_uring_cqe {
	__u64	user_data;	/* sqe->user_data value passed back */
	__s32	res;		/* result code for this event */
	__u32	flags;

	/*
	 * If the ring is initialized with IORING_SETUP_CQE32, then this field
	 * contains 16-bytes of padding, doubling the size of the CQE.
	 */
	__u64 big_cqe[];
};

/*
 * cqe->flags
 *
 * IORING_CQE_F_BUFFER	If set, the upper 16 bits are the buffer ID
 * IORING_CQE_F_MORE	If set, parent SQE will generate more CQE entries
 * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
 * IORING_CQE_F_NOTIF	Set for notification CQEs. Can be used to distinguish
 *			them from sends.
 */
#define IORING_CQE_F_BUFFER		(1U << 0)
#define IORING_CQE_F_MORE		(1U << 1)
#define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
#define IORING_CQE_F_NOTIF		(1U << 3)

/* Shift that moves the buffer ID out of the upper 16 bits of cqe->flags */
enum {
	IORING_CQE_BUFFER_SHIFT		= 16,
};

/*
 * Magic offsets for the application to mmap the data it needs
 */
#define IORING_OFF_SQ_RING		0ULL
#define IORING_OFF_CQ_RING		0x8000000ULL
#define IORING_OFF_SQES			0x10000000ULL

/*
 * Filled with the offset for mmap(2)
 */
struct io_sqring_offsets {
	__u32 head;
	__u32 tail;
	__u32 ring_mask;
	__u32 ring_entries;
	__u32 flags;
	__u32 dropped;
	__u32 array;
	__u32 resv1;
	__u64 resv2;
};

/*
 * sq_ring->flags
 */
#define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
#define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* CQ ring has overflowed */
#define IORING_SQ_TASKRUN	(1U << 2) /* task should enter the kernel */

/* Completion-ring counterpart of struct io_sqring_offsets */
struct io_cqring_offsets {
	__u32 head;
	__u32 tail;
	__u32 ring_mask;
	__u32 ring_entries;
	__u32 overflow;
	__u32 cqes;
	__u32 flags;
	__u32 resv1;
	__u64 resv2;
};

/*
 * cq_ring->flags
 */

/* disable eventfd notifications */
#define IORING_CQ_EVENTFD_DISABLED	(1U << 0)

/*
 * io_uring_enter(2) flags
 */
#define IORING_ENTER_GETEVENTS		(1U << 0)
#define IORING_ENTER_SQ_WAKEUP		(1U << 1)
#define IORING_ENTER_SQ_WAIT		(1U << 2)
#define IORING_ENTER_EXT_ARG		(1U << 3)
#define IORING_ENTER_REGISTERED_RING	(1U << 4)

/*
 * Passed in for io_uring_setup(2). Copied back with updated info on success
 */
struct io_uring_params {
	__u32 sq_entries;
	__u32 cq_entries;
	__u32 flags;
	__u32 sq_thread_cpu;
	__u32 sq_thread_idle;
	__u32 features;
	__u32 wq_fd;
	__u32 resv[3];
	struct io_sqring_offsets sq_off;
	struct io_cqring_offsets cq_off;
};

/*
 * io_uring_params->features flags
 */
#define IORING_FEAT_SINGLE_MMAP		(1U << 0)
#define IORING_FEAT_NODROP		(1U << 1)
#define IORING_FEAT_SUBMIT_STABLE	(1U << 2)
#define IORING_FEAT_RW_CUR_POS		(1U << 3)
#define IORING_FEAT_CUR_PERSONALITY	(1U << 4)
#define IORING_FEAT_FAST_POLL		(1U << 5)
#define IORING_FEAT_POLL_32BITS 	(1U << 6)
#define IORING_FEAT_SQPOLL_NONFIXED	(1U << 7)
#define IORING_FEAT_EXT_ARG		(1U << 8)
#define IORING_FEAT_NATIVE_WORKERS	(1U << 9)
#define IORING_FEAT_RSRC_TAGS		(1U << 10)
#define IORING_FEAT_CQE_SKIP		(1U << 11)
#define IORING_FEAT_LINKED_FILE		(1U << 12)

/*
 * io_uring_register(2) opcodes and arguments
 */
enum {
	IORING_REGISTER_BUFFERS			= 0,
	IORING_UNREGISTER_BUFFERS		= 1,
	IORING_REGISTER_FILES			= 2,
	IORING_UNREGISTER_FILES			= 3,
	IORING_REGISTER_EVENTFD			= 4,
	IORING_UNREGISTER_EVENTFD		= 5,
	IORING_REGISTER_FILES_UPDATE		= 6,
	IORING_REGISTER_EVENTFD_ASYNC		= 7,
	IORING_REGISTER_PROBE			= 8,
	IORING_REGISTER_PERSONALITY		= 9,
	IORING_UNREGISTER_PERSONALITY		= 10,
	IORING_REGISTER_RESTRICTIONS		= 11,
	IORING_REGISTER_ENABLE_RINGS		= 12,

	/* extended with tagging */
	IORING_REGISTER_FILES2			= 13,
	IORING_REGISTER_FILES_UPDATE2		= 14,
	IORING_REGISTER_BUFFERS2		= 15,
	IORING_REGISTER_BUFFERS_UPDATE		= 16,

	/* set/clear io-wq thread affinities */
	IORING_REGISTER_IOWQ_AFF		= 17,
	IORING_UNREGISTER_IOWQ_AFF		= 18,

	/* set/get max number of io-wq workers */
	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,

	/* register/unregister io_uring fd with the ring */
	IORING_REGISTER_RING_FDS		= 20,
	IORING_UNREGISTER_RING_FDS		= 21,

	/* register ring based provide buffer group */
	IORING_REGISTER_PBUF_RING		= 22,
	IORING_UNREGISTER_PBUF_RING		= 23,

	/* sync cancelation API */
	IORING_REGISTER_SYNC_CANCEL		= 24,

	/* register a range of fixed file slots for automatic slot allocation */
	IORING_REGISTER_FILE_ALLOC_RANGE	= 25,

	/* this goes last */
	IORING_REGISTER_LAST
};

/* io-wq worker categories */
enum {
	IO_WQ_BOUND,
	IO_WQ_UNBOUND,
};

/* deprecated, see struct io_uring_rsrc_update */
struct io_uring_files_update {
	__u32 offset;
	__u32 resv;
	__aligned_u64 /* __s32 * */ fds;
};

/*
 * Register a fully sparse file space, rather than pass in an array of all
 * -1 file descriptors.
 */
#define IORING_RSRC_REGISTER_SPARSE	(1U << 0)

struct io_uring_rsrc_register {
	__u32 nr;
	__u32 flags;
	__u64 resv2;
	__aligned_u64 data;
	__aligned_u64 tags;
};

struct io_uring_rsrc_update {
	__u32 offset;
	__u32 resv;
	__aligned_u64 data;
};

struct io_uring_rsrc_update2 {
	__u32 offset;
	__u32 resv;
	__aligned_u64 data;
	__aligned_u64 tags;
	__u32 nr;
	__u32 resv2;
};

struct io_uring_notification_slot {
	__u64 tag;
	__u64 resv[3];
};

struct io_uring_notification_register {
	__u32 nr_slots;
	__u32 resv;
	__u64 resv2;
	__u64 data;
	__u64 resv3;
};

/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP	(-2)

#define IO_URING_OP_SUPPORTED	(1U << 0)

struct io_uring_probe_op {
	__u8 op;
	__u8 resv;
	__u16 flags;	/* IO_URING_OP_* flags */
	__u32 resv2;
};

struct io_uring_probe {
	__u8 last_op;	/* last opcode supported */
	__u8 ops_len;	/* length of ops[] array below */
	__u16 resv;
	__u32 resv2[3];
	struct io_uring_probe_op ops[];
};

struct io_uring_restriction {
	__u16 opcode;
	union {
		__u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */
		__u8 sqe_op;      /* IORING_RESTRICTION_SQE_OP */
		__u8 sqe_flags;   /* IORING_RESTRICTION_SQE_FLAGS_* */
	};
	__u8 resv;
	__u32 resv2[3];
};

struct io_uring_buf {
	__u64	addr;
	__u32	len;
	__u16	bid;
	__u16	resv;
};

struct io_uring_buf_ring {
	union {
		/*
		 * To avoid spilling into more pages than we need to, the
		 * ring tail is overlaid with the io_uring_buf->resv field.
		 */
		struct {
			__u64	resv1;
			__u32	resv2;
			__u16	resv3;
			__u16	tail;
		};
		struct io_uring_buf	bufs[0];
	};
};

/* argument for IORING_(UN)REGISTER_PBUF_RING */
struct io_uring_buf_reg {
	__u64	ring_addr;
	__u32	ring_entries;
	__u16	bgid;
	__u16	pad;
	__u64	resv[3];
};

/*
 * io_uring_restriction->opcode values
 */
enum {
	/* Allow an io_uring_register(2) opcode */
	IORING_RESTRICTION_REGISTER_OP		= 0,

	/* Allow an sqe opcode */
	IORING_RESTRICTION_SQE_OP		= 1,

	/* Allow sqe flags */
	IORING_RESTRICTION_SQE_FLAGS_ALLOWED	= 2,

	/* Require sqe flags (these flags must be set on each submission) */
	IORING_RESTRICTION_SQE_FLAGS_REQUIRED	= 3,

	IORING_RESTRICTION_LAST
};

struct io_uring_getevents_arg {
	__u64	sigmask;
	__u32	sigmask_sz;
	__u32	pad;
	__u64	ts;
};

/*
 * Argument for IORING_REGISTER_SYNC_CANCEL
 */
struct io_uring_sync_cancel_reg {
	__u64				addr;
	__s32				fd;
	__u32				flags;
	struct __kernel_timespec	timeout;
	__u64				pad[4];
};

/*
 * Argument for IORING_REGISTER_FILE_ALLOC_RANGE
 * The range is specified as [off, off + len)
 */
struct io_uring_file_index_range {
	__u32	off;
	__u32	len;
	__u64	resv;
};

struct io_uring_recvmsg_out {
	__u32 namelen;
	__u32 controllen;
	__u32 payloadlen;
	__u32 flags;
};

#ifdef __cplusplus
}
#endif

#endif