Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:
"Not a lot in terms of features this time around, mostly just cleanups
and code consolidation:

- Support for PI meta data read/write via io_uring, with NVMe and
SCSI covered

- Cleanup the per-op structure caching, making it consistent across
various command types

- Consolidate the various user mapped features into a concept called
regions, making the various users of that consistent

- Various cleanups and fixes"

* tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux: (56 commits)
io_uring/fdinfo: fix io_uring_show_fdinfo() misuse of ->d_iname
io_uring: reuse io_should_terminate_tw() for cmds
io_uring: Factor out a function to parse restrictions
io_uring/rsrc: require cloned buffers to share accounting contexts
io_uring: simplify the SQPOLL thread check when cancelling requests
io_uring: expose read/write attribute capability
io_uring/rw: don't gate retry on completion context
io_uring/rw: handle -EAGAIN retry at IO completion time
io_uring/rw: use io_rw_recycle() from cleanup path
io_uring/rsrc: simplify the bvec iter count calculation
io_uring: ensure io_queue_deferred() is out-of-line
io_uring/rw: always clear ->bytes_done on io_async_rw setup
io_uring/rw: use NULL for rw->free_iovec assignment
io_uring/rw: don't mask in f_iocb_flags
io_uring/msg_ring: Drop custom destructor
io_uring: Move old async data allocation helper to header
io_uring/rw: Allocate async data through helper
io_uring/net: Allocate msghdr async data through helper
io_uring/uring_cmd: Allocate async data through generic helper
io_uring/poll: Allocate apoll with generic alloc_cache helper
...

+859 -790
+69 -15
block/bio-integrity.c
··· 118 118 119 119 static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip) 120 120 { 121 - unsigned short nr_vecs = bip->bip_max_vcnt - 1; 122 - struct bio_vec *copy = &bip->bip_vec[1]; 123 - size_t bytes = bip->bip_iter.bi_size; 124 - struct iov_iter iter; 121 + unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1; 122 + struct bio_vec *orig_bvecs = &bip->bip_vec[1]; 123 + struct bio_vec *bounce_bvec = &bip->bip_vec[0]; 124 + size_t bytes = bounce_bvec->bv_len; 125 + struct iov_iter orig_iter; 125 126 int ret; 126 127 127 - iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes); 128 - ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter); 128 + iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes); 129 + ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter); 129 130 WARN_ON_ONCE(ret != bytes); 130 131 131 - bio_integrity_unpin_bvec(copy, nr_vecs, true); 132 + bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs, true); 132 133 } 133 134 134 135 /** ··· 302 301 return nr_bvecs; 303 302 } 304 303 305 - int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) 304 + int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) 306 305 { 307 306 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 308 307 unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); 309 308 struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; 310 309 struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; 310 + size_t offset, bytes = iter->count; 311 311 unsigned int direction, nr_bvecs; 312 - struct iov_iter iter; 313 312 int ret, nr_vecs; 314 - size_t offset; 315 313 bool copy; 316 314 317 315 if (bio_integrity(bio)) ··· 323 323 else 324 324 direction = ITER_SOURCE; 325 325 326 - iov_iter_ubuf(&iter, direction, ubuf, bytes); 327 - nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1); 326 + nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1); 328 327 if (nr_vecs > BIO_MAX_VECS) 329 328 return -E2BIG; 330 
329 if (nr_vecs > UIO_FASTIOV) { ··· 333 334 pages = NULL; 334 335 } 335 336 336 - copy = !iov_iter_is_aligned(&iter, align, align); 337 - ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset); 337 + copy = !iov_iter_is_aligned(iter, align, align); 338 + ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset); 338 339 if (unlikely(ret < 0)) 339 340 goto free_bvec; 340 341 ··· 361 362 free_bvec: 362 363 if (bvec != stack_vec) 363 364 kfree(bvec); 365 + return ret; 366 + } 367 + 368 + static void bio_uio_meta_to_bip(struct bio *bio, struct uio_meta *meta) 369 + { 370 + struct bio_integrity_payload *bip = bio_integrity(bio); 371 + 372 + if (meta->flags & IO_INTEGRITY_CHK_GUARD) 373 + bip->bip_flags |= BIP_CHECK_GUARD; 374 + if (meta->flags & IO_INTEGRITY_CHK_APPTAG) 375 + bip->bip_flags |= BIP_CHECK_APPTAG; 376 + if (meta->flags & IO_INTEGRITY_CHK_REFTAG) 377 + bip->bip_flags |= BIP_CHECK_REFTAG; 378 + 379 + bip->app_tag = meta->app_tag; 380 + } 381 + 382 + int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) 383 + { 384 + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); 385 + unsigned int integrity_bytes; 386 + int ret; 387 + struct iov_iter it; 388 + 389 + if (!bi) 390 + return -EINVAL; 391 + /* 392 + * original meta iterator can be bigger. 393 + * process integrity info corresponding to current data buffer only. 
394 + */ 395 + it = meta->iter; 396 + integrity_bytes = bio_integrity_bytes(bi, bio_sectors(bio)); 397 + if (it.count < integrity_bytes) 398 + return -EINVAL; 399 + 400 + /* should fit into two bytes */ 401 + BUILD_BUG_ON(IO_INTEGRITY_VALID_FLAGS >= (1 << 16)); 402 + 403 + if (meta->flags && (meta->flags & ~IO_INTEGRITY_VALID_FLAGS)) 404 + return -EINVAL; 405 + 406 + it.count = integrity_bytes; 407 + ret = bio_integrity_map_user(bio, &it); 408 + if (!ret) { 409 + bio_uio_meta_to_bip(bio, meta); 410 + bip_set_seed(bio_integrity(bio), meta->seed); 411 + iov_iter_advance(&meta->iter, integrity_bytes); 412 + meta->seed += bio_integrity_intervals(bi, bio_sectors(bio)); 413 + } 364 414 return ret; 365 415 } 366 416 ··· 483 435 if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) 484 436 bip->bip_flags |= BIP_IP_CHECKSUM; 485 437 438 + /* describe what tags to check in payload */ 439 + if (bi->csum_type) 440 + bip->bip_flags |= BIP_CHECK_GUARD; 441 + if (bi->flags & BLK_INTEGRITY_REF_TAG) 442 + bip->bip_flags |= BIP_CHECK_REFTAG; 486 443 if (bio_integrity_add_page(bio, virt_to_page(buf), len, 487 444 offset_in_page(buf)) < len) { 488 445 printk(KERN_ERR "could not attach integrity payload\n"); ··· 612 559 613 560 bip->bip_vec = bip_src->bip_vec; 614 561 bip->bip_iter = bip_src->bip_iter; 615 - bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; 562 + bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS; 563 + bip->app_tag = bip_src->app_tag; 616 564 617 565 return 0; 618 566 }
+9 -1
block/blk-integrity.c
··· 115 115 int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, 116 116 ssize_t bytes) 117 117 { 118 - int ret = bio_integrity_map_user(rq->bio, ubuf, bytes); 118 + int ret; 119 + struct iov_iter iter; 120 + unsigned int direction; 119 121 122 + if (op_is_write(req_op(rq))) 123 + direction = ITER_DEST; 124 + else 125 + direction = ITER_SOURCE; 126 + iov_iter_ubuf(&iter, direction, ubuf, bytes); 127 + ret = bio_integrity_map_user(rq->bio, &iter); 120 128 if (ret) 121 129 return ret; 122 130
+35 -10
block/fops.c
··· 54 54 struct bio bio; 55 55 ssize_t ret; 56 56 57 + WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA); 57 58 if (nr_pages <= DIO_INLINE_BIO_VECS) 58 59 vecs = inline_vecs; 59 60 else { ··· 125 124 { 126 125 struct blkdev_dio *dio = bio->bi_private; 127 126 bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; 127 + bool is_sync = dio->flags & DIO_IS_SYNC; 128 128 129 129 if (bio->bi_status && !dio->bio.bi_status) 130 130 dio->bio.bi_status = bio->bi_status; 131 131 132 + if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA)) 133 + bio_integrity_unmap_user(bio); 134 + 132 135 if (atomic_dec_and_test(&dio->ref)) { 133 - if (!(dio->flags & DIO_IS_SYNC)) { 136 + if (!is_sync) { 134 137 struct kiocb *iocb = dio->iocb; 135 138 ssize_t ret; 136 139 ··· 226 221 * a retry of this from blocking context. 227 222 */ 228 223 if (unlikely(iov_iter_count(iter))) { 229 - bio_release_pages(bio, false); 230 - bio_clear_flag(bio, BIO_REFFED); 231 - bio_put(bio); 232 - blk_finish_plug(&plug); 233 - return -EAGAIN; 224 + ret = -EAGAIN; 225 + goto fail; 234 226 } 235 227 bio->bi_opf |= REQ_NOWAIT; 228 + } 229 + if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) { 230 + ret = bio_integrity_map_iter(bio, iocb->private); 231 + if (unlikely(ret)) 232 + goto fail; 236 233 } 237 234 238 235 if (is_read) { ··· 276 269 277 270 bio_put(&dio->bio); 278 271 return ret; 272 + fail: 273 + bio_release_pages(bio, false); 274 + bio_clear_flag(bio, BIO_REFFED); 275 + bio_put(bio); 276 + blk_finish_plug(&plug); 277 + return ret; 279 278 } 280 279 281 280 static void blkdev_bio_end_io_async(struct bio *bio) ··· 298 285 } else { 299 286 ret = blk_status_to_errno(bio->bi_status); 300 287 } 288 + 289 + if (iocb->ki_flags & IOCB_HAS_METADATA) 290 + bio_integrity_unmap_user(bio); 301 291 302 292 iocb->ki_complete(iocb, ret); 303 293 ··· 346 330 bio_iov_bvec_set(bio, iter); 347 331 } else { 348 332 ret = bio_iov_iter_get_pages(bio, iter); 349 - if (unlikely(ret)) { 350 - bio_put(bio); 351 - return ret; 
352 - } 333 + if (unlikely(ret)) 334 + goto out_bio_put; 353 335 } 354 336 dio->size = bio->bi_iter.bi_size; 355 337 ··· 358 344 } 359 345 } else { 360 346 task_io_account_write(bio->bi_iter.bi_size); 347 + } 348 + 349 + if (iocb->ki_flags & IOCB_HAS_METADATA) { 350 + ret = bio_integrity_map_iter(bio, iocb->private); 351 + WRITE_ONCE(iocb->private, NULL); 352 + if (unlikely(ret)) 353 + goto out_bio_put; 361 354 } 362 355 363 356 if (iocb->ki_flags & IOCB_ATOMIC) ··· 381 360 submit_bio(bio); 382 361 } 383 362 return -EIOCBQUEUED; 363 + 364 + out_bio_put: 365 + bio_put(bio); 366 + return ret; 384 367 } 385 368 386 369 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+13 -8
drivers/nvme/host/core.c
··· 885 885 return BLK_STS_OK; 886 886 } 887 887 888 + static void nvme_set_app_tag(struct request *req, struct nvme_command *cmnd) 889 + { 890 + cmnd->rw.lbat = cpu_to_le16(bio_integrity(req->bio)->app_tag); 891 + cmnd->rw.lbatm = cpu_to_le16(0xffff); 892 + } 893 + 888 894 static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd, 889 895 struct request *req) 890 896 { ··· 1023 1017 control |= NVME_RW_PRINFO_PRACT; 1024 1018 } 1025 1019 1026 - switch (ns->head->pi_type) { 1027 - case NVME_NS_DPS_PI_TYPE3: 1020 + if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD)) 1028 1021 control |= NVME_RW_PRINFO_PRCHK_GUARD; 1029 - break; 1030 - case NVME_NS_DPS_PI_TYPE1: 1031 - case NVME_NS_DPS_PI_TYPE2: 1032 - control |= NVME_RW_PRINFO_PRCHK_GUARD | 1033 - NVME_RW_PRINFO_PRCHK_REF; 1022 + if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG)) { 1023 + control |= NVME_RW_PRINFO_PRCHK_REF; 1034 1024 if (op == nvme_cmd_zone_append) 1035 1025 control |= NVME_RW_APPEND_PIREMAP; 1036 1026 nvme_set_ref_tag(ns, cmnd, req); 1037 - break; 1027 + } 1028 + if (bio_integrity_flagged(req->bio, BIP_CHECK_APPTAG)) { 1029 + control |= NVME_RW_PRINFO_PRCHK_APP; 1030 + nvme_set_app_tag(req, cmnd); 1038 1031 } 1039 1032 } 1040 1033
+2 -2
drivers/scsi/sd.c
··· 809 809 if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM)) 810 810 scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM; 811 811 812 - if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) 812 + if (bio_integrity_flagged(bio, BIP_CHECK_GUARD)) 813 813 scmd->prot_flags |= SCSI_PROT_GUARD_CHECK; 814 814 } 815 815 816 816 if (dif != T10_PI_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */ 817 817 scmd->prot_flags |= SCSI_PROT_REF_INCREMENT; 818 818 819 - if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) 819 + if (bio_integrity_flagged(bio, BIP_CHECK_REFTAG)) 820 820 scmd->prot_flags |= SCSI_PROT_REF_CHECK; 821 821 } 822 822
+18 -7
include/linux/bio-integrity.h
··· 7 7 enum bip_flags { 8 8 BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ 9 9 BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ 10 - BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ 11 - BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ 12 - BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ 13 - BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ 10 + BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */ 11 + BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */ 12 + BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */ 13 + BIP_CHECK_GUARD = 1 << 5, /* guard check */ 14 + BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ 15 + BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ 14 16 }; 15 17 16 18 struct bio_integrity_payload { ··· 23 21 unsigned short bip_vcnt; /* # of integrity bio_vecs */ 24 22 unsigned short bip_max_vcnt; /* integrity bio_vec slots */ 25 23 unsigned short bip_flags; /* control flags */ 24 + u16 app_tag; /* application tag value */ 26 25 27 26 struct bvec_iter bio_iter; /* for rewinding parent bio */ 28 27 ··· 32 29 struct bio_vec *bip_vec; 33 30 struct bio_vec bip_inline_vecs[];/* embedded bvec array */ 34 31 }; 32 + 33 + #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ 34 + BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) 35 35 36 36 #ifdef CONFIG_BLK_DEV_INTEGRITY 37 37 ··· 78 72 unsigned int nr); 79 73 int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, 80 74 unsigned int offset); 81 - int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len); 75 + int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); 76 + int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); 82 77 void bio_integrity_unmap_user(struct bio *bio); 83 78 bool bio_integrity_prep(struct bio *bio); 84 79 void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); ··· 105 98 { 106 99 } 107 100 108 - static inline 
int bio_integrity_map_user(struct bio *bio, void __user *ubuf, 109 - ssize_t len) 101 + static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) 102 + { 103 + return -EINVAL; 104 + } 105 + 106 + static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) 110 107 { 111 108 return -EINVAL; 112 109 }
+1
include/linux/fs.h
··· 349 349 #define IOCB_DIO_CALLER_COMP (1 << 22) 350 350 /* kiocb is a read or write operation submitted by fs/aio.c. */ 351 351 #define IOCB_AIO_RW (1 << 23) 352 + #define IOCB_HAS_METADATA (1 << 24) 352 353 353 354 /* for use in trace events */ 354 355 #define TRACE_IOCB_STRINGS \
+14 -12
include/linux/io_uring_types.h
··· 78 78 79 79 struct io_mapped_region { 80 80 struct page **pages; 81 - void *vmap_ptr; 82 - size_t nr_pages; 81 + void *ptr; 82 + unsigned nr_pages; 83 + unsigned flags; 83 84 }; 84 85 85 86 /* ··· 294 293 295 294 struct io_submit_state submit_state; 296 295 296 + /* 297 + * Modifications are protected by ->uring_lock and ->mmap_lock. 298 + * The flags, buf_pages and buf_nr_pages fields should be stable 299 + * once published. 300 + */ 297 301 struct xarray io_bl_xa; 298 302 299 303 struct io_hash_table cancel_table; ··· 430 424 * side will need to grab this lock, to prevent either side from 431 425 * being run concurrently with the other. 432 426 */ 433 - struct mutex resize_lock; 427 + struct mutex mmap_lock; 434 428 435 - /* 436 - * If IORING_SETUP_NO_MMAP is used, then the below holds 437 - * the gup'ed pages for the two rings, and the sqes. 438 - */ 439 - unsigned short n_ring_pages; 440 - unsigned short n_sqe_pages; 441 - struct page **ring_pages; 442 - struct page **sqe_pages; 443 - 429 + struct io_mapped_region sq_region; 430 + struct io_mapped_region ring_region; 444 431 /* used for optimised request parameter and wait argument passing */ 445 432 struct io_mapped_region param_region; 446 433 }; ··· 480 481 REQ_F_BL_NO_RECYCLE_BIT, 481 482 REQ_F_BUFFERS_COMMIT_BIT, 482 483 REQ_F_BUF_NODE_BIT, 484 + REQ_F_HAS_METADATA_BIT, 483 485 484 486 /* not a real bit, just to check we're not overflowing the space */ 485 487 __REQ_F_LAST_BIT, ··· 561 561 REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), 562 562 /* buf node is valid */ 563 563 REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT), 564 + /* request has read/write metadata assigned */ 565 + REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), 564 566 }; 565 567 566 568 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
+9
include/linux/uio.h
··· 82 82 }; 83 83 }; 84 84 85 + typedef __u16 uio_meta_flags_t; 86 + 87 + struct uio_meta { 88 + uio_meta_flags_t flags; 89 + u16 app_tag; 90 + u64 seed; 91 + struct iov_iter iter; 92 + }; 93 + 85 94 static inline const struct iovec *iter_iov(const struct iov_iter *iter) 86 95 { 87 96 if (iter->iter_type == ITER_UBUF)
+9
include/uapi/linux/fs.h
··· 40 40 #define BLOCK_SIZE_BITS 10 41 41 #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS) 42 42 43 + /* flags for integrity meta */ 44 + #define IO_INTEGRITY_CHK_GUARD (1U << 0) /* enforce guard check */ 45 + #define IO_INTEGRITY_CHK_REFTAG (1U << 1) /* enforce ref check */ 46 + #define IO_INTEGRITY_CHK_APPTAG (1U << 2) /* enforce app check */ 47 + 48 + #define IO_INTEGRITY_VALID_FLAGS (IO_INTEGRITY_CHK_GUARD | \ 49 + IO_INTEGRITY_CHK_REFTAG | \ 50 + IO_INTEGRITY_CHK_APPTAG) 51 + 43 52 #define SEEK_SET 0 /* seek relative to beginning of file */ 44 53 #define SEEK_CUR 1 /* seek relative to current file position */ 45 54 #define SEEK_END 2 /* seek relative to end of file */
+17
include/uapi/linux/io_uring.h
··· 98 98 __u64 addr3; 99 99 __u64 __pad2[1]; 100 100 }; 101 + struct { 102 + __u64 attr_ptr; /* pointer to attribute information */ 103 + __u64 attr_type_mask; /* bit mask of attributes */ 104 + }; 101 105 __u64 optval; 102 106 /* 103 107 * If the ring is initialized with IORING_SETUP_SQE128, then ··· 109 105 */ 110 106 __u8 cmd[0]; 111 107 }; 108 + }; 109 + 110 + /* sqe->attr_type_mask flags */ 111 + #define IORING_RW_ATTR_FLAG_PI (1U << 0) 112 + /* PI attribute information */ 113 + struct io_uring_attr_pi { 114 + __u16 flags; 115 + __u16 app_tag; 116 + __u32 len; 117 + __u64 addr; 118 + __u64 seed; 119 + __u64 rsvd; 112 120 }; 113 121 114 122 /* ··· 577 561 #define IORING_FEAT_REG_REG_RING (1U << 13) 578 562 #define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) 579 563 #define IORING_FEAT_MIN_TIMEOUT (1U << 15) 564 + #define IORING_FEAT_RW_ATTR (1U << 16) 580 565 581 566 /* 582 567 * io_uring_register(2) opcodes and arguments
+13
io_uring/alloc_cache.h
··· 30 30 return NULL; 31 31 } 32 32 33 + static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp, 34 + void (*init_once)(void *obj)) 35 + { 36 + if (unlikely(!cache->nr_cached)) { 37 + void *obj = kmalloc(cache->elem_size, gfp); 38 + 39 + if (obj && init_once) 40 + init_once(obj); 41 + return obj; 42 + } 43 + return io_alloc_cache_get(cache); 44 + } 45 + 33 46 /* returns false if the cache was initialized properly */ 34 47 static inline bool io_alloc_cache_init(struct io_alloc_cache *cache, 35 48 unsigned max_nr, size_t size)
+5 -4
io_uring/fdinfo.c
··· 211 211 212 212 if (ctx->file_table.data.nodes[i]) 213 213 f = io_slot_file(ctx->file_table.data.nodes[i]); 214 - if (f) 215 - seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); 216 - else 217 - seq_printf(m, "%5u: <none>\n", i); 214 + if (f) { 215 + seq_printf(m, "%5u: ", i); 216 + seq_file_path(m, f, " \t\n\\"); 217 + seq_puts(m, "\n"); 218 + } 218 219 } 219 220 seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr); 220 221 for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {
+1 -12
io_uring/futex.c
··· 251 251 io_req_task_work_add(req); 252 252 } 253 253 254 - static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx) 255 - { 256 - struct io_futex_data *ifd; 257 - 258 - ifd = io_alloc_cache_get(&ctx->futex_cache); 259 - if (ifd) 260 - return ifd; 261 - 262 - return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT); 263 - } 264 - 265 254 int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) 266 255 { 267 256 struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); ··· 320 331 } 321 332 322 333 io_ring_submit_lock(ctx, issue_flags); 323 - ifd = io_alloc_ifd(ctx); 334 + ifd = io_cache_alloc(&ctx->futex_cache, GFP_NOWAIT, NULL); 324 335 if (!ifd) { 325 336 ret = -ENOMEM; 326 337 goto done_unlock;
+61 -77
io_uring/io_uring.c
··· 115 115 REQ_F_ASYNC_DATA) 116 116 117 117 #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ 118 - IO_REQ_CLEAN_FLAGS) 118 + REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS) 119 119 120 120 #define IO_TCTX_REFS_CACHE_NR (1U << 10) 121 121 ··· 143 143 144 144 static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 145 145 struct io_uring_task *tctx, 146 - bool cancel_all); 146 + bool cancel_all, 147 + bool is_sqpoll_thread); 147 148 148 149 static void io_queue_sqe(struct io_kiocb *req); 149 150 ··· 351 350 INIT_WQ_LIST(&ctx->submit_state.compl_reqs); 352 351 INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); 353 352 io_napi_init(ctx); 354 - mutex_init(&ctx->resize_lock); 353 + mutex_init(&ctx->mmap_lock); 355 354 356 355 return ctx; 357 356 ··· 362 361 io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); 363 362 io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); 364 363 io_alloc_cache_free(&ctx->uring_cache, kfree); 365 - io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); 364 + io_alloc_cache_free(&ctx->msg_cache, kfree); 366 365 io_futex_cache_free(ctx); 367 366 kvfree(ctx->cancel_table.hbs); 368 367 xa_destroy(&ctx->io_bl_xa); ··· 551 550 io_req_task_work_add(req); 552 551 } 553 552 554 - static __cold void io_queue_deferred(struct io_ring_ctx *ctx) 553 + static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) 555 554 { 555 + spin_lock(&ctx->completion_lock); 556 556 while (!list_empty(&ctx->defer_list)) { 557 557 struct io_defer_entry *de = list_first_entry(&ctx->defer_list, 558 558 struct io_defer_entry, list); ··· 564 562 io_req_task_queue(de->req); 565 563 kfree(de); 566 564 } 565 + spin_unlock(&ctx->completion_lock); 567 566 } 568 567 569 568 void __io_commit_cqring_flush(struct io_ring_ctx *ctx) ··· 573 570 io_poll_wq_wake(ctx); 574 571 if (ctx->off_timeout_used) 575 572 io_flush_timeouts(ctx); 576 - if (ctx->drain_active) { 577 - spin_lock(&ctx->completion_lock); 573 + if (ctx->drain_active) 578 
574 io_queue_deferred(ctx); 579 - spin_unlock(&ctx->completion_lock); 580 - } 581 575 if (ctx->has_evfd) 582 576 io_eventfd_flush_signal(ctx); 583 577 } ··· 1401 1401 comp_list); 1402 1402 1403 1403 if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { 1404 + if (req->flags & REQ_F_REISSUE) { 1405 + node = req->comp_list.next; 1406 + req->flags &= ~REQ_F_REISSUE; 1407 + io_queue_iowq(req); 1408 + continue; 1409 + } 1404 1410 if (req->flags & REQ_F_REFCOUNT) { 1405 1411 node = req->comp_list.next; 1406 1412 if (!req_ref_put_and_test(req)) ··· 1446 1440 struct io_kiocb *req = container_of(node, struct io_kiocb, 1447 1441 comp_list); 1448 1442 1449 - if (!(req->flags & REQ_F_CQE_SKIP) && 1443 + /* 1444 + * Requests marked with REQUEUE should not post a CQE, they 1445 + * will go through the io-wq retry machinery and post one 1446 + * later. 1447 + */ 1448 + if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && 1450 1449 unlikely(!io_fill_cqe_req(ctx, req))) { 1451 1450 if (ctx->lockless_cq) { 1452 1451 spin_lock(&ctx->completion_lock); ··· 1649 1638 if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) 1650 1639 res |= REQ_F_SUPPORT_NOWAIT; 1651 1640 return res; 1652 - } 1653 - 1654 - bool io_alloc_async_data(struct io_kiocb *req) 1655 - { 1656 - const struct io_issue_def *def = &io_issue_defs[req->opcode]; 1657 - 1658 - WARN_ON_ONCE(!def->async_size); 1659 - req->async_data = kmalloc(def->async_size, GFP_KERNEL); 1660 - if (req->async_data) { 1661 - req->flags |= REQ_F_ASYNC_DATA; 1662 - return false; 1663 - } 1664 - return true; 1665 1641 } 1666 1642 1667 1643 static u32 io_get_sequence(struct io_kiocb *req) ··· 2629 2631 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; 2630 2632 } 2631 2633 2632 - static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, 2633 - size_t size) 2634 - { 2635 - return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, 2636 - size); 2637 - } 2638 - 2639 - static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, 2640 - size_t size) 2641 - { 2642 - return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, 2643 - size); 2644 - } 2645 - 2646 2634 static void io_rings_free(struct io_ring_ctx *ctx) 2647 2635 { 2648 - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { 2649 - io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, 2650 - true); 2651 - io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, 2652 - true); 2653 - } else { 2654 - io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); 2655 - ctx->n_ring_pages = 0; 2656 - io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); 2657 - ctx->n_sqe_pages = 0; 2658 - vunmap(ctx->rings); 2659 - vunmap(ctx->sq_sqes); 2660 - } 2661 - 2636 + io_free_region(ctx, &ctx->sq_region); 2637 + io_free_region(ctx, &ctx->ring_region); 2662 2638 ctx->rings = NULL; 2663 2639 ctx->sq_sqes = NULL; 2664 2640 } ··· 2704 2732 io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); 2705 2733 io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); 2706 2734 io_alloc_cache_free(&ctx->uring_cache, kfree); 2707 - io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); 2735 + io_alloc_cache_free(&ctx->msg_cache, kfree); 2708 2736 io_futex_cache_free(ctx); 2709 2737 io_destroy_buffers(ctx); 2710 2738 io_free_region(ctx, &ctx->param_region); ··· 2866 2894 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) 2867 2895 io_move_task_work_from_local(ctx); 2868 2896 2869 - while (io_uring_try_cancel_requests(ctx, NULL, true)) 2897 + /* The SQPOLL thread never reaches this path */ 2898 + while (io_uring_try_cancel_requests(ctx, NULL, true, false)) 2870 2899 cond_resched(); 2871 2900 2872 2901 if (ctx->sq_data) { ··· 3035 3062 3036 3063 
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 3037 3064 struct io_uring_task *tctx, 3038 - bool cancel_all) 3065 + bool cancel_all, 3066 + bool is_sqpoll_thread) 3039 3067 { 3040 3068 struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; 3041 3069 enum io_wq_cancel cret; ··· 3066 3092 3067 3093 /* SQPOLL thread does its own polling */ 3068 3094 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || 3069 - (ctx->sq_data && ctx->sq_data->thread == current)) { 3095 + is_sqpoll_thread) { 3070 3096 while (!wq_list_empty(&ctx->iopoll_list)) { 3071 3097 io_iopoll_try_reap_events(ctx); 3072 3098 ret = true; ··· 3139 3165 continue; 3140 3166 loop |= io_uring_try_cancel_requests(node->ctx, 3141 3167 current->io_uring, 3142 - cancel_all); 3168 + cancel_all, 3169 + false); 3143 3170 } 3144 3171 } else { 3145 3172 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 3146 3173 loop |= io_uring_try_cancel_requests(ctx, 3147 3174 current->io_uring, 3148 - cancel_all); 3175 + cancel_all, 3176 + true); 3149 3177 } 3150 3178 3151 3179 if (loop) { ··· 3209 3233 end > ctx->cq_wait_size)) 3210 3234 return ERR_PTR(-EFAULT); 3211 3235 3236 + offset = array_index_nospec(offset, ctx->cq_wait_size - size); 3212 3237 return ctx->cq_wait_arg + offset; 3213 3238 } 3214 3239 ··· 3454 3477 static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, 3455 3478 struct io_uring_params *p) 3456 3479 { 3480 + struct io_uring_region_desc rd; 3457 3481 struct io_rings *rings; 3458 3482 size_t size, sq_array_offset; 3459 - void *ptr; 3483 + int ret; 3460 3484 3461 3485 /* make sure these are sane, as we already accounted them */ 3462 3486 ctx->sq_entries = p->sq_entries; ··· 3468 3490 if (size == SIZE_MAX) 3469 3491 return -EOVERFLOW; 3470 3492 3471 - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) 3472 - rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); 3473 - else 3474 - rings = io_rings_map(ctx, p->cq_off.user_addr, size); 3493 + memset(&rd, 
0, sizeof(rd)); 3494 + rd.size = PAGE_ALIGN(size); 3495 + if (ctx->flags & IORING_SETUP_NO_MMAP) { 3496 + rd.user_addr = p->cq_off.user_addr; 3497 + rd.flags |= IORING_MEM_REGION_TYPE_USER; 3498 + } 3499 + ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING); 3500 + if (ret) 3501 + return ret; 3502 + ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); 3475 3503 3476 - if (IS_ERR(rings)) 3477 - return PTR_ERR(rings); 3478 - 3479 - ctx->rings = rings; 3480 3504 if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 3481 3505 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 3482 3506 rings->sq_ring_mask = p->sq_entries - 1; ··· 3495 3515 return -EOVERFLOW; 3496 3516 } 3497 3517 3498 - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) 3499 - ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); 3500 - else 3501 - ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); 3502 - 3503 - if (IS_ERR(ptr)) { 3504 - io_rings_free(ctx); 3505 - return PTR_ERR(ptr); 3518 + memset(&rd, 0, sizeof(rd)); 3519 + rd.size = PAGE_ALIGN(size); 3520 + if (ctx->flags & IORING_SETUP_NO_MMAP) { 3521 + rd.user_addr = p->sq_off.user_addr; 3522 + rd.flags |= IORING_MEM_REGION_TYPE_USER; 3506 3523 } 3507 - 3508 - ctx->sq_sqes = ptr; 3524 + ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES); 3525 + if (ret) { 3526 + io_rings_free(ctx); 3527 + return ret; 3528 + } 3529 + ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); 3509 3530 return 0; 3510 3531 } 3511 3532 ··· 3714 3733 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | 3715 3734 IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | 3716 3735 IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | 3717 - IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT; 3736 + IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT | 3737 + IORING_FEAT_RW_ATTR; 3718 3738 3719 3739 if (copy_to_user(params, p, sizeof(*p))) { 3720 3740 ret = -EFAULT; ··· 3876 3894 BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); 3877 3895 
BUILD_BUG_SQE_ELEM(48, __u64, addr3); 3878 3896 BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); 3897 + BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr); 3898 + BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask); 3879 3899 BUILD_BUG_SQE_ELEM(56, __u64, __pad2); 3880 3900 3881 3901 BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
+23
io_uring/io_uring.h
··· 8 8 #include <linux/poll.h> 9 9 #include <linux/io_uring_types.h> 10 10 #include <uapi/linux/eventpoll.h> 11 + #include "alloc_cache.h" 11 12 #include "io-wq.h" 12 13 #include "slist.h" 13 14 #include "filetable.h" 15 + #include "opdef.h" 14 16 15 17 #ifndef CREATE_TRACE_POINTS 16 18 #include <trace/events/io_uring.h> ··· 223 221 { 224 222 req->cqe.res = res; 225 223 req->cqe.flags = cflags; 224 + } 225 + 226 + static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, 227 + struct io_kiocb *req, 228 + void (*init_once)(void *obj)) 229 + { 230 + req->async_data = io_cache_alloc(cache, GFP_KERNEL, init_once); 231 + if (req->async_data) 232 + req->flags |= REQ_F_ASYNC_DATA; 233 + return req->async_data; 234 + } 235 + 236 + static inline void *io_uring_alloc_async_data_nocache(struct io_kiocb *req) 237 + { 238 + const struct io_issue_def *def = &io_issue_defs[req->opcode]; 239 + 240 + WARN_ON_ONCE(!def->async_size); 241 + req->async_data = kmalloc(def->async_size, GFP_KERNEL); 242 + if (req->async_data) 243 + req->flags |= REQ_F_ASYNC_DATA; 244 + return req->async_data; 226 245 } 227 246 228 247 static inline bool req_has_async_data(struct io_kiocb *req)
+70 -158
io_uring/kbuf.c
··· 45 45 /* 46 46 * Store buffer group ID and finally mark the list as visible. 47 47 * The normal lookup doesn't care about the visibility as we're 48 - * always under the ->uring_lock, but the RCU lookup from mmap does. 48 + * always under the ->uring_lock, but lookups from mmap do. 49 49 */ 50 50 bl->bgid = bgid; 51 - atomic_set(&bl->refs, 1); 51 + guard(mutex)(&ctx->mmap_lock); 52 52 return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); 53 53 } 54 54 ··· 353 353 354 354 if (bl->flags & IOBL_BUF_RING) { 355 355 i = bl->buf_ring->tail - bl->head; 356 - if (bl->buf_nr_pages) { 357 - int j; 358 - 359 - if (!(bl->flags & IOBL_MMAP)) { 360 - for (j = 0; j < bl->buf_nr_pages; j++) 361 - unpin_user_page(bl->buf_pages[j]); 362 - } 363 - io_pages_unmap(bl->buf_ring, &bl->buf_pages, 364 - &bl->buf_nr_pages, bl->flags & IOBL_MMAP); 365 - bl->flags &= ~IOBL_MMAP; 366 - } 356 + io_free_region(ctx, &bl->region); 367 357 /* make sure it's seen as empty */ 368 358 INIT_LIST_HEAD(&bl->buf_list); 369 359 bl->flags &= ~IOBL_BUF_RING; ··· 376 386 return i; 377 387 } 378 388 379 - void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) 389 + static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) 380 390 { 381 - if (atomic_dec_and_test(&bl->refs)) { 382 - __io_remove_buffers(ctx, bl, -1U); 383 - kfree_rcu(bl, rcu); 384 - } 391 + __io_remove_buffers(ctx, bl, -1U); 392 + kfree(bl); 385 393 } 386 394 387 395 void io_destroy_buffers(struct io_ring_ctx *ctx) ··· 387 399 struct io_buffer_list *bl; 388 400 struct list_head *item, *tmp; 389 401 struct io_buffer *buf; 390 - unsigned long index; 391 402 392 - xa_for_each(&ctx->io_bl_xa, index, bl) { 393 - xa_erase(&ctx->io_bl_xa, bl->bgid); 403 + while (1) { 404 + unsigned long index = 0; 405 + 406 + scoped_guard(mutex, &ctx->mmap_lock) { 407 + bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT); 408 + if (bl) 409 + xa_erase(&ctx->io_bl_xa, bl->bgid); 410 + } 411 + if (!bl) 412 + break; 394 413 
io_put_bl(ctx, bl); 395 414 } 396 415 ··· 586 591 INIT_LIST_HEAD(&bl->buf_list); 587 592 ret = io_buffer_add_list(ctx, bl, p->bgid); 588 593 if (ret) { 589 - /* 590 - * Doesn't need rcu free as it was never visible, but 591 - * let's keep it consistent throughout. 592 - */ 593 - kfree_rcu(bl, rcu); 594 + kfree(bl); 594 595 goto err; 595 596 } 596 597 } ··· 606 615 return IOU_OK; 607 616 } 608 617 609 - static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, 610 - struct io_buffer_list *bl) 611 - { 612 - struct io_uring_buf_ring *br = NULL; 613 - struct page **pages; 614 - int nr_pages, ret; 615 - 616 - pages = io_pin_pages(reg->ring_addr, 617 - flex_array_size(br, bufs, reg->ring_entries), 618 - &nr_pages); 619 - if (IS_ERR(pages)) 620 - return PTR_ERR(pages); 621 - 622 - br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); 623 - if (!br) { 624 - ret = -ENOMEM; 625 - goto error_unpin; 626 - } 627 - 628 - #ifdef SHM_COLOUR 629 - /* 630 - * On platforms that have specific aliasing requirements, SHM_COLOUR 631 - * is set and we must guarantee that the kernel and user side align 632 - * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and 633 - * the application mmap's the provided ring buffer. Fail the request 634 - * if we, by chance, don't end up with aligned addresses. The app 635 - * should use IOU_PBUF_RING_MMAP instead, and liburing will handle 636 - * this transparently. 
637 - */ 638 - if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { 639 - ret = -EINVAL; 640 - goto error_unpin; 641 - } 642 - #endif 643 - bl->buf_pages = pages; 644 - bl->buf_nr_pages = nr_pages; 645 - bl->buf_ring = br; 646 - bl->flags |= IOBL_BUF_RING; 647 - bl->flags &= ~IOBL_MMAP; 648 - return 0; 649 - error_unpin: 650 - unpin_user_pages(pages, nr_pages); 651 - kvfree(pages); 652 - vunmap(br); 653 - return ret; 654 - } 655 - 656 - static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, 657 - struct io_uring_buf_reg *reg, 658 - struct io_buffer_list *bl) 659 - { 660 - size_t ring_size; 661 - 662 - ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); 663 - 664 - bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); 665 - if (IS_ERR(bl->buf_ring)) { 666 - bl->buf_ring = NULL; 667 - return -ENOMEM; 668 - } 669 - 670 - bl->flags |= (IOBL_BUF_RING | IOBL_MMAP); 671 - return 0; 672 - } 673 - 674 618 int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) 675 619 { 676 620 struct io_uring_buf_reg reg; 677 621 struct io_buffer_list *bl, *free_bl = NULL; 622 + struct io_uring_region_desc rd; 623 + struct io_uring_buf_ring *br; 624 + unsigned long mmap_offset; 625 + unsigned long ring_size; 678 626 int ret; 679 627 680 628 lockdep_assert_held(&ctx->uring_lock); ··· 625 695 return -EINVAL; 626 696 if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) 627 697 return -EINVAL; 628 - if (!(reg.flags & IOU_PBUF_RING_MMAP)) { 629 - if (!reg.ring_addr) 630 - return -EFAULT; 631 - if (reg.ring_addr & ~PAGE_MASK) 632 - return -EINVAL; 633 - } else { 634 - if (reg.ring_addr) 635 - return -EINVAL; 636 - } 637 - 638 698 if (!is_power_of_2(reg.ring_entries)) 639 699 return -EINVAL; 640 - 641 700 /* cannot disambiguate full vs empty due to head/tail size */ 642 701 if (reg.ring_entries >= 65536) 643 702 return -EINVAL; ··· 642 723 return -ENOMEM; 643 724 } 644 725 645 - if (!(reg.flags & IOU_PBUF_RING_MMAP)) 646 - ret = 
io_pin_pbuf_ring(&reg, bl); 647 - else 648 - ret = io_alloc_pbuf_ring(ctx, &reg, bl); 726 + mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT; 727 + ring_size = flex_array_size(br, bufs, reg.ring_entries); 649 728 650 - if (!ret) { 651 - bl->nr_entries = reg.ring_entries; 652 - bl->mask = reg.ring_entries - 1; 653 - if (reg.flags & IOU_PBUF_RING_INC) 654 - bl->flags |= IOBL_INC; 655 - 656 - io_buffer_add_list(ctx, bl, reg.bgid); 657 - return 0; 729 + memset(&rd, 0, sizeof(rd)); 730 + rd.size = PAGE_ALIGN(ring_size); 731 + if (!(reg.flags & IOU_PBUF_RING_MMAP)) { 732 + rd.user_addr = reg.ring_addr; 733 + rd.flags |= IORING_MEM_REGION_TYPE_USER; 658 734 } 735 + ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset); 736 + if (ret) 737 + goto fail; 738 + br = io_region_get_ptr(&bl->region); 659 739 660 - kfree_rcu(free_bl, rcu); 740 + #ifdef SHM_COLOUR 741 + /* 742 + * On platforms that have specific aliasing requirements, SHM_COLOUR 743 + * is set and we must guarantee that the kernel and user side align 744 + * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and 745 + * the application mmap's the provided ring buffer. Fail the request 746 + * if we, by chance, don't end up with aligned addresses. The app 747 + * should use IOU_PBUF_RING_MMAP instead, and liburing will handle 748 + * this transparently. 
749 + */ 750 + if (!(reg.flags & IOU_PBUF_RING_MMAP) && 751 + ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) { 752 + ret = -EINVAL; 753 + goto fail; 754 + } 755 + #endif 756 + 757 + bl->nr_entries = reg.ring_entries; 758 + bl->mask = reg.ring_entries - 1; 759 + bl->flags |= IOBL_BUF_RING; 760 + bl->buf_ring = br; 761 + if (reg.flags & IOU_PBUF_RING_INC) 762 + bl->flags |= IOBL_INC; 763 + io_buffer_add_list(ctx, bl, reg.bgid); 764 + return 0; 765 + fail: 766 + io_free_region(ctx, &bl->region); 767 + kfree(free_bl); 661 768 return ret; 662 769 } 663 770 ··· 707 762 if (!(bl->flags & IOBL_BUF_RING)) 708 763 return -EINVAL; 709 764 710 - xa_erase(&ctx->io_bl_xa, bl->bgid); 765 + scoped_guard(mutex, &ctx->mmap_lock) 766 + xa_erase(&ctx->io_bl_xa, bl->bgid); 767 + 711 768 io_put_bl(ctx, bl); 712 769 return 0; 713 770 } ··· 740 793 return 0; 741 794 } 742 795 743 - struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, 744 - unsigned long bgid) 796 + struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, 797 + unsigned int bgid) 745 798 { 746 799 struct io_buffer_list *bl; 747 - bool ret; 748 800 749 - /* 750 - * We have to be a bit careful here - we're inside mmap and cannot grab 751 - * the uring_lock. This means the buffer_list could be simultaneously 752 - * going away, if someone is trying to be sneaky. Look it up under rcu 753 - * so we know it's not going away, and attempt to grab a reference to 754 - * it. If the ref is already zero, then fail the mapping. If successful, 755 - * the caller will call io_put_bl() to drop the the reference at at the 756 - * end. This may then safely free the buffer_list (and drop the pages) 757 - * at that point, vm_insert_pages() would've already grabbed the 758 - * necessary vma references. 
759 - */ 760 - rcu_read_lock(); 801 + lockdep_assert_held(&ctx->mmap_lock); 802 + 761 803 bl = xa_load(&ctx->io_bl_xa, bgid); 762 - /* must be a mmap'able buffer ring and have pages */ 763 - ret = false; 764 - if (bl && bl->flags & IOBL_MMAP) 765 - ret = atomic_inc_not_zero(&bl->refs); 766 - rcu_read_unlock(); 767 - 768 - if (ret) 769 - return bl; 770 - 771 - return ERR_PTR(-EINVAL); 772 - } 773 - 774 - int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) 775 - { 776 - struct io_ring_ctx *ctx = file->private_data; 777 - loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; 778 - struct io_buffer_list *bl; 779 - int bgid, ret; 780 - 781 - bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; 782 - bl = io_pbuf_get_bl(ctx, bgid); 783 - if (IS_ERR(bl)) 784 - return PTR_ERR(bl); 785 - 786 - ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); 787 - io_put_bl(ctx, bl); 788 - return ret; 804 + if (!bl || !(bl->flags & IOBL_BUF_RING)) 805 + return NULL; 806 + return &bl->region; 789 807 }
+6 -14
io_uring/kbuf.h
··· 3 3 #define IOU_KBUF_H 4 4 5 5 #include <uapi/linux/io_uring.h> 6 + #include <linux/io_uring_types.h> 6 7 7 8 enum { 8 9 /* ring mapped provided buffers */ 9 10 IOBL_BUF_RING = 1, 10 - /* ring mapped provided buffers, but mmap'ed by application */ 11 - IOBL_MMAP = 2, 12 11 /* buffers are consumed incrementally rather than always fully */ 13 - IOBL_INC = 4, 14 - 12 + IOBL_INC = 2, 15 13 }; 16 14 17 15 struct io_buffer_list { ··· 19 21 */ 20 22 union { 21 23 struct list_head buf_list; 22 - struct { 23 - struct page **buf_pages; 24 - struct io_uring_buf_ring *buf_ring; 25 - }; 26 - struct rcu_head rcu; 24 + struct io_uring_buf_ring *buf_ring; 27 25 }; 28 26 __u16 bgid; 29 27 ··· 31 37 32 38 __u16 flags; 33 39 34 - atomic_t refs; 40 + struct io_mapped_region region; 35 41 }; 36 42 37 43 struct io_buffer { ··· 78 84 79 85 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); 80 86 81 - void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); 82 - struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, 83 - unsigned long bgid); 84 - int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma); 87 + struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, 88 + unsigned int bgid); 85 89 86 90 static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) 87 91 {
+194 -205
io_uring/memmap.c
··· 36 36 return page_address(page); 37 37 } 38 38 39 - static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, 40 - gfp_t gfp) 41 - { 42 - void *ret; 43 - int i; 44 - 45 - for (i = 0; i < nr_pages; i++) { 46 - pages[i] = alloc_page(gfp); 47 - if (!pages[i]) 48 - goto err; 49 - } 50 - 51 - ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); 52 - if (ret) 53 - return ret; 54 - err: 55 - while (i--) 56 - put_page(pages[i]); 57 - return ERR_PTR(-ENOMEM); 58 - } 59 - 60 - void *io_pages_map(struct page ***out_pages, unsigned short *npages, 61 - size_t size) 62 - { 63 - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; 64 - struct page **pages; 65 - int nr_pages; 66 - void *ret; 67 - 68 - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 69 - pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); 70 - if (!pages) 71 - return ERR_PTR(-ENOMEM); 72 - 73 - ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); 74 - if (!IS_ERR(ret)) 75 - goto done; 76 - if (nr_pages == 1) 77 - goto fail; 78 - 79 - ret = io_mem_alloc_single(pages, nr_pages, size, gfp); 80 - if (!IS_ERR(ret)) { 81 - done: 82 - *out_pages = pages; 83 - *npages = nr_pages; 84 - return ret; 85 - } 86 - fail: 87 - kvfree(pages); 88 - *out_pages = NULL; 89 - *npages = 0; 90 - return ret; 91 - } 92 - 93 - void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, 94 - bool put_pages) 95 - { 96 - bool do_vunmap = false; 97 - 98 - if (!ptr) 99 - return; 100 - 101 - if (put_pages && *npages) { 102 - struct page **to_free = *pages; 103 - int i; 104 - 105 - /* 106 - * Only did vmap for the non-compound multiple page case. 107 - * For the compound page, we just need to put the head. 
108 - */ 109 - if (PageCompound(to_free[0])) 110 - *npages = 1; 111 - else if (*npages > 1) 112 - do_vunmap = true; 113 - for (i = 0; i < *npages; i++) 114 - put_page(to_free[i]); 115 - } 116 - if (do_vunmap) 117 - vunmap(ptr); 118 - kvfree(*pages); 119 - *pages = NULL; 120 - *npages = 0; 121 - } 122 - 123 - void io_pages_free(struct page ***pages, int npages) 124 - { 125 - struct page **page_array = *pages; 126 - 127 - if (!page_array) 128 - return; 129 - 130 - unpin_user_pages(page_array, npages); 131 - kvfree(page_array); 132 - *pages = NULL; 133 - } 134 - 135 39 struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) 136 40 { 137 41 unsigned long start, end, nr_pages; ··· 78 174 return ERR_PTR(ret); 79 175 } 80 176 81 - void *__io_uaddr_map(struct page ***pages, unsigned short *npages, 82 - unsigned long uaddr, size_t size) 83 - { 84 - struct page **page_array; 85 - unsigned int nr_pages; 86 - void *page_addr; 87 - 88 - *npages = 0; 89 - 90 - if (uaddr & (PAGE_SIZE - 1) || !size) 91 - return ERR_PTR(-EINVAL); 92 - 93 - nr_pages = 0; 94 - page_array = io_pin_pages(uaddr, size, &nr_pages); 95 - if (IS_ERR(page_array)) 96 - return page_array; 97 - 98 - page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); 99 - if (page_addr) { 100 - *pages = page_array; 101 - *npages = nr_pages; 102 - return page_addr; 103 - } 104 - 105 - io_pages_free(&page_array, nr_pages); 106 - return ERR_PTR(-ENOMEM); 107 - } 177 + enum { 178 + /* memory was vmap'ed for the kernel, freeing the region vunmap's it */ 179 + IO_REGION_F_VMAP = 1, 180 + /* memory is provided by user and pinned by the kernel */ 181 + IO_REGION_F_USER_PROVIDED = 2, 182 + /* only the first page in the array is ref'ed */ 183 + IO_REGION_F_SINGLE_REF = 4, 184 + }; 108 185 109 186 void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) 110 187 { 111 188 if (mr->pages) { 112 - unpin_user_pages(mr->pages, mr->nr_pages); 189 + long nr_refs = mr->nr_pages; 190 + 191 + if 
(mr->flags & IO_REGION_F_SINGLE_REF) 192 + nr_refs = 1; 193 + 194 + if (mr->flags & IO_REGION_F_USER_PROVIDED) 195 + unpin_user_pages(mr->pages, nr_refs); 196 + else 197 + release_pages(mr->pages, nr_refs); 198 + 113 199 kvfree(mr->pages); 114 200 } 115 - if (mr->vmap_ptr) 116 - vunmap(mr->vmap_ptr); 201 + if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) 202 + vunmap(mr->ptr); 117 203 if (mr->nr_pages && ctx->user) 118 204 __io_unaccount_mem(ctx->user, mr->nr_pages); 119 205 120 206 memset(mr, 0, sizeof(*mr)); 121 207 } 122 208 123 - int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, 124 - struct io_uring_region_desc *reg) 209 + static int io_region_init_ptr(struct io_mapped_region *mr) 125 210 { 126 - int pages_accounted = 0; 211 + struct io_imu_folio_data ifd; 212 + void *ptr; 213 + 214 + if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) { 215 + if (ifd.nr_folios == 1) { 216 + mr->ptr = page_address(mr->pages[0]); 217 + return 0; 218 + } 219 + } 220 + ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL); 221 + if (!ptr) 222 + return -ENOMEM; 223 + 224 + mr->ptr = ptr; 225 + mr->flags |= IO_REGION_F_VMAP; 226 + return 0; 227 + } 228 + 229 + static int io_region_pin_pages(struct io_ring_ctx *ctx, 230 + struct io_mapped_region *mr, 231 + struct io_uring_region_desc *reg) 232 + { 233 + unsigned long size = mr->nr_pages << PAGE_SHIFT; 127 234 struct page **pages; 235 + int nr_pages; 236 + 237 + pages = io_pin_pages(reg->user_addr, size, &nr_pages); 238 + if (IS_ERR(pages)) 239 + return PTR_ERR(pages); 240 + if (WARN_ON_ONCE(nr_pages != mr->nr_pages)) 241 + return -EFAULT; 242 + 243 + mr->pages = pages; 244 + mr->flags |= IO_REGION_F_USER_PROVIDED; 245 + return 0; 246 + } 247 + 248 + static int io_region_allocate_pages(struct io_ring_ctx *ctx, 249 + struct io_mapped_region *mr, 250 + struct io_uring_region_desc *reg, 251 + unsigned long mmap_offset) 252 + { 253 + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; 254 + 
unsigned long size = mr->nr_pages << PAGE_SHIFT; 255 + unsigned long nr_allocated; 256 + struct page **pages; 257 + void *p; 258 + 259 + pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp); 260 + if (!pages) 261 + return -ENOMEM; 262 + 263 + p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp); 264 + if (!IS_ERR(p)) { 265 + mr->flags |= IO_REGION_F_SINGLE_REF; 266 + goto done; 267 + } 268 + 269 + nr_allocated = alloc_pages_bulk_array_node(gfp, NUMA_NO_NODE, 270 + mr->nr_pages, pages); 271 + if (nr_allocated != mr->nr_pages) { 272 + if (nr_allocated) 273 + release_pages(pages, nr_allocated); 274 + kvfree(pages); 275 + return -ENOMEM; 276 + } 277 + done: 278 + reg->mmap_offset = mmap_offset; 279 + mr->pages = pages; 280 + return 0; 281 + } 282 + 283 + int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, 284 + struct io_uring_region_desc *reg, 285 + unsigned long mmap_offset) 286 + { 128 287 int nr_pages, ret; 129 - void *vptr; 130 288 u64 end; 131 289 132 - if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages)) 290 + if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages)) 133 291 return -EFAULT; 134 292 if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv))) 135 293 return -EINVAL; 136 - if (reg->flags != IORING_MEM_REGION_TYPE_USER) 294 + if (reg->flags & ~IORING_MEM_REGION_TYPE_USER) 137 295 return -EINVAL; 138 - if (!reg->user_addr) 296 + /* user_addr should be set IFF it's a user memory backed region */ 297 + if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr) 139 298 return -EFAULT; 140 299 if (!reg->size || reg->mmap_offset || reg->id) 141 300 return -EINVAL; ··· 209 242 if (check_add_overflow(reg->user_addr, reg->size, &end)) 210 243 return -EOVERFLOW; 211 244 212 - pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages); 213 - if (IS_ERR(pages)) 214 - return PTR_ERR(pages); 215 - 245 + nr_pages = reg->size >> PAGE_SHIFT; 216 246 if (ctx->user) { 217 247 ret = __io_account_mem(ctx->user, nr_pages); 218 
248 if (ret) 219 - goto out_free; 220 - pages_accounted = nr_pages; 249 + return ret; 221 250 } 222 - 223 - vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); 224 - if (!vptr) { 225 - ret = -ENOMEM; 226 - goto out_free; 227 - } 228 - 229 - mr->pages = pages; 230 - mr->vmap_ptr = vptr; 231 251 mr->nr_pages = nr_pages; 252 + 253 + if (reg->flags & IORING_MEM_REGION_TYPE_USER) 254 + ret = io_region_pin_pages(ctx, mr, reg); 255 + else 256 + ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset); 257 + if (ret) 258 + goto out_free; 259 + 260 + ret = io_region_init_ptr(mr); 261 + if (ret) 262 + goto out_free; 232 263 return 0; 233 264 out_free: 234 - if (pages_accounted) 235 - __io_unaccount_mem(ctx->user, pages_accounted); 236 - io_pages_free(&pages, nr_pages); 265 + io_free_region(ctx, mr); 237 266 return ret; 267 + } 268 + 269 + int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr, 270 + struct io_uring_region_desc *reg, 271 + unsigned long mmap_offset) 272 + { 273 + struct io_mapped_region tmp_mr; 274 + int ret; 275 + 276 + memcpy(&tmp_mr, mr, sizeof(tmp_mr)); 277 + ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset); 278 + if (ret) 279 + return ret; 280 + 281 + /* 282 + * Once published mmap can find it without holding only the ->mmap_lock 283 + * and not ->uring_lock. 
284 + */ 285 + guard(mutex)(&ctx->mmap_lock); 286 + memcpy(mr, &tmp_mr, sizeof(tmp_mr)); 287 + return 0; 288 + } 289 + 290 + static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, 291 + loff_t pgoff) 292 + { 293 + loff_t offset = pgoff << PAGE_SHIFT; 294 + unsigned int bgid; 295 + 296 + switch (offset & IORING_OFF_MMAP_MASK) { 297 + case IORING_OFF_SQ_RING: 298 + case IORING_OFF_CQ_RING: 299 + return &ctx->ring_region; 300 + case IORING_OFF_SQES: 301 + return &ctx->sq_region; 302 + case IORING_OFF_PBUF_RING: 303 + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; 304 + return io_pbuf_get_region(ctx, bgid); 305 + case IORING_MAP_OFF_PARAM_REGION: 306 + return &ctx->param_region; 307 + } 308 + return NULL; 309 + } 310 + 311 + static void *io_region_validate_mmap(struct io_ring_ctx *ctx, 312 + struct io_mapped_region *mr) 313 + { 314 + lockdep_assert_held(&ctx->mmap_lock); 315 + 316 + if (!io_region_is_set(mr)) 317 + return ERR_PTR(-EINVAL); 318 + if (mr->flags & IO_REGION_F_USER_PROVIDED) 319 + return ERR_PTR(-EINVAL); 320 + 321 + return io_region_get_ptr(mr); 238 322 } 239 323 240 324 static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, 241 325 size_t sz) 242 326 { 243 327 struct io_ring_ctx *ctx = file->private_data; 244 - loff_t offset = pgoff << PAGE_SHIFT; 328 + struct io_mapped_region *region; 245 329 246 - switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { 247 - case IORING_OFF_SQ_RING: 248 - case IORING_OFF_CQ_RING: 249 - /* Don't allow mmap if the ring was setup without it */ 250 - if (ctx->flags & IORING_SETUP_NO_MMAP) 251 - return ERR_PTR(-EINVAL); 252 - if (!ctx->rings) 253 - return ERR_PTR(-EFAULT); 254 - return ctx->rings; 255 - case IORING_OFF_SQES: 256 - /* Don't allow mmap if the ring was setup without it */ 257 - if (ctx->flags & IORING_SETUP_NO_MMAP) 258 - return ERR_PTR(-EINVAL); 259 - if (!ctx->sq_sqes) 260 - return ERR_PTR(-EFAULT); 261 - return ctx->sq_sqes; 262 - case 
IORING_OFF_PBUF_RING: { 263 - struct io_buffer_list *bl; 264 - unsigned int bgid; 265 - void *ptr; 266 - 267 - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; 268 - bl = io_pbuf_get_bl(ctx, bgid); 269 - if (IS_ERR(bl)) 270 - return bl; 271 - ptr = bl->buf_ring; 272 - io_put_bl(ctx, bl); 273 - return ptr; 274 - } 275 - } 276 - 277 - return ERR_PTR(-EINVAL); 278 - } 279 - 280 - int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, 281 - struct page **pages, int npages) 282 - { 283 - unsigned long nr_pages = npages; 284 - 285 - vm_flags_set(vma, VM_DONTEXPAND); 286 - return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); 330 + region = io_mmap_get_region(ctx, pgoff); 331 + if (!region) 332 + return ERR_PTR(-EINVAL); 333 + return io_region_validate_mmap(ctx, region); 287 334 } 288 335 289 336 #ifdef CONFIG_MMU 337 + 338 + static int io_region_mmap(struct io_ring_ctx *ctx, 339 + struct io_mapped_region *mr, 340 + struct vm_area_struct *vma, 341 + unsigned max_pages) 342 + { 343 + unsigned long nr_pages = min(mr->nr_pages, max_pages); 344 + 345 + vm_flags_set(vma, VM_DONTEXPAND); 346 + return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages); 347 + } 290 348 291 349 __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 292 350 { 293 351 struct io_ring_ctx *ctx = file->private_data; 294 352 size_t sz = vma->vm_end - vma->vm_start; 295 353 long offset = vma->vm_pgoff << PAGE_SHIFT; 296 - unsigned int npages; 354 + unsigned int page_limit = UINT_MAX; 355 + struct io_mapped_region *region; 297 356 void *ptr; 298 357 299 - guard(mutex)(&ctx->resize_lock); 358 + guard(mutex)(&ctx->mmap_lock); 300 359 301 360 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); 302 361 if (IS_ERR(ptr)) ··· 331 338 switch (offset & IORING_OFF_MMAP_MASK) { 332 339 case IORING_OFF_SQ_RING: 333 340 case IORING_OFF_CQ_RING: 334 - npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); 335 - return 
io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); 336 - case IORING_OFF_SQES: 337 - return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, 338 - ctx->n_sqe_pages); 339 - case IORING_OFF_PBUF_RING: 340 - return io_pbuf_mmap(file, vma); 341 + page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT; 342 + break; 341 343 } 342 344 343 - return -EINVAL; 345 + region = io_mmap_get_region(ctx, vma->vm_pgoff); 346 + return io_region_mmap(ctx, region, vma, page_limit); 344 347 } 345 348 346 349 unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, ··· 354 365 if (addr) 355 366 return -EINVAL; 356 367 357 - guard(mutex)(&ctx->resize_lock); 368 + guard(mutex)(&ctx->mmap_lock); 358 369 359 370 ptr = io_uring_validate_mmap_request(filp, pgoff, len); 360 371 if (IS_ERR(ptr)) ··· 404 415 struct io_ring_ctx *ctx = file->private_data; 405 416 void *ptr; 406 417 407 - guard(mutex)(&ctx->resize_lock); 418 + guard(mutex)(&ctx->mmap_lock); 408 419 409 420 ptr = io_uring_validate_mmap_request(file, pgoff, len); 410 421 if (IS_ERR(ptr))
+10 -13
io_uring/memmap.h
··· 1 1 #ifndef IO_URING_MEMMAP_H 2 2 #define IO_URING_MEMMAP_H 3 3 4 + #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL 5 + 4 6 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); 5 - void io_pages_free(struct page ***pages, int npages); 6 - int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, 7 - struct page **pages, int npages); 8 - 9 - void *io_pages_map(struct page ***out_pages, unsigned short *npages, 10 - size_t size); 11 - void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, 12 - bool put_pages); 13 - 14 - void *__io_uaddr_map(struct page ***pages, unsigned short *npages, 15 - unsigned long uaddr, size_t size); 16 7 17 8 #ifndef CONFIG_MMU 18 9 unsigned int io_uring_nommu_mmap_capabilities(struct file *file); ··· 15 24 16 25 void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr); 17 26 int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, 18 - struct io_uring_region_desc *reg); 27 + struct io_uring_region_desc *reg, 28 + unsigned long mmap_offset); 29 + 30 + int io_create_region_mmap_safe(struct io_ring_ctx *ctx, 31 + struct io_mapped_region *mr, 32 + struct io_uring_region_desc *reg, 33 + unsigned long mmap_offset); 19 34 20 35 static inline void *io_region_get_ptr(struct io_mapped_region *mr) 21 36 { 22 - return mr->vmap_ptr; 37 + return mr->ptr; 23 38 } 24 39 25 40 static inline bool io_region_is_set(struct io_mapped_region *mr)
-7
io_uring/msg_ring.c
··· 354 354 return __io_msg_ring_data(fd_file(f)->private_data, 355 355 &io_msg, IO_URING_F_UNLOCKED); 356 356 } 357 - 358 - void io_msg_cache_free(const void *entry) 359 - { 360 - struct io_kiocb *req = (struct io_kiocb *) entry; 361 - 362 - kmem_cache_free(req_cachep, req); 363 - }
-1
io_uring/msg_ring.h
··· 4 4 int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 5 5 int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); 6 6 void io_msg_ring_cleanup(struct io_kiocb *req); 7 - void io_msg_cache_free(const void *entry);
+18 -17
io_uring/net.c
··· 155 155 } 156 156 } 157 157 158 + static void io_msg_async_data_init(void *obj) 159 + { 160 + struct io_async_msghdr *hdr = (struct io_async_msghdr *)obj; 161 + 162 + hdr->free_iov = NULL; 163 + hdr->free_iov_nr = 0; 164 + } 165 + 158 166 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) 159 167 { 160 168 struct io_ring_ctx *ctx = req->ctx; 161 169 struct io_async_msghdr *hdr; 162 170 163 - hdr = io_alloc_cache_get(&ctx->netmsg_cache); 164 - if (hdr) { 165 - if (hdr->free_iov) { 166 - kasan_mempool_unpoison_object(hdr->free_iov, 167 - hdr->free_iov_nr * sizeof(struct iovec)); 168 - req->flags |= REQ_F_NEED_CLEANUP; 169 - } 170 - req->flags |= REQ_F_ASYNC_DATA; 171 - req->async_data = hdr; 172 - return hdr; 173 - } 171 + hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req, 172 + io_msg_async_data_init); 173 + if (!hdr) 174 + return NULL; 174 175 175 - if (!io_alloc_async_data(req)) { 176 - hdr = req->async_data; 177 - hdr->free_iov_nr = 0; 178 - hdr->free_iov = NULL; 179 - return hdr; 176 + /* If the async data was cached, we might have an iov cached inside. */ 177 + if (hdr->free_iov) { 178 + kasan_mempool_unpoison_object(hdr->free_iov, 179 + hdr->free_iov_nr * sizeof(struct iovec)); 180 + req->flags |= REQ_F_NEED_CLEANUP; 180 181 } 181 - return NULL; 182 + return hdr; 182 183 } 183 184 184 185 /* assign new iovec to kmsg, if we need to */
+5 -8
io_uring/poll.c
··· 648 648 if (req->flags & REQ_F_POLLED) { 649 649 apoll = req->apoll; 650 650 kfree(apoll->double_poll); 651 - } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { 652 - apoll = io_alloc_cache_get(&ctx->apoll_cache); 653 - if (!apoll) 654 - goto alloc_apoll; 655 - apoll->poll.retries = APOLL_MAX_RETRY; 656 651 } else { 657 - alloc_apoll: 658 - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); 659 - if (unlikely(!apoll)) 652 + if (!(issue_flags & IO_URING_F_UNLOCKED)) 653 + apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC, NULL); 654 + else 655 + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); 656 + if (!apoll) 660 657 return NULL; 661 658 apoll->poll.retries = APOLL_MAX_RETRY; 662 659 }
+80 -83
io_uring/register.c
··· 104 104 return id; 105 105 } 106 106 107 - static __cold int io_register_restrictions(struct io_ring_ctx *ctx, 108 - void __user *arg, unsigned int nr_args) 107 + static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, 108 + struct io_restriction *restrictions) 109 109 { 110 110 struct io_uring_restriction *res; 111 111 size_t size; 112 112 int i, ret; 113 - 114 - /* Restrictions allowed only if rings started disabled */ 115 - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 116 - return -EBADFD; 117 - 118 - /* We allow only a single restrictions registration */ 119 - if (ctx->restrictions.registered) 120 - return -EBUSY; 121 113 122 114 if (!arg || nr_args > IORING_MAX_RESTRICTIONS) 123 115 return -EINVAL; ··· 122 130 if (IS_ERR(res)) 123 131 return PTR_ERR(res); 124 132 125 - ret = 0; 133 + ret = -EINVAL; 126 134 127 135 for (i = 0; i < nr_args; i++) { 128 136 switch (res[i].opcode) { 129 137 case IORING_RESTRICTION_REGISTER_OP: 130 - if (res[i].register_op >= IORING_REGISTER_LAST) { 131 - ret = -EINVAL; 132 - goto out; 133 - } 134 - 135 - __set_bit(res[i].register_op, 136 - ctx->restrictions.register_op); 138 + if (res[i].register_op >= IORING_REGISTER_LAST) 139 + goto err; 140 + __set_bit(res[i].register_op, restrictions->register_op); 137 141 break; 138 142 case IORING_RESTRICTION_SQE_OP: 139 - if (res[i].sqe_op >= IORING_OP_LAST) { 140 - ret = -EINVAL; 141 - goto out; 142 - } 143 - 144 - __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); 143 + if (res[i].sqe_op >= IORING_OP_LAST) 144 + goto err; 145 + __set_bit(res[i].sqe_op, restrictions->sqe_op); 145 146 break; 146 147 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: 147 - ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; 148 + restrictions->sqe_flags_allowed = res[i].sqe_flags; 148 149 break; 149 150 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: 150 - ctx->restrictions.sqe_flags_required = res[i].sqe_flags; 151 + restrictions->sqe_flags_required = res[i].sqe_flags; 151 152 break; 152 
153 default: 153 - ret = -EINVAL; 154 - goto out; 154 + goto err; 155 155 } 156 156 } 157 157 158 - out: 158 + ret = 0; 159 + 160 + err: 161 + kfree(res); 162 + return ret; 163 + } 164 + 165 + static __cold int io_register_restrictions(struct io_ring_ctx *ctx, 166 + void __user *arg, unsigned int nr_args) 167 + { 168 + int ret; 169 + 170 + /* Restrictions allowed only if rings started disabled */ 171 + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 172 + return -EBADFD; 173 + 174 + /* We allow only a single restrictions registration */ 175 + if (ctx->restrictions.registered) 176 + return -EBUSY; 177 + 178 + ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions); 159 179 /* Reset all restrictions if an error happened */ 160 180 if (ret != 0) 161 181 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); 162 182 else 163 183 ctx->restrictions.registered = true; 164 - 165 - kfree(res); 166 184 return ret; 167 185 } 168 186 ··· 369 367 * either mapping or freeing. 370 368 */ 371 369 struct io_ring_ctx_rings { 372 - unsigned short n_ring_pages; 373 - unsigned short n_sqe_pages; 374 - struct page **ring_pages; 375 - struct page **sqe_pages; 376 - struct io_uring_sqe *sq_sqes; 377 370 struct io_rings *rings; 371 + struct io_uring_sqe *sq_sqes; 372 + 373 + struct io_mapped_region sq_region; 374 + struct io_mapped_region ring_region; 378 375 }; 379 376 380 - static void io_register_free_rings(struct io_uring_params *p, 377 + static void io_register_free_rings(struct io_ring_ctx *ctx, 378 + struct io_uring_params *p, 381 379 struct io_ring_ctx_rings *r) 382 380 { 383 - if (!(p->flags & IORING_SETUP_NO_MMAP)) { 384 - io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages, 385 - true); 386 - io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages, 387 - true); 388 - } else { 389 - io_pages_free(&r->ring_pages, r->n_ring_pages); 390 - io_pages_free(&r->sqe_pages, r->n_sqe_pages); 391 - vunmap(r->rings); 392 - vunmap(r->sq_sqes); 393 - } 381 + io_free_region(ctx, 
&r->sq_region); 382 + io_free_region(ctx, &r->ring_region); 394 383 } 395 384 396 385 #define swap_old(ctx, o, n, field) \ ··· 396 403 397 404 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) 398 405 { 406 + struct io_uring_region_desc rd; 399 407 struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; 400 408 size_t size, sq_array_offset; 401 409 unsigned i, tail, old_head; 402 410 struct io_uring_params p; 403 - void *ptr; 404 411 int ret; 405 412 406 413 /* for single issuer, must be owner resizing */ ··· 434 441 if (size == SIZE_MAX) 435 442 return -EOVERFLOW; 436 443 437 - if (!(p.flags & IORING_SETUP_NO_MMAP)) 438 - n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size); 439 - else 440 - n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages, 441 - p.cq_off.user_addr, size); 442 - if (IS_ERR(n.rings)) 443 - return PTR_ERR(n.rings); 444 + memset(&rd, 0, sizeof(rd)); 445 + rd.size = PAGE_ALIGN(size); 446 + if (p.flags & IORING_SETUP_NO_MMAP) { 447 + rd.user_addr = p.cq_off.user_addr; 448 + rd.flags |= IORING_MEM_REGION_TYPE_USER; 449 + } 450 + ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); 451 + if (ret) { 452 + io_register_free_rings(ctx, &p, &n); 453 + return ret; 454 + } 455 + n.rings = io_region_get_ptr(&n.ring_region); 444 456 445 457 /* 446 458 * At this point n.rings is shared with userspace, just like o.rings ··· 461 463 WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries); 462 464 463 465 if (copy_to_user(arg, &p, sizeof(p))) { 464 - io_register_free_rings(&p, &n); 466 + io_register_free_rings(ctx, &p, &n); 465 467 return -EFAULT; 466 468 } 467 469 ··· 470 472 else 471 473 size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); 472 474 if (size == SIZE_MAX) { 473 - io_register_free_rings(&p, &n); 475 + io_register_free_rings(ctx, &p, &n); 474 476 return -EOVERFLOW; 475 477 } 476 478 477 - if (!(p.flags & IORING_SETUP_NO_MMAP)) 478 - ptr = io_pages_map(&n.sqe_pages, 
&n.n_sqe_pages, size); 479 - else 480 - ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages, 481 - p.sq_off.user_addr, 482 - size); 483 - if (IS_ERR(ptr)) { 484 - io_register_free_rings(&p, &n); 485 - return PTR_ERR(ptr); 479 + memset(&rd, 0, sizeof(rd)); 480 + rd.size = PAGE_ALIGN(size); 481 + if (p.flags & IORING_SETUP_NO_MMAP) { 482 + rd.user_addr = p.sq_off.user_addr; 483 + rd.flags |= IORING_MEM_REGION_TYPE_USER; 486 484 } 485 + ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES); 486 + if (ret) { 487 + io_register_free_rings(ctx, &p, &n); 488 + return ret; 489 + } 490 + n.sq_sqes = io_region_get_ptr(&n.sq_region); 487 491 488 492 /* 489 493 * If using SQPOLL, park the thread ··· 497 497 } 498 498 499 499 /* 500 - * We'll do the swap. Grab the ctx->resize_lock, which will exclude 500 + * We'll do the swap. Grab the ctx->mmap_lock, which will exclude 501 501 * any new mmap's on the ring fd. Clear out existing mappings to prevent 502 502 * mmap from seeing them, as we'll unmap them. Any attempt to mmap 503 503 * existing rings beyond this point will fail. Not that it could proceed 504 504 * at this point anyway, as the io_uring mmap side needs go grab the 505 - * ctx->resize_lock as well. Likewise, hold the completion lock over the 505 + * ctx->mmap_lock as well. Likewise, hold the completion lock over the 506 506 * duration of the actual swap. 507 507 */ 508 - mutex_lock(&ctx->resize_lock); 508 + mutex_lock(&ctx->mmap_lock); 509 509 spin_lock(&ctx->completion_lock); 510 510 o.rings = ctx->rings; 511 511 ctx->rings = NULL; ··· 516 516 * Now copy SQ and CQ entries, if any. If either of the destination 517 517 * rings can't hold what is already there, then fail the operation. 
518 518 */ 519 - n.sq_sqes = ptr; 520 519 tail = READ_ONCE(o.rings->sq.tail); 521 520 old_head = READ_ONCE(o.rings->sq.head); 522 521 if (tail - old_head > p.sq_entries) ··· 526 527 527 528 n.sq_sqes[dst_head] = o.sq_sqes[src_head]; 528 529 } 529 - WRITE_ONCE(n.rings->sq.head, READ_ONCE(o.rings->sq.head)); 530 - WRITE_ONCE(n.rings->sq.tail, READ_ONCE(o.rings->sq.tail)); 530 + WRITE_ONCE(n.rings->sq.head, old_head); 531 + WRITE_ONCE(n.rings->sq.tail, tail); 531 532 532 533 tail = READ_ONCE(o.rings->cq.tail); 533 534 old_head = READ_ONCE(o.rings->cq.head); ··· 546 547 547 548 n.rings->cqes[dst_head] = o.rings->cqes[src_head]; 548 549 } 549 - WRITE_ONCE(n.rings->cq.head, READ_ONCE(o.rings->cq.head)); 550 - WRITE_ONCE(n.rings->cq.tail, READ_ONCE(o.rings->cq.tail)); 550 + WRITE_ONCE(n.rings->cq.head, old_head); 551 + WRITE_ONCE(n.rings->cq.tail, tail); 551 552 /* invalidate cached cqe refill */ 552 553 ctx->cqe_cached = ctx->cqe_sentinel = NULL; 553 554 ··· 565 566 566 567 ctx->rings = n.rings; 567 568 ctx->sq_sqes = n.sq_sqes; 568 - swap_old(ctx, o, n, n_ring_pages); 569 - swap_old(ctx, o, n, n_sqe_pages); 570 - swap_old(ctx, o, n, ring_pages); 571 - swap_old(ctx, o, n, sqe_pages); 569 + swap_old(ctx, o, n, ring_region); 570 + swap_old(ctx, o, n, sq_region); 572 571 to_free = &o; 573 572 ret = 0; 574 573 out: 575 574 spin_unlock(&ctx->completion_lock); 576 - mutex_unlock(&ctx->resize_lock); 577 - io_register_free_rings(&p, to_free); 575 + mutex_unlock(&ctx->mmap_lock); 576 + io_register_free_rings(ctx, &p, to_free); 578 577 579 578 if (ctx->sq_data) 580 579 io_sq_thread_unpark(ctx->sq_data); ··· 595 598 rd_uptr = u64_to_user_ptr(reg.region_uptr); 596 599 if (copy_from_user(&rd, rd_uptr, sizeof(rd))) 597 600 return -EFAULT; 598 - 599 601 if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv))) 600 602 return -EINVAL; 601 603 if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG) ··· 609 613 !(ctx->flags & IORING_SETUP_R_DISABLED)) 610 614 return -EINVAL; 611 615 612 - ret = 
io_create_region(ctx, &ctx->param_region, &rd); 616 + ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd, 617 + IORING_MAP_OFF_PARAM_REGION); 613 618 if (ret) 614 619 return ret; 615 620 if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
+22 -18
io_uring/rsrc.c
··· 626 626 return ret; 627 627 } 628 628 629 - static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, 630 - struct io_imu_folio_data *data, int nr_folios) 629 + static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, 630 + struct io_imu_folio_data *data) 631 631 { 632 632 struct page **page_array = *pages, **new_array = NULL; 633 633 int nr_pages_left = *nr_pages, i, j; 634 + int nr_folios = data->nr_folios; 634 635 635 636 /* Store head pages only*/ 636 637 new_array = kvmalloc_array(nr_folios, sizeof(struct page *), ··· 668 667 return true; 669 668 } 670 669 671 - static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, 672 - struct io_imu_folio_data *data) 670 + bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, 671 + struct io_imu_folio_data *data) 673 672 { 674 - struct page **page_array = *pages; 675 673 struct folio *folio = page_folio(page_array[0]); 676 674 unsigned int count = 1, nr_folios = 1; 677 675 int i; 678 676 679 - if (*nr_pages <= 1) 680 - return false; 681 - 682 677 data->nr_pages_mid = folio_nr_pages(folio); 683 - if (data->nr_pages_mid == 1) 684 - return false; 685 - 686 678 data->folio_shift = folio_shift(folio); 679 + 687 680 /* 688 681 * Check if pages are contiguous inside a folio, and all folios have 689 682 * the same page count except for the head and tail. 
690 683 */ 691 - for (i = 1; i < *nr_pages; i++) { 684 + for (i = 1; i < nr_pages; i++) { 692 685 if (page_folio(page_array[i]) == folio && 693 686 page_array[i] == page_array[i-1] + 1) { 694 687 count++; ··· 710 715 if (nr_folios == 1) 711 716 data->nr_pages_head = count; 712 717 713 - return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios); 718 + data->nr_folios = nr_folios; 719 + return true; 714 720 } 715 721 716 722 static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, ··· 725 729 size_t size; 726 730 int ret, nr_pages, i; 727 731 struct io_imu_folio_data data; 728 - bool coalesced; 732 + bool coalesced = false; 729 733 730 734 if (!iov->iov_base) 731 735 return NULL; ··· 745 749 } 746 750 747 751 /* If it's huge page(s), try to coalesce them into fewer bvec entries */ 748 - coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data); 752 + if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) { 753 + if (data.nr_pages_mid != 1) 754 + coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); 755 + } 749 756 750 757 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); 751 758 if (!imu) ··· 882 883 * and advance us to the beginning. 
883 884 */ 884 885 offset = buf_addr - imu->ubuf; 885 - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); 886 + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len); 886 887 887 888 if (offset) { 888 889 /* ··· 904 905 const struct bio_vec *bvec = imu->bvec; 905 906 906 907 if (offset < bvec->bv_len) { 907 - iter->count -= offset; 908 908 iter->iov_offset = offset; 909 909 } else { 910 910 unsigned long seg_skip; ··· 914 916 915 917 iter->bvec += seg_skip; 916 918 iter->nr_segs -= seg_skip; 917 - iter->count -= bvec->bv_len + offset; 918 919 iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); 919 920 } 920 921 } ··· 927 930 struct io_rsrc_data data; 928 931 int i, ret, off, nr; 929 932 unsigned int nbufs; 933 + 934 + /* 935 + * Accounting state is shared between the two rings; that only works if 936 + * both rings are accounted towards the same counters. 937 + */ 938 + if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account) 939 + return -EINVAL; 930 940 931 941 /* if offsets are given, must have nr specified too */ 932 942 if (!arg->nr && (arg->dst_off || arg->src_off))
+4
io_uring/rsrc.h
··· 40 40 /* For non-head/tail folios, has to be fully included */ 41 41 unsigned int nr_pages_mid; 42 42 unsigned int folio_shift; 43 + unsigned int nr_folios; 43 44 }; 44 45 45 46 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); ··· 66 65 unsigned size, unsigned type); 67 66 int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, 68 67 unsigned int size, unsigned int type); 68 + 69 + bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, 70 + struct io_imu_folio_data *data); 69 71 70 72 static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data, 71 73 int index)
+123 -89
io_uring/rw.c
··· 202 202 * mean that the underlying data can be gone at any time. But that 203 203 * should be fixed seperately, and then this check could be killed. 204 204 */ 205 - if (!(req->flags & REQ_F_REFCOUNT)) { 205 + if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) { 206 206 req->flags &= ~REQ_F_NEED_CLEANUP; 207 207 io_rw_recycle(req, issue_flags); 208 208 } 209 + } 210 + 211 + static void io_rw_async_data_init(void *obj) 212 + { 213 + struct io_async_rw *rw = (struct io_async_rw *)obj; 214 + 215 + rw->free_iovec = NULL; 216 + rw->bytes_done = 0; 209 217 } 210 218 211 219 static int io_rw_alloc_async(struct io_kiocb *req) ··· 221 213 struct io_ring_ctx *ctx = req->ctx; 222 214 struct io_async_rw *rw; 223 215 224 - rw = io_alloc_cache_get(&ctx->rw_cache); 225 - if (rw) { 226 - if (rw->free_iovec) { 227 - kasan_mempool_unpoison_object(rw->free_iovec, 228 - rw->free_iov_nr * sizeof(struct iovec)); 229 - req->flags |= REQ_F_NEED_CLEANUP; 230 - } 231 - req->flags |= REQ_F_ASYNC_DATA; 232 - req->async_data = rw; 233 - goto done; 216 + rw = io_uring_alloc_async_data(&ctx->rw_cache, req, io_rw_async_data_init); 217 + if (!rw) 218 + return -ENOMEM; 219 + if (rw->free_iovec) { 220 + kasan_mempool_unpoison_object(rw->free_iovec, 221 + rw->free_iov_nr * sizeof(struct iovec)); 222 + req->flags |= REQ_F_NEED_CLEANUP; 234 223 } 235 - 236 - if (!io_alloc_async_data(req)) { 237 - rw = req->async_data; 238 - rw->free_iovec = NULL; 239 - rw->free_iov_nr = 0; 240 - done: 241 - rw->bytes_done = 0; 242 - return 0; 243 - } 244 - 245 - return -ENOMEM; 224 + rw->bytes_done = 0; 225 + return 0; 246 226 } 247 227 248 228 static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) 249 229 { 250 230 struct io_async_rw *rw; 251 - int ret; 252 231 253 232 if (io_rw_alloc_async(req)) 254 233 return -ENOMEM; ··· 244 249 return 0; 245 250 246 251 rw = req->async_data; 247 - ret = io_import_iovec(ddir, req, rw, 0); 252 + return io_import_iovec(ddir, req, rw, 0); 253 + } 254 + 255 
+ static inline void io_meta_save_state(struct io_async_rw *io) 256 + { 257 + io->meta_state.seed = io->meta.seed; 258 + iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta); 259 + } 260 + 261 + static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb) 262 + { 263 + if (kiocb->ki_flags & IOCB_HAS_METADATA) { 264 + io->meta.seed = io->meta_state.seed; 265 + iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta); 266 + } 267 + } 268 + 269 + static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, 270 + u64 attr_ptr, u64 attr_type_mask) 271 + { 272 + struct io_uring_attr_pi pi_attr; 273 + struct io_async_rw *io; 274 + int ret; 275 + 276 + if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr), 277 + sizeof(pi_attr))) 278 + return -EFAULT; 279 + 280 + if (pi_attr.rsvd) 281 + return -EINVAL; 282 + 283 + io = req->async_data; 284 + io->meta.flags = pi_attr.flags; 285 + io->meta.app_tag = pi_attr.app_tag; 286 + io->meta.seed = pi_attr.seed; 287 + ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr), 288 + pi_attr.len, &io->meta.iter); 248 289 if (unlikely(ret < 0)) 249 290 return ret; 250 - 251 - iov_iter_save_state(&rw->iter, &rw->iter_state); 252 - return 0; 291 + req->flags |= REQ_F_HAS_METADATA; 292 + io_meta_save_state(io); 293 + return ret; 253 294 } 254 295 255 296 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, ··· 293 262 { 294 263 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 295 264 unsigned ioprio; 265 + u64 attr_type_mask; 296 266 int ret; 297 267 298 268 rw->kiocb.ki_pos = READ_ONCE(sqe->off); ··· 311 279 rw->kiocb.ki_ioprio = get_current_ioprio(); 312 280 } 313 281 rw->kiocb.dio_complete = NULL; 282 + rw->kiocb.ki_flags = 0; 314 283 315 284 rw->addr = READ_ONCE(sqe->addr); 316 285 rw->len = READ_ONCE(sqe->len); 317 286 rw->flags = READ_ONCE(sqe->rw_flags); 318 - return io_prep_rw_setup(req, ddir, do_import); 287 + ret = io_prep_rw_setup(req, ddir, do_import); 288 + 
289 + if (unlikely(ret)) 290 + return ret; 291 + 292 + attr_type_mask = READ_ONCE(sqe->attr_type_mask); 293 + if (attr_type_mask) { 294 + u64 attr_ptr; 295 + 296 + /* only PI attribute is supported currently */ 297 + if (attr_type_mask != IORING_RW_ATTR_FLAG_PI) 298 + return -EINVAL; 299 + 300 + attr_ptr = READ_ONCE(sqe->attr_ptr); 301 + ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); 302 + } 303 + return ret; 319 304 } 320 305 321 306 int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) ··· 434 385 435 386 void io_readv_writev_cleanup(struct io_kiocb *req) 436 387 { 437 - io_rw_iovec_free(req->async_data); 388 + lockdep_assert_held(&req->ctx->uring_lock); 389 + io_rw_recycle(req, 0); 438 390 } 439 391 440 392 static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) ··· 455 405 return NULL; 456 406 } 457 407 458 - #ifdef CONFIG_BLOCK 459 - static void io_resubmit_prep(struct io_kiocb *req) 460 - { 461 - struct io_async_rw *io = req->async_data; 462 - 463 - iov_iter_restore(&io->iter, &io->iter_state); 464 - } 465 - 466 408 static bool io_rw_should_reissue(struct io_kiocb *req) 467 409 { 410 + #ifdef CONFIG_BLOCK 411 + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 468 412 umode_t mode = file_inode(req->file)->i_mode; 413 + struct io_async_rw *io = req->async_data; 469 414 struct io_ring_ctx *ctx = req->ctx; 470 415 471 416 if (!S_ISBLK(mode) && !S_ISREG(mode)) ··· 475 430 */ 476 431 if (percpu_ref_is_dying(&ctx->refs)) 477 432 return false; 478 - /* 479 - * Play it safe and assume not safe to re-import and reissue if we're 480 - * not in the original thread group (or in task context). 
481 - */ 482 - if (!same_thread_group(req->tctx->task, current) || !in_task()) 483 - return false; 433 + 434 + io_meta_restore(io, &rw->kiocb); 435 + iov_iter_restore(&io->iter, &io->iter_state); 484 436 return true; 485 - } 486 437 #else 487 - static void io_resubmit_prep(struct io_kiocb *req) 488 - { 489 - } 490 - static bool io_rw_should_reissue(struct io_kiocb *req) 491 - { 492 438 return false; 493 - } 494 439 #endif 440 + } 495 441 496 442 static void io_req_end_write(struct io_kiocb *req) 497 443 { ··· 509 473 } 510 474 } 511 475 512 - static bool __io_complete_rw_common(struct io_kiocb *req, long res) 476 + static void __io_complete_rw_common(struct io_kiocb *req, long res) 513 477 { 514 - if (unlikely(res != req->cqe.res)) { 515 - if (res == -EAGAIN && io_rw_should_reissue(req)) { 516 - /* 517 - * Reissue will start accounting again, finish the 518 - * current cycle. 519 - */ 520 - io_req_io_end(req); 521 - req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; 522 - return true; 523 - } 478 + if (res == req->cqe.res) 479 + return; 480 + if (res == -EAGAIN && io_rw_should_reissue(req)) { 481 + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; 482 + } else { 524 483 req_set_fail(req); 525 484 req->cqe.res = res; 526 485 } 527 - return false; 528 486 } 529 487 530 488 static inline int io_fixup_rw_res(struct io_kiocb *req, long res) ··· 561 531 struct io_kiocb *req = cmd_to_io_kiocb(rw); 562 532 563 533 if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) { 564 - if (__io_complete_rw_common(req, res)) 565 - return; 534 + __io_complete_rw_common(req, res); 566 535 io_req_set_res(req, io_fixup_rw_res(req, res), 0); 567 536 } 568 537 req->io_task_work.func = io_req_rw_complete; ··· 623 594 if (ret >= 0 && req->flags & REQ_F_CUR_POS) 624 595 req->file->f_pos = rw->kiocb.ki_pos; 625 596 if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) { 626 - if (!__io_complete_rw_common(req, ret)) { 627 - /* 628 - * Safe to call io_end from here as we're 
inline 629 - * from the submission path. 630 - */ 631 - io_req_io_end(req); 632 - io_req_set_res(req, final_ret, 633 - io_put_kbuf(req, ret, issue_flags)); 634 - io_req_rw_cleanup(req, issue_flags); 635 - return IOU_OK; 636 - } 597 + __io_complete_rw_common(req, ret); 598 + /* 599 + * Safe to call io_end from here as we're inline 600 + * from the submission path. 601 + */ 602 + io_req_io_end(req); 603 + io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); 604 + io_req_rw_cleanup(req, issue_flags); 605 + return IOU_OK; 637 606 } else { 638 607 io_rw_done(&rw->kiocb, ret); 639 608 } 640 609 641 - if (req->flags & REQ_F_REISSUE) { 642 - req->flags &= ~REQ_F_REISSUE; 643 - io_resubmit_prep(req); 644 - return -EAGAIN; 645 - } 646 610 return IOU_ISSUE_SKIP_COMPLETE; 647 611 } 648 612 ··· 758 736 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 759 737 struct kiocb *kiocb = &rw->kiocb; 760 738 761 - /* never retry for NOWAIT, we just complete with -EAGAIN */ 762 - if (req->flags & REQ_F_NOWAIT) 739 + /* 740 + * Never retry for NOWAIT or a request with metadata, we just complete 741 + * with -EAGAIN. 742 + */ 743 + if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA)) 763 744 return false; 764 745 765 746 /* Only for buffered IO */ ··· 853 828 kiocb->ki_complete = io_complete_rw; 854 829 } 855 830 831 + if (req->flags & REQ_F_HAS_METADATA) { 832 + struct io_async_rw *io = req->async_data; 833 + 834 + /* 835 + * We have a union of meta fields with wpq used for buffered-io 836 + * in io_async_rw, so fail it here. 
837 + */ 838 + if (!(req->file->f_flags & O_DIRECT)) 839 + return -EOPNOTSUPP; 840 + kiocb->ki_flags |= IOCB_HAS_METADATA; 841 + kiocb->private = &io->meta; 842 + } 843 + 856 844 return 0; 857 845 } 858 846 ··· 914 876 if (ret == -EOPNOTSUPP && force_nonblock) 915 877 ret = -EAGAIN; 916 878 917 - if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { 918 - req->flags &= ~REQ_F_REISSUE; 879 + if (ret == -EAGAIN) { 919 880 /* If we can poll, just do that. */ 920 881 if (io_file_can_poll(req)) 921 882 return -EAGAIN; ··· 939 902 * manually if we need to. 940 903 */ 941 904 iov_iter_restore(&io->iter, &io->iter_state); 905 + io_meta_restore(io, kiocb); 942 906 943 907 do { 944 908 /* ··· 1125 1087 else 1126 1088 ret2 = -EINVAL; 1127 1089 1128 - if (req->flags & REQ_F_REISSUE) { 1129 - req->flags &= ~REQ_F_REISSUE; 1130 - ret2 = -EAGAIN; 1131 - } 1132 - 1133 1090 /* 1134 1091 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just 1135 1092 * retry them without IOCB_NOWAIT. ··· 1160 1127 } else { 1161 1128 ret_eagain: 1162 1129 iov_iter_restore(&io->iter, &io->iter_state); 1130 + io_meta_restore(io, kiocb); 1163 1131 if (kiocb->ki_flags & IOCB_WRITE) 1164 1132 io_req_end_write(req); 1165 1133 return -EAGAIN;
+13 -1
io_uring/rw.h
··· 2 2 3 3 #include <linux/pagemap.h> 4 4 5 + struct io_meta_state { 6 + u32 seed; 7 + struct iov_iter_state iter_meta; 8 + }; 9 + 5 10 struct io_async_rw { 6 11 size_t bytes_done; 7 12 struct iov_iter iter; ··· 14 9 struct iovec fast_iov; 15 10 struct iovec *free_iovec; 16 11 int free_iov_nr; 17 - struct wait_page_queue wpq; 12 + /* wpq is for buffered io, while meta fields are used with direct io */ 13 + union { 14 + struct wait_page_queue wpq; 15 + struct { 16 + struct uio_meta meta; 17 + struct io_meta_state meta_state; 18 + }; 19 + }; 18 20 }; 19 21 20 22 int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+2 -3
io_uring/timeout.c
··· 544 544 545 545 if (WARN_ON_ONCE(req_has_async_data(req))) 546 546 return -EFAULT; 547 - if (io_alloc_async_data(req)) 547 + data = io_uring_alloc_async_data_nocache(req); 548 + if (!data) 548 549 return -ENOMEM; 549 - 550 - data = req->async_data; 551 550 data->req = req; 552 551 data->flags = flags; 553 552
+11 -23
io_uring/uring_cmd.c
··· 16 16 #include "rsrc.h" 17 17 #include "uring_cmd.h" 18 18 19 - static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req) 20 - { 21 - struct io_ring_ctx *ctx = req->ctx; 22 - struct io_uring_cmd_data *cache; 23 - 24 - cache = io_alloc_cache_get(&ctx->uring_cache); 25 - if (cache) { 26 - cache->op_data = NULL; 27 - req->flags |= REQ_F_ASYNC_DATA; 28 - req->async_data = cache; 29 - return cache; 30 - } 31 - if (!io_alloc_async_data(req)) { 32 - cache = req->async_data; 33 - cache->op_data = NULL; 34 - return cache; 35 - } 36 - return NULL; 37 - } 38 - 39 19 static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) 40 20 { 41 21 struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); ··· 110 130 struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); 111 131 unsigned int flags = IO_URING_F_COMPLETE_DEFER; 112 132 113 - if (current->flags & (PF_EXITING | PF_KTHREAD)) 133 + if (io_should_terminate_tw()) 114 134 flags |= IO_URING_F_TASK_DEAD; 115 135 116 136 /* task_work executor checks the deffered list completion */ ··· 168 188 } 169 189 EXPORT_SYMBOL_GPL(io_uring_cmd_done); 170 190 191 + static void io_uring_cmd_init_once(void *obj) 192 + { 193 + struct io_uring_cmd_data *data = obj; 194 + 195 + data->op_data = NULL; 196 + } 197 + 171 198 static int io_uring_cmd_prep_setup(struct io_kiocb *req, 172 199 const struct io_uring_sqe *sqe) 173 200 { 174 201 struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); 175 202 struct io_uring_cmd_data *cache; 176 203 177 - cache = io_uring_async_get(req); 178 - if (unlikely(!cache)) 204 + cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req, 205 + io_uring_cmd_init_once); 206 + if (!cache) 179 207 return -ENOMEM; 180 208 181 209 if (!(req->flags & REQ_F_FORCE_ASYNC)) {
+2 -2
io_uring/waitid.c
··· 303 303 struct io_waitid_async *iwa; 304 304 int ret; 305 305 306 - if (io_alloc_async_data(req)) 306 + iwa = io_uring_alloc_async_data_nocache(req); 307 + if (!iwa) 307 308 return -ENOMEM; 308 309 309 - iwa = req->async_data; 310 310 iwa->req = req; 311 311 312 312 ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info,