Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus-20191205' of git://git.kernel.dk/linux-block

Pull more block and io_uring updates from Jens Axboe:
"I wasn't expecting this to be so big, and if I had been, I would have used
separate branches for this. Going forward I'll be doing separate
branches for the current tree, just like for the next kernel version
tree. In any case, this contains:

- Series from Christoph that fixes an inherent race condition with
zoned devices and revalidation.

- null_blk zone size fix (Damien)

- Fix for a regression in this merge window that caused busy spins by
sending empty disk uevents (Eric)

- Fix for a regression in this merge window for bfq stats (Hou)

- Fix for io_uring creds allocation failure handling (me)

- io_uring -ERESTARTSYS send/recvmsg fix (me)

- Series that removes the need for applications to retain state across
async request punts for io_uring. This one is a bit larger than I
would have hoped, but I think it's important we get this fixed for
5.5.

- connect(2) improvement for io_uring, handling EINPROGRESS instead
of requiring applications to poll for it (me)

- Have io_uring use a hash for poll requests instead of an rbtree.
This turned out to work much better in practice, so I think we
should make the switch now. For some workloads, even with a fair
amount of cancellations, the insertion sort is just too expensive.
(me)

- Various little io_uring fixes (me, Jackie, Pavel, LimingWu)

- Fix for brd unaligned IO, and a warning for the future (Ming)

- Fix for a bio integrity data leak (Justin)

- bvec_iter_advance() improvement (Pavel)

- Xen blkback page unmap fix (SeongJae)

The major items in here are all well tested, and on the liburing side
we continue to add regression and feature test cases. We're up to 50
topic cases now, each containing anywhere from 1 to more than 10
individual cases"

* tag 'for-linus-20191205' of git://git.kernel.dk/linux-block: (33 commits)
block: fix memleak of bio integrity data
io_uring: fix a typo in a comment
bfq-iosched: Ensure bio->bi_blkg is valid before using it
io_uring: hook all linked requests via link_list
io_uring: fix error handling in io_queue_link_head
io_uring: use hash table for poll command lookups
io-wq: clear node->next on list deletion
io_uring: ensure deferred timeouts copy necessary data
io_uring: allow IO_SQE_* flags on IORING_OP_TIMEOUT
null_blk: remove unused variable warning on !CONFIG_BLK_DEV_ZONED
brd: warn on un-aligned buffer
brd: remove max_hw_sectors queue limit
xen/blkback: Avoid unmapping unmapped grant pages
io_uring: handle connect -EINPROGRESS like -EAGAIN
block: set the zone size in blk_revalidate_disk_zones atomically
block: don't handle bio based drivers in blk_revalidate_disk_zones
block: allocate the zone bitmaps lazily
block: replace seq_zones_bitmap with conv_zones_bitmap
block: simplify blkdev_nr_zones
block: remove the empty line at the end of blk-zoned.c
...

+683 -417
+3
block/bfq-cgroup.c
··· 351 351 { 352 352 struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg); 353 353 354 + if (!bfqg) 355 + return; 356 + 354 357 blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq)); 355 358 blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1); 356 359 }
+1 -1
block/bio-integrity.c
··· 87 87 * Description: Used to free the integrity portion of a bio. Usually 88 88 * called from bio_free(). 89 89 */ 90 - static void bio_integrity_free(struct bio *bio) 90 + void bio_integrity_free(struct bio *bio) 91 91 { 92 92 struct bio_integrity_payload *bip = bio_integrity(bio); 93 93 struct bio_set *bs = bio->bi_pool;
+3
block/bio.c
··· 233 233 void bio_uninit(struct bio *bio) 234 234 { 235 235 bio_disassociate_blkg(bio); 236 + 237 + if (bio_integrity(bio)) 238 + bio_integrity_free(bio); 236 239 } 237 240 EXPORT_SYMBOL(bio_uninit); 238 241
+69 -80
block/blk-zoned.c
··· 70 70 } 71 71 EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); 72 72 73 - static inline unsigned int __blkdev_nr_zones(struct request_queue *q, 74 - sector_t nr_sectors) 75 - { 76 - sector_t zone_sectors = blk_queue_zone_sectors(q); 77 - 78 - return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors); 79 - } 80 - 81 73 /** 82 74 * blkdev_nr_zones - Get number of zones 83 - * @bdev: Target block device 75 + * @disk: Target gendisk 84 76 * 85 - * Description: 86 - * Return the total number of zones of a zoned block device. 87 - * For a regular block device, the number of zones is always 0. 77 + * Return the total number of zones of a zoned block device. For a block 78 + * device without zone capabilities, the number of zones is always 0. 88 79 */ 89 - unsigned int blkdev_nr_zones(struct block_device *bdev) 80 + unsigned int blkdev_nr_zones(struct gendisk *disk) 90 81 { 91 - struct request_queue *q = bdev_get_queue(bdev); 82 + sector_t zone_sectors = blk_queue_zone_sectors(disk->queue); 92 83 93 - if (!blk_queue_is_zoned(q)) 84 + if (!blk_queue_is_zoned(disk->queue)) 94 85 return 0; 95 - 96 - return __blkdev_nr_zones(q, get_capacity(bdev->bd_disk)); 86 + return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors); 97 87 } 98 88 EXPORT_SYMBOL_GPL(blkdev_nr_zones); 99 89 ··· 332 342 333 343 void blk_queue_free_zone_bitmaps(struct request_queue *q) 334 344 { 335 - kfree(q->seq_zones_bitmap); 336 - q->seq_zones_bitmap = NULL; 345 + kfree(q->conv_zones_bitmap); 346 + q->conv_zones_bitmap = NULL; 337 347 kfree(q->seq_zones_wlock); 338 348 q->seq_zones_wlock = NULL; 339 349 } 340 350 341 351 struct blk_revalidate_zone_args { 342 352 struct gendisk *disk; 343 - unsigned long *seq_zones_bitmap; 353 + unsigned long *conv_zones_bitmap; 344 354 unsigned long *seq_zones_wlock; 355 + unsigned int nr_zones; 356 + sector_t zone_sectors; 345 357 sector_t sector; 346 358 }; 347 359 ··· 356 364 struct blk_revalidate_zone_args *args = data; 357 365 struct gendisk *disk = 
args->disk; 358 366 struct request_queue *q = disk->queue; 359 - sector_t zone_sectors = blk_queue_zone_sectors(q); 360 367 sector_t capacity = get_capacity(disk); 361 368 362 369 /* 363 370 * All zones must have the same size, with the exception on an eventual 364 371 * smaller last zone. 365 372 */ 366 - if (zone->start + zone_sectors < capacity && 367 - zone->len != zone_sectors) { 368 - pr_warn("%s: Invalid zoned device with non constant zone size\n", 369 - disk->disk_name); 370 - return false; 371 - } 373 + if (zone->start == 0) { 374 + if (zone->len == 0 || !is_power_of_2(zone->len)) { 375 + pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n", 376 + disk->disk_name, zone->len); 377 + return -ENODEV; 378 + } 372 379 373 - if (zone->start + zone->len >= capacity && 374 - zone->len > zone_sectors) { 375 - pr_warn("%s: Invalid zoned device with larger last zone size\n", 376 - disk->disk_name); 377 - return -ENODEV; 380 + args->zone_sectors = zone->len; 381 + args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len); 382 + } else if (zone->start + args->zone_sectors < capacity) { 383 + if (zone->len != args->zone_sectors) { 384 + pr_warn("%s: Invalid zoned device with non constant zone size\n", 385 + disk->disk_name); 386 + return -ENODEV; 387 + } 388 + } else { 389 + if (zone->len > args->zone_sectors) { 390 + pr_warn("%s: Invalid zoned device with larger last zone size\n", 391 + disk->disk_name); 392 + return -ENODEV; 393 + } 378 394 } 379 395 380 396 /* Check for holes in the zone report */ ··· 395 395 /* Check zone type */ 396 396 switch (zone->type) { 397 397 case BLK_ZONE_TYPE_CONVENTIONAL: 398 + if (!args->conv_zones_bitmap) { 399 + args->conv_zones_bitmap = 400 + blk_alloc_zone_bitmap(q->node, args->nr_zones); 401 + if (!args->conv_zones_bitmap) 402 + return -ENOMEM; 403 + } 404 + set_bit(idx, args->conv_zones_bitmap); 405 + break; 398 406 case BLK_ZONE_TYPE_SEQWRITE_REQ: 399 407 case BLK_ZONE_TYPE_SEQWRITE_PREF: 408 + if 
(!args->seq_zones_wlock) { 409 + args->seq_zones_wlock = 410 + blk_alloc_zone_bitmap(q->node, args->nr_zones); 411 + if (!args->seq_zones_wlock) 412 + return -ENOMEM; 413 + } 400 414 break; 401 415 default: 402 416 pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", ··· 418 404 return -ENODEV; 419 405 } 420 406 421 - if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) 422 - set_bit(idx, args->seq_zones_bitmap); 423 - 424 407 args->sector += zone->len; 425 408 return 0; 426 - } 427 - 428 - static int blk_update_zone_info(struct gendisk *disk, unsigned int nr_zones, 429 - struct blk_revalidate_zone_args *args) 430 - { 431 - /* 432 - * Ensure that all memory allocations in this context are done as 433 - * if GFP_NOIO was specified. 434 - */ 435 - unsigned int noio_flag = memalloc_noio_save(); 436 - struct request_queue *q = disk->queue; 437 - int ret; 438 - 439 - args->seq_zones_wlock = blk_alloc_zone_bitmap(q->node, nr_zones); 440 - if (!args->seq_zones_wlock) 441 - return -ENOMEM; 442 - args->seq_zones_bitmap = blk_alloc_zone_bitmap(q->node, nr_zones); 443 - if (!args->seq_zones_bitmap) 444 - return -ENOMEM; 445 - 446 - ret = disk->fops->report_zones(disk, 0, nr_zones, 447 - blk_revalidate_zone_cb, args); 448 - memalloc_noio_restore(noio_flag); 449 - return ret; 450 409 } 451 410 452 411 /** ··· 428 441 * 429 442 * Helper function for low-level device drivers to (re) allocate and initialize 430 443 * a disk request queue zone bitmaps. This functions should normally be called 431 - * within the disk ->revalidate method. For BIO based queues, no zone bitmap 432 - * is allocated. 444 + * within the disk ->revalidate method for blk-mq based drivers. For BIO based 445 + * drivers only q->nr_zones needs to be updated so that the sysfs exposed value 446 + * is correct. 
433 447 */ 434 448 int blk_revalidate_disk_zones(struct gendisk *disk) 435 449 { 436 450 struct request_queue *q = disk->queue; 437 - unsigned int nr_zones = __blkdev_nr_zones(q, get_capacity(disk)); 438 - struct blk_revalidate_zone_args args = { .disk = disk }; 439 - int ret = 0; 451 + struct blk_revalidate_zone_args args = { 452 + .disk = disk, 453 + }; 454 + unsigned int noio_flag; 455 + int ret; 440 456 441 457 if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) 442 458 return -EIO; 459 + if (WARN_ON_ONCE(!queue_is_mq(q))) 460 + return -EIO; 443 461 444 462 /* 445 - * BIO based queues do not use a scheduler so only q->nr_zones 446 - * needs to be updated so that the sysfs exposed value is correct. 463 + * Ensure that all memory allocations in this context are done as if 464 + * GFP_NOIO was specified. 447 465 */ 448 - if (!queue_is_mq(q)) { 449 - q->nr_zones = nr_zones; 450 - return 0; 451 - } 452 - 453 - if (nr_zones) 454 - ret = blk_update_zone_info(disk, nr_zones, &args); 466 + noio_flag = memalloc_noio_save(); 467 + ret = disk->fops->report_zones(disk, 0, UINT_MAX, 468 + blk_revalidate_zone_cb, &args); 469 + memalloc_noio_restore(noio_flag); 455 470 456 471 /* 457 - * Install the new bitmaps, making sure the queue is stopped and 458 - * all I/Os are completed (i.e. a scheduler is not referencing the 459 - * bitmaps). 472 + * Install the new bitmaps and update nr_zones only once the queue is 473 + * stopped and all I/Os are completed (i.e. a scheduler is not 474 + * referencing the bitmaps). 
460 475 */ 461 476 blk_mq_freeze_queue(q); 462 477 if (ret >= 0) { 463 - q->nr_zones = nr_zones; 478 + blk_queue_chunk_sectors(q, args.zone_sectors); 479 + q->nr_zones = args.nr_zones; 464 480 swap(q->seq_zones_wlock, args.seq_zones_wlock); 465 - swap(q->seq_zones_bitmap, args.seq_zones_bitmap); 481 + swap(q->conv_zones_bitmap, args.conv_zones_bitmap); 466 482 ret = 0; 467 483 } else { 468 484 pr_warn("%s: failed to revalidate zones\n", disk->disk_name); ··· 474 484 blk_mq_unfreeze_queue(q); 475 485 476 486 kfree(args.seq_zones_wlock); 477 - kfree(args.seq_zones_bitmap); 487 + kfree(args.conv_zones_bitmap); 478 488 return ret; 479 489 } 480 490 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); 481 -
+4
block/blk.h
··· 121 121 #ifdef CONFIG_BLK_DEV_INTEGRITY 122 122 void blk_flush_integrity(void); 123 123 bool __bio_integrity_endio(struct bio *); 124 + void bio_integrity_free(struct bio *bio); 124 125 static inline bool bio_integrity_endio(struct bio *bio) 125 126 { 126 127 if (bio_integrity(bio)) ··· 166 165 static inline bool bio_integrity_endio(struct bio *bio) 167 166 { 168 167 return true; 168 + } 169 + static inline void bio_integrity_free(struct bio *bio) 170 + { 169 171 } 170 172 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 171 173
+1 -1
block/ioctl.c
··· 512 512 case BLKGETZONESZ: 513 513 return put_uint(arg, bdev_zone_sectors(bdev)); 514 514 case BLKGETNRZONES: 515 - return put_uint(arg, blkdev_nr_zones(bdev)); 515 + return put_uint(arg, blkdev_nr_zones(bdev->bd_disk)); 516 516 case HDIO_GETGEO: 517 517 return blkdev_getgeo(bdev, argp); 518 518 case BLKRAGET:
+4 -1
drivers/block/brd.c
··· 297 297 unsigned int len = bvec.bv_len; 298 298 int err; 299 299 300 + /* Don't support un-aligned buffer */ 301 + WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) || 302 + (len & (SECTOR_SIZE - 1))); 303 + 300 304 err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, 301 305 bio_op(bio), sector); 302 306 if (err) ··· 386 382 goto out_free_dev; 387 383 388 384 blk_queue_make_request(brd->brd_queue, brd_make_request); 389 - blk_queue_max_hw_sectors(brd->brd_queue, 1024); 390 385 391 386 /* This is so fdisk will align partitions on 4k, because of 392 387 * direct_access API needing 4k alignment, returning a PFN
+25 -15
drivers/block/null_blk_main.c
··· 1559 1559 1560 1560 static int null_gendisk_register(struct nullb *nullb) 1561 1561 { 1562 + sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; 1562 1563 struct gendisk *disk; 1563 - sector_t size; 1564 1564 1565 1565 disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); 1566 1566 if (!disk) 1567 1567 return -ENOMEM; 1568 - size = (sector_t)nullb->dev->size * 1024 * 1024ULL; 1569 - set_capacity(disk, size >> 9); 1568 + set_capacity(disk, size); 1570 1569 1571 1570 disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; 1572 1571 disk->major = null_major; ··· 1575 1576 disk->queue = nullb->q; 1576 1577 strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); 1577 1578 1579 + #ifdef CONFIG_BLK_DEV_ZONED 1578 1580 if (nullb->dev->zoned) { 1579 - int ret = blk_revalidate_disk_zones(disk); 1580 - 1581 - if (ret != 0) 1582 - return ret; 1581 + if (queue_is_mq(nullb->q)) { 1582 + int ret = blk_revalidate_disk_zones(disk); 1583 + if (ret) 1584 + return ret; 1585 + } else { 1586 + blk_queue_chunk_sectors(nullb->q, 1587 + nullb->dev->zone_size_sects); 1588 + nullb->q->nr_zones = blkdev_nr_zones(disk); 1589 + } 1583 1590 } 1591 + #endif 1584 1592 1585 1593 add_disk(disk); 1586 1594 return 0; ··· 1613 1607 return blk_mq_alloc_tag_set(set); 1614 1608 } 1615 1609 1616 - static void null_validate_conf(struct nullb_device *dev) 1610 + static int null_validate_conf(struct nullb_device *dev) 1617 1611 { 1618 1612 dev->blocksize = round_down(dev->blocksize, 512); 1619 1613 dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); ··· 1640 1634 /* can not stop a queue */ 1641 1635 if (dev->queue_mode == NULL_Q_BIO) 1642 1636 dev->mbps = 0; 1637 + 1638 + if (dev->zoned && 1639 + (!dev->zone_size || !is_power_of_2(dev->zone_size))) { 1640 + pr_err("zone_size must be power-of-two\n"); 1641 + return -EINVAL; 1642 + } 1643 + 1644 + return 0; 1643 1645 } 1644 1646 1645 1647 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION ··· 1680 1666 
struct nullb *nullb; 1681 1667 int rv; 1682 1668 1683 - null_validate_conf(dev); 1669 + rv = null_validate_conf(dev); 1670 + if (rv) 1671 + return rv; 1684 1672 1685 1673 nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); 1686 1674 if (!nullb) { ··· 1747 1731 if (rv) 1748 1732 goto out_cleanup_blk_queue; 1749 1733 1750 - blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects); 1751 1734 nullb->q->limits.zoned = BLK_ZONED_HM; 1752 1735 blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, nullb->q); 1753 1736 blk_queue_required_elevator_features(nullb->q, ··· 1805 1790 pr_warn("invalid block size\n"); 1806 1791 pr_warn("defaults block size to %lu\n", PAGE_SIZE); 1807 1792 g_bs = PAGE_SIZE; 1808 - } 1809 - 1810 - if (!is_power_of_2(g_zone_size)) { 1811 - pr_err("zone_size must be power-of-two\n"); 1812 - return -EINVAL; 1813 1793 } 1814 1794 1815 1795 if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
+2
drivers/block/xen-blkback/blkback.c
··· 936 936 out_of_memory: 937 937 pr_alert("%s: out of memory\n", __func__); 938 938 put_free_pages(ring, pages_to_gnt, segs_to_map); 939 + for (i = last_map; i < num; i++) 940 + pages[i]->handle = BLKBACK_INVALID_HANDLE; 939 941 return -ENOMEM; 940 942 } 941 943
+7 -5
drivers/md/dm-table.c
··· 1954 1954 /* 1955 1955 * For a zoned target, the number of zones should be updated for the 1956 1956 * correct value to be exposed in sysfs queue/nr_zones. For a BIO based 1957 - * target, this is all that is needed. For a request based target, the 1958 - * queue zone bitmaps must also be updated. 1959 - * Use blk_revalidate_disk_zones() to handle this. 1957 + * target, this is all that is needed. 1960 1958 */ 1961 - if (blk_queue_is_zoned(q)) 1962 - blk_revalidate_disk_zones(t->md->disk); 1959 + #ifdef CONFIG_BLK_DEV_ZONED 1960 + if (blk_queue_is_zoned(q)) { 1961 + WARN_ON_ONCE(queue_is_mq(q)); 1962 + q->nr_zones = blkdev_nr_zones(t->md->disk); 1963 + } 1964 + #endif 1963 1965 1964 1966 /* Allow reads to exceed readahead limits */ 1965 1967 q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9);
+1 -1
drivers/md/dm-zoned-target.c
··· 727 727 dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors); 728 728 dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks); 729 729 730 - dev->nr_zones = blkdev_nr_zones(dev->bdev); 730 + dev->nr_zones = blkdev_nr_zones(dev->bdev->bd_disk); 731 731 732 732 dmz->dev = dev; 733 733
-2
drivers/scsi/sd_zbc.c
··· 412 412 goto err; 413 413 414 414 /* The drive satisfies the kernel restrictions: set it up */ 415 - blk_queue_chunk_sectors(sdkp->disk->queue, 416 - logical_to_sectors(sdkp->device, zone_blocks)); 417 415 blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, sdkp->disk->queue); 418 416 blk_queue_required_elevator_features(sdkp->disk->queue, 419 417 ELEVATOR_F_ZBD_SEQ_WRITE);
+1 -1
fs/block_dev.c
··· 1531 1531 ret = blk_add_partitions(disk, bdev); 1532 1532 if (ret == -EAGAIN) 1533 1533 goto rescan; 1534 - } else { 1534 + } else if (invalidate) { 1535 1535 /* 1536 1536 * Tell userspace that the media / partition table may have 1537 1537 * changed.
+1 -1
fs/io-wq.c
··· 111 111 112 112 struct task_struct *manager; 113 113 struct user_struct *user; 114 - struct cred *creds; 114 + const struct cred *creds; 115 115 struct mm_struct *mm; 116 116 refcount_t refs; 117 117 struct completion done;
+4 -7
fs/io-wq.h
··· 52 52 list->last = prev; 53 53 if (prev) 54 54 prev->next = node->next; 55 + node->next = NULL; 55 56 } 56 57 57 58 #define wq_list_for_each(pos, prv, head) \ ··· 88 87 struct io_wq_data { 89 88 struct mm_struct *mm; 90 89 struct user_struct *user; 91 - struct cred *creds; 90 + const struct cred *creds; 92 91 93 92 get_work_fn *get_work; 94 93 put_work_fn *put_work; ··· 119 118 static inline void io_wq_worker_running(struct task_struct *tsk) 120 119 { 121 120 } 122 - #endif 121 + #endif /* CONFIG_IO_WQ */ 123 122 124 - static inline bool io_wq_current_is_worker(void) 125 - { 126 - return in_task() && (current->flags & PF_IO_WORKER); 127 - } 128 - #endif 123 + #endif /* INTERNAL_IO_WQ_H */
+498 -218
fs/io_uring.c
··· 145 145 /* 146 146 * Number of completion events lost because the queue was full; 147 147 * this should be avoided by the application by making sure 148 - * there are not more requests pending thatn there is space in 148 + * there are not more requests pending than there is space in 149 149 * the completion queue. 150 150 * 151 151 * Written by the kernel, shouldn't be modified by the ··· 238 238 239 239 struct user_struct *user; 240 240 241 - struct cred *creds; 241 + const struct cred *creds; 242 242 243 243 /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ 244 244 struct completion *completions; ··· 275 275 * manipulate the list, hence no extra locking is needed there. 276 276 */ 277 277 struct list_head poll_list; 278 - struct rb_root cancel_tree; 278 + struct hlist_head *cancel_hash; 279 + unsigned cancel_hash_bits; 279 280 280 281 spinlock_t inflight_lock; 281 282 struct list_head inflight_list; ··· 304 303 u32 seq_offset; 305 304 }; 306 305 307 - struct io_timeout { 308 - struct file *file; 309 - struct io_timeout_data *data; 306 + struct io_async_connect { 307 + struct sockaddr_storage address; 308 + }; 309 + 310 + struct io_async_msghdr { 311 + struct iovec fast_iov[UIO_FASTIOV]; 312 + struct iovec *iov; 313 + struct sockaddr __user *uaddr; 314 + struct msghdr msg; 315 + }; 316 + 317 + struct io_async_rw { 318 + struct iovec fast_iov[UIO_FASTIOV]; 319 + struct iovec *iov; 320 + ssize_t nr_segs; 321 + ssize_t size; 322 + }; 323 + 324 + struct io_async_ctx { 325 + struct io_uring_sqe sqe; 326 + union { 327 + struct io_async_rw rw; 328 + struct io_async_msghdr msg; 329 + struct io_async_connect connect; 330 + struct io_timeout_data timeout; 331 + }; 310 332 }; 311 333 312 334 /* ··· 343 319 struct file *file; 344 320 struct kiocb rw; 345 321 struct io_poll_iocb poll; 346 - struct io_timeout timeout; 347 322 }; 348 323 349 324 const struct io_uring_sqe *sqe; 325 + struct io_async_ctx *io; 350 326 struct file *ring_file; 351 327 int 
ring_fd; 352 328 bool has_user; ··· 356 332 struct io_ring_ctx *ctx; 357 333 union { 358 334 struct list_head list; 359 - struct rb_node rb_node; 335 + struct hlist_node hash_node; 360 336 }; 361 337 struct list_head link_list; 362 338 unsigned int flags; ··· 377 353 #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ 378 354 #define REQ_F_INFLIGHT 16384 /* on inflight list */ 379 355 #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ 380 - #define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */ 381 356 u64 user_data; 382 357 u32 result; 383 358 u32 sequence; ··· 445 422 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) 446 423 { 447 424 struct io_ring_ctx *ctx; 425 + int hash_bits; 448 426 449 427 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 450 428 if (!ctx) ··· 458 434 ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL); 459 435 if (!ctx->completions) 460 436 goto err; 437 + 438 + /* 439 + * Use 5 bits less than the max cq entries, that should give us around 440 + * 32 entries per hash list if totally full and uniformly spread. 
441 + */ 442 + hash_bits = ilog2(p->cq_entries); 443 + hash_bits -= 5; 444 + if (hash_bits <= 0) 445 + hash_bits = 1; 446 + ctx->cancel_hash_bits = hash_bits; 447 + ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), 448 + GFP_KERNEL); 449 + if (!ctx->cancel_hash) 450 + goto err; 451 + __hash_init(ctx->cancel_hash, 1U << hash_bits); 461 452 462 453 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 463 454 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) ··· 487 448 init_waitqueue_head(&ctx->wait); 488 449 spin_lock_init(&ctx->completion_lock); 489 450 INIT_LIST_HEAD(&ctx->poll_list); 490 - ctx->cancel_tree = RB_ROOT; 491 451 INIT_LIST_HEAD(&ctx->defer_list); 492 452 INIT_LIST_HEAD(&ctx->timeout_list); 493 453 init_waitqueue_head(&ctx->inflight_wait); ··· 497 459 if (ctx->fallback_req) 498 460 kmem_cache_free(req_cachep, ctx->fallback_req); 499 461 kfree(ctx->completions); 462 + kfree(ctx->cancel_hash); 500 463 kfree(ctx); 501 464 return NULL; 502 465 } ··· 631 592 { 632 593 int ret; 633 594 634 - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); 595 + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); 635 596 if (ret != -1) { 636 597 atomic_inc(&req->ctx->cq_timeouts); 637 598 list_del_init(&req->list); ··· 845 806 } 846 807 847 808 got_it: 809 + req->io = NULL; 848 810 req->ring_file = NULL; 849 811 req->file = NULL; 850 812 req->ctx = ctx; ··· 876 836 { 877 837 struct io_ring_ctx *ctx = req->ctx; 878 838 879 - if (req->flags & REQ_F_FREE_SQE) 880 - kfree(req->sqe); 839 + if (req->io) 840 + kfree(req->io); 881 841 if (req->file && !(req->flags & REQ_F_FIXED_FILE)) 882 842 fput(req->file); 883 843 if (req->flags & REQ_F_INFLIGHT) { ··· 889 849 wake_up(&ctx->inflight_wait); 890 850 spin_unlock_irqrestore(&ctx->inflight_lock, flags); 891 851 } 892 - if (req->flags & REQ_F_TIMEOUT) 893 - kfree(req->timeout.data); 894 852 percpu_ref_put(&ctx->refs); 895 853 if (likely(!io_is_fallback_req(req))) 896 854 kmem_cache_free(req_cachep, req); ··· 901 
863 struct io_ring_ctx *ctx = req->ctx; 902 864 int ret; 903 865 904 - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); 866 + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); 905 867 if (ret != -1) { 906 868 io_cqring_fill_event(req, -ECANCELED); 907 869 io_commit_cqring(ctx); ··· 916 878 static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) 917 879 { 918 880 struct io_ring_ctx *ctx = req->ctx; 919 - struct io_kiocb *nxt; 920 881 bool wake_ev = false; 921 882 922 883 /* Already got next link */ ··· 927 890 * potentially happen if the chain is messed up, check to be on the 928 891 * safe side. 929 892 */ 930 - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); 931 - while (nxt) { 932 - list_del_init(&nxt->list); 893 + while (!list_empty(&req->link_list)) { 894 + struct io_kiocb *nxt = list_first_entry(&req->link_list, 895 + struct io_kiocb, link_list); 933 896 934 - if ((req->flags & REQ_F_LINK_TIMEOUT) && 935 - (nxt->flags & REQ_F_TIMEOUT)) { 897 + if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && 898 + (nxt->flags & REQ_F_TIMEOUT))) { 899 + list_del_init(&nxt->link_list); 936 900 wake_ev |= io_link_cancel_timeout(nxt); 937 - nxt = list_first_entry_or_null(&req->link_list, 938 - struct io_kiocb, list); 939 901 req->flags &= ~REQ_F_LINK_TIMEOUT; 940 902 continue; 941 903 } 942 - if (!list_empty(&req->link_list)) { 943 - INIT_LIST_HEAD(&nxt->link_list); 944 - list_splice(&req->link_list, &nxt->link_list); 945 - nxt->flags |= REQ_F_LINK; 946 - } 947 904 905 + list_del_init(&req->link_list); 906 + if (!list_empty(&nxt->link_list)) 907 + nxt->flags |= REQ_F_LINK; 948 908 *nxtptr = nxt; 949 909 break; 950 910 } ··· 957 923 static void io_fail_links(struct io_kiocb *req) 958 924 { 959 925 struct io_ring_ctx *ctx = req->ctx; 960 - struct io_kiocb *link; 961 926 unsigned long flags; 962 927 963 928 spin_lock_irqsave(&ctx->completion_lock, flags); 964 929 965 930 while (!list_empty(&req->link_list)) { 966 - link = 
list_first_entry(&req->link_list, struct io_kiocb, list); 967 - list_del_init(&link->list); 931 + struct io_kiocb *link = list_first_entry(&req->link_list, 932 + struct io_kiocb, link_list); 968 933 934 + list_del_init(&link->link_list); 969 935 trace_io_uring_fail_link(req, link); 970 936 971 937 if ((req->flags & REQ_F_LINK_TIMEOUT) && ··· 1113 1079 * completions for those, only batch free for fixed 1114 1080 * file and non-linked commands. 1115 1081 */ 1116 - if (((req->flags & 1117 - (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) == 1118 - REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) { 1082 + if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == 1083 + REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && 1084 + !req->io) { 1119 1085 reqs[to_free++] = req; 1120 1086 if (to_free == ARRAY_SIZE(reqs)) 1121 1087 io_free_req_many(ctx, reqs, &to_free); ··· 1444 1410 if (S_ISREG(file_inode(req->file)->i_mode)) 1445 1411 req->flags |= REQ_F_ISREG; 1446 1412 1447 - /* 1448 - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so 1449 - * we know to async punt it even if it was opened O_NONBLOCK 1450 - */ 1451 - if (force_nonblock && !io_file_supports_async(req->file)) { 1452 - req->flags |= REQ_F_MUST_PUNT; 1453 - return -EAGAIN; 1454 - } 1455 - 1456 1413 kiocb->ki_pos = READ_ONCE(sqe->off); 1457 1414 kiocb->ki_flags = iocb_flags(kiocb->ki_filp); 1458 1415 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); ··· 1612 1587 return io_import_fixed(req->ctx, rw, sqe, iter); 1613 1588 } 1614 1589 1590 + if (req->io) { 1591 + struct io_async_rw *iorw = &req->io->rw; 1592 + 1593 + *iovec = iorw->iov; 1594 + iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size); 1595 + if (iorw->iov == iorw->fast_iov) 1596 + *iovec = NULL; 1597 + return iorw->size; 1598 + } 1599 + 1615 1600 if (!req->has_user) 1616 1601 return -EFAULT; 1617 1602 ··· 1692 1657 return ret; 1693 1658 } 1694 1659 1660 + static void io_req_map_io(struct io_kiocb *req, ssize_t io_size, 1661 
+ struct iovec *iovec, struct iovec *fast_iov, 1662 + struct iov_iter *iter) 1663 + { 1664 + req->io->rw.nr_segs = iter->nr_segs; 1665 + req->io->rw.size = io_size; 1666 + req->io->rw.iov = iovec; 1667 + if (!req->io->rw.iov) { 1668 + req->io->rw.iov = req->io->rw.fast_iov; 1669 + memcpy(req->io->rw.iov, fast_iov, 1670 + sizeof(struct iovec) * iter->nr_segs); 1671 + } 1672 + } 1673 + 1674 + static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size, 1675 + struct iovec *iovec, struct iovec *fast_iov, 1676 + struct iov_iter *iter) 1677 + { 1678 + req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); 1679 + if (req->io) { 1680 + io_req_map_io(req, io_size, iovec, fast_iov, iter); 1681 + memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); 1682 + req->sqe = &req->io->sqe; 1683 + return 0; 1684 + } 1685 + 1686 + return -ENOMEM; 1687 + } 1688 + 1689 + static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, 1690 + struct iov_iter *iter, bool force_nonblock) 1691 + { 1692 + ssize_t ret; 1693 + 1694 + ret = io_prep_rw(req, force_nonblock); 1695 + if (ret) 1696 + return ret; 1697 + 1698 + if (unlikely(!(req->file->f_mode & FMODE_READ))) 1699 + return -EBADF; 1700 + 1701 + return io_import_iovec(READ, req, iovec, iter); 1702 + } 1703 + 1695 1704 static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, 1696 1705 bool force_nonblock) 1697 1706 { ··· 1744 1665 struct iov_iter iter; 1745 1666 struct file *file; 1746 1667 size_t iov_count; 1747 - ssize_t read_size, ret; 1668 + ssize_t io_size, ret; 1748 1669 1749 - ret = io_prep_rw(req, force_nonblock); 1750 - if (ret) 1751 - return ret; 1752 - file = kiocb->ki_filp; 1670 + if (!req->io) { 1671 + ret = io_read_prep(req, &iovec, &iter, force_nonblock); 1672 + if (ret < 0) 1673 + return ret; 1674 + } else { 1675 + ret = io_import_iovec(READ, req, &iovec, &iter); 1676 + if (ret < 0) 1677 + return ret; 1678 + } 1753 1679 1754 - if (unlikely(!(file->f_mode & FMODE_READ))) 1755 - return -EBADF; 1756 - 1757 - ret 
= io_import_iovec(READ, req, &iovec, &iter); 1758 - if (ret < 0) 1759 - return ret; 1760 - 1761 - read_size = ret; 1680 + file = req->file; 1681 + io_size = ret; 1762 1682 if (req->flags & REQ_F_LINK) 1763 - req->result = read_size; 1683 + req->result = io_size; 1684 + 1685 + /* 1686 + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so 1687 + * we know to async punt it even if it was opened O_NONBLOCK 1688 + */ 1689 + if (force_nonblock && !io_file_supports_async(file)) { 1690 + req->flags |= REQ_F_MUST_PUNT; 1691 + goto copy_iov; 1692 + } 1764 1693 1765 1694 iov_count = iov_iter_count(&iter); 1766 1695 ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); ··· 1790 1703 */ 1791 1704 if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && 1792 1705 (req->flags & REQ_F_ISREG) && 1793 - ret2 > 0 && ret2 < read_size) 1706 + ret2 > 0 && ret2 < io_size) 1794 1707 ret2 = -EAGAIN; 1795 1708 /* Catch -EAGAIN return for forced non-blocking submission */ 1796 - if (!force_nonblock || ret2 != -EAGAIN) 1709 + if (!force_nonblock || ret2 != -EAGAIN) { 1797 1710 kiocb_done(kiocb, ret2, nxt, req->in_async); 1798 - else 1799 - ret = -EAGAIN; 1711 + } else { 1712 + copy_iov: 1713 + ret = io_setup_async_io(req, io_size, iovec, 1714 + inline_vecs, &iter); 1715 + if (ret) 1716 + goto out_free; 1717 + return -EAGAIN; 1718 + } 1800 1719 } 1720 + out_free: 1801 1721 kfree(iovec); 1802 1722 return ret; 1723 + } 1724 + 1725 + static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, 1726 + struct iov_iter *iter, bool force_nonblock) 1727 + { 1728 + ssize_t ret; 1729 + 1730 + ret = io_prep_rw(req, force_nonblock); 1731 + if (ret) 1732 + return ret; 1733 + 1734 + if (unlikely(!(req->file->f_mode & FMODE_WRITE))) 1735 + return -EBADF; 1736 + 1737 + return io_import_iovec(WRITE, req, iovec, iter); 1803 1738 } 1804 1739 1805 1740 static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, ··· 1832 1723 struct iov_iter iter; 1833 1724 struct file *file; 1834 
1725 size_t iov_count; 1835 - ssize_t ret; 1726 + ssize_t ret, io_size; 1836 1727 1837 - ret = io_prep_rw(req, force_nonblock); 1838 - if (ret) 1839 - return ret; 1728 + if (!req->io) { 1729 + ret = io_write_prep(req, &iovec, &iter, force_nonblock); 1730 + if (ret < 0) 1731 + return ret; 1732 + } else { 1733 + ret = io_import_iovec(WRITE, req, &iovec, &iter); 1734 + if (ret < 0) 1735 + return ret; 1736 + } 1840 1737 1841 1738 file = kiocb->ki_filp; 1842 - if (unlikely(!(file->f_mode & FMODE_WRITE))) 1843 - return -EBADF; 1844 - 1845 - ret = io_import_iovec(WRITE, req, &iovec, &iter); 1846 - if (ret < 0) 1847 - return ret; 1848 - 1739 + io_size = ret; 1849 1740 if (req->flags & REQ_F_LINK) 1850 - req->result = ret; 1741 + req->result = io_size; 1742 + 1743 + /* 1744 + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so 1745 + * we know to async punt it even if it was opened O_NONBLOCK 1746 + */ 1747 + if (force_nonblock && !io_file_supports_async(req->file)) { 1748 + req->flags |= REQ_F_MUST_PUNT; 1749 + goto copy_iov; 1750 + } 1751 + 1752 + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) 1753 + goto copy_iov; 1851 1754 1852 1755 iov_count = iov_iter_count(&iter); 1853 - 1854 - ret = -EAGAIN; 1855 - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) 1856 - goto out_free; 1857 - 1858 1756 ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); 1859 1757 if (!ret) { 1860 1758 ssize_t ret2; ··· 1885 1769 ret2 = call_write_iter(file, kiocb, &iter); 1886 1770 else 1887 1771 ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); 1888 - if (!force_nonblock || ret2 != -EAGAIN) 1772 + if (!force_nonblock || ret2 != -EAGAIN) { 1889 1773 kiocb_done(kiocb, ret2, nxt, req->in_async); 1890 - else 1891 - ret = -EAGAIN; 1774 + } else { 1775 + copy_iov: 1776 + ret = io_setup_async_io(req, io_size, iovec, 1777 + inline_vecs, &iter); 1778 + if (ret) 1779 + goto out_free; 1780 + return -EAGAIN; 1781 + } 1892 1782 } 1893 1783 out_free: 1894 1784 kfree(iovec); 
··· 2010 1888 return 0; 2011 1889 } 2012 1890 2013 - #if defined(CONFIG_NET) 2014 - static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, 2015 - struct io_kiocb **nxt, bool force_nonblock, 2016 - long (*fn)(struct socket *, struct user_msghdr __user *, 2017 - unsigned int)) 1891 + static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) 2018 1892 { 1893 + #if defined(CONFIG_NET) 1894 + const struct io_uring_sqe *sqe = req->sqe; 1895 + struct user_msghdr __user *msg; 1896 + unsigned flags; 1897 + 1898 + flags = READ_ONCE(sqe->msg_flags); 1899 + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); 1900 + return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); 1901 + #else 1902 + return 0; 1903 + #endif 1904 + } 1905 + 1906 + static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, 1907 + struct io_kiocb **nxt, bool force_nonblock) 1908 + { 1909 + #if defined(CONFIG_NET) 1910 + struct socket *sock; 1911 + int ret; 1912 + 1913 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 1914 + return -EINVAL; 1915 + 1916 + sock = sock_from_file(req->file, &ret); 1917 + if (sock) { 1918 + struct io_async_ctx io, *copy; 1919 + struct sockaddr_storage addr; 1920 + struct msghdr *kmsg; 1921 + unsigned flags; 1922 + 1923 + flags = READ_ONCE(sqe->msg_flags); 1924 + if (flags & MSG_DONTWAIT) 1925 + req->flags |= REQ_F_NOWAIT; 1926 + else if (force_nonblock) 1927 + flags |= MSG_DONTWAIT; 1928 + 1929 + if (req->io) { 1930 + kmsg = &req->io->msg.msg; 1931 + kmsg->msg_name = &addr; 1932 + } else { 1933 + kmsg = &io.msg.msg; 1934 + kmsg->msg_name = &addr; 1935 + io.msg.iov = io.msg.fast_iov; 1936 + ret = io_sendmsg_prep(req, &io); 1937 + if (ret) 1938 + goto out; 1939 + } 1940 + 1941 + ret = __sys_sendmsg_sock(sock, kmsg, flags); 1942 + if (force_nonblock && ret == -EAGAIN) { 1943 + copy = kmalloc(sizeof(*copy), GFP_KERNEL); 1944 + if (!copy) { 1945 + ret = -ENOMEM; 1946 + goto out; 1947 + } 
1948 + memcpy(&copy->msg, &io.msg, sizeof(copy->msg)); 1949 + req->io = copy; 1950 + memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); 1951 + req->sqe = &req->io->sqe; 1952 + return ret; 1953 + } 1954 + if (ret == -ERESTARTSYS) 1955 + ret = -EINTR; 1956 + } 1957 + 1958 + out: 1959 + io_cqring_add_event(req, ret); 1960 + if (ret < 0 && (req->flags & REQ_F_LINK)) 1961 + req->flags |= REQ_F_FAIL_LINK; 1962 + io_put_req_find_next(req, nxt); 1963 + return 0; 1964 + #else 1965 + return -EOPNOTSUPP; 1966 + #endif 1967 + } 1968 + 1969 + static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) 1970 + { 1971 + #if defined(CONFIG_NET) 1972 + const struct io_uring_sqe *sqe = req->sqe; 1973 + struct user_msghdr __user *msg; 1974 + unsigned flags; 1975 + 1976 + flags = READ_ONCE(sqe->msg_flags); 1977 + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); 1978 + return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, 1979 + &io->msg.iov); 1980 + #else 1981 + return 0; 1982 + #endif 1983 + } 1984 + 1985 + static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, 1986 + struct io_kiocb **nxt, bool force_nonblock) 1987 + { 1988 + #if defined(CONFIG_NET) 2019 1989 struct socket *sock; 2020 1990 int ret; 2021 1991 ··· 2117 1903 sock = sock_from_file(req->file, &ret); 2118 1904 if (sock) { 2119 1905 struct user_msghdr __user *msg; 1906 + struct io_async_ctx io, *copy; 1907 + struct sockaddr_storage addr; 1908 + struct msghdr *kmsg; 2120 1909 unsigned flags; 2121 1910 2122 1911 flags = READ_ONCE(sqe->msg_flags); ··· 2130 1913 2131 1914 msg = (struct user_msghdr __user *) (unsigned long) 2132 1915 READ_ONCE(sqe->addr); 1916 + if (req->io) { 1917 + kmsg = &req->io->msg.msg; 1918 + kmsg->msg_name = &addr; 1919 + } else { 1920 + kmsg = &io.msg.msg; 1921 + kmsg->msg_name = &addr; 1922 + io.msg.iov = io.msg.fast_iov; 1923 + ret = io_recvmsg_prep(req, &io); 1924 + if (ret) 1925 + goto out; 1926 + } 2133 1927 2134 - ret = 
fn(sock, msg, flags); 2135 - if (force_nonblock && ret == -EAGAIN) 1928 + ret = __sys_recvmsg_sock(sock, kmsg, msg, io.msg.uaddr, flags); 1929 + if (force_nonblock && ret == -EAGAIN) { 1930 + copy = kmalloc(sizeof(*copy), GFP_KERNEL); 1931 + if (!copy) { 1932 + ret = -ENOMEM; 1933 + goto out; 1934 + } 1935 + memcpy(copy, &io, sizeof(*copy)); 1936 + req->io = copy; 1937 + memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); 1938 + req->sqe = &req->io->sqe; 2136 1939 return ret; 1940 + } 1941 + if (ret == -ERESTARTSYS) 1942 + ret = -EINTR; 2137 1943 } 2138 1944 1945 + out: 2139 1946 io_cqring_add_event(req, ret); 2140 1947 if (ret < 0 && (req->flags & REQ_F_LINK)) 2141 1948 req->flags |= REQ_F_FAIL_LINK; 2142 1949 io_put_req_find_next(req, nxt); 2143 1950 return 0; 2144 - } 2145 - #endif 2146 - 2147 - static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, 2148 - struct io_kiocb **nxt, bool force_nonblock) 2149 - { 2150 - #if defined(CONFIG_NET) 2151 - return io_send_recvmsg(req, sqe, nxt, force_nonblock, 2152 - __sys_sendmsg_sock); 2153 - #else 2154 - return -EOPNOTSUPP; 2155 - #endif 2156 - } 2157 - 2158 - static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, 2159 - struct io_kiocb **nxt, bool force_nonblock) 2160 - { 2161 - #if defined(CONFIG_NET) 2162 - return io_send_recvmsg(req, sqe, nxt, force_nonblock, 2163 - __sys_recvmsg_sock); 2164 1951 #else 2165 1952 return -EOPNOTSUPP; 2166 1953 #endif ··· 2206 1985 #endif 2207 1986 } 2208 1987 1988 + static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) 1989 + { 1990 + #if defined(CONFIG_NET) 1991 + const struct io_uring_sqe *sqe = req->sqe; 1992 + struct sockaddr __user *addr; 1993 + int addr_len; 1994 + 1995 + addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); 1996 + addr_len = READ_ONCE(sqe->addr2); 1997 + return move_addr_to_kernel(addr, addr_len, &io->connect.address); 1998 + #else 1999 + return 0; 2000 + #endif 2001 + } 2002 + 2209 
2003 static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, 2210 2004 struct io_kiocb **nxt, bool force_nonblock) 2211 2005 { 2212 2006 #if defined(CONFIG_NET) 2213 - struct sockaddr __user *addr; 2007 + struct io_async_ctx __io, *io; 2214 2008 unsigned file_flags; 2215 2009 int addr_len, ret; 2216 2010 ··· 2234 1998 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) 2235 1999 return -EINVAL; 2236 2000 2237 - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); 2238 2001 addr_len = READ_ONCE(sqe->addr2); 2239 2002 file_flags = force_nonblock ? O_NONBLOCK : 0; 2240 2003 2241 - ret = __sys_connect_file(req->file, addr, addr_len, file_flags); 2242 - if (ret == -EAGAIN && force_nonblock) 2004 + if (req->io) { 2005 + io = req->io; 2006 + } else { 2007 + ret = io_connect_prep(req, &__io); 2008 + if (ret) 2009 + goto out; 2010 + io = &__io; 2011 + } 2012 + 2013 + ret = __sys_connect_file(req->file, &io->connect.address, addr_len, 2014 + file_flags); 2015 + if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { 2016 + io = kmalloc(sizeof(*io), GFP_KERNEL); 2017 + if (!io) { 2018 + ret = -ENOMEM; 2019 + goto out; 2020 + } 2021 + memcpy(&io->connect, &__io.connect, sizeof(io->connect)); 2022 + req->io = io; 2023 + memcpy(&io->sqe, req->sqe, sizeof(*req->sqe)); 2024 + req->sqe = &io->sqe; 2243 2025 return -EAGAIN; 2026 + } 2244 2027 if (ret == -ERESTARTSYS) 2245 2028 ret = -EINTR; 2029 + out: 2246 2030 if (ret < 0 && (req->flags & REQ_F_LINK)) 2247 2031 req->flags |= REQ_F_FAIL_LINK; 2248 2032 io_cqring_add_event(req, ret); ··· 2271 2015 #else 2272 2016 return -EOPNOTSUPP; 2273 2017 #endif 2274 - } 2275 - 2276 - static inline void io_poll_remove_req(struct io_kiocb *req) 2277 - { 2278 - if (!RB_EMPTY_NODE(&req->rb_node)) { 2279 - rb_erase(&req->rb_node, &req->ctx->cancel_tree); 2280 - RB_CLEAR_NODE(&req->rb_node); 2281 - } 2282 2018 } 2283 2019 2284 2020 static void io_poll_remove_one(struct io_kiocb *req) ··· 
2284 2036 io_queue_async_work(req); 2285 2037 } 2286 2038 spin_unlock(&poll->head->lock); 2287 - io_poll_remove_req(req); 2039 + hash_del(&req->hash_node); 2288 2040 } 2289 2041 2290 2042 static void io_poll_remove_all(struct io_ring_ctx *ctx) 2291 2043 { 2292 - struct rb_node *node; 2044 + struct hlist_node *tmp; 2293 2045 struct io_kiocb *req; 2046 + int i; 2294 2047 2295 2048 spin_lock_irq(&ctx->completion_lock); 2296 - while ((node = rb_first(&ctx->cancel_tree)) != NULL) { 2297 - req = rb_entry(node, struct io_kiocb, rb_node); 2298 - io_poll_remove_one(req); 2049 + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 2050 + struct hlist_head *list; 2051 + 2052 + list = &ctx->cancel_hash[i]; 2053 + hlist_for_each_entry_safe(req, tmp, list, hash_node) 2054 + io_poll_remove_one(req); 2299 2055 } 2300 2056 spin_unlock_irq(&ctx->completion_lock); 2301 2057 } 2302 2058 2303 2059 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) 2304 2060 { 2305 - struct rb_node *p, *parent = NULL; 2061 + struct hlist_head *list; 2306 2062 struct io_kiocb *req; 2307 2063 2308 - p = ctx->cancel_tree.rb_node; 2309 - while (p) { 2310 - parent = p; 2311 - req = rb_entry(parent, struct io_kiocb, rb_node); 2312 - if (sqe_addr < req->user_data) { 2313 - p = p->rb_left; 2314 - } else if (sqe_addr > req->user_data) { 2315 - p = p->rb_right; 2316 - } else { 2064 + list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; 2065 + hlist_for_each_entry(req, list, hash_node) { 2066 + if (sqe_addr == req->user_data) { 2317 2067 io_poll_remove_one(req); 2318 2068 return 0; 2319 2069 } ··· 2393 2147 spin_unlock_irq(&ctx->completion_lock); 2394 2148 return; 2395 2149 } 2396 - io_poll_remove_req(req); 2150 + hash_del(&req->hash_node); 2397 2151 io_poll_complete(req, mask, ret); 2398 2152 spin_unlock_irq(&ctx->completion_lock); 2399 2153 ··· 2428 2182 * for finalizing the request, mark us as having grabbed that already. 
2429 2183 */ 2430 2184 if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { 2431 - io_poll_remove_req(req); 2185 + hash_del(&req->hash_node); 2432 2186 io_poll_complete(req, mask, 0); 2433 2187 req->flags |= REQ_F_COMP_LOCKED; 2434 2188 io_put_req(req); ··· 2466 2220 static void io_poll_req_insert(struct io_kiocb *req) 2467 2221 { 2468 2222 struct io_ring_ctx *ctx = req->ctx; 2469 - struct rb_node **p = &ctx->cancel_tree.rb_node; 2470 - struct rb_node *parent = NULL; 2471 - struct io_kiocb *tmp; 2223 + struct hlist_head *list; 2472 2224 2473 - while (*p) { 2474 - parent = *p; 2475 - tmp = rb_entry(parent, struct io_kiocb, rb_node); 2476 - if (req->user_data < tmp->user_data) 2477 - p = &(*p)->rb_left; 2478 - else 2479 - p = &(*p)->rb_right; 2480 - } 2481 - rb_link_node(&req->rb_node, parent, p); 2482 - rb_insert_color(&req->rb_node, &ctx->cancel_tree); 2225 + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; 2226 + hlist_add_head(&req->hash_node, list); 2483 2227 } 2484 2228 2485 2229 static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, ··· 2493 2257 if (!poll->wait) 2494 2258 return -ENOMEM; 2495 2259 2496 - req->sqe = NULL; 2260 + req->io = NULL; 2497 2261 INIT_IO_WORK(&req->work, io_poll_complete_work); 2498 2262 events = READ_ONCE(sqe->poll_events); 2499 2263 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; 2500 - RB_CLEAR_NODE(&req->rb_node); 2264 + INIT_HLIST_NODE(&req->hash_node); 2501 2265 2502 2266 poll->head = NULL; 2503 2267 poll->done = false; ··· 2604 2368 if (ret == -ENOENT) 2605 2369 return ret; 2606 2370 2607 - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); 2371 + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); 2608 2372 if (ret == -1) 2609 2373 return -EALREADY; 2610 2374 ··· 2646 2410 return 0; 2647 2411 } 2648 2412 2649 - static int io_timeout_setup(struct io_kiocb *req) 2413 + static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, 2414 + 
bool is_timeout_link) 2650 2415 { 2651 2416 const struct io_uring_sqe *sqe = req->sqe; 2652 2417 struct io_timeout_data *data; ··· 2657 2420 return -EINVAL; 2658 2421 if (sqe->ioprio || sqe->buf_index || sqe->len != 1) 2659 2422 return -EINVAL; 2423 + if (sqe->off && is_timeout_link) 2424 + return -EINVAL; 2660 2425 flags = READ_ONCE(sqe->timeout_flags); 2661 2426 if (flags & ~IORING_TIMEOUT_ABS) 2662 2427 return -EINVAL; 2663 2428 2664 - data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL); 2665 - if (!data) 2666 - return -ENOMEM; 2429 + data = &io->timeout; 2667 2430 data->req = req; 2668 - req->timeout.data = data; 2669 2431 req->flags |= REQ_F_TIMEOUT; 2670 2432 2671 2433 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) ··· 2676 2440 data->mode = HRTIMER_MODE_REL; 2677 2441 2678 2442 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); 2443 + req->io = io; 2679 2444 return 0; 2680 2445 } 2681 2446 ··· 2685 2448 unsigned count; 2686 2449 struct io_ring_ctx *ctx = req->ctx; 2687 2450 struct io_timeout_data *data; 2451 + struct io_async_ctx *io; 2688 2452 struct list_head *entry; 2689 2453 unsigned span = 0; 2690 - int ret; 2691 2454 2692 - ret = io_timeout_setup(req); 2693 - /* common setup allows flags (like links) set, we don't */ 2694 - if (!ret && sqe->flags) 2695 - ret = -EINVAL; 2696 - if (ret) 2697 - return ret; 2455 + io = req->io; 2456 + if (!io) { 2457 + int ret; 2458 + 2459 + io = kmalloc(sizeof(*io), GFP_KERNEL); 2460 + if (!io) 2461 + return -ENOMEM; 2462 + ret = io_timeout_prep(req, io, false); 2463 + if (ret) { 2464 + kfree(io); 2465 + return ret; 2466 + } 2467 + } 2468 + data = &req->io->timeout; 2698 2469 2699 2470 /* 2700 2471 * sqe->off holds how many events that need to occur for this ··· 2718 2473 } 2719 2474 2720 2475 req->sequence = ctx->cached_sq_head + count - 1; 2721 - req->timeout.data->seq_offset = count; 2476 + data->seq_offset = count; 2722 2477 2723 2478 /* 2724 2479 * Insertion sort, ensuring the first entry in 
the list is always ··· 2729 2484 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); 2730 2485 unsigned nxt_sq_head; 2731 2486 long long tmp, tmp_nxt; 2732 - u32 nxt_offset = nxt->timeout.data->seq_offset; 2487 + u32 nxt_offset = nxt->io->timeout.seq_offset; 2733 2488 2734 2489 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) 2735 2490 continue; ··· 2762 2517 req->sequence -= span; 2763 2518 add: 2764 2519 list_add(&req->list, entry); 2765 - data = req->timeout.data; 2766 2520 data->timer.function = io_timeout_fn; 2767 2521 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); 2768 2522 spin_unlock_irq(&ctx->completion_lock); ··· 2842 2598 return 0; 2843 2599 } 2844 2600 2601 + static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io) 2602 + { 2603 + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 2604 + struct iov_iter iter; 2605 + ssize_t ret; 2606 + 2607 + memcpy(&io->sqe, req->sqe, sizeof(io->sqe)); 2608 + req->sqe = &io->sqe; 2609 + 2610 + switch (io->sqe.opcode) { 2611 + case IORING_OP_READV: 2612 + case IORING_OP_READ_FIXED: 2613 + ret = io_read_prep(req, &iovec, &iter, true); 2614 + break; 2615 + case IORING_OP_WRITEV: 2616 + case IORING_OP_WRITE_FIXED: 2617 + ret = io_write_prep(req, &iovec, &iter, true); 2618 + break; 2619 + case IORING_OP_SENDMSG: 2620 + ret = io_sendmsg_prep(req, io); 2621 + break; 2622 + case IORING_OP_RECVMSG: 2623 + ret = io_recvmsg_prep(req, io); 2624 + break; 2625 + case IORING_OP_CONNECT: 2626 + ret = io_connect_prep(req, io); 2627 + break; 2628 + case IORING_OP_TIMEOUT: 2629 + return io_timeout_prep(req, io, false); 2630 + case IORING_OP_LINK_TIMEOUT: 2631 + return io_timeout_prep(req, io, true); 2632 + default: 2633 + req->io = io; 2634 + return 0; 2635 + } 2636 + 2637 + if (ret < 0) 2638 + return ret; 2639 + 2640 + req->io = io; 2641 + io_req_map_io(req, ret, iovec, inline_vecs, &iter); 2642 + return 0; 2643 + } 2644 + 2845 2645 static int io_req_defer(struct io_kiocb *req) 
2846 2646 { 2847 - struct io_uring_sqe *sqe_copy; 2848 2647 struct io_ring_ctx *ctx = req->ctx; 2648 + struct io_async_ctx *io; 2649 + int ret; 2849 2650 2850 2651 /* Still need defer if there is pending req in defer list. */ 2851 2652 if (!req_need_defer(req) && list_empty(&ctx->defer_list)) 2852 2653 return 0; 2853 2654 2854 - sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); 2855 - if (!sqe_copy) 2655 + io = kmalloc(sizeof(*io), GFP_KERNEL); 2656 + if (!io) 2856 2657 return -EAGAIN; 2658 + 2659 + ret = io_req_defer_prep(req, io); 2660 + if (ret < 0) { 2661 + kfree(io); 2662 + return ret; 2663 + } 2857 2664 2858 2665 spin_lock_irq(&ctx->completion_lock); 2859 2666 if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { 2860 2667 spin_unlock_irq(&ctx->completion_lock); 2861 - kfree(sqe_copy); 2862 2668 return 0; 2863 2669 } 2864 - 2865 - memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy)); 2866 - req->flags |= REQ_F_FREE_SQE; 2867 - req->sqe = sqe_copy; 2868 2670 2869 2671 trace_io_uring_defer(ctx, req, req->user_data); 2870 2672 list_add_tail(&req->list, &ctx->defer_list); ··· 3166 2876 * We don't expect the list to be empty, that will only happen if we 3167 2877 * race with the completion of the linked work. 
3168 2878 */ 3169 - if (!list_empty(&req->list)) { 3170 - prev = list_entry(req->list.prev, struct io_kiocb, link_list); 2879 + if (!list_empty(&req->link_list)) { 2880 + prev = list_entry(req->link_list.prev, struct io_kiocb, 2881 + link_list); 3171 2882 if (refcount_inc_not_zero(&prev->refs)) { 3172 - list_del_init(&req->list); 2883 + list_del_init(&req->link_list); 3173 2884 prev->flags &= ~REQ_F_LINK_TIMEOUT; 3174 2885 } else 3175 2886 prev = NULL; ··· 3200 2909 * we got a chance to setup the timer 3201 2910 */ 3202 2911 spin_lock_irq(&ctx->completion_lock); 3203 - if (!list_empty(&req->list)) { 3204 - struct io_timeout_data *data = req->timeout.data; 2912 + if (!list_empty(&req->link_list)) { 2913 + struct io_timeout_data *data = &req->io->timeout; 3205 2914 3206 2915 data->timer.function = io_link_timeout_fn; 3207 2916 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), ··· 3220 2929 if (!(req->flags & REQ_F_LINK)) 3221 2930 return NULL; 3222 2931 3223 - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); 2932 + nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, 2933 + link_list); 3224 2934 if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT) 3225 2935 return NULL; 3226 2936 ··· 3245 2953 */ 3246 2954 if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || 3247 2955 (req->flags & REQ_F_MUST_PUNT))) { 3248 - struct io_uring_sqe *sqe_copy; 3249 - 3250 - sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL); 3251 - if (!sqe_copy) 3252 - goto err; 3253 - 3254 - req->sqe = sqe_copy; 3255 - req->flags |= REQ_F_FREE_SQE; 3256 - 3257 2956 if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { 3258 2957 ret = io_grab_files(req); 3259 2958 if (ret) ··· 3313 3030 3314 3031 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) 3315 3032 3316 - static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, 3033 + static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, 3317 
3034 struct io_kiocb **link) 3318 3035 { 3319 3036 struct io_ring_ctx *ctx = req->ctx; ··· 3332 3049 err_req: 3333 3050 io_cqring_add_event(req, ret); 3334 3051 io_double_put_req(req); 3335 - return; 3052 + return false; 3336 3053 } 3337 3054 3338 3055 /* ··· 3344 3061 */ 3345 3062 if (*link) { 3346 3063 struct io_kiocb *prev = *link; 3347 - struct io_uring_sqe *sqe_copy; 3064 + struct io_async_ctx *io; 3348 3065 3349 3066 if (req->sqe->flags & IOSQE_IO_DRAIN) 3350 3067 (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; 3351 3068 3352 - if (READ_ONCE(req->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { 3353 - ret = io_timeout_setup(req); 3354 - /* common setup allows offset being set, we don't */ 3355 - if (!ret && req->sqe->off) 3356 - ret = -EINVAL; 3357 - if (ret) { 3358 - prev->flags |= REQ_F_FAIL_LINK; 3359 - goto err_req; 3360 - } 3361 - } 3362 - 3363 - sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL); 3364 - if (!sqe_copy) { 3069 + io = kmalloc(sizeof(*io), GFP_KERNEL); 3070 + if (!io) { 3365 3071 ret = -EAGAIN; 3366 3072 goto err_req; 3367 3073 } 3368 3074 3369 - req->sqe = sqe_copy; 3370 - req->flags |= REQ_F_FREE_SQE; 3075 + ret = io_req_defer_prep(req, io); 3076 + if (ret) { 3077 + kfree(io); 3078 + prev->flags |= REQ_F_FAIL_LINK; 3079 + goto err_req; 3080 + } 3371 3081 trace_io_uring_link(ctx, req, prev); 3372 - list_add_tail(&req->list, &prev->link_list); 3082 + list_add_tail(&req->link_list, &prev->link_list); 3373 3083 } else if (req->sqe->flags & IOSQE_IO_LINK) { 3374 3084 req->flags |= REQ_F_LINK; 3375 3085 ··· 3371 3095 } else { 3372 3096 io_queue_sqe(req); 3373 3097 } 3098 + 3099 + return true; 3374 3100 } 3375 3101 3376 3102 /* ··· 3391 3113 * Start submission side cache. 
3392 3114 */ 3393 3115 static void io_submit_state_start(struct io_submit_state *state, 3394 - struct io_ring_ctx *ctx, unsigned max_ios) 3116 + unsigned int max_ios) 3395 3117 { 3396 3118 blk_start_plug(&state->plug); 3397 3119 state->free_reqs = 0; ··· 3475 3197 return -EBUSY; 3476 3198 3477 3199 if (nr > IO_PLUG_THRESHOLD) { 3478 - io_submit_state_start(&state, ctx, nr); 3200 + io_submit_state_start(&state, nr); 3479 3201 statep = &state; 3480 3202 } 3481 3203 ··· 3502 3224 } 3503 3225 } 3504 3226 3227 + submitted++; 3505 3228 sqe_flags = req->sqe->flags; 3506 3229 3507 3230 req->ring_file = ring_file; ··· 3512 3233 req->needs_fixed_file = async; 3513 3234 trace_io_uring_submit_sqe(ctx, req->sqe->user_data, 3514 3235 true, async); 3515 - io_submit_sqe(req, statep, &link); 3516 - submitted++; 3517 - 3236 + if (!io_submit_sqe(req, statep, &link)) 3237 + break; 3518 3238 /* 3519 3239 * If previous wasn't linked and we have a linked command, 3520 3240 * that's the end of the chain. Submit the previous link. ··· 4641 4363 free_uid(ctx->user); 4642 4364 put_cred(ctx->creds); 4643 4365 kfree(ctx->completions); 4366 + kfree(ctx->cancel_hash); 4644 4367 kmem_cache_free(req_cachep, ctx->fallback_req); 4645 4368 kfree(ctx); 4646 4369 } ··· 5038 4759 ctx->compat = in_compat_syscall(); 5039 4760 ctx->account_mem = account_mem; 5040 4761 ctx->user = user; 5041 - ctx->creds = prepare_creds(); 4762 + ctx->creds = get_current_cred(); 5042 4763 5043 4764 ret = io_allocate_scq_urings(ctx, p); 5044 4765 if (ret) ··· 5073 4794 if (ret < 0) 5074 4795 goto err; 5075 4796 5076 - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP; 4797 + p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | 4798 + IORING_FEAT_SUBMIT_STABLE; 5077 4799 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); 5078 4800 return ret; 5079 4801 err:
+10 -14
include/linux/blkdev.h
··· 357 357 #define BLK_ALL_ZONES ((unsigned int)-1) 358 358 int blkdev_report_zones(struct block_device *bdev, sector_t sector, 359 359 unsigned int nr_zones, report_zones_cb cb, void *data); 360 - 361 - extern unsigned int blkdev_nr_zones(struct block_device *bdev); 360 + unsigned int blkdev_nr_zones(struct gendisk *disk); 362 361 extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, 363 362 sector_t sectors, sector_t nr_sectors, 364 363 gfp_t gfp_mask); ··· 370 371 371 372 #else /* CONFIG_BLK_DEV_ZONED */ 372 373 373 - static inline unsigned int blkdev_nr_zones(struct block_device *bdev) 374 - { 375 - return 0; 376 - } 377 - 378 - static inline int blk_revalidate_disk_zones(struct gendisk *disk) 374 + static inline unsigned int blkdev_nr_zones(struct gendisk *disk) 379 375 { 380 376 return 0; 381 377 } ··· 498 504 /* 499 505 * Zoned block device information for request dispatch control. 500 506 * nr_zones is the total number of zones of the device. This is always 501 - * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones 502 - * bits which indicates if a zone is conventional (bit clear) or 503 - * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones 507 + * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones 508 + * bits which indicates if a zone is conventional (bit set) or 509 + * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones 504 510 * bits which indicates if a zone is write locked, that is, if a write 505 511 * request targeting the zone was dispatched. All three fields are 506 512 * initialized by the low level device driver (e.g. scsi/sd.c). ··· 513 519 * blk_mq_unfreeze_queue(). 
514 520 */ 515 521 unsigned int nr_zones; 516 - unsigned long *seq_zones_bitmap; 522 + unsigned long *conv_zones_bitmap; 517 523 unsigned long *seq_zones_wlock; 518 524 #endif /* CONFIG_BLK_DEV_ZONED */ 519 525 ··· 718 724 static inline bool blk_queue_zone_is_seq(struct request_queue *q, 719 725 sector_t sector) 720 726 { 721 - if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap) 727 + if (!blk_queue_is_zoned(q)) 722 728 return false; 723 - return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap); 729 + if (!q->conv_zones_bitmap) 730 + return true; 731 + return !test_bit(blk_queue_zone_no(q, sector), q->conv_zones_bitmap); 724 732 } 725 733 #else /* CONFIG_BLK_DEV_ZONED */ 726 734 static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
+10 -12
include/linux/bvec.h
··· 87 87 static inline bool bvec_iter_advance(const struct bio_vec *bv, 88 88 struct bvec_iter *iter, unsigned bytes) 89 89 { 90 + unsigned int idx = iter->bi_idx; 91 + 90 92 if (WARN_ONCE(bytes > iter->bi_size, 91 93 "Attempted to advance past end of bvec iter\n")) { 92 94 iter->bi_size = 0; 93 95 return false; 94 96 } 95 97 96 - while (bytes) { 97 - const struct bio_vec *cur = bv + iter->bi_idx; 98 - unsigned len = min3(bytes, iter->bi_size, 99 - cur->bv_len - iter->bi_bvec_done); 98 + iter->bi_size -= bytes; 99 + bytes += iter->bi_bvec_done; 100 100 101 - bytes -= len; 102 - iter->bi_size -= len; 103 - iter->bi_bvec_done += len; 104 - 105 - if (iter->bi_bvec_done == cur->bv_len) { 106 - iter->bi_bvec_done = 0; 107 - iter->bi_idx++; 108 - } 101 + while (bytes && bytes >= bv[idx].bv_len) { 102 + bytes -= bv[idx].bv_len; 103 + idx++; 109 104 } 105 + 106 + iter->bi_idx = idx; 107 + iter->bi_bvec_done = bytes; 110 108 return true; 111 109 } 112 110
+13 -7
include/linux/socket.h
··· 378 378 extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, 379 379 unsigned int vlen, unsigned int flags, 380 380 bool forbid_cmsg_compat); 381 - extern long __sys_sendmsg_sock(struct socket *sock, 382 - struct user_msghdr __user *msg, 381 + extern long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg, 383 382 unsigned int flags); 384 - extern long __sys_recvmsg_sock(struct socket *sock, 385 - struct user_msghdr __user *msg, 383 + extern long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, 384 + struct user_msghdr __user *umsg, 385 + struct sockaddr __user *uaddr, 386 386 unsigned int flags); 387 + extern int sendmsg_copy_msghdr(struct msghdr *msg, 388 + struct user_msghdr __user *umsg, unsigned flags, 389 + struct iovec **iov); 390 + extern int recvmsg_copy_msghdr(struct msghdr *msg, 391 + struct user_msghdr __user *umsg, unsigned flags, 392 + struct sockaddr __user **uaddr, 393 + struct iovec **iov); 387 394 388 395 /* helpers which do the actual work for syscalls */ 389 396 extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, ··· 406 399 int __user *upeer_addrlen, int flags); 407 400 extern int __sys_socket(int family, int type, int protocol); 408 401 extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); 409 - extern int __sys_connect_file(struct file *file, 410 - struct sockaddr __user *uservaddr, int addrlen, 411 - int file_flags); 402 + extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, 403 + int addrlen, int file_flags); 412 404 extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, 413 405 int addrlen); 414 406 extern int __sys_listen(int fd, int backlog);
+1
include/uapi/linux/io_uring.h
··· 157 157 */ 158 158 #define IORING_FEAT_SINGLE_MMAP (1U << 0) 159 159 #define IORING_FEAT_NODROP (1U << 1) 160 + #define IORING_FEAT_SUBMIT_STABLE (1U << 2) 160 161 161 162 /* 162 163 * io_uring_register(2) opcodes and arguments
+25 -51
net/socket.c
··· 1826 1826 * include the -EINPROGRESS status for such sockets. 1827 1827 */ 1828 1828 1829 - int __sys_connect_file(struct file *file, struct sockaddr __user *uservaddr, 1829 + int __sys_connect_file(struct file *file, struct sockaddr_storage *address, 1830 1830 int addrlen, int file_flags) 1831 1831 { 1832 1832 struct socket *sock; 1833 - struct sockaddr_storage address; 1834 1833 int err; 1835 1834 1836 1835 sock = sock_from_file(file, &err); 1837 1836 if (!sock) 1838 1837 goto out; 1839 - err = move_addr_to_kernel(uservaddr, addrlen, &address); 1840 - if (err < 0) 1841 - goto out; 1842 1838 1843 1839 err = 1844 - security_socket_connect(sock, (struct sockaddr *)&address, addrlen); 1840 + security_socket_connect(sock, (struct sockaddr *)address, addrlen); 1845 1841 if (err) 1846 1842 goto out; 1847 1843 1848 - err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, 1844 + err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen, 1849 1845 sock->file->f_flags | file_flags); 1850 1846 out: 1851 1847 return err; ··· 1854 1858 1855 1859 f = fdget(fd); 1856 1860 if (f.file) { 1857 - ret = __sys_connect_file(f.file, uservaddr, addrlen, 0); 1861 + struct sockaddr_storage address; 1862 + 1863 + ret = move_addr_to_kernel(uservaddr, addrlen, &address); 1864 + if (!ret) 1865 + ret = __sys_connect_file(f.file, &address, addrlen, 0); 1858 1866 if (f.flags) 1859 1867 fput(f.file); 1860 1868 } ··· 2346 2346 return err; 2347 2347 } 2348 2348 2349 - static int sendmsg_copy_msghdr(struct msghdr *msg, 2350 - struct user_msghdr __user *umsg, unsigned flags, 2351 - struct iovec **iov) 2349 + int sendmsg_copy_msghdr(struct msghdr *msg, 2350 + struct user_msghdr __user *umsg, unsigned flags, 2351 + struct iovec **iov) 2352 2352 { 2353 2353 int err; 2354 2354 ··· 2390 2390 /* 2391 2391 * BSD sendmsg interface 2392 2392 */ 2393 - long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *umsg, 2393 + long __sys_sendmsg_sock(struct socket *sock, 
struct msghdr *msg, 2394 2394 unsigned int flags) 2395 2395 { 2396 - struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; 2397 - struct sockaddr_storage address; 2398 - struct msghdr msg = { .msg_name = &address }; 2399 - ssize_t err; 2400 - 2401 - err = sendmsg_copy_msghdr(&msg, umsg, flags, &iov); 2402 - if (err) 2403 - return err; 2404 2396 /* disallow ancillary data requests from this path */ 2405 - if (msg.msg_control || msg.msg_controllen) { 2406 - err = -EINVAL; 2407 - goto out; 2408 - } 2397 + if (msg->msg_control || msg->msg_controllen) 2398 + return -EINVAL; 2409 2399 2410 - err = ____sys_sendmsg(sock, &msg, flags, NULL, 0); 2411 - out: 2412 - kfree(iov); 2413 - return err; 2400 + return ____sys_sendmsg(sock, msg, flags, NULL, 0); 2414 2401 } 2415 2402 2416 2403 long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, ··· 2503 2516 return __sys_sendmmsg(fd, mmsg, vlen, flags, true); 2504 2517 } 2505 2518 2506 - static int recvmsg_copy_msghdr(struct msghdr *msg, 2507 - struct user_msghdr __user *umsg, unsigned flags, 2508 - struct sockaddr __user **uaddr, 2509 - struct iovec **iov) 2519 + int recvmsg_copy_msghdr(struct msghdr *msg, 2520 + struct user_msghdr __user *umsg, unsigned flags, 2521 + struct sockaddr __user **uaddr, 2522 + struct iovec **iov) 2510 2523 { 2511 2524 ssize_t err; 2512 2525 ··· 2596 2609 * BSD recvmsg interface 2597 2610 */ 2598 2611 2599 - long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *umsg, 2600 - unsigned int flags) 2612 + long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, 2613 + struct user_msghdr __user *umsg, 2614 + struct sockaddr __user *uaddr, unsigned int flags) 2601 2615 { 2602 - struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; 2603 - struct sockaddr_storage address; 2604 - struct msghdr msg = { .msg_name = &address }; 2605 - struct sockaddr __user *uaddr; 2606 - ssize_t err; 2607 - 2608 - err = recvmsg_copy_msghdr(&msg, umsg, flags, &uaddr, &iov); 2609 - if 
(err) 2610 - return err; 2611 2616 /* disallow ancillary data requests from this path */ 2612 - if (msg.msg_control || msg.msg_controllen) { 2613 - err = -EINVAL; 2614 - goto out; 2615 - } 2617 + if (msg->msg_control || msg->msg_controllen) 2618 + return -EINVAL; 2616 2619 2617 - err = ____sys_recvmsg(sock, &msg, umsg, uaddr, flags, 0); 2618 - out: 2619 - kfree(iov); 2620 - return err; 2620 + return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0); 2621 2621 } 2622 2622 2623 2623 long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,