Merge tag 'block-6.13-20242901' of git://git.kernel.dk/linux

Pull more block updates from Jens Axboe:

- NVMe pull request via Keith:
    - Use correct srcu list traversal (Breno)
    - Scatter-gather support for metadata (Keith)
    - Fabrics shutdown race condition fix (Nilay)
    - Persistent reservations updates (Guixin)

- Add the required bits for MD atomic write support for raid0/1/10

- Correct return value for unknown opcode in ublk

- Fix deadlock with zone revalidation

- Fix for the io priority request vs bio cleanups

- Use the correct unsigned int type for various limit helpers

- Fix for a race in loop

- Cleanup blk_rq_prep_clone() to prevent uninit-value warning and make
it easier for actual humans to read

- Fix potential UAF when iterating tags

- A few fixes for bfq-iosched UAF issues

- Fix for brd discard not decrementing the allocated page count

- Various little fixes and cleanups

* tag 'block-6.13-20242901' of git://git.kernel.dk/linux: (36 commits)
brd: decrease the number of allocated pages which discarded
block, bfq: fix bfqq uaf in bfq_limit_depth()
block: Don't allow an atomic write be truncated in blkdev_write_iter()
mq-deadline: don't call req_get_ioprio from the I/O completion handler
block: Prevent potential deadlock in blk_revalidate_disk_zones()
block: Remove extra part pointer NULLify in blk_rq_init()
nvme: tuning pr code by using defined structs and macros
nvme: introduce change ptpl and iekey definition
block: return bool from get_disk_ro and bdev_read_only
block: remove a duplicate definition for bdev_read_only
block: return bool from blk_rq_aligned
block: return unsigned int from blk_lim_dma_alignment_and_pad
block: return unsigned int from queue_dma_alignment
block: return unsigned int from bdev_io_opt
block: req->bio is always set in the merge code
block: don't bother checking the data direction for merges
block: blk-mq: fix uninit-value in blk_rq_prep_clone and refactor
Revert "block, bfq: merge bfq_release_process_ref() into bfq_put_cooperator()"
md/raid10: Atomic write support
md/raid1: Atomic write support
...

+549 -194
+1
block/bfq-cgroup.c
··· 736 736 */ 737 737 bfq_put_cooperator(sync_bfqq); 738 738 bic_set_bfqq(bic, NULL, true, act_idx); 739 + bfq_release_process_ref(bfqd, sync_bfqq); 739 740 } 740 741 } 741 742
+28 -15
block/bfq-iosched.c
··· 582 582 #define BFQ_LIMIT_INLINE_DEPTH 16 583 583 584 584 #ifdef CONFIG_BFQ_GROUP_IOSCHED 585 - static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) 585 + static bool bfqq_request_over_limit(struct bfq_data *bfqd, 586 + struct bfq_io_cq *bic, blk_opf_t opf, 587 + unsigned int act_idx, int limit) 586 588 { 587 - struct bfq_data *bfqd = bfqq->bfqd; 588 - struct bfq_entity *entity = &bfqq->entity; 589 589 struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH]; 590 590 struct bfq_entity **entities = inline_entities; 591 - int depth, level, alloc_depth = BFQ_LIMIT_INLINE_DEPTH; 592 - int class_idx = bfqq->ioprio_class - 1; 591 + int alloc_depth = BFQ_LIMIT_INLINE_DEPTH; 593 592 struct bfq_sched_data *sched_data; 593 + struct bfq_entity *entity; 594 + struct bfq_queue *bfqq; 594 595 unsigned long wsum; 595 596 bool ret = false; 596 - 597 - if (!entity->on_st_or_in_serv) 598 - return false; 597 + int depth; 598 + int level; 599 599 600 600 retry: 601 601 spin_lock_irq(&bfqd->lock); 602 + bfqq = bic_to_bfqq(bic, op_is_sync(opf), act_idx); 603 + if (!bfqq) 604 + goto out; 605 + 606 + entity = &bfqq->entity; 607 + if (!entity->on_st_or_in_serv) 608 + goto out; 609 + 602 610 /* +1 for bfqq entity, root cgroup not included */ 603 611 depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1; 604 612 if (depth > alloc_depth) { ··· 651 643 * class. 652 644 */ 653 645 wsum = 0; 654 - for (i = 0; i <= class_idx; i++) { 646 + for (i = 0; i <= bfqq->ioprio_class - 1; i++) { 655 647 wsum = wsum * IOPRIO_BE_NR + 656 648 sched_data->service_tree[i].wsum; 657 649 } ··· 674 666 return ret; 675 667 } 676 668 #else 677 - static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) 669 + static bool bfqq_request_over_limit(struct bfq_data *bfqd, 670 + struct bfq_io_cq *bic, blk_opf_t opf, 671 + unsigned int act_idx, int limit) 678 672 { 679 673 return false; 680 674 } ··· 714 704 } 715 705 716 706 for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { 717 - struct bfq_queue *bfqq = 718 - bic_to_bfqq(bic, op_is_sync(opf), act_idx); 707 + /* Fast path to check if bfqq is already allocated. */ 708 + if (!bic_to_bfqq(bic, op_is_sync(opf), act_idx)) 709 + continue; 719 710 720 711 /* 721 712 * Does queue (or any parent entity) exceed number of ··· 724 713 * limit depth so that it cannot consume more 725 714 * available requests and thus starve other entities. 726 715 */ 727 - if (bfqq && bfqq_request_over_limit(bfqq, limit)) { 716 + if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) { 728 717 depth = 1; 729 718 break; 730 719 } ··· 5445 5434 bfq_put_queue(__bfqq); 5446 5435 __bfqq = next; 5447 5436 } 5448 - 5449 - bfq_release_process_ref(bfqq->bfqd, bfqq); 5450 5437 } 5451 5438 5452 5439 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ··· 5457 5448 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); 5458 5449 5459 5450 bfq_put_cooperator(bfqq); 5451 + 5452 + bfq_release_process_ref(bfqd, bfqq); 5460 5453 } 5461 5454 5462 5455 static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, ··· 6745 6734 bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); 6746 6735 6747 6736 bfq_put_cooperator(bfqq); 6737 + 6738 + bfq_release_process_ref(bfqq->bfqd, bfqq); 6748 6739 return NULL; 6749 6740 } 6750 6741
+7 -28
block/blk-merge.c
··· 864 864 if (req_op(req) != req_op(next)) 865 865 return NULL; 866 866 867 - if (rq_data_dir(req) != rq_data_dir(next)) 867 + if (req->bio->bi_write_hint != next->bio->bi_write_hint) 868 868 return NULL; 869 - 870 - if (req->bio && next->bio) { 871 - /* Don't merge requests with different write hints. */ 872 - if (req->bio->bi_write_hint != next->bio->bi_write_hint) 873 - return NULL; 874 - if (req->bio->bi_ioprio != next->bio->bi_ioprio) 875 - return NULL; 876 - } 877 - 869 + if (req->bio->bi_ioprio != next->bio->bi_ioprio) 870 + return NULL; 878 871 if (!blk_atomic_write_mergeable_rqs(req, next)) 879 872 return NULL; 880 873 ··· 979 986 if (req_op(rq) != bio_op(bio)) 980 987 return false; 981 988 982 - /* different data direction or already started, don't merge */ 983 - if (bio_data_dir(bio) != rq_data_dir(rq)) 984 - return false; 985 - 986 - /* don't merge across cgroup boundaries */ 987 989 if (!blk_cgroup_mergeable(rq, bio)) 988 990 return false; 989 - 990 - /* only merge integrity protected bio into ditto rq */ 991 991 if (blk_integrity_merge_bio(rq->q, rq, bio) == false) 992 992 return false; 993 - 994 - /* Only merge if the crypt contexts are compatible */ 995 993 if (!bio_crypt_rq_ctx_compatible(rq, bio)) 996 994 return false; 997 - 998 - if (rq->bio) { 999 - /* Don't merge requests with different write hints. */ 1000 - if (rq->bio->bi_write_hint != bio->bi_write_hint) 1001 - return false; 1002 - if (rq->bio->bi_ioprio != bio->bi_ioprio) 1003 - return false; 1004 - } 1005 - 995 + if (rq->bio->bi_write_hint != bio->bi_write_hint) 996 + return false; 997 + if (rq->bio->bi_ioprio != bio->bi_ioprio) 998 + return false; 1006 999 if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) 1007 1000 return false; 1008 1001
+6 -8
block/blk-mq.c
··· 388 388 rq->tag = BLK_MQ_NO_TAG; 389 389 rq->internal_tag = BLK_MQ_NO_TAG; 390 390 rq->start_time_ns = blk_time_get_ns(); 391 - rq->part = NULL; 392 391 blk_crypto_rq_set_defaults(rq); 393 392 } 394 393 EXPORT_SYMBOL(blk_rq_init); ··· 3272 3273 int (*bio_ctr)(struct bio *, struct bio *, void *), 3273 3274 void *data) 3274 3275 { 3275 - struct bio *bio, *bio_src; 3276 + struct bio *bio_src; 3276 3277 3277 3278 if (!bs) 3278 3279 bs = &fs_bio_set; 3279 3280 3280 3281 __rq_for_each_bio(bio_src, rq_src) { 3281 - bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, 3282 - bs); 3282 + struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src, 3283 + gfp_mask, bs); 3283 3284 if (!bio) 3284 3285 goto free_and_out; 3285 3286 3286 - if (bio_ctr && bio_ctr(bio, bio_src, data)) 3287 + if (bio_ctr && bio_ctr(bio, bio_src, data)) { 3288 + bio_put(bio); 3287 3289 goto free_and_out; 3290 + } 3288 3291 3289 3292 if (rq->bio) { 3290 3293 rq->biotail->bi_next = bio; ··· 3294 3293 } else { 3295 3294 rq->bio = rq->biotail = bio; 3296 3295 } 3297 - bio = NULL; 3298 3296 } 3299 3297 3300 3298 /* Copy attributes of the original request to the clone request. */ ··· 3311 3311 return 0; 3312 3312 3313 3313 free_and_out: 3314 - if (bio) 3315 - bio_put(bio); 3316 3314 blk_rq_unprep_clone(rq); 3317 3315 3318 3316 return -ENOMEM;
+139 -2
block/blk-settings.c
··· 178 178 if (!lim->atomic_write_hw_max) 179 179 goto unsupported; 180 180 181 + if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_min))) 182 + goto unsupported; 183 + 184 + if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_max))) 185 + goto unsupported; 186 + 187 + if (WARN_ON_ONCE(lim->atomic_write_hw_unit_min > 188 + lim->atomic_write_hw_unit_max)) 189 + goto unsupported; 190 + 191 + if (WARN_ON_ONCE(lim->atomic_write_hw_unit_max > 192 + lim->atomic_write_hw_max)) 193 + goto unsupported; 194 + 181 195 boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; 182 196 183 197 if (boundary_sectors) { 198 + if (WARN_ON_ONCE(lim->atomic_write_hw_max > 199 + lim->atomic_write_hw_boundary)) 200 + goto unsupported; 184 201 /* 185 202 * A feature of boundary support is that it disallows bios to 186 203 * be merged which would result in a merged request which ··· 264 247 */ 265 248 if (lim->io_min < lim->physical_block_size) 266 249 lim->io_min = lim->physical_block_size; 250 + 251 + /* 252 + * The optimal I/O size may not be aligned to physical block size 253 + * (because it may be limited by dma engines which have no clue about 254 + * block size of the disks attached to them), so we round it down here. 255 + */ 256 + lim->io_opt = round_down(lim->io_opt, lim->physical_block_size); 267 257 268 258 /* 269 259 * max_hw_sectors has a somewhat weird default for historical reason, ··· 482 458 /* Why are these in bytes, not sectors? */ 483 459 alignment = lim->discard_alignment >> SECTOR_SHIFT; 484 460 granularity = lim->discard_granularity >> SECTOR_SHIFT; 485 - if (!granularity) 486 - return 0; 487 461 488 462 /* Offset of the partition start in 'granularity' sectors */ 489 463 offset = sector_div(sector, granularity); ··· 499 477 if (sectors < PAGE_SIZE >> SECTOR_SHIFT) 500 478 sectors = PAGE_SIZE >> SECTOR_SHIFT; 501 479 return sectors; 480 + } 481 + 482 + /* Check if second and later bottom devices are compliant */ 483 + static bool blk_stack_atomic_writes_tail(struct queue_limits *t, 484 + struct queue_limits *b) 485 + { 486 + /* We're not going to support different boundary sizes.. yet */ 487 + if (t->atomic_write_hw_boundary != b->atomic_write_hw_boundary) 488 + return false; 489 + 490 + /* Can't support this */ 491 + if (t->atomic_write_hw_unit_min > b->atomic_write_hw_unit_max) 492 + return false; 493 + 494 + /* Or this */ 495 + if (t->atomic_write_hw_unit_max < b->atomic_write_hw_unit_min) 496 + return false; 497 + 498 + t->atomic_write_hw_max = min(t->atomic_write_hw_max, 499 + b->atomic_write_hw_max); 500 + t->atomic_write_hw_unit_min = max(t->atomic_write_hw_unit_min, 501 + b->atomic_write_hw_unit_min); 502 + t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max, 503 + b->atomic_write_hw_unit_max); 504 + return true; 505 + } 506 + 507 + /* Check for valid boundary of first bottom device */ 508 + static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t, 509 + struct queue_limits *b) 510 + { 511 + /* 512 + * Ensure atomic write boundary is aligned with chunk sectors. Stacked 513 + * devices store chunk sectors in t->io_min. 
514 + */ 515 + if (b->atomic_write_hw_boundary > t->io_min && 516 + b->atomic_write_hw_boundary % t->io_min) 517 + return false; 518 + if (t->io_min > b->atomic_write_hw_boundary && 519 + t->io_min % b->atomic_write_hw_boundary) 520 + return false; 521 + 522 + t->atomic_write_hw_boundary = b->atomic_write_hw_boundary; 523 + return true; 524 + } 525 + 526 + 527 + /* Check stacking of first bottom device */ 528 + static bool blk_stack_atomic_writes_head(struct queue_limits *t, 529 + struct queue_limits *b) 530 + { 531 + if (b->atomic_write_hw_boundary && 532 + !blk_stack_atomic_writes_boundary_head(t, b)) 533 + return false; 534 + 535 + if (t->io_min <= SECTOR_SIZE) { 536 + /* No chunk sectors, so use bottom device values directly */ 537 + t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; 538 + t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; 539 + t->atomic_write_hw_max = b->atomic_write_hw_max; 540 + return true; 541 + } 542 + 543 + /* 544 + * Find values for limits which work for chunk size. 545 + * b->atomic_write_hw_unit_{min, max} may not be aligned with chunk 546 + * size (t->io_min), as chunk size is not restricted to a power-of-2. 547 + * So we need to find highest power-of-2 which works for the chunk 548 + * size. 549 + * As an example scenario, we could have b->unit_max = 16K and 550 + * t->io_min = 24K. For this case, reduce t->unit_max to a value 551 + * aligned with both limits, i.e. 8K in this example. 552 + */ 553 + t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; 554 + while (t->io_min % t->atomic_write_hw_unit_max) 555 + t->atomic_write_hw_unit_max /= 2; 556 + 557 + t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min, 558 + t->atomic_write_hw_unit_max); 559 + t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min); 560 + 561 + return true; 562 + } 563 + 564 + static void blk_stack_atomic_writes_limits(struct queue_limits *t, 565 + struct queue_limits *b) 566 + { 567 + if (!(t->features & BLK_FEAT_ATOMIC_WRITES_STACKED)) 568 + goto unsupported; 569 + 570 + if (!b->atomic_write_unit_min) 571 + goto unsupported; 572 + 573 + /* 574 + * If atomic_write_hw_max is set, we have already stacked 1x bottom 575 + * device, so check for compliance. 576 + */ 577 + if (t->atomic_write_hw_max) { 578 + if (!blk_stack_atomic_writes_tail(t, b)) 579 + goto unsupported; 580 + return; 581 + } 582 + 583 + if (!blk_stack_atomic_writes_head(t, b)) 584 + goto unsupported; 585 + return; 586 + 587 + unsupported: 588 + t->atomic_write_hw_max = 0; 589 + t->atomic_write_hw_unit_max = 0; 590 + t->atomic_write_hw_unit_min = 0; 591 + t->atomic_write_hw_boundary = 0; 592 + t->features &= ~BLK_FEAT_ATOMIC_WRITES_STACKED; 502 593 } 503 594 504 595 /** ··· 774 639 t->zone_write_granularity = 0; 775 640 t->max_zone_append_sectors = 0; 776 641 } 642 + blk_stack_atomic_writes_limits(t, b); 643 + 777 644 return ret; 778 645 } 779 646 EXPORT_SYMBOL(blk_stack_limits);
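The stacking logic added above shrinks the bottom device's atomic_write_hw_unit_max until it divides the top device's chunk size (held in t->io_min). A standalone userspace sketch of just that reduction step, using the 16K-unit / 24K-chunk example from the comment in blk_stack_atomic_writes_head(); the helper name is made up for illustration:

/*
 * Not kernel code: mirrors the unit_max reduction loop in
 * blk_stack_atomic_writes_head() above.
 */
#include <stdio.h>

static unsigned int stack_unit_max(unsigned int chunk_bytes,
                                   unsigned int bottom_unit_max)
{
        unsigned int unit_max = bottom_unit_max;

        /* Same as: while (t->io_min % t->atomic_write_hw_unit_max) ... /= 2 */
        while (chunk_bytes % unit_max)
                unit_max /= 2;
        return unit_max;
}

int main(void)
{
        /* A 16K unit_max against a 24K chunk ends up as 8K. */
        printf("%u\n", stack_unit_max(24 * 1024, 16 * 1024));
        return 0;
}
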
+2 -4
block/blk-sysfs.c
··· 810 810 * faster to shut down and is made fully functional here as 811 811 * request_queues for non-existent devices never get registered. 812 812 */ 813 - if (!blk_queue_init_done(q)) { 814 - blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); 815 - percpu_ref_switch_to_percpu(&q->q_usage_counter); 816 - } 813 + blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); 814 + percpu_ref_switch_to_percpu(&q->q_usage_counter); 817 815 818 816 return ret; 819 817
+10 -4
block/blk-zoned.c
··· 1551 1551 unsigned int nr_seq_zones, nr_conv_zones; 1552 1552 unsigned int pool_size; 1553 1553 struct queue_limits lim; 1554 + int ret; 1554 1555 1555 1556 disk->nr_zones = args->nr_zones; 1556 1557 disk->zone_capacity = args->zone_capacity; ··· 1602 1601 } 1603 1602 1604 1603 commit: 1605 - return queue_limits_commit_update(q, &lim); 1604 + blk_mq_freeze_queue(q); 1605 + ret = queue_limits_commit_update(q, &lim); 1606 + blk_mq_unfreeze_queue(q); 1607 + 1608 + return ret; 1606 1609 } 1607 1610 1608 1611 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, ··· 1821 1816 * Set the new disk zone parameters only once the queue is frozen and 1822 1817 * all I/Os are completed. 1823 1818 */ 1824 - blk_mq_freeze_queue(q); 1825 1819 if (ret > 0) 1826 1820 ret = disk_update_zone_resources(disk, &args); 1827 1821 else 1828 1822 pr_warn("%s: failed to revalidate zones\n", disk->disk_name); 1829 - if (ret) 1823 + if (ret) { 1824 + blk_mq_freeze_queue(q); 1830 1825 disk_free_zone_resources(disk); 1831 - blk_mq_unfreeze_queue(q); 1826 + blk_mq_unfreeze_queue(q); 1827 + } 1832 1828 1833 1829 return ret; 1834 1830 }
+4 -1
block/fops.c
··· 677 677 struct file *file = iocb->ki_filp; 678 678 struct inode *bd_inode = bdev_file_inode(file); 679 679 struct block_device *bdev = I_BDEV(bd_inode); 680 + bool atomic = iocb->ki_flags & IOCB_ATOMIC; 680 681 loff_t size = bdev_nr_bytes(bdev); 681 682 size_t shorted = 0; 682 683 ssize_t ret; ··· 697 696 if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) 698 697 return -EOPNOTSUPP; 699 698 700 - if (iocb->ki_flags & IOCB_ATOMIC) { 699 + if (atomic) { 701 700 ret = generic_atomic_write_valid(iocb, from); 702 701 if (ret) 703 702 return ret; ··· 705 704 706 705 size -= iocb->ki_pos; 707 706 if (iov_iter_count(from) > size) { 707 + if (atomic) 708 + return -EINVAL; 708 709 shorted = iov_iter_count(from) - size; 709 710 iov_iter_truncate(from, size); 710 711 }
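The fops.c hunk above makes blkdev_write_iter() fail an IOCB_ATOMIC write that would run past the end of the device with -EINVAL instead of silently shortening it. A minimal userspace sketch of issuing such a write with pwritev2() and RWF_ATOMIC; it assumes a 6.11+ kernel, and the fallback define only covers older userspace headers:

/* Writes 4K atomically at offset 0 of the given block device. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC 0x00000040
#endif

int main(int argc, char **argv)
{
        void *buf;
        ssize_t ret;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <blockdev>\n", argv[0]);
                return 1;
        }

        fd = open(argv[1], O_WRONLY | O_DIRECT);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /*
         * O_DIRECT needs an aligned buffer; 4K must fit within the device's
         * advertised atomic write unit limits.
         */
        if (posix_memalign(&buf, 4096, 4096)) {
                close(fd);
                return 1;
        }
        memset(buf, 0xab, 4096);

        struct iovec iov = { .iov_base = buf, .iov_len = 4096 };

        ret = pwritev2(fd, &iov, 1, 0, RWF_ATOMIC);
        if (ret < 0)
                perror("pwritev2(RWF_ATOMIC)");
        else
                printf("wrote %zd bytes atomically\n", ret);

        free(buf);
        close(fd);
        return ret == 4096 ? 0 : 1;
}
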
+3 -6
block/genhd.c
··· 742 742 * If the disk does not own the queue, allow using passthrough requests 743 743 * again. Else leave the queue frozen to fail all I/O. 744 744 */ 745 - if (!test_bit(GD_OWNS_QUEUE, &disk->state)) { 746 - blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); 745 + if (!test_bit(GD_OWNS_QUEUE, &disk->state)) 747 746 __blk_mq_unfreeze_queue(q, true); 748 - } else { 749 - if (queue_is_mq(q)) 750 - blk_mq_exit_queue(q); 751 - } 747 + else if (queue_is_mq(q)) 748 + blk_mq_exit_queue(q); 752 749 753 750 if (start_drain) 754 751 blk_unfreeze_release_lock(q, true, queue_dying);
+4 -9
block/mq-deadline.c
··· 685 685 686 686 prio = ioprio_class_to_prio[ioprio_class]; 687 687 per_prio = &dd->per_prio[prio]; 688 - if (!rq->elv.priv[0]) { 688 + if (!rq->elv.priv[0]) 689 689 per_prio->stats.inserted++; 690 - rq->elv.priv[0] = (void *)(uintptr_t)1; 691 - } 690 + rq->elv.priv[0] = per_prio; 692 691 693 692 if (blk_mq_sched_try_insert_merge(q, rq, free)) 694 693 return; ··· 752 753 */ 753 754 static void dd_finish_request(struct request *rq) 754 755 { 755 - struct request_queue *q = rq->q; 756 - struct deadline_data *dd = q->elevator->elevator_data; 757 - const u8 ioprio_class = dd_rq_ioclass(rq); 758 - const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; 759 - struct dd_per_prio *per_prio = &dd->per_prio[prio]; 756 + struct dd_per_prio *per_prio = rq->elv.priv[0]; 760 757 761 758 /* 762 759 * The block layer core may call dd_finish_request() without having 763 760 * called dd_insert_requests(). Skip requests that bypassed I/O 764 761 * scheduling. See also blk_mq_request_bypass_insert(). 765 762 */ 766 - if (rq->elv.priv[0]) 763 + if (per_prio) 767 764 atomic_inc(&per_prio->stats.completed); 768 765 } 769 766
+3 -1
drivers/block/brd.c
··· 231 231 xa_lock(&brd->brd_pages); 232 232 while (size >= PAGE_SIZE && aligned_sector < rd_size * 2) { 233 233 page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT); 234 - if (page) 234 + if (page) { 235 235 __free_page(page); 236 + brd->brd_nr_pages--; 237 + } 236 238 aligned_sector += PAGE_SECTORS; 237 239 size -= PAGE_SIZE; 238 240 }
+15 -15
drivers/block/loop.c
··· 770 770 &loop_attribute_group); 771 771 } 772 772 773 - static void loop_config_discard(struct loop_device *lo, 774 - struct queue_limits *lim) 773 + static void loop_get_discard_config(struct loop_device *lo, 774 + u32 *granularity, u32 *max_discard_sectors) 775 775 { 776 776 struct file *file = lo->lo_backing_file; 777 777 struct inode *inode = file->f_mapping->host; 778 - u32 granularity = 0, max_discard_sectors = 0; 779 778 struct kstatfs sbuf; 780 779 781 780 /* ··· 787 788 if (S_ISBLK(inode->i_mode)) { 788 789 struct block_device *bdev = I_BDEV(inode); 789 790 790 - max_discard_sectors = bdev_write_zeroes_sectors(bdev); 791 - granularity = bdev_discard_granularity(bdev); 791 + *max_discard_sectors = bdev_write_zeroes_sectors(bdev); 792 + *granularity = bdev_discard_granularity(bdev); 792 793 793 794 /* 794 795 * We use punch hole to reclaim the free space used by the 795 796 * image a.k.a. discard. 796 797 */ 797 798 } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { 798 - max_discard_sectors = UINT_MAX >> 9; 799 - granularity = sbuf.f_bsize; 799 + *max_discard_sectors = UINT_MAX >> 9; 800 + *granularity = sbuf.f_bsize; 800 801 } 801 - 802 - lim->max_hw_discard_sectors = max_discard_sectors; 803 - lim->max_write_zeroes_sectors = max_discard_sectors; 804 - if (max_discard_sectors) 805 - lim->discard_granularity = granularity; 806 - else 807 - lim->discard_granularity = 0; 808 802 } 809 803 810 804 struct loop_worker { ··· 983 991 struct inode *inode = file->f_mapping->host; 984 992 struct block_device *backing_bdev = NULL; 985 993 struct queue_limits lim; 994 + u32 granularity = 0, max_discard_sectors = 0; 986 995 987 996 if (S_ISBLK(inode->i_mode)) 988 997 backing_bdev = I_BDEV(inode); ··· 992 999 993 1000 if (!bsize) 994 1001 bsize = loop_default_blocksize(lo, backing_bdev); 1002 + 1003 + loop_get_discard_config(lo, &granularity, &max_discard_sectors); 995 1004 996 1005 lim = queue_limits_start_update(lo->lo_queue); 997 1006 lim.logical_block_size = bsize; ··· 1004 1009 lim.features |= BLK_FEAT_WRITE_CACHE; 1005 1010 if (backing_bdev && !bdev_nonrot(backing_bdev)) 1006 1011 lim.features |= BLK_FEAT_ROTATIONAL; 1007 - loop_config_discard(lo, &lim); 1012 + lim.max_hw_discard_sectors = max_discard_sectors; 1013 + lim.max_write_zeroes_sectors = max_discard_sectors; 1014 + if (max_discard_sectors) 1015 + lim.discard_granularity = granularity; 1016 + else 1017 + lim.discard_granularity = 0; 1008 1018 return queue_limits_commit_update(lo->lo_queue, &lim); 1009 1019 } 1010 1020
+1 -1
drivers/block/ublk_drv.c
··· 3041 3041 ret = ublk_ctrl_end_recovery(ub, cmd); 3042 3042 break; 3043 3043 default: 3044 - ret = -ENOTSUPP; 3044 + ret = -EOPNOTSUPP; 3045 3045 break; 3046 3046 } 3047 3047
+1
drivers/md/raid0.c
··· 384 384 lim.max_write_zeroes_sectors = mddev->chunk_sectors; 385 385 lim.io_min = mddev->chunk_sectors << 9; 386 386 lim.io_opt = lim.io_min * mddev->raid_disks; 387 + lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED; 387 388 err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); 388 389 if (err) { 389 390 queue_limits_cancel_update(mddev->gendisk->queue);
+18 -2
drivers/md/raid1.c
··· 1571 1571 continue; 1572 1572 } 1573 1573 if (is_bad) { 1574 - int good_sectors = first_bad - r1_bio->sector; 1574 + int good_sectors; 1575 + 1576 + /* 1577 + * We cannot atomically write this, so just 1578 + * error in that case. It could be possible to 1579 + * atomically write other mirrors, but the 1580 + * complexity of supporting that is not worth 1581 + * the benefit. 1582 + */ 1583 + if (bio->bi_opf & REQ_ATOMIC) { 1584 + error = -EIO; 1585 + goto err_handle; 1586 + } 1587 + 1588 + good_sectors = first_bad - r1_bio->sector; 1575 1589 if (good_sectors < max_sectors) 1576 1590 max_sectors = good_sectors; 1577 1591 } ··· 1671 1657 1672 1658 mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset); 1673 1659 mbio->bi_end_io = raid1_end_write_request; 1674 - mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); 1660 + mbio->bi_opf = bio_op(bio) | 1661 + (bio->bi_opf & (REQ_SYNC | REQ_FUA | REQ_ATOMIC)); 1675 1662 if (test_bit(FailFast, &rdev->flags) && 1676 1663 !test_bit(WriteMostly, &rdev->flags) && 1677 1664 conf->raid_disks - mddev->degraded > 1) ··· 3239 3224 3240 3225 md_init_stacking_limits(&lim); 3241 3226 lim.max_write_zeroes_sectors = 0; 3227 + lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED; 3242 3228 err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); 3243 3229 if (err) { 3244 3230 queue_limits_cancel_update(mddev->gendisk->queue);
+18 -2
drivers/md/raid10.c
··· 1255 1255 const enum req_op op = bio_op(bio); 1256 1256 const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 1257 1257 const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; 1258 + const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC; 1258 1259 unsigned long flags; 1259 1260 struct r10conf *conf = mddev->private; 1260 1261 struct md_rdev *rdev; ··· 1274 1273 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + 1275 1274 choose_data_offset(r10_bio, rdev)); 1276 1275 mbio->bi_end_io = raid10_end_write_request; 1277 - mbio->bi_opf = op | do_sync | do_fua; 1276 + mbio->bi_opf = op | do_sync | do_fua | do_atomic; 1278 1277 if (!replacement && test_bit(FailFast, 1279 1278 &conf->mirrors[devnum].rdev->flags) 1280 1279 && enough(conf, devnum)) ··· 1469 1468 continue; 1470 1469 } 1471 1470 if (is_bad) { 1472 - int good_sectors = first_bad - dev_sector; 1471 + int good_sectors; 1472 + 1473 + /* 1474 + * We cannot atomically write this, so just 1475 + * error in that case. It could be possible to 1476 + * atomically write other mirrors, but the 1477 + * complexity of supporting that is not worth 1478 + * the benefit. 1479 + */ 1480 + if (bio->bi_opf & REQ_ATOMIC) { 1481 + error = -EIO; 1482 + goto err_handle; 1483 + } 1484 + 1485 + good_sectors = first_bad - dev_sector; 1473 1486 if (good_sectors < max_sectors) 1474 1487 max_sectors = good_sectors; 1475 1488 } ··· 4040 4025 lim.max_write_zeroes_sectors = 0; 4041 4026 lim.io_min = mddev->chunk_sectors << 9; 4042 4027 lim.io_opt = lim.io_min * raid10_nr_stripes(conf); 4028 + lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED; 4043 4029 err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); 4044 4030 if (err) { 4045 4031 queue_limits_cancel_update(mddev->gendisk->queue);
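With BLK_FEAT_ATOMIC_WRITES_STACKED now set by raid0/1/10, the stacked atomic write limits are advertised to userspace, which can read them back through statx(). A sketch assuming 6.11+ kernel UAPI headers that provide STATX_WRITE_ATOMIC and the stx_atomic_write_* fields:

/* Prints the atomic write limits reported for a block device path. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/stat.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef STATX_WRITE_ATOMIC
#define STATX_WRITE_ATOMIC 0x00010000U
#endif

int main(int argc, char **argv)
{
        struct statx stx;

        if (argc != 2) {
                /* e.g. an md array such as /dev/md0 */
                fprintf(stderr, "usage: %s <blockdev>\n", argv[0]);
                return 1;
        }

        if (syscall(SYS_statx, AT_FDCWD, argv[1], 0, STATX_WRITE_ATOMIC, &stx)) {
                perror("statx");
                return 1;
        }

        if (!(stx.stx_mask & STATX_WRITE_ATOMIC)) {
                printf("no atomic write limits reported\n");
                return 0;
        }

        printf("atomic write unit min:     %u bytes\n", stx.stx_atomic_write_unit_min);
        printf("atomic write unit max:     %u bytes\n", stx.stx_atomic_write_unit_max);
        printf("atomic write segments max: %u\n", stx.stx_atomic_write_segments_max);
        return 0;
}
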
+15 -7
drivers/nvme/host/core.c
··· 1305 1305 queue_delayed_work(nvme_wq, &ctrl->ka_work, delay); 1306 1306 } 1307 1307 1308 - static void nvme_keep_alive_finish(struct request *rq, 1309 - blk_status_t status, struct nvme_ctrl *ctrl) 1308 + static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq, 1309 + blk_status_t status) 1310 1310 { 1311 + struct nvme_ctrl *ctrl = rq->end_io_data; 1311 1312 unsigned long rtt = jiffies - (rq->deadline - rq->timeout); 1312 1313 unsigned long delay = nvme_keep_alive_work_period(ctrl); 1313 1314 enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); ··· 1325 1324 delay = 0; 1326 1325 } 1327 1326 1327 + blk_mq_free_request(rq); 1328 + 1328 1329 if (status) { 1329 1330 dev_err(ctrl->device, 1330 1331 "failed nvme_keep_alive_end_io error=%d\n", 1331 1332 status); 1332 - return; 1333 + return RQ_END_IO_NONE; 1333 1334 } 1334 1335 1335 1336 ctrl->ka_last_check_time = jiffies; 1336 1337 ctrl->comp_seen = false; 1337 1338 if (state == NVME_CTRL_LIVE || state == NVME_CTRL_CONNECTING) 1338 1339 queue_delayed_work(nvme_wq, &ctrl->ka_work, delay); 1340 + return RQ_END_IO_NONE; 1339 1341 } 1340 1342 1341 1343 static void nvme_keep_alive_work(struct work_struct *work) ··· 1347 1343 struct nvme_ctrl, ka_work); 1348 1344 bool comp_seen = ctrl->comp_seen; 1349 1345 struct request *rq; 1350 - blk_status_t status; 1351 1346 1352 1347 ctrl->ka_last_check_time = jiffies; 1353 1348 ··· 1369 1366 nvme_init_request(rq, &ctrl->ka_cmd); 1370 1367 1371 1368 rq->timeout = ctrl->kato * HZ; 1372 - status = blk_execute_rq(rq, false); 1373 - nvme_keep_alive_finish(rq, status, ctrl); 1374 - blk_mq_free_request(rq); 1369 + rq->end_io = nvme_keep_alive_end_io; 1370 + rq->end_io_data = ctrl; 1371 + blk_execute_rq_nowait(rq, false); 1375 1372 } 1376 1373 1377 1374 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) ··· 4603 4600 4604 4601 void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) 4605 4602 { 4603 + /* 4604 + * As we're about to destroy the queue and free tagset 4605 + * we can not have keep-alive work running. 4606 + */ 4607 + nvme_stop_keep_alive(ctrl); 4606 4608 blk_mq_destroy_queue(ctrl->admin_q); 4607 4609 blk_put_queue(ctrl->admin_q); 4608 4610 if (ctrl->ops->flags & NVME_F_FABRICS) {
+10 -2
drivers/nvme/host/ioctl.c
··· 120 120 struct nvme_ns *ns = q->queuedata; 121 121 struct block_device *bdev = ns ? ns->disk->part0 : NULL; 122 122 bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk); 123 + struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; 123 124 bool has_metadata = meta_buffer && meta_len; 124 125 struct bio *bio = NULL; 125 126 int ret; 126 127 127 - if (has_metadata && !supports_metadata) 128 - return -EINVAL; 128 + if (!nvme_ctrl_sgl_supported(ctrl)) 129 + dev_warn_once(ctrl->device, "using unchecked data buffer\n"); 130 + if (has_metadata) { 131 + if (!supports_metadata) 132 + return -EINVAL; 133 + if (!nvme_ctrl_meta_sgl_supported(ctrl)) 134 + dev_warn_once(ctrl->device, 135 + "using unchecked metadata buffer\n"); 136 + } 129 137 130 138 if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { 131 139 struct iov_iter iter;
+14 -7
drivers/nvme/host/multipath.c
··· 165 165 int srcu_idx; 166 166 167 167 srcu_idx = srcu_read_lock(&ctrl->srcu); 168 - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { 168 + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, 169 + srcu_read_lock_held(&ctrl->srcu)) { 169 170 if (!ns->head->disk) 170 171 continue; 171 172 kblockd_schedule_work(&ns->head->requeue_work); ··· 210 209 int srcu_idx; 211 210 212 211 srcu_idx = srcu_read_lock(&ctrl->srcu); 213 - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { 212 + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, 213 + srcu_read_lock_held(&ctrl->srcu)) { 214 214 nvme_mpath_clear_current_path(ns); 215 215 kblockd_schedule_work(&ns->head->requeue_work); 216 216 } ··· 226 224 int srcu_idx; 227 225 228 226 srcu_idx = srcu_read_lock(&head->srcu); 229 - list_for_each_entry_rcu(ns, &head->list, siblings) { 227 + list_for_each_entry_srcu(ns, &head->list, siblings, 228 + srcu_read_lock_held(&head->srcu)) { 230 229 if (capacity != get_capacity(ns->disk)) 231 230 clear_bit(NVME_NS_READY, &ns->flags); 232 231 } ··· 260 257 int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; 261 258 struct nvme_ns *found = NULL, *fallback = NULL, *ns; 262 259 263 - list_for_each_entry_rcu(ns, &head->list, siblings) { 260 + list_for_each_entry_srcu(ns, &head->list, siblings, 261 + srcu_read_lock_held(&head->srcu)) { 264 262 if (nvme_path_is_disabled(ns)) 265 263 continue; 266 264 ··· 360 356 unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX; 361 357 unsigned int depth; 362 358 363 - list_for_each_entry_rcu(ns, &head->list, siblings) { 359 + list_for_each_entry_srcu(ns, &head->list, siblings, 360 + srcu_read_lock_held(&head->srcu)) { 364 361 if (nvme_path_is_disabled(ns)) 365 362 continue; 366 363 ··· 429 424 if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) 430 425 return NULL; 431 426 432 - list_for_each_entry_rcu(ns, &head->list, siblings) { 427 + list_for_each_entry_srcu(ns, &head->list, siblings, 428 + srcu_read_lock_held(&head->srcu)) { 433 429 if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) 434 430 continue; 435 431 switch (nvme_ctrl_state(ns->ctrl)) { ··· 789 783 return 0; 790 784 791 785 srcu_idx = srcu_read_lock(&ctrl->srcu); 792 - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { 786 + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, 787 + srcu_read_lock_held(&ctrl->srcu)) { 793 788 unsigned nsid; 794 789 again: 795 790 nsid = le32_to_cpu(desc->nsids[n]);
+9 -1
drivers/nvme/host/nvme.h
··· 1123 1123 1124 1124 static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) 1125 1125 { 1126 - return ctrl->sgls & ((1 << 0) | (1 << 1)); 1126 + return ctrl->sgls & (NVME_CTRL_SGLS_BYTE_ALIGNED | 1127 + NVME_CTRL_SGLS_DWORD_ALIGNED); 1128 + } 1129 + 1130 + static inline bool nvme_ctrl_meta_sgl_supported(struct nvme_ctrl *ctrl) 1131 + { 1132 + if (ctrl->ops->flags & NVME_F_FABRICS) 1133 + return true; 1134 + return ctrl->sgls & NVME_CTRL_SGLS_MSDS; 1127 1135 } 1128 1136 1129 1137 #ifdef CONFIG_NVME_HOST_AUTH
+131 -16
drivers/nvme/host/pci.c
··· 43 43 */ 44 44 #define NVME_MAX_KB_SZ 8192 45 45 #define NVME_MAX_SEGS 128 46 + #define NVME_MAX_META_SEGS 15 46 47 #define NVME_MAX_NR_ALLOCATIONS 5 47 48 48 49 static int use_threaded_interrupts; ··· 145 144 struct sg_table *hmb_sgt; 146 145 147 146 mempool_t *iod_mempool; 147 + mempool_t *iod_meta_mempool; 148 148 149 149 /* shadow doorbell buffer support: */ 150 150 __le32 *dbbuf_dbs; ··· 241 239 dma_addr_t first_dma; 242 240 dma_addr_t meta_dma; 243 241 struct sg_table sgt; 242 + struct sg_table meta_sgt; 243 + union nvme_descriptor meta_list; 244 244 union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS]; 245 245 }; 246 246 ··· 510 506 spin_unlock(&nvmeq->sq_lock); 511 507 } 512 508 509 + static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev, 510 + struct request *req) 511 + { 512 + if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl)) 513 + return false; 514 + return req->nr_integrity_segments > 1 || 515 + nvme_req(req)->flags & NVME_REQ_USERCMD; 516 + } 517 + 513 518 static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, 514 519 int nseg) 515 520 { ··· 531 518 return false; 532 519 if (!nvmeq->qid) 533 520 return false; 521 + if (nvme_pci_metadata_use_sgls(dev, req)) 522 + return true; 534 523 if (!sgl_threshold || avg_seg_size < sgl_threshold) 535 - return false; 524 + return nvme_req(req)->flags & NVME_REQ_USERCMD; 536 525 return true; 537 526 } 538 527 ··· 795 780 struct bio_vec bv = req_bvec(req); 796 781 797 782 if (!is_pci_p2pdma_page(bv.bv_page)) { 798 - if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + 783 + if (!nvme_pci_metadata_use_sgls(dev, req) && 784 + (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + 799 785 bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) 800 786 return nvme_setup_prp_simple(dev, req, 801 787 &cmnd->rw, &bv); ··· 840 824 return ret; 841 825 } 842 826 843 - static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, 844 - struct nvme_command *cmnd) 827 + static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, 828 + struct request *req) 829 + { 830 + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 831 + struct nvme_rw_command *cmnd = &iod->cmd.rw; 832 + struct nvme_sgl_desc *sg_list; 833 + struct scatterlist *sgl, *sg; 834 + unsigned int entries; 835 + dma_addr_t sgl_dma; 836 + int rc, i; 837 + 838 + iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC); 839 + if (!iod->meta_sgt.sgl) 840 + return BLK_STS_RESOURCE; 841 + 842 + sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments); 843 + iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req, 844 + iod->meta_sgt.sgl); 845 + if (!iod->meta_sgt.orig_nents) 846 + goto out_free_sg; 847 + 848 + rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 849 + DMA_ATTR_NO_WARN); 850 + if (rc) 851 + goto out_free_sg; 852 + 853 + sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma); 854 + if (!sg_list) 855 + goto out_unmap_sg; 856 + 857 + entries = iod->meta_sgt.nents; 858 + iod->meta_list.sg_list = sg_list; 859 + iod->meta_dma = sgl_dma; 860 + 861 + cmnd->flags = NVME_CMD_SGL_METASEG; 862 + cmnd->metadata = cpu_to_le64(sgl_dma); 863 + 864 + sgl = iod->meta_sgt.sgl; 865 + if (entries == 1) { 866 + nvme_pci_sgl_set_data(sg_list, sgl); 867 + return BLK_STS_OK; 868 + } 869 + 870 + sgl_dma += sizeof(*sg_list); 871 + nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries); 872 + for_each_sg(sgl, sg, entries, i) 873 + nvme_pci_sgl_set_data(&sg_list[i + 1], sg); 874 + 875 + return BLK_STS_OK; 876 + 877 + out_unmap_sg: 878 + 
dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); 879 + out_free_sg: 880 + mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); 881 + return BLK_STS_RESOURCE; 882 + } 883 + 884 + static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev, 885 + struct request *req) 845 886 { 846 887 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 847 888 struct bio_vec bv = rq_integrity_vec(req); 889 + struct nvme_command *cmnd = &iod->cmd; 848 890 849 891 iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0); 850 892 if (dma_mapping_error(dev->dev, iod->meta_dma)) 851 893 return BLK_STS_IOERR; 852 894 cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); 853 895 return BLK_STS_OK; 896 + } 897 + 898 + static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req) 899 + { 900 + if (nvme_pci_metadata_use_sgls(dev, req)) 901 + return nvme_pci_setup_meta_sgls(dev, req); 902 + return nvme_pci_setup_meta_mptr(dev, req); 854 903 } 855 904 856 905 static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) ··· 926 845 iod->aborted = false; 927 846 iod->nr_allocations = -1; 928 847 iod->sgt.nents = 0; 848 + iod->meta_sgt.nents = 0; 929 849 930 850 ret = nvme_setup_cmd(req->q->queuedata, req); 931 851 if (ret) ··· 939 857 } 940 858 941 859 if (blk_integrity_rq(req)) { 942 - ret = nvme_map_metadata(dev, req, &iod->cmd); 860 + ret = nvme_map_metadata(dev, req); 943 861 if (ret) 944 862 goto out_unmap_data; 945 863 } ··· 1037 955 *rqlist = requeue_list; 1038 956 } 1039 957 958 + static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, 959 + struct request *req) 960 + { 961 + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 962 + 963 + if (!iod->meta_sgt.nents) { 964 + dma_unmap_page(dev->dev, iod->meta_dma, 965 + rq_integrity_vec(req).bv_len, 966 + rq_dma_dir(req)); 967 + return; 968 + } 969 + 970 + dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list, 971 + iod->meta_dma); 972 + dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); 973 + mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); 974 + } 975 + 1040 976 static __always_inline void nvme_pci_unmap_rq(struct request *req) 1041 977 { 1042 978 struct nvme_queue *nvmeq = req->mq_hctx->driver_data; 1043 979 struct nvme_dev *dev = nvmeq->dev; 1044 980 1045 - if (blk_integrity_rq(req)) { 1046 - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 1047 - 1048 - dma_unmap_page(dev->dev, iod->meta_dma, 1049 - rq_integrity_vec(req).bv_len, rq_dma_dir(req)); 1050 - } 981 + if (blk_integrity_rq(req)) 982 + nvme_unmap_metadata(dev, req); 1051 983 1052 984 if (blk_rq_nr_phys_segments(req)) 1053 985 nvme_unmap_data(dev, req); ··· 2857 2761 2858 2762 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) 2859 2763 { 2764 + size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1); 2860 2765 size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS; 2861 2766 2862 2767 dev->iod_mempool = mempool_create_node(1, ··· 2866 2769 dev_to_node(dev->dev)); 2867 2770 if (!dev->iod_mempool) 2868 2771 return -ENOMEM; 2772 + 2773 + dev->iod_meta_mempool = mempool_create_node(1, 2774 + mempool_kmalloc, mempool_kfree, 2775 + (void *)meta_size, GFP_KERNEL, 2776 + dev_to_node(dev->dev)); 2777 + if (!dev->iod_meta_mempool) 2778 + goto free; 2779 + 2869 2780 return 0; 2781 + free: 2782 + mempool_destroy(dev->iod_mempool); 2783 + return -ENOMEM; 2870 2784 } 2871 2785 2872 2786 static void nvme_free_tagset(struct nvme_dev *dev) ··· 2941 2833 result = nvme_init_ctrl_finish(&dev->ctrl, 
was_suspend); 2942 2834 if (result) 2943 2835 goto out; 2836 + 2837 + if (nvme_ctrl_meta_sgl_supported(&dev->ctrl)) 2838 + dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS; 2839 + else 2840 + dev->ctrl.max_integrity_segments = 1; 2944 2841 2945 2842 nvme_dbbuf_dma_alloc(dev); 2946 2843 ··· 3214 3101 dev->ctrl.max_hw_sectors = min_t(u32, 3215 3102 NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9); 3216 3103 dev->ctrl.max_segments = NVME_MAX_SEGS; 3217 - 3218 - /* 3219 - * There is no support for SGLs for metadata (yet), so we are limited to 3220 - * a single integrity segment for the separate metadata pointer. 3221 - */ 3222 3104 dev->ctrl.max_integrity_segments = 1; 3223 3105 return dev; 3224 3106 ··· 3276 3168 if (result) 3277 3169 goto out_disable; 3278 3170 3171 + if (nvme_ctrl_meta_sgl_supported(&dev->ctrl)) 3172 + dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS; 3173 + else 3174 + dev->ctrl.max_integrity_segments = 1; 3175 + 3279 3176 nvme_dbbuf_dma_alloc(dev); 3280 3177 3281 3178 result = nvme_setup_host_mem(dev); ··· 3323 3210 nvme_free_queues(dev, 0); 3324 3211 out_release_iod_mempool: 3325 3212 mempool_destroy(dev->iod_mempool); 3213 + mempool_destroy(dev->iod_meta_mempool); 3326 3214 out_release_prp_pools: 3327 3215 nvme_release_prp_pools(dev); 3328 3216 out_dev_unmap: ··· 3389 3275 nvme_dbbuf_dma_free(dev); 3390 3276 nvme_free_queues(dev, 0); 3391 3277 mempool_destroy(dev->iod_mempool); 3278 + mempool_destroy(dev->iod_meta_mempool); 3392 3279 nvme_release_prp_pools(dev); 3393 3280 nvme_dev_unmap(dev); 3394 3281 nvme_uninit_ctrl(&dev->ctrl);
+77 -49
drivers/nvme/host/pr.c
··· 94 94 } 95 95 } 96 96 97 - static int nvme_send_pr_command(struct block_device *bdev, 98 - struct nvme_command *c, void *data, unsigned int data_len) 97 + static int __nvme_send_pr_command(struct block_device *bdev, u32 cdw10, 98 + u32 cdw11, u8 op, void *data, unsigned int data_len) 99 99 { 100 - if (nvme_disk_is_ns_head(bdev->bd_disk)) 101 - return nvme_send_ns_head_pr_command(bdev, c, data, data_len); 102 - 103 - return nvme_send_ns_pr_command(bdev->bd_disk->private_data, c, data, 104 - data_len); 105 - } 106 - 107 - static int nvme_pr_command(struct block_device *bdev, u32 cdw10, 108 - u64 key, u64 sa_key, u8 op) 109 - { 110 - struct nvme_command c = { }; 111 - u8 data[16] = { 0, }; 112 - int ret; 113 - 114 - put_unaligned_le64(key, &data[0]); 115 - put_unaligned_le64(sa_key, &data[8]); 100 + struct nvme_command c = { 0 }; 116 101 117 102 c.common.opcode = op; 118 103 c.common.cdw10 = cpu_to_le32(cdw10); 104 + c.common.cdw11 = cpu_to_le32(cdw11); 119 105 120 - ret = nvme_send_pr_command(bdev, &c, data, sizeof(data)); 121 - if (ret < 0) 122 - return ret; 123 - 124 - return nvme_status_to_pr_err(ret); 106 + if (nvme_disk_is_ns_head(bdev->bd_disk)) 107 + return nvme_send_ns_head_pr_command(bdev, &c, data, data_len); 108 + return nvme_send_ns_pr_command(bdev->bd_disk->private_data, &c, 109 + data, data_len); 125 110 } 126 111 127 - static int nvme_pr_register(struct block_device *bdev, u64 old, 128 - u64 new, unsigned flags) 112 + static int nvme_send_pr_command(struct block_device *bdev, u32 cdw10, u32 cdw11, 113 + u8 op, void *data, unsigned int data_len) 129 114 { 115 + int ret; 116 + 117 + ret = __nvme_send_pr_command(bdev, cdw10, cdw11, op, data, data_len); 118 + return ret < 0 ? ret : nvme_status_to_pr_err(ret); 119 + } 120 + 121 + static int nvme_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, 122 + unsigned int flags) 123 + { 124 + struct nvmet_pr_register_data data = { 0 }; 130 125 u32 cdw10; 131 126 132 127 if (flags & ~PR_FL_IGNORE_KEY) 133 128 return -EOPNOTSUPP; 134 129 135 - cdw10 = old ? 2 : 0; 136 - cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; 137 - cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ 138 - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); 130 + data.crkey = cpu_to_le64(old_key); 131 + data.nrkey = cpu_to_le64(new_key); 132 + 133 + cdw10 = old_key ? NVME_PR_REGISTER_ACT_REPLACE : 134 + NVME_PR_REGISTER_ACT_REG; 135 + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? NVME_PR_IGNORE_KEY : 0; 136 + cdw10 |= NVME_PR_CPTPL_PERSIST; 137 + 138 + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_register, 139 + &data, sizeof(data)); 139 140 } 140 141 141 142 static int nvme_pr_reserve(struct block_device *bdev, u64 key, 142 143 enum pr_type type, unsigned flags) 143 144 { 145 + struct nvmet_pr_acquire_data data = { 0 }; 144 146 u32 cdw10; 145 147 146 148 if (flags & ~PR_FL_IGNORE_KEY) 147 149 return -EOPNOTSUPP; 148 150 149 - cdw10 = nvme_pr_type_from_blk(type) << 8; 150 - cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); 151 - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); 151 + data.crkey = cpu_to_le64(key); 152 + 153 + cdw10 = NVME_PR_ACQUIRE_ACT_ACQUIRE; 154 + cdw10 |= nvme_pr_type_from_blk(type) << 8; 155 + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 
NVME_PR_IGNORE_KEY : 0; 156 + 157 + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_acquire, 158 + &data, sizeof(data)); 152 159 } 153 160 154 161 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, 155 162 enum pr_type type, bool abort) 156 163 { 157 - u32 cdw10 = nvme_pr_type_from_blk(type) << 8 | (abort ? 2 : 1); 164 + struct nvmet_pr_acquire_data data = { 0 }; 165 + u32 cdw10; 158 166 159 - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); 167 + data.crkey = cpu_to_le64(old); 168 + data.prkey = cpu_to_le64(new); 169 + 170 + cdw10 = abort ? NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT : 171 + NVME_PR_ACQUIRE_ACT_PREEMPT; 172 + cdw10 |= nvme_pr_type_from_blk(type) << 8; 173 + 174 + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_acquire, 175 + &data, sizeof(data)); 160 176 } 161 177 162 178 static int nvme_pr_clear(struct block_device *bdev, u64 key) 163 179 { 164 - u32 cdw10 = 1 | (key ? 0 : 1 << 3); 180 + struct nvmet_pr_release_data data = { 0 }; 181 + u32 cdw10; 165 182 166 - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); 183 + data.crkey = cpu_to_le64(key); 184 + 185 + cdw10 = NVME_PR_RELEASE_ACT_CLEAR; 186 + cdw10 |= key ? 0 : NVME_PR_IGNORE_KEY; 187 + 188 + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_release, 189 + &data, sizeof(data)); 167 190 } 168 191 169 192 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) 170 193 { 171 - u32 cdw10 = nvme_pr_type_from_blk(type) << 8 | (key ? 0 : 1 << 3); 194 + struct nvmet_pr_release_data data = { 0 }; 195 + u32 cdw10; 172 196 173 - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); 197 + data.crkey = cpu_to_le64(key); 198 + 199 + cdw10 = NVME_PR_RELEASE_ACT_RELEASE; 200 + cdw10 |= nvme_pr_type_from_blk(type) << 8; 201 + cdw10 |= key ? 0 : NVME_PR_IGNORE_KEY; 202 + 203 + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_release, 204 + &data, sizeof(data)); 174 205 } 175 206 176 207 static int nvme_pr_resv_report(struct block_device *bdev, void *data, 177 208 u32 data_len, bool *eds) 178 209 { 179 - struct nvme_command c = { }; 210 + u32 cdw10, cdw11; 180 211 int ret; 181 212 182 - c.common.opcode = nvme_cmd_resv_report; 183 - c.common.cdw10 = cpu_to_le32(nvme_bytes_to_numd(data_len)); 184 - c.common.cdw11 = cpu_to_le32(NVME_EXTENDED_DATA_STRUCT); 213 + cdw10 = nvme_bytes_to_numd(data_len); 214 + cdw11 = NVME_EXTENDED_DATA_STRUCT; 185 215 *eds = true; 186 216 187 217 retry: 188 - ret = nvme_send_pr_command(bdev, &c, data, data_len); 218 + ret = __nvme_send_pr_command(bdev, cdw10, cdw11, nvme_cmd_resv_report, 219 + data, data_len); 189 220 if (ret == NVME_SC_HOST_ID_INCONSIST && 190 - c.common.cdw11 == cpu_to_le32(NVME_EXTENDED_DATA_STRUCT)) { 191 - c.common.cdw11 = 0; 221 + cdw11 == NVME_EXTENDED_DATA_STRUCT) { 222 + cdw11 = 0; 192 223 *eds = false; 193 224 goto retry; 194 225 } 195 226 196 - if (ret < 0) 197 - return ret; 198 - 199 - return nvme_status_to_pr_err(ret); 227 + return ret < 0 ? ret : nvme_status_to_pr_err(ret); 200 228 } 201 229 202 230 static int nvme_pr_read_keys(struct block_device *bdev,
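The reworked nvme_pr_*() helpers above still implement the generic block layer pr_ops, so the path is reachable from userspace through the PR ioctls in <linux/pr.h>. A sketch that registers a key and takes a write-exclusive reservation; the key value is an arbitrary example:

/* Register a PR key and acquire a write-exclusive reservation. */
#include <fcntl.h>
#include <linux/pr.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        if (argc != 2) {
                fprintf(stderr, "usage: %s <namespace, e.g. /dev/nvme0n1>\n",
                        argv[0]);
                return 1;
        }

        int fd = open(argv[1], O_RDWR);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* Ends up in nvme_cmd_resv_register via nvme_pr_register(). */
        struct pr_registration reg = {
                .old_key = 0,
                .new_key = 0xABCD1234,
        };
        if (ioctl(fd, IOC_PR_REGISTER, &reg)) {
                perror("IOC_PR_REGISTER");
                close(fd);
                return 1;
        }

        /* Ends up in nvme_cmd_resv_acquire via nvme_pr_reserve(). */
        struct pr_reservation rsv = {
                .key  = 0xABCD1234,
                .type = PR_WRITE_EXCLUSIVE,
        };
        if (ioctl(fd, IOC_PR_RESERVE, &rsv)) {
                perror("IOC_PR_RESERVE");
                close(fd);
                return 1;
        }

        printf("write-exclusive reservation acquired\n");
        close(fd);
        return 0;
}
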
+2 -2
drivers/nvme/host/rdma.c
··· 1019 1019 goto destroy_admin; 1020 1020 } 1021 1021 1022 - if (!(ctrl->ctrl.sgls & (1 << 2))) { 1022 + if (!(ctrl->ctrl.sgls & NVME_CTRL_SGLS_KSDBDS)) { 1023 1023 ret = -EOPNOTSUPP; 1024 1024 dev_err(ctrl->ctrl.device, 1025 1025 "Mandatory keyed sgls are not supported!\n"); ··· 1051 1051 ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; 1052 1052 } 1053 1053 1054 - if (ctrl->ctrl.sgls & (1 << 20)) 1054 + if (ctrl->ctrl.sgls & NVME_CTRL_SGLS_SAOS) 1055 1055 ctrl->use_inline_data = true; 1056 1056 1057 1057 if (ctrl->ctrl.queue_count > 1) {
+4 -3
drivers/nvme/target/admin-cmd.c
··· 601 601 id->awun = 0; 602 602 id->awupf = 0; 603 603 604 - id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ 604 + /* we always support SGLs */ 605 + id->sgls = cpu_to_le32(NVME_CTRL_SGLS_BYTE_ALIGNED); 605 606 if (ctrl->ops->flags & NVMF_KEYED_SGLS) 606 - id->sgls |= cpu_to_le32(1 << 2); 607 + id->sgls |= cpu_to_le32(NVME_CTRL_SGLS_KSDBDS); 607 608 if (req->port->inline_data_size) 608 - id->sgls |= cpu_to_le32(1 << 20); 609 + id->sgls |= cpu_to_le32(NVME_CTRL_SGLS_SAOS); 609 610 610 611 strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); 611 612
+12 -8
include/linux/blkdev.h
··· 333 333 #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ 334 334 ((__force blk_features_t)(1u << 15)) 335 335 336 + /* stacked device can/does support atomic writes */ 337 + #define BLK_FEAT_ATOMIC_WRITES_STACKED \ 338 + ((__force blk_features_t)(1u << 16)) 339 + 336 340 /* 337 341 * Flags automatically inherited when stacking limits. 338 342 */ ··· 779 775 atomic_andnot(flag, &bdev->__bd_flags); 780 776 } 781 777 782 - static inline int get_disk_ro(struct gendisk *disk) 778 + static inline bool get_disk_ro(struct gendisk *disk) 783 779 { 784 780 return bdev_test_flag(disk->part0, BD_READ_ONLY) || 785 781 test_bit(GD_READ_ONLY, &disk->state); 786 782 } 787 783 788 - static inline int bdev_read_only(struct block_device *bdev) 784 + static inline bool bdev_read_only(struct block_device *bdev) 789 785 { 790 786 return bdev_test_flag(bdev, BD_READ_ONLY) || get_disk_ro(bdev->bd_disk); 791 787 } ··· 1265 1261 return q->limits.io_min; 1266 1262 } 1267 1263 1268 - static inline int bdev_io_min(struct block_device *bdev) 1264 + static inline unsigned int bdev_io_min(struct block_device *bdev) 1269 1265 { 1270 1266 return queue_io_min(bdev_get_queue(bdev)); 1271 1267 } ··· 1275 1271 return q->limits.io_opt; 1276 1272 } 1277 1273 1278 - static inline int bdev_io_opt(struct block_device *bdev) 1274 + static inline unsigned int bdev_io_opt(struct block_device *bdev) 1279 1275 { 1280 1276 return queue_io_opt(bdev_get_queue(bdev)); 1281 1277 } ··· 1421 1417 return is_seq; 1422 1418 } 1423 1419 1424 - static inline int queue_dma_alignment(const struct request_queue *q) 1420 + static inline unsigned int queue_dma_alignment(const struct request_queue *q) 1425 1421 { 1426 1422 return q->limits.dma_alignment; 1427 1423 } ··· 1462 1458 bdev_logical_block_size(bdev) - 1); 1463 1459 } 1464 1460 1465 - static inline int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) 1461 + static inline unsigned int 1462 + blk_lim_dma_alignment_and_pad(struct queue_limits *lim) 1466 1463 { 1467 1464 return lim->dma_alignment | lim->dma_pad_mask; 1468 1465 } 1469 1466 1470 - static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, 1467 + static inline bool blk_rq_aligned(struct request_queue *q, unsigned long addr, 1471 1468 unsigned int len) 1472 1469 { 1473 1470 unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits); ··· 1586 1581 return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); 1587 1582 } 1588 1583 1589 - int bdev_read_only(struct block_device *bdev); 1590 1584 int set_blocksize(struct file *file, int size); 1591 1585 1592 1586 int lookup_bdev(const char *pathname, dev_t *dev);
+14
include/linux/nvme.h
··· 389 389 NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5, 390 390 NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY = 1 << 7, 391 391 NVME_CTRL_CTRATT_UUID_LIST = 1 << 9, 392 + NVME_CTRL_SGLS_BYTE_ALIGNED = 1, 393 + NVME_CTRL_SGLS_DWORD_ALIGNED = 2, 394 + NVME_CTRL_SGLS_KSDBDS = 1 << 2, 395 + NVME_CTRL_SGLS_MSDS = 1 << 19, 396 + NVME_CTRL_SGLS_SAOS = 1 << 20, 392 397 }; 393 398 394 399 struct nvme_lbaf { ··· 2170 2165 NVME_PR_RELEASE_ACT_RELEASE = 0, 2171 2166 NVME_PR_RELEASE_ACT_CLEAR = 1, 2172 2167 }; 2168 + 2169 + enum nvme_pr_change_ptpl { 2170 + NVME_PR_CPTPL_NO_CHANGE = 0, 2171 + NVME_PR_CPTPL_RESV = 1 << 30, 2172 + NVME_PR_CPTPL_CLEARED = 2 << 30, 2173 + NVME_PR_CPTPL_PERSIST = 3 << 30, 2174 + }; 2175 + 2176 + #define NVME_PR_IGNORE_KEY (1 << 3) 2173 2177 2174 2178 #endif /* _LINUX_NVME_H */
+1 -1
rust/kernel/block/mq/gen_disk.rs
··· 45 45 46 46 /// Validate block size by verifying that it is between 512 and `PAGE_SIZE`, 47 47 /// and that it is a power of two. 48 - fn validate_block_size(size: u32) -> Result<()> { 48 + fn validate_block_size(size: u32) -> Result { 49 49 if !(512..=bindings::PAGE_SIZE as u32).contains(&size) || !size.is_power_of_two() { 50 50 Err(error::code::EINVAL) 51 51 } else {