Merge tag 'block-6.13-20242901' of git://git.kernel.dk/linux

Pull more block updates from Jens Axboe:

- NVMe pull request via Keith:
    - Use correct srcu list traversal (Breno)
    - Scatter-gather support for metadata (Keith)
    - Fabrics shutdown race condition fix (Nilay)
    - Persistent reservations updates (Guixin)

- Add the required bits for MD atomic write support for raid0/1/10 (a
  userspace usage sketch follows this list)

- Correct return value for unknown opcode in ublk

- Fix deadlock with zone revalidation

- Fix for the io priority request vs bio cleanups

- Use the correct unsigned int type for various limit helpers

- Fix for a race in loop

- Cleanup blk_rq_prep_clone() to prevent uninit-value warning and make
it easier for actual humans to read

- Fix potential UAF when iterating tags

- A few fixes for bfq-iosched UAF issues

- Fix for brd discard not decrementing the allocated page count

- Various little fixes and cleanups
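
As a userspace-facing illustration of what the atomic write plumbing in this
pull enables (the MD raid0/1/10 stacking bits and the blkdev_write_iter()
truncation fix), here is a hedged sketch, not part of the series itself, that
issues one untorn write with pwritev2() and RWF_ATOMIC. The device path and
the 4 KiB size are illustrative assumptions; the size has to fall within the
device's advertised atomic write unit limits, and with the fix above a write
that would run past the end of the device now fails with -EINVAL instead of
being silently truncated.

  /* Hedged illustration only: one untorn 4 KiB write using RWF_ATOMIC.
   * Assumes the target advertises atomic write support and that 4096
   * bytes is within its atomic write unit min/max. */
  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sys/uio.h>
  #include <unistd.h>

  #ifndef RWF_ATOMIC
  #define RWF_ATOMIC 0x00000040   /* uapi value from <linux/fs.h> */
  #endif

  int main(int argc, char **argv)
  {
      const char *dev = argc > 1 ? argv[1] : "/dev/md0"; /* hypothetical */
      size_t len = 4096;
      struct iovec iov;
      void *buf;
      ssize_t ret;
      int fd;

      fd = open(dev, O_WRONLY | O_DIRECT);
      if (fd < 0) {
          perror("open");
          return 1;
      }
      if (posix_memalign(&buf, len, len)) {
          close(fd);
          return 1;
      }
      memset(buf, 0xab, len);
      iov.iov_base = buf;
      iov.iov_len = len;

      /* Single iovec at offset 0: either the whole write lands or none of it. */
      ret = pwritev2(fd, &iov, 1, 0, RWF_ATOMIC);
      if (ret < 0)
          perror("pwritev2(RWF_ATOMIC)");
      else
          printf("atomically wrote %zd bytes\n", ret);

      free(buf);
      close(fd);
      return ret == (ssize_t)len ? 0 : 1;
  }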

* tag 'block-6.13-20242901' of git://git.kernel.dk/linux: (36 commits)
brd: decrease the number of allocated pages which discarded
block, bfq: fix bfqq uaf in bfq_limit_depth()
block: Don't allow an atomic write be truncated in blkdev_write_iter()
mq-deadline: don't call req_get_ioprio from the I/O completion handler
block: Prevent potential deadlock in blk_revalidate_disk_zones()
block: Remove extra part pointer NULLify in blk_rq_init()
nvme: tuning pr code by using defined structs and macros
nvme: introduce change ptpl and iekey definition
block: return bool from get_disk_ro and bdev_read_only
block: remove a duplicate definition for bdev_read_only
block: return bool from blk_rq_aligned
block: return unsigned int from blk_lim_dma_alignment_and_pad
block: return unsigned int from queue_dma_alignment
block: return unsigned int from bdev_io_opt
block: req->bio is always set in the merge code
block: don't bother checking the data direction for merges
block: blk-mq: fix uninit-value in blk_rq_prep_clone and refactor
Revert "block, bfq: merge bfq_release_process_ref() into bfq_put_cooperator()"
md/raid10: Atomic write support
md/raid1: Atomic write support
...

+549 -194
+1
block/bfq-cgroup.c
··· 736 */ 737 bfq_put_cooperator(sync_bfqq); 738 bic_set_bfqq(bic, NULL, true, act_idx); 739 } 740 } 741
··· 736 */ 737 bfq_put_cooperator(sync_bfqq); 738 bic_set_bfqq(bic, NULL, true, act_idx); 739 + bfq_release_process_ref(bfqd, sync_bfqq); 740 } 741 } 742
+28 -15
block/bfq-iosched.c
··· 582 #define BFQ_LIMIT_INLINE_DEPTH 16 583 584 #ifdef CONFIG_BFQ_GROUP_IOSCHED 585 - static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) 586 { 587 - struct bfq_data *bfqd = bfqq->bfqd; 588 - struct bfq_entity *entity = &bfqq->entity; 589 struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH]; 590 struct bfq_entity **entities = inline_entities; 591 - int depth, level, alloc_depth = BFQ_LIMIT_INLINE_DEPTH; 592 - int class_idx = bfqq->ioprio_class - 1; 593 struct bfq_sched_data *sched_data; 594 unsigned long wsum; 595 bool ret = false; 596 - 597 - if (!entity->on_st_or_in_serv) 598 - return false; 599 600 retry: 601 spin_lock_irq(&bfqd->lock); 602 /* +1 for bfqq entity, root cgroup not included */ 603 depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1; 604 if (depth > alloc_depth) { ··· 651 * class. 652 */ 653 wsum = 0; 654 - for (i = 0; i <= class_idx; i++) { 655 wsum = wsum * IOPRIO_BE_NR + 656 sched_data->service_tree[i].wsum; 657 } ··· 674 return ret; 675 } 676 #else 677 - static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) 678 { 679 return false; 680 } ··· 714 } 715 716 for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { 717 - struct bfq_queue *bfqq = 718 - bic_to_bfqq(bic, op_is_sync(opf), act_idx); 719 720 /* 721 * Does queue (or any parent entity) exceed number of ··· 724 * limit depth so that it cannot consume more 725 * available requests and thus starve other entities. 726 */ 727 - if (bfqq && bfqq_request_over_limit(bfqq, limit)) { 728 depth = 1; 729 break; 730 } ··· 5445 bfq_put_queue(__bfqq); 5446 __bfqq = next; 5447 } 5448 - 5449 - bfq_release_process_ref(bfqq->bfqd, bfqq); 5450 } 5451 5452 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ··· 5457 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); 5458 5459 bfq_put_cooperator(bfqq); 5460 } 5461 5462 static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, ··· 6745 bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); 6746 6747 bfq_put_cooperator(bfqq); 6748 return NULL; 6749 } 6750
··· 582 #define BFQ_LIMIT_INLINE_DEPTH 16 583 584 #ifdef CONFIG_BFQ_GROUP_IOSCHED 585 + static bool bfqq_request_over_limit(struct bfq_data *bfqd, 586 + struct bfq_io_cq *bic, blk_opf_t opf, 587 + unsigned int act_idx, int limit) 588 { 589 struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH]; 590 struct bfq_entity **entities = inline_entities; 591 + int alloc_depth = BFQ_LIMIT_INLINE_DEPTH; 592 struct bfq_sched_data *sched_data; 593 + struct bfq_entity *entity; 594 + struct bfq_queue *bfqq; 595 unsigned long wsum; 596 bool ret = false; 597 + int depth; 598 + int level; 599 600 retry: 601 spin_lock_irq(&bfqd->lock); 602 + bfqq = bic_to_bfqq(bic, op_is_sync(opf), act_idx); 603 + if (!bfqq) 604 + goto out; 605 + 606 + entity = &bfqq->entity; 607 + if (!entity->on_st_or_in_serv) 608 + goto out; 609 + 610 /* +1 for bfqq entity, root cgroup not included */ 611 depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1; 612 if (depth > alloc_depth) { ··· 643 * class. 644 */ 645 wsum = 0; 646 + for (i = 0; i <= bfqq->ioprio_class - 1; i++) { 647 wsum = wsum * IOPRIO_BE_NR + 648 sched_data->service_tree[i].wsum; 649 } ··· 666 return ret; 667 } 668 #else 669 + static bool bfqq_request_over_limit(struct bfq_data *bfqd, 670 + struct bfq_io_cq *bic, blk_opf_t opf, 671 + unsigned int act_idx, int limit) 672 { 673 return false; 674 } ··· 704 } 705 706 for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { 707 + /* Fast path to check if bfqq is already allocated. */ 708 + if (!bic_to_bfqq(bic, op_is_sync(opf), act_idx)) 709 + continue; 710 711 /* 712 * Does queue (or any parent entity) exceed number of ··· 713 * limit depth so that it cannot consume more 714 * available requests and thus starve other entities. 715 */ 716 + if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) { 717 depth = 1; 718 break; 719 } ··· 5434 bfq_put_queue(__bfqq); 5435 __bfqq = next; 5436 } 5437 } 5438 5439 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ··· 5448 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); 5449 5450 bfq_put_cooperator(bfqq); 5451 + 5452 + bfq_release_process_ref(bfqd, bfqq); 5453 } 5454 5455 static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, ··· 6734 bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); 6735 6736 bfq_put_cooperator(bfqq); 6737 + 6738 + bfq_release_process_ref(bfqq->bfqd, bfqq); 6739 return NULL; 6740 } 6741
+7 -28
block/blk-merge.c
··· 864 if (req_op(req) != req_op(next)) 865 return NULL; 866 867 - if (rq_data_dir(req) != rq_data_dir(next)) 868 return NULL; 869 - 870 - if (req->bio && next->bio) { 871 - /* Don't merge requests with different write hints. */ 872 - if (req->bio->bi_write_hint != next->bio->bi_write_hint) 873 - return NULL; 874 - if (req->bio->bi_ioprio != next->bio->bi_ioprio) 875 - return NULL; 876 - } 877 - 878 if (!blk_atomic_write_mergeable_rqs(req, next)) 879 return NULL; 880 ··· 979 if (req_op(rq) != bio_op(bio)) 980 return false; 981 982 - /* different data direction or already started, don't merge */ 983 - if (bio_data_dir(bio) != rq_data_dir(rq)) 984 - return false; 985 - 986 - /* don't merge across cgroup boundaries */ 987 if (!blk_cgroup_mergeable(rq, bio)) 988 return false; 989 - 990 - /* only merge integrity protected bio into ditto rq */ 991 if (blk_integrity_merge_bio(rq->q, rq, bio) == false) 992 return false; 993 - 994 - /* Only merge if the crypt contexts are compatible */ 995 if (!bio_crypt_rq_ctx_compatible(rq, bio)) 996 return false; 997 - 998 - if (rq->bio) { 999 - /* Don't merge requests with different write hints. */ 1000 - if (rq->bio->bi_write_hint != bio->bi_write_hint) 1001 - return false; 1002 - if (rq->bio->bi_ioprio != bio->bi_ioprio) 1003 - return false; 1004 - } 1005 - 1006 if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) 1007 return false; 1008
··· 864 if (req_op(req) != req_op(next)) 865 return NULL; 866 867 + if (req->bio->bi_write_hint != next->bio->bi_write_hint) 868 return NULL; 869 + if (req->bio->bi_ioprio != next->bio->bi_ioprio) 870 + return NULL; 871 if (!blk_atomic_write_mergeable_rqs(req, next)) 872 return NULL; 873 ··· 986 if (req_op(rq) != bio_op(bio)) 987 return false; 988 989 if (!blk_cgroup_mergeable(rq, bio)) 990 return false; 991 if (blk_integrity_merge_bio(rq->q, rq, bio) == false) 992 return false; 993 if (!bio_crypt_rq_ctx_compatible(rq, bio)) 994 return false; 995 + if (rq->bio->bi_write_hint != bio->bi_write_hint) 996 + return false; 997 + if (rq->bio->bi_ioprio != bio->bi_ioprio) 998 + return false; 999 if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) 1000 return false; 1001
+6 -8
block/blk-mq.c
··· 388 rq->tag = BLK_MQ_NO_TAG; 389 rq->internal_tag = BLK_MQ_NO_TAG; 390 rq->start_time_ns = blk_time_get_ns(); 391 - rq->part = NULL; 392 blk_crypto_rq_set_defaults(rq); 393 } 394 EXPORT_SYMBOL(blk_rq_init); ··· 3272 int (*bio_ctr)(struct bio *, struct bio *, void *), 3273 void *data) 3274 { 3275 - struct bio *bio, *bio_src; 3276 3277 if (!bs) 3278 bs = &fs_bio_set; 3279 3280 __rq_for_each_bio(bio_src, rq_src) { 3281 - bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, 3282 - bs); 3283 if (!bio) 3284 goto free_and_out; 3285 3286 - if (bio_ctr && bio_ctr(bio, bio_src, data)) 3287 goto free_and_out; 3288 3289 if (rq->bio) { 3290 rq->biotail->bi_next = bio; ··· 3294 } else { 3295 rq->bio = rq->biotail = bio; 3296 } 3297 - bio = NULL; 3298 } 3299 3300 /* Copy attributes of the original request to the clone request. */ ··· 3311 return 0; 3312 3313 free_and_out: 3314 - if (bio) 3315 - bio_put(bio); 3316 blk_rq_unprep_clone(rq); 3317 3318 return -ENOMEM;
··· 388 rq->tag = BLK_MQ_NO_TAG; 389 rq->internal_tag = BLK_MQ_NO_TAG; 390 rq->start_time_ns = blk_time_get_ns(); 391 blk_crypto_rq_set_defaults(rq); 392 } 393 EXPORT_SYMBOL(blk_rq_init); ··· 3273 int (*bio_ctr)(struct bio *, struct bio *, void *), 3274 void *data) 3275 { 3276 + struct bio *bio_src; 3277 3278 if (!bs) 3279 bs = &fs_bio_set; 3280 3281 __rq_for_each_bio(bio_src, rq_src) { 3282 + struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src, 3283 + gfp_mask, bs); 3284 if (!bio) 3285 goto free_and_out; 3286 3287 + if (bio_ctr && bio_ctr(bio, bio_src, data)) { 3288 + bio_put(bio); 3289 goto free_and_out; 3290 + } 3291 3292 if (rq->bio) { 3293 rq->biotail->bi_next = bio; ··· 3293 } else { 3294 rq->bio = rq->biotail = bio; 3295 } 3296 } 3297 3298 /* Copy attributes of the original request to the clone request. */ ··· 3311 return 0; 3312 3313 free_and_out: 3314 blk_rq_unprep_clone(rq); 3315 3316 return -ENOMEM;
+139 -2
block/blk-settings.c
··· 178 if (!lim->atomic_write_hw_max) 179 goto unsupported; 180 181 boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; 182 183 if (boundary_sectors) { 184 /* 185 * A feature of boundary support is that it disallows bios to 186 * be merged which would result in a merged request which ··· 264 */ 265 if (lim->io_min < lim->physical_block_size) 266 lim->io_min = lim->physical_block_size; 267 268 /* 269 * max_hw_sectors has a somewhat weird default for historical reason, ··· 482 /* Why are these in bytes, not sectors? */ 483 alignment = lim->discard_alignment >> SECTOR_SHIFT; 484 granularity = lim->discard_granularity >> SECTOR_SHIFT; 485 - if (!granularity) 486 - return 0; 487 488 /* Offset of the partition start in 'granularity' sectors */ 489 offset = sector_div(sector, granularity); ··· 499 if (sectors < PAGE_SIZE >> SECTOR_SHIFT) 500 sectors = PAGE_SIZE >> SECTOR_SHIFT; 501 return sectors; 502 } 503 504 /** ··· 774 t->zone_write_granularity = 0; 775 t->max_zone_append_sectors = 0; 776 } 777 return ret; 778 } 779 EXPORT_SYMBOL(blk_stack_limits);
··· 178 if (!lim->atomic_write_hw_max) 179 goto unsupported; 180 181 + if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_min))) 182 + goto unsupported; 183 + 184 + if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_max))) 185 + goto unsupported; 186 + 187 + if (WARN_ON_ONCE(lim->atomic_write_hw_unit_min > 188 + lim->atomic_write_hw_unit_max)) 189 + goto unsupported; 190 + 191 + if (WARN_ON_ONCE(lim->atomic_write_hw_unit_max > 192 + lim->atomic_write_hw_max)) 193 + goto unsupported; 194 + 195 boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; 196 197 if (boundary_sectors) { 198 + if (WARN_ON_ONCE(lim->atomic_write_hw_max > 199 + lim->atomic_write_hw_boundary)) 200 + goto unsupported; 201 /* 202 * A feature of boundary support is that it disallows bios to 203 * be merged which would result in a merged request which ··· 247 */ 248 if (lim->io_min < lim->physical_block_size) 249 lim->io_min = lim->physical_block_size; 250 + 251 + /* 252 + * The optimal I/O size may not be aligned to physical block size 253 + * (because it may be limited by dma engines which have no clue about 254 + * block size of the disks attached to them), so we round it down here. 255 + */ 256 + lim->io_opt = round_down(lim->io_opt, lim->physical_block_size); 257 258 /* 259 * max_hw_sectors has a somewhat weird default for historical reason, ··· 458 /* Why are these in bytes, not sectors? */ 459 alignment = lim->discard_alignment >> SECTOR_SHIFT; 460 granularity = lim->discard_granularity >> SECTOR_SHIFT; 461 462 /* Offset of the partition start in 'granularity' sectors */ 463 offset = sector_div(sector, granularity); ··· 477 if (sectors < PAGE_SIZE >> SECTOR_SHIFT) 478 sectors = PAGE_SIZE >> SECTOR_SHIFT; 479 return sectors; 480 + } 481 + 482 + /* Check if second and later bottom devices are compliant */ 483 + static bool blk_stack_atomic_writes_tail(struct queue_limits *t, 484 + struct queue_limits *b) 485 + { 486 + /* We're not going to support different boundary sizes.. yet */ 487 + if (t->atomic_write_hw_boundary != b->atomic_write_hw_boundary) 488 + return false; 489 + 490 + /* Can't support this */ 491 + if (t->atomic_write_hw_unit_min > b->atomic_write_hw_unit_max) 492 + return false; 493 + 494 + /* Or this */ 495 + if (t->atomic_write_hw_unit_max < b->atomic_write_hw_unit_min) 496 + return false; 497 + 498 + t->atomic_write_hw_max = min(t->atomic_write_hw_max, 499 + b->atomic_write_hw_max); 500 + t->atomic_write_hw_unit_min = max(t->atomic_write_hw_unit_min, 501 + b->atomic_write_hw_unit_min); 502 + t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max, 503 + b->atomic_write_hw_unit_max); 504 + return true; 505 + } 506 + 507 + /* Check for valid boundary of first bottom device */ 508 + static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t, 509 + struct queue_limits *b) 510 + { 511 + /* 512 + * Ensure atomic write boundary is aligned with chunk sectors. Stacked 513 + * devices store chunk sectors in t->io_min. 
514 + */ 515 + if (b->atomic_write_hw_boundary > t->io_min && 516 + b->atomic_write_hw_boundary % t->io_min) 517 + return false; 518 + if (t->io_min > b->atomic_write_hw_boundary && 519 + t->io_min % b->atomic_write_hw_boundary) 520 + return false; 521 + 522 + t->atomic_write_hw_boundary = b->atomic_write_hw_boundary; 523 + return true; 524 + } 525 + 526 + 527 + /* Check stacking of first bottom device */ 528 + static bool blk_stack_atomic_writes_head(struct queue_limits *t, 529 + struct queue_limits *b) 530 + { 531 + if (b->atomic_write_hw_boundary && 532 + !blk_stack_atomic_writes_boundary_head(t, b)) 533 + return false; 534 + 535 + if (t->io_min <= SECTOR_SIZE) { 536 + /* No chunk sectors, so use bottom device values directly */ 537 + t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; 538 + t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; 539 + t->atomic_write_hw_max = b->atomic_write_hw_max; 540 + return true; 541 + } 542 + 543 + /* 544 + * Find values for limits which work for chunk size. 545 + * b->atomic_write_hw_unit_{min, max} may not be aligned with chunk 546 + * size (t->io_min), as chunk size is not restricted to a power-of-2. 547 + * So we need to find highest power-of-2 which works for the chunk 548 + * size. 549 + * As an example scenario, we could have b->unit_max = 16K and 550 + * t->io_min = 24K. For this case, reduce t->unit_max to a value 551 + * aligned with both limits, i.e. 8K in this example. 552 + */ 553 + t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; 554 + while (t->io_min % t->atomic_write_hw_unit_max) 555 + t->atomic_write_hw_unit_max /= 2; 556 + 557 + t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min, 558 + t->atomic_write_hw_unit_max); 559 + t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min); 560 + 561 + return true; 562 + } 563 + 564 + static void blk_stack_atomic_writes_limits(struct queue_limits *t, 565 + struct queue_limits *b) 566 + { 567 + if (!(t->features & BLK_FEAT_ATOMIC_WRITES_STACKED)) 568 + goto unsupported; 569 + 570 + if (!b->atomic_write_unit_min) 571 + goto unsupported; 572 + 573 + /* 574 + * If atomic_write_hw_max is set, we have already stacked 1x bottom 575 + * device, so check for compliance. 576 + */ 577 + if (t->atomic_write_hw_max) { 578 + if (!blk_stack_atomic_writes_tail(t, b)) 579 + goto unsupported; 580 + return; 581 + } 582 + 583 + if (!blk_stack_atomic_writes_head(t, b)) 584 + goto unsupported; 585 + return; 586 + 587 + unsupported: 588 + t->atomic_write_hw_max = 0; 589 + t->atomic_write_hw_unit_max = 0; 590 + t->atomic_write_hw_unit_min = 0; 591 + t->atomic_write_hw_boundary = 0; 592 + t->features &= ~BLK_FEAT_ATOMIC_WRITES_STACKED; 593 } 594 595 /** ··· 639 t->zone_write_granularity = 0; 640 t->max_zone_append_sectors = 0; 641 } 642 + blk_stack_atomic_writes_limits(t, b); 643 + 644 return ret; 645 } 646 EXPORT_SYMBOL(blk_stack_limits);
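
The chunk-size handling in blk_stack_atomic_writes_head() above reduces to
one step: halve the (power-of-two) atomic_write_hw_unit_max until it divides
the chunk size held in t->io_min. A standalone sketch of just that loop,
reusing the 16K-unit / 24K-chunk example from the code comment (values are
illustrative):

  /* Illustration of the unit_max round-down used above: unit_max is a
   * power of two, so halving it until it divides chunk always terminates. */
  #include <assert.h>
  #include <stdio.h>

  static unsigned int stacked_unit_max(unsigned int unit_max, unsigned int chunk)
  {
      while (chunk % unit_max)
          unit_max /= 2;
      return unit_max;
  }

  int main(void)
  {
      /* Example from the comment: 16K unit_max vs 24K chunk gives 8K. */
      unsigned int r = stacked_unit_max(16 * 1024, 24 * 1024);

      assert(r == 8 * 1024);
      printf("stacked atomic_write_hw_unit_max = %u bytes\n", r);
      return 0;
  }
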
+2 -4
block/blk-sysfs.c
··· 810 * faster to shut down and is made fully functional here as 811 * request_queues for non-existent devices never get registered. 812 */ 813 - if (!blk_queue_init_done(q)) { 814 - blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); 815 - percpu_ref_switch_to_percpu(&q->q_usage_counter); 816 - } 817 818 return ret; 819
··· 810 * faster to shut down and is made fully functional here as 811 * request_queues for non-existent devices never get registered. 812 */ 813 + blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); 814 + percpu_ref_switch_to_percpu(&q->q_usage_counter); 815 816 return ret; 817
+10 -4
block/blk-zoned.c
··· 1551 unsigned int nr_seq_zones, nr_conv_zones; 1552 unsigned int pool_size; 1553 struct queue_limits lim; 1554 1555 disk->nr_zones = args->nr_zones; 1556 disk->zone_capacity = args->zone_capacity; ··· 1602 } 1603 1604 commit: 1605 - return queue_limits_commit_update(q, &lim); 1606 } 1607 1608 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, ··· 1821 * Set the new disk zone parameters only once the queue is frozen and 1822 * all I/Os are completed. 1823 */ 1824 - blk_mq_freeze_queue(q); 1825 if (ret > 0) 1826 ret = disk_update_zone_resources(disk, &args); 1827 else 1828 pr_warn("%s: failed to revalidate zones\n", disk->disk_name); 1829 - if (ret) 1830 disk_free_zone_resources(disk); 1831 - blk_mq_unfreeze_queue(q); 1832 1833 return ret; 1834 }
··· 1551 unsigned int nr_seq_zones, nr_conv_zones; 1552 unsigned int pool_size; 1553 struct queue_limits lim; 1554 + int ret; 1555 1556 disk->nr_zones = args->nr_zones; 1557 disk->zone_capacity = args->zone_capacity; ··· 1601 } 1602 1603 commit: 1604 + blk_mq_freeze_queue(q); 1605 + ret = queue_limits_commit_update(q, &lim); 1606 + blk_mq_unfreeze_queue(q); 1607 + 1608 + return ret; 1609 } 1610 1611 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, ··· 1816 * Set the new disk zone parameters only once the queue is frozen and 1817 * all I/Os are completed. 1818 */ 1819 if (ret > 0) 1820 ret = disk_update_zone_resources(disk, &args); 1821 else 1822 pr_warn("%s: failed to revalidate zones\n", disk->disk_name); 1823 + if (ret) { 1824 + blk_mq_freeze_queue(q); 1825 disk_free_zone_resources(disk); 1826 + blk_mq_unfreeze_queue(q); 1827 + } 1828 1829 return ret; 1830 }
+4 -1
block/fops.c
··· 677 struct file *file = iocb->ki_filp; 678 struct inode *bd_inode = bdev_file_inode(file); 679 struct block_device *bdev = I_BDEV(bd_inode); 680 loff_t size = bdev_nr_bytes(bdev); 681 size_t shorted = 0; 682 ssize_t ret; ··· 697 if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) 698 return -EOPNOTSUPP; 699 700 - if (iocb->ki_flags & IOCB_ATOMIC) { 701 ret = generic_atomic_write_valid(iocb, from); 702 if (ret) 703 return ret; ··· 705 706 size -= iocb->ki_pos; 707 if (iov_iter_count(from) > size) { 708 shorted = iov_iter_count(from) - size; 709 iov_iter_truncate(from, size); 710 }
··· 677 struct file *file = iocb->ki_filp; 678 struct inode *bd_inode = bdev_file_inode(file); 679 struct block_device *bdev = I_BDEV(bd_inode); 680 + bool atomic = iocb->ki_flags & IOCB_ATOMIC; 681 loff_t size = bdev_nr_bytes(bdev); 682 size_t shorted = 0; 683 ssize_t ret; ··· 696 if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) 697 return -EOPNOTSUPP; 698 699 + if (atomic) { 700 ret = generic_atomic_write_valid(iocb, from); 701 if (ret) 702 return ret; ··· 704 705 size -= iocb->ki_pos; 706 if (iov_iter_count(from) > size) { 707 + if (atomic) 708 + return -EINVAL; 709 shorted = iov_iter_count(from) - size; 710 iov_iter_truncate(from, size); 711 }
+3 -6
block/genhd.c
··· 742 * If the disk does not own the queue, allow using passthrough requests 743 * again. Else leave the queue frozen to fail all I/O. 744 */ 745 - if (!test_bit(GD_OWNS_QUEUE, &disk->state)) { 746 - blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); 747 __blk_mq_unfreeze_queue(q, true); 748 - } else { 749 - if (queue_is_mq(q)) 750 - blk_mq_exit_queue(q); 751 - } 752 753 if (start_drain) 754 blk_unfreeze_release_lock(q, true, queue_dying);
··· 742 * If the disk does not own the queue, allow using passthrough requests 743 * again. Else leave the queue frozen to fail all I/O. 744 */ 745 + if (!test_bit(GD_OWNS_QUEUE, &disk->state)) 746 __blk_mq_unfreeze_queue(q, true); 747 + else if (queue_is_mq(q)) 748 + blk_mq_exit_queue(q); 749 750 if (start_drain) 751 blk_unfreeze_release_lock(q, true, queue_dying);
+4 -9
block/mq-deadline.c
··· 685 686 prio = ioprio_class_to_prio[ioprio_class]; 687 per_prio = &dd->per_prio[prio]; 688 - if (!rq->elv.priv[0]) { 689 per_prio->stats.inserted++; 690 - rq->elv.priv[0] = (void *)(uintptr_t)1; 691 - } 692 693 if (blk_mq_sched_try_insert_merge(q, rq, free)) 694 return; ··· 752 */ 753 static void dd_finish_request(struct request *rq) 754 { 755 - struct request_queue *q = rq->q; 756 - struct deadline_data *dd = q->elevator->elevator_data; 757 - const u8 ioprio_class = dd_rq_ioclass(rq); 758 - const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; 759 - struct dd_per_prio *per_prio = &dd->per_prio[prio]; 760 761 /* 762 * The block layer core may call dd_finish_request() without having 763 * called dd_insert_requests(). Skip requests that bypassed I/O 764 * scheduling. See also blk_mq_request_bypass_insert(). 765 */ 766 - if (rq->elv.priv[0]) 767 atomic_inc(&per_prio->stats.completed); 768 } 769
··· 685 686 prio = ioprio_class_to_prio[ioprio_class]; 687 per_prio = &dd->per_prio[prio]; 688 + if (!rq->elv.priv[0]) 689 per_prio->stats.inserted++; 690 + rq->elv.priv[0] = per_prio; 691 692 if (blk_mq_sched_try_insert_merge(q, rq, free)) 693 return; ··· 753 */ 754 static void dd_finish_request(struct request *rq) 755 { 756 + struct dd_per_prio *per_prio = rq->elv.priv[0]; 757 758 /* 759 * The block layer core may call dd_finish_request() without having 760 * called dd_insert_requests(). Skip requests that bypassed I/O 761 * scheduling. See also blk_mq_request_bypass_insert(). 762 */ 763 + if (per_prio) 764 atomic_inc(&per_prio->stats.completed); 765 } 766
+3 -1
drivers/block/brd.c
··· 231 xa_lock(&brd->brd_pages); 232 while (size >= PAGE_SIZE && aligned_sector < rd_size * 2) { 233 page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT); 234 - if (page) 235 __free_page(page); 236 aligned_sector += PAGE_SECTORS; 237 size -= PAGE_SIZE; 238 }
··· 231 xa_lock(&brd->brd_pages); 232 while (size >= PAGE_SIZE && aligned_sector < rd_size * 2) { 233 page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT); 234 + if (page) { 235 __free_page(page); 236 + brd->brd_nr_pages--; 237 + } 238 aligned_sector += PAGE_SECTORS; 239 size -= PAGE_SIZE; 240 }
+15 -15
drivers/block/loop.c
··· 770 &loop_attribute_group); 771 } 772 773 - static void loop_config_discard(struct loop_device *lo, 774 - struct queue_limits *lim) 775 { 776 struct file *file = lo->lo_backing_file; 777 struct inode *inode = file->f_mapping->host; 778 - u32 granularity = 0, max_discard_sectors = 0; 779 struct kstatfs sbuf; 780 781 /* ··· 787 if (S_ISBLK(inode->i_mode)) { 788 struct block_device *bdev = I_BDEV(inode); 789 790 - max_discard_sectors = bdev_write_zeroes_sectors(bdev); 791 - granularity = bdev_discard_granularity(bdev); 792 793 /* 794 * We use punch hole to reclaim the free space used by the 795 * image a.k.a. discard. 796 */ 797 } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { 798 - max_discard_sectors = UINT_MAX >> 9; 799 - granularity = sbuf.f_bsize; 800 } 801 - 802 - lim->max_hw_discard_sectors = max_discard_sectors; 803 - lim->max_write_zeroes_sectors = max_discard_sectors; 804 - if (max_discard_sectors) 805 - lim->discard_granularity = granularity; 806 - else 807 - lim->discard_granularity = 0; 808 } 809 810 struct loop_worker { ··· 983 struct inode *inode = file->f_mapping->host; 984 struct block_device *backing_bdev = NULL; 985 struct queue_limits lim; 986 987 if (S_ISBLK(inode->i_mode)) 988 backing_bdev = I_BDEV(inode); ··· 992 993 if (!bsize) 994 bsize = loop_default_blocksize(lo, backing_bdev); 995 996 lim = queue_limits_start_update(lo->lo_queue); 997 lim.logical_block_size = bsize; ··· 1004 lim.features |= BLK_FEAT_WRITE_CACHE; 1005 if (backing_bdev && !bdev_nonrot(backing_bdev)) 1006 lim.features |= BLK_FEAT_ROTATIONAL; 1007 - loop_config_discard(lo, &lim); 1008 return queue_limits_commit_update(lo->lo_queue, &lim); 1009 } 1010
··· 770 &loop_attribute_group); 771 } 772 773 + static void loop_get_discard_config(struct loop_device *lo, 774 + u32 *granularity, u32 *max_discard_sectors) 775 { 776 struct file *file = lo->lo_backing_file; 777 struct inode *inode = file->f_mapping->host; 778 struct kstatfs sbuf; 779 780 /* ··· 788 if (S_ISBLK(inode->i_mode)) { 789 struct block_device *bdev = I_BDEV(inode); 790 791 + *max_discard_sectors = bdev_write_zeroes_sectors(bdev); 792 + *granularity = bdev_discard_granularity(bdev); 793 794 /* 795 * We use punch hole to reclaim the free space used by the 796 * image a.k.a. discard. 797 */ 798 } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { 799 + *max_discard_sectors = UINT_MAX >> 9; 800 + *granularity = sbuf.f_bsize; 801 } 802 } 803 804 struct loop_worker { ··· 991 struct inode *inode = file->f_mapping->host; 992 struct block_device *backing_bdev = NULL; 993 struct queue_limits lim; 994 + u32 granularity = 0, max_discard_sectors = 0; 995 996 if (S_ISBLK(inode->i_mode)) 997 backing_bdev = I_BDEV(inode); ··· 999 1000 if (!bsize) 1001 bsize = loop_default_blocksize(lo, backing_bdev); 1002 + 1003 + loop_get_discard_config(lo, &granularity, &max_discard_sectors); 1004 1005 lim = queue_limits_start_update(lo->lo_queue); 1006 lim.logical_block_size = bsize; ··· 1009 lim.features |= BLK_FEAT_WRITE_CACHE; 1010 if (backing_bdev && !bdev_nonrot(backing_bdev)) 1011 lim.features |= BLK_FEAT_ROTATIONAL; 1012 + lim.max_hw_discard_sectors = max_discard_sectors; 1013 + lim.max_write_zeroes_sectors = max_discard_sectors; 1014 + if (max_discard_sectors) 1015 + lim.discard_granularity = granularity; 1016 + else 1017 + lim.discard_granularity = 0; 1018 return queue_limits_commit_update(lo->lo_queue, &lim); 1019 } 1020
+1 -1
drivers/block/ublk_drv.c
··· 3041 ret = ublk_ctrl_end_recovery(ub, cmd); 3042 break; 3043 default: 3044 - ret = -ENOTSUPP; 3045 break; 3046 } 3047
··· 3041 ret = ublk_ctrl_end_recovery(ub, cmd); 3042 break; 3043 default: 3044 + ret = -EOPNOTSUPP; 3045 break; 3046 } 3047
+1
drivers/md/raid0.c
··· 384 lim.max_write_zeroes_sectors = mddev->chunk_sectors; 385 lim.io_min = mddev->chunk_sectors << 9; 386 lim.io_opt = lim.io_min * mddev->raid_disks; 387 err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); 388 if (err) { 389 queue_limits_cancel_update(mddev->gendisk->queue);
··· 384 lim.max_write_zeroes_sectors = mddev->chunk_sectors; 385 lim.io_min = mddev->chunk_sectors << 9; 386 lim.io_opt = lim.io_min * mddev->raid_disks; 387 + lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED; 388 err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); 389 if (err) { 390 queue_limits_cancel_update(mddev->gendisk->queue);
+18 -2
drivers/md/raid1.c
··· 1571 continue; 1572 } 1573 if (is_bad) { 1574 - int good_sectors = first_bad - r1_bio->sector; 1575 if (good_sectors < max_sectors) 1576 max_sectors = good_sectors; 1577 } ··· 1671 1672 mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset); 1673 mbio->bi_end_io = raid1_end_write_request; 1674 - mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); 1675 if (test_bit(FailFast, &rdev->flags) && 1676 !test_bit(WriteMostly, &rdev->flags) && 1677 conf->raid_disks - mddev->degraded > 1) ··· 3239 3240 md_init_stacking_limits(&lim); 3241 lim.max_write_zeroes_sectors = 0; 3242 err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); 3243 if (err) { 3244 queue_limits_cancel_update(mddev->gendisk->queue);
··· 1571 continue; 1572 } 1573 if (is_bad) { 1574 + int good_sectors; 1575 + 1576 + /* 1577 + * We cannot atomically write this, so just 1578 + * error in that case. It could be possible to 1579 + * atomically write other mirrors, but the 1580 + * complexity of supporting that is not worth 1581 + * the benefit. 1582 + */ 1583 + if (bio->bi_opf & REQ_ATOMIC) { 1584 + error = -EIO; 1585 + goto err_handle; 1586 + } 1587 + 1588 + good_sectors = first_bad - r1_bio->sector; 1589 if (good_sectors < max_sectors) 1590 max_sectors = good_sectors; 1591 } ··· 1657 1658 mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset); 1659 mbio->bi_end_io = raid1_end_write_request; 1660 + mbio->bi_opf = bio_op(bio) | 1661 + (bio->bi_opf & (REQ_SYNC | REQ_FUA | REQ_ATOMIC)); 1662 if (test_bit(FailFast, &rdev->flags) && 1663 !test_bit(WriteMostly, &rdev->flags) && 1664 conf->raid_disks - mddev->degraded > 1) ··· 3224 3225 md_init_stacking_limits(&lim); 3226 lim.max_write_zeroes_sectors = 0; 3227 + lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED; 3228 err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); 3229 if (err) { 3230 queue_limits_cancel_update(mddev->gendisk->queue);
+18 -2
drivers/md/raid10.c
··· 1255 const enum req_op op = bio_op(bio); 1256 const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 1257 const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; 1258 unsigned long flags; 1259 struct r10conf *conf = mddev->private; 1260 struct md_rdev *rdev; ··· 1274 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + 1275 choose_data_offset(r10_bio, rdev)); 1276 mbio->bi_end_io = raid10_end_write_request; 1277 - mbio->bi_opf = op | do_sync | do_fua; 1278 if (!replacement && test_bit(FailFast, 1279 &conf->mirrors[devnum].rdev->flags) 1280 && enough(conf, devnum)) ··· 1469 continue; 1470 } 1471 if (is_bad) { 1472 - int good_sectors = first_bad - dev_sector; 1473 if (good_sectors < max_sectors) 1474 max_sectors = good_sectors; 1475 } ··· 4040 lim.max_write_zeroes_sectors = 0; 4041 lim.io_min = mddev->chunk_sectors << 9; 4042 lim.io_opt = lim.io_min * raid10_nr_stripes(conf); 4043 err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); 4044 if (err) { 4045 queue_limits_cancel_update(mddev->gendisk->queue);
··· 1255 const enum req_op op = bio_op(bio); 1256 const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 1257 const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; 1258 + const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC; 1259 unsigned long flags; 1260 struct r10conf *conf = mddev->private; 1261 struct md_rdev *rdev; ··· 1273 mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + 1274 choose_data_offset(r10_bio, rdev)); 1275 mbio->bi_end_io = raid10_end_write_request; 1276 + mbio->bi_opf = op | do_sync | do_fua | do_atomic; 1277 if (!replacement && test_bit(FailFast, 1278 &conf->mirrors[devnum].rdev->flags) 1279 && enough(conf, devnum)) ··· 1468 continue; 1469 } 1470 if (is_bad) { 1471 + int good_sectors; 1472 + 1473 + /* 1474 + * We cannot atomically write this, so just 1475 + * error in that case. It could be possible to 1476 + * atomically write other mirrors, but the 1477 + * complexity of supporting that is not worth 1478 + * the benefit. 1479 + */ 1480 + if (bio->bi_opf & REQ_ATOMIC) { 1481 + error = -EIO; 1482 + goto err_handle; 1483 + } 1484 + 1485 + good_sectors = first_bad - dev_sector; 1486 if (good_sectors < max_sectors) 1487 max_sectors = good_sectors; 1488 } ··· 4025 lim.max_write_zeroes_sectors = 0; 4026 lim.io_min = mddev->chunk_sectors << 9; 4027 lim.io_opt = lim.io_min * raid10_nr_stripes(conf); 4028 + lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED; 4029 err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); 4030 if (err) { 4031 queue_limits_cancel_update(mddev->gendisk->queue);
+15 -7
drivers/nvme/host/core.c
··· 1305 queue_delayed_work(nvme_wq, &ctrl->ka_work, delay); 1306 } 1307 1308 - static void nvme_keep_alive_finish(struct request *rq, 1309 - blk_status_t status, struct nvme_ctrl *ctrl) 1310 { 1311 unsigned long rtt = jiffies - (rq->deadline - rq->timeout); 1312 unsigned long delay = nvme_keep_alive_work_period(ctrl); 1313 enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); ··· 1325 delay = 0; 1326 } 1327 1328 if (status) { 1329 dev_err(ctrl->device, 1330 "failed nvme_keep_alive_end_io error=%d\n", 1331 status); 1332 - return; 1333 } 1334 1335 ctrl->ka_last_check_time = jiffies; 1336 ctrl->comp_seen = false; 1337 if (state == NVME_CTRL_LIVE || state == NVME_CTRL_CONNECTING) 1338 queue_delayed_work(nvme_wq, &ctrl->ka_work, delay); 1339 } 1340 1341 static void nvme_keep_alive_work(struct work_struct *work) ··· 1347 struct nvme_ctrl, ka_work); 1348 bool comp_seen = ctrl->comp_seen; 1349 struct request *rq; 1350 - blk_status_t status; 1351 1352 ctrl->ka_last_check_time = jiffies; 1353 ··· 1369 nvme_init_request(rq, &ctrl->ka_cmd); 1370 1371 rq->timeout = ctrl->kato * HZ; 1372 - status = blk_execute_rq(rq, false); 1373 - nvme_keep_alive_finish(rq, status, ctrl); 1374 - blk_mq_free_request(rq); 1375 } 1376 1377 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) ··· 4603 4604 void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) 4605 { 4606 blk_mq_destroy_queue(ctrl->admin_q); 4607 blk_put_queue(ctrl->admin_q); 4608 if (ctrl->ops->flags & NVME_F_FABRICS) {
··· 1305 queue_delayed_work(nvme_wq, &ctrl->ka_work, delay); 1306 } 1307 1308 + static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq, 1309 + blk_status_t status) 1310 { 1311 + struct nvme_ctrl *ctrl = rq->end_io_data; 1312 unsigned long rtt = jiffies - (rq->deadline - rq->timeout); 1313 unsigned long delay = nvme_keep_alive_work_period(ctrl); 1314 enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); ··· 1324 delay = 0; 1325 } 1326 1327 + blk_mq_free_request(rq); 1328 + 1329 if (status) { 1330 dev_err(ctrl->device, 1331 "failed nvme_keep_alive_end_io error=%d\n", 1332 status); 1333 + return RQ_END_IO_NONE; 1334 } 1335 1336 ctrl->ka_last_check_time = jiffies; 1337 ctrl->comp_seen = false; 1338 if (state == NVME_CTRL_LIVE || state == NVME_CTRL_CONNECTING) 1339 queue_delayed_work(nvme_wq, &ctrl->ka_work, delay); 1340 + return RQ_END_IO_NONE; 1341 } 1342 1343 static void nvme_keep_alive_work(struct work_struct *work) ··· 1343 struct nvme_ctrl, ka_work); 1344 bool comp_seen = ctrl->comp_seen; 1345 struct request *rq; 1346 1347 ctrl->ka_last_check_time = jiffies; 1348 ··· 1366 nvme_init_request(rq, &ctrl->ka_cmd); 1367 1368 rq->timeout = ctrl->kato * HZ; 1369 + rq->end_io = nvme_keep_alive_end_io; 1370 + rq->end_io_data = ctrl; 1371 + blk_execute_rq_nowait(rq, false); 1372 } 1373 1374 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) ··· 4600 4601 void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) 4602 { 4603 + /* 4604 + * As we're about to destroy the queue and free tagset 4605 + * we can not have keep-alive work running. 4606 + */ 4607 + nvme_stop_keep_alive(ctrl); 4608 blk_mq_destroy_queue(ctrl->admin_q); 4609 blk_put_queue(ctrl->admin_q); 4610 if (ctrl->ops->flags & NVME_F_FABRICS) {
+10 -2
drivers/nvme/host/ioctl.c
··· 120 struct nvme_ns *ns = q->queuedata; 121 struct block_device *bdev = ns ? ns->disk->part0 : NULL; 122 bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk); 123 bool has_metadata = meta_buffer && meta_len; 124 struct bio *bio = NULL; 125 int ret; 126 127 - if (has_metadata && !supports_metadata) 128 - return -EINVAL; 129 130 if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { 131 struct iov_iter iter;
··· 120 struct nvme_ns *ns = q->queuedata; 121 struct block_device *bdev = ns ? ns->disk->part0 : NULL; 122 bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk); 123 + struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; 124 bool has_metadata = meta_buffer && meta_len; 125 struct bio *bio = NULL; 126 int ret; 127 128 + if (!nvme_ctrl_sgl_supported(ctrl)) 129 + dev_warn_once(ctrl->device, "using unchecked data buffer\n"); 130 + if (has_metadata) { 131 + if (!supports_metadata) 132 + return -EINVAL; 133 + if (!nvme_ctrl_meta_sgl_supported(ctrl)) 134 + dev_warn_once(ctrl->device, 135 + "using unchecked metadata buffer\n"); 136 + } 137 138 if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { 139 struct iov_iter iter;
+14 -7
drivers/nvme/host/multipath.c
··· 165 int srcu_idx; 166 167 srcu_idx = srcu_read_lock(&ctrl->srcu); 168 - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { 169 if (!ns->head->disk) 170 continue; 171 kblockd_schedule_work(&ns->head->requeue_work); ··· 210 int srcu_idx; 211 212 srcu_idx = srcu_read_lock(&ctrl->srcu); 213 - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { 214 nvme_mpath_clear_current_path(ns); 215 kblockd_schedule_work(&ns->head->requeue_work); 216 } ··· 226 int srcu_idx; 227 228 srcu_idx = srcu_read_lock(&head->srcu); 229 - list_for_each_entry_rcu(ns, &head->list, siblings) { 230 if (capacity != get_capacity(ns->disk)) 231 clear_bit(NVME_NS_READY, &ns->flags); 232 } ··· 260 int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; 261 struct nvme_ns *found = NULL, *fallback = NULL, *ns; 262 263 - list_for_each_entry_rcu(ns, &head->list, siblings) { 264 if (nvme_path_is_disabled(ns)) 265 continue; 266 ··· 360 unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX; 361 unsigned int depth; 362 363 - list_for_each_entry_rcu(ns, &head->list, siblings) { 364 if (nvme_path_is_disabled(ns)) 365 continue; 366 ··· 429 if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) 430 return NULL; 431 432 - list_for_each_entry_rcu(ns, &head->list, siblings) { 433 if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) 434 continue; 435 switch (nvme_ctrl_state(ns->ctrl)) { ··· 789 return 0; 790 791 srcu_idx = srcu_read_lock(&ctrl->srcu); 792 - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { 793 unsigned nsid; 794 again: 795 nsid = le32_to_cpu(desc->nsids[n]);
··· 165 int srcu_idx; 166 167 srcu_idx = srcu_read_lock(&ctrl->srcu); 168 + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, 169 + srcu_read_lock_held(&ctrl->srcu)) { 170 if (!ns->head->disk) 171 continue; 172 kblockd_schedule_work(&ns->head->requeue_work); ··· 209 int srcu_idx; 210 211 srcu_idx = srcu_read_lock(&ctrl->srcu); 212 + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, 213 + srcu_read_lock_held(&ctrl->srcu)) { 214 nvme_mpath_clear_current_path(ns); 215 kblockd_schedule_work(&ns->head->requeue_work); 216 } ··· 224 int srcu_idx; 225 226 srcu_idx = srcu_read_lock(&head->srcu); 227 + list_for_each_entry_srcu(ns, &head->list, siblings, 228 + srcu_read_lock_held(&head->srcu)) { 229 if (capacity != get_capacity(ns->disk)) 230 clear_bit(NVME_NS_READY, &ns->flags); 231 } ··· 257 int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; 258 struct nvme_ns *found = NULL, *fallback = NULL, *ns; 259 260 + list_for_each_entry_srcu(ns, &head->list, siblings, 261 + srcu_read_lock_held(&head->srcu)) { 262 if (nvme_path_is_disabled(ns)) 263 continue; 264 ··· 356 unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX; 357 unsigned int depth; 358 359 + list_for_each_entry_srcu(ns, &head->list, siblings, 360 + srcu_read_lock_held(&head->srcu)) { 361 if (nvme_path_is_disabled(ns)) 362 continue; 363 ··· 424 if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) 425 return NULL; 426 427 + list_for_each_entry_srcu(ns, &head->list, siblings, 428 + srcu_read_lock_held(&head->srcu)) { 429 if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) 430 continue; 431 switch (nvme_ctrl_state(ns->ctrl)) { ··· 783 return 0; 784 785 srcu_idx = srcu_read_lock(&ctrl->srcu); 786 + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, 787 + srcu_read_lock_held(&ctrl->srcu)) { 788 unsigned nsid; 789 again: 790 nsid = le32_to_cpu(desc->nsids[n]);
+9 -1
drivers/nvme/host/nvme.h
··· 1123 1124 static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) 1125 { 1126 - return ctrl->sgls & ((1 << 0) | (1 << 1)); 1127 } 1128 1129 #ifdef CONFIG_NVME_HOST_AUTH
··· 1123 1124 static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) 1125 { 1126 + return ctrl->sgls & (NVME_CTRL_SGLS_BYTE_ALIGNED | 1127 + NVME_CTRL_SGLS_DWORD_ALIGNED); 1128 + } 1129 + 1130 + static inline bool nvme_ctrl_meta_sgl_supported(struct nvme_ctrl *ctrl) 1131 + { 1132 + if (ctrl->ops->flags & NVME_F_FABRICS) 1133 + return true; 1134 + return ctrl->sgls & NVME_CTRL_SGLS_MSDS; 1135 } 1136 1137 #ifdef CONFIG_NVME_HOST_AUTH
+131 -16
drivers/nvme/host/pci.c
··· 43 */ 44 #define NVME_MAX_KB_SZ 8192 45 #define NVME_MAX_SEGS 128 46 #define NVME_MAX_NR_ALLOCATIONS 5 47 48 static int use_threaded_interrupts; ··· 145 struct sg_table *hmb_sgt; 146 147 mempool_t *iod_mempool; 148 149 /* shadow doorbell buffer support: */ 150 __le32 *dbbuf_dbs; ··· 241 dma_addr_t first_dma; 242 dma_addr_t meta_dma; 243 struct sg_table sgt; 244 union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS]; 245 }; 246 ··· 510 spin_unlock(&nvmeq->sq_lock); 511 } 512 513 static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, 514 int nseg) 515 { ··· 531 return false; 532 if (!nvmeq->qid) 533 return false; 534 if (!sgl_threshold || avg_seg_size < sgl_threshold) 535 - return false; 536 return true; 537 } 538 ··· 795 struct bio_vec bv = req_bvec(req); 796 797 if (!is_pci_p2pdma_page(bv.bv_page)) { 798 - if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + 799 bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) 800 return nvme_setup_prp_simple(dev, req, 801 &cmnd->rw, &bv); ··· 840 return ret; 841 } 842 843 - static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, 844 - struct nvme_command *cmnd) 845 { 846 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 847 struct bio_vec bv = rq_integrity_vec(req); 848 849 iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0); 850 if (dma_mapping_error(dev->dev, iod->meta_dma)) 851 return BLK_STS_IOERR; 852 cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); 853 return BLK_STS_OK; 854 } 855 856 static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) ··· 926 iod->aborted = false; 927 iod->nr_allocations = -1; 928 iod->sgt.nents = 0; 929 930 ret = nvme_setup_cmd(req->q->queuedata, req); 931 if (ret) ··· 939 } 940 941 if (blk_integrity_rq(req)) { 942 - ret = nvme_map_metadata(dev, req, &iod->cmd); 943 if (ret) 944 goto out_unmap_data; 945 } ··· 1037 *rqlist = requeue_list; 1038 } 1039 1040 static __always_inline void nvme_pci_unmap_rq(struct request *req) 1041 { 1042 struct nvme_queue *nvmeq = req->mq_hctx->driver_data; 1043 struct nvme_dev *dev = nvmeq->dev; 1044 1045 - if (blk_integrity_rq(req)) { 1046 - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 1047 - 1048 - dma_unmap_page(dev->dev, iod->meta_dma, 1049 - rq_integrity_vec(req).bv_len, rq_dma_dir(req)); 1050 - } 1051 1052 if (blk_rq_nr_phys_segments(req)) 1053 nvme_unmap_data(dev, req); ··· 2857 2858 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) 2859 { 2860 size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS; 2861 2862 dev->iod_mempool = mempool_create_node(1, ··· 2866 dev_to_node(dev->dev)); 2867 if (!dev->iod_mempool) 2868 return -ENOMEM; 2869 return 0; 2870 } 2871 2872 static void nvme_free_tagset(struct nvme_dev *dev) ··· 2941 result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend); 2942 if (result) 2943 goto out; 2944 2945 nvme_dbbuf_dma_alloc(dev); 2946 ··· 3214 dev->ctrl.max_hw_sectors = min_t(u32, 3215 NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9); 3216 dev->ctrl.max_segments = NVME_MAX_SEGS; 3217 - 3218 - /* 3219 - * There is no support for SGLs for metadata (yet), so we are limited to 3220 - * a single integrity segment for the separate metadata pointer. 
3221 - */ 3222 dev->ctrl.max_integrity_segments = 1; 3223 return dev; 3224 ··· 3276 if (result) 3277 goto out_disable; 3278 3279 nvme_dbbuf_dma_alloc(dev); 3280 3281 result = nvme_setup_host_mem(dev); ··· 3323 nvme_free_queues(dev, 0); 3324 out_release_iod_mempool: 3325 mempool_destroy(dev->iod_mempool); 3326 out_release_prp_pools: 3327 nvme_release_prp_pools(dev); 3328 out_dev_unmap: ··· 3389 nvme_dbbuf_dma_free(dev); 3390 nvme_free_queues(dev, 0); 3391 mempool_destroy(dev->iod_mempool); 3392 nvme_release_prp_pools(dev); 3393 nvme_dev_unmap(dev); 3394 nvme_uninit_ctrl(&dev->ctrl);
··· 43 */ 44 #define NVME_MAX_KB_SZ 8192 45 #define NVME_MAX_SEGS 128 46 + #define NVME_MAX_META_SEGS 15 47 #define NVME_MAX_NR_ALLOCATIONS 5 48 49 static int use_threaded_interrupts; ··· 144 struct sg_table *hmb_sgt; 145 146 mempool_t *iod_mempool; 147 + mempool_t *iod_meta_mempool; 148 149 /* shadow doorbell buffer support: */ 150 __le32 *dbbuf_dbs; ··· 239 dma_addr_t first_dma; 240 dma_addr_t meta_dma; 241 struct sg_table sgt; 242 + struct sg_table meta_sgt; 243 + union nvme_descriptor meta_list; 244 union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS]; 245 }; 246 ··· 506 spin_unlock(&nvmeq->sq_lock); 507 } 508 509 + static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev, 510 + struct request *req) 511 + { 512 + if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl)) 513 + return false; 514 + return req->nr_integrity_segments > 1 || 515 + nvme_req(req)->flags & NVME_REQ_USERCMD; 516 + } 517 + 518 static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, 519 int nseg) 520 { ··· 518 return false; 519 if (!nvmeq->qid) 520 return false; 521 + if (nvme_pci_metadata_use_sgls(dev, req)) 522 + return true; 523 if (!sgl_threshold || avg_seg_size < sgl_threshold) 524 + return nvme_req(req)->flags & NVME_REQ_USERCMD; 525 return true; 526 } 527 ··· 780 struct bio_vec bv = req_bvec(req); 781 782 if (!is_pci_p2pdma_page(bv.bv_page)) { 783 + if (!nvme_pci_metadata_use_sgls(dev, req) && 784 + (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + 785 bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) 786 return nvme_setup_prp_simple(dev, req, 787 &cmnd->rw, &bv); ··· 824 return ret; 825 } 826 827 + static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, 828 + struct request *req) 829 + { 830 + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 831 + struct nvme_rw_command *cmnd = &iod->cmd.rw; 832 + struct nvme_sgl_desc *sg_list; 833 + struct scatterlist *sgl, *sg; 834 + unsigned int entries; 835 + dma_addr_t sgl_dma; 836 + int rc, i; 837 + 838 + iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC); 839 + if (!iod->meta_sgt.sgl) 840 + return BLK_STS_RESOURCE; 841 + 842 + sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments); 843 + iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req, 844 + iod->meta_sgt.sgl); 845 + if (!iod->meta_sgt.orig_nents) 846 + goto out_free_sg; 847 + 848 + rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 849 + DMA_ATTR_NO_WARN); 850 + if (rc) 851 + goto out_free_sg; 852 + 853 + sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma); 854 + if (!sg_list) 855 + goto out_unmap_sg; 856 + 857 + entries = iod->meta_sgt.nents; 858 + iod->meta_list.sg_list = sg_list; 859 + iod->meta_dma = sgl_dma; 860 + 861 + cmnd->flags = NVME_CMD_SGL_METASEG; 862 + cmnd->metadata = cpu_to_le64(sgl_dma); 863 + 864 + sgl = iod->meta_sgt.sgl; 865 + if (entries == 1) { 866 + nvme_pci_sgl_set_data(sg_list, sgl); 867 + return BLK_STS_OK; 868 + } 869 + 870 + sgl_dma += sizeof(*sg_list); 871 + nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries); 872 + for_each_sg(sgl, sg, entries, i) 873 + nvme_pci_sgl_set_data(&sg_list[i + 1], sg); 874 + 875 + return BLK_STS_OK; 876 + 877 + out_unmap_sg: 878 + dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); 879 + out_free_sg: 880 + mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); 881 + return BLK_STS_RESOURCE; 882 + } 883 + 884 + static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev, 885 + struct request *req) 886 { 887 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 888 struct bio_vec 
bv = rq_integrity_vec(req); 889 + struct nvme_command *cmnd = &iod->cmd; 890 891 iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0); 892 if (dma_mapping_error(dev->dev, iod->meta_dma)) 893 return BLK_STS_IOERR; 894 cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); 895 return BLK_STS_OK; 896 + } 897 + 898 + static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req) 899 + { 900 + if (nvme_pci_metadata_use_sgls(dev, req)) 901 + return nvme_pci_setup_meta_sgls(dev, req); 902 + return nvme_pci_setup_meta_mptr(dev, req); 903 } 904 905 static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) ··· 845 iod->aborted = false; 846 iod->nr_allocations = -1; 847 iod->sgt.nents = 0; 848 + iod->meta_sgt.nents = 0; 849 850 ret = nvme_setup_cmd(req->q->queuedata, req); 851 if (ret) ··· 857 } 858 859 if (blk_integrity_rq(req)) { 860 + ret = nvme_map_metadata(dev, req); 861 if (ret) 862 goto out_unmap_data; 863 } ··· 955 *rqlist = requeue_list; 956 } 957 958 + static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, 959 + struct request *req) 960 + { 961 + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 962 + 963 + if (!iod->meta_sgt.nents) { 964 + dma_unmap_page(dev->dev, iod->meta_dma, 965 + rq_integrity_vec(req).bv_len, 966 + rq_dma_dir(req)); 967 + return; 968 + } 969 + 970 + dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list, 971 + iod->meta_dma); 972 + dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); 973 + mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); 974 + } 975 + 976 static __always_inline void nvme_pci_unmap_rq(struct request *req) 977 { 978 struct nvme_queue *nvmeq = req->mq_hctx->driver_data; 979 struct nvme_dev *dev = nvmeq->dev; 980 981 + if (blk_integrity_rq(req)) 982 + nvme_unmap_metadata(dev, req); 983 984 if (blk_rq_nr_phys_segments(req)) 985 nvme_unmap_data(dev, req); ··· 2761 2762 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) 2763 { 2764 + size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1); 2765 size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS; 2766 2767 dev->iod_mempool = mempool_create_node(1, ··· 2769 dev_to_node(dev->dev)); 2770 if (!dev->iod_mempool) 2771 return -ENOMEM; 2772 + 2773 + dev->iod_meta_mempool = mempool_create_node(1, 2774 + mempool_kmalloc, mempool_kfree, 2775 + (void *)meta_size, GFP_KERNEL, 2776 + dev_to_node(dev->dev)); 2777 + if (!dev->iod_meta_mempool) 2778 + goto free; 2779 + 2780 return 0; 2781 + free: 2782 + mempool_destroy(dev->iod_mempool); 2783 + return -ENOMEM; 2784 } 2785 2786 static void nvme_free_tagset(struct nvme_dev *dev) ··· 2833 result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend); 2834 if (result) 2835 goto out; 2836 + 2837 + if (nvme_ctrl_meta_sgl_supported(&dev->ctrl)) 2838 + dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS; 2839 + else 2840 + dev->ctrl.max_integrity_segments = 1; 2841 2842 nvme_dbbuf_dma_alloc(dev); 2843 ··· 3101 dev->ctrl.max_hw_sectors = min_t(u32, 3102 NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9); 3103 dev->ctrl.max_segments = NVME_MAX_SEGS; 3104 dev->ctrl.max_integrity_segments = 1; 3105 return dev; 3106 ··· 3168 if (result) 3169 goto out_disable; 3170 3171 + if (nvme_ctrl_meta_sgl_supported(&dev->ctrl)) 3172 + dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS; 3173 + else 3174 + dev->ctrl.max_integrity_segments = 1; 3175 + 3176 nvme_dbbuf_dma_alloc(dev); 3177 3178 result = nvme_setup_host_mem(dev); ··· 3210 nvme_free_queues(dev, 0); 3211 
out_release_iod_mempool: 3212 mempool_destroy(dev->iod_mempool); 3213 + mempool_destroy(dev->iod_meta_mempool); 3214 out_release_prp_pools: 3215 nvme_release_prp_pools(dev); 3216 out_dev_unmap: ··· 3275 nvme_dbbuf_dma_free(dev); 3276 nvme_free_queues(dev, 0); 3277 mempool_destroy(dev->iod_mempool); 3278 + mempool_destroy(dev->iod_meta_mempool); 3279 nvme_release_prp_pools(dev); 3280 nvme_dev_unmap(dev); 3281 nvme_uninit_ctrl(&dev->ctrl);
+77 -49
drivers/nvme/host/pr.c
··· 94 } 95 } 96 97 - static int nvme_send_pr_command(struct block_device *bdev, 98 - struct nvme_command *c, void *data, unsigned int data_len) 99 { 100 - if (nvme_disk_is_ns_head(bdev->bd_disk)) 101 - return nvme_send_ns_head_pr_command(bdev, c, data, data_len); 102 - 103 - return nvme_send_ns_pr_command(bdev->bd_disk->private_data, c, data, 104 - data_len); 105 - } 106 - 107 - static int nvme_pr_command(struct block_device *bdev, u32 cdw10, 108 - u64 key, u64 sa_key, u8 op) 109 - { 110 - struct nvme_command c = { }; 111 - u8 data[16] = { 0, }; 112 - int ret; 113 - 114 - put_unaligned_le64(key, &data[0]); 115 - put_unaligned_le64(sa_key, &data[8]); 116 117 c.common.opcode = op; 118 c.common.cdw10 = cpu_to_le32(cdw10); 119 120 - ret = nvme_send_pr_command(bdev, &c, data, sizeof(data)); 121 - if (ret < 0) 122 - return ret; 123 - 124 - return nvme_status_to_pr_err(ret); 125 } 126 127 - static int nvme_pr_register(struct block_device *bdev, u64 old, 128 - u64 new, unsigned flags) 129 { 130 u32 cdw10; 131 132 if (flags & ~PR_FL_IGNORE_KEY) 133 return -EOPNOTSUPP; 134 135 - cdw10 = old ? 2 : 0; 136 - cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; 137 - cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ 138 - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); 139 } 140 141 static int nvme_pr_reserve(struct block_device *bdev, u64 key, 142 enum pr_type type, unsigned flags) 143 { 144 u32 cdw10; 145 146 if (flags & ~PR_FL_IGNORE_KEY) 147 return -EOPNOTSUPP; 148 149 - cdw10 = nvme_pr_type_from_blk(type) << 8; 150 - cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); 151 - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); 152 } 153 154 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, 155 enum pr_type type, bool abort) 156 { 157 - u32 cdw10 = nvme_pr_type_from_blk(type) << 8 | (abort ? 2 : 1); 158 159 - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); 160 } 161 162 static int nvme_pr_clear(struct block_device *bdev, u64 key) 163 { 164 - u32 cdw10 = 1 | (key ? 0 : 1 << 3); 165 166 - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); 167 } 168 169 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) 170 { 171 - u32 cdw10 = nvme_pr_type_from_blk(type) << 8 | (key ? 0 : 1 << 3); 172 173 - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); 174 } 175 176 static int nvme_pr_resv_report(struct block_device *bdev, void *data, 177 u32 data_len, bool *eds) 178 { 179 - struct nvme_command c = { }; 180 int ret; 181 182 - c.common.opcode = nvme_cmd_resv_report; 183 - c.common.cdw10 = cpu_to_le32(nvme_bytes_to_numd(data_len)); 184 - c.common.cdw11 = cpu_to_le32(NVME_EXTENDED_DATA_STRUCT); 185 *eds = true; 186 187 retry: 188 - ret = nvme_send_pr_command(bdev, &c, data, data_len); 189 if (ret == NVME_SC_HOST_ID_INCONSIST && 190 - c.common.cdw11 == cpu_to_le32(NVME_EXTENDED_DATA_STRUCT)) { 191 - c.common.cdw11 = 0; 192 *eds = false; 193 goto retry; 194 } 195 196 - if (ret < 0) 197 - return ret; 198 - 199 - return nvme_status_to_pr_err(ret); 200 } 201 202 static int nvme_pr_read_keys(struct block_device *bdev,
··· 94 } 95 } 96 97 + static int __nvme_send_pr_command(struct block_device *bdev, u32 cdw10, 98 + u32 cdw11, u8 op, void *data, unsigned int data_len) 99 { 100 + struct nvme_command c = { 0 }; 101 102 c.common.opcode = op; 103 c.common.cdw10 = cpu_to_le32(cdw10); 104 + c.common.cdw11 = cpu_to_le32(cdw11); 105 106 + if (nvme_disk_is_ns_head(bdev->bd_disk)) 107 + return nvme_send_ns_head_pr_command(bdev, &c, data, data_len); 108 + return nvme_send_ns_pr_command(bdev->bd_disk->private_data, &c, 109 + data, data_len); 110 } 111 112 + static int nvme_send_pr_command(struct block_device *bdev, u32 cdw10, u32 cdw11, 113 + u8 op, void *data, unsigned int data_len) 114 { 115 + int ret; 116 + 117 + ret = __nvme_send_pr_command(bdev, cdw10, cdw11, op, data, data_len); 118 + return ret < 0 ? ret : nvme_status_to_pr_err(ret); 119 + } 120 + 121 + static int nvme_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, 122 + unsigned int flags) 123 + { 124 + struct nvmet_pr_register_data data = { 0 }; 125 u32 cdw10; 126 127 if (flags & ~PR_FL_IGNORE_KEY) 128 return -EOPNOTSUPP; 129 130 + data.crkey = cpu_to_le64(old_key); 131 + data.nrkey = cpu_to_le64(new_key); 132 + 133 + cdw10 = old_key ? NVME_PR_REGISTER_ACT_REPLACE : 134 + NVME_PR_REGISTER_ACT_REG; 135 + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? NVME_PR_IGNORE_KEY : 0; 136 + cdw10 |= NVME_PR_CPTPL_PERSIST; 137 + 138 + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_register, 139 + &data, sizeof(data)); 140 } 141 142 static int nvme_pr_reserve(struct block_device *bdev, u64 key, 143 enum pr_type type, unsigned flags) 144 { 145 + struct nvmet_pr_acquire_data data = { 0 }; 146 u32 cdw10; 147 148 if (flags & ~PR_FL_IGNORE_KEY) 149 return -EOPNOTSUPP; 150 151 + data.crkey = cpu_to_le64(key); 152 + 153 + cdw10 = NVME_PR_ACQUIRE_ACT_ACQUIRE; 154 + cdw10 |= nvme_pr_type_from_blk(type) << 8; 155 + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? NVME_PR_IGNORE_KEY : 0; 156 + 157 + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_acquire, 158 + &data, sizeof(data)); 159 } 160 161 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, 162 enum pr_type type, bool abort) 163 { 164 + struct nvmet_pr_acquire_data data = { 0 }; 165 + u32 cdw10; 166 167 + data.crkey = cpu_to_le64(old); 168 + data.prkey = cpu_to_le64(new); 169 + 170 + cdw10 = abort ? NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT : 171 + NVME_PR_ACQUIRE_ACT_PREEMPT; 172 + cdw10 |= nvme_pr_type_from_blk(type) << 8; 173 + 174 + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_acquire, 175 + &data, sizeof(data)); 176 } 177 178 static int nvme_pr_clear(struct block_device *bdev, u64 key) 179 { 180 + struct nvmet_pr_release_data data = { 0 }; 181 + u32 cdw10; 182 183 + data.crkey = cpu_to_le64(key); 184 + 185 + cdw10 = NVME_PR_RELEASE_ACT_CLEAR; 186 + cdw10 |= key ? 0 : NVME_PR_IGNORE_KEY; 187 + 188 + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_release, 189 + &data, sizeof(data)); 190 } 191 192 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) 193 { 194 + struct nvmet_pr_release_data data = { 0 }; 195 + u32 cdw10; 196 197 + data.crkey = cpu_to_le64(key); 198 + 199 + cdw10 = NVME_PR_RELEASE_ACT_RELEASE; 200 + cdw10 |= nvme_pr_type_from_blk(type) << 8; 201 + cdw10 |= key ? 0 : NVME_PR_IGNORE_KEY; 202 + 203 + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_release, 204 + &data, sizeof(data)); 205 } 206 207 static int nvme_pr_resv_report(struct block_device *bdev, void *data, 208 u32 data_len, bool *eds) 209 { 210 + u32 cdw10, cdw11; 211 int ret; 212 213 + cdw10 = nvme_bytes_to_numd(data_len); 214 + cdw11 = NVME_EXTENDED_DATA_STRUCT; 215 *eds = true; 216 217 retry: 218 + ret = __nvme_send_pr_command(bdev, cdw10, cdw11, nvme_cmd_resv_report, 219 + data, data_len); 220 if (ret == NVME_SC_HOST_ID_INCONSIST && 221 + cdw11 == NVME_EXTENDED_DATA_STRUCT) { 222 + cdw11 = 0; 223 *eds = false; 224 goto retry; 225 } 226 227 + return ret < 0 ? ret : nvme_status_to_pr_err(ret); 228 } 229 230 static int nvme_pr_read_keys(struct block_device *bdev,
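Note (editorial aside, not part of the diff): the pr.c rewrite above replaces open-coded cdw10 bits with the named NVME_PR_* values added to include/linux/nvme.h further down. A minimal user-space sketch of the Reservation Register encoding nvme_pr_register() now performs, using the same numeric values the old hunk spelled out as 2, 1 << 3 and (1 << 30) | (1 << 31):

/*
 * Sketch only -- not kernel code. Constants mirror the hunks in this
 * series: REG/REPLACE come from the old "old ? 2 : 0", NVME_PR_IGNORE_KEY
 * is 1 << 3 and NVME_PR_CPTPL_PERSIST is 3 << 30.
 */
#include <stdio.h>

#define NVME_PR_REGISTER_ACT_REG	0u
#define NVME_PR_REGISTER_ACT_REPLACE	2u
#define NVME_PR_IGNORE_KEY		(1u << 3)
#define NVME_PR_CPTPL_PERSIST		(3u << 30)

/* Build Reservation Register cdw10 the way nvme_pr_register() now does. */
static unsigned int pr_register_cdw10(unsigned long long old_key, int ignore_key)
{
	unsigned int cdw10;

	cdw10 = old_key ? NVME_PR_REGISTER_ACT_REPLACE : NVME_PR_REGISTER_ACT_REG;
	cdw10 |= ignore_key ? NVME_PR_IGNORE_KEY : 0;
	cdw10 |= NVME_PR_CPTPL_PERSIST;	/* was the open-coded (1 << 30) | (1 << 31) */
	return cdw10;
}

int main(void)
{
	printf("new registration: cdw10 = 0x%08x\n", pr_register_cdw10(0, 0));
	printf("key replacement:  cdw10 = 0x%08x\n", pr_register_cdw10(0x1234, 0));
	return 0;
}

The first call prints 0xc0000000 (register, persist PTPL), the second 0xc0000002 (replace key, persist PTPL) -- the same dwords the old magic-number code produced, just spelled with named fields.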
+2 -2
drivers/nvme/host/rdma.c
··· 1019 goto destroy_admin; 1020 } 1021 1022 - if (!(ctrl->ctrl.sgls & (1 << 2))) { 1023 ret = -EOPNOTSUPP; 1024 dev_err(ctrl->ctrl.device, 1025 "Mandatory keyed sgls are not supported!\n"); ··· 1051 ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; 1052 } 1053 1054 - if (ctrl->ctrl.sgls & (1 << 20)) 1055 ctrl->use_inline_data = true; 1056 1057 if (ctrl->ctrl.queue_count > 1) {
··· 1019 goto destroy_admin; 1020 } 1021 1022 + if (!(ctrl->ctrl.sgls & NVME_CTRL_SGLS_KSDBDS)) { 1023 ret = -EOPNOTSUPP; 1024 dev_err(ctrl->ctrl.device, 1025 "Mandatory keyed sgls are not supported!\n"); ··· 1051 ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; 1052 } 1053 1054 + if (ctrl->ctrl.sgls & NVME_CTRL_SGLS_SAOS) 1055 ctrl->use_inline_data = true; 1056 1057 if (ctrl->ctrl.queue_count > 1) {
+4 -3
drivers/nvme/target/admin-cmd.c
··· 601 id->awun = 0; 602 id->awupf = 0; 603 604 - id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ 605 if (ctrl->ops->flags & NVMF_KEYED_SGLS) 606 - id->sgls |= cpu_to_le32(1 << 2); 607 if (req->port->inline_data_size) 608 - id->sgls |= cpu_to_le32(1 << 20); 609 610 strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); 611
··· 601 id->awun = 0; 602 id->awupf = 0; 603 604 + /* we always support SGLs */ 605 + id->sgls = cpu_to_le32(NVME_CTRL_SGLS_BYTE_ALIGNED); 606 if (ctrl->ops->flags & NVMF_KEYED_SGLS) 607 + id->sgls |= cpu_to_le32(NVME_CTRL_SGLS_KSDBDS); 608 if (req->port->inline_data_size) 609 + id->sgls |= cpu_to_le32(NVME_CTRL_SGLS_SAOS); 610 611 strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); 612
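Note (editorial aside, not part of the diff): the rdma.c and admin-cmd.c hunks above trade the magic bits 1 << 2 and 1 << 20 for the NVME_CTRL_SGLS_* names defined in the include/linux/nvme.h hunk below. A hedged user-space sketch of decoding an Identify Controller SGLS word with those names, with the bit meanings inferred from how the hunks use them (keyed SGLs for bit 2, in-capsule/offset data for bit 20):

/*
 * Sketch only -- not kernel code. Bit values copied from the
 * include/linux/nvme.h hunk in this series.
 */
#include <stdio.h>

#define NVME_CTRL_SGLS_BYTE_ALIGNED	1u
#define NVME_CTRL_SGLS_DWORD_ALIGNED	2u
#define NVME_CTRL_SGLS_KSDBDS		(1u << 2)
#define NVME_CTRL_SGLS_SAOS		(1u << 20)

static void report_sgls(unsigned int sgls)
{
	/* Low two bits advertise SGL support and its alignment requirement. */
	if (sgls & NVME_CTRL_SGLS_BYTE_ALIGNED)
		printf("SGLs supported, byte aligned\n");
	else if (sgls & NVME_CTRL_SGLS_DWORD_ALIGNED)
		printf("SGLs supported, dword aligned\n");
	else
		printf("SGLs not supported\n");

	/* The rdma.c hunk requires this bit for keyed SGL data blocks. */
	printf("keyed SGLs:  %s\n", sgls & NVME_CTRL_SGLS_KSDBDS ? "yes" : "no");
	/* The target sets this bit when it accepts inline (in-capsule) data. */
	printf("inline data: %s\n", sgls & NVME_CTRL_SGLS_SAOS ? "yes" : "no");
}

int main(void)
{
	/* Roughly what the admin-cmd.c hunk would advertise for a keyed-SGL
	 * transport with a non-zero inline_data_size. */
	report_sgls(NVME_CTRL_SGLS_BYTE_ALIGNED | NVME_CTRL_SGLS_KSDBDS |
		    NVME_CTRL_SGLS_SAOS);
	return 0;
}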
+12 -8
include/linux/blkdev.h
··· 333 #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ 334 ((__force blk_features_t)(1u << 15)) 335 336 /* 337 * Flags automatically inherited when stacking limits. 338 */ ··· 779 atomic_andnot(flag, &bdev->__bd_flags); 780 } 781 782 - static inline int get_disk_ro(struct gendisk *disk) 783 { 784 return bdev_test_flag(disk->part0, BD_READ_ONLY) || 785 test_bit(GD_READ_ONLY, &disk->state); 786 } 787 788 - static inline int bdev_read_only(struct block_device *bdev) 789 { 790 return bdev_test_flag(bdev, BD_READ_ONLY) || get_disk_ro(bdev->bd_disk); 791 } ··· 1265 return q->limits.io_min; 1266 } 1267 1268 - static inline int bdev_io_min(struct block_device *bdev) 1269 { 1270 return queue_io_min(bdev_get_queue(bdev)); 1271 } ··· 1275 return q->limits.io_opt; 1276 } 1277 1278 - static inline int bdev_io_opt(struct block_device *bdev) 1279 { 1280 return queue_io_opt(bdev_get_queue(bdev)); 1281 } ··· 1421 return is_seq; 1422 } 1423 1424 - static inline int queue_dma_alignment(const struct request_queue *q) 1425 { 1426 return q->limits.dma_alignment; 1427 } ··· 1462 bdev_logical_block_size(bdev) - 1); 1463 } 1464 1465 - static inline int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) 1466 { 1467 return lim->dma_alignment | lim->dma_pad_mask; 1468 } 1469 1470 - static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, 1471 unsigned int len) 1472 { 1473 unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits); ··· 1586 return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); 1587 } 1588 1589 - int bdev_read_only(struct block_device *bdev); 1590 int set_blocksize(struct file *file, int size); 1591 1592 int lookup_bdev(const char *pathname, dev_t *dev);
··· 333 #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ 334 ((__force blk_features_t)(1u << 15)) 335 336 + /* stacked device can/does support atomic writes */ 337 + #define BLK_FEAT_ATOMIC_WRITES_STACKED \ 338 + ((__force blk_features_t)(1u << 16)) 339 + 340 /* 341 * Flags automatically inherited when stacking limits. 342 */ ··· 775 atomic_andnot(flag, &bdev->__bd_flags); 776 } 777 778 + static inline bool get_disk_ro(struct gendisk *disk) 779 { 780 return bdev_test_flag(disk->part0, BD_READ_ONLY) || 781 test_bit(GD_READ_ONLY, &disk->state); 782 } 783 784 + static inline bool bdev_read_only(struct block_device *bdev) 785 { 786 return bdev_test_flag(bdev, BD_READ_ONLY) || get_disk_ro(bdev->bd_disk); 787 } ··· 1261 return q->limits.io_min; 1262 } 1263 1264 + static inline unsigned int bdev_io_min(struct block_device *bdev) 1265 { 1266 return queue_io_min(bdev_get_queue(bdev)); 1267 } ··· 1271 return q->limits.io_opt; 1272 } 1273 1274 + static inline unsigned int bdev_io_opt(struct block_device *bdev) 1275 { 1276 return queue_io_opt(bdev_get_queue(bdev)); 1277 } ··· 1417 return is_seq; 1418 } 1419 1420 + static inline unsigned int queue_dma_alignment(const struct request_queue *q) 1421 { 1422 return q->limits.dma_alignment; 1423 } ··· 1458 bdev_logical_block_size(bdev) - 1); 1459 } 1460 1461 + static inline unsigned int 1462 + blk_lim_dma_alignment_and_pad(struct queue_limits *lim) 1463 { 1464 return lim->dma_alignment | lim->dma_pad_mask; 1465 } 1466 1467 + static inline bool blk_rq_aligned(struct request_queue *q, unsigned long addr, 1468 unsigned int len) 1469 { 1470 unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits); ··· 1581 return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); 1582 } 1583 1584 int set_blocksize(struct file *file, int size); 1585 1586 int lookup_bdev(const char *pathname, dev_t *dev);
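Note (editorial aside, not part of the diff): the blkdev.h hunk above only tightens return types -- bool for the read-only predicates and blk_rq_aligned(), unsigned int for the limit helpers -- and adds BLK_FEAT_ATOMIC_WRITES_STACKED; no caller behaviour changes. As a rough sketch of the alignment predicate those helpers feed, assuming blk_rq_aligned() keeps its usual test against the combined dma_alignment | dma_pad_mask value:

/*
 * Sketch only -- not kernel code. Demonstrates the kind of mask test
 * blk_rq_aligned() performs on an address/length pair; the 511 mask below
 * stands in for a queue with 512-byte DMA alignment and no pad mask.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool rq_aligned(unsigned int alignment, uintptr_t addr, unsigned int len)
{
	/* alignment is dma_alignment | dma_pad_mask, already a bit mask */
	return !(addr & alignment) && !(len & alignment);
}

int main(void)
{
	printf("%d\n", rq_aligned(511, 0x10000, 4096));	/* 1: both aligned */
	printf("%d\n", rq_aligned(511, 0x10004, 4096));	/* 0: address off by 4 */
	printf("%d\n", rq_aligned(511, 0x10000, 4100));	/* 0: length not padded */
	return 0;
}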
+14
include/linux/nvme.h
··· 389 NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5, 390 NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY = 1 << 7, 391 NVME_CTRL_CTRATT_UUID_LIST = 1 << 9, 392 }; 393 394 struct nvme_lbaf { ··· 2170 NVME_PR_RELEASE_ACT_RELEASE = 0, 2171 NVME_PR_RELEASE_ACT_CLEAR = 1, 2172 }; 2173 2174 #endif /* _LINUX_NVME_H */
··· 389 NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5, 390 NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY = 1 << 7, 391 NVME_CTRL_CTRATT_UUID_LIST = 1 << 9, 392 + NVME_CTRL_SGLS_BYTE_ALIGNED = 1, 393 + NVME_CTRL_SGLS_DWORD_ALIGNED = 2, 394 + NVME_CTRL_SGLS_KSDBDS = 1 << 2, 395 + NVME_CTRL_SGLS_MSDS = 1 << 19, 396 + NVME_CTRL_SGLS_SAOS = 1 << 20, 397 }; 398 399 struct nvme_lbaf { ··· 2165 NVME_PR_RELEASE_ACT_RELEASE = 0, 2166 NVME_PR_RELEASE_ACT_CLEAR = 1, 2167 }; 2168 + 2169 + enum nvme_pr_change_ptpl { 2170 + NVME_PR_CPTPL_NO_CHANGE = 0, 2171 + NVME_PR_CPTPL_RESV = 1 << 30, 2172 + NVME_PR_CPTPL_CLEARED = 2 << 30, 2173 + NVME_PR_CPTPL_PERSIST = 3 << 30, 2174 + }; 2175 + 2176 + #define NVME_PR_IGNORE_KEY (1 << 3) 2177 2178 #endif /* _LINUX_NVME_H */
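Note (editorial aside, not part of the diff): the new nvme_pr_change_ptpl values occupy bits 31:30 of cdw10, so they OR cleanly with the register action code in the low bits and with NVME_PR_IGNORE_KEY in bit 3. A compile-time sanity sketch of that layout, with the values copied from the hunk above (unsigned suffixes added):

/* Sketch only -- not kernel code; requires C11 for static_assert. */
#include <assert.h>

#define NVME_PR_CPTPL_NO_CHANGE	(0u << 30)
#define NVME_PR_CPTPL_RESV	(1u << 30)
#define NVME_PR_CPTPL_CLEARED	(2u << 30)
#define NVME_PR_CPTPL_PERSIST	(3u << 30)
#define NVME_PR_IGNORE_KEY	(1u << 3)

/* Change-PTPL is confined to bits 31:30 ... */
static_assert((NVME_PR_CPTPL_PERSIST & ~(3u << 30)) == 0,
	      "CPTPL lives in bits 31:30");
/* ... so it cannot collide with the ignore-key bit or a 3-bit action code. */
static_assert((NVME_PR_CPTPL_PERSIST & (NVME_PR_IGNORE_KEY | 0x7u)) == 0,
	      "CPTPL does not overlap IEKEY/action bits");

int main(void)
{
	return 0;
}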
+1 -1
rust/kernel/block/mq/gen_disk.rs
··· 45 46 /// Validate block size by verifying that it is between 512 and `PAGE_SIZE`, 47 /// and that it is a power of two. 48 - fn validate_block_size(size: u32) -> Result<()> { 49 if !(512..=bindings::PAGE_SIZE as u32).contains(&size) || !size.is_power_of_two() { 50 Err(error::code::EINVAL) 51 } else {
··· 45 46 /// Validate block size by verifying that it is between 512 and `PAGE_SIZE`, 47 /// and that it is a power of two. 48 + fn validate_block_size(size: u32) -> Result { 49 if !(512..=bindings::PAGE_SIZE as u32).contains(&size) || !size.is_power_of_two() { 50 Err(error::code::EINVAL) 51 } else {