Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'block-6.5-2023-07-03' of git://git.kernel.dk/linux

Pull more block updates from Jens Axboe:
"Mostly items that came in a bit late for the initial pull request,
which I wanted to make sure had the appropriate amount of linux-next soak
before going upstream.

Outside of stragglers, just generic fixes for either merge window
items or longer-standing bugs"

* tag 'block-6.5-2023-07-03' of git://git.kernel.dk/linux: (25 commits)
md/raid0: add discard support for the 'original' layout
nvme: disable controller on reset state failure
nvme: sync timeout work on failed reset
nvme: ensure unquiesce on teardown
cdrom/gdrom: Fix build error
nvme: improved uring polling
block: add request polling helper
nvme-mpath: fix I/O failure with EAGAIN when failing over I/O
nvme: host: fix command name spelling
blk-sysfs: add a new attr_group for blk_mq
blk-iocost: move wbt_enable/disable_default() out of spinlock
blk-wbt: cleanup rwb_enabled() and wbt_disabled()
blk-wbt: remove dead code to handle wbt enable/disable with io inflight
blk-wbt: don't create wbt sysfs entry if CONFIG_BLK_WBT is disabled
blk-mq: fix two misuses on RQF_USE_SCHED
blk-throttle: Fix io statistics for cgroup v1
bcache: Fix bcache device claiming
bcache: Alloc holder object before async registration
raid10: avoid spin_lock from fastpath from raid10_unplug()
md: fix 'delete_mutex' deadlock
...

+341 -304
+4 -2
block/blk-cgroup.c
··· 2086 2086 struct blkg_iostat_set *bis; 2087 2087 unsigned long flags; 2088 2088 2089 + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) 2090 + return; 2091 + 2089 2092 /* Root-level stats are sourced from system-wide IO stats */ 2090 2093 if (!cgroup_parent(blkcg->css.cgroup)) 2091 2094 return; ··· 2119 2116 } 2120 2117 2121 2118 u64_stats_update_end_irqrestore(&bis->sync, flags); 2122 - if (cgroup_subsys_on_dfl(io_cgrp_subsys)) 2123 - cgroup_rstat_updated(blkcg->css.cgroup, cpu); 2119 + cgroup_rstat_updated(blkcg->css.cgroup, cpu); 2124 2120 put_cpu(); 2125 2121 } 2126 2122
+5 -2
block/blk-iocost.c
··· 3301 3301 blk_stat_enable_accounting(disk->queue); 3302 3302 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); 3303 3303 ioc->enabled = true; 3304 - wbt_disable_default(disk); 3305 3304 } else { 3306 3305 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); 3307 3306 ioc->enabled = false; 3308 - wbt_enable_default(disk); 3309 3307 } 3310 3308 3311 3309 if (user) { ··· 3315 3317 3316 3318 ioc_refresh_params(ioc, true); 3317 3319 spin_unlock_irq(&ioc->lock); 3320 + 3321 + if (enable) 3322 + wbt_disable_default(disk); 3323 + else 3324 + wbt_enable_default(disk); 3318 3325 3319 3326 blk_mq_unquiesce_queue(disk->queue); 3320 3327 blk_mq_unfreeze_queue(disk->queue);
+37 -17
block/blk-mq.c
··· 49 49 blk_insert_t flags); 50 50 static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, 51 51 struct list_head *list); 52 - 53 - static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, 54 - blk_qc_t qc) 55 - { 56 - return xa_load(&q->hctx_table, qc); 57 - } 58 - 59 - static inline blk_qc_t blk_rq_to_qc(struct request *rq) 60 - { 61 - return rq->mq_hctx->queue_num; 62 - } 52 + static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, 53 + struct io_comp_batch *iob, unsigned int flags); 63 54 64 55 /* 65 56 * Check if any of the ctx, dispatch list or elevator ··· 1239 1248 q->integrity.profile->prepare_fn(rq); 1240 1249 #endif 1241 1250 if (rq->bio && rq->bio->bi_opf & REQ_POLLED) 1242 - WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq)); 1251 + WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); 1243 1252 } 1244 1253 EXPORT_SYMBOL(blk_mq_start_request); 1245 1254 ··· 1271 1280 1272 1281 if (!plug->multiple_queues && last && last->q != rq->q) 1273 1282 plug->multiple_queues = true; 1274 - if (!plug->has_elevator && (rq->rq_flags & RQF_USE_SCHED)) 1283 + /* 1284 + * Any request allocated from sched tags can't be issued to 1285 + * ->queue_rqs() directly 1286 + */ 1287 + if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) 1275 1288 plug->has_elevator = true; 1276 1289 rq->rq_next = NULL; 1277 1290 rq_list_add(&plug->mq_list, rq); ··· 1345 1350 static void blk_rq_poll_completion(struct request *rq, struct completion *wait) 1346 1351 { 1347 1352 do { 1348 - blk_mq_poll(rq->q, blk_rq_to_qc(rq), NULL, 0); 1353 + blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0); 1349 1354 cond_resched(); 1350 1355 } while (!completion_done(wait)); 1351 1356 } ··· 4740 4745 } 4741 4746 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); 4742 4747 4743 - int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, 4744 - unsigned int flags) 4748 + static int blk_hctx_poll(struct request_queue *q, struct 
blk_mq_hw_ctx *hctx, 4749 + struct io_comp_batch *iob, unsigned int flags) 4745 4750 { 4746 - struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie); 4747 4751 long state = get_current_state(); 4748 4752 int ret; 4749 4753 ··· 4766 4772 __set_current_state(TASK_RUNNING); 4767 4773 return 0; 4768 4774 } 4775 + 4776 + int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, 4777 + struct io_comp_batch *iob, unsigned int flags) 4778 + { 4779 + struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie); 4780 + 4781 + return blk_hctx_poll(q, hctx, iob, flags); 4782 + } 4783 + 4784 + int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, 4785 + unsigned int poll_flags) 4786 + { 4787 + struct request_queue *q = rq->q; 4788 + int ret; 4789 + 4790 + if (!blk_rq_is_poll(rq)) 4791 + return 0; 4792 + if (!percpu_ref_tryget(&q->q_usage_counter)) 4793 + return 0; 4794 + 4795 + ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags); 4796 + blk_queue_exit(q); 4797 + 4798 + return ret; 4799 + } 4800 + EXPORT_SYMBOL_GPL(blk_rq_poll); 4769 4801 4770 4802 unsigned int blk_mq_rq_cpu(struct request *rq) 4771 4803 {
+103 -78
block/blk-sysfs.c
··· 47 47 return count; 48 48 } 49 49 50 - static ssize_t queue_var_store64(s64 *var, const char *page) 51 - { 52 - int err; 53 - s64 v; 54 - 55 - err = kstrtos64(page, 10, &v); 56 - if (err < 0) 57 - return err; 58 - 59 - *var = v; 60 - return 0; 61 - } 62 - 63 50 static ssize_t queue_requests_show(struct request_queue *q, char *page) 64 51 { 65 52 return queue_var_show(q->nr_requests, page); ··· 438 451 return count; 439 452 } 440 453 441 - static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) 442 - { 443 - if (!wbt_rq_qos(q)) 444 - return -EINVAL; 445 - 446 - if (wbt_disabled(q)) 447 - return sprintf(page, "0\n"); 448 - 449 - return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); 450 - } 451 - 452 - static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, 453 - size_t count) 454 - { 455 - struct rq_qos *rqos; 456 - ssize_t ret; 457 - s64 val; 458 - 459 - ret = queue_var_store64(&val, page); 460 - if (ret < 0) 461 - return ret; 462 - if (val < -1) 463 - return -EINVAL; 464 - 465 - rqos = wbt_rq_qos(q); 466 - if (!rqos) { 467 - ret = wbt_init(q->disk); 468 - if (ret) 469 - return ret; 470 - } 471 - 472 - if (val == -1) 473 - val = wbt_default_latency_nsec(q); 474 - else if (val >= 0) 475 - val *= 1000ULL; 476 - 477 - if (wbt_get_min_lat(q) == val) 478 - return count; 479 - 480 - /* 481 - * Ensure that the queue is idled, in case the latency update 482 - * ends up either enabling or disabling wbt completely. We can't 483 - * have IO inflight if that happens. 
484 - */ 485 - blk_mq_freeze_queue(q); 486 - blk_mq_quiesce_queue(q); 487 - 488 - wbt_set_min_lat(q, val); 489 - 490 - blk_mq_unquiesce_queue(q); 491 - blk_mq_unfreeze_queue(q); 492 - 493 - return count; 494 - } 495 - 496 454 static ssize_t queue_wc_show(struct request_queue *q, char *page) 497 455 { 498 456 if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) ··· 530 598 QUEUE_RO_ENTRY(queue_fua, "fua"); 531 599 QUEUE_RO_ENTRY(queue_dax, "dax"); 532 600 QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); 533 - QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); 534 601 QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); 535 602 QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); 536 603 ··· 548 617 QUEUE_RW_ENTRY(queue_random, "add_random"); 549 618 QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); 550 619 620 + #ifdef CONFIG_BLK_WBT 621 + static ssize_t queue_var_store64(s64 *var, const char *page) 622 + { 623 + int err; 624 + s64 v; 625 + 626 + err = kstrtos64(page, 10, &v); 627 + if (err < 0) 628 + return err; 629 + 630 + *var = v; 631 + return 0; 632 + } 633 + 634 + static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) 635 + { 636 + if (!wbt_rq_qos(q)) 637 + return -EINVAL; 638 + 639 + if (wbt_disabled(q)) 640 + return sprintf(page, "0\n"); 641 + 642 + return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); 643 + } 644 + 645 + static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, 646 + size_t count) 647 + { 648 + struct rq_qos *rqos; 649 + ssize_t ret; 650 + s64 val; 651 + 652 + ret = queue_var_store64(&val, page); 653 + if (ret < 0) 654 + return ret; 655 + if (val < -1) 656 + return -EINVAL; 657 + 658 + rqos = wbt_rq_qos(q); 659 + if (!rqos) { 660 + ret = wbt_init(q->disk); 661 + if (ret) 662 + return ret; 663 + } 664 + 665 + if (val == -1) 666 + val = wbt_default_latency_nsec(q); 667 + else if (val >= 0) 668 + val *= 1000ULL; 669 + 670 + if (wbt_get_min_lat(q) == val) 671 + return count; 672 + 673 + /* 674 + 
* Ensure that the queue is idled, in case the latency update 675 + * ends up either enabling or disabling wbt completely. We can't 676 + * have IO inflight if that happens. 677 + */ 678 + blk_mq_freeze_queue(q); 679 + blk_mq_quiesce_queue(q); 680 + 681 + wbt_set_min_lat(q, val); 682 + 683 + blk_mq_unquiesce_queue(q); 684 + blk_mq_unfreeze_queue(q); 685 + 686 + return count; 687 + } 688 + 689 + QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); 690 + #endif 691 + 551 692 static struct attribute *queue_attrs[] = { 552 - &queue_requests_entry.attr, 553 693 &queue_ra_entry.attr, 554 694 &queue_max_hw_sectors_entry.attr, 555 695 &queue_max_sectors_entry.attr, ··· 628 626 &queue_max_discard_segments_entry.attr, 629 627 &queue_max_integrity_segments_entry.attr, 630 628 &queue_max_segment_size_entry.attr, 631 - &elv_iosched_entry.attr, 632 629 &queue_hw_sector_size_entry.attr, 633 630 &queue_logical_block_size_entry.attr, 634 631 &queue_physical_block_size_entry.attr, ··· 648 647 &queue_max_open_zones_entry.attr, 649 648 &queue_max_active_zones_entry.attr, 650 649 &queue_nomerges_entry.attr, 651 - &queue_rq_affinity_entry.attr, 652 650 &queue_iostats_entry.attr, 653 651 &queue_stable_writes_entry.attr, 654 652 &queue_random_entry.attr, ··· 655 655 &queue_wc_entry.attr, 656 656 &queue_fua_entry.attr, 657 657 &queue_dax_entry.attr, 658 - &queue_wb_lat_entry.attr, 659 658 &queue_poll_delay_entry.attr, 660 - &queue_io_timeout_entry.attr, 661 659 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 662 660 &blk_throtl_sample_time_entry.attr, 663 661 #endif 664 662 &queue_virt_boundary_mask_entry.attr, 665 663 &queue_dma_alignment_entry.attr, 664 + NULL, 665 + }; 666 + 667 + static struct attribute *blk_mq_queue_attrs[] = { 668 + &queue_requests_entry.attr, 669 + &elv_iosched_entry.attr, 670 + &queue_rq_affinity_entry.attr, 671 + &queue_io_timeout_entry.attr, 672 + #ifdef CONFIG_BLK_WBT 673 + &queue_wb_lat_entry.attr, 674 + #endif 666 675 NULL, 667 676 }; 668 677 ··· 681 672 struct gendisk *disk = 
container_of(kobj, struct gendisk, queue_kobj); 682 673 struct request_queue *q = disk->queue; 683 674 684 - if (attr == &queue_io_timeout_entry.attr && 685 - (!q->mq_ops || !q->mq_ops->timeout)) 686 - return 0; 687 - 688 675 if ((attr == &queue_max_open_zones_entry.attr || 689 676 attr == &queue_max_active_zones_entry.attr) && 690 677 !blk_queue_is_zoned(q)) 678 + return 0; 679 + 680 + return attr->mode; 681 + } 682 + 683 + static umode_t blk_mq_queue_attr_visible(struct kobject *kobj, 684 + struct attribute *attr, int n) 685 + { 686 + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); 687 + struct request_queue *q = disk->queue; 688 + 689 + if (!queue_is_mq(q)) 690 + return 0; 691 + 692 + if (attr == &queue_io_timeout_entry.attr && !q->mq_ops->timeout) 691 693 return 0; 692 694 693 695 return attr->mode; ··· 709 689 .is_visible = queue_attr_visible, 710 690 }; 711 691 692 + static struct attribute_group blk_mq_queue_attr_group = { 693 + .attrs = blk_mq_queue_attrs, 694 + .is_visible = blk_mq_queue_attr_visible, 695 + }; 712 696 713 697 #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) 714 698 ··· 757 733 758 734 static const struct attribute_group *blk_queue_attr_groups[] = { 759 735 &queue_attr_group, 736 + &blk_mq_queue_attr_group, 760 737 NULL 761 738 }; 762 739
-6
block/blk-throttle.c
··· 2178 2178 2179 2179 rcu_read_lock(); 2180 2180 2181 - if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { 2182 - blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, 2183 - bio->bi_iter.bi_size); 2184 - blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); 2185 - } 2186 - 2187 2181 spin_lock_irq(&q->queue_lock); 2188 2182 2189 2183 throtl_update_latency_buckets(td);
+9
block/blk-throttle.h
··· 185 185 struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); 186 186 int rw = bio_data_dir(bio); 187 187 188 + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { 189 + if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { 190 + bio_set_flag(bio, BIO_CGROUP_ACCT); 191 + blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, 192 + bio->bi_iter.bi_size); 193 + } 194 + blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); 195 + } 196 + 188 197 /* iops limit is always counted */ 189 198 if (tg->has_rules_iops[rw]) 190 199 return true;
+2 -19
block/blk-wbt.c
··· 146 146 static inline bool rwb_enabled(struct rq_wb *rwb) 147 147 { 148 148 return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT && 149 - rwb->wb_normal != 0; 149 + rwb->enable_state != WBT_STATE_OFF_MANUAL; 150 150 } 151 151 152 152 static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) ··· 199 199 int inflight, limit; 200 200 201 201 inflight = atomic_dec_return(&rqw->inflight); 202 - 203 - /* 204 - * wbt got disabled with IO in flight. Wake up any potential 205 - * waiters, we don't have to do more than that. 206 - */ 207 - if (unlikely(!rwb_enabled(rwb))) { 208 - rwb_wake_all(rwb); 209 - return; 210 - } 211 202 212 203 /* 213 204 * For discards, our limit is always the background. For writes, if ··· 494 503 { 495 504 struct rq_qos *rqos = wbt_rq_qos(q); 496 505 497 - return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT || 498 - RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL; 506 + return !rqos || !rwb_enabled(RQWB(rqos)); 499 507 } 500 508 501 509 u64 wbt_get_min_lat(struct request_queue *q) ··· 534 544 static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) 535 545 { 536 546 unsigned int limit; 537 - 538 - /* 539 - * If we got disabled, just return UINT_MAX. This ensures that 540 - * we'll properly inc a new IO, and dec+wakeup at the end. 541 - */ 542 - if (!rwb_enabled(rwb)) 543 - return UINT_MAX; 544 547 545 548 if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD) 546 549 return rwb->wb_background;
-19
block/blk-wbt.h
··· 18 18 19 19 #else 20 20 21 - static inline int wbt_init(struct gendisk *disk) 22 - { 23 - return -EINVAL; 24 - } 25 21 static inline void wbt_disable_default(struct gendisk *disk) 26 22 { 27 23 } ··· 26 30 } 27 31 static inline void wbt_set_write_cache(struct request_queue *q, bool wc) 28 32 { 29 - } 30 - static inline u64 wbt_get_min_lat(struct request_queue *q) 31 - { 32 - return 0; 33 - } 34 - static inline void wbt_set_min_lat(struct request_queue *q, u64 val) 35 - { 36 - } 37 - static inline u64 wbt_default_latency_nsec(struct request_queue *q) 38 - { 39 - return 0; 40 - } 41 - static inline bool wbt_disabled(struct request_queue *q) 42 - { 43 - return true; 44 33 } 45 34 46 35 #endif /* CONFIG_BLK_WBT */
+2 -2
drivers/cdrom/gdrom.c
··· 481 481 disk_check_media_change(disk); 482 482 483 483 mutex_lock(&gdrom_mutex); 484 - ret = cdrom_open(gd.cd_info); 484 + ret = cdrom_open(gd.cd_info, mode); 485 485 mutex_unlock(&gdrom_mutex); 486 486 return ret; 487 487 } ··· 489 489 static void gdrom_bdops_release(struct gendisk *disk) 490 490 { 491 491 mutex_lock(&gdrom_mutex); 492 - cdrom_release(gd.cd_info, mode); 492 + cdrom_release(gd.cd_info); 493 493 mutex_unlock(&gdrom_mutex); 494 494 } 495 495
+59 -64
drivers/md/bcache/super.c
··· 1369 1369 put_page(virt_to_page(dc->sb_disk)); 1370 1370 1371 1371 if (!IS_ERR_OR_NULL(dc->bdev)) 1372 - blkdev_put(dc->bdev, bcache_kobj); 1372 + blkdev_put(dc->bdev, dc); 1373 1373 1374 1374 wake_up(&unregister_wait); 1375 1375 ··· 1453 1453 1454 1454 memcpy(&dc->sb, sb, sizeof(struct cache_sb)); 1455 1455 dc->bdev = bdev; 1456 - dc->bdev->bd_holder = dc; 1457 1456 dc->sb_disk = sb_disk; 1458 1457 1459 1458 if (cached_dev_init(dc, sb->block_size << 9)) ··· 2217 2218 put_page(virt_to_page(ca->sb_disk)); 2218 2219 2219 2220 if (!IS_ERR_OR_NULL(ca->bdev)) 2220 - blkdev_put(ca->bdev, bcache_kobj); 2221 + blkdev_put(ca->bdev, ca); 2221 2222 2222 2223 kfree(ca); 2223 2224 module_put(THIS_MODULE); ··· 2344 2345 2345 2346 memcpy(&ca->sb, sb, sizeof(struct cache_sb)); 2346 2347 ca->bdev = bdev; 2347 - ca->bdev->bd_holder = ca; 2348 2348 ca->sb_disk = sb_disk; 2349 2349 2350 2350 if (bdev_max_discard_sectors((bdev))) ··· 2357 2359 * call blkdev_put() to bdev in bch_cache_release(). So we 2358 2360 * explicitly call blkdev_put() here. 
2359 2361 */ 2360 - blkdev_put(bdev, bcache_kobj); 2362 + blkdev_put(bdev, ca); 2361 2363 if (ret == -ENOMEM) 2362 2364 err = "cache_alloc(): -ENOMEM"; 2363 2365 else if (ret == -EPERM) ··· 2446 2448 struct cache_sb *sb; 2447 2449 struct cache_sb_disk *sb_disk; 2448 2450 struct block_device *bdev; 2451 + void *holder; 2449 2452 }; 2450 2453 2451 2454 static void register_bdev_worker(struct work_struct *work) ··· 2454 2455 int fail = false; 2455 2456 struct async_reg_args *args = 2456 2457 container_of(work, struct async_reg_args, reg_work.work); 2457 - struct cached_dev *dc; 2458 - 2459 - dc = kzalloc(sizeof(*dc), GFP_KERNEL); 2460 - if (!dc) { 2461 - fail = true; 2462 - put_page(virt_to_page(args->sb_disk)); 2463 - blkdev_put(args->bdev, bcache_kobj); 2464 - goto out; 2465 - } 2466 2458 2467 2459 mutex_lock(&bch_register_lock); 2468 - if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0) 2460 + if (register_bdev(args->sb, args->sb_disk, args->bdev, args->holder) 2461 + < 0) 2469 2462 fail = true; 2470 2463 mutex_unlock(&bch_register_lock); 2471 2464 2472 - out: 2473 2465 if (fail) 2474 2466 pr_info("error %s: fail to register backing device\n", 2475 2467 args->path); ··· 2475 2485 int fail = false; 2476 2486 struct async_reg_args *args = 2477 2487 container_of(work, struct async_reg_args, reg_work.work); 2478 - struct cache *ca; 2479 - 2480 - ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2481 - if (!ca) { 2482 - fail = true; 2483 - put_page(virt_to_page(args->sb_disk)); 2484 - blkdev_put(args->bdev, bcache_kobj); 2485 - goto out; 2486 - } 2487 2488 2488 2489 /* blkdev_put() will be called in bch_cache_release() */ 2489 - if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0) 2490 + if (register_cache(args->sb, args->sb_disk, args->bdev, args->holder)) 2490 2491 fail = true; 2491 2492 2492 - out: 2493 2493 if (fail) 2494 2494 pr_info("error %s: fail to register cache device\n", 2495 2495 args->path); ··· 2500 2520 queue_delayed_work(system_wq, 
&args->reg_work, 10); 2501 2521 } 2502 2522 2523 + static void *alloc_holder_object(struct cache_sb *sb) 2524 + { 2525 + if (SB_IS_BDEV(sb)) 2526 + return kzalloc(sizeof(struct cached_dev), GFP_KERNEL); 2527 + return kzalloc(sizeof(struct cache), GFP_KERNEL); 2528 + } 2529 + 2503 2530 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, 2504 2531 const char *buffer, size_t size) 2505 2532 { ··· 2514 2527 char *path = NULL; 2515 2528 struct cache_sb *sb; 2516 2529 struct cache_sb_disk *sb_disk; 2517 - struct block_device *bdev; 2530 + struct block_device *bdev, *bdev2; 2531 + void *holder = NULL; 2518 2532 ssize_t ret; 2519 2533 bool async_registration = false; 2534 + bool quiet = false; 2520 2535 2521 2536 #ifdef CONFIG_BCACHE_ASYNC_REGISTRATION 2522 2537 async_registration = true; ··· 2547 2558 2548 2559 ret = -EINVAL; 2549 2560 err = "failed to open device"; 2550 - bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ | BLK_OPEN_WRITE, 2551 - bcache_kobj, NULL); 2561 + bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ, NULL, NULL); 2562 + if (IS_ERR(bdev)) 2563 + goto out_free_sb; 2564 + 2565 + err = "failed to set blocksize"; 2566 + if (set_blocksize(bdev, 4096)) 2567 + goto out_blkdev_put; 2568 + 2569 + err = read_super(sb, bdev, &sb_disk); 2570 + if (err) 2571 + goto out_blkdev_put; 2572 + 2573 + holder = alloc_holder_object(sb); 2574 + if (!holder) { 2575 + ret = -ENOMEM; 2576 + err = "cannot allocate memory"; 2577 + goto out_put_sb_page; 2578 + } 2579 + 2580 + /* Now reopen in exclusive mode with proper holder */ 2581 + bdev2 = blkdev_get_by_dev(bdev->bd_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, 2582 + holder, NULL); 2583 + blkdev_put(bdev, NULL); 2584 + bdev = bdev2; 2552 2585 if (IS_ERR(bdev)) { 2553 - if (bdev == ERR_PTR(-EBUSY)) { 2586 + ret = PTR_ERR(bdev); 2587 + bdev = NULL; 2588 + if (ret == -EBUSY) { 2554 2589 dev_t dev; 2555 2590 2556 2591 mutex_lock(&bch_register_lock); ··· 2584 2571 else 2585 2572 err = "device busy"; 2586 
2573 mutex_unlock(&bch_register_lock); 2587 - if (attr == &ksysfs_register_quiet) 2588 - goto done; 2574 + if (attr == &ksysfs_register_quiet) { 2575 + quiet = true; 2576 + ret = size; 2577 + } 2589 2578 } 2590 - goto out_free_sb; 2579 + goto out_free_holder; 2591 2580 } 2592 - 2593 - err = "failed to set blocksize"; 2594 - if (set_blocksize(bdev, 4096)) 2595 - goto out_blkdev_put; 2596 - 2597 - err = read_super(sb, bdev, &sb_disk); 2598 - if (err) 2599 - goto out_blkdev_put; 2600 2581 2601 2582 err = "failed to register device"; 2602 2583 ··· 2602 2595 if (!args) { 2603 2596 ret = -ENOMEM; 2604 2597 err = "cannot allocate memory"; 2605 - goto out_put_sb_page; 2598 + goto out_free_holder; 2606 2599 } 2607 2600 2608 2601 args->path = path; 2609 2602 args->sb = sb; 2610 2603 args->sb_disk = sb_disk; 2611 2604 args->bdev = bdev; 2605 + args->holder = holder; 2612 2606 register_device_async(args); 2613 2607 /* No wait and returns to user space */ 2614 2608 goto async_done; 2615 2609 } 2616 2610 2617 2611 if (SB_IS_BDEV(sb)) { 2618 - struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); 2619 - 2620 - if (!dc) { 2621 - ret = -ENOMEM; 2622 - err = "cannot allocate memory"; 2623 - goto out_put_sb_page; 2624 - } 2625 - 2626 2612 mutex_lock(&bch_register_lock); 2627 - ret = register_bdev(sb, sb_disk, bdev, dc); 2613 + ret = register_bdev(sb, sb_disk, bdev, holder); 2628 2614 mutex_unlock(&bch_register_lock); 2629 2615 /* blkdev_put() will be called in cached_dev_free() */ 2630 2616 if (ret < 0) 2631 2617 goto out_free_sb; 2632 2618 } else { 2633 - struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2634 - 2635 - if (!ca) { 2636 - ret = -ENOMEM; 2637 - err = "cannot allocate memory"; 2638 - goto out_put_sb_page; 2639 - } 2640 - 2641 2619 /* blkdev_put() will be called in bch_cache_release() */ 2642 - ret = register_cache(sb, sb_disk, bdev, ca); 2620 + ret = register_cache(sb, sb_disk, bdev, holder); 2643 2621 if (ret) 2644 2622 goto out_free_sb; 2645 2623 } 2646 2624 
2647 - done: 2648 2625 kfree(sb); 2649 2626 kfree(path); 2650 2627 module_put(THIS_MODULE); 2651 2628 async_done: 2652 2629 return size; 2653 2630 2631 + out_free_holder: 2632 + kfree(holder); 2654 2633 out_put_sb_page: 2655 2634 put_page(virt_to_page(sb_disk)); 2656 2635 out_blkdev_put: 2657 - blkdev_put(bdev, register_bcache); 2636 + if (bdev) 2637 + blkdev_put(bdev, holder); 2658 2638 out_free_sb: 2659 2639 kfree(sb); 2660 2640 out_free_path: ··· 2650 2656 out_module_put: 2651 2657 module_put(THIS_MODULE); 2652 2658 out: 2653 - pr_info("error %s: %s\n", path?path:"", err); 2659 + if (!quiet) 2660 + pr_info("error %s: %s\n", path?path:"", err); 2654 2661 return ret; 2655 2662 } 2656 2663
+11 -21
drivers/md/md.c
··· 643 643 { 644 644 mutex_init(&mddev->open_mutex); 645 645 mutex_init(&mddev->reconfig_mutex); 646 - mutex_init(&mddev->delete_mutex); 647 646 mutex_init(&mddev->bitmap_info.mutex); 648 647 INIT_LIST_HEAD(&mddev->disks); 649 648 INIT_LIST_HEAD(&mddev->all_mddevs); ··· 748 749 749 750 static const struct attribute_group md_redundancy_group; 750 751 751 - static void md_free_rdev(struct mddev *mddev) 752 + void mddev_unlock(struct mddev *mddev) 752 753 { 753 754 struct md_rdev *rdev; 754 755 struct md_rdev *tmp; 756 + LIST_HEAD(delete); 755 757 756 - mutex_lock(&mddev->delete_mutex); 757 - if (list_empty(&mddev->deleting)) 758 - goto out; 758 + if (!list_empty(&mddev->deleting)) 759 + list_splice_init(&mddev->deleting, &delete); 759 760 760 - list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) { 761 - list_del_init(&rdev->same_set); 762 - kobject_del(&rdev->kobj); 763 - export_rdev(rdev, mddev); 764 - } 765 - out: 766 - mutex_unlock(&mddev->delete_mutex); 767 - } 768 - 769 - void mddev_unlock(struct mddev *mddev) 770 - { 771 761 if (mddev->to_remove) { 772 762 /* These cannot be removed under reconfig_mutex as 773 763 * an access to the files will try to take reconfig_mutex ··· 796 808 } else 797 809 mutex_unlock(&mddev->reconfig_mutex); 798 810 799 - md_free_rdev(mddev); 811 + list_for_each_entry_safe(rdev, tmp, &delete, same_set) { 812 + list_del_init(&rdev->same_set); 813 + kobject_del(&rdev->kobj); 814 + export_rdev(rdev, mddev); 815 + } 800 816 801 817 md_wakeup_thread(mddev->thread); 802 818 wake_up(&mddev->sb_wait); ··· 2450 2458 if (test_bit(AutoDetected, &rdev->flags)) 2451 2459 md_autodetect_dev(rdev->bdev->bd_dev); 2452 2460 #endif 2453 - blkdev_put(rdev->bdev, mddev->major_version == -2 ? &claim_rdev : rdev); 2461 + blkdev_put(rdev->bdev, mddev->external ? 
&claim_rdev : rdev); 2454 2462 rdev->bdev = NULL; 2455 2463 kobject_put(&rdev->kobj); 2456 2464 } ··· 2480 2488 * reconfig_mutex is held, hence it can't be called under 2481 2489 * reconfig_mutex and it's delayed to mddev_unlock(). 2482 2490 */ 2483 - mutex_lock(&mddev->delete_mutex); 2484 2491 list_add(&rdev->same_set, &mddev->deleting); 2485 - mutex_unlock(&mddev->delete_mutex); 2486 2492 } 2487 2493 2488 2494 static void export_array(struct mddev *mddev) ··· 6130 6140 mddev->resync_min = 0; 6131 6141 mddev->resync_max = MaxSector; 6132 6142 mddev->reshape_position = MaxSector; 6133 - mddev->external = 0; 6143 + /* we still need mddev->external in export_rdev, do not clear it yet */ 6134 6144 mddev->persistent = 0; 6135 6145 mddev->level = LEVEL_NONE; 6136 6146 mddev->clevel[0] = 0;
+1 -3
drivers/md/md.h
··· 531 531 532 532 /* 533 533 * Temporarily store rdev that will be finally removed when 534 - * reconfig_mutex is unlocked. 534 + * reconfig_mutex is unlocked, protected by reconfig_mutex. 535 535 */ 536 536 struct list_head deleting; 537 - /* Protect the deleting list */ 538 - struct mutex delete_mutex; 539 537 540 538 bool has_superblocks:1; 541 539 bool fail_last_dev:1;
+54 -8
drivers/md/raid0.c
··· 270 270 goto abort; 271 271 } 272 272 273 + if (conf->layout == RAID0_ORIG_LAYOUT) { 274 + for (i = 1; i < conf->nr_strip_zones; i++) { 275 + sector_t first_sector = conf->strip_zone[i-1].zone_end; 276 + 277 + sector_div(first_sector, mddev->chunk_sectors); 278 + zone = conf->strip_zone + i; 279 + /* disk_shift is first disk index used in the zone */ 280 + zone->disk_shift = sector_div(first_sector, 281 + zone->nb_dev); 282 + } 283 + } 284 + 273 285 pr_debug("md/raid0:%s: done.\n", mdname(mddev)); 274 286 *private_conf = conf; 275 287 ··· 443 431 return ret; 444 432 } 445 433 434 + /* 435 + * Convert disk_index to the disk order in which it is read/written. 436 + * For example, if we have 4 disks, they are numbered 0,1,2,3. If we 437 + * write the disks starting at disk 3, then the read/write order would 438 + * be disk 3, then 0, then 1, and then disk 2 and we want map_disk_shift() 439 + * to map the disks as follows 0,1,2,3 => 1,2,3,0. So disk 0 would map 440 + * to 1, 1 to 2, 2 to 3, and 3 to 0. That way we can compare disks in 441 + * that 'output' space to understand the read/write disk ordering. 
442 + */ 443 + static int map_disk_shift(int disk_index, int num_disks, int disk_shift) 444 + { 445 + return ((disk_index + num_disks - disk_shift) % num_disks); 446 + } 447 + 446 448 static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) 447 449 { 448 450 struct r0conf *conf = mddev->private; ··· 470 444 sector_t end_disk_offset; 471 445 unsigned int end_disk_index; 472 446 unsigned int disk; 447 + sector_t orig_start, orig_end; 473 448 449 + orig_start = start; 474 450 zone = find_zone(conf, &start); 475 451 476 452 if (bio_end_sector(bio) > zone->zone_end) { ··· 486 458 } else 487 459 end = bio_end_sector(bio); 488 460 461 + orig_end = end; 489 462 if (zone != conf->strip_zone) 490 463 end = end - zone[-1].zone_end; 491 464 ··· 498 469 last_stripe_index = end; 499 470 sector_div(last_stripe_index, stripe_size); 500 471 501 - start_disk_index = (int)(start - first_stripe_index * stripe_size) / 502 - mddev->chunk_sectors; 472 + /* In the first zone the original and alternate layouts are the same */ 473 + if ((conf->layout == RAID0_ORIG_LAYOUT) && (zone != conf->strip_zone)) { 474 + sector_div(orig_start, mddev->chunk_sectors); 475 + start_disk_index = sector_div(orig_start, zone->nb_dev); 476 + start_disk_index = map_disk_shift(start_disk_index, 477 + zone->nb_dev, 478 + zone->disk_shift); 479 + sector_div(orig_end, mddev->chunk_sectors); 480 + end_disk_index = sector_div(orig_end, zone->nb_dev); 481 + end_disk_index = map_disk_shift(end_disk_index, 482 + zone->nb_dev, zone->disk_shift); 483 + } else { 484 + start_disk_index = (int)(start - first_stripe_index * stripe_size) / 485 + mddev->chunk_sectors; 486 + end_disk_index = (int)(end - last_stripe_index * stripe_size) / 487 + mddev->chunk_sectors; 488 + } 503 489 start_disk_offset = ((int)(start - first_stripe_index * stripe_size) % 504 490 mddev->chunk_sectors) + 505 491 first_stripe_index * mddev->chunk_sectors; 506 - end_disk_index = (int)(end - last_stripe_index * stripe_size) / 507 - 
mddev->chunk_sectors; 508 492 end_disk_offset = ((int)(end - last_stripe_index * stripe_size) % 509 493 mddev->chunk_sectors) + 510 494 last_stripe_index * mddev->chunk_sectors; ··· 525 483 for (disk = 0; disk < zone->nb_dev; disk++) { 526 484 sector_t dev_start, dev_end; 527 485 struct md_rdev *rdev; 486 + int compare_disk; 528 487 529 - if (disk < start_disk_index) 488 + compare_disk = map_disk_shift(disk, zone->nb_dev, 489 + zone->disk_shift); 490 + 491 + if (compare_disk < start_disk_index) 530 492 dev_start = (first_stripe_index + 1) * 531 493 mddev->chunk_sectors; 532 - else if (disk > start_disk_index) 494 + else if (compare_disk > start_disk_index) 533 495 dev_start = first_stripe_index * mddev->chunk_sectors; 534 496 else 535 497 dev_start = start_disk_offset; 536 498 537 - if (disk < end_disk_index) 499 + if (compare_disk < end_disk_index) 538 500 dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; 539 - else if (disk > end_disk_index) 501 + else if (compare_disk > end_disk_index) 540 502 dev_end = last_stripe_index * mddev->chunk_sectors; 541 503 else 542 504 dev_end = end_disk_offset;
+1
drivers/md/raid0.h
··· 6 6 sector_t zone_end; /* Start of the next zone (in sectors) */ 7 7 sector_t dev_start; /* Zone offset in real dev (in sectors) */ 8 8 int nb_dev; /* # of devices attached to the zone */ 9 + int disk_shift; /* start disk for the original layout */ 9 10 }; 10 11 11 12 /* Linux 3.14 (20d0189b101) made an unintended change to
+1 -1
drivers/md/raid1-10.c
··· 116 116 117 117 static inline void raid1_submit_write(struct bio *bio) 118 118 { 119 - struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev; 119 + struct md_rdev *rdev = (void *)bio->bi_bdev; 120 120 121 121 bio->bi_next = NULL; 122 122 bio_set_dev(bio, rdev->bdev);
+3 -3
drivers/md/raid10.c
··· 325 325 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 326 326 bio->bi_status = BLK_STS_IOERR; 327 327 328 - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) 328 + if (r10_bio->start_time) 329 329 bio_end_io_acct(bio, r10_bio->start_time); 330 330 bio_endio(bio); 331 331 /* ··· 1118 1118 spin_lock_irq(&conf->device_lock); 1119 1119 bio_list_merge(&conf->pending_bio_list, &plug->pending); 1120 1120 spin_unlock_irq(&conf->device_lock); 1121 - wake_up(&conf->wait_barrier); 1121 + wake_up_barrier(conf); 1122 1122 md_wakeup_thread(mddev->thread); 1123 1123 kfree(plug); 1124 1124 return; ··· 1127 1127 /* we aren't scheduling, so we can do the write-out directly. */ 1128 1128 bio = bio_list_get(&plug->pending); 1129 1129 raid1_prepare_flush_writes(mddev->bitmap); 1130 - wake_up(&conf->wait_barrier); 1130 + wake_up_barrier(conf); 1131 1131 1132 1132 while (bio) { /* submit pending writes */ 1133 1133 struct bio *next = bio->bi_next;
+1 -1
drivers/nvme/host/constants.c
··· 12 12 [nvme_cmd_read] = "Read", 13 13 [nvme_cmd_write_uncor] = "Write Uncorrectable", 14 14 [nvme_cmd_compare] = "Compare", 15 - [nvme_cmd_write_zeroes] = "Write Zeros", 15 + [nvme_cmd_write_zeroes] = "Write Zeroes", 16 16 [nvme_cmd_dsm] = "Dataset Management", 17 17 [nvme_cmd_verify] = "Verify", 18 18 [nvme_cmd_resv_register] = "Reservation Register",
+5 -1
drivers/nvme/host/core.c
··· 1134 1134 mutex_unlock(&ctrl->scan_lock); 1135 1135 } 1136 1136 if (effects & NVME_CMD_EFFECTS_CCC) { 1137 - dev_info(ctrl->device, 1137 + if (!test_and_set_bit(NVME_CTRL_DIRTY_CAPABILITY, 1138 + &ctrl->flags)) { 1139 + dev_info(ctrl->device, 1138 1140 "controller capabilities changed, reset may be required to take effect.\n"); 1141 + } 1139 1142 } 1140 1143 if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { 1141 1144 nvme_queue_scan(ctrl); ··· 3180 3177 return ret; 3181 3178 } 3182 3179 3180 + clear_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags); 3183 3181 ctrl->identified = true; 3184 3182 3185 3183 return 0;
+19 -51
drivers/nvme/host/ioctl.c
··· 505 505 { 506 506 struct io_uring_cmd *ioucmd = req->end_io_data; 507 507 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 508 - void *cookie = READ_ONCE(ioucmd->cookie); 509 508 510 509 req->bio = pdu->bio; 511 510 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) ··· 517 518 * For iopoll, complete it directly. 518 519 * Otherwise, move the completion to task work. 519 520 */ 520 - if (cookie != NULL && blk_rq_is_poll(req)) 521 + if (blk_rq_is_poll(req)) { 522 + WRITE_ONCE(ioucmd->cookie, NULL); 521 523 nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); 522 - else 524 + } else { 523 525 io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); 526 + } 524 527 525 528 return RQ_END_IO_FREE; 526 529 } ··· 532 531 { 533 532 struct io_uring_cmd *ioucmd = req->end_io_data; 534 533 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 535 - void *cookie = READ_ONCE(ioucmd->cookie); 536 534 537 535 req->bio = pdu->bio; 538 536 pdu->req = req; ··· 540 540 * For iopoll, complete it directly. 541 541 * Otherwise, move the completion to task work. 
542 542 */ 543 - if (cookie != NULL && blk_rq_is_poll(req)) 543 + if (blk_rq_is_poll(req)) { 544 + WRITE_ONCE(ioucmd->cookie, NULL); 544 545 nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED); 545 - else 546 + } else { 546 547 io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb); 548 + } 547 549 548 550 return RQ_END_IO_NONE; 549 551 } ··· 601 599 if (issue_flags & IO_URING_F_IOPOLL) 602 600 rq_flags |= REQ_POLLED; 603 601 604 - retry: 605 602 req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags); 606 603 if (IS_ERR(req)) 607 604 return PTR_ERR(req); ··· 614 613 return ret; 615 614 } 616 615 617 - if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) { 618 - if (unlikely(!req->bio)) { 619 - /* we can't poll this, so alloc regular req instead */ 620 - blk_mq_free_request(req); 621 - rq_flags &= ~REQ_POLLED; 622 - goto retry; 623 - } else { 624 - WRITE_ONCE(ioucmd->cookie, req->bio); 625 - req->bio->bi_opf |= REQ_POLLED; 626 - } 616 + if (blk_rq_is_poll(req)) { 617 + ioucmd->flags |= IORING_URING_CMD_POLLED; 618 + WRITE_ONCE(ioucmd->cookie, req); 627 619 } 620 + 628 621 /* to free bio on completion, as req->bio will be null at that time */ 629 622 pdu->bio = req->bio; 630 623 pdu->meta_len = d.metadata_len; ··· 780 785 struct io_comp_batch *iob, 781 786 unsigned int poll_flags) 782 787 { 783 - struct bio *bio; 788 + struct request *req; 784 789 int ret = 0; 785 - struct nvme_ns *ns; 786 - struct request_queue *q; 790 + 791 + if (!(ioucmd->flags & IORING_URING_CMD_POLLED)) 792 + return 0; 787 793 788 794 rcu_read_lock(); 789 - bio = READ_ONCE(ioucmd->cookie); 790 - ns = container_of(file_inode(ioucmd->file)->i_cdev, 791 - struct nvme_ns, cdev); 792 - q = ns->queue; 793 - if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev) 794 - ret = bio_poll(bio, iob, poll_flags); 795 + req = READ_ONCE(ioucmd->cookie); 796 + if (req && blk_rq_is_poll(req)) 797 + ret = blk_rq_poll(req, iob, poll_flags); 795 798 rcu_read_unlock(); 796 799 
return ret; 797 800 } ··· 880 887 881 888 if (ns) 882 889 ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags); 883 - srcu_read_unlock(&head->srcu, srcu_idx); 884 - return ret; 885 - } 886 - 887 - int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, 888 - struct io_comp_batch *iob, 889 - unsigned int poll_flags) 890 - { 891 - struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; 892 - struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); 893 - int srcu_idx = srcu_read_lock(&head->srcu); 894 - struct nvme_ns *ns = nvme_find_path(head); 895 - struct bio *bio; 896 - int ret = 0; 897 - struct request_queue *q; 898 - 899 - if (ns) { 900 - rcu_read_lock(); 901 - bio = READ_ONCE(ioucmd->cookie); 902 - q = ns->queue; 903 - if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio 904 - && bio->bi_bdev) 905 - ret = bio_poll(bio, iob, poll_flags); 906 - rcu_read_unlock(); 907 - } 908 890 srcu_read_unlock(&head->srcu, srcu_idx); 909 891 return ret; 910 892 }
+9 -1
drivers/nvme/host/multipath.c
··· 106 106 bio->bi_opf &= ~REQ_POLLED; 107 107 bio->bi_cookie = BLK_QC_T_NONE; 108 108 } 109 + /* 110 + * The alternate request queue that we may end up submitting 111 + * the bio to may be frozen temporarily, in this case REQ_NOWAIT 112 + * will fail the I/O immediately with EAGAIN to the issuer. 113 + * We are not in the issuer context which cannot block. Clear 114 + * the flag to avoid spurious EAGAIN I/O failures. 115 + */ 116 + bio->bi_opf &= ~REQ_NOWAIT; 109 117 } 110 118 blk_steal_bios(&ns->head->requeue_list, req); 111 119 spin_unlock_irqrestore(&ns->head->requeue_lock, flags); ··· 478 470 .unlocked_ioctl = nvme_ns_head_chr_ioctl, 479 471 .compat_ioctl = compat_ptr_ioctl, 480 472 .uring_cmd = nvme_ns_head_chr_uring_cmd, 481 - .uring_cmd_iopoll = nvme_ns_head_chr_uring_cmd_iopoll, 473 + .uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll, 482 474 }; 483 475 484 476 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
+1 -2
drivers/nvme/host/nvme.h
··· 250 250 NVME_CTRL_STARTED_ONCE = 2, 251 251 NVME_CTRL_STOPPED = 3, 252 252 NVME_CTRL_SKIP_ID_CNS_CS = 4, 253 + NVME_CTRL_DIRTY_CAPABILITY = 5, 253 254 }; 254 255 255 256 struct nvme_ctrl { ··· 856 855 long nvme_dev_ioctl(struct file *file, unsigned int cmd, 857 856 unsigned long arg); 858 857 int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, 859 - struct io_comp_batch *iob, unsigned int poll_flags); 860 - int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, 861 858 struct io_comp_batch *iob, unsigned int poll_flags); 862 859 int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, 863 860 unsigned int issue_flags);
+4 -1
drivers/nvme/host/pci.c
··· 2690 2690 if (dev->ctrl.state != NVME_CTRL_RESETTING) { 2691 2691 dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n", 2692 2692 dev->ctrl.state); 2693 - return; 2693 + result = -ENODEV; 2694 + goto out; 2694 2695 } 2695 2696 2696 2697 /* ··· 2778 2777 result); 2779 2778 nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); 2780 2779 nvme_dev_disable(dev, true); 2780 + nvme_sync_queues(&dev->ctrl); 2781 2781 nvme_mark_namespaces_dead(&dev->ctrl); 2782 + nvme_unquiesce_io_queues(&dev->ctrl); 2782 2783 nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); 2783 2784 } 2784 2785
+1 -1
drivers/nvme/target/nvmet.h
··· 79 79 struct completion disable_done; 80 80 mempool_t *bvec_pool; 81 81 82 - int use_p2pmem; 83 82 struct pci_dev *p2p_dev; 83 + int use_p2pmem; 84 84 int pi_type; 85 85 int metadata_size; 86 86 u8 csi;
+7 -1
include/linux/blk-mq.h
··· 715 715 void blk_mq_free_tag_set(struct blk_mq_tag_set *set); 716 716 717 717 void blk_mq_free_request(struct request *rq); 718 + int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, 719 + unsigned int poll_flags); 718 720 719 721 bool blk_mq_queue_inflight(struct request_queue *q); 720 722 ··· 854 852 struct io_comp_batch *iob, int ioerror, 855 853 void (*complete)(struct io_comp_batch *)) 856 854 { 857 - if (!iob || (req->rq_flags & RQF_USE_SCHED) || ioerror || 855 + /* 856 + * blk_mq_end_request_batch() can't end request allocated from 857 + * sched tags 858 + */ 859 + if (!iob || (req->rq_flags & RQF_SCHED_TAGS) || ioerror || 858 860 (req->end_io && !blk_rq_is_passthrough(req))) 859 861 return false; 860 862
+2
include/uapi/linux/io_uring.h
··· 244 244 * sqe->uring_cmd_flags 245 245 * IORING_URING_CMD_FIXED use registered buffer; pass this flag 246 246 * along with setting sqe->buf_index. 247 + * IORING_URING_CMD_POLLED driver use only 247 248 */ 248 249 #define IORING_URING_CMD_FIXED (1U << 0) 250 + #define IORING_URING_CMD_POLLED (1U << 31) 249 251 250 252 251 253 /*