Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-5.18/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

- Significant refactoring and fixing of how DM core does bio-based IO
accounting with focus on fixing wildly inaccurate IO stats for
dm-crypt (and other DM targets that defer bio submission in their own
workqueues). End result is proper IO accounting, made possible by
targets being updated to use the new dm_submit_bio_remap() interface.

- Add hipri bio polling support (REQ_POLLED) to bio-based DM.

- Reduce dm_io and dm_target_io structs so that a single dm_io (which
contains dm_target_io and first clone bio) weighs in at 256 bytes.
For reference the bio struct is 128 bytes.

- Various other small cleanups, fixes or improvements in DM core and
targets.

- Update MAINTAINERS with my kernel.org email address to allow
distinction between my "upstream" and "Red" Hats.

* tag 'for-5.18/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (46 commits)
dm: consolidate spinlocks in dm_io struct
dm: reduce size of dm_io and dm_target_io structs
dm: switch dm_target_io booleans over to proper flags
dm: switch dm_io booleans over to proper flags
dm: update email address in MAINTAINERS
dm: return void from __send_empty_flush
dm: factor out dm_io_complete
dm cache: use dm_submit_bio_remap
dm: simplify dm_sumbit_bio_remap interface
dm thin: use dm_submit_bio_remap
dm: add WARN_ON_ONCE to dm_submit_bio_remap
dm: support bio polling
block: add ->poll_bio to block_device_operations
dm mpath: use DMINFO instead of printk with KERN_INFO
dm: stop using bdevname
dm-zoned: remove the ->name field in struct dmz_dev
dm: remove unnecessary local variables in __bind
dm: requeue IO if mapping table not yet available
dm io: remove stale comment block for dm_io()
dm thin metadata: remove unused dm_thin_remove_block and __remove
...

+686 -362
+1 -1
MAINTAINERS
··· 5605 5605 5606 5606 DEVICE-MAPPER (LVM) 5607 5607 M: Alasdair Kergon <agk@redhat.com> 5608 - M: Mike Snitzer <snitzer@redhat.com> 5608 + M: Mike Snitzer <snitzer@kernel.org> 5609 5609 M: dm-devel@redhat.com 5610 5610 L: dm-devel@redhat.com 5611 5611 S: Maintained
+9 -5
block/blk-core.c
··· 688 688 * 689 689 * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. 690 690 * bio_list_on_stack[1] contains bios that were submitted before the current 691 - * ->submit_bio_bio, but that haven't been processed yet. 691 + * ->submit_bio, but that haven't been processed yet. 692 692 */ 693 693 static void __submit_bio_noacct(struct bio *bio) 694 694 { ··· 955 955 { 956 956 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 957 957 blk_qc_t cookie = READ_ONCE(bio->bi_cookie); 958 - int ret; 958 + int ret = 0; 959 959 960 960 if (cookie == BLK_QC_T_NONE || 961 961 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) ··· 965 965 966 966 if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT)) 967 967 return 0; 968 - if (WARN_ON_ONCE(!queue_is_mq(q))) 969 - ret = 0; /* not yet implemented, should not happen */ 970 - else 968 + if (queue_is_mq(q)) { 971 969 ret = blk_mq_poll(q, cookie, iob, flags); 970 + } else { 971 + struct gendisk *disk = q->disk; 972 + 973 + if (disk && disk->fops->poll_bio) 974 + ret = disk->fops->poll_bio(bio, iob, flags); 975 + } 972 976 blk_queue_exit(q); 973 977 return ret; 974 978 }
+4
block/genhd.c
··· 412 412 struct device *ddev = disk_to_dev(disk); 413 413 int ret; 414 414 415 + /* Only makes sense for bio-based to set ->poll_bio */ 416 + if (queue_is_mq(disk->queue) && disk->fops->poll_bio) 417 + return -EINVAL; 418 + 415 419 /* 416 420 * The disk queue should now be all set with enough information about 417 421 * the device for the elevator code to pick an adequate default
+3 -1
drivers/md/dm-cache-policy-smq.c
··· 1026 1026 * This scheme reminds me of a graph of entropy vs probability of a 1027 1027 * binary variable. 1028 1028 */ 1029 - static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1}; 1029 + static const unsigned int table[] = { 1030 + 1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1 1031 + }; 1030 1032 1031 1033 unsigned hits = mq->cache_stats.hits; 1032 1034 unsigned misses = mq->cache_stats.misses;
+8 -9
drivers/md/dm-cache-target.c
··· 803 803 static void accounted_request(struct cache *cache, struct bio *bio) 804 804 { 805 805 accounted_begin(cache, bio); 806 - submit_bio_noacct(bio); 806 + dm_submit_bio_remap(bio, NULL); 807 807 } 808 808 809 809 static void issue_op(struct bio *bio, void *context) ··· 1708 1708 bool commit_needed; 1709 1709 1710 1710 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1711 - submit_bio_noacct(bio); 1711 + dm_submit_bio_remap(bio, NULL); 1712 1712 1713 1713 return commit_needed; 1714 1714 } ··· 1774 1774 1775 1775 if (cache->features.discard_passdown) { 1776 1776 remap_to_origin(cache, bio); 1777 - submit_bio_noacct(bio); 1777 + dm_submit_bio_remap(bio, NULL); 1778 1778 } else 1779 1779 bio_endio(bio); 1780 1780 ··· 2015 2015 { 2016 2016 int r; 2017 2017 sector_t metadata_dev_size; 2018 - char b[BDEVNAME_SIZE]; 2019 2018 2020 2019 if (!at_least_one_arg(as, error)) 2021 2020 return -EINVAL; ··· 2028 2029 2029 2030 metadata_dev_size = get_dev_size(ca->metadata_dev); 2030 2031 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2031 - DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2032 - bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2032 + DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.", 2033 + ca->metadata_dev->bdev, THIN_METADATA_MAX_SECTORS); 2033 2034 2034 2035 return 0; 2035 2036 } ··· 2356 2357 2357 2358 cache->ti = ca->ti; 2358 2359 ti->private = cache; 2360 + ti->accounts_remapped_io = true; 2359 2361 ti->num_flush_bios = 2; 2360 2362 ti->flush_supported = true; 2361 2363 ··· 3345 3345 struct block_device *origin_bdev = cache->origin_dev->bdev; 3346 3346 struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; 3347 3347 const char *reason = NULL; 3348 - char buf[BDEVNAME_SIZE]; 3349 3348 3350 3349 if (!cache->features.discard_passdown) 3351 3350 return; ··· 3356 3357 reason = "max discard sectors smaller than a block"; 3357 3358 3358 3359 if (reason) { 3359 - DMWARN("Origin device (%s) %s: Disabling discard passdown.", 3360 - bdevname(origin_bdev, buf), reason); 3360 + DMWARN("Origin device (%pg) %s: Disabling discard passdown.", 3361 + origin_bdev, reason); 3361 3362 cache->features.discard_passdown = false; 3362 3363 } 3363 3364 }
+4 -6
drivers/md/dm-clone-target.c
··· 1682 1682 { 1683 1683 int r; 1684 1684 sector_t metadata_dev_size; 1685 - char b[BDEVNAME_SIZE]; 1686 1685 1687 1686 r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 1688 1687 &clone->metadata_dev); ··· 1692 1693 1693 1694 metadata_dev_size = get_dev_size(clone->metadata_dev); 1694 1695 if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING) 1695 - DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 1696 - bdevname(clone->metadata_dev->bdev, b), DM_CLONE_METADATA_MAX_SECTORS); 1696 + DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.", 1697 + clone->metadata_dev->bdev, DM_CLONE_METADATA_MAX_SECTORS); 1697 1698 1698 1699 return 0; 1699 1700 } ··· 2032 2033 struct block_device *dest_dev = clone->dest_dev->bdev; 2033 2034 struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits; 2034 2035 const char *reason = NULL; 2035 - char buf[BDEVNAME_SIZE]; 2036 2036 2037 2037 if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) 2038 2038 return; ··· 2042 2044 reason = "max discard sectors smaller than a region"; 2043 2045 2044 2046 if (reason) { 2045 - DMWARN("Destination device (%s) %s: Disabling discard passdown.", 2046 - bdevname(dest_dev, buf), reason); 2047 + DMWARN("Destination device (%pg) %s: Disabling discard passdown.", 2048 + dest_dev, reason); 2047 2049 clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); 2048 2050 } 2049 2051 }
+71 -28
drivers/md/dm-core.h
··· 64 64 struct gendisk *disk; 65 65 struct dax_device *dax_dev; 66 66 67 + wait_queue_head_t wait; 68 + unsigned long __percpu *pending_io; 69 + 70 + /* forced geometry settings */ 71 + struct hd_geometry geometry; 72 + 73 + /* 74 + * Processing queue (flush) 75 + */ 76 + struct workqueue_struct *wq; 77 + 67 78 /* 68 79 * A list of ios that arrived while we were suspended. 69 80 */ 70 81 struct work_struct work; 71 - wait_queue_head_t wait; 72 82 spinlock_t deferred_lock; 73 83 struct bio_list deferred; 74 84 ··· 93 83 struct list_head uevent_list; 94 84 spinlock_t uevent_lock; /* Protect access to uevent_list */ 95 85 86 + /* for blk-mq request-based DM support */ 87 + bool init_tio_pdu:1; 88 + struct blk_mq_tag_set *tag_set; 89 + 90 + struct dm_stats stats; 91 + 96 92 /* the number of internal suspends */ 97 93 unsigned internal_suspend_count; 94 + 95 + int swap_bios; 96 + struct semaphore swap_bios_semaphore; 97 + struct mutex swap_bios_lock; 98 98 99 99 /* 100 100 * io objects are allocated from here. ··· 112 92 struct bio_set io_bs; 113 93 struct bio_set bs; 114 94 115 - /* 116 - * Processing queue (flush) 117 - */ 118 - struct workqueue_struct *wq; 119 - 120 - /* forced geometry settings */ 121 - struct hd_geometry geometry; 122 - 123 95 /* kobject and completion */ 124 96 struct dm_kobject_holder kobj_holder; 125 - 126 - int swap_bios; 127 - struct semaphore swap_bios_semaphore; 128 - struct mutex swap_bios_lock; 129 - 130 - struct dm_stats stats; 131 - 132 - /* for blk-mq request-based DM support */ 133 - struct blk_mq_tag_set *tag_set; 134 - bool init_tio_pdu:1; 135 97 136 98 struct srcu_struct io_barrier; 137 99 ··· 208 206 /* 209 207 * One of these is allocated per clone bio. 
210 208 */ 211 - #define DM_TIO_MAGIC 7282014 209 + #define DM_TIO_MAGIC 28714 212 210 struct dm_target_io { 213 - unsigned int magic; 211 + unsigned short magic; 212 + unsigned short flags; 213 + unsigned int target_bio_nr; 214 214 struct dm_io *io; 215 215 struct dm_target *ti; 216 - unsigned int target_bio_nr; 217 216 unsigned int *len_ptr; 218 - bool inside_dm_io; 217 + sector_t old_sector; 219 218 struct bio clone; 220 219 }; 220 + 221 + /* 222 + * dm_target_io flags 223 + */ 224 + enum { 225 + DM_TIO_INSIDE_DM_IO, 226 + DM_TIO_IS_DUPLICATE_BIO 227 + }; 228 + 229 + static inline bool dm_tio_flagged(struct dm_target_io *tio, unsigned int bit) 230 + { 231 + return (tio->flags & (1U << bit)) != 0; 232 + } 233 + 234 + static inline void dm_tio_set_flag(struct dm_target_io *tio, unsigned int bit) 235 + { 236 + tio->flags |= (1U << bit); 237 + } 221 238 222 239 /* 223 240 * One of these is allocated per original bio. 224 241 * It contains the first clone used for that original. 225 242 */ 226 - #define DM_IO_MAGIC 5191977 243 + #define DM_IO_MAGIC 19577 227 244 struct dm_io { 228 - unsigned int magic; 229 - struct mapped_device *md; 230 - blk_status_t status; 245 + unsigned short magic; 246 + unsigned short flags; 231 247 atomic_t io_count; 248 + struct mapped_device *md; 232 249 struct bio *orig_bio; 250 + blk_status_t status; 251 + spinlock_t lock; 233 252 unsigned long start_time; 234 - spinlock_t endio_lock; 253 + void *data; 254 + struct hlist_node node; 255 + struct task_struct *map_task; 235 256 struct dm_stats_aux stats_aux; 236 257 /* last member of dm_target_io is 'struct bio' */ 237 258 struct dm_target_io tio; 238 259 }; 260 + 261 + /* 262 + * dm_io flags 263 + */ 264 + enum { 265 + DM_IO_START_ACCT, 266 + DM_IO_ACCOUNTED 267 + }; 268 + 269 + static inline bool dm_io_flagged(struct dm_io *io, unsigned int bit) 270 + { 271 + return (io->flags & (1U << bit)) != 0; 272 + } 273 + 274 + static inline void dm_io_set_flag(struct dm_io *io, unsigned int bit) 275 
+ { 276 + io->flags |= (1U << bit); 277 + } 239 278 240 279 static inline void dm_io_inc_pending(struct dm_io *io) 241 280 {
+9 -6
drivers/md/dm-crypt.c
··· 1827 1827 crypt_dec_pending(io); 1828 1828 } 1829 1829 1830 + #define CRYPT_MAP_READ_GFP GFP_NOWAIT 1831 + 1830 1832 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) 1831 1833 { 1832 1834 struct crypt_config *cc = io->cc; ··· 1856 1854 return 1; 1857 1855 } 1858 1856 1859 - submit_bio_noacct(clone); 1857 + dm_submit_bio_remap(io->base_bio, clone); 1860 1858 return 0; 1861 1859 } 1862 1860 ··· 1882 1880 { 1883 1881 struct bio *clone = io->ctx.bio_out; 1884 1882 1885 - submit_bio_noacct(clone); 1883 + dm_submit_bio_remap(io->base_bio, clone); 1886 1884 } 1887 1885 1888 1886 #define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node) ··· 1961 1959 1962 1960 if ((likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) || 1963 1961 test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) { 1964 - submit_bio_noacct(clone); 1962 + dm_submit_bio_remap(io->base_bio, clone); 1965 1963 return; 1966 1964 } 1967 1965 ··· 2580 2578 2581 2579 static int get_key_size(char **key_string) 2582 2580 { 2583 - return (*key_string[0] == ':') ? -EINVAL : strlen(*key_string) >> 1; 2581 + return (*key_string[0] == ':') ? 
-EINVAL : (int)(strlen(*key_string) >> 1); 2584 2582 } 2585 2583 2586 2584 #endif /* CONFIG_KEYS */ ··· 3363 3361 3364 3362 ti->num_flush_bios = 1; 3365 3363 ti->limit_swap_bios = true; 3364 + ti->accounts_remapped_io = true; 3366 3365 3367 3366 dm_audit_log_ctr(DM_MSG_PREFIX, ti, 1); 3368 3367 return 0; ··· 3432 3429 io->ctx.r.req = (struct skcipher_request *)(io + 1); 3433 3430 3434 3431 if (bio_data_dir(io->base_bio) == READ) { 3435 - if (kcryptd_io_read(io, GFP_NOWAIT)) 3432 + if (kcryptd_io_read(io, CRYPT_MAP_READ_GFP)) 3436 3433 kcryptd_queue_read(io); 3437 3434 } else 3438 3435 kcryptd_queue_crypt(io); ··· 3627 3624 3628 3625 static struct target_type crypt_target = { 3629 3626 .name = "crypt", 3630 - .version = {1, 23, 0}, 3627 + .version = {1, 24, 0}, 3631 3628 .module = THIS_MODULE, 3632 3629 .ctr = crypt_ctr, 3633 3630 .dtr = crypt_dtr,
+3 -2
drivers/md/dm-delay.c
··· 72 72 while (bio) { 73 73 n = bio->bi_next; 74 74 bio->bi_next = NULL; 75 - submit_bio_noacct(bio); 75 + dm_submit_bio_remap(bio, NULL); 76 76 bio = n; 77 77 } 78 78 } ··· 232 232 233 233 ti->num_flush_bios = 1; 234 234 ti->num_discard_bios = 1; 235 + ti->accounts_remapped_io = true; 235 236 ti->per_io_data_size = sizeof(struct dm_delay_info); 236 237 return 0; 237 238 ··· 356 355 357 356 static struct target_type delay_target = { 358 357 .name = "delay", 359 - .version = {1, 2, 1}, 358 + .version = {1, 3, 0}, 360 359 .features = DM_TARGET_PASSES_INTEGRITY, 361 360 .module = THIS_MODULE, 362 361 .ctr = delay_ctr,
+3 -3
drivers/md/dm-ima.c
··· 455 455 scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, 456 456 "%sname=%s,uuid=%s;device_resume=no_data;", 457 457 DM_IMA_VERSION_STR, dev_name, dev_uuid); 458 - l += strlen(device_table_data); 458 + l = strlen(device_table_data); 459 459 460 460 } 461 461 ··· 568 568 scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, 569 569 "%sname=%s,uuid=%s;device_remove=no_data;", 570 570 DM_IMA_VERSION_STR, dev_name, dev_uuid); 571 - l += strlen(device_table_data); 571 + l = strlen(device_table_data); 572 572 } 573 573 574 574 memcpy(device_table_data + l, remove_all_str, remove_all_len); ··· 654 654 scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, 655 655 "%sname=%s,uuid=%s;table_clear=no_data;", 656 656 DM_IMA_VERSION_STR, dev_name, dev_uuid); 657 - l += strlen(device_table_data); 657 + l = strlen(device_table_data); 658 658 } 659 659 660 660 capacity_len = strlen(capacity_str);
-8
drivers/md/dm-io.c
··· 525 525 return 0; 526 526 } 527 527 528 - /* 529 - * New collapsed (a)synchronous interface. 530 - * 531 - * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug 532 - * the queue with blk_unplug() some time later or set REQ_SYNC in 533 - * io_req->bi_opf. If you fail to do one of these, the IO will be submitted to 534 - * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c. 535 - */ 536 528 int dm_io(struct dm_io_request *io_req, unsigned num_regions, 537 529 struct dm_io_region *where, unsigned long *sync_error_bits) 538 530 {
+2
drivers/md/dm-ioctl.c
··· 18 18 #include <linux/dm-ioctl.h> 19 19 #include <linux/hdreg.h> 20 20 #include <linux/compat.h> 21 + #include <linux/nospec.h> 21 22 22 23 #include <linux/uaccess.h> 23 24 #include <linux/ima.h> ··· 1789 1788 if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) 1790 1789 return NULL; 1791 1790 1791 + cmd = array_index_nospec(cmd, ARRAY_SIZE(_ioctls)); 1792 1792 *ioctl_flags = _ioctls[cmd].flags; 1793 1793 return _ioctls[cmd].fn; 1794 1794 }
+1 -4
drivers/md/dm-mpath.c
··· 899 899 if (m->hw_handler_name) { 900 900 r = scsi_dh_attach(q, m->hw_handler_name); 901 901 if (r == -EBUSY) { 902 - char b[BDEVNAME_SIZE]; 903 - 904 - printk(KERN_INFO "dm-mpath: retaining handler on device %s\n", 905 - bdevname(bdev, b)); 902 + DMINFO("retaining handler on device %pg", bdev); 906 903 goto retain; 907 904 } 908 905 if (r < 0) {
+6 -1
drivers/md/dm-rq.c
··· 491 491 492 492 if (unlikely(!ti)) { 493 493 int srcu_idx; 494 - struct dm_table *map = dm_get_live_table(md, &srcu_idx); 494 + struct dm_table *map; 495 495 496 + map = dm_get_live_table(md, &srcu_idx); 497 + if (unlikely(!map)) { 498 + dm_put_live_table(md, srcu_idx); 499 + return BLK_STS_RESOURCE; 500 + } 496 501 ti = dm_table_find_target(map, 0); 497 502 dm_put_live_table(md, srcu_idx); 498 503 }
+29 -5
drivers/md/dm-stats.c
··· 195 195 196 196 mutex_init(&stats->mutex); 197 197 INIT_LIST_HEAD(&stats->list); 198 + stats->precise_timestamps = false; 198 199 stats->last = alloc_percpu(struct dm_stats_last_position); 199 200 for_each_possible_cpu(cpu) { 200 201 last = per_cpu_ptr(stats->last, cpu); ··· 230 229 } 231 230 free_percpu(stats->last); 232 231 mutex_destroy(&stats->mutex); 232 + } 233 + 234 + static void dm_stats_recalc_precise_timestamps(struct dm_stats *stats) 235 + { 236 + struct list_head *l; 237 + struct dm_stat *tmp_s; 238 + bool precise_timestamps = false; 239 + 240 + list_for_each(l, &stats->list) { 241 + tmp_s = container_of(l, struct dm_stat, list_entry); 242 + if (tmp_s->stat_flags & STAT_PRECISE_TIMESTAMPS) { 243 + precise_timestamps = true; 244 + break; 245 + } 246 + } 247 + stats->precise_timestamps = precise_timestamps; 233 248 } 234 249 235 250 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, ··· 393 376 } 394 377 ret_id = s->id; 395 378 list_add_tail_rcu(&s->list_entry, l); 379 + 380 + dm_stats_recalc_precise_timestamps(stats); 381 + 396 382 mutex_unlock(&stats->mutex); 397 383 398 384 resume_callback(md); ··· 438 418 } 439 419 440 420 list_del_rcu(&s->list_entry); 421 + 422 + dm_stats_recalc_precise_timestamps(stats); 423 + 441 424 mutex_unlock(&stats->mutex); 442 425 443 426 /* ··· 644 621 645 622 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, 646 623 sector_t bi_sector, unsigned bi_sectors, bool end, 647 - unsigned long duration_jiffies, 624 + unsigned long start_time, 648 625 struct dm_stats_aux *stats_aux) 649 626 { 650 627 struct dm_stat *s; 651 628 sector_t end_sector; 652 629 struct dm_stats_last_position *last; 653 630 bool got_precise_time; 631 + unsigned long duration_jiffies = 0; 654 632 655 633 if (unlikely(!bi_sectors)) 656 634 return; ··· 671 647 )); 672 648 WRITE_ONCE(last->last_sector, end_sector); 673 649 WRITE_ONCE(last->last_rw, bi_rw); 674 - } 650 + } else 651 + duration_jiffies = 
jiffies - start_time; 675 652 676 653 rcu_read_lock(); 677 654 678 655 got_precise_time = false; 679 656 list_for_each_entry_rcu(s, &stats->list, list_entry) { 680 657 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) { 681 - if (!end) 682 - stats_aux->duration_ns = ktime_to_ns(ktime_get()); 683 - else 658 + /* start (!end) duration_ns is set by DM core's alloc_io() */ 659 + if (end) 684 660 stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns; 685 661 got_precise_time = true; 686 662 }
+8 -3
drivers/md/dm-stats.h
··· 13 13 struct mutex mutex; 14 14 struct list_head list; /* list of struct dm_stat */ 15 15 struct dm_stats_last_position __percpu *last; 16 - sector_t last_sector; 17 - unsigned last_rw; 16 + bool precise_timestamps; 18 17 }; 19 18 20 19 struct dm_stats_aux { ··· 31 32 32 33 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, 33 34 sector_t bi_sector, unsigned bi_sectors, bool end, 34 - unsigned long duration_jiffies, 35 + unsigned long start_time, 35 36 struct dm_stats_aux *aux); 36 37 37 38 static inline bool dm_stats_used(struct dm_stats *st) 38 39 { 39 40 return !list_empty(&st->list); 41 + } 42 + 43 + static inline void dm_stats_record_start(struct dm_stats *stats, struct dm_stats_aux *aux) 44 + { 45 + if (unlikely(stats->precise_timestamps)) 46 + aux->duration_ns = ktime_to_ns(ktime_get()); 40 47 } 41 48 42 49 #endif
+41 -16
drivers/md/dm-table.c
··· 230 230 sector_t dev_size = bdev_nr_sectors(bdev); 231 231 unsigned short logical_block_size_sectors = 232 232 limits->logical_block_size >> SECTOR_SHIFT; 233 - char b[BDEVNAME_SIZE]; 234 233 235 234 if (!dev_size) 236 235 return 0; 237 236 238 237 if ((start >= dev_size) || (start + len > dev_size)) { 239 - DMWARN("%s: %s too small for target: " 238 + DMWARN("%s: %pg too small for target: " 240 239 "start=%llu, len=%llu, dev_size=%llu", 241 - dm_device_name(ti->table->md), bdevname(bdev, b), 240 + dm_device_name(ti->table->md), bdev, 242 241 (unsigned long long)start, 243 242 (unsigned long long)len, 244 243 (unsigned long long)dev_size); ··· 252 253 unsigned int zone_sectors = bdev_zone_sectors(bdev); 253 254 254 255 if (start & (zone_sectors - 1)) { 255 - DMWARN("%s: start=%llu not aligned to h/w zone size %u of %s", 256 + DMWARN("%s: start=%llu not aligned to h/w zone size %u of %pg", 256 257 dm_device_name(ti->table->md), 257 258 (unsigned long long)start, 258 - zone_sectors, bdevname(bdev, b)); 259 + zone_sectors, bdev); 259 260 return 1; 260 261 } 261 262 ··· 269 270 * the sector range. 
270 271 */ 271 272 if (len & (zone_sectors - 1)) { 272 - DMWARN("%s: len=%llu not aligned to h/w zone size %u of %s", 273 + DMWARN("%s: len=%llu not aligned to h/w zone size %u of %pg", 273 274 dm_device_name(ti->table->md), 274 275 (unsigned long long)len, 275 - zone_sectors, bdevname(bdev, b)); 276 + zone_sectors, bdev); 276 277 return 1; 277 278 } 278 279 } ··· 282 283 283 284 if (start & (logical_block_size_sectors - 1)) { 284 285 DMWARN("%s: start=%llu not aligned to h/w " 285 - "logical block size %u of %s", 286 + "logical block size %u of %pg", 286 287 dm_device_name(ti->table->md), 287 288 (unsigned long long)start, 288 - limits->logical_block_size, bdevname(bdev, b)); 289 + limits->logical_block_size, bdev); 289 290 return 1; 290 291 } 291 292 292 293 if (len & (logical_block_size_sectors - 1)) { 293 294 DMWARN("%s: len=%llu not aligned to h/w " 294 - "logical block size %u of %s", 295 + "logical block size %u of %pg", 295 296 dm_device_name(ti->table->md), 296 297 (unsigned long long)len, 297 - limits->logical_block_size, bdevname(bdev, b)); 298 + limits->logical_block_size, bdev); 298 299 return 1; 299 300 } 300 301 ··· 399 400 struct queue_limits *limits = data; 400 401 struct block_device *bdev = dev->bdev; 401 402 struct request_queue *q = bdev_get_queue(bdev); 402 - char b[BDEVNAME_SIZE]; 403 403 404 404 if (unlikely(!q)) { 405 - DMWARN("%s: Cannot set limits for nonexistent device %s", 406 - dm_device_name(ti->table->md), bdevname(bdev, b)); 405 + DMWARN("%s: Cannot set limits for nonexistent device %pg", 406 + dm_device_name(ti->table->md), bdev); 407 407 return 0; 408 408 } 409 409 410 410 if (blk_stack_limits(limits, &q->limits, 411 411 get_start_sect(bdev) + start) < 0) 412 - DMWARN("%s: adding target device %s caused an alignment inconsistency: " 412 + DMWARN("%s: adding target device %pg caused an alignment inconsistency: " 413 413 "physical_block_size=%u, logical_block_size=%u, " 414 414 "alignment_offset=%u, start=%llu", 415 - 
dm_device_name(ti->table->md), bdevname(bdev, b), 415 + dm_device_name(ti->table->md), bdev, 416 416 q->limits.physical_block_size, 417 417 q->limits.logical_block_size, 418 418 q->limits.alignment_offset, ··· 1481 1483 return &t->targets[(KEYS_PER_NODE * n) + k]; 1482 1484 } 1483 1485 1486 + static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev, 1487 + sector_t start, sector_t len, void *data) 1488 + { 1489 + struct request_queue *q = bdev_get_queue(dev->bdev); 1490 + 1491 + return !test_bit(QUEUE_FLAG_POLL, &q->queue_flags); 1492 + } 1493 + 1484 1494 /* 1485 1495 * type->iterate_devices() should be called when the sanity check needs to 1486 1496 * iterate and check all underlying data devices. iterate_devices() will ··· 1537 1531 (*num_devices)++; 1538 1532 1539 1533 return 0; 1534 + } 1535 + 1536 + static int dm_table_supports_poll(struct dm_table *t) 1537 + { 1538 + return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL); 1540 1539 } 1541 1540 1542 1541 /* ··· 2079 2068 2080 2069 dm_update_crypto_profile(q, t); 2081 2070 disk_update_readahead(t->md->disk); 2071 + 2072 + /* 2073 + * Check for request-based device is left to 2074 + * dm_mq_init_request_queue()->blk_mq_init_allocated_queue(). 2075 + * 2076 + * For bio-based device, only set QUEUE_FLAG_POLL when all 2077 + * underlying devices supporting polling. 2078 + */ 2079 + if (__table_type_bio_based(t->type)) { 2080 + if (dm_table_supports_poll(t)) 2081 + blk_queue_flag_set(QUEUE_FLAG_POLL, q); 2082 + else 2083 + blk_queue_flag_clear(QUEUE_FLAG_POLL, q); 2084 + } 2082 2085 2083 2086 return 0; 2084 2087 }
-28
drivers/md/dm-thin-metadata.c
··· 1665 1665 return r; 1666 1666 } 1667 1667 1668 - static int __remove(struct dm_thin_device *td, dm_block_t block) 1669 - { 1670 - int r; 1671 - struct dm_pool_metadata *pmd = td->pmd; 1672 - dm_block_t keys[2] = { td->id, block }; 1673 - 1674 - r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root); 1675 - if (r) 1676 - return r; 1677 - 1678 - td->mapped_blocks--; 1679 - td->changed = true; 1680 - 1681 - return 0; 1682 - } 1683 - 1684 1668 static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end) 1685 1669 { 1686 1670 int r; ··· 1722 1738 value = cpu_to_le64(mapping_root); 1723 1739 __dm_bless_for_disk(&value); 1724 1740 return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root); 1725 - } 1726 - 1727 - int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) 1728 - { 1729 - int r = -EINVAL; 1730 - 1731 - pmd_write_lock(td->pmd); 1732 - if (!td->pmd->fail_io) 1733 - r = __remove(td, block); 1734 - pmd_write_unlock(td->pmd); 1735 - 1736 - return r; 1737 1741 } 1738 1742 1739 1743 int dm_thin_remove_range(struct dm_thin_device *td,
-1
drivers/md/dm-thin-metadata.h
··· 166 166 int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, 167 167 dm_block_t data_block); 168 168 169 - int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block); 170 169 int dm_thin_remove_range(struct dm_thin_device *td, 171 170 dm_block_t begin, dm_block_t end); 172 171
+7 -8
drivers/md/dm-thin.c
··· 161 161 162 162 static void throttle_work_update(struct throttle *t) 163 163 { 164 - if (!t->throttle_applied && jiffies > t->threshold) { 164 + if (!t->throttle_applied && time_is_before_jiffies(t->threshold)) { 165 165 down_write(&t->lock); 166 166 t->throttle_applied = true; 167 167 } ··· 755 755 struct pool *pool = tc->pool; 756 756 757 757 if (!bio_triggers_commit(tc, bio)) { 758 - submit_bio_noacct(bio); 758 + dm_submit_bio_remap(bio, NULL); 759 759 return; 760 760 } 761 761 ··· 2383 2383 if (bio->bi_opf & REQ_PREFLUSH) 2384 2384 bio_endio(bio); 2385 2385 else 2386 - submit_bio_noacct(bio); 2386 + dm_submit_bio_remap(bio, NULL); 2387 2387 } 2388 2388 } 2389 2389 ··· 2824 2824 struct block_device *data_bdev = pt->data_dev->bdev; 2825 2825 struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits; 2826 2826 const char *reason = NULL; 2827 - char buf[BDEVNAME_SIZE]; 2828 2827 2829 2828 if (!pt->adjusted_pf.discard_passdown) 2830 2829 return; ··· 2835 2836 reason = "max discard sectors smaller than a block"; 2836 2837 2837 2838 if (reason) { 2838 - DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason); 2839 + DMWARN("Data device (%pg) %s: Disabling discard passdown.", data_bdev, reason); 2839 2840 pt->adjusted_pf.discard_passdown = false; 2840 2841 } 2841 2842 } ··· 3200 3201 static void warn_if_metadata_device_too_big(struct block_device *bdev) 3201 3202 { 3202 3203 sector_t metadata_dev_size = get_dev_size(bdev); 3203 - char buffer[BDEVNAME_SIZE]; 3204 3204 3205 3205 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) 3206 - DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 3207 - bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); 3206 + DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.", 3207 + bdev, THIN_METADATA_MAX_SECTORS); 3208 3208 } 3209 3209 3210 3210 static sector_t get_metadata_dev_size(struct block_device *bdev) ··· 
4231 4233 4232 4234 ti->num_flush_bios = 1; 4233 4235 ti->flush_supported = true; 4236 + ti->accounts_remapped_io = true; 4234 4237 ti->per_io_data_size = sizeof(struct dm_thin_endio_hook); 4235 4238 4236 4239 /* In case the pool supports discards, pass them on. */
+2 -2
drivers/md/dm-zoned-metadata.c
··· 1101 1101 */ 1102 1102 static int dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set) 1103 1103 { 1104 - dmz_zmd_debug(zmd, "read superblock set %d dev %s block %llu", 1105 - set, sb->dev->name, sb->block); 1104 + dmz_zmd_debug(zmd, "read superblock set %d dev %pg block %llu", 1105 + set, sb->dev->bdev, sb->block); 1106 1106 1107 1107 return dmz_rdwr_block(sb->dev, REQ_OP_READ, 1108 1108 sb->block, sb->mblk->page);
-1
drivers/md/dm-zoned-target.c
··· 730 730 } 731 731 dev->bdev = bdev; 732 732 dev->dev_idx = idx; 733 - (void)bdevname(dev->bdev, dev->name); 734 733 735 734 dev->capacity = bdev_nr_sectors(bdev); 736 735 if (ti->begin) {
+4 -5
drivers/md/dm-zoned.h
··· 56 56 struct dmz_metadata *metadata; 57 57 struct dmz_reclaim *reclaim; 58 58 59 - char name[BDEVNAME_SIZE]; 60 59 uuid_t uuid; 61 60 62 61 sector_t capacity; ··· 175 176 * Message functions. 176 177 */ 177 178 #define dmz_dev_info(dev, format, args...) \ 178 - DMINFO("(%s): " format, (dev)->name, ## args) 179 + DMINFO("(%pg): " format, (dev)->bdev, ## args) 179 180 180 181 #define dmz_dev_err(dev, format, args...) \ 181 - DMERR("(%s): " format, (dev)->name, ## args) 182 + DMERR("(%pg): " format, (dev)->bdev, ## args) 182 183 183 184 #define dmz_dev_warn(dev, format, args...) \ 184 - DMWARN("(%s): " format, (dev)->name, ## args) 185 + DMWARN("(%pg): " format, (dev)->bdev, ## args) 185 186 186 187 #define dmz_dev_debug(dev, format, args...) \ 187 - DMDEBUG("(%s): " format, (dev)->name, ## args) 188 + DMDEBUG("(%pg): " format, (dev)->bdev, ## args) 188 189 189 190 /* 190 191 * Functions defined in dm-zoned-metadata.c
+459 -216
drivers/md/dm.c
··· 40 40 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 41 41 #define DM_COOKIE_LENGTH 24 42 42 43 + /* 44 + * For REQ_POLLED fs bio, this flag is set if we link mapped underlying 45 + * dm_io into one list, and reuse bio->bi_private as the list head. Before 46 + * ending this fs bio, we will recover its ->bi_private. 47 + */ 48 + #define REQ_DM_POLL_LIST REQ_DRV 49 + 43 50 static const char *_name = DM_NAME; 44 51 45 52 static unsigned int major = 0; ··· 80 73 struct dm_io *io; 81 74 sector_t sector; 82 75 unsigned sector_count; 76 + bool submit_as_polled; 83 77 }; 84 78 85 79 #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) ··· 94 86 95 87 void *dm_per_bio_data(struct bio *bio, size_t data_size) 96 88 { 97 - if (!clone_to_tio(bio)->inside_dm_io) 89 + if (!dm_tio_flagged(clone_to_tio(bio), DM_TIO_INSIDE_DM_IO)) 98 90 return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size; 99 91 return (char *)bio - DM_IO_BIO_OFFSET - data_size; 100 92 } ··· 493 485 } 494 486 EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); 495 487 496 - static void start_io_acct(struct dm_io *io) 488 + static bool bio_is_flush_with_data(struct bio *bio) 497 489 { 498 - struct mapped_device *md = io->md; 499 - struct bio *bio = io->orig_bio; 500 - 501 - bio_start_io_acct_time(bio, io->start_time); 502 - if (unlikely(dm_stats_used(&md->stats))) 503 - dm_stats_account_io(&md->stats, bio_data_dir(bio), 504 - bio->bi_iter.bi_sector, bio_sectors(bio), 505 - false, 0, &io->stats_aux); 490 + return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); 506 491 } 507 492 508 - static void end_io_acct(struct mapped_device *md, struct bio *bio, 509 - unsigned long start_time, struct dm_stats_aux *stats_aux) 493 + static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio, 494 + unsigned long start_time, struct dm_stats_aux *stats_aux) 510 495 { 511 - unsigned long duration = jiffies - start_time; 496 + bool is_flush_with_data; 497 + unsigned int bi_size; 512 498 513 - 
bio_end_io_acct(bio, start_time); 499 + /* If REQ_PREFLUSH set save any payload but do not account it */ 500 + is_flush_with_data = bio_is_flush_with_data(bio); 501 + if (is_flush_with_data) { 502 + bi_size = bio->bi_iter.bi_size; 503 + bio->bi_iter.bi_size = 0; 504 + } 505 + 506 + if (!end) 507 + bio_start_io_acct_time(bio, start_time); 508 + else 509 + bio_end_io_acct(bio, start_time); 514 510 515 511 if (unlikely(dm_stats_used(&md->stats))) 516 512 dm_stats_account_io(&md->stats, bio_data_dir(bio), 517 513 bio->bi_iter.bi_sector, bio_sectors(bio), 518 - true, duration, stats_aux); 514 + end, start_time, stats_aux); 519 515 520 - /* nudge anyone waiting on suspend queue */ 521 - if (unlikely(wq_has_sleeper(&md->wait))) 522 - wake_up(&md->wait); 516 + /* Restore bio's payload so it does get accounted upon requeue */ 517 + if (is_flush_with_data) 518 + bio->bi_iter.bi_size = bi_size; 519 + } 520 + 521 + static void __dm_start_io_acct(struct dm_io *io, struct bio *bio) 522 + { 523 + dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux); 524 + } 525 + 526 + static void dm_start_io_acct(struct dm_io *io, struct bio *clone) 527 + { 528 + /* Must account IO to DM device in terms of orig_bio */ 529 + struct bio *bio = io->orig_bio; 530 + 531 + /* 532 + * Ensure IO accounting is only ever started once. 533 + * Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. 
534 + */ 535 + if (!clone || 536 + likely(!dm_tio_flagged(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO))) { 537 + if (WARN_ON_ONCE(dm_io_flagged(io, DM_IO_ACCOUNTED))) 538 + return; 539 + dm_io_set_flag(io, DM_IO_ACCOUNTED); 540 + } else { 541 + unsigned long flags; 542 + if (dm_io_flagged(io, DM_IO_ACCOUNTED)) 543 + return; 544 + /* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */ 545 + spin_lock_irqsave(&io->lock, flags); 546 + dm_io_set_flag(io, DM_IO_ACCOUNTED); 547 + spin_unlock_irqrestore(&io->lock, flags); 548 + } 549 + 550 + __dm_start_io_acct(io, bio); 551 + } 552 + 553 + static void dm_end_io_acct(struct dm_io *io, struct bio *bio) 554 + { 555 + dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux); 523 556 } 524 557 525 558 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) ··· 572 523 clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs); 573 524 574 525 tio = clone_to_tio(clone); 575 - tio->inside_dm_io = true; 526 + tio->flags = 0; 527 + dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO); 576 528 tio->io = NULL; 577 529 578 530 io = container_of(tio, struct dm_io, tio); 579 531 io->magic = DM_IO_MAGIC; 580 532 io->status = 0; 581 533 atomic_set(&io->io_count, 1); 582 - io->orig_bio = bio; 534 + this_cpu_inc(*md->pending_io); 535 + io->orig_bio = NULL; 583 536 io->md = md; 584 - spin_lock_init(&io->endio_lock); 585 - 537 + io->map_task = current; 538 + spin_lock_init(&io->lock); 586 539 io->start_time = jiffies; 540 + io->flags = 0; 541 + 542 + dm_stats_record_start(&md->stats, &io->stats_aux); 587 543 588 544 return io; 589 545 } 590 546 591 - static void free_io(struct mapped_device *md, struct dm_io *io) 547 + static void free_io(struct dm_io *io) 592 548 { 593 549 bio_put(&io->tio.clone); 594 550 } ··· 602 548 unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) 603 549 { 604 550 struct dm_target_io *tio; 551 + struct bio *clone; 605 552 606 553 if (!ci->io->tio.io) { 607 554 /* the dm_target_io embedded in 
ci->io is available */ 608 555 tio = &ci->io->tio; 556 + /* alloc_io() already initialized embedded clone */ 557 + clone = &tio->clone; 609 558 } else { 610 - struct bio *clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio, 611 - gfp_mask, &ci->io->md->bs); 559 + clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio, 560 + gfp_mask, &ci->io->md->bs); 612 561 if (!clone) 613 562 return NULL; 614 563 564 + /* REQ_DM_POLL_LIST shouldn't be inherited */ 565 + clone->bi_opf &= ~REQ_DM_POLL_LIST; 566 + 615 567 tio = clone_to_tio(clone); 616 - tio->inside_dm_io = false; 568 + tio->flags = 0; /* also clears DM_TIO_INSIDE_DM_IO */ 617 569 } 618 570 619 571 tio->magic = DM_TIO_MAGIC; ··· 627 567 tio->ti = ti; 628 568 tio->target_bio_nr = target_bio_nr; 629 569 tio->len_ptr = len; 570 + tio->old_sector = 0; 630 571 631 - return &tio->clone; 572 + if (len) { 573 + clone->bi_iter.bi_size = to_bytes(*len); 574 + if (bio_integrity(clone)) 575 + bio_integrity_trim(clone); 576 + } 577 + 578 + return clone; 632 579 } 633 580 634 581 static void free_tio(struct bio *clone) 635 582 { 636 - if (clone_to_tio(clone)->inside_dm_io) 583 + if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO)) 637 584 return; 638 585 bio_put(clone); 639 586 } ··· 847 780 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 848 781 } 849 782 783 + static void dm_io_complete(struct dm_io *io) 784 + { 785 + blk_status_t io_error; 786 + struct mapped_device *md = io->md; 787 + struct bio *bio = io->orig_bio; 788 + 789 + if (io->status == BLK_STS_DM_REQUEUE) { 790 + unsigned long flags; 791 + /* 792 + * Target requested pushing back the I/O. 793 + */ 794 + spin_lock_irqsave(&md->deferred_lock, flags); 795 + if (__noflush_suspending(md) && 796 + !WARN_ON_ONCE(dm_is_zone_write(md, bio))) { 797 + /* NOTE early return due to BLK_STS_DM_REQUEUE below */ 798 + bio_list_add_head(&md->deferred, bio); 799 + } else { 800 + /* 801 + * noflush suspend was interrupted or this is 802 + * a write to a zoned target. 
803 + */ 804 + io->status = BLK_STS_IOERR; 805 + } 806 + spin_unlock_irqrestore(&md->deferred_lock, flags); 807 + } 808 + 809 + io_error = io->status; 810 + if (dm_io_flagged(io, DM_IO_ACCOUNTED)) 811 + dm_end_io_acct(io, bio); 812 + else if (!io_error) { 813 + /* 814 + * Must handle target that DM_MAPIO_SUBMITTED only to 815 + * then bio_endio() rather than dm_submit_bio_remap() 816 + */ 817 + __dm_start_io_acct(io, bio); 818 + dm_end_io_acct(io, bio); 819 + } 820 + free_io(io); 821 + smp_wmb(); 822 + this_cpu_dec(*md->pending_io); 823 + 824 + /* nudge anyone waiting on suspend queue */ 825 + if (unlikely(wq_has_sleeper(&md->wait))) 826 + wake_up(&md->wait); 827 + 828 + if (io_error == BLK_STS_DM_REQUEUE) { 829 + /* 830 + * Upper layer won't help us poll split bio, io->orig_bio 831 + * may only reflect a subset of the pre-split original, 832 + * so clear REQ_POLLED in case of requeue 833 + */ 834 + bio->bi_opf &= ~REQ_POLLED; 835 + return; 836 + } 837 + 838 + if (bio_is_flush_with_data(bio)) { 839 + /* 840 + * Preflush done for flush with data, reissue 841 + * without REQ_PREFLUSH. 842 + */ 843 + bio->bi_opf &= ~REQ_PREFLUSH; 844 + queue_io(md, bio); 845 + } else { 846 + /* done with normal IO or empty flush */ 847 + if (io_error) 848 + bio->bi_status = io_error; 849 + bio_endio(bio); 850 + } 851 + } 852 + 853 + static inline bool dm_tio_is_normal(struct dm_target_io *tio) 854 + { 855 + return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) && 856 + !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); 857 + } 858 + 850 859 /* 851 860 * Decrements the number of outstanding ios that a bio has been 852 861 * cloned into, completing the original io if necc. 
853 862 */ 854 863 void dm_io_dec_pending(struct dm_io *io, blk_status_t error) 855 864 { 856 - unsigned long flags; 857 - blk_status_t io_error; 858 - struct bio *bio; 859 - struct mapped_device *md = io->md; 860 - unsigned long start_time = 0; 861 - struct dm_stats_aux stats_aux; 862 - 863 865 /* Push-back supersedes any I/O errors */ 864 866 if (unlikely(error)) { 865 - spin_lock_irqsave(&io->endio_lock, flags); 866 - if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md))) 867 + unsigned long flags; 868 + spin_lock_irqsave(&io->lock, flags); 869 + if (!(io->status == BLK_STS_DM_REQUEUE && 870 + __noflush_suspending(io->md))) 867 871 io->status = error; 868 - spin_unlock_irqrestore(&io->endio_lock, flags); 872 + spin_unlock_irqrestore(&io->lock, flags); 869 873 } 870 874 871 - if (atomic_dec_and_test(&io->io_count)) { 872 - bio = io->orig_bio; 873 - if (io->status == BLK_STS_DM_REQUEUE) { 874 - /* 875 - * Target requested pushing back the I/O. 876 - */ 877 - spin_lock_irqsave(&md->deferred_lock, flags); 878 - if (__noflush_suspending(md) && 879 - !WARN_ON_ONCE(dm_is_zone_write(md, bio))) { 880 - /* NOTE early return due to BLK_STS_DM_REQUEUE below */ 881 - bio_list_add_head(&md->deferred, bio); 882 - } else { 883 - /* 884 - * noflush suspend was interrupted or this is 885 - * a write to a zoned target. 886 - */ 887 - io->status = BLK_STS_IOERR; 888 - } 889 - spin_unlock_irqrestore(&md->deferred_lock, flags); 890 - } 891 - 892 - io_error = io->status; 893 - start_time = io->start_time; 894 - stats_aux = io->stats_aux; 895 - free_io(md, io); 896 - end_io_acct(md, bio, start_time, &stats_aux); 897 - 898 - if (io_error == BLK_STS_DM_REQUEUE) 899 - return; 900 - 901 - if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) { 902 - /* 903 - * Preflush done for flush with data, reissue 904 - * without REQ_PREFLUSH. 
905 - */ 906 - bio->bi_opf &= ~REQ_PREFLUSH; 907 - queue_io(md, bio); 908 - } else { 909 - /* done with normal IO or empty flush */ 910 - if (io_error) 911 - bio->bi_status = io_error; 912 - bio_endio(bio); 913 - } 914 - } 875 + if (atomic_dec_and_test(&io->io_count)) 876 + dm_io_complete(io); 915 877 } 916 878 917 879 void disable_discard(struct mapped_device *md) ··· 1154 1058 /* 1155 1059 * A target may call dm_accept_partial_bio only from the map routine. It is 1156 1060 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management 1157 - * operations and REQ_OP_ZONE_APPEND (zone append writes). 1061 + * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by 1062 + * __send_duplicate_bios(). 1158 1063 * 1159 1064 * dm_accept_partial_bio informs the dm that the target only wants to process 1160 1065 * additional n_sectors sectors of the bio and the rest of the data should be ··· 1186 1089 struct dm_target_io *tio = clone_to_tio(bio); 1187 1090 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; 1188 1091 1189 - BUG_ON(bio->bi_opf & REQ_PREFLUSH); 1092 + BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); 1190 1093 BUG_ON(op_is_zone_mgmt(bio_op(bio))); 1191 1094 BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); 1192 1095 BUG_ON(bi_size > *tio->len_ptr); ··· 1196 1099 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; 1197 1100 } 1198 1101 EXPORT_SYMBOL_GPL(dm_accept_partial_bio); 1102 + 1103 + static inline void __dm_submit_bio_remap(struct bio *clone, 1104 + dev_t dev, sector_t old_sector) 1105 + { 1106 + trace_block_bio_remap(clone, dev, old_sector); 1107 + submit_bio_noacct(clone); 1108 + } 1109 + 1110 + /* 1111 + * @clone: clone bio that DM core passed to target's .map function 1112 + * @tgt_clone: clone of @clone bio that target needs submitted 1113 + * 1114 + * Targets should use this interface to submit bios they take 1115 + * ownership of when returning DM_MAPIO_SUBMITTED. 
1116 + * 1117 + * Target should also enable ti->accounts_remapped_io 1118 + */ 1119 + void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone) 1120 + { 1121 + struct dm_target_io *tio = clone_to_tio(clone); 1122 + struct dm_io *io = tio->io; 1123 + 1124 + WARN_ON_ONCE(!tio->ti->accounts_remapped_io); 1125 + 1126 + /* establish bio that will get submitted */ 1127 + if (!tgt_clone) 1128 + tgt_clone = clone; 1129 + 1130 + /* 1131 + * Account io->origin_bio to DM dev on behalf of target 1132 + * that took ownership of IO with DM_MAPIO_SUBMITTED. 1133 + */ 1134 + if (io->map_task == current) { 1135 + /* Still in target's map function */ 1136 + dm_io_set_flag(io, DM_IO_START_ACCT); 1137 + } else { 1138 + /* 1139 + * Called by another thread, managed by DM target, 1140 + * wait for dm_split_and_process_bio() to store 1141 + * io->orig_bio 1142 + */ 1143 + while (unlikely(!smp_load_acquire(&io->orig_bio))) 1144 + msleep(1); 1145 + dm_start_io_acct(io, clone); 1146 + } 1147 + 1148 + __dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk), 1149 + tio->old_sector); 1150 + } 1151 + EXPORT_SYMBOL_GPL(dm_submit_bio_remap); 1199 1152 1200 1153 static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) 1201 1154 { ··· 1267 1120 { 1268 1121 struct dm_target_io *tio = clone_to_tio(clone); 1269 1122 int r; 1270 - sector_t sector; 1271 1123 struct dm_io *io = tio->io; 1272 1124 struct dm_target *ti = tio->ti; 1273 1125 1274 1126 clone->bi_end_io = clone_endio; 1275 1127 1276 1128 /* 1277 - * Map the clone. If r == 0 we don't need to do 1278 - * anything, the target has assumed ownership of 1279 - * this io. 1129 + * Map the clone. 
1280 1130 */ 1281 1131 dm_io_inc_pending(io); 1282 - sector = clone->bi_iter.bi_sector; 1132 + tio->old_sector = clone->bi_iter.bi_sector; 1283 1133 1284 1134 if (unlikely(swap_bios_limit(ti, clone))) { 1285 1135 struct mapped_device *md = io->md; ··· 1298 1154 1299 1155 switch (r) { 1300 1156 case DM_MAPIO_SUBMITTED: 1157 + /* target has assumed ownership of this io */ 1158 + if (!ti->accounts_remapped_io) 1159 + dm_io_set_flag(io, DM_IO_START_ACCT); 1301 1160 break; 1302 1161 case DM_MAPIO_REMAPPED: 1303 - /* the bio has been remapped so dispatch it */ 1304 - trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector); 1305 - submit_bio_noacct(clone); 1162 + /* 1163 + * the bio has been remapped so dispatch it, but defer 1164 + * dm_start_io_acct() until after possible bio_split(). 1165 + */ 1166 + __dm_submit_bio_remap(clone, disk_devt(io->md->disk), 1167 + tio->old_sector); 1168 + dm_io_set_flag(io, DM_IO_START_ACCT); 1306 1169 break; 1307 1170 case DM_MAPIO_KILL: 1308 - if (unlikely(swap_bios_limit(ti, clone))) { 1309 - struct mapped_device *md = io->md; 1310 - up(&md->swap_bios_semaphore); 1311 - } 1312 - free_tio(clone); 1313 - dm_io_dec_pending(io, BLK_STS_IOERR); 1314 - break; 1315 1171 case DM_MAPIO_REQUEUE: 1316 - if (unlikely(swap_bios_limit(ti, clone))) { 1317 - struct mapped_device *md = io->md; 1318 - up(&md->swap_bios_semaphore); 1319 - } 1172 + if (unlikely(swap_bios_limit(ti, clone))) 1173 + up(&io->md->swap_bios_semaphore); 1320 1174 free_tio(clone); 1321 - dm_io_dec_pending(io, BLK_STS_DM_REQUEUE); 1175 + if (r == DM_MAPIO_KILL) 1176 + dm_io_dec_pending(io, BLK_STS_IOERR); 1177 + else 1178 + dm_io_dec_pending(io, BLK_STS_DM_REQUEUE); 1322 1179 break; 1323 1180 default: 1324 1181 DMWARN("unimplemented target map return value: %d", r); 1325 1182 BUG(); 1326 1183 } 1327 - } 1328 - 1329 - static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) 1330 - { 1331 - bio->bi_iter.bi_sector = sector; 1332 - bio->bi_iter.bi_size = 
to_bytes(len); 1333 - } 1334 - 1335 - /* 1336 - * Creates a bio that consists of range of complete bvecs. 1337 - */ 1338 - static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, 1339 - sector_t sector, unsigned *len) 1340 - { 1341 - struct bio *bio = ci->bio, *clone; 1342 - 1343 - clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); 1344 - bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); 1345 - clone->bi_iter.bi_size = to_bytes(*len); 1346 - 1347 - if (bio_integrity(bio)) 1348 - bio_integrity_trim(clone); 1349 - 1350 - __map_bio(clone); 1351 - return 0; 1352 1184 } 1353 1185 1354 1186 static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, ··· 1368 1248 break; 1369 1249 case 1: 1370 1250 clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); 1371 - if (len) 1372 - bio_setup_sector(clone, ci->sector, *len); 1251 + dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); 1373 1252 __map_bio(clone); 1374 1253 break; 1375 1254 default: 1376 1255 alloc_multiple_bios(&blist, ci, ti, num_bios, len); 1377 1256 while ((clone = bio_list_pop(&blist))) { 1378 - if (len) 1379 - bio_setup_sector(clone, ci->sector, *len); 1257 + dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); 1380 1258 __map_bio(clone); 1381 1259 } 1382 1260 break; 1383 1261 } 1384 1262 } 1385 1263 1386 - static int __send_empty_flush(struct clone_info *ci) 1264 + static void __send_empty_flush(struct clone_info *ci) 1387 1265 { 1388 1266 unsigned target_nr = 0; 1389 1267 struct dm_target *ti; ··· 1398 1280 ci->bio = &flush_bio; 1399 1281 ci->sector_count = 0; 1400 1282 1401 - BUG_ON(bio_has_data(ci->bio)); 1402 1283 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1403 1284 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); 1404 1285 1405 1286 bio_uninit(ci->bio); 1406 - return 0; 1407 1287 } 1408 1288 1409 - static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, 1410 - unsigned num_bios) 1289 + static 
void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, 1290 + unsigned num_bios) 1411 1291 { 1412 1292 unsigned len; 1413 - 1414 - /* 1415 - * Even though the device advertised support for this type of 1416 - * request, that does not mean every target supports it, and 1417 - * reconfiguration might also have changed that since the 1418 - * check was performed. 1419 - */ 1420 - if (!num_bios) 1421 - return -EOPNOTSUPP; 1422 1293 1423 1294 len = min_t(sector_t, ci->sector_count, 1424 1295 max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); 1425 1296 1426 - __send_duplicate_bios(ci, ti, num_bios, &len); 1427 - 1297 + /* 1298 + * dm_accept_partial_bio cannot be used with duplicate bios, 1299 + * so update clone_info cursor before __send_duplicate_bios(). 1300 + */ 1428 1301 ci->sector += len; 1429 1302 ci->sector_count -= len; 1430 1303 1431 - return 0; 1304 + __send_duplicate_bios(ci, ti, num_bios, &len); 1432 1305 } 1433 1306 1434 1307 static bool is_abnormal_io(struct bio *bio) ··· 1441 1332 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, 1442 1333 int *result) 1443 1334 { 1444 - struct bio *bio = ci->bio; 1445 1335 unsigned num_bios = 0; 1446 1336 1447 - switch (bio_op(bio)) { 1337 + switch (bio_op(ci->bio)) { 1448 1338 case REQ_OP_DISCARD: 1449 1339 num_bios = ti->num_discard_bios; 1450 1340 break; ··· 1460 1352 return false; 1461 1353 } 1462 1354 1463 - *result = __send_changing_extent_only(ci, ti, num_bios); 1355 + /* 1356 + * Even though the device advertised support for this type of 1357 + * request, that does not mean every target supports it, and 1358 + * reconfiguration might also have changed that since the 1359 + * check was performed. 
1360 + */ 1361 + if (!num_bios) 1362 + *result = -EOPNOTSUPP; 1363 + else { 1364 + __send_changing_extent_only(ci, ti, num_bios); 1365 + *result = 0; 1366 + } 1464 1367 return true; 1368 + } 1369 + 1370 + /* 1371 + * Reuse ->bi_private as hlist head for storing all dm_io instances 1372 + * associated with this bio, and this bio's bi_private needs to be 1373 + * stored in dm_io->data before the reuse. 1374 + * 1375 + * bio->bi_private is owned by fs or upper layer, so block layer won't 1376 + * touch it after splitting. Meantime it won't be changed by anyone after 1377 + * bio is submitted. So this reuse is safe. 1378 + */ 1379 + static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio) 1380 + { 1381 + return (struct hlist_head *)&bio->bi_private; 1382 + } 1383 + 1384 + static void dm_queue_poll_io(struct bio *bio, struct dm_io *io) 1385 + { 1386 + struct hlist_head *head = dm_get_bio_hlist_head(bio); 1387 + 1388 + if (!(bio->bi_opf & REQ_DM_POLL_LIST)) { 1389 + bio->bi_opf |= REQ_DM_POLL_LIST; 1390 + /* 1391 + * Save .bi_private into dm_io, so that we can reuse 1392 + * .bi_private as hlist head for storing dm_io list 1393 + */ 1394 + io->data = bio->bi_private; 1395 + 1396 + INIT_HLIST_HEAD(head); 1397 + 1398 + /* tell block layer to poll for completion */ 1399 + bio->bi_cookie = ~BLK_QC_T_NONE; 1400 + } else { 1401 + /* 1402 + * bio recursed due to split, reuse original poll list, 1403 + * and save bio->bi_private too. 1404 + */ 1405 + io->data = hlist_entry(head->first, struct dm_io, node)->data; 1406 + } 1407 + 1408 + hlist_add_head(&io->node, head); 1465 1409 } 1466 1410 1467 1411 /* 1468 1412 * Select the correct strategy for processing a non-flush bio. 
1469 1413 */ 1470 - static int __split_and_process_non_flush(struct clone_info *ci) 1414 + static int __split_and_process_bio(struct clone_info *ci) 1471 1415 { 1416 + struct bio *clone; 1472 1417 struct dm_target *ti; 1473 1418 unsigned len; 1474 1419 int r; ··· 1533 1372 if (__process_abnormal_io(ci, ti, &r)) 1534 1373 return r; 1535 1374 1536 - len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); 1375 + /* 1376 + * Only support bio polling for normal IO, and the target io is 1377 + * exactly inside the dm_io instance (verified in dm_poll_dm_io) 1378 + */ 1379 + ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED; 1537 1380 1538 - r = __clone_and_map_data_bio(ci, ti, ci->sector, &len); 1539 - if (r < 0) 1540 - return r; 1381 + len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); 1382 + clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO); 1383 + __map_bio(clone); 1541 1384 1542 1385 ci->sector += len; 1543 1386 ci->sector_count -= len; ··· 1554 1389 { 1555 1390 ci->map = map; 1556 1391 ci->io = alloc_io(md, bio); 1392 + ci->bio = bio; 1393 + ci->submit_as_polled = false; 1557 1394 ci->sector = bio->bi_iter.bi_sector; 1395 + ci->sector_count = bio_sectors(bio); 1396 + 1397 + /* Shouldn't happen but sector_count was being set to 0 so... */ 1398 + if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) 1399 + ci->sector_count = 0; 1558 1400 } 1559 1401 1560 1402 /* 1561 1403 * Entry point to split a bio into clones and submit them to the targets. 
1562 1404 */ 1563 - static void __split_and_process_bio(struct mapped_device *md, 1564 - struct dm_table *map, struct bio *bio) 1405 + static void dm_split_and_process_bio(struct mapped_device *md, 1406 + struct dm_table *map, struct bio *bio) 1565 1407 { 1566 1408 struct clone_info ci; 1409 + struct bio *orig_bio = NULL; 1567 1410 int error = 0; 1568 1411 1569 1412 init_clone_info(&ci, md, map, bio); 1570 1413 1571 1414 if (bio->bi_opf & REQ_PREFLUSH) { 1572 - error = __send_empty_flush(&ci); 1573 - /* dm_io_dec_pending submits any data associated with flush */ 1574 - } else if (op_is_zone_mgmt(bio_op(bio))) { 1575 - ci.bio = bio; 1576 - ci.sector_count = 0; 1577 - error = __split_and_process_non_flush(&ci); 1578 - } else { 1579 - ci.bio = bio; 1580 - ci.sector_count = bio_sectors(bio); 1581 - error = __split_and_process_non_flush(&ci); 1582 - if (ci.sector_count && !error) { 1583 - /* 1584 - * Remainder must be passed to submit_bio_noacct() 1585 - * so that it gets handled *after* bios already submitted 1586 - * have been completely processed. 1587 - * We take a clone of the original to store in 1588 - * ci.io->orig_bio to be used by end_io_acct() and 1589 - * for dec_pending to use for completion handling. 
1590 - */ 1591 - struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count, 1592 - GFP_NOIO, &md->queue->bio_split); 1593 - ci.io->orig_bio = b; 1594 - 1595 - bio_chain(b, bio); 1596 - trace_block_split(b, bio->bi_iter.bi_sector); 1597 - submit_bio_noacct(bio); 1598 - } 1415 + __send_empty_flush(&ci); 1416 + /* dm_io_complete submits any data associated with flush */ 1417 + goto out; 1599 1418 } 1600 - start_io_acct(ci.io); 1601 1419 1602 - /* drop the extra reference count */ 1603 - dm_io_dec_pending(ci.io, errno_to_blk_status(error)); 1420 + error = __split_and_process_bio(&ci); 1421 + ci.io->map_task = NULL; 1422 + if (error || !ci.sector_count) 1423 + goto out; 1424 + 1425 + /* 1426 + * Remainder must be passed to submit_bio_noacct() so it gets handled 1427 + * *after* bios already submitted have been completely processed. 1428 + * We take a clone of the original to store in ci.io->orig_bio to be 1429 + * used by dm_end_io_acct() and for dm_io_complete() to use for 1430 + * completion handling. 1431 + */ 1432 + orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count, 1433 + GFP_NOIO, &md->queue->bio_split); 1434 + bio_chain(orig_bio, bio); 1435 + trace_block_split(orig_bio, bio->bi_iter.bi_sector); 1436 + submit_bio_noacct(bio); 1437 + out: 1438 + if (!orig_bio) 1439 + orig_bio = bio; 1440 + smp_store_release(&ci.io->orig_bio, orig_bio); 1441 + if (dm_io_flagged(ci.io, DM_IO_START_ACCT)) 1442 + dm_start_io_acct(ci.io, NULL); 1443 + 1444 + /* 1445 + * Drop the extra reference count for non-POLLED bio, and hold one 1446 + * reference for POLLED bio, which will be released in dm_poll_bio 1447 + * 1448 + * Add every dm_io instance into the hlist_head which is stored in 1449 + * bio->bi_private, so that dm_poll_bio can poll them all. 
1450 + */ 1451 + if (error || !ci.submit_as_polled) 1452 + dm_io_dec_pending(ci.io, errno_to_blk_status(error)); 1453 + else 1454 + dm_queue_poll_io(bio, ci.io); 1604 1455 } 1605 1456 1606 1457 static void dm_submit_bio(struct bio *bio) ··· 1626 1445 struct dm_table *map; 1627 1446 1628 1447 map = dm_get_live_table(md, &srcu_idx); 1629 - if (unlikely(!map)) { 1630 - DMERR_LIMIT("%s: mapping table unavailable, erroring io", 1631 - dm_device_name(md)); 1632 - bio_io_error(bio); 1633 - goto out; 1634 - } 1635 1448 1636 - /* If suspended, queue this IO for later */ 1637 - if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1449 + /* If suspended, or map not yet available, queue this IO for later */ 1450 + if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) || 1451 + unlikely(!map)) { 1638 1452 if (bio->bi_opf & REQ_NOWAIT) 1639 1453 bio_wouldblock_error(bio); 1640 1454 else if (bio->bi_opf & REQ_RAHEAD) ··· 1646 1470 if (is_abnormal_io(bio)) 1647 1471 blk_queue_split(&bio); 1648 1472 1649 - __split_and_process_bio(md, map, bio); 1473 + dm_split_and_process_bio(md, map, bio); 1650 1474 out: 1651 1475 dm_put_live_table(md, srcu_idx); 1476 + } 1477 + 1478 + static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, 1479 + unsigned int flags) 1480 + { 1481 + WARN_ON_ONCE(!dm_tio_is_normal(&io->tio)); 1482 + 1483 + /* don't poll if the mapped io is done */ 1484 + if (atomic_read(&io->io_count) > 1) 1485 + bio_poll(&io->tio.clone, iob, flags); 1486 + 1487 + /* bio_poll holds the last reference */ 1488 + return atomic_read(&io->io_count) == 1; 1489 + } 1490 + 1491 + static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, 1492 + unsigned int flags) 1493 + { 1494 + struct hlist_head *head = dm_get_bio_hlist_head(bio); 1495 + struct hlist_head tmp = HLIST_HEAD_INIT; 1496 + struct hlist_node *next; 1497 + struct dm_io *io; 1498 + 1499 + /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */ 1500 + if (!(bio->bi_opf & 
REQ_DM_POLL_LIST)) 1501 + return 0; 1502 + 1503 + WARN_ON_ONCE(hlist_empty(head)); 1504 + 1505 + hlist_move_list(head, &tmp); 1506 + 1507 + /* 1508 + * Restore .bi_private before possibly completing dm_io. 1509 + * 1510 + * bio_poll() is only possible once @bio has been completely 1511 + * submitted via submit_bio_noacct()'s depth-first submission. 1512 + * So there is no dm_queue_poll_io() race associated with 1513 + * clearing REQ_DM_POLL_LIST here. 1514 + */ 1515 + bio->bi_opf &= ~REQ_DM_POLL_LIST; 1516 + bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data; 1517 + 1518 + hlist_for_each_entry_safe(io, next, &tmp, node) { 1519 + if (dm_poll_dm_io(io, iob, flags)) { 1520 + hlist_del_init(&io->node); 1521 + /* 1522 + * clone_endio() has already occurred, so passing 1523 + * error as 0 here doesn't override io->status 1524 + */ 1525 + dm_io_dec_pending(io, 0); 1526 + } 1527 + } 1528 + 1529 + /* Not done? */ 1530 + if (!hlist_empty(&tmp)) { 1531 + bio->bi_opf |= REQ_DM_POLL_LIST; 1532 + /* Reset bio->bi_private to dm_io list head */ 1533 + hlist_move_list(&tmp, head); 1534 + return 0; 1535 + } 1536 + return 1; 1652 1537 } 1653 1538 1654 1539 /*----------------------------------------------------------------- ··· 1794 1557 md->dax_dev = NULL; 1795 1558 } 1796 1559 1560 + dm_cleanup_zoned_dev(md); 1797 1561 if (md->disk) { 1798 1562 spin_lock(&_minor_lock); 1799 1563 md->disk->private_data = NULL; ··· 1807 1569 blk_cleanup_disk(md->disk); 1808 1570 } 1809 1571 1572 + if (md->pending_io) { 1573 + free_percpu(md->pending_io); 1574 + md->pending_io = NULL; 1575 + } 1576 + 1810 1577 cleanup_srcu_struct(&md->io_barrier); 1811 1578 1812 1579 mutex_destroy(&md->suspend_lock); ··· 1820 1577 mutex_destroy(&md->swap_bios_lock); 1821 1578 1822 1579 dm_mq_cleanup_mapped_device(md); 1823 - dm_cleanup_zoned_dev(md); 1824 1580 } 1825 1581 1826 1582 /* ··· 1911 1669 1912 1670 md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name); 1913 1671 if (!md->wq) 
1672 + goto bad; 1673 + 1674 + md->pending_io = alloc_percpu(unsigned long); 1675 + if (!md->pending_io) 1914 1676 goto bad; 1915 1677 1916 1678 dm_stats_init(&md->stats); ··· 2026 1780 struct queue_limits *limits) 2027 1781 { 2028 1782 struct dm_table *old_map; 2029 - struct request_queue *q = md->queue; 2030 - bool request_based = dm_table_request_based(t); 2031 1783 sector_t size; 2032 1784 int ret; 2033 1785 ··· 2046 1802 2047 1803 dm_table_event_callback(t, event_callback, md); 2048 1804 2049 - if (request_based) { 1805 + if (dm_table_request_based(t)) { 2050 1806 /* 2051 1807 * Leverage the fact that request-based DM targets are 2052 1808 * immutable singletons - used to optimize dm_mq_queue_rq. ··· 2060 1816 goto out; 2061 1817 } 2062 1818 2063 - ret = dm_table_set_restrictions(t, q, limits); 1819 + ret = dm_table_set_restrictions(t, md->queue, limits); 2064 1820 if (ret) { 2065 1821 old_map = ERR_PTR(ret); 2066 1822 goto out; ··· 2072 1828 2073 1829 if (old_map) 2074 1830 dm_sync_table(md); 2075 - 2076 1831 out: 2077 1832 return old_map; 2078 1833 } ··· 2321 2078 } 2322 2079 EXPORT_SYMBOL_GPL(dm_put); 2323 2080 2324 - static bool md_in_flight_bios(struct mapped_device *md) 2081 + static bool dm_in_flight_bios(struct mapped_device *md) 2325 2082 { 2326 2083 int cpu; 2327 - struct block_device *part = dm_disk(md)->part0; 2328 - long sum = 0; 2084 + unsigned long sum = 0; 2329 2085 2330 - for_each_possible_cpu(cpu) { 2331 - sum += part_stat_local_read_cpu(part, in_flight[0], cpu); 2332 - sum += part_stat_local_read_cpu(part, in_flight[1], cpu); 2333 - } 2086 + for_each_possible_cpu(cpu) 2087 + sum += *per_cpu_ptr(md->pending_io, cpu); 2334 2088 2335 2089 return sum != 0; 2336 2090 } ··· 2340 2100 while (true) { 2341 2101 prepare_to_wait(&md->wait, &wait, task_state); 2342 2102 2343 - if (!md_in_flight_bios(md)) 2103 + if (!dm_in_flight_bios(md)) 2344 2104 break; 2345 2105 2346 2106 if (signal_pending_state(task_state, current)) { ··· 2351 2111 io_schedule(); 
2352 2112 } 2353 2113 finish_wait(&md->wait, &wait); 2114 + 2115 + smp_rmb(); 2354 2116 2355 2117 return r; 2356 2118 } ··· 2525 2283 /* 2526 2284 * Here we must make sure that no processes are submitting requests 2527 2285 * to target drivers i.e. no one may be executing 2528 - * __split_and_process_bio from dm_submit_bio. 2286 + * dm_split_and_process_bio from dm_submit_bio. 2529 2287 * 2530 - * To get all processes out of __split_and_process_bio in dm_submit_bio, 2288 + * To get all processes out of dm_split_and_process_bio in dm_submit_bio, 2531 2289 * we take the write lock. To prevent any process from reentering 2532 - * __split_and_process_bio from dm_submit_bio and quiesce the thread 2290 + * dm_split_and_process_bio from dm_submit_bio and quiesce the thread 2533 2291 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call 2534 2292 * flush_workqueue(md->wq). 2535 2293 */ ··· 3137 2895 3138 2896 static const struct block_device_operations dm_blk_dops = { 3139 2897 .submit_bio = dm_submit_bio, 2898 + .poll_bio = dm_poll_bio, 3140 2899 .open = dm_blk_open, 3141 2900 .release = dm_blk_close, 3142 2901 .ioctl = dm_blk_ioctl,
+2
include/linux/blkdev.h
··· 1457 1457 1458 1458 struct block_device_operations { 1459 1459 void (*submit_bio)(struct bio *bio); 1460 + int (*poll_bio)(struct bio *bio, struct io_comp_batch *iob, 1461 + unsigned int flags); 1460 1462 int (*open) (struct block_device *, fmode_t); 1461 1463 void (*release) (struct gendisk *, fmode_t); 1462 1464 int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);
+8 -1
include/linux/device-mapper.h
··· 358 358 bool limit_swap_bios:1; 359 359 360 360 /* 361 - * Set if this target implements a a zoned device and needs emulation of 361 + * Set if this target implements a zoned device and needs emulation of 362 362 * zone append operations using regular writes. 363 363 */ 364 364 bool emulate_zone_append:1; 365 + 366 + /* 367 + * Set if the target will submit IO using dm_submit_bio_remap() 368 + * after returning DM_MAPIO_SUBMITTED from its map function. 369 + */ 370 + bool accounts_remapped_io:1; 365 371 }; 366 372 367 373 void *dm_per_bio_data(struct bio *bio, size_t data_size); ··· 471 465 int dm_post_suspending(struct dm_target *ti); 472 466 int dm_noflush_suspending(struct dm_target *ti); 473 467 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors); 468 + void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone); 474 469 union map_info *dm_get_rq_mapinfo(struct request *rq); 475 470 476 471 #ifdef CONFIG_BLK_DEV_ZONED
+2 -2
include/uapi/linux/dm-ioctl.h
··· 286 286 #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) 287 287 288 288 #define DM_VERSION_MAJOR 4 289 - #define DM_VERSION_MINOR 45 289 + #define DM_VERSION_MINOR 46 290 290 #define DM_VERSION_PATCHLEVEL 0 291 - #define DM_VERSION_EXTRA "-ioctl (2021-03-22)" 291 + #define DM_VERSION_EXTRA "-ioctl (2022-02-22)" 292 292 293 293 /* Status bits */ 294 294 #define DM_READONLY_FLAG (1 << 0) /* In/Out */