Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm: allocate requests in target when stacking on blk-mq devices

For blk-mq request-based DM the responsibility of allocating a cloned
request is transferred from DM core to the target type. Doing so
enables the cloned request to be allocated from the appropriate
blk-mq request_queue's pool (only the DM target, e.g. multipath, can
know which block device to send a given cloned request to).

Care was taken to preserve compatibility with old-style block request
completion that requires request-based DM _not_ acquire the clone
request's queue lock in the completion path. As such, there are now 2
different request-based DM target_type interfaces:
1) the original .map_rq() interface will continue to be used for
non-blk-mq devices -- the preallocated clone request is passed in
from DM core.
2) a new .clone_and_map_rq() and .release_clone_rq() will be used for
blk-mq devices -- blk_get_request() and blk_put_request() are used
respectively from these hooks.

dm_table_set_type() was updated to detect if the request-based target is
being stacked on blk-mq devices, if so DM_TYPE_MQ_REQUEST_BASED is set.
DM core disallows switching the DM table's type after it is set. This
means that there is no mixing of non-blk-mq and blk-mq devices within
the same request-based DM table.

[This patch was started by Keith and later heavily modified by Mike]

Tested-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

+185 -48
+43 -8
drivers/md/dm-mpath.c
··· 11 11 #include "dm-path-selector.h" 12 12 #include "dm-uevent.h" 13 13 14 + #include <linux/blkdev.h> 14 15 #include <linux/ctype.h> 15 16 #include <linux/init.h> 16 17 #include <linux/mempool.h> ··· 379 378 /* 380 379 * Map cloned requests 381 380 */ 382 - static int multipath_map(struct dm_target *ti, struct request *clone, 383 - union map_info *map_context) 381 + static int __multipath_map(struct dm_target *ti, struct request *clone, 382 + union map_info *map_context, 383 + struct request *rq, struct request **__clone) 384 384 { 385 385 struct multipath *m = (struct multipath *) ti->private; 386 386 int r = DM_MAPIO_REQUEUE; 387 - size_t nr_bytes = blk_rq_bytes(clone); 387 + size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq); 388 388 struct pgpath *pgpath; 389 389 struct block_device *bdev; 390 390 struct dm_mpath_io *mpio; ··· 418 416 419 417 bdev = pgpath->path.dev->bdev; 420 418 421 - clone->q = bdev_get_queue(bdev); 422 - clone->rq_disk = bdev->bd_disk; 423 - clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; 424 - 425 419 spin_unlock_irq(&m->lock); 420 + 421 + if (clone) { 422 + /* Old request-based interface: allocated clone is passed in */ 423 + clone->q = bdev_get_queue(bdev); 424 + clone->rq_disk = bdev->bd_disk; 425 + clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; 426 + } else { 427 + /* blk-mq request-based interface */ 428 + *__clone = blk_get_request(bdev_get_queue(bdev), 429 + rq_data_dir(rq), GFP_KERNEL); 430 + if (IS_ERR(*__clone)) 431 + /* ENOMEM, requeue */ 432 + return r; 433 + (*__clone)->bio = (*__clone)->biotail = NULL; 434 + (*__clone)->rq_disk = bdev->bd_disk; 435 + (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT; 436 + } 426 437 427 438 if (pgpath->pg->ps.type->start_io) 428 439 pgpath->pg->ps.type->start_io(&pgpath->pg->ps, ··· 447 432 spin_unlock_irq(&m->lock); 448 433 449 434 return r; 435 + } 436 + 437 + static int multipath_map(struct dm_target *ti, struct request *clone, 438 + union map_info *map_context) 439 + { 440 + 
return __multipath_map(ti, clone, map_context, NULL, NULL); 441 + } 442 + 443 + static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, 444 + union map_info *map_context, 445 + struct request **clone) 446 + { 447 + return __multipath_map(ti, NULL, map_context, rq, clone); 448 + } 449 + 450 + static void multipath_release_clone(struct request *clone) 451 + { 452 + blk_put_request(clone); 450 453 } 451 454 452 455 /* ··· 1703 1670 *---------------------------------------------------------------*/ 1704 1671 static struct target_type multipath_target = { 1705 1672 .name = "multipath", 1706 - .version = {1, 7, 0}, 1673 + .version = {1, 8, 0}, 1707 1674 .module = THIS_MODULE, 1708 1675 .ctr = multipath_ctr, 1709 1676 .dtr = multipath_dtr, 1710 1677 .map_rq = multipath_map, 1678 + .clone_and_map_rq = multipath_clone_and_map, 1679 + .release_clone_rq = multipath_release_clone, 1711 1680 .rq_end_io = multipath_end_io, 1712 1681 .presuspend = multipath_presuspend, 1713 1682 .postsuspend = multipath_postsuspend,
+29 -5
drivers/md/dm-table.c
··· 827 827 { 828 828 unsigned i; 829 829 unsigned bio_based = 0, request_based = 0, hybrid = 0; 830 + bool use_blk_mq = false; 830 831 struct dm_target *tgt; 831 832 struct dm_dev_internal *dd; 832 833 struct list_head *devices; ··· 873 872 /* Non-request-stackable devices can't be used for request-based dm */ 874 873 devices = dm_table_get_devices(t); 875 874 list_for_each_entry(dd, devices, list) { 876 - if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev->bdev))) { 877 - DMWARN("table load rejected: including" 878 - " non-request-stackable devices"); 875 + struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); 876 + 877 + if (!blk_queue_stackable(q)) { 878 + DMERR("table load rejected: including" 879 + " non-request-stackable devices"); 879 880 return -EINVAL; 880 881 } 882 + 883 + if (q->mq_ops) 884 + use_blk_mq = true; 885 + } 886 + 887 + if (use_blk_mq) { 888 + /* verify _all_ devices in the table are blk-mq devices */ 889 + list_for_each_entry(dd, devices, list) 890 + if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) { 891 + DMERR("table load rejected: not all devices" 892 + " are blk-mq request-stackable"); 893 + return -EINVAL; 894 + } 881 895 } 882 896 883 897 /* ··· 906 890 return -EINVAL; 907 891 } 908 892 909 - t->type = DM_TYPE_REQUEST_BASED; 893 + t->type = !use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED; 910 894 911 895 return 0; 912 896 } ··· 923 907 924 908 bool dm_table_request_based(struct dm_table *t) 925 909 { 926 - return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; 910 + unsigned table_type = dm_table_get_type(t); 911 + 912 + return (table_type == DM_TYPE_REQUEST_BASED || 913 + table_type == DM_TYPE_MQ_REQUEST_BASED); 914 + } 915 + 916 + bool dm_table_mq_request_based(struct dm_table *t) 917 + { 918 + return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED; 927 919 } 928 920 929 921 static int dm_table_alloc_md_mempools(struct dm_table *t)
+14 -1
drivers/md/dm-target.c
··· 137 137 return -EIO; 138 138 } 139 139 140 + static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq, 141 + union map_info *map_context, 142 + struct request **clone) 143 + { 144 + return -EIO; 145 + } 146 + 147 + static void io_err_release_clone_rq(struct request *clone) 148 + { 149 + } 150 + 140 151 static struct target_type error_target = { 141 152 .name = "error", 142 - .version = {1, 2, 0}, 153 + .version = {1, 3, 0}, 143 154 .ctr = io_err_ctr, 144 155 .dtr = io_err_dtr, 145 156 .map = io_err_map, 146 157 .map_rq = io_err_map_rq, 158 + .clone_and_map_rq = io_err_clone_and_map_rq, 159 + .release_clone_rq = io_err_release_clone_rq, 147 160 }; 148 161 149 162 int __init dm_target_init(void)
+85 -29
drivers/md/dm.c
··· 1044 1044 struct dm_rq_target_io *tio = clone->end_io_data; 1045 1045 1046 1046 blk_rq_unprep_clone(clone); 1047 - free_clone_request(tio->md, clone); 1047 + if (clone->q && clone->q->mq_ops) 1048 + tio->ti->type->release_clone_rq(clone); 1049 + else 1050 + free_clone_request(tio->md, clone); 1048 1051 free_rq_tio(tio); 1049 1052 } 1050 1053 ··· 1089 1086 rq->special = NULL; 1090 1087 rq->cmd_flags &= ~REQ_DONTPREP; 1091 1088 1092 - free_rq_clone(clone); 1089 + if (clone) 1090 + free_rq_clone(clone); 1093 1091 } 1094 1092 1095 1093 /* ··· 1189 1185 struct dm_rq_target_io *tio = rq->special; 1190 1186 struct request *clone = tio->clone; 1191 1187 1188 + if (!clone) { 1189 + blk_end_request_all(rq, tio->error); 1190 + rq_completed(tio->md, rq_data_dir(rq), false); 1191 + free_rq_tio(tio); 1192 + return; 1193 + } 1194 + 1192 1195 if (rq->cmd_flags & REQ_FAILED) 1193 1196 mapped = false; 1194 1197 ··· 1218 1207 * Complete the not-mapped clone and the original request with the error status 1219 1208 * through softirq context. 1220 1209 * Target's rq_end_io() function isn't called. 1221 - * This may be used when the target's map_rq() function fails. 1210 + * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. 1222 1211 */ 1223 1212 static void dm_kill_unmapped_request(struct request *rq, int error) 1224 1213 { ··· 1233 1222 { 1234 1223 struct dm_rq_target_io *tio = clone->end_io_data; 1235 1224 1236 - /* 1237 - * For just cleaning up the information of the queue in which 1238 - * the clone was dispatched. 1239 - * The clone is *NOT* freed actually here because it is alloced from 1240 - * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. 1241 - */ 1242 - __blk_put_request(clone->q, clone); 1225 + if (!clone->q->mq_ops) { 1226 + /* 1227 + * For just cleaning up the information of the queue in which 1228 + * the clone was dispatched. 
1229 + * The clone is *NOT* freed actually here because it is alloced 1230 + * from dm own mempool (REQ_ALLOCED isn't set). 1231 + */ 1232 + __blk_put_request(clone->q, clone); 1233 + } 1243 1234 1244 1235 /* 1245 1236 * Actual request completion is done in a softirq context which doesn't ··· 1802 1789 struct mapped_device *md, gfp_t gfp_mask) 1803 1790 { 1804 1791 struct dm_rq_target_io *tio; 1792 + int srcu_idx; 1793 + struct dm_table *table; 1805 1794 1806 1795 tio = alloc_rq_tio(md, gfp_mask); 1807 1796 if (!tio) ··· 1817 1802 memset(&tio->info, 0, sizeof(tio->info)); 1818 1803 init_kthread_work(&tio->work, map_tio_request); 1819 1804 1820 - if (!clone_rq(rq, md, tio, gfp_mask)) { 1821 - free_rq_tio(tio); 1822 - return NULL; 1805 + table = dm_get_live_table(md, &srcu_idx); 1806 + if (!dm_table_mq_request_based(table)) { 1807 + if (!clone_rq(rq, md, tio, gfp_mask)) { 1808 + dm_put_live_table(md, srcu_idx); 1809 + free_rq_tio(tio); 1810 + return NULL; 1811 + } 1823 1812 } 1813 + dm_put_live_table(md, srcu_idx); 1824 1814 1825 1815 return tio; 1826 1816 } ··· 1855 1835 1856 1836 /* 1857 1837 * Returns: 1858 - * 0 : the request has been processed (not requeued) 1859 - * !0 : the request has been requeued 1838 + * 0 : the request has been processed 1839 + * DM_MAPIO_REQUEUE : the original request needs to be requeued 1840 + * < 0 : the request was completed due to failure 1860 1841 */ 1861 1842 static int map_request(struct dm_target *ti, struct request *rq, 1862 1843 struct mapped_device *md) 1863 1844 { 1864 - int r, requeued = 0; 1845 + int r; 1865 1846 struct dm_rq_target_io *tio = rq->special; 1866 - struct request *clone = tio->clone; 1847 + struct request *clone = NULL; 1867 1848 1868 - r = ti->type->map_rq(ti, clone, &tio->info); 1849 + if (tio->clone) { 1850 + clone = tio->clone; 1851 + r = ti->type->map_rq(ti, clone, &tio->info); 1852 + } else { 1853 + r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); 1854 + if (r < 0) { 1855 + /* The target 
wants to complete the I/O */ 1856 + dm_kill_unmapped_request(rq, r); 1857 + return r; 1858 + } 1859 + if (IS_ERR(clone)) 1860 + return DM_MAPIO_REQUEUE; 1861 + if (setup_clone(clone, rq, tio, GFP_KERNEL)) { 1862 + /* -ENOMEM */ 1863 + ti->type->release_clone_rq(clone); 1864 + return DM_MAPIO_REQUEUE; 1865 + } 1866 + } 1867 + 1869 1868 switch (r) { 1870 1869 case DM_MAPIO_SUBMITTED: 1871 1870 /* The target has taken the I/O to submit by itself later */ ··· 1898 1859 case DM_MAPIO_REQUEUE: 1899 1860 /* The target wants to requeue the I/O */ 1900 1861 dm_requeue_unmapped_request(clone); 1901 - requeued = 1; 1902 1862 break; 1903 1863 default: 1904 1864 if (r > 0) { ··· 1907 1869 1908 1870 /* The target wants to complete the I/O */ 1909 1871 dm_kill_unmapped_request(rq, r); 1910 - break; 1872 + return r; 1911 1873 } 1912 1874 1913 - return requeued; 1875 + return 0; 1914 1876 } 1915 1877 1916 1878 static void map_tio_request(struct kthread_work *work) 1917 1879 { 1918 1880 struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); 1881 + struct request *rq = tio->orig; 1882 + struct mapped_device *md = tio->md; 1919 1883 1920 - map_request(tio->ti, tio->orig, tio->md); 1884 + if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE) 1885 + dm_requeue_unmapped_original_request(md, rq); 1921 1886 } 1922 1887 1923 1888 static void dm_start_request(struct mapped_device *md, struct request *orig) ··· 2500 2459 return md->type; 2501 2460 } 2502 2461 2462 + static bool dm_md_type_request_based(struct mapped_device *md) 2463 + { 2464 + unsigned table_type = dm_get_md_type(md); 2465 + 2466 + return (table_type == DM_TYPE_REQUEST_BASED || 2467 + table_type == DM_TYPE_MQ_REQUEST_BASED); 2468 + } 2469 + 2503 2470 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2504 2471 { 2505 2472 return md->immutable_target_type; ··· 2560 2511 */ 2561 2512 int dm_setup_md_queue(struct mapped_device *md) 2562 2513 { 2563 - if ((dm_get_md_type(md) == 
DM_TYPE_REQUEST_BASED) && 2564 - !dm_init_request_based_queue(md)) { 2514 + if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) { 2565 2515 DMWARN("Cannot initialize queue for request-based mapped device"); 2566 2516 return -EINVAL; 2567 2517 } ··· 3232 3184 { 3233 3185 struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); 3234 3186 struct kmem_cache *cachep; 3235 - unsigned int pool_size; 3187 + unsigned int pool_size = 0; 3236 3188 unsigned int front_pad; 3237 3189 3238 3190 if (!pools) 3239 3191 return NULL; 3240 3192 3241 - if (type == DM_TYPE_BIO_BASED) { 3193 + switch (type) { 3194 + case DM_TYPE_BIO_BASED: 3242 3195 cachep = _io_cache; 3243 3196 pool_size = dm_get_reserved_bio_based_ios(); 3244 3197 front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); 3245 - } else if (type == DM_TYPE_REQUEST_BASED) { 3246 - cachep = _rq_tio_cache; 3198 + break; 3199 + case DM_TYPE_REQUEST_BASED: 3247 3200 pool_size = dm_get_reserved_rq_based_ios(); 3248 3201 pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); 3249 3202 if (!pools->rq_pool) 3250 3203 goto out; 3204 + /* fall through to setup remaining rq-based pools */ 3205 + case DM_TYPE_MQ_REQUEST_BASED: 3206 + cachep = _rq_tio_cache; 3207 + if (!pool_size) 3208 + pool_size = dm_get_reserved_rq_based_ios(); 3251 3209 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 3252 3210 /* per_bio_data_size is not used. See __bind_mempools(). */ 3253 3211 WARN_ON(per_bio_data_size != 0); 3254 - } else 3212 + break; 3213 + default: 3255 3214 goto out; 3215 + } 3256 3216 3257 3217 pools->io_pool = mempool_create_slab_pool(pool_size, cachep); 3258 3218 if (!pools->io_pool)
+5 -3
drivers/md/dm.h
··· 34 34 /* 35 35 * Type of table and mapped_device's mempool 36 36 */ 37 - #define DM_TYPE_NONE 0 38 - #define DM_TYPE_BIO_BASED 1 39 - #define DM_TYPE_REQUEST_BASED 2 37 + #define DM_TYPE_NONE 0 38 + #define DM_TYPE_BIO_BASED 1 39 + #define DM_TYPE_REQUEST_BASED 2 40 + #define DM_TYPE_MQ_REQUEST_BASED 3 40 41 41 42 /* 42 43 * List of devices that a metadevice uses and should open/close. ··· 74 73 unsigned dm_table_get_type(struct dm_table *t); 75 74 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); 76 75 bool dm_table_request_based(struct dm_table *t); 76 + bool dm_table_mq_request_based(struct dm_table *t); 77 77 void dm_table_free_md_mempools(struct dm_table *t); 78 78 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); 79 79
+7
include/linux/device-mapper.h
··· 48 48 typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio); 49 49 typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone, 50 50 union map_info *map_context); 51 + typedef int (*dm_clone_and_map_request_fn) (struct dm_target *ti, 52 + struct request *rq, 53 + union map_info *map_context, 54 + struct request **clone); 55 + typedef void (*dm_release_clone_request_fn) (struct request *clone); 51 56 52 57 /* 53 58 * Returns: ··· 148 143 dm_dtr_fn dtr; 149 144 dm_map_fn map; 150 145 dm_map_request_fn map_rq; 146 + dm_clone_and_map_request_fn clone_and_map_rq; 147 + dm_release_clone_request_fn release_clone_rq; 151 148 dm_endio_fn end_io; 152 149 dm_request_endio_fn rq_end_io; 153 150 dm_presuspend_fn presuspend;
+2 -2
include/uapi/linux/dm-ioctl.h
··· 267 267 #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) 268 268 269 269 #define DM_VERSION_MAJOR 4 270 - #define DM_VERSION_MINOR 29 270 + #define DM_VERSION_MINOR 30 271 271 #define DM_VERSION_PATCHLEVEL 0 272 - #define DM_VERSION_EXTRA "-ioctl (2014-10-28)" 272 + #define DM_VERSION_EXTRA "-ioctl (2014-12-22)" 273 273 274 274 /* Status bits */ 275 275 #define DM_READONLY_FLAG (1 << 0) /* In/Out */