dm: enable request based option

This patch enables request-based dm.

o Request-based dm and bio-based dm coexist, since some target drivers
  are better suited to bio-based dm.  There are also other bio-based
  devices in the kernel (e.g. md, loop).
  Since a bio-based device can't receive a struct request, there are
  some limitations on stacking bio-based and request-based devices:

                                 type of underlying device
        type of dm device       bio-based     request-based
       ------------------------------------------------------
        bio-based                  OK              OK
        request-based              --              OK

  The kernel recognizes the device type by a queue flag
  (QUEUE_FLAG_STACKABLE, "request stacking supported"), so dm follows
  that convention.
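
  For illustration only (this skeleton is not part of the patch and all
  names are invented): a target driver opts into request-based dm by
  providing a map_rq hook in its target_type, which is exactly what the
  dm_target_request_based() macro added to dm.h below tests.  The
  map_rq prototype shown follows the request-based target hook from the
  preparation patches; treat the details as a sketch.

	#include <linux/module.h>
	#include <linux/blkdev.h>
	#include <linux/device-mapper.h>

	/* Hypothetical request-based target skeleton (not patch code). */
	static int example_rq_ctr(struct dm_target *ti, unsigned int argc,
				  char **argv)
	{
		return 0;	/* would parse table arguments here */
	}

	static void example_rq_dtr(struct dm_target *ti)
	{
	}

	static int example_rq_map(struct dm_target *ti, struct request *clone,
				  union map_info *map_context)
	{
		/* redirect the cloned request to an underlying device */
		return DM_MAPIO_REMAPPED;
	}

	static struct target_type example_rq_target = {
		.name    = "example-rq",
		.version = {1, 0, 0},
		.module  = THIS_MODULE,
		.ctr     = example_rq_ctr,
		.dtr     = example_rq_dtr,
		.map_rq  = example_rq_map,  /* map_rq set => request-based */
	};
	/* Registered with dm_register_target(&example_rq_target). */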

o The type of a dm device is decided when its first table is bound.
  Once the type has been decided, it cannot be changed.
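
  As a minimal sketch of what this means in practice (hypothetical
  helper; the real check is in the dm_swap_table() hunk of dm.c below):
  a new table is accepted only if no table has been bound yet, or if
  its type matches the type already bound.

	/* Hypothetical helper, condensed from dm_swap_table() below;
	 * conceptually it lives in dm.c, where md->map is visible. */
	static bool example_table_type_acceptable(struct mapped_device *md,
						  struct dm_table *new_table)
	{
		/* First binding: dm_table_set_type() has already fixed
		 * the type during table load. */
		if (!md->map)
			return true;

		/* Later bindings must not change the device type. */
		return dm_table_get_type(md->map) ==
		       dm_table_get_type(new_table);
	}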

o Mempool allocations are deferred to table loading time, since the
  mempools for request-based dm differ from those for bio-based dm and
  the required mempool type is determined by the table type.
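
  A condensed, illustrative view of the resulting load-time flow (the
  wrapper below is invented; the real calls are added to dm-ioctl.c and
  dm-table.c in this patch, and __bind_mempools() in dm.c later hands
  the pools over to the mapped_device at bind time):

	/* Hypothetical condensation of the table-load path (not patch code). */
	static int example_prepare_table(struct dm_table *t)
	{
		int r;

		r = dm_table_set_type(t);  /* DM_TYPE_BIO_BASED or _REQUEST_BASED */
		if (r)
			return r;

		/* Allocate io/tio mempools and a bioset sized for that type. */
		return dm_table_alloc_md_mempools(t);
	}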

o Currently, request-based dm supports only tables that have a single
  target.  To support multiple targets, we need either request
  splitting or a way to prevent a bio/request from spanning multiple
  targets.  The former needs lots of changes in the block layer, and
  the latter needs every target driver to support a merge() function.
  Both will take time.
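
  To make the restriction concrete, here is its shape as a hypothetical
  predicate (the actual check is in dm_table_set_type() in dm-table.c
  below):

	/* Illustration only: a table qualifies for request-based dm
	 * only if a request always maps to exactly one target. */
	static bool example_single_target_only(struct dm_table *t)
	{
		return dm_table_get_num_targets(t) == 1;
	}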

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

Authored by Kiyoshi Ueda and committed by Alasdair G Kergon (e6ee8c0b, cec47e3d).

4 files changed, 285 insertions(+), 26 deletions(-)

drivers/md/dm-ioctl.c  (+13)

···
 		next = spec->next;
 	}
 
+	r = dm_table_set_type(table);
+	if (r) {
+		DMWARN("unable to set table type");
+		return r;
+	}
+
 	return dm_table_complete(table);
 }
···
 	if (r) {
 		DMERR("%s: could not register integrity profile.",
 		      dm_device_name(md));
+		dm_table_destroy(t);
+		goto out;
+	}
+
+	r = dm_table_alloc_md_mempools(t);
+	if (r) {
+		DMWARN("unable to allocate mempools for this table");
 		dm_table_destroy(t);
 		goto out;
 	}

drivers/md/dm-table.c  (+111)

···
 struct dm_table {
 	struct mapped_device *md;
 	atomic_t holders;
+	unsigned type;
 
 	/* btree table */
 	unsigned int depth;
···
 	/* events get handed up using this callback */
 	void (*event_fn)(void *);
 	void *event_context;
+
+	struct dm_md_mempools *mempools;
 };
 
 /*
···
 	/* free the device list */
 	if (t->devices.next != &t->devices)
 		free_devices(&t->devices);
+
+	dm_free_md_mempools(t->mempools);
 
 	kfree(t);
 }
···
 	return r;
 }
 
+int dm_table_set_type(struct dm_table *t)
+{
+	unsigned i;
+	unsigned bio_based = 0, request_based = 0;
+	struct dm_target *tgt;
+	struct dm_dev_internal *dd;
+	struct list_head *devices;
+
+	for (i = 0; i < t->num_targets; i++) {
+		tgt = t->targets + i;
+		if (dm_target_request_based(tgt))
+			request_based = 1;
+		else
+			bio_based = 1;
+
+		if (bio_based && request_based) {
+			DMWARN("Inconsistent table: different target types"
+			       " can't be mixed up");
+			return -EINVAL;
+		}
+	}
+
+	if (bio_based) {
+		/* We must use this table as bio-based */
+		t->type = DM_TYPE_BIO_BASED;
+		return 0;
+	}
+
+	BUG_ON(!request_based); /* No targets in this table */
+
+	/* Non-request-stackable devices can't be used for request-based dm */
+	devices = dm_table_get_devices(t);
+	list_for_each_entry(dd, devices, list) {
+		if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) {
+			DMWARN("table load rejected: including"
+			       " non-request-stackable devices");
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * Request-based dm supports only tables that have a single target now.
+	 * To support multiple targets, request splitting support is needed,
+	 * and that needs lots of changes in the block-layer.
+	 * (e.g. request completion process for partial completion.)
+	 */
+	if (t->num_targets > 1) {
+		DMWARN("Request-based dm doesn't support multiple targets yet");
+		return -EINVAL;
+	}
+
+	t->type = DM_TYPE_REQUEST_BASED;
+
+	return 0;
+}
+
+unsigned dm_table_get_type(struct dm_table *t)
+{
+	return t->type;
+}
+
+bool dm_table_request_based(struct dm_table *t)
+{
+	return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
+}
+
+int dm_table_alloc_md_mempools(struct dm_table *t)
+{
+	unsigned type = dm_table_get_type(t);
+
+	if (unlikely(type == DM_TYPE_NONE)) {
+		DMWARN("no table type is set, can't allocate mempools");
+		return -EINVAL;
+	}
+
+	t->mempools = dm_alloc_md_mempools(type);
+	if (!t->mempools)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void dm_table_free_md_mempools(struct dm_table *t)
+{
+	dm_free_md_mempools(t->mempools);
+	t->mempools = NULL;
+}
+
+struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t)
+{
+	return t->mempools;
+}
+
 static int setup_indexes(struct dm_table *t)
 {
 	int i;
···
 	queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
 
 	dm_table_set_integrity(t);
+
+	/*
+	 * QUEUE_FLAG_STACKABLE must be set after all queue settings are
+	 * visible to other CPUs because, once the flag is set, incoming bios
+	 * are processed by request-based dm, which refers to the queue
+	 * settings.
+	 * Until the flag set, bios are passed to bio-based dm and queued to
+	 * md->deferred where queue settings are not needed yet.
+	 * Those bios are passed to request-based dm at the resume time.
+	 */
+	smp_mb();
+	if (dm_table_request_based(t))
+		queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
 }
 
 unsigned int dm_table_get_num_targets(struct dm_table *t)

drivers/md/dm.c  (+136 -26)

···
 	struct bio barrier_bio;
 };
 
+/*
+ * For mempools pre-allocation at the table loading time.
+ */
+struct dm_md_mempools {
+	mempool_t *io_pool;
+	mempool_t *tio_pool;
+	struct bio_set *bs;
+};
+
 #define MIN_IOS 256
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_tio_cache;
···
 	INIT_LIST_HEAD(&md->uevent_list);
 	spin_lock_init(&md->uevent_lock);
 
-	md->queue = blk_alloc_queue(GFP_KERNEL);
+	md->queue = blk_init_queue(dm_request_fn, NULL);
 	if (!md->queue)
 		goto bad_queue;
 
+	/*
+	 * Request-based dm devices cannot be stacked on top of bio-based dm
+	 * devices.  The type of this dm device has not been decided yet,
+	 * although we initialized the queue using blk_init_queue().
+	 * The type is decided at the first table loading time.
+	 * To prevent problematic device stacking, clear the queue flag
+	 * for request stacking support until then.
+	 *
+	 * This queue is new, so no concurrency on the queue_flags.
+	 */
+	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+	md->saved_make_request_fn = md->queue->make_request_fn;
 	md->queue->queuedata = md;
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
···
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 	md->queue->unplug_fn = dm_unplug_all;
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
-
-	md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
-	if (!md->io_pool)
-		goto bad_io_pool;
-
-	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
-	if (!md->tio_pool)
-		goto bad_tio_pool;
-
-	md->bs = bioset_create(16, 0);
-	if (!md->bs)
-		goto bad_no_bioset;
+	blk_queue_softirq_done(md->queue, dm_softirq_done);
+	blk_queue_prep_rq(md->queue, dm_prep_fn);
+	blk_queue_lld_busy(md->queue, dm_lld_busy);
 
 	md->disk = alloc_disk(1);
 	if (!md->disk)
···
 bad_thread:
 	put_disk(md->disk);
 bad_disk:
-	bioset_free(md->bs);
-bad_no_bioset:
-	mempool_destroy(md->tio_pool);
-bad_tio_pool:
-	mempool_destroy(md->io_pool);
-bad_io_pool:
 	blk_cleanup_queue(md->queue);
 bad_queue:
 	free_minor(minor);
···
 	unlock_fs(md);
 	bdput(md->bdev);
 	destroy_workqueue(md->wq);
-	mempool_destroy(md->tio_pool);
-	mempool_destroy(md->io_pool);
-	bioset_free(md->bs);
+	if (md->tio_pool)
+		mempool_destroy(md->tio_pool);
+	if (md->io_pool)
+		mempool_destroy(md->io_pool);
+	if (md->bs)
+		bioset_free(md->bs);
 	blk_integrity_unregister(md->disk);
 	del_gendisk(md->disk);
 	free_minor(minor);
···
 	blk_cleanup_queue(md->queue);
 	module_put(THIS_MODULE);
 	kfree(md);
+}
+
+static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
+{
+	struct dm_md_mempools *p;
+
+	if (md->io_pool && md->tio_pool && md->bs)
+		/* the md already has necessary mempools */
+		goto out;
+
+	p = dm_table_get_md_mempools(t);
+	BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
+
+	md->io_pool = p->io_pool;
+	p->io_pool = NULL;
+	md->tio_pool = p->tio_pool;
+	p->tio_pool = NULL;
+	md->bs = p->bs;
+	p->bs = NULL;
+
+out:
+	/* mempool bind completed, now no need any mempools in the table */
+	dm_table_free_md_mempools(t);
 }
···
 	}
 
 	dm_table_event_callback(t, event_callback, md);
+
+	/*
+	 * The queue hasn't been stopped yet, if the old table type wasn't
+	 * for request-based during suspension.  So stop it to prevent
+	 * I/O mapping before resume.
+	 * This must be done before setting the queue restrictions,
+	 * because request-based dm may be run just after the setting.
+	 */
+	if (dm_table_request_based(t) && !blk_queue_stopped(q))
+		stop_queue(q);
+
+	__bind_mempools(md, t);
 
 	write_lock(&md->map_lock);
 	md->map = t;
···
 
 		up_write(&md->io_lock);
 
-		if (bio_barrier(c))
-			process_barrier(md, c);
-		else
-			__split_and_process_bio(md, c);
+		if (dm_request_based(md))
+			generic_make_request(c);
+		else {
+			if (bio_barrier(c))
+				process_barrier(md, c);
+			else
+				__split_and_process_bio(md, c);
+		}
 
 		down_write(&md->io_lock);
 	}
···
 	r = dm_calculate_queue_limits(table, &limits);
 	if (r)
 		goto out;
+
+	/* cannot change the device type, once a table is bound */
+	if (md->map &&
+	    (dm_table_get_type(md->map) != dm_table_get_type(table))) {
+		DMWARN("can't change the device type after a table is bound");
+		goto out;
+	}
 
 	__unbind(md);
 	r = __bind(md, table, &limits);
···
 	return r;
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
+
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
+{
+	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
+
+	if (!pools)
+		return NULL;
+
+	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
+			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
+			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
+	if (!pools->io_pool)
+		goto free_pools_and_out;
+
+	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
+			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
+			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
+	if (!pools->tio_pool)
+		goto free_io_pool_and_out;
+
+	pools->bs = (type == DM_TYPE_BIO_BASED) ?
+		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
+	if (!pools->bs)
+		goto free_tio_pool_and_out;
+
+	return pools;
+
+free_tio_pool_and_out:
+	mempool_destroy(pools->tio_pool);
+
+free_io_pool_and_out:
+	mempool_destroy(pools->io_pool);
+
+free_pools_and_out:
+	kfree(pools);
+
+	return NULL;
+}
+
+void dm_free_md_mempools(struct dm_md_mempools *pools)
+{
+	if (!pools)
+		return;
+
+	if (pools->io_pool)
+		mempool_destroy(pools->io_pool);
+
+	if (pools->tio_pool)
+		mempool_destroy(pools->tio_pool);
+
+	if (pools->bs)
+		bioset_free(pools->bs);
+
+	kfree(pools);
+}
 
 static struct block_device_operations dm_blk_dops = {
 	.open = dm_blk_open,

drivers/md/dm.h  (+25)

···
 #define DM_SUSPEND_NOFLUSH_FLAG		(1 << 1)
 
 /*
+ * Type of table and mapped_device's mempool
+ */
+#define DM_TYPE_NONE		0
+#define DM_TYPE_BIO_BASED	1
+#define DM_TYPE_REQUEST_BASED	2
+
+/*
  * List of devices that a metadevice uses and should open/close.
  */
 struct dm_dev_internal {
···
 };
 
 struct dm_table;
+struct dm_md_mempools;
 
 /*-----------------------------------------------------------------
  * Internal table functions.
···
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
 int dm_table_any_busy_target(struct dm_table *t);
+int dm_table_set_type(struct dm_table *t);
+unsigned dm_table_get_type(struct dm_table *t);
+bool dm_table_request_based(struct dm_table *t);
+int dm_table_alloc_md_mempools(struct dm_table *t);
+void dm_table_free_md_mempools(struct dm_table *t);
+struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
 /*
  * To check the return value from dm_table_find_target().
  */
 #define dm_target_is_valid(t) ((t)->table)
+
+/*
+ * To check whether the target type is request-based or not (bio-based).
+ */
+#define dm_target_request_based(t) ((t)->type->map_rq != NULL)
 
 /*-----------------------------------------------------------------
  * A registry of target types.
···
 
 int dm_kcopyd_init(void);
 void dm_kcopyd_exit(void);
+
+/*
+ * Mempool operations
+ */
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type);
+void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 #endif