Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm bio prison v1: improve concurrent IO performance

Split the bio prison into multiple regions, with a separate rbtree and
associated lock for each region.

To get fast bio prison locking without damaging the performance of
discards too much, the bio-prison now stipulates that discards should
not cross a BIO_PRISON_MAX_RANGE boundary.

Because the range of a key (block_end - block_begin) must not exceed
BIO_PRISON_MAX_RANGE: break_up_discard_bio() now ensures the data
range reflected in PHYSICAL key doesn't exceed BIO_PRISON_MAX_RANGE.
And splitting the thin target's discards (handled with VIRTUAL key) is
achieved by updating dm-thin.c to set limits->max_discard_sectors in
terms of BIO_PRISON_MAX_RANGE _and_ setting the thin and thin-pool
targets' max_discard_granularity to true.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>

Authored by Joe Thornber; committed by Mike Snitzer.
e2dd8aca 06961c48

+121 -68
+57 -30
drivers/md/dm-bio-prison-v1.c
··· 16 16 17 17 /*----------------------------------------------------------------*/ 18 18 19 + #define NR_LOCKS 64 20 + #define LOCK_MASK (NR_LOCKS - 1) 19 21 #define MIN_CELLS 1024 20 22 21 - struct dm_bio_prison { 23 + struct prison_region { 22 24 spinlock_t lock; 23 - struct rb_root cells; 25 + struct rb_root cell; 26 + } ____cacheline_aligned_in_smp; 27 + 28 + struct dm_bio_prison { 29 + struct prison_region regions[NR_LOCKS]; 24 30 mempool_t cell_pool; 25 31 }; 26 32 ··· 40 34 */ 41 35 struct dm_bio_prison *dm_bio_prison_create(void) 42 36 { 43 - struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL); 44 37 int ret; 38 + unsigned i; 39 + struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL); 45 40 46 41 if (!prison) 47 42 return NULL; 48 43 49 - spin_lock_init(&prison->lock); 44 + for (i = 0; i < NR_LOCKS; i++) { 45 + spin_lock_init(&prison->regions[i].lock); 46 + prison->regions[i].cell = RB_ROOT; 47 + } 50 48 51 49 ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache); 52 50 if (ret) { 53 51 kfree(prison); 54 52 return NULL; 55 53 } 56 - 57 - prison->cells = RB_ROOT; 58 54 59 55 return prison; 60 56 } ··· 115 107 return 0; 116 108 } 117 109 118 - static int __bio_detain(struct dm_bio_prison *prison, 110 + static unsigned lock_nr(struct dm_cell_key *key) 111 + { 112 + return (key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) & LOCK_MASK; 113 + } 114 + 115 + static void check_range(struct dm_cell_key *key) 116 + { 117 + BUG_ON(key->block_end - key->block_begin > BIO_PRISON_MAX_RANGE); 118 + BUG_ON((key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) != 119 + ((key->block_end - 1) >> BIO_PRISON_MAX_RANGE_SHIFT)); 120 + } 121 + 122 + static int __bio_detain(struct rb_root *root, 119 123 struct dm_cell_key *key, 120 124 struct bio *inmate, 121 125 struct dm_bio_prison_cell *cell_prealloc, 122 126 struct dm_bio_prison_cell **cell_result) 123 127 { 124 128 int r; 125 - struct rb_node **new = &prison->cells.rb_node, *parent = 
NULL; 129 + struct rb_node **new = &root->rb_node, *parent = NULL; 126 130 127 131 while (*new) { 128 132 struct dm_bio_prison_cell *cell = ··· 159 139 *cell_result = cell_prealloc; 160 140 161 141 rb_link_node(&cell_prealloc->node, parent, new); 162 - rb_insert_color(&cell_prealloc->node, &prison->cells); 142 + rb_insert_color(&cell_prealloc->node, root); 163 143 164 144 return 0; 165 145 } ··· 171 151 struct dm_bio_prison_cell **cell_result) 172 152 { 173 153 int r; 154 + unsigned l = lock_nr(key); 155 + check_range(key); 174 156 175 - spin_lock_irq(&prison->lock); 176 - r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); 177 - spin_unlock_irq(&prison->lock); 157 + spin_lock_irq(&prison->regions[l].lock); 158 + r = __bio_detain(&prison->regions[l].cell, key, inmate, cell_prealloc, cell_result); 159 + spin_unlock_irq(&prison->regions[l].lock); 178 160 179 161 return r; 180 162 } ··· 203 181 /* 204 182 * @inmates must have been initialised prior to this call 205 183 */ 206 - static void __cell_release(struct dm_bio_prison *prison, 184 + static void __cell_release(struct rb_root *root, 207 185 struct dm_bio_prison_cell *cell, 208 186 struct bio_list *inmates) 209 187 { 210 - rb_erase(&cell->node, &prison->cells); 188 + rb_erase(&cell->node, root); 211 189 212 190 if (inmates) { 213 191 if (cell->holder) ··· 220 198 struct dm_bio_prison_cell *cell, 221 199 struct bio_list *bios) 222 200 { 223 - spin_lock_irq(&prison->lock); 224 - __cell_release(prison, cell, bios); 225 - spin_unlock_irq(&prison->lock); 201 + unsigned l = lock_nr(&cell->key); 202 + 203 + spin_lock_irq(&prison->regions[l].lock); 204 + __cell_release(&prison->regions[l].cell, cell, bios); 205 + spin_unlock_irq(&prison->regions[l].lock); 226 206 } 227 207 EXPORT_SYMBOL_GPL(dm_cell_release); 228 208 229 209 /* 230 210 * Sometimes we don't want the holder, just the additional bios. 
231 211 */ 232 - static void __cell_release_no_holder(struct dm_bio_prison *prison, 212 + static void __cell_release_no_holder(struct rb_root *root, 233 213 struct dm_bio_prison_cell *cell, 234 214 struct bio_list *inmates) 235 215 { 236 - rb_erase(&cell->node, &prison->cells); 216 + rb_erase(&cell->node, root); 237 217 bio_list_merge(inmates, &cell->bios); 238 218 } 239 219 ··· 243 219 struct dm_bio_prison_cell *cell, 244 220 struct bio_list *inmates) 245 221 { 222 + unsigned l = lock_nr(&cell->key); 246 223 unsigned long flags; 247 224 248 - spin_lock_irqsave(&prison->lock, flags); 249 - __cell_release_no_holder(prison, cell, inmates); 250 - spin_unlock_irqrestore(&prison->lock, flags); 225 + spin_lock_irqsave(&prison->regions[l].lock, flags); 226 + __cell_release_no_holder(&prison->regions[l].cell, cell, inmates); 227 + spin_unlock_irqrestore(&prison->regions[l].lock, flags); 251 228 } 252 229 EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); 253 230 ··· 273 248 void *context, 274 249 struct dm_bio_prison_cell *cell) 275 250 { 276 - spin_lock_irq(&prison->lock); 251 + unsigned l = lock_nr(&cell->key); 252 + spin_lock_irq(&prison->regions[l].lock); 277 253 visit_fn(context, cell); 278 - rb_erase(&cell->node, &prison->cells); 279 - spin_unlock_irq(&prison->lock); 254 + rb_erase(&cell->node, &prison->regions[l].cell); 255 + spin_unlock_irq(&prison->regions[l].lock); 280 256 } 281 257 EXPORT_SYMBOL_GPL(dm_cell_visit_release); 282 258 283 - static int __promote_or_release(struct dm_bio_prison *prison, 259 + static int __promote_or_release(struct rb_root *root, 284 260 struct dm_bio_prison_cell *cell) 285 261 { 286 262 if (bio_list_empty(&cell->bios)) { 287 - rb_erase(&cell->node, &prison->cells); 263 + rb_erase(&cell->node, root); 288 264 return 1; 289 265 } 290 266 ··· 297 271 struct dm_bio_prison_cell *cell) 298 272 { 299 273 int r; 274 + unsigned l = lock_nr(&cell->key); 300 275 301 - spin_lock_irq(&prison->lock); 302 - r = __promote_or_release(prison, cell); 303 - 
spin_unlock_irq(&prison->lock); 276 + spin_lock_irq(&prison->regions[l].lock); 277 + r = __promote_or_release(&prison->regions[l].cell, cell); 278 + spin_unlock_irq(&prison->regions[l].lock); 304 279 305 280 return r; 306 281 }
+10
drivers/md/dm-bio-prison-v1.h
··· 35 35 }; 36 36 37 37 /* 38 + * The range of a key (block_end - block_begin) must not 39 + * exceed BIO_PRISON_MAX_RANGE. Also the range must not 40 + * cross a similarly sized boundary. 41 + * 42 + * Must be a power of 2. 43 + */ 44 + #define BIO_PRISON_MAX_RANGE 1024 45 + #define BIO_PRISON_MAX_RANGE_SHIFT 10 46 + 47 + /* 38 48 * Treat this as opaque, only in header so callers can manage allocation 39 49 * themselves. 40 50 */
+54 -38
drivers/md/dm-thin.c
··· 1674 1674 struct dm_cell_key data_key; 1675 1675 struct dm_bio_prison_cell *data_cell; 1676 1676 struct dm_thin_new_mapping *m; 1677 - dm_block_t virt_begin, virt_end, data_begin; 1677 + dm_block_t virt_begin, virt_end, data_begin, data_end; 1678 + dm_block_t len, next_boundary; 1678 1679 1679 1680 while (begin != end) { 1680 - r = ensure_next_mapping(pool); 1681 - if (r) 1682 - /* we did our best */ 1683 - return; 1684 - 1685 1681 r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end, 1686 1682 &data_begin, &maybe_shared); 1687 - if (r) 1683 + if (r) { 1688 1684 /* 1689 1685 * Silently fail, letting any mappings we've 1690 1686 * created complete. 1691 1687 */ 1692 1688 break; 1693 - 1694 - build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key); 1695 - if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) { 1696 - /* contention, we'll give up with this range */ 1697 - begin = virt_end; 1698 - continue; 1699 1689 } 1700 1690 1701 - /* 1702 - * IO may still be going to the destination block. We must 1703 - * quiesce before we can do the removal. 1704 - */ 1705 - m = get_next_mapping(pool); 1706 - m->tc = tc; 1707 - m->maybe_shared = maybe_shared; 1708 - m->virt_begin = virt_begin; 1709 - m->virt_end = virt_end; 1710 - m->data_block = data_begin; 1711 - m->cell = data_cell; 1712 - m->bio = bio; 1691 + data_end = data_begin + (virt_end - virt_begin); 1713 1692 1714 1693 /* 1715 - * The parent bio must not complete before sub discard bios are 1716 - * chained to it (see end_discard's bio_chain)! 1717 - * 1718 - * This per-mapping bi_remaining increment is paired with 1719 - * the implicit decrement that occurs via bio_endio() in 1720 - * end_discard(). 1694 + * Make sure the data region obeys the bio prison restrictions. 
1721 1695 */ 1722 - bio_inc_remaining(bio); 1723 - if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) 1724 - pool->process_prepared_discard(m); 1696 + while (data_begin < data_end) { 1697 + r = ensure_next_mapping(pool); 1698 + if (r) 1699 + return; /* we did our best */ 1700 + 1701 + next_boundary = ((data_begin >> BIO_PRISON_MAX_RANGE_SHIFT) + 1) 1702 + << BIO_PRISON_MAX_RANGE_SHIFT; 1703 + len = min_t(sector_t, data_end - data_begin, next_boundary - data_begin); 1704 + 1705 + build_key(tc->td, PHYSICAL, data_begin, data_begin + len, &data_key); 1706 + if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) { 1707 + /* contention, we'll give up with this range */ 1708 + data_begin += len; 1709 + continue; 1710 + } 1711 + 1712 + /* 1713 + * IO may still be going to the destination block. We must 1714 + * quiesce before we can do the removal. 1715 + */ 1716 + m = get_next_mapping(pool); 1717 + m->tc = tc; 1718 + m->maybe_shared = maybe_shared; 1719 + m->virt_begin = virt_begin; 1720 + m->virt_end = virt_begin + len; 1721 + m->data_block = data_begin; 1722 + m->cell = data_cell; 1723 + m->bio = bio; 1724 + 1725 + /* 1726 + * The parent bio must not complete before sub discard bios are 1727 + * chained to it (see end_discard's bio_chain)! 1728 + * 1729 + * This per-mapping bi_remaining increment is paired with 1730 + * the implicit decrement that occurs via bio_endio() in 1731 + * end_discard(). 
1732 + */ 1733 + bio_inc_remaining(bio); 1734 + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) 1735 + pool->process_prepared_discard(m); 1736 + 1737 + virt_begin += len; 1738 + data_begin += len; 1739 + } 1725 1740 1726 1741 begin = virt_end; 1727 1742 } ··· 3395 3380 */ 3396 3381 if (pf.discard_enabled && pf.discard_passdown) { 3397 3382 ti->num_discard_bios = 1; 3398 - 3399 3383 /* 3400 3384 * Setting 'discards_supported' circumvents the normal 3401 3385 * stacking of discard limits (this keeps the pool and 3402 3386 * thin devices' discard limits consistent). 3403 3387 */ 3404 3388 ti->discards_supported = true; 3389 + ti->max_discard_granularity = true; 3405 3390 } 3406 3391 ti->private = pt; 3407 3392 ··· 4111 4096 .name = "thin-pool", 4112 4097 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 4113 4098 DM_TARGET_IMMUTABLE, 4114 - .version = {1, 22, 0}, 4099 + .version = {1, 23, 0}, 4115 4100 .module = THIS_MODULE, 4116 4101 .ctr = pool_ctr, 4117 4102 .dtr = pool_dtr, ··· 4276 4261 if (tc->pool->pf.discard_enabled) { 4277 4262 ti->discards_supported = true; 4278 4263 ti->num_discard_bios = 1; 4264 + ti->max_discard_granularity = true; 4279 4265 } 4280 4266 4281 4267 mutex_unlock(&dm_thin_pool_table.mutex); ··· 4492 4476 return; 4493 4477 4494 4478 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; 4495 - limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */ 4479 + limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE; 4496 4480 } 4497 4481 4498 4482 static struct target_type thin_target = { 4499 4483 .name = "thin", 4500 - .version = {1, 22, 0}, 4484 + .version = {1, 23, 0}, 4501 4485 .module = THIS_MODULE, 4502 4486 .ctr = thin_ctr, 4503 4487 .dtr = thin_dtr,