Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'dm-3.19-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

- Significant DM thin-provisioning performance improvements to meet
performance requirements that were requested by the Gluster
distributed filesystem.

Specifically, dm-thinp now takes care to aggregate IO that will be
issued to the same thinp block before issuing IO to the underlying
devices. This really helps improve performance on HW RAID6 devices
that have a writeback cache because it avoids RMW in the HW RAID
controller.

- Some stable fixes: fix leak in DM bufio if integrity profiles were
enabled, use memzero_explicit in DM crypt to avoid any potential for
information leak, and a DM cache fix to properly mark a cache block
dirty if it was promoted to the cache via the overwrite optimization.

- A few simple DM persistent data library fixes

- DM cache multiqueue policy block promotion improvements.

- DM cache discard improvements that take advantage of range
(multiblock) discard support in the DM bio-prison. This allows for
much more efficient bulk discard processing (e.g. when mkfs.xfs
discards the entire device).

- Some small optimizations in DM core and RCU dereference cleanups.

- DM core changes to suspend/resume code to introduce the new internal
suspend/resume interface that the DM thin-pool target now uses to
suspend/resume active thin devices when the thin-pool must
suspend/resume.

This avoids forcing userspace to track all active thin volumes in a
thin-pool when the thin-pool is suspended for the purposes of
metadata or data space resize.

* tag 'dm-3.19-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (49 commits)
dm crypt: use memzero_explicit for on-stack buffer
dm space map metadata: fix sm_bootstrap_get_count()
dm space map metadata: fix sm_bootstrap_get_nr_blocks()
dm bufio: fix memleak when using a dm_buffer's inline bio
dm cache: fix spurious cell_defer when dealing with partial block at end of device
dm cache: dirty flag was mistakenly being cleared when promoting via overwrite
dm cache: only use overwrite optimisation for promotion when in writeback mode
dm cache: discard block size must be a multiple of cache block size
dm cache: fix a harmless race when working out if a block is discarded
dm cache: when reloading a discard bitset allow for a different discard block size
dm cache: fix some issues with the new discard range support
dm array: if resizing the array is a noop set the new root to the old one
dm: use rcu_dereference_protected instead of rcu_dereference
dm thin: fix pool_io_hints to avoid looking at max_hw_sectors
dm thin: suspend/resume active thin devices when reloading thin-pool
dm: enhance internal suspend and resume interface
dm thin: do not allow thin device activation while pool is suspended
dm: add presuspend_undo hook to target_type
dm: return earlier from dm_blk_ioctl if target doesn't implement .ioctl
dm thin: remove stale 'trim' message in block comment above pool_message
...

Total: +1682 -628
Documentation/device-mapper/cache-policies.txt (+14 -8)
···
 'discard_promote_adjustment <value>'
 
 The sequential threshold indicates the number of contiguous I/Os
-required before a stream is treated as sequential.  The random threshold
+required before a stream is treated as sequential.  Once a stream is
+considered sequential it will bypass the cache.  The random threshold
 is the number of intervening non-contiguous I/Os that must be seen
 before the stream is treated as random again.
 
 The sequential and random thresholds default to 512 and 4 respectively.
 
-Large, sequential ios are probably better left on the origin device
-since spindles tend to have good bandwidth.  The io_tracker counts
-contiguous I/Os to try to spot when the io is in one of these sequential
-modes.
+Large, sequential I/Os are probably better left on the origin device
+since spindles tend to have good sequential I/O bandwidth.  The
+io_tracker counts contiguous I/Os to try to spot when the I/O is in one
+of these sequential modes.  But there are use-cases for wanting to
+promote sequential blocks to the cache (e.g. fast application startup).
+If sequential threshold is set to 0 the sequential I/O detection is
+disabled and sequential I/O will no longer implicitly bypass the cache.
+Setting the random threshold to 0 does _not_ disable the random I/O
+stream detection.
 
-Internally the mq policy maintains a promotion threshold variable.  If
-the hit count of a block not in the cache goes above this threshold it
-gets promoted to the cache.  The read, write and discard promote adjustment
+Internally the mq policy determines a promotion threshold.  If the hit
+count of a block not in the cache goes above this threshold it gets
+promoted to the cache.  The read, write and discard promote adjustment
 tunables allow you to tweak the promotion threshold by adding a small
 value based on the io type.  They default to 4, 8 and 1 respectively.
 If you're trying to quickly warm a new cache device you may wish to
drivers/md/dm-bio-prison.c (+92 -104)
···
 
 /*----------------------------------------------------------------*/
 
-struct bucket {
-        spinlock_t lock;
-        struct hlist_head cells;
-};
+#define MIN_CELLS 1024
 
 struct dm_bio_prison {
+        spinlock_t lock;
         mempool_t *cell_pool;
-
-        unsigned nr_buckets;
-        unsigned hash_mask;
-        struct bucket *buckets;
+        struct rb_root cells;
 };
-
-/*----------------------------------------------------------------*/
-
-static uint32_t calc_nr_buckets(unsigned nr_cells)
-{
-        uint32_t n = 128;
-
-        nr_cells /= 4;
-        nr_cells = min(nr_cells, 8192u);
-
-        while (n < nr_cells)
-                n <<= 1;
-
-        return n;
-}
 
 static struct kmem_cache *_cell_cache;
 
-static void init_bucket(struct bucket *b)
-{
-        spin_lock_init(&b->lock);
-        INIT_HLIST_HEAD(&b->cells);
-}
+/*----------------------------------------------------------------*/
 
 /*
  * @nr_cells should be the number of cells you want in use _concurrently_.
  * Don't confuse it with the number of distinct keys.
  */
-struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells)
+struct dm_bio_prison *dm_bio_prison_create(void)
 {
-        unsigned i;
-        uint32_t nr_buckets = calc_nr_buckets(nr_cells);
-        size_t len = sizeof(struct dm_bio_prison) +
-                (sizeof(struct bucket) * nr_buckets);
-        struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL);
+        struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
 
         if (!prison)
                 return NULL;
 
-        prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
+        spin_lock_init(&prison->lock);
+
+        prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
         if (!prison->cell_pool) {
                 kfree(prison);
                 return NULL;
         }
 
-        prison->nr_buckets = nr_buckets;
-        prison->hash_mask = nr_buckets - 1;
-        prison->buckets = (struct bucket *) (prison + 1);
-        for (i = 0; i < nr_buckets; i++)
-                init_bucket(prison->buckets + i);
+        prison->cells = RB_ROOT;
 
         return prison;
 }
···
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
 
-static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
-{
-        const unsigned long BIG_PRIME = 4294967291UL;
-        uint64_t hash = key->block * BIG_PRIME;
-
-        return (uint32_t) (hash & prison->hash_mask);
-}
-
-static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs)
-{
-        return (lhs->virtual == rhs->virtual) &&
-                (lhs->dev == rhs->dev) &&
-                (lhs->block == rhs->block);
-}
-
-static struct bucket *get_bucket(struct dm_bio_prison *prison,
-                                 struct dm_cell_key *key)
-{
-        return prison->buckets + hash_key(prison, key);
-}
-
-static struct dm_bio_prison_cell *__search_bucket(struct bucket *b,
-                                                  struct dm_cell_key *key)
-{
-        struct dm_bio_prison_cell *cell;
-
-        hlist_for_each_entry(cell, &b->cells, list)
-                if (keys_equal(&cell->key, key))
-                        return cell;
-
-        return NULL;
-}
-
-static void __setup_new_cell(struct bucket *b,
-                             struct dm_cell_key *key,
+static void __setup_new_cell(struct dm_cell_key *key,
                              struct bio *holder,
                              struct dm_bio_prison_cell *cell)
 {
-        memcpy(&cell->key, key, sizeof(cell->key));
-        cell->holder = holder;
-        bio_list_init(&cell->bios);
-        hlist_add_head(&cell->list, &b->cells);
+        memcpy(&cell->key, key, sizeof(cell->key));
+        cell->holder = holder;
+        bio_list_init(&cell->bios);
 }
 
-static int __bio_detain(struct bucket *b,
+static int cmp_keys(struct dm_cell_key *lhs,
+                    struct dm_cell_key *rhs)
+{
+        if (lhs->virtual < rhs->virtual)
+                return -1;
+
+        if (lhs->virtual > rhs->virtual)
+                return 1;
+
+        if (lhs->dev < rhs->dev)
+                return -1;
+
+        if (lhs->dev > rhs->dev)
+                return 1;
+
+        if (lhs->block_end <= rhs->block_begin)
+                return -1;
+
+        if (lhs->block_begin >= rhs->block_end)
+                return 1;
+
+        return 0;
+}
+
+static int __bio_detain(struct dm_bio_prison *prison,
                         struct dm_cell_key *key,
                         struct bio *inmate,
                         struct dm_bio_prison_cell *cell_prealloc,
                         struct dm_bio_prison_cell **cell_result)
 {
-        struct dm_bio_prison_cell *cell;
+        int r;
+        struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
 
-        cell = __search_bucket(b, key);
-        if (cell) {
-                if (inmate)
-                        bio_list_add(&cell->bios, inmate);
-                *cell_result = cell;
-                return 1;
+        while (*new) {
+                struct dm_bio_prison_cell *cell =
+                        container_of(*new, struct dm_bio_prison_cell, node);
+
+                r = cmp_keys(key, &cell->key);
+
+                parent = *new;
+                if (r < 0)
+                        new = &((*new)->rb_left);
+                else if (r > 0)
+                        new = &((*new)->rb_right);
+                else {
+                        if (inmate)
+                                bio_list_add(&cell->bios, inmate);
+                        *cell_result = cell;
+                        return 1;
+                }
         }
 
-        __setup_new_cell(b, key, inmate, cell_prealloc);
+        __setup_new_cell(key, inmate, cell_prealloc);
         *cell_result = cell_prealloc;
+
+        rb_link_node(&cell_prealloc->node, parent, new);
+        rb_insert_color(&cell_prealloc->node, &prison->cells);
+
         return 0;
 }
···
 {
         int r;
         unsigned long flags;
-        struct bucket *b = get_bucket(prison, key);
 
-        spin_lock_irqsave(&b->lock, flags);
-        r = __bio_detain(b, key, inmate, cell_prealloc, cell_result);
-        spin_unlock_irqrestore(&b->lock, flags);
+        spin_lock_irqsave(&prison->lock, flags);
+        r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
+        spin_unlock_irqrestore(&prison->lock, flags);
 
         return r;
 }
···
 /*
  * @inmates must have been initialised prior to this call
  */
-static void __cell_release(struct dm_bio_prison_cell *cell,
+static void __cell_release(struct dm_bio_prison *prison,
+                           struct dm_bio_prison_cell *cell,
                            struct bio_list *inmates)
 {
-        hlist_del(&cell->list);
+        rb_erase(&cell->node, &prison->cells);
 
         if (inmates) {
                 if (cell->holder)
···
                         struct bio_list *bios)
 {
         unsigned long flags;
-        struct bucket *b = get_bucket(prison, &cell->key);
 
-        spin_lock_irqsave(&b->lock, flags);
-        __cell_release(cell, bios);
-        spin_unlock_irqrestore(&b->lock, flags);
+        spin_lock_irqsave(&prison->lock, flags);
+        __cell_release(prison, cell, bios);
+        spin_unlock_irqrestore(&prison->lock, flags);
 }
 EXPORT_SYMBOL_GPL(dm_cell_release);
 
 /*
  * Sometimes we don't want the holder, just the additional bios.
  */
-static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
+static void __cell_release_no_holder(struct dm_bio_prison *prison,
+                                     struct dm_bio_prison_cell *cell,
                                      struct bio_list *inmates)
 {
-        hlist_del(&cell->list);
+        rb_erase(&cell->node, &prison->cells);
         bio_list_merge(inmates, &cell->bios);
 }
···
                                struct bio_list *inmates)
 {
         unsigned long flags;
-        struct bucket *b = get_bucket(prison, &cell->key);
 
-        spin_lock_irqsave(&b->lock, flags);
-        __cell_release_no_holder(cell, inmates);
-        spin_unlock_irqrestore(&b->lock, flags);
+        spin_lock_irqsave(&prison->lock, flags);
+        __cell_release_no_holder(prison, cell, inmates);
+        spin_unlock_irqrestore(&prison->lock, flags);
 }
 EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
···
                 bio_endio(bio, error);
 }
 EXPORT_SYMBOL_GPL(dm_cell_error);
+
+void dm_cell_visit_release(struct dm_bio_prison *prison,
+                           void (*visit_fn)(void *, struct dm_bio_prison_cell *),
+                           void *context,
+                           struct dm_bio_prison_cell *cell)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave(&prison->lock, flags);
+        visit_fn(context, cell);
+        rb_erase(&cell->node, &prison->cells);
+        spin_unlock_irqrestore(&prison->lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_cell_visit_release);
 
 /*----------------------------------------------------------------*/
drivers/md/dm-bio-prison.h (+21 -7)
···
 #include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
 #include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
 
-#include <linux/list.h>
 #include <linux/bio.h>
+#include <linux/rbtree.h>
 
 /*----------------------------------------------------------------*/
···
  */
 struct dm_bio_prison;
 
-/* FIXME: this needs to be more abstract */
+/*
+ * Keys define a range of blocks within either a virtual or physical
+ * device.
+ */
 struct dm_cell_key {
         int virtual;
         dm_thin_id dev;
-        dm_block_t block;
+        dm_block_t block_begin, block_end;
 };
 
 /*
···
  * themselves.
  */
 struct dm_bio_prison_cell {
-        struct hlist_node list;
+        struct list_head user_list;     /* for client use */
+        struct rb_node node;
+
         struct dm_cell_key key;
         struct bio *holder;
         struct bio_list bios;
 };
 
-struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells);
+struct dm_bio_prison *dm_bio_prison_create(void);
 void dm_bio_prison_destroy(struct dm_bio_prison *prison);
 
 /*
···
                               struct dm_bio_prison_cell *cell);
 
 /*
- * Creates, or retrieves a cell for the given key.
+ * Creates, or retrieves a cell that overlaps the given key.
  *
  * Returns 1 if pre-existing cell returned, zero if new cell created using
  * @cell_prealloc.
···
                      struct dm_bio_prison_cell **cell_result);
 
 /*
- * An atomic op that combines retrieving a cell, and adding a bio to it.
+ * An atomic op that combines retrieving or creating a cell, and adding a
+ * bio to it.
  *
  * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
  */
···
                           struct bio_list *inmates);
 void dm_cell_error(struct dm_bio_prison *prison,
                    struct dm_bio_prison_cell *cell, int error);
+
+/*
+ * Visits the cell and then releases.  Guarantees no new inmates are
+ * inserted between the visit and release.
+ */
+void dm_cell_visit_release(struct dm_bio_prison *prison,
+                           void (*visit_fn)(void *, struct dm_bio_prison_cell *),
+                           void *context, struct dm_bio_prison_cell *cell);
 
 /*----------------------------------------------------------------*/
drivers/md/dm-bufio.c (+148 -78)
···
 #include <linux/vmalloc.h>
 #include <linux/shrinker.h>
 #include <linux/module.h>
+#include <linux/rbtree.h>
 
 #define DM_MSG_PREFIX "bufio"
···
 /*
  * Check buffer ages in this interval (seconds)
  */
-#define DM_BUFIO_WORK_TIMER_SECS	10
+#define DM_BUFIO_WORK_TIMER_SECS	30
 
 /*
  * Free buffers when they are older than this (seconds)
  */
-#define DM_BUFIO_DEFAULT_AGE_SECS	60
+#define DM_BUFIO_DEFAULT_AGE_SECS	300
+
+/*
+ * The nr of bytes of cached data to keep around.
+ */
+#define DM_BUFIO_DEFAULT_RETAIN_BYTES	(256 * 1024)
 
 /*
  * The number of bvec entries that are embedded directly in the buffer.
  * If the chunk size is larger, dm-io is used to do the io.
  */
 #define DM_BUFIO_INLINE_VECS		16
-
-/*
- * Buffer hash
- */
-#define DM_BUFIO_HASH_BITS	20
-#define DM_BUFIO_HASH(block) \
-        ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
-         ((1 << DM_BUFIO_HASH_BITS) - 1))
 
 /*
  * Don't try to use kmem_cache_alloc for blocks larger than this.
···
 
         unsigned minimum_buffers;
 
-        struct hlist_head *cache_hash;
+        struct rb_root buffer_tree;
         wait_queue_head_t free_buffer_wait;
 
         int async_write_error;
···
 };
 
 struct dm_buffer {
-        struct hlist_node hash_list;
+        struct rb_node node;
         struct list_head lru_list;
         sector_t block;
         void *data;
···
  * Buffers are freed after this timeout
  */
 static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
+static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
 
 static unsigned long dm_bufio_peak_allocated;
 static unsigned long dm_bufio_allocated_kmem_cache;
···
  * dm_bufio_cache_size_per_client and dm_bufio_client_count
  */
 static DEFINE_MUTEX(dm_bufio_clients_lock);
+
+/*----------------------------------------------------------------
+ * A red/black tree acts as an index for all the buffers.
+ *--------------------------------------------------------------*/
+static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
+{
+        struct rb_node *n = c->buffer_tree.rb_node;
+        struct dm_buffer *b;
+
+        while (n) {
+                b = container_of(n, struct dm_buffer, node);
+
+                if (b->block == block)
+                        return b;
+
+                n = (b->block < block) ? n->rb_left : n->rb_right;
+        }
+
+        return NULL;
+}
+
+static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+        struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
+        struct dm_buffer *found;
+
+        while (*new) {
+                found = container_of(*new, struct dm_buffer, node);
+
+                if (found->block == b->block) {
+                        BUG_ON(found != b);
+                        return;
+                }
+
+                parent = *new;
+                new = (found->block < b->block) ?
+                        &((*new)->rb_left) : &((*new)->rb_right);
+        }
+
+        rb_link_node(&b->node, parent, new);
+        rb_insert_color(&b->node, &c->buffer_tree);
+}
+
+static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+        rb_erase(&b->node, &c->buffer_tree);
+}
 
 /*----------------------------------------------------------------*/
···
         b->block = block;
         b->list_mode = dirty;
         list_add(&b->lru_list, &c->lru[dirty]);
-        hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
+        __insert(b->c, b);
         b->last_accessed = jiffies;
 }
···
         BUG_ON(!c->n_buffers[b->list_mode]);
 
         c->n_buffers[b->list_mode]--;
-        hlist_del(&b->hash_list);
+        __remove(b->c, b);
         list_del(&b->lru_list);
 }
···
         end_io(&b->bio, r);
 }
 
+static void inline_endio(struct bio *bio, int error)
+{
+        bio_end_io_t *end_fn = bio->bi_private;
+
+        /*
+         * Reset the bio to free any attached resources
+         * (e.g. bio integrity profiles).
+         */
+        bio_reset(bio);
+
+        end_fn(bio, error);
+}
+
 static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
                            bio_end_io_t *end_io)
 {
···
         b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
         b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
         b->bio.bi_bdev = b->c->bdev;
-        b->bio.bi_end_io = end_io;
+        b->bio.bi_end_io = inline_endio;
+        /*
+         * Use of .bi_private isn't a problem here because
+         * the dm_buffer's inline bio is local to bufio.
+         */
+        b->bio.bi_private = end_io;
 
         /*
          * We assume that if len >= PAGE_SIZE ptr is page-aligned.
···
 
         if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
                 __write_dirty_buffers_async(c, 1, write_list);
-}
-
-/*
- * Find a buffer in the hash.
- */
-static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
-{
-        struct dm_buffer *b;
-
-        hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
-                             hash_list) {
-                dm_bufio_cond_resched();
-                if (b->block == block)
-                        return b;
-        }
-
-        return NULL;
 }
 
 /*----------------------------------------------------------------
···
 }
 
 /*
- * Test if the buffer is unused and too old, and commit it.
+ * We may not be able to evict this buffer if IO pending or the client
+ * is still using it.  Caller is expected to know buffer is too old.
+ *
  * And if GFP_NOFS is used, we must not do any I/O because we hold
  * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
  * rerouted to different bufio client.
  */
-static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
-                                unsigned long max_jiffies)
+static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
 {
-        if (jiffies - b->last_accessed < max_jiffies)
-                return 0;
-
         if (!(gfp & __GFP_FS)) {
                 if (test_bit(B_READING, &b->state) ||
                     test_bit(B_WRITING, &b->state) ||
                     test_bit(B_DIRTY, &b->state))
-                        return 0;
+                        return false;
         }
 
         if (b->hold_count)
-                return 0;
+                return false;
 
         __make_buffer_clean(b);
         __unlink_buffer(b);
         __free_buffer_wake(b);
 
-        return 1;
+        return true;
 }
 
-static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
-                   gfp_t gfp_mask)
+static unsigned get_retain_buffers(struct dm_bufio_client *c)
+{
+        unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
+        return retain_bytes / c->block_size;
+}
+
+static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
+                            gfp_t gfp_mask)
 {
         int l;
         struct dm_buffer *b, *tmp;
-        long freed = 0;
+        unsigned long freed = 0;
+        unsigned long count = nr_to_scan;
+        unsigned retain_target = get_retain_buffers(c);
 
         for (l = 0; l < LIST_SIZE; l++) {
                 list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
-                        freed += __cleanup_old_buffer(b, gfp_mask, 0);
-                        if (!--nr_to_scan)
+                        if (__try_evict_buffer(b, gfp_mask))
+                                freed++;
+                        if (!--nr_to_scan || ((count - freed) <= retain_target))
                                 return freed;
                         dm_bufio_cond_resched();
                 }
···
                 r = -ENOMEM;
                 goto bad_client;
         }
-        c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
-        if (!c->cache_hash) {
-                r = -ENOMEM;
-                goto bad_hash;
-        }
+        c->buffer_tree = RB_ROOT;
 
         c->bdev = bdev;
         c->block_size = block_size;
···
                 INIT_LIST_HEAD(&c->lru[i]);
                 c->n_buffers[i] = 0;
         }
-
-        for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
-                INIT_HLIST_HEAD(&c->cache_hash[i]);
 
         mutex_init(&c->lock);
         INIT_LIST_HEAD(&c->reserved_buffers);
···
         }
         dm_io_client_destroy(c->dm_io);
 bad_dm_io:
-        vfree(c->cache_hash);
-bad_hash:
         kfree(c);
 bad_client:
         return ERR_PTR(r);
···
 
         mutex_unlock(&dm_bufio_clients_lock);
 
-        for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
-                BUG_ON(!hlist_empty(&c->cache_hash[i]));
-
+        BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
         BUG_ON(c->need_reserved_buffers);
 
         while (!list_empty(&c->reserved_buffers)) {
···
                 BUG_ON(c->n_buffers[i]);
 
         dm_io_client_destroy(c->dm_io);
-        vfree(c->cache_hash);
         kfree(c);
 }
 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
 
-static void cleanup_old_buffers(void)
+static unsigned get_max_age_hz(void)
 {
-        unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
-        struct dm_bufio_client *c;
+        unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
 
-        if (max_age > ULONG_MAX / HZ)
-                max_age = ULONG_MAX / HZ;
+        if (max_age > UINT_MAX / HZ)
+                max_age = UINT_MAX / HZ;
 
-        mutex_lock(&dm_bufio_clients_lock);
-        list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
-                if (!dm_bufio_trylock(c))
-                        continue;
+        return max_age * HZ;
+}
 
-                while (!list_empty(&c->lru[LIST_CLEAN])) {
-                        struct dm_buffer *b;
-                        b = list_entry(c->lru[LIST_CLEAN].prev,
-                                       struct dm_buffer, lru_list);
-                        if (!__cleanup_old_buffer(b, 0, max_age * HZ))
-                                break;
-                        dm_bufio_cond_resched();
-                }
+static bool older_than(struct dm_buffer *b, unsigned long age_hz)
+{
+        return (jiffies - b->last_accessed) >= age_hz;
+}
 
-                dm_bufio_unlock(c);
+static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
+{
+        struct dm_buffer *b, *tmp;
+        unsigned retain_target = get_retain_buffers(c);
+        unsigned count;
+
+        dm_bufio_lock(c);
+
+        count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
+        list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
+                if (count <= retain_target)
+                        break;
+
+                if (!older_than(b, age_hz))
+                        break;
+
+                if (__try_evict_buffer(b, 0))
+                        count--;
+
                 dm_bufio_cond_resched();
         }
+
+        dm_bufio_unlock(c);
+}
+
+static void cleanup_old_buffers(void)
+{
+        unsigned long max_age_hz = get_max_age_hz();
+        struct dm_bufio_client *c;
+
+        mutex_lock(&dm_bufio_clients_lock);
+
+        list_for_each_entry(c, &dm_bufio_all_clients, client_list)
+                __evict_old_buffers(c, max_age_hz);
+
         mutex_unlock(&dm_bufio_clients_lock);
 }
···
 
 module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
+
+module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
 
 module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
drivers/md/dm-cache-block-types.h (+11)
···
 
 typedef dm_block_t __bitwise__ dm_oblock_t;
 typedef uint32_t __bitwise__ dm_cblock_t;
+typedef dm_block_t __bitwise__ dm_dblock_t;
 
 static inline dm_oblock_t to_oblock(dm_block_t b)
 {
···
 static inline uint32_t from_cblock(dm_cblock_t b)
 {
         return (__force uint32_t) b;
+}
+
+static inline dm_dblock_t to_dblock(dm_block_t b)
+{
+        return (__force dm_dblock_t) b;
+}
+
+static inline dm_block_t from_dblock(dm_dblock_t b)
+{
+        return (__force dm_block_t) b;
 }
 
 #endif /* DM_CACHE_BLOCK_TYPES_H */
drivers/md/dm-cache-metadata.c (+17 -17)
···
         dm_block_t discard_root;
 
         sector_t discard_block_size;
-        dm_oblock_t discard_nr_blocks;
+        dm_dblock_t discard_nr_blocks;
 
         sector_t data_block_size;
         dm_cblock_t cache_blocks;
···
         disk_super->hint_root = cpu_to_le64(cmd->hint_root);
         disk_super->discard_root = cpu_to_le64(cmd->discard_root);
         disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
-        disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
+        disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
         disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE);
         disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
         disk_super->cache_blocks = cpu_to_le32(0);
···
         cmd->hint_root = le64_to_cpu(disk_super->hint_root);
         cmd->discard_root = le64_to_cpu(disk_super->discard_root);
         cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
-        cmd->discard_nr_blocks = to_oblock(le64_to_cpu(disk_super->discard_nr_blocks));
+        cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks));
         cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
         cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
         strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
···
         disk_super->hint_root = cpu_to_le64(cmd->hint_root);
         disk_super->discard_root = cpu_to_le64(cmd->discard_root);
         disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
-        disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
+        disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
         disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
         strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
         disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
···
 
 int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
                                    sector_t discard_block_size,
-                                   dm_oblock_t new_nr_entries)
+                                   dm_dblock_t new_nr_entries)
 {
         int r;
 
         down_write(&cmd->root_lock);
         r = dm_bitset_resize(&cmd->discard_info,
                              cmd->discard_root,
-                             from_oblock(cmd->discard_nr_blocks),
-                             from_oblock(new_nr_entries),
+                             from_dblock(cmd->discard_nr_blocks),
+                             from_dblock(new_nr_entries),
                              false, &cmd->discard_root);
         if (!r) {
                 cmd->discard_block_size = discard_block_size;
···
         return r;
 }
 
-static int __set_discard(struct dm_cache_metadata *cmd, dm_oblock_t b)
+static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
 {
         return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
-                                 from_oblock(b), &cmd->discard_root);
+                                 from_dblock(b), &cmd->discard_root);
 }
 
-static int __clear_discard(struct dm_cache_metadata *cmd, dm_oblock_t b)
+static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
 {
         return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root,
-                                   from_oblock(b), &cmd->discard_root);
+                                   from_dblock(b), &cmd->discard_root);
 }
 
-static int __is_discarded(struct dm_cache_metadata *cmd, dm_oblock_t b,
+static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
                           bool *is_discarded)
 {
         return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
-                                  from_oblock(b), &cmd->discard_root,
+                                  from_dblock(b), &cmd->discard_root,
                                   is_discarded);
 }
 
 static int __discard(struct dm_cache_metadata *cmd,
-                     dm_oblock_t dblock, bool discard)
+                     dm_dblock_t dblock, bool discard)
 {
         int r;
 
···
 }
 
 int dm_cache_set_discard(struct dm_cache_metadata *cmd,
-                         dm_oblock_t dblock, bool discard)
+                         dm_dblock_t dblock, bool discard)
 {
         int r;
 
···
         dm_block_t b;
         bool discard;
 
-        for (b = 0; b < from_oblock(cmd->discard_nr_blocks); b++) {
-                dm_oblock_t dblock = to_oblock(b);
+        for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
+                dm_dblock_t dblock = to_dblock(b);
 
                 if (cmd->clean_when_opened) {
                         r = __is_discarded(cmd, dblock, &discard);
drivers/md/dm-cache-metadata.h (+3 -3)

···
 
 int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
 				   sector_t discard_block_size,
-				   dm_oblock_t new_nr_entries);
+				   dm_dblock_t new_nr_entries);
 
 typedef int (*load_discard_fn)(void *context, sector_t discard_block_size,
-			       dm_oblock_t dblock, bool discarded);
+			       dm_dblock_t dblock, bool discarded);
 int dm_cache_load_discards(struct dm_cache_metadata *cmd,
 			   load_discard_fn fn, void *context);
 
-int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_oblock_t dblock, bool discard);
+int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard);
 
 int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock);
 int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock);
drivers/md/dm-cache-policy-mq.c (+55 -29)

···
  * Gives us the oldest entry of the lowest popoulated level. If the first
  * level is emptied then we shift down one level.
  */
-static struct list_head *queue_pop(struct queue *q)
+static struct list_head *queue_peek(struct queue *q)
 {
 	unsigned level;
-	struct list_head *r;
 
 	for (level = 0; level < NR_QUEUE_LEVELS; level++)
-		if (!list_empty(q->qs + level)) {
-			r = q->qs[level].next;
-			list_del(r);
-
-			/* have we just emptied the bottom level? */
-			if (level == 0 && list_empty(q->qs))
-				queue_shift_down(q);
-
-			return r;
-		}
+		if (!list_empty(q->qs + level))
+			return q->qs[level].next;
 
 	return NULL;
+}
+
+static struct list_head *queue_pop(struct queue *q)
+{
+	struct list_head *r = queue_peek(q);
+
+	if (r) {
+		list_del(r);
+
+		/* have we just emptied the bottom level? */
+		if (list_empty(q->qs))
+			queue_shift_down(q);
+	}
+
+	return r;
 }
 
 static struct list_head *list_pop(struct list_head *lh)
···
 	unsigned generation;
 	unsigned generation_period; /* in lookups (will probably change) */
 
-	/*
-	 * Entries in the pre_cache whose hit count passes the promotion
-	 * threshold move to the cache proper.  Working out the correct
-	 * value for the promotion_threshold is crucial to this policy.
-	 */
-	unsigned promote_threshold;
-
 	unsigned discard_promote_adjustment;
 	unsigned read_promote_adjustment;
 	unsigned write_promote_adjustment;
···
 #define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
 #define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
 #define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
+#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128
 
 /*----------------------------------------------------------------*/
···
 	return e;
 }
 
+static struct entry *peek(struct queue *q)
+{
+	struct list_head *h = queue_peek(q);
+	return h ? container_of(h, struct entry, list) : NULL;
+}
+
 /*
  * Has this entry already been updated?
  */
···
 			break;
 		}
 	}
-
-	mq->promote_threshold = nr ? total / nr : 1;
-	if (mq->promote_threshold * nr < total)
-		mq->promote_threshold++;
 	}
 }
···
 }
 
 /*
+ * Entries in the pre_cache whose hit count passes the promotion
+ * threshold move to the cache proper.  Working out the correct
+ * value for the promotion_threshold is crucial to this policy.
+ */
+static unsigned promote_threshold(struct mq_policy *mq)
+{
+	struct entry *e;
+
+	if (any_free_cblocks(mq))
+		return 0;
+
+	e = peek(&mq->cache_clean);
+	if (e)
+		return e->hit_count;
+
+	e = peek(&mq->cache_dirty);
+	if (e)
+		return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD;
+
+	/* This should never happen */
+	return 0;
+}
+
+/*
  * We modify the basic promotion_threshold depending on the specific io.
  *
  * If the origin block has been discarded then there's no cost to copy it
···
 			      bool discarded_oblock, int data_dir)
 {
 	if (data_dir == READ)
-		return mq->promote_threshold + mq->read_promote_adjustment;
+		return promote_threshold(mq) + mq->read_promote_adjustment;
 
 	if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
 		/*
···
 		return mq->discard_promote_adjustment;
 	}
 
-	return mq->promote_threshold + mq->write_promote_adjustment;
+	return promote_threshold(mq) + mq->write_promote_adjustment;
 }
 
 static bool should_promote(struct mq_policy *mq, struct entry *e,
···
 	if (e && in_cache(mq, e))
 		r = cache_entry_found(mq, e, result);
 
-	else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
+	else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] &&
+		 iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
 		result->op = POLICY_MISS;
 
 	else if (e)
···
 	mq->tick = 0;
 	mq->hit_count = 0;
 	mq->generation = 0;
-	mq->promote_threshold = 0;
 	mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
 	mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
 	mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
···
 
 static struct dm_cache_policy_type mq_policy_type = {
 	.name = "mq",
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.hint_size = 4,
 	.owner = THIS_MODULE,
 	.create = mq_create
···
 
 static struct dm_cache_policy_type default_policy_type = {
 	.name = "default",
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.hint_size = 4,
 	.owner = THIS_MODULE,
 	.create = mq_create,
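The shape of the new `promote_threshold()` is worth spelling out: with free cache blocks the threshold is zero (promote freely); otherwise the bar is set by the cheapest eviction victim, the head of the clean queue, and falling back to a dirty victim adds a fixed penalty because demoting it forces a writeback. A userspace sketch of that decision (hypothetical simplified types, not the kernel code):

```c
#include <assert.h>
#include <stddef.h>

/* Hypothetical stand-in for the policy's per-block bookkeeping. */
struct entry {
	unsigned hit_count;
};

#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128

/*
 * Mirrors the logic of the new promote_threshold(): free space means
 * promote immediately; otherwise the oldest clean entry sets the bar,
 * and a dirty victim adds a penalty to discourage demoting dirty
 * blocks (which would cost a writeback first).
 */
static unsigned model_promote_threshold(int nr_free_cblocks,
					const struct entry *clean_head,
					const struct entry *dirty_head)
{
	if (nr_free_cblocks > 0)
		return 0;

	if (clean_head)
		return clean_head->hit_count;

	if (dirty_head)
		return dirty_head->hit_count +
		       DISCOURAGE_DEMOTING_DIRTY_THRESHOLD;

	/* Should not happen: a full cache has clean or dirty entries. */
	return 0;
}
```

Compared with the old scheme, which recomputed an average-based `mq->promote_threshold` during cache sweeps, this derives the threshold on demand from the actual eviction candidates, so it tracks the workload with no stale state to maintain.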
drivers/md/dm-cache-target.c (+303 -79)

···
 
 /*----------------------------------------------------------------*/
 
-#define PRISON_CELLS 1024
 #define MIGRATION_POOL_SIZE 128
 #define COMMIT_PERIOD HZ
 #define MIGRATION_COUNT_WINDOW 10
···
 	/*
 	 * origin_blocks entries, discarded if set.
 	 */
-	dm_oblock_t discard_nr_blocks;
+	dm_dblock_t discard_nr_blocks;
 	unsigned long *discard_bitset;
+	uint32_t discard_block_size; /* a power of 2 times sectors per block */
 
 	/*
 	 * Rather than reconstructing the table line for the status we just
···
 	dm_cblock_t cblock;
 
 	bool err:1;
+	bool discard:1;
 	bool writeback:1;
 	bool demote:1;
 	bool promote:1;
···
 
 /*----------------------------------------------------------------*/
 
-static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
+static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
 {
 	key->virtual = 0;
 	key->dev = 0;
-	key->block = from_oblock(oblock);
+	key->block_begin = from_oblock(begin);
+	key->block_end = from_oblock(end);
 }
 
 /*
···
  */
 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
 
-static int bio_detain(struct cache *cache, dm_oblock_t oblock,
-		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
-		      cell_free_fn free_fn, void *free_context,
-		      struct dm_bio_prison_cell **cell_result)
+static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
+			    struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
+			    cell_free_fn free_fn, void *free_context,
+			    struct dm_bio_prison_cell **cell_result)
 {
 	int r;
 	struct dm_cell_key key;
 
-	build_key(oblock, &key);
+	build_key(oblock_begin, oblock_end, &key);
 	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
 	if (r)
 		free_fn(free_context, cell_prealloc);
 
 	return r;
+}
+
+static int bio_detain(struct cache *cache, dm_oblock_t oblock,
+		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
+		      cell_free_fn free_fn, void *free_context,
+		      struct dm_bio_prison_cell **cell_result)
+{
+	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
+	return bio_detain_range(cache, oblock, end, bio,
+				cell_prealloc, free_fn, free_context, cell_result);
 }
 
 static int get_cell(struct cache *cache,
···
 
 	cell_prealloc = prealloc_get_cell(structs);
 
-	build_key(oblock, &key);
+	build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
 	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
 	if (r)
 		prealloc_put_cell(structs, cell_prealloc);
···
 	return b;
 }
 
-static void set_discard(struct cache *cache, dm_oblock_t b)
+static dm_block_t oblocks_per_dblock(struct cache *cache)
+{
+	dm_block_t oblocks = cache->discard_block_size;
+
+	if (block_size_is_power_of_two(cache))
+		oblocks >>= cache->sectors_per_block_shift;
+	else
+		oblocks = block_div(oblocks, cache->sectors_per_block);
+
+	return oblocks;
+}
+
+static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
+{
+	return to_dblock(block_div(from_oblock(oblock),
+				   oblocks_per_dblock(cache)));
+}
+
+static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
+{
+	return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
+}
+
+static void set_discard(struct cache *cache, dm_dblock_t b)
 {
 	unsigned long flags;
 
+	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
 	atomic_inc(&cache->stats.discard_count);
 
 	spin_lock_irqsave(&cache->lock, flags);
-	set_bit(from_oblock(b), cache->discard_bitset);
+	set_bit(from_dblock(b), cache->discard_bitset);
 	spin_unlock_irqrestore(&cache->lock, flags);
 }
 
-static void clear_discard(struct cache *cache, dm_oblock_t b)
+static void clear_discard(struct cache *cache, dm_dblock_t b)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&cache->lock, flags);
-	clear_bit(from_oblock(b), cache->discard_bitset);
+	clear_bit(from_dblock(b), cache->discard_bitset);
 	spin_unlock_irqrestore(&cache->lock, flags);
 }
 
-static bool is_discarded(struct cache *cache, dm_oblock_t b)
+static bool is_discarded(struct cache *cache, dm_dblock_t b)
 {
 	int r;
 	unsigned long flags;
 
 	spin_lock_irqsave(&cache->lock, flags);
-	r = test_bit(from_oblock(b), cache->discard_bitset);
+	r = test_bit(from_dblock(b), cache->discard_bitset);
 	spin_unlock_irqrestore(&cache->lock, flags);
 
 	return r;
···
 	unsigned long flags;
 
 	spin_lock_irqsave(&cache->lock, flags);
-	r = test_bit(from_oblock(b), cache->discard_bitset);
+	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
+		     cache->discard_bitset);
 	spin_unlock_irqrestore(&cache->lock, flags);
 
 	return r;
···
 	check_if_tick_bio_needed(cache, bio);
 	remap_to_origin(cache, bio);
 	if (bio_data_dir(bio) == WRITE)
-		clear_discard(cache, oblock);
+		clear_discard(cache, oblock_to_dblock(cache, oblock));
 }
 
 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
···
 	remap_to_cache(cache, bio, cblock);
 	if (bio_data_dir(bio) == WRITE) {
 		set_dirty(cache, oblock, cblock);
-		clear_discard(cache, oblock);
+		clear_discard(cache, oblock_to_dblock(cache, oblock));
 	}
 }
···
 	}
 
 	} else {
-		clear_dirty(cache, mg->new_oblock, mg->cblock);
-		if (mg->requeue_holder)
+		if (mg->requeue_holder) {
+			clear_dirty(cache, mg->new_oblock, mg->cblock);
 			cell_defer(cache, mg->new_ocell, true);
-		else {
+		} else {
+			/*
+			 * The block was promoted via an overwrite, so it's dirty.
+			 */
+			set_dirty(cache, mg->new_oblock, mg->cblock);
 			bio_endio(mg->new_ocell->holder, 0);
 			cell_defer(cache, mg->new_ocell, false);
 		}
···
 	wake_worker(cache);
 }
 
-static void issue_copy_real(struct dm_cache_migration *mg)
+static void issue_copy(struct dm_cache_migration *mg)
 {
 	int r;
 	struct dm_io_region o_region, c_region;
···
 	migration_success_pre_commit(mg);
 }
 
-static void issue_copy(struct dm_cache_migration *mg)
+static void calc_discard_block_range(struct cache *cache, struct bio *bio,
+				     dm_dblock_t *b, dm_dblock_t *e)
+{
+	sector_t sb = bio->bi_iter.bi_sector;
+	sector_t se = bio_end_sector(bio);
+
+	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
+
+	if (se - sb < cache->discard_block_size)
+		*e = *b;
+	else
+		*e = to_dblock(block_div(se, cache->discard_block_size));
+}
+
+static void issue_discard(struct dm_cache_migration *mg)
+{
+	dm_dblock_t b, e;
+	struct bio *bio = mg->new_ocell->holder;
+
+	calc_discard_block_range(mg->cache, bio, &b, &e);
+	while (b != e) {
+		set_discard(mg->cache, b);
+		b = to_dblock(from_dblock(b) + 1);
+	}
+
+	bio_endio(bio, 0);
+	cell_defer(mg->cache, mg->new_ocell, false);
+	free_migration(mg);
+}
+
+static void issue_copy_or_discard(struct dm_cache_migration *mg)
 {
 	bool avoid;
 	struct cache *cache = mg->cache;
+
+	if (mg->discard) {
+		issue_discard(mg);
+		return;
+	}
 
 	if (mg->writeback || mg->demote)
 		avoid = !is_dirty(cache, mg->cblock) ||
···
 
 		avoid = is_discarded_oblock(cache, mg->new_oblock);
 
-		if (!avoid && bio_writes_complete_block(cache, bio)) {
+		if (writeback_mode(&cache->features) &&
+		    !avoid && bio_writes_complete_block(cache, bio)) {
 			issue_overwrite(mg, bio);
 			return;
 		}
 	}
 
-	avoid ? avoid_copy(mg) : issue_copy_real(mg);
+	avoid ? avoid_copy(mg) : issue_copy(mg);
 }
 
 static void complete_migration(struct dm_cache_migration *mg)
···
 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
 
 	mg->err = false;
+	mg->discard = false;
 	mg->writeback = false;
 	mg->demote = false;
 	mg->promote = true;
···
 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
 
 	mg->err = false;
+	mg->discard = false;
 	mg->writeback = true;
 	mg->demote = false;
 	mg->promote = false;
···
 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
 
 	mg->err = false;
+	mg->discard = false;
 	mg->writeback = false;
 	mg->demote = true;
 	mg->promote = true;
···
 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
 
 	mg->err = false;
+	mg->discard = false;
 	mg->writeback = false;
 	mg->demote = true;
 	mg->promote = false;
···
 	mg->start_jiffies = jiffies;
 
 	inc_nr_migrations(cache);
+	quiesce_migration(mg);
+}
+
+static void discard(struct cache *cache, struct prealloc *structs,
+		    struct dm_bio_prison_cell *cell)
+{
+	struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+	mg->err = false;
+	mg->discard = true;
+	mg->writeback = false;
+	mg->demote = false;
+	mg->promote = false;
+	mg->requeue_holder = false;
+	mg->invalidate = false;
+	mg->cache = cache;
+	mg->old_ocell = NULL;
+	mg->new_ocell = cell;
+	mg->start_jiffies = jiffies;
+
 	quiesce_migration(mg);
 }
···
 	issue(cache, bio);
 }
 
-/*
- * People generally discard large parts of a device, eg, the whole device
- * when formatting.  Splitting these large discards up into cache block
- * sized ios and then quiescing (always neccessary for discard) takes too
- * long.
- *
- * We keep it simple, and allow any size of discard to come in, and just
- * mark off blocks on the discard bitset.  No passdown occurs!
- *
- * To implement passdown we need to change the bio_prison such that a cell
- * can have a key that spans many blocks.
- */
-static void process_discard_bio(struct cache *cache, struct bio *bio)
+static void process_discard_bio(struct cache *cache, struct prealloc *structs,
+				struct bio *bio)
 {
-	dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector,
-						  cache->sectors_per_block);
-	dm_block_t end_block = bio_end_sector(bio);
-	dm_block_t b;
+	int r;
+	dm_dblock_t b, e;
+	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
 
-	end_block = block_div(end_block, cache->sectors_per_block);
+	calc_discard_block_range(cache, bio, &b, &e);
+	if (b == e) {
+		bio_endio(bio, 0);
+		return;
+	}
 
-	for (b = start_block; b < end_block; b++)
-		set_discard(cache, to_oblock(b));
+	cell_prealloc = prealloc_get_cell(structs);
+	r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
+			     (cell_free_fn) prealloc_put_cell,
+			     structs, &new_ocell);
+	if (r > 0)
+		return;
 
-	bio_endio(bio, 0);
+	discard(cache, structs, new_ocell);
 }
 
 static bool spare_migration_bandwidth(struct cache *cache)
···
 	dm_oblock_t block = get_bio_block(cache, bio);
 	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
 	struct policy_result lookup_result;
-	bool discarded_block = is_discarded_oblock(cache, block);
 	bool passthrough = passthrough_mode(&cache->features);
-	bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
+	bool discarded_block, can_migrate;
 
 	/*
 	 * Check to see if that block is currently migrating.
···
 			       structs, &new_ocell);
 	if (r > 0)
 		return;
+
+	discarded_block = is_discarded_oblock(cache, block);
+	can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
 
 	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
 		       bio, &lookup_result);
···
 		if (bio->bi_rw & REQ_FLUSH)
 			process_flush_bio(cache, bio);
 		else if (bio->bi_rw & REQ_DISCARD)
-			process_discard_bio(cache, bio);
+			process_discard_bio(cache, &structs, bio);
 		else
 			process_bio(cache, &structs, bio);
 	}
···
 		process_invalidation_requests(cache);
 	}
 
-	process_migrations(cache, &cache->quiesced_migrations, issue_copy);
+	process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
 	process_migrations(cache, &cache->completed_migrations, complete_migration);
 
 	if (commit_if_needed(cache)) {
···
 	return 0;
 }
 
+/*
+ * We want the discard block size to be at least the size of the cache
+ * block size and have no more than 2^14 discard blocks across the origin.
+ */
+#define MAX_DISCARD_BLOCKS (1 << 14)
+
+static bool too_many_discard_blocks(sector_t discard_block_size,
+				    sector_t origin_size)
+{
+	(void) sector_div(origin_size, discard_block_size);
+
+	return origin_size > MAX_DISCARD_BLOCKS;
+}
+
+static sector_t calculate_discard_block_size(sector_t cache_block_size,
+					     sector_t origin_size)
+{
+	sector_t discard_block_size = cache_block_size;
+
+	if (origin_size)
+		while (too_many_discard_blocks(discard_block_size, origin_size))
+			discard_block_size *= 2;
+
+	return discard_block_size;
+}
+
+static void set_cache_size(struct cache *cache, dm_cblock_t size)
+{
+	dm_block_t nr_blocks = from_cblock(size);
+
+	if (nr_blocks > (1 << 20) && cache->cache_size != size)
+		DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
+			     "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
+			     "Please consider increasing the cache block size to reduce the overall cache block count.",
+			     (unsigned long long) nr_blocks);
+
+	cache->cache_size = size;
+}
+
 #define DEFAULT_MIGRATION_THRESHOLD 2048
 
 static int cache_create(struct cache_args *ca, struct cache **result)
···
 	ti->num_discard_bios = 1;
 	ti->discards_supported = true;
 	ti->discard_zeroes_data_unsupported = true;
-	/* Discard bios must be split on a block boundary */
-	ti->split_discard_bios = true;
+	ti->split_discard_bios = false;
 
 	cache->features = ca->features;
 	ti->per_bio_data_size = get_per_bio_data_size(cache);
···
 
 		cache->sectors_per_block_shift = -1;
 		cache_size = block_div(cache_size, ca->block_size);
-		cache->cache_size = to_cblock(cache_size);
+		set_cache_size(cache, to_cblock(cache_size));
 	} else {
 		cache->sectors_per_block_shift = __ffs(ca->block_size);
-		cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
+		set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
 	}
 
 	r = create_cache_policy(cache, ca, error);
···
 	}
 	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
 
-	cache->discard_nr_blocks = cache->origin_blocks;
-	cache->discard_bitset = alloc_bitset(from_oblock(cache->discard_nr_blocks));
+	cache->discard_block_size =
+		calculate_discard_block_size(cache->sectors_per_block,
+					     cache->origin_sectors);
+	cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
+							      cache->discard_block_size));
+	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
 	if (!cache->discard_bitset) {
 		*error = "could not allocate discard bitset";
 		goto bad;
 	}
-	clear_bitset(cache->discard_bitset, from_oblock(cache->discard_nr_blocks));
+	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
 
 	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
 	if (IS_ERR(cache->copier)) {
···
 	INIT_DELAYED_WORK(&cache->waker, do_waker);
 	cache->last_commit_jiffies = jiffies;
 
-	cache->prison = dm_bio_prison_create(PRISON_CELLS);
+	cache->prison = dm_bio_prison_create();
 	if (!cache->prison) {
 		*error = "could not create bio prison";
 		goto bad;
···
 static int cache_map(struct dm_target *ti, struct bio *bio)
 {
 	int r;
-	struct dm_bio_prison_cell *cell;
+	struct dm_bio_prison_cell *cell = NULL;
 	struct cache *cache = ti->private;
 
 	r = __cache_map(cache, bio, &cell);
-	if (r == DM_MAPIO_REMAPPED) {
+	if (r == DM_MAPIO_REMAPPED && cell) {
 		inc_ds(cache, bio, cell);
 		cell_defer(cache, cell, false);
 	}
···
 {
 	unsigned i, r;
 
-	r = dm_cache_discard_bitset_resize(cache->cmd, cache->sectors_per_block,
-					   cache->origin_blocks);
+	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
+					   cache->discard_nr_blocks);
 	if (r) {
 		DMERR("could not resize on-disk discard bitset");
 		return r;
 	}
 
-	for (i = 0; i < from_oblock(cache->discard_nr_blocks); i++) {
-		r = dm_cache_set_discard(cache->cmd, to_oblock(i),
-					 is_discarded(cache, to_oblock(i)));
+	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
+		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
+					 is_discarded(cache, to_dblock(i)));
 		if (r)
 			return r;
 	}
···
 	return 0;
 }
 
-static int load_discard(void *context, sector_t discard_block_size,
-			dm_oblock_t oblock, bool discard)
-{
-	struct cache *cache = context;
+/*
+ * The discard block size in the on disk metadata is not
+ * neccessarily the same as we're currently using.  So we have to
+ * be careful to only set the discarded attribute if we know it
+ * covers a complete block of the new size.
+ */
+struct discard_load_info {
+	struct cache *cache;
 
-	if (discard)
-		set_discard(cache, oblock);
-	else
-		clear_discard(cache, oblock);
+	/*
+	 * These blocks are sized using the on disk dblock size, rather
+	 * than the current one.
+	 */
+	dm_block_t block_size;
+	dm_block_t discard_begin, discard_end;
+};
+
+static void discard_load_info_init(struct cache *cache,
+				   struct discard_load_info *li)
+{
+	li->cache = cache;
+	li->discard_begin = li->discard_end = 0;
+}
+
+static void set_discard_range(struct discard_load_info *li)
+{
+	sector_t b, e;
+
+	if (li->discard_begin == li->discard_end)
+		return;
+
+	/*
+	 * Convert to sectors.
+	 */
+	b = li->discard_begin * li->block_size;
+	e = li->discard_end * li->block_size;
+
+	/*
+	 * Then convert back to the current dblock size.
+	 */
+	b = dm_sector_div_up(b, li->cache->discard_block_size);
+	sector_div(e, li->cache->discard_block_size);
+
+	/*
+	 * The origin may have shrunk, so we need to check we're still in
+	 * bounds.
+	 */
+	if (e > from_dblock(li->cache->discard_nr_blocks))
+		e = from_dblock(li->cache->discard_nr_blocks);
+
+	for (; b < e; b++)
+		set_discard(li->cache, to_dblock(b));
+}
+
+static int load_discard(void *context, sector_t discard_block_size,
+			dm_dblock_t dblock, bool discard)
+{
+	struct discard_load_info *li = context;
+
+	li->block_size = discard_block_size;
+
+	if (discard) {
+		if (from_dblock(dblock) == li->discard_end)
+			/*
+			 * We're already in a discard range, just extend it.
+			 */
+			li->discard_end = li->discard_end + 1ULL;
+
+		else {
+			/*
+			 * Emit the old range and start a new one.
+			 */
+			set_discard_range(li);
+			li->discard_begin = from_dblock(dblock);
+			li->discard_end = li->discard_begin + 1ULL;
+		}
+	} else {
+		set_discard_range(li);
+		li->discard_begin = li->discard_end = 0;
+	}
 
 	return 0;
 }
···
 		return r;
 	}
 
-	cache->cache_size = new_size;
+	set_cache_size(cache, new_size);
 
 	return 0;
 }
···
 	}
 
 	if (!cache->loaded_discards) {
-		r = dm_cache_load_discards(cache->cmd, load_discard, cache);
+		struct discard_load_info li;
+
+		/*
+		 * The discard bitset could have been resized, or the
+		 * discard block size changed.  To be safe we start by
+		 * setting every dblock to not discarded.
+		 */
+		clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
+
+		discard_load_info_init(cache, &li);
+		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
 		if (r) {
 			DMERR("could not load origin discards");
 			return r;
 		}
+		set_discard_range(&li);
 
 		cache->loaded_discards = true;
 	}
···
 	/*
 	 * FIXME: these limits may be incompatible with the cache device
 	 */
-	limits->max_discard_sectors = cache->sectors_per_block;
-	limits->discard_granularity = cache->sectors_per_block << SECTOR_SHIFT;
+	limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
+					    cache->origin_sectors);
+	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
 }
 
 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
···
 
 static struct target_type cache_target = {
 	.name = "cache",
-	.version = {1, 5, 0},
+	.version = {1, 6, 0},
 	.module = THIS_MODULE,
 	.ctr = cache_ctr,
 	.dtr = cache_dtr,
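The sizing rule in the comment above (discard blocks at least one cache block, at most 2^14 of them across the origin) reduces to doubling the discard block size until the origin fits. A userspace model of `calculate_discard_block_size()` (with the kernel's `sector_div` replaced by plain division for illustration):

```c
#include <assert.h>

typedef unsigned long long sector_t;

/* At most 2^14 discard blocks across the origin device. */
#define MAX_DISCARD_BLOCKS (1 << 14)

static int too_many_discard_blocks(sector_t discard_block_size,
				   sector_t origin_size)
{
	return origin_size / discard_block_size > MAX_DISCARD_BLOCKS;
}

/*
 * Start at the cache block size and keep doubling until the whole
 * origin is covered by no more than MAX_DISCARD_BLOCKS blocks.
 */
static sector_t model_discard_block_size(sector_t cache_block_size,
					 sector_t origin_size)
{
	sector_t discard_block_size = cache_block_size;

	if (origin_size)
		while (too_many_discard_blocks(discard_block_size, origin_size))
			discard_block_size *= 2;

	return discard_block_size;
}
```

For example, a 64-sector cache block over a 2^24-sector origin needs four doublings (2^24 / 64 = 2^18 blocks) to land on a 1024-sector discard block, which is exactly 2^14 blocks. Keeping the discard bitset this small is what lets a whole-device `mkfs.xfs` discard be processed as a handful of multiblock ranges instead of millions of per-cache-block bits.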
drivers/md/dm-crypt.c (+1 -1)

···
 	for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
 		crypto_xor(data + i * 8, buf, 8);
 out:
-	memset(buf, 0, sizeof(buf));
+	memzero_explicit(buf, sizeof(buf));
 	return r;
 }
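The reason for this one-liner: a `memset` of a stack buffer just before it goes out of scope is a dead store the compiler may legally delete, leaving key material on the stack. `memzero_explicit` inserts a compiler barrier so the clear cannot be elided. A userspace analogue of the idea (a sketch using a GCC/Clang-style asm barrier, not the kernel's implementation):

```c
#include <string.h>

/*
 * Userspace sketch of memzero_explicit(): the empty asm statement
 * tells the compiler the zeroed memory is "observed", so dead-store
 * elimination cannot remove the memset even if the buffer is never
 * read again.
 */
static void memzero_explicit_demo(void *s, size_t count)
{
	memset(s, 0, count);
	__asm__ __volatile__("" : : "r"(s) : "memory");
}
```

The functional result is identical to `memset`; only the optimizer's freedom to discard it changes, which is why this class of fix shows up as a "potential information leak" hardening rather than a behavioral bug.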
drivers/md/dm-ioctl.c (+4 -1)

···
 	int srcu_idx;
 
 	param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
-			  DM_ACTIVE_PRESENT_FLAG);
+			  DM_ACTIVE_PRESENT_FLAG | DM_INTERNAL_SUSPEND_FLAG);
 
 	if (dm_suspended_md(md))
 		param->flags |= DM_SUSPEND_FLAG;
+
+	if (dm_suspended_internally_md(md))
+		param->flags |= DM_INTERNAL_SUSPEND_FLAG;
 
 	if (dm_test_deferred_remove_flag(md))
 		param->flags |= DM_DEFERRED_REMOVE;
drivers/md/dm-stats.c (+1 -1)

···
 		return 1;
 
 	id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
-			     dm_internal_suspend, dm_internal_resume, md);
+			     dm_internal_suspend_fast, dm_internal_resume_fast, md);
 	if (id < 0)
 		return id;
drivers/md/dm-table.c (+29 -7)

···
 }
 EXPORT_SYMBOL(dm_table_get_mode);
 
-static void suspend_targets(struct dm_table *t, unsigned postsuspend)
+enum suspend_mode {
+	PRESUSPEND,
+	PRESUSPEND_UNDO,
+	POSTSUSPEND,
+};
+
+static void suspend_targets(struct dm_table *t, enum suspend_mode mode)
 {
 	int i = t->num_targets;
 	struct dm_target *ti = t->targets;
 
 	while (i--) {
-		if (postsuspend) {
+		switch (mode) {
+		case PRESUSPEND:
+			if (ti->type->presuspend)
+				ti->type->presuspend(ti);
+			break;
+		case PRESUSPEND_UNDO:
+			if (ti->type->presuspend_undo)
+				ti->type->presuspend_undo(ti);
+			break;
+		case POSTSUSPEND:
 			if (ti->type->postsuspend)
 				ti->type->postsuspend(ti);
-		} else if (ti->type->presuspend)
-			ti->type->presuspend(ti);
-
+			break;
+		}
 		ti++;
 	}
 }
···
 	if (!t)
 		return;
 
-	suspend_targets(t, 0);
+	suspend_targets(t, PRESUSPEND);
+}
+
+void dm_table_presuspend_undo_targets(struct dm_table *t)
+{
+	if (!t)
+		return;
+
+	suspend_targets(t, PRESUSPEND_UNDO);
 }
 
 void dm_table_postsuspend_targets(struct dm_table *t)
···
 	if (!t)
 		return;
 
-	suspend_targets(t, 1);
+	suspend_targets(t, POSTSUSPEND);
 }
 
 int dm_table_resume_targets(struct dm_table *t)
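This change replaces a boolean `postsuspend` flag with a three-value mode so a target can undo its presuspend work when a suspend fails partway, which the new internal suspend/resume path for dm-thin relies on. A miniature sketch of the dispatch (hypothetical hook types, not the kernel structs):

```c
#include <stddef.h>

/* Hypothetical miniature of the new three-way dispatch: every hook is
 * optional, and PRESUSPEND_UNDO lets a failed suspend roll back. */
enum suspend_mode { PRESUSPEND, PRESUSPEND_UNDO, POSTSUSPEND };

struct target_hooks {
	void (*presuspend)(int *log);
	void (*presuspend_undo)(int *log);
	void (*postsuspend)(int *log);
};

static void suspend_target(const struct target_hooks *t,
			   enum suspend_mode mode, int *log)
{
	switch (mode) {
	case PRESUSPEND:
		if (t->presuspend)
			t->presuspend(log);
		break;
	case PRESUSPEND_UNDO:
		if (t->presuspend_undo)
			t->presuspend_undo(log);
		break;
	case POSTSUSPEND:
		if (t->postsuspend)
			t->postsuspend(log);
		break;
	}
}

/* Demo hooks that record which phases ran. */
static void mark_pre(int *log)  { *log |= 1; }
static void mark_undo(int *log) { *log |= 2; }
```

An enum plus switch is the natural shape here: adding a fourth phase later means one new case, and NULL hooks are skipped uniformly instead of being special-cased per branch.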
+19 -18
drivers/md/dm-thin-metadata.c
··· 1384 1384 } 1385 1385 1386 1386 int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, 1387 - int can_block, struct dm_thin_lookup_result *result) 1387 + int can_issue_io, struct dm_thin_lookup_result *result) 1388 1388 { 1389 - int r = -EINVAL; 1390 - uint64_t block_time = 0; 1389 + int r; 1391 1390 __le64 value; 1392 1391 struct dm_pool_metadata *pmd = td->pmd; 1393 1392 dm_block_t keys[2] = { td->id, block }; 1394 1393 struct dm_btree_info *info; 1395 1394 1396 - if (can_block) { 1397 - down_read(&pmd->root_lock); 1398 - info = &pmd->info; 1399 - } else if (down_read_trylock(&pmd->root_lock)) 1400 - info = &pmd->nb_info; 1401 - else 1402 - return -EWOULDBLOCK; 1403 - 1404 1395 if (pmd->fail_io) 1405 - goto out; 1396 + return -EINVAL; 1397 + 1398 + down_read(&pmd->root_lock); 1399 + 1400 + if (can_issue_io) { 1401 + info = &pmd->info; 1402 + } else 1403 + info = &pmd->nb_info; 1406 1404 1407 1405 r = dm_btree_lookup(info, pmd->root, keys, &value); 1408 - if (!r) 1409 - block_time = le64_to_cpu(value); 1410 - 1411 - out: 1412 - up_read(&pmd->root_lock); 1413 - 1414 1406 if (!r) { 1407 + uint64_t block_time = 0; 1415 1408 dm_block_t exception_block; 1416 1409 uint32_t exception_time; 1410 + 1411 + block_time = le64_to_cpu(value); 1417 1412 unpack_block_time(block_time, &exception_block, 1418 1413 &exception_time); 1419 1414 result->block = exception_block; 1420 1415 result->shared = __snapshotted_since(td, exception_time); 1421 1416 } 1422 1417 1418 + up_read(&pmd->root_lock); 1423 1419 return r; 1424 1420 } 1425 1421 ··· 1808 1812 up_read(&pmd->root_lock); 1809 1813 1810 1814 return needs_check; 1815 + } 1816 + 1817 + void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd) 1818 + { 1819 + dm_tm_issue_prefetches(pmd->tm); 1811 1820 }
+7 -2
drivers/md/dm-thin-metadata.h
··· 139 139 140 140 /* 141 141 * Returns: 142 - * -EWOULDBLOCK iff @can_block is set and would block. 142 + * -EWOULDBLOCK iff @can_issue_io is set and would issue IO 143 143 * -ENODATA iff that mapping is not present. 144 144 * 0 success 145 145 */ 146 146 int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, 147 - int can_block, struct dm_thin_lookup_result *result); 147 + int can_issue_io, struct dm_thin_lookup_result *result); 148 148 149 149 /* 150 150 * Obtain an unused block. ··· 212 212 */ 213 213 int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd); 214 214 bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); 215 + 216 + /* 217 + * Issue any prefetches that may be useful. 218 + */ 219 + void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd); 215 220 216 221 /*----------------------------------------------------------------*/ 217 222
+612 -158
drivers/md/dm-thin.c
··· 11 11 #include <linux/device-mapper.h> 12 12 #include <linux/dm-io.h> 13 13 #include <linux/dm-kcopyd.h> 14 + #include <linux/log2.h> 14 15 #include <linux/list.h> 15 16 #include <linux/rculist.h> 16 17 #include <linux/init.h> 17 18 #include <linux/module.h> 18 19 #include <linux/slab.h> 20 + #include <linux/sort.h> 19 21 #include <linux/rbtree.h> 20 22 21 23 #define DM_MSG_PREFIX "thin" ··· 27 25 */ 28 26 #define ENDIO_HOOK_POOL_SIZE 1024 29 27 #define MAPPING_POOL_SIZE 1024 30 - #define PRISON_CELLS 1024 31 28 #define COMMIT_PERIOD HZ 32 29 #define NO_SPACE_TIMEOUT_SECS 60 33 30 ··· 115 114 { 116 115 key->virtual = 0; 117 116 key->dev = dm_thin_dev_id(td); 118 - key->block = b; 117 + key->block_begin = b; 118 + key->block_end = b + 1ULL; 119 119 } 120 120 121 121 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, ··· 124 122 { 125 123 key->virtual = 1; 126 124 key->dev = dm_thin_dev_id(td); 127 - key->block = b; 125 + key->block_begin = b; 126 + key->block_end = b + 1ULL; 127 + } 128 + 129 + /*----------------------------------------------------------------*/ 130 + 131 + #define THROTTLE_THRESHOLD (1 * HZ) 132 + 133 + struct throttle { 134 + struct rw_semaphore lock; 135 + unsigned long threshold; 136 + bool throttle_applied; 137 + }; 138 + 139 + static void throttle_init(struct throttle *t) 140 + { 141 + init_rwsem(&t->lock); 142 + t->throttle_applied = false; 143 + } 144 + 145 + static void throttle_work_start(struct throttle *t) 146 + { 147 + t->threshold = jiffies + THROTTLE_THRESHOLD; 148 + } 149 + 150 + static void throttle_work_update(struct throttle *t) 151 + { 152 + if (!t->throttle_applied && jiffies > t->threshold) { 153 + down_write(&t->lock); 154 + t->throttle_applied = true; 155 + } 156 + } 157 + 158 + static void throttle_work_complete(struct throttle *t) 159 + { 160 + if (t->throttle_applied) { 161 + t->throttle_applied = false; 162 + up_write(&t->lock); 163 + } 164 + } 165 + 166 + static void throttle_lock(struct throttle 
*t) 167 + { 168 + down_read(&t->lock); 169 + } 170 + 171 + static void throttle_unlock(struct throttle *t) 172 + { 173 + up_read(&t->lock); 128 174 } 129 175 130 176 /*----------------------------------------------------------------*/ ··· 205 155 206 156 struct thin_c; 207 157 typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); 158 + typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell); 208 159 typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); 160 + 161 + #define CELL_SORT_ARRAY_SIZE 8192 209 162 210 163 struct pool { 211 164 struct list_head list; ··· 224 171 225 172 struct pool_features pf; 226 173 bool low_water_triggered:1; /* A dm event has been sent */ 174 + bool suspended:1; 227 175 228 176 struct dm_bio_prison *prison; 229 177 struct dm_kcopyd_client *copier; 230 178 231 179 struct workqueue_struct *wq; 180 + struct throttle throttle; 232 181 struct work_struct worker; 233 182 struct delayed_work waker; 234 183 struct delayed_work no_space_timeout; ··· 253 198 process_bio_fn process_bio; 254 199 process_bio_fn process_discard; 255 200 201 + process_cell_fn process_cell; 202 + process_cell_fn process_discard_cell; 203 + 256 204 process_mapping_fn process_prepared_mapping; 257 205 process_mapping_fn process_prepared_discard; 206 + 207 + struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE]; 258 208 }; 259 209 260 210 static enum pool_mode get_pool_mode(struct pool *pool); ··· 292 232 293 233 struct pool *pool; 294 234 struct dm_thin_device *td; 235 + struct mapped_device *thin_md; 236 + 295 237 bool requeue_mode:1; 296 238 spinlock_t lock; 239 + struct list_head deferred_cells; 297 240 struct bio_list deferred_bio_list; 298 241 struct bio_list retry_on_resume_list; 299 242 struct rb_root sort_bio_list; /* sorted list of deferred bios */ ··· 353 290 dm_bio_prison_free_cell(pool->prison, cell); 354 291 } 355 292 293 + static void cell_visit_release(struct pool *pool, 294 + void 
(*fn)(void *, struct dm_bio_prison_cell *), 295 + void *context, 296 + struct dm_bio_prison_cell *cell) 297 + { 298 + dm_cell_visit_release(pool->prison, fn, context, cell); 299 + dm_bio_prison_free_cell(pool->prison, cell); 300 + } 301 + 356 302 static void cell_release_no_holder(struct pool *pool, 357 303 struct dm_bio_prison_cell *cell, 358 304 struct bio_list *bios) 359 305 { 360 306 dm_cell_release_no_holder(pool->prison, cell, bios); 361 307 dm_bio_prison_free_cell(pool->prison, cell); 362 - } 363 - 364 - static void cell_defer_no_holder_no_free(struct thin_c *tc, 365 - struct dm_bio_prison_cell *cell) 366 - { 367 - struct pool *pool = tc->pool; 368 - unsigned long flags; 369 - 370 - spin_lock_irqsave(&tc->lock, flags); 371 - dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list); 372 - spin_unlock_irqrestore(&tc->lock, flags); 373 - 374 - wake_worker(pool); 375 308 } 376 309 377 310 static void cell_error_with_code(struct pool *pool, ··· 380 321 static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) 381 322 { 382 323 cell_error_with_code(pool, cell, -EIO); 324 + } 325 + 326 + static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) 327 + { 328 + cell_error_with_code(pool, cell, 0); 329 + } 330 + 331 + static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell) 332 + { 333 + cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE); 383 334 } 384 335 385 336 /*----------------------------------------------------------------*/ ··· 462 393 struct rb_node rb_node; 463 394 }; 464 395 465 - static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) 396 + static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) 397 + { 398 + bio_list_merge(bios, master); 399 + bio_list_init(master); 400 + } 401 + 402 + static void error_bio_list(struct bio_list *bios, int error) 466 403 { 467 404 struct bio *bio; 405 + 406 + while ((bio = bio_list_pop(bios))) 407 + bio_endio(bio, 
error); 408 + } 409 + 410 + static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error) 411 + { 468 412 struct bio_list bios; 469 413 unsigned long flags; 470 414 471 415 bio_list_init(&bios); 472 416 473 417 spin_lock_irqsave(&tc->lock, flags); 474 - bio_list_merge(&bios, master); 475 - bio_list_init(master); 418 + __merge_bio_list(&bios, master); 476 419 spin_unlock_irqrestore(&tc->lock, flags); 477 420 478 - while ((bio = bio_list_pop(&bios))) 479 - bio_endio(bio, DM_ENDIO_REQUEUE); 421 + error_bio_list(&bios, error); 422 + } 423 + 424 + static void requeue_deferred_cells(struct thin_c *tc) 425 + { 426 + struct pool *pool = tc->pool; 427 + unsigned long flags; 428 + struct list_head cells; 429 + struct dm_bio_prison_cell *cell, *tmp; 430 + 431 + INIT_LIST_HEAD(&cells); 432 + 433 + spin_lock_irqsave(&tc->lock, flags); 434 + list_splice_init(&tc->deferred_cells, &cells); 435 + spin_unlock_irqrestore(&tc->lock, flags); 436 + 437 + list_for_each_entry_safe(cell, tmp, &cells, user_list) 438 + cell_requeue(pool, cell); 480 439 } 481 440 482 441 static void requeue_io(struct thin_c *tc) 483 442 { 484 - requeue_bio_list(tc, &tc->deferred_bio_list); 485 - requeue_bio_list(tc, &tc->retry_on_resume_list); 486 - } 487 - 488 - static void error_thin_retry_list(struct thin_c *tc) 489 - { 490 - struct bio *bio; 491 - unsigned long flags; 492 443 struct bio_list bios; 444 + unsigned long flags; 493 445 494 446 bio_list_init(&bios); 495 447 496 448 spin_lock_irqsave(&tc->lock, flags); 497 - bio_list_merge(&bios, &tc->retry_on_resume_list); 498 - bio_list_init(&tc->retry_on_resume_list); 449 + __merge_bio_list(&bios, &tc->deferred_bio_list); 450 + __merge_bio_list(&bios, &tc->retry_on_resume_list); 499 451 spin_unlock_irqrestore(&tc->lock, flags); 500 452 501 - while ((bio = bio_list_pop(&bios))) 502 - bio_io_error(bio); 453 + error_bio_list(&bios, DM_ENDIO_REQUEUE); 454 + requeue_deferred_cells(tc); 503 455 } 504 456 505 457 static void 
error_retry_list(struct pool *pool) ··· 529 439 530 440 rcu_read_lock(); 531 441 list_for_each_entry_rcu(tc, &pool->active_thins, list) 532 - error_thin_retry_list(tc); 442 + error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO); 533 443 rcu_read_unlock(); 534 444 } 535 445 ··· 719 629 */ 720 630 721 631 /* 722 - * This sends the bios in the cell back to the deferred_bios list. 723 - */ 724 - static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) 725 - { 726 - struct pool *pool = tc->pool; 727 - unsigned long flags; 728 - 729 - spin_lock_irqsave(&tc->lock, flags); 730 - cell_release(pool, cell, &tc->deferred_bio_list); 731 - spin_unlock_irqrestore(&tc->lock, flags); 732 - 733 - wake_worker(pool); 734 - } 735 - 736 - /* 737 - * Same as cell_defer above, except it omits the original holder of the cell. 632 + * This sends the bios in the cell, except the original holder, back 633 + * to the deferred_bios list. 738 634 */ 739 635 static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) 740 636 { ··· 732 656 spin_unlock_irqrestore(&tc->lock, flags); 733 657 734 658 wake_worker(pool); 659 + } 660 + 661 + static void thin_defer_bio(struct thin_c *tc, struct bio *bio); 662 + 663 + struct remap_info { 664 + struct thin_c *tc; 665 + struct bio_list defer_bios; 666 + struct bio_list issue_bios; 667 + }; 668 + 669 + static void __inc_remap_and_issue_cell(void *context, 670 + struct dm_bio_prison_cell *cell) 671 + { 672 + struct remap_info *info = context; 673 + struct bio *bio; 674 + 675 + while ((bio = bio_list_pop(&cell->bios))) { 676 + if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) 677 + bio_list_add(&info->defer_bios, bio); 678 + else { 679 + inc_all_io_entry(info->tc->pool, bio); 680 + 681 + /* 682 + * We can't issue the bios with the bio prison lock 683 + * held, so we add them to a list to issue on 684 + * return from this function. 
685 + */ 686 + bio_list_add(&info->issue_bios, bio); 687 + } 688 + } 689 + } 690 + 691 + static void inc_remap_and_issue_cell(struct thin_c *tc, 692 + struct dm_bio_prison_cell *cell, 693 + dm_block_t block) 694 + { 695 + struct bio *bio; 696 + struct remap_info info; 697 + 698 + info.tc = tc; 699 + bio_list_init(&info.defer_bios); 700 + bio_list_init(&info.issue_bios); 701 + 702 + /* 703 + * We have to be careful to inc any bios we're about to issue 704 + * before the cell is released, and avoid a race with new bios 705 + * being added to the cell. 706 + */ 707 + cell_visit_release(tc->pool, __inc_remap_and_issue_cell, 708 + &info, cell); 709 + 710 + while ((bio = bio_list_pop(&info.defer_bios))) 711 + thin_defer_bio(tc, bio); 712 + 713 + while ((bio = bio_list_pop(&info.issue_bios))) 714 + remap_and_issue(info.tc, bio, block); 735 715 } 736 716 737 717 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) ··· 838 706 * the bios in the cell. 839 707 */ 840 708 if (bio) { 841 - cell_defer_no_holder(tc, m->cell); 709 + inc_remap_and_issue_cell(tc, m->cell, m->data_block); 842 710 bio_endio(bio, 0); 843 - } else 844 - cell_defer(tc, m->cell); 711 + } else { 712 + inc_all_io_entry(tc->pool, m->cell->holder); 713 + remap_and_issue(tc, m->cell->holder, m->data_block); 714 + inc_remap_and_issue_cell(tc, m->cell, m->data_block); 715 + } 845 716 846 717 out: 847 718 list_del(&m->list); ··· 977 842 } 978 843 } 979 844 845 + static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio, 846 + dm_block_t data_block, 847 + struct dm_thin_new_mapping *m) 848 + { 849 + struct pool *pool = tc->pool; 850 + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 851 + 852 + h->overwrite_mapping = m; 853 + m->bio = bio; 854 + save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 855 + inc_all_io_entry(pool, bio); 856 + remap_and_issue(tc, bio, data_block); 857 + } 858 + 980 859 /* 981 860 * A partial copy also 
needs to zero the uncopied region. 982 861 */ ··· 1025 876 * If the whole block of data is being overwritten, we can issue the 1026 877 * bio immediately. Otherwise we use kcopyd to clone the data first. 1027 878 */ 1028 - if (io_overwrites_block(pool, bio)) { 1029 - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1030 - 1031 - h->overwrite_mapping = m; 1032 - m->bio = bio; 1033 - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1034 - inc_all_io_entry(pool, bio); 1035 - remap_and_issue(tc, bio, data_dest); 1036 - } else { 879 + if (io_overwrites_block(pool, bio)) 880 + remap_and_issue_overwrite(tc, bio, data_dest, m); 881 + else { 1037 882 struct dm_io_region from, to; 1038 883 1039 884 from.bdev = origin->bdev; ··· 1096 953 if (!pool->pf.zero_new_blocks) 1097 954 process_prepared_mapping(m); 1098 955 1099 - else if (io_overwrites_block(pool, bio)) { 1100 - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 956 + else if (io_overwrites_block(pool, bio)) 957 + remap_and_issue_overwrite(tc, bio, data_block, m); 1101 958 1102 - h->overwrite_mapping = m; 1103 - m->bio = bio; 1104 - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1105 - inc_all_io_entry(pool, bio); 1106 - remap_and_issue(tc, bio, data_block); 1107 - 1108 - } else 959 + else 1109 960 ll_zero(tc, m, 1110 961 data_block * pool->sectors_per_block, 1111 962 (data_block + 1) * pool->sectors_per_block); ··· 1271 1134 bio_list_init(&bios); 1272 1135 cell_release(pool, cell, &bios); 1273 1136 1274 - error = should_error_unserviceable_bio(pool); 1275 - if (error) 1276 - while ((bio = bio_list_pop(&bios))) 1277 - bio_endio(bio, error); 1278 - else 1279 - while ((bio = bio_list_pop(&bios))) 1280 - retry_on_resume(bio); 1137 + while ((bio = bio_list_pop(&bios))) 1138 + retry_on_resume(bio); 1281 1139 } 1282 1140 1283 - static void process_discard(struct thin_c *tc, struct bio *bio) 1141 + static void 
process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) 1284 1142 { 1285 1143 int r; 1286 - unsigned long flags; 1144 + struct bio *bio = cell->holder; 1287 1145 struct pool *pool = tc->pool; 1288 - struct dm_bio_prison_cell *cell, *cell2; 1289 - struct dm_cell_key key, key2; 1146 + struct dm_bio_prison_cell *cell2; 1147 + struct dm_cell_key key2; 1290 1148 dm_block_t block = get_bio_block(tc, bio); 1291 1149 struct dm_thin_lookup_result lookup_result; 1292 1150 struct dm_thin_new_mapping *m; 1293 1151 1294 - build_virtual_key(tc->td, block, &key); 1295 - if (bio_detain(tc->pool, &key, bio, &cell)) 1152 + if (tc->requeue_mode) { 1153 + cell_requeue(pool, cell); 1296 1154 return; 1155 + } 1297 1156 1298 1157 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1299 1158 switch (r) { ··· 1320 1187 m->cell2 = cell2; 1321 1188 m->bio = bio; 1322 1189 1323 - if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { 1324 - spin_lock_irqsave(&pool->lock, flags); 1325 - list_add_tail(&m->list, &pool->prepared_discards); 1326 - spin_unlock_irqrestore(&pool->lock, flags); 1327 - wake_worker(pool); 1328 - } 1190 + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) 1191 + pool->process_prepared_discard(m); 1192 + 1329 1193 } else { 1330 1194 inc_all_io_entry(pool, bio); 1331 1195 cell_defer_no_holder(tc, cell); ··· 1357 1227 } 1358 1228 } 1359 1229 1230 + static void process_discard_bio(struct thin_c *tc, struct bio *bio) 1231 + { 1232 + struct dm_bio_prison_cell *cell; 1233 + struct dm_cell_key key; 1234 + dm_block_t block = get_bio_block(tc, bio); 1235 + 1236 + build_virtual_key(tc->td, block, &key); 1237 + if (bio_detain(tc->pool, &key, bio, &cell)) 1238 + return; 1239 + 1240 + process_discard_cell(tc, cell); 1241 + } 1242 + 1360 1243 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1361 1244 struct dm_cell_key *key, 1362 1245 struct dm_thin_lookup_result *lookup_result, ··· 1398 1255 } 1399 1256 } 1400 1257 1258 
+ static void __remap_and_issue_shared_cell(void *context, 1259 + struct dm_bio_prison_cell *cell) 1260 + { 1261 + struct remap_info *info = context; 1262 + struct bio *bio; 1263 + 1264 + while ((bio = bio_list_pop(&cell->bios))) { 1265 + if ((bio_data_dir(bio) == WRITE) || 1266 + (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))) 1267 + bio_list_add(&info->defer_bios, bio); 1268 + else { 1269 + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));; 1270 + 1271 + h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds); 1272 + inc_all_io_entry(info->tc->pool, bio); 1273 + bio_list_add(&info->issue_bios, bio); 1274 + } 1275 + } 1276 + } 1277 + 1278 + static void remap_and_issue_shared_cell(struct thin_c *tc, 1279 + struct dm_bio_prison_cell *cell, 1280 + dm_block_t block) 1281 + { 1282 + struct bio *bio; 1283 + struct remap_info info; 1284 + 1285 + info.tc = tc; 1286 + bio_list_init(&info.defer_bios); 1287 + bio_list_init(&info.issue_bios); 1288 + 1289 + cell_visit_release(tc->pool, __remap_and_issue_shared_cell, 1290 + &info, cell); 1291 + 1292 + while ((bio = bio_list_pop(&info.defer_bios))) 1293 + thin_defer_bio(tc, bio); 1294 + 1295 + while ((bio = bio_list_pop(&info.issue_bios))) 1296 + remap_and_issue(tc, bio, block); 1297 + } 1298 + 1401 1299 static void process_shared_bio(struct thin_c *tc, struct bio *bio, 1402 1300 dm_block_t block, 1403 - struct dm_thin_lookup_result *lookup_result) 1301 + struct dm_thin_lookup_result *lookup_result, 1302 + struct dm_bio_prison_cell *virt_cell) 1404 1303 { 1405 - struct dm_bio_prison_cell *cell; 1304 + struct dm_bio_prison_cell *data_cell; 1406 1305 struct pool *pool = tc->pool; 1407 1306 struct dm_cell_key key; 1408 1307 ··· 1453 1268 * of being broken so we have nothing further to do here. 
1454 1269 */ 1455 1270 build_data_key(tc->td, lookup_result->block, &key); 1456 - if (bio_detain(pool, &key, bio, &cell)) 1271 + if (bio_detain(pool, &key, bio, &data_cell)) { 1272 + cell_defer_no_holder(tc, virt_cell); 1457 1273 return; 1274 + } 1458 1275 1459 - if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) 1460 - break_sharing(tc, bio, block, &key, lookup_result, cell); 1461 - else { 1276 + if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) { 1277 + break_sharing(tc, bio, block, &key, lookup_result, data_cell); 1278 + cell_defer_no_holder(tc, virt_cell); 1279 + } else { 1462 1280 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1463 1281 1464 1282 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds); 1465 1283 inc_all_io_entry(pool, bio); 1466 - cell_defer_no_holder(tc, cell); 1467 - 1468 1284 remap_and_issue(tc, bio, lookup_result->block); 1285 + 1286 + remap_and_issue_shared_cell(tc, data_cell, lookup_result->block); 1287 + remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block); 1469 1288 } 1470 1289 } 1471 1290 ··· 1522 1333 } 1523 1334 } 1524 1335 1525 - static void process_bio(struct thin_c *tc, struct bio *bio) 1336 + static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) 1526 1337 { 1527 1338 int r; 1528 1339 struct pool *pool = tc->pool; 1340 + struct bio *bio = cell->holder; 1529 1341 dm_block_t block = get_bio_block(tc, bio); 1530 - struct dm_bio_prison_cell *cell; 1531 - struct dm_cell_key key; 1532 1342 struct dm_thin_lookup_result lookup_result; 1533 1343 1534 - /* 1535 - * If cell is already occupied, then the block is already 1536 - * being provisioned so we have nothing further to do here. 
1537 - */ 1538 - build_virtual_key(tc->td, block, &key); 1539 - if (bio_detain(pool, &key, bio, &cell)) 1344 + if (tc->requeue_mode) { 1345 + cell_requeue(pool, cell); 1540 1346 return; 1347 + } 1541 1348 1542 1349 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1543 1350 switch (r) { 1544 1351 case 0: 1545 - if (lookup_result.shared) { 1546 - process_shared_bio(tc, bio, block, &lookup_result); 1547 - cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */ 1548 - } else { 1352 + if (lookup_result.shared) 1353 + process_shared_bio(tc, bio, block, &lookup_result, cell); 1354 + else { 1549 1355 inc_all_io_entry(pool, bio); 1550 - cell_defer_no_holder(tc, cell); 1551 - 1552 1356 remap_and_issue(tc, bio, lookup_result.block); 1357 + inc_remap_and_issue_cell(tc, cell, lookup_result.block); 1553 1358 } 1554 1359 break; 1555 1360 ··· 1577 1394 } 1578 1395 } 1579 1396 1580 - static void process_bio_read_only(struct thin_c *tc, struct bio *bio) 1397 + static void process_bio(struct thin_c *tc, struct bio *bio) 1398 + { 1399 + struct pool *pool = tc->pool; 1400 + dm_block_t block = get_bio_block(tc, bio); 1401 + struct dm_bio_prison_cell *cell; 1402 + struct dm_cell_key key; 1403 + 1404 + /* 1405 + * If cell is already occupied, then the block is already 1406 + * being provisioned so we have nothing further to do here. 
1407 + */ 1408 + build_virtual_key(tc->td, block, &key); 1409 + if (bio_detain(pool, &key, bio, &cell)) 1410 + return; 1411 + 1412 + process_cell(tc, cell); 1413 + } 1414 + 1415 + static void __process_bio_read_only(struct thin_c *tc, struct bio *bio, 1416 + struct dm_bio_prison_cell *cell) 1581 1417 { 1582 1418 int r; 1583 1419 int rw = bio_data_dir(bio); ··· 1606 1404 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1607 1405 switch (r) { 1608 1406 case 0: 1609 - if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) 1407 + if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) { 1610 1408 handle_unserviceable_bio(tc->pool, bio); 1611 - else { 1409 + if (cell) 1410 + cell_defer_no_holder(tc, cell); 1411 + } else { 1612 1412 inc_all_io_entry(tc->pool, bio); 1613 1413 remap_and_issue(tc, bio, lookup_result.block); 1414 + if (cell) 1415 + inc_remap_and_issue_cell(tc, cell, lookup_result.block); 1614 1416 } 1615 1417 break; 1616 1418 1617 1419 case -ENODATA: 1420 + if (cell) 1421 + cell_defer_no_holder(tc, cell); 1618 1422 if (rw != READ) { 1619 1423 handle_unserviceable_bio(tc->pool, bio); 1620 1424 break; ··· 1639 1431 default: 1640 1432 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", 1641 1433 __func__, r); 1434 + if (cell) 1435 + cell_defer_no_holder(tc, cell); 1642 1436 bio_io_error(bio); 1643 1437 break; 1644 1438 } 1439 + } 1440 + 1441 + static void process_bio_read_only(struct thin_c *tc, struct bio *bio) 1442 + { 1443 + __process_bio_read_only(tc, bio, NULL); 1444 + } 1445 + 1446 + static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell) 1447 + { 1448 + __process_bio_read_only(tc, cell->holder, cell); 1645 1449 } 1646 1450 1647 1451 static void process_bio_success(struct thin_c *tc, struct bio *bio) ··· 1664 1444 static void process_bio_fail(struct thin_c *tc, struct bio *bio) 1665 1445 { 1666 1446 bio_io_error(bio); 1447 + } 1448 + 1449 + static void process_cell_success(struct thin_c 
*tc, struct dm_bio_prison_cell *cell) 1450 + { 1451 + cell_success(tc->pool, cell); 1452 + } 1453 + 1454 + static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell) 1455 + { 1456 + cell_error(tc->pool, cell); 1667 1457 } 1668 1458 1669 1459 /* ··· 1757 1527 struct bio *bio; 1758 1528 struct bio_list bios; 1759 1529 struct blk_plug plug; 1530 + unsigned count = 0; 1760 1531 1761 1532 if (tc->requeue_mode) { 1762 - requeue_bio_list(tc, &tc->deferred_bio_list); 1533 + error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE); 1763 1534 return; 1764 1535 } 1765 1536 ··· 1799 1568 pool->process_discard(tc, bio); 1800 1569 else 1801 1570 pool->process_bio(tc, bio); 1571 + 1572 + if ((count++ & 127) == 0) { 1573 + throttle_work_update(&pool->throttle); 1574 + dm_pool_issue_prefetches(pool->pmd); 1575 + } 1802 1576 } 1803 1577 blk_finish_plug(&plug); 1578 + } 1579 + 1580 + static int cmp_cells(const void *lhs, const void *rhs) 1581 + { 1582 + struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs); 1583 + struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs); 1584 + 1585 + BUG_ON(!lhs_cell->holder); 1586 + BUG_ON(!rhs_cell->holder); 1587 + 1588 + if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector) 1589 + return -1; 1590 + 1591 + if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector) 1592 + return 1; 1593 + 1594 + return 0; 1595 + } 1596 + 1597 + static unsigned sort_cells(struct pool *pool, struct list_head *cells) 1598 + { 1599 + unsigned count = 0; 1600 + struct dm_bio_prison_cell *cell, *tmp; 1601 + 1602 + list_for_each_entry_safe(cell, tmp, cells, user_list) { 1603 + if (count >= CELL_SORT_ARRAY_SIZE) 1604 + break; 1605 + 1606 + pool->cell_sort_array[count++] = cell; 1607 + list_del(&cell->user_list); 1608 + } 1609 + 1610 + sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL); 1611 + 1612 + return count; 1613 + } 1614 + 1615 + static 
void process_thin_deferred_cells(struct thin_c *tc) 1616 + { 1617 + struct pool *pool = tc->pool; 1618 + unsigned long flags; 1619 + struct list_head cells; 1620 + struct dm_bio_prison_cell *cell; 1621 + unsigned i, j, count; 1622 + 1623 + INIT_LIST_HEAD(&cells); 1624 + 1625 + spin_lock_irqsave(&tc->lock, flags); 1626 + list_splice_init(&tc->deferred_cells, &cells); 1627 + spin_unlock_irqrestore(&tc->lock, flags); 1628 + 1629 + if (list_empty(&cells)) 1630 + return; 1631 + 1632 + do { 1633 + count = sort_cells(tc->pool, &cells); 1634 + 1635 + for (i = 0; i < count; i++) { 1636 + cell = pool->cell_sort_array[i]; 1637 + BUG_ON(!cell->holder); 1638 + 1639 + /* 1640 + * If we've got no free new_mapping structs, and processing 1641 + * this bio might require one, we pause until there are some 1642 + * prepared mappings to process. 1643 + */ 1644 + if (ensure_next_mapping(pool)) { 1645 + for (j = i; j < count; j++) 1646 + list_add(&pool->cell_sort_array[j]->user_list, &cells); 1647 + 1648 + spin_lock_irqsave(&tc->lock, flags); 1649 + list_splice(&cells, &tc->deferred_cells); 1650 + spin_unlock_irqrestore(&tc->lock, flags); 1651 + return; 1652 + } 1653 + 1654 + if (cell->holder->bi_rw & REQ_DISCARD) 1655 + pool->process_discard_cell(tc, cell); 1656 + else 1657 + pool->process_cell(tc, cell); 1658 + } 1659 + } while (!list_empty(&cells)); 1804 1660 } 1805 1661 1806 1662 static void thin_get(struct thin_c *tc); ··· 1938 1620 1939 1621 tc = get_first_thin(pool); 1940 1622 while (tc) { 1623 + process_thin_deferred_cells(tc); 1941 1624 process_thin_deferred_bios(tc); 1942 1625 tc = get_next_thin(pool, tc); 1943 1626 } ··· 1972 1653 { 1973 1654 struct pool *pool = container_of(ws, struct pool, worker); 1974 1655 1656 + throttle_work_start(&pool->throttle); 1657 + dm_pool_issue_prefetches(pool->pmd); 1658 + throttle_work_update(&pool->throttle); 1975 1659 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); 1660 + 
 	throttle_work_update(&pool->throttle);
 	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
+	throttle_work_update(&pool->throttle);
 	process_deferred_bios(pool);
+	throttle_work_complete(&pool->throttle);
 }
 
 /*
···
 		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_fail;
 		pool->process_discard = process_bio_fail;
+		pool->process_cell = process_cell_fail;
+		pool->process_discard_cell = process_cell_fail;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
 		pool->process_prepared_discard = process_prepared_discard_fail;
···
 		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_read_only;
 		pool->process_discard = process_bio_success;
+		pool->process_cell = process_cell_read_only;
+		pool->process_discard_cell = process_cell_success;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
 		pool->process_prepared_discard = process_prepared_discard_passdown;
···
 		if (old_mode != new_mode)
 			notify_of_pool_mode_change(pool, "out-of-data-space");
 		pool->process_bio = process_bio_read_only;
-		pool->process_discard = process_discard;
+		pool->process_discard = process_discard_bio;
+		pool->process_cell = process_cell_read_only;
+		pool->process_discard_cell = process_discard_cell;
 		pool->process_prepared_mapping = process_prepared_mapping;
 		pool->process_prepared_discard = process_prepared_discard_passdown;
···
 		notify_of_pool_mode_change(pool, "write");
 		dm_pool_metadata_read_write(pool->pmd);
 		pool->process_bio = process_bio;
-		pool->process_discard = process_discard;
+		pool->process_discard = process_discard_bio;
+		pool->process_cell = process_cell;
+		pool->process_discard_cell = process_discard_cell;
 		pool->process_prepared_mapping = process_prepared_mapping;
 		pool->process_prepared_discard = process_prepared_discard;
 		break;
···
 	wake_worker(pool);
 }
 
+static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
+{
+	struct pool *pool = tc->pool;
+
+	throttle_lock(&pool->throttle);
+	thin_defer_bio(tc, bio);
+	throttle_unlock(&pool->throttle);
+}
+
+static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+{
+	unsigned long flags;
+	struct pool *pool = tc->pool;
+
+	throttle_lock(&pool->throttle);
+	spin_lock_irqsave(&tc->lock, flags);
+	list_add_tail(&cell->user_list, &tc->deferred_cells);
+	spin_unlock_irqrestore(&tc->lock, flags);
+	throttle_unlock(&pool->throttle);
+
+	wake_worker(pool);
+}
+
 static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
 {
 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
···
 	dm_block_t block = get_bio_block(tc, bio);
 	struct dm_thin_device *td = tc->td;
 	struct dm_thin_lookup_result result;
-	struct dm_bio_prison_cell cell1, cell2;
-	struct dm_bio_prison_cell *cell_result;
+	struct dm_bio_prison_cell *virt_cell, *data_cell;
 	struct dm_cell_key key;
 
 	thin_hook_bio(tc, bio);
···
 	}
 
 	if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
-		thin_defer_bio(tc, bio);
+		thin_defer_bio_with_throttle(tc, bio);
 		return DM_MAPIO_SUBMITTED;
 	}
 
···
 	 * there's a race with discard.
 	 */
 	build_virtual_key(tc->td, block, &key);
-	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
+	if (bio_detain(tc->pool, &key, bio, &virt_cell))
 		return DM_MAPIO_SUBMITTED;
 
 	r = dm_thin_find_block(td, block, 0, &result);
···
 			 * More distant ancestors are irrelevant. The
 			 * shared flag will be set in their case.
 			 */
-			thin_defer_bio(tc, bio);
-			cell_defer_no_holder_no_free(tc, &cell1);
+			thin_defer_cell(tc, virt_cell);
 			return DM_MAPIO_SUBMITTED;
 		}
 
 		build_data_key(tc->td, result.block, &key);
-		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
-			cell_defer_no_holder_no_free(tc, &cell1);
+		if (bio_detain(tc->pool, &key, bio, &data_cell)) {
+			cell_defer_no_holder(tc, virt_cell);
 			return DM_MAPIO_SUBMITTED;
 		}
 
 		inc_all_io_entry(tc->pool, bio);
-		cell_defer_no_holder_no_free(tc, &cell2);
-		cell_defer_no_holder_no_free(tc, &cell1);
+		cell_defer_no_holder(tc, data_cell);
+		cell_defer_no_holder(tc, virt_cell);
 
 		remap(tc, bio, result.block);
 		return DM_MAPIO_REMAPPED;
···
 			 * of doing so.
 			 */
 			handle_unserviceable_bio(tc->pool, bio);
-			cell_defer_no_holder_no_free(tc, &cell1);
+			cell_defer_no_holder(tc, virt_cell);
 			return DM_MAPIO_SUBMITTED;
 		}
 		/* fall through */
 
 	case -EWOULDBLOCK:
-		/*
-		 * In future, the failed dm_thin_find_block above could
-		 * provide the hint to load the metadata into cache.
-		 */
-		thin_defer_bio(tc, bio);
-		cell_defer_no_holder_no_free(tc, &cell1);
+		thin_defer_cell(tc, virt_cell);
 		return DM_MAPIO_SUBMITTED;
 
 	default:
···
 		 * pool is switched to fail-io mode.
 		 */
 		bio_io_error(bio);
-		cell_defer_no_holder_no_free(tc, &cell1);
+		cell_defer_no_holder(tc, virt_cell);
 		return DM_MAPIO_SUBMITTED;
 	}
 }
···
 	pool->sectors_per_block_shift = __ffs(block_size);
 	pool->low_water_blocks = 0;
 	pool_features_init(&pool->pf);
-	pool->prison = dm_bio_prison_create(PRISON_CELLS);
+	pool->prison = dm_bio_prison_create();
 	if (!pool->prison) {
 		*error = "Error creating pool's bio prison";
 		err_p = ERR_PTR(-ENOMEM);
···
 		goto bad_wq;
 	}
 
+	throttle_init(&pool->throttle);
 	INIT_WORK(&pool->worker, do_worker);
 	INIT_DELAYED_WORK(&pool->waker, do_waker);
 	INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
···
 	INIT_LIST_HEAD(&pool->prepared_discards);
 	INIT_LIST_HEAD(&pool->active_thins);
 	pool->low_water_triggered = false;
+	pool->suspended = true;
 
 	pool->shared_read_ds = dm_deferred_set_create();
 	if (!pool->shared_read_ds) {
···
 	return 0;
 }
 
+static void pool_suspend_active_thins(struct pool *pool)
+{
+	struct thin_c *tc;
+
+	/* Suspend all active thin devices */
+	tc = get_first_thin(pool);
+	while (tc) {
+		dm_internal_suspend_noflush(tc->thin_md);
+		tc = get_next_thin(pool, tc);
+	}
+}
+
+static void pool_resume_active_thins(struct pool *pool)
+{
+	struct thin_c *tc;
+
+	/* Resume all active thin devices */
+	tc = get_first_thin(pool);
+	while (tc) {
+		dm_internal_resume(tc->thin_md);
+		tc = get_next_thin(pool, tc);
+	}
+}
+
 static void pool_resume(struct dm_target *ti)
 {
 	struct pool_c *pt = ti->private;
 	struct pool *pool = pt->pool;
 	unsigned long flags;
 
+	/*
+	 * Must requeue active_thins' bios and then resume
+	 * active_thins _before_ clearing 'suspend' flag.
+	 */
+	requeue_bios(pool);
+	pool_resume_active_thins(pool);
+
 	spin_lock_irqsave(&pool->lock, flags);
 	pool->low_water_triggered = false;
+	pool->suspended = false;
 	spin_unlock_irqrestore(&pool->lock, flags);
-	requeue_bios(pool);
 
 	do_waker(&pool->waker.work);
+}
+
+static void pool_presuspend(struct dm_target *ti)
+{
+	struct pool_c *pt = ti->private;
+	struct pool *pool = pt->pool;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	pool->suspended = true;
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	pool_suspend_active_thins(pool);
+}
+
+static void pool_presuspend_undo(struct dm_target *ti)
+{
+	struct pool_c *pt = ti->private;
+	struct pool *pool = pt->pool;
+	unsigned long flags;
+
+	pool_resume_active_thins(pool);
+
+	spin_lock_irqsave(&pool->lock, flags);
+	pool->suspended = false;
+	spin_unlock_irqrestore(&pool->lock, flags);
 }
 
 static void pool_postsuspend(struct dm_target *ti)
···
  * create_thin <dev_id>
  * create_snap <dev_id> <origin_id>
  * delete <dev_id>
- * trim <dev_id> <new_size_in_sectors>
  * set_transaction_id <current_trans_id> <new_trans_id>
  * reserve_metadata_snap
  * release_metadata_snap
···
 {
 	struct pool_c *pt = ti->private;
 	struct pool *pool = pt->pool;
-	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+	sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+
+	/*
+	 * If max_sectors is smaller than pool->sectors_per_block adjust it
+	 * to the highest possible power-of-2 factor of pool->sectors_per_block.
+	 * This is especially beneficial when the pool's data device is a RAID
+	 * device that has a full stripe width that matches pool->sectors_per_block
+	 * -- because even though partial RAID stripe-sized IOs will be issued to a
+	 * single RAID stripe; when aggregated they will end on a full RAID stripe
+	 * boundary.. which avoids additional partial RAID stripe writes cascading
+	 */
+	if (limits->max_sectors < pool->sectors_per_block) {
+		while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
+			if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
+				limits->max_sectors--;
+			limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
+		}
+	}
 
 	/*
 	 * If the system-determined stacked limits are compatible with the
 	 * pool's blocksize (io_opt is a factor) do not override them.
 	 */
 	if (io_opt_sectors < pool->sectors_per_block ||
-	    do_div(io_opt_sectors, pool->sectors_per_block)) {
-		blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
+	    !is_factor(io_opt_sectors, pool->sectors_per_block)) {
+		if (is_factor(pool->sectors_per_block, limits->max_sectors))
+			blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
+		else
+			blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
 		blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
 	}
 
···
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 13, 0},
+	.version = {1, 14, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
 	.map = pool_map,
+	.presuspend = pool_presuspend,
+	.presuspend_undo = pool_presuspend_undo,
 	.postsuspend = pool_postsuspend,
 	.preresume = pool_preresume,
 	.resume = pool_resume,
···
 	struct thin_c *tc = ti->private;
 	unsigned long flags;
 
-	thin_put(tc);
-	wait_for_completion(&tc->can_destroy);
-
 	spin_lock_irqsave(&tc->pool->lock, flags);
 	list_del_rcu(&tc->list);
 	spin_unlock_irqrestore(&tc->pool->lock, flags);
 	synchronize_rcu();
+
+	thin_put(tc);
+	wait_for_completion(&tc->can_destroy);
 
 	mutex_lock(&dm_thin_pool_table.mutex);
 
···
 		r = -ENOMEM;
 		goto out_unlock;
 	}
+	tc->thin_md = dm_table_get_md(ti->table);
 	spin_lock_init(&tc->lock);
+	INIT_LIST_HEAD(&tc->deferred_cells);
 	bio_list_init(&tc->deferred_bio_list);
 	bio_list_init(&tc->retry_on_resume_list);
 	tc->sort_bio_list = RB_ROOT;
···
 	if (get_pool_mode(tc->pool) == PM_FAIL) {
 		ti->error = "Couldn't open thin device, Pool is in fail mode";
 		r = -EINVAL;
-		goto bad_thin_open;
+		goto bad_pool;
 	}
 
 	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
 	if (r) {
 		ti->error = "Couldn't open thin internal device";
-		goto bad_thin_open;
+		goto bad_pool;
 	}
 
 	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
 	if (r)
-		goto bad_target_max_io_len;
+		goto bad;
 
 	ti->num_flush_bios = 1;
 	ti->flush_supported = true;
···
 		ti->split_discard_bios = true;
 	}
 
-	dm_put(pool_md);
-
 	mutex_unlock(&dm_thin_pool_table.mutex);
 
-	atomic_set(&tc->refcount, 1);
-	init_completion(&tc->can_destroy);
-
 	spin_lock_irqsave(&tc->pool->lock, flags);
+	if (tc->pool->suspended) {
+		spin_unlock_irqrestore(&tc->pool->lock, flags);
+		mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
+		ti->error = "Unable to activate thin device while pool is suspended";
+		r = -EINVAL;
+		goto bad;
+	}
 	list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
 	spin_unlock_irqrestore(&tc->pool->lock, flags);
 	/*
···
 	 */
 	synchronize_rcu();
 
+	dm_put(pool_md);
+
+	atomic_set(&tc->refcount, 1);
+	init_completion(&tc->can_destroy);
+
 	return 0;
 
-bad_target_max_io_len:
+bad:
 	dm_pool_close_thin_device(tc->td);
-bad_thin_open:
+bad_pool:
 	__pool_dec(tc->pool);
 bad_pool_lookup:
 	dm_put(pool_md);
···
 		DMEMIT("Error");
 }
 
+static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+		      struct bio_vec *biovec, int max_size)
+{
+	struct thin_c *tc = ti->private;
+	struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);
+
+	if (!q->merge_bvec_fn)
+		return max_size;
+
+	bvm->bi_bdev = tc->pool_dev->bdev;
+	bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
+
+	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
 static int thin_iterate_devices(struct dm_target *ti,
 				iterate_devices_callout_fn fn, void *data)
 {
···
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 13, 0},
+	.version = {1, 14, 0},
 	.module = THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
···
 	.presuspend = thin_presuspend,
 	.postsuspend = thin_postsuspend,
 	.status = thin_status,
+	.merge = thin_merge,
 	.iterate_devices = thin_iterate_devices,
 };
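The new pool_io_hints() logic above shrinks limits->max_sectors to the largest power-of-2 factor of the pool's block size before deciding the io_min hint. A small userspace sketch of that rounding loop (is_factor and rounddown_pow2 are plain-C stand-ins for the kernel helpers, not the kernel code itself) shows it always lands on a power of two that divides sectors_per_block:

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Stand-in for the kernel's is_factor(): does n divide block_size evenly? */
static bool is_factor(uint32_t block_size, uint32_t n)
{
	return n && !(block_size % n);
}

/* Stand-in for the kernel's rounddown_pow_of_two(). */
static uint32_t rounddown_pow2(uint32_t n)
{
	uint32_t p = 1;

	while (p * 2 <= n)
		p *= 2;
	return p;
}

/*
 * Model of the loop added to pool_io_hints(): if max_sectors is already a
 * power of two, step down by one first so the rounddown makes progress;
 * terminates because 1 divides everything.
 */
static uint32_t adjust_max_sectors(uint32_t sectors_per_block, uint32_t max_sectors)
{
	if (max_sectors < sectors_per_block) {
		while (!is_factor(sectors_per_block, max_sectors)) {
			if ((max_sectors & (max_sectors - 1)) == 0)
				max_sectors--;
			max_sectors = rounddown_pow2(max_sectors);
		}
	}
	return max_sectors;
}
```

With a 1024-sector block and a stacked max_sectors of 600, for example, the loop settles on 512, so aggregated IO ends on a block (and hence full-RAID-stripe) boundary.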
+235 -108
drivers/md/dm.c
···
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/delay.h>
+#include <linux/wait.h>
 
 #include <trace/events/block.h>
 
···
 #define DMF_NOFLUSH_SUSPENDING 5
 #define DMF_MERGE_IS_OPTIONAL 6
 #define DMF_DEFERRED_REMOVE 7
+#define DMF_SUSPENDED_INTERNALLY 8
 
 /*
  * A dummy definition to make RCU happy.
···
 	 * Use dm_get_live_table{_fast} or take suspend_lock for
 	 * dereference.
 	 */
-	struct dm_table *map;
+	struct dm_table __rcu *map;
 
 	struct list_head table_devices;
 	struct mutex table_devices_lock;
···
 		goto out;
 
 	tgt = dm_table_get_target(map, 0);
+	if (!tgt->type->ioctl)
+		goto out;
 
 	if (dm_suspended_md(md)) {
 		r = -EAGAIN;
 		goto out;
 	}
 
-	if (tgt->type->ioctl)
-		r = tgt->type->ioctl(tgt, cmd, arg);
+	r = tgt->type->ioctl(tgt, cmd, arg);
 
 out:
 	dm_put_live_table(md, srcu_idx);
···
 	 * Find maximum amount of I/O that won't need splitting
 	 */
 	max_sectors = min(max_io_len(bvm->bi_sector, ti),
-			  (sector_t) BIO_MAX_SECTORS);
+			  (sector_t) queue_max_sectors(q));
 	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
-	if (max_size < 0)
+	if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
 		max_size = 0;
 
 	/*
···
 		max_size = ti->type->merge(ti, bvm, biovec, max_size);
 	/*
 	 * If the target doesn't support merge method and some of the devices
-	 * provided their merge_bvec method (we know this by looking at
-	 * queue_max_hw_sectors), then we can't allow bios with multiple vector
-	 * entries. So always set max_size to 0, and the code below allows
-	 * just one page.
+	 * provided their merge_bvec method (we know this by looking for the
+	 * max_hw_sectors that dm_set_device_limits may set), then we can't
+	 * allow bios with multiple vector entries. So always set max_size
+	 * to 0, and the code below allows just one page.
 	 */
 	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
 		max_size = 0;
···
 
 	merge_is_optional = dm_table_merge_is_optional(t);
 
-	old_map = md->map;
+	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
 	rcu_assign_pointer(md->map, t);
 	md->immutable_target_type = dm_table_get_immutable_target_type(t);
 
···
 		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
 	else
 		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
-	dm_sync_table(md);
+	if (old_map)
+		dm_sync_table(md);
 
 	return old_map;
 }
···
  */
 static struct dm_table *__unbind(struct mapped_device *md)
 {
-	struct dm_table *map = md->map;
+	struct dm_table *map = rcu_dereference_protected(md->map, 1);
 
 	if (!map)
 		return NULL;
···
 }
 
 /*
+ * If __dm_suspend returns 0, the device is completely quiescent
+ * now. There is no request-processing activity. All new requests
+ * are being added to md->deferred list.
+ *
+ * Caller must hold md->suspend_lock
+ */
+static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
+			unsigned suspend_flags, int interruptible)
+{
+	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
+	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
+	int r;
+
+	/*
+	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
+	 * This flag is cleared before dm_suspend returns.
+	 */
+	if (noflush)
+		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+
+	/*
+	 * This gets reverted if there's an error later and the targets
+	 * provide the .presuspend_undo hook.
+	 */
+	dm_table_presuspend_targets(map);
+
+	/*
+	 * Flush I/O to the device.
+	 * Any I/O submitted after lock_fs() may not be flushed.
+	 * noflush takes precedence over do_lockfs.
+	 * (lock_fs() flushes I/Os and waits for them to complete.)
+	 */
+	if (!noflush && do_lockfs) {
+		r = lock_fs(md);
+		if (r) {
+			dm_table_presuspend_undo_targets(map);
+			return r;
+		}
+	}
+
+	/*
+	 * Here we must make sure that no processes are submitting requests
+	 * to target drivers i.e. no one may be executing
+	 * __split_and_process_bio. This is called from dm_request and
+	 * dm_wq_work.
+	 *
+	 * To get all processes out of __split_and_process_bio in dm_request,
+	 * we take the write lock. To prevent any process from reentering
+	 * __split_and_process_bio from dm_request and quiesce the thread
+	 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
+	 * flush_workqueue(md->wq).
+	 */
+	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+	if (map)
+		synchronize_srcu(&md->io_barrier);
+
+	/*
+	 * Stop md->queue before flushing md->wq in case request-based
+	 * dm defers requests to md->wq from md->queue.
+	 */
+	if (dm_request_based(md))
+		stop_queue(md->queue);
+
+	flush_workqueue(md->wq);
+
+	/*
+	 * At this point no more requests are entering target request routines.
+	 * We call dm_wait_for_completion to wait for all existing requests
+	 * to finish.
+	 */
+	r = dm_wait_for_completion(md, interruptible);
+
+	if (noflush)
+		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+	if (map)
+		synchronize_srcu(&md->io_barrier);
+
+	/* were we interrupted ? */
+	if (r < 0) {
+		dm_queue_flush(md);
+
+		if (dm_request_based(md))
+			start_queue(md->queue);
+
+		unlock_fs(md);
+		dm_table_presuspend_undo_targets(map);
+		/* pushback list is already flushed, so skip flush */
+	}
+
+	return r;
+}
+
+/*
  * We need to be able to change a mapping table under a mounted
  * filesystem. For example we might want to move some data in
  * the background. Before the table can be swapped with
···
 {
 	struct dm_table *map = NULL;
 	int r = 0;
-	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
-	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
 
-	mutex_lock(&md->suspend_lock);
+retry:
+	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
 
 	if (dm_suspended_md(md)) {
 		r = -EINVAL;
 		goto out_unlock;
 	}
 
-	map = md->map;
-
-	/*
-	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
-	 * This flag is cleared before dm_suspend returns.
-	 */
-	if (noflush)
-		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
-
-	/* This does not get reverted if there's an error later. */
-	dm_table_presuspend_targets(map);
-
-	/*
-	 * Flush I/O to the device.
-	 * Any I/O submitted after lock_fs() may not be flushed.
-	 * noflush takes precedence over do_lockfs.
-	 * (lock_fs() flushes I/Os and waits for them to complete.)
-	 */
-	if (!noflush && do_lockfs) {
-		r = lock_fs(md);
+	if (dm_suspended_internally_md(md)) {
+		/* already internally suspended, wait for internal resume */
+		mutex_unlock(&md->suspend_lock);
+		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
 		if (r)
-			goto out_unlock;
+			return r;
+		goto retry;
 	}
 
-	/*
-	 * Here we must make sure that no processes are submitting requests
-	 * to target drivers i.e. no one may be executing
-	 * __split_and_process_bio. This is called from dm_request and
-	 * dm_wq_work.
-	 *
-	 * To get all processes out of __split_and_process_bio in dm_request,
-	 * we take the write lock. To prevent any process from reentering
-	 * __split_and_process_bio from dm_request and quiesce the thread
-	 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
-	 * flush_workqueue(md->wq).
-	 */
-	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-	synchronize_srcu(&md->io_barrier);
+	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
 
-	/*
-	 * Stop md->queue before flushing md->wq in case request-based
-	 * dm defers requests to md->wq from md->queue.
-	 */
-	if (dm_request_based(md))
-		stop_queue(md->queue);
-
-	flush_workqueue(md->wq);
-
-	/*
-	 * At this point no more requests are entering target request routines.
-	 * We call dm_wait_for_completion to wait for all existing requests
-	 * to finish.
-	 */
-	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
-
-	if (noflush)
-		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
-	synchronize_srcu(&md->io_barrier);
-
-	/* were we interrupted ? */
-	if (r < 0) {
-		dm_queue_flush(md);
-
-		if (dm_request_based(md))
-			start_queue(md->queue);
-
-		unlock_fs(md);
-		goto out_unlock; /* pushback list is already flushed, so skip flush */
-	}
-
-	/*
-	 * If dm_wait_for_completion returned 0, the device is completely
-	 * quiescent now. There is no request-processing activity. All new
-	 * requests are being added to md->deferred list.
-	 */
+	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
+	if (r)
+		goto out_unlock;
 
 	set_bit(DMF_SUSPENDED, &md->flags);
 
···
 	return r;
 }
 
-int dm_resume(struct mapped_device *md)
+static int __dm_resume(struct mapped_device *md, struct dm_table *map)
 {
-	int r = -EINVAL;
-	struct dm_table *map = NULL;
-
-	mutex_lock(&md->suspend_lock);
-	if (!dm_suspended_md(md))
-		goto out;
-
-	map = md->map;
-	if (!map || !dm_table_get_size(map))
-		goto out;
-
-	r = dm_table_resume_targets(map);
-	if (r)
-		goto out;
+	if (map) {
+		int r = dm_table_resume_targets(map);
+		if (r)
+			return r;
+	}
 
 	dm_queue_flush(md);
 
···
 		start_queue(md->queue);
 
 	unlock_fs(md);
+
+	return 0;
+}
+
+int dm_resume(struct mapped_device *md)
+{
+	int r = -EINVAL;
+	struct dm_table *map = NULL;
+
+retry:
+	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
+
+	if (!dm_suspended_md(md))
+		goto out;
+
+	if (dm_suspended_internally_md(md)) {
+		/* already internally suspended, wait for internal resume */
+		mutex_unlock(&md->suspend_lock);
+		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
+		if (r)
+			return r;
+		goto retry;
+	}
+
+	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+	if (!map || !dm_table_get_size(map))
+		goto out;
+
+	r = __dm_resume(md, map);
+	if (r)
+		goto out;
 
 	clear_bit(DMF_SUSPENDED, &md->flags);
 
···
  * Internal suspend/resume works like userspace-driven suspend. It waits
  * until all bios finish and prevents issuing new bios to the target drivers.
  * It may be used only from the kernel.
- *
- * Internal suspend holds md->suspend_lock, which prevents interaction with
- * userspace-driven suspend.
  */
 
-void dm_internal_suspend(struct mapped_device *md)
+static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
+{
+	struct dm_table *map = NULL;
+
+	if (dm_suspended_internally_md(md))
+		return; /* nested internal suspend */
+
+	if (dm_suspended_md(md)) {
+		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+		return; /* nest suspend */
+	}
+
+	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+
+	/*
+	 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
+	 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend
+	 * would require changing .presuspend to return an error -- avoid this
+	 * until there is a need for more elaborate variants of internal suspend.
+	 */
+	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);
+
+	set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+
+	dm_table_postsuspend_targets(map);
+}
+
+static void __dm_internal_resume(struct mapped_device *md)
+{
+	if (!dm_suspended_internally_md(md))
+		return; /* resume from nested internal suspend */
+
+	if (dm_suspended_md(md))
+		goto done; /* resume from nested suspend */
+
+	/*
+	 * NOTE: existing callers don't need to call dm_table_resume_targets
+	 * (which may fail -- so best to avoid it for now by passing NULL map)
+	 */
+	(void) __dm_resume(md, NULL);
+
+done:
+	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+	smp_mb__after_atomic();
+	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
+}
+
+void dm_internal_suspend_noflush(struct mapped_device *md)
 {
 	mutex_lock(&md->suspend_lock);
-	if (dm_suspended_md(md))
+	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
+	mutex_unlock(&md->suspend_lock);
+}
+EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
+
+void dm_internal_resume(struct mapped_device *md)
+{
+	mutex_lock(&md->suspend_lock);
+	__dm_internal_resume(md);
+	mutex_unlock(&md->suspend_lock);
+}
+EXPORT_SYMBOL_GPL(dm_internal_resume);
+
+/*
+ * Fast variants of internal suspend/resume hold md->suspend_lock,
+ * which prevents interaction with userspace-driven suspend.
+ */
+
+void dm_internal_suspend_fast(struct mapped_device *md)
+{
+	mutex_lock(&md->suspend_lock);
+	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
 		return;
 
 	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
···
 	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
 }
 
-void dm_internal_resume(struct mapped_device *md)
+void dm_internal_resume_fast(struct mapped_device *md)
 {
-	if (dm_suspended_md(md))
+	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
 		goto done;
 
 	dm_queue_flush(md);
···
 int dm_suspended_md(struct mapped_device *md)
 {
 	return test_bit(DMF_SUSPENDED, &md->flags);
+}
+
+int dm_suspended_internally_md(struct mapped_device *md)
+{
+	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
 }
 
 int dm_test_deferred_remove_flag(struct mapped_device *md)
+10
drivers/md/dm.h
···
 				      struct queue_limits *limits);
 struct list_head *dm_table_get_devices(struct dm_table *t);
 void dm_table_presuspend_targets(struct dm_table *t);
+void dm_table_presuspend_undo_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
···
  * Is this mapped_device suspended?
  */
 int dm_suspended_md(struct mapped_device *md);
+
+/*
+ * Internal suspend and resume methods.
+ */
+int dm_suspended_internally_md(struct mapped_device *md);
+void dm_internal_suspend_fast(struct mapped_device *md);
+void dm_internal_resume_fast(struct mapped_device *md);
+void dm_internal_suspend_noflush(struct mapped_device *md);
+void dm_internal_resume(struct mapped_device *md);
 
 /*
  * Test if the device is scheduled for deferred remove.
+3 -1
drivers/md/persistent-data/dm-array.c
···
 	int r;
 	struct resize resize;
 
-	if (old_size == new_size)
+	if (old_size == new_size) {
+		*new_root = root;
 		return 0;
+	}
 
 	resize.info = info;
 	resize.root = root;
+6 -2
drivers/md/persistent-data/dm-space-map-metadata.c
···
 {
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
-	return smm->ll.nr_blocks;
+	*count = smm->ll.nr_blocks;
+
+	return 0;
 }
 
 static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
···
 {
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
-	return b < smm->begin ? 1 : 0;
+	*result = (b < smm->begin) ? 1 : 0;
+
+	return 0;
 }
 
 static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm,
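The two space-map fixes above (and the dm-array resize fix) are the same bug class: a dm_space_map getter must return 0 or a negative errno and pass its value back through the out pointer, but the bootstrap variants were returning the value itself and never writing the out parameter. A hypothetical userspace sketch of the pattern (illustrative names, not the kernel's API):

```c
#include <assert.h>
#include <stdint.h>

typedef uint64_t dm_block_t;

/* Buggy shape (what sm_bootstrap_get_nr_blocks used to do): the value leaks
 * out as the "error code" and *count is never written. */
static int get_nr_blocks_buggy(dm_block_t nr_blocks, dm_block_t *count)
{
	(void) count;
	return (int) nr_blocks;
}

/* Fixed shape: store through the out parameter, return 0 for success. */
static int get_nr_blocks_fixed(dm_block_t nr_blocks, dm_block_t *count)
{
	*count = nr_blocks;
	return 0;
}

/* A caller that, like the persistent-data code, treats any nonzero return
 * as failure: yields the fetched count, or 0 on the "error" path. */
static dm_block_t read_count(int (*get)(dm_block_t, dm_block_t *), dm_block_t nr)
{
	dm_block_t count = 0;

	if (get(nr, &count))
		return 0;	/* spurious failure with the buggy getter */
	return count;
}
```

With the buggy getter, every non-empty space map reports failure and the caller's count stays uninitialized; the fixed shape is what the patches restore.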
+75 -2
drivers/md/persistent-data/dm-transaction-manager.c
···
 #include "dm-persistent-data-internal.h"
 
 #include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/hash.h>
 #include <linux/slab.h>
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "transaction manager"
+
+/*----------------------------------------------------------------*/
+
+#define PREFETCH_SIZE 128
+#define PREFETCH_BITS 7
+#define PREFETCH_SENTINEL ((dm_block_t) -1ULL)
+
+struct prefetch_set {
+	struct mutex lock;
+	dm_block_t blocks[PREFETCH_SIZE];
+};
+
+static unsigned prefetch_hash(dm_block_t b)
+{
+	return hash_64(b, PREFETCH_BITS);
+}
+
+static void prefetch_wipe(struct prefetch_set *p)
+{
+	unsigned i;
+	for (i = 0; i < PREFETCH_SIZE; i++)
+		p->blocks[i] = PREFETCH_SENTINEL;
+}
+
+static void prefetch_init(struct prefetch_set *p)
+{
+	mutex_init(&p->lock);
+	prefetch_wipe(p);
+}
+
+static void prefetch_add(struct prefetch_set *p, dm_block_t b)
+{
+	unsigned h = prefetch_hash(b);
+
+	mutex_lock(&p->lock);
+	if (p->blocks[h] == PREFETCH_SENTINEL)
+		p->blocks[h] = b;
+
+	mutex_unlock(&p->lock);
+}
+
+static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm)
+{
+	unsigned i;
+
+	mutex_lock(&p->lock);
+
+	for (i = 0; i < PREFETCH_SIZE; i++)
+		if (p->blocks[i] != PREFETCH_SENTINEL) {
+			dm_bm_prefetch(bm, p->blocks[i]);
+			p->blocks[i] = PREFETCH_SENTINEL;
+		}
+
+	mutex_unlock(&p->lock);
+}
 
 /*----------------------------------------------------------------*/
···
 
 	spinlock_t lock;
 	struct hlist_head buckets[DM_HASH_SIZE];
+
+	struct prefetch_set prefetches;
 };
 
 /*----------------------------------------------------------------*/
···
 	spin_lock_init(&tm->lock);
 	for (i = 0; i < DM_HASH_SIZE; i++)
 		INIT_HLIST_HEAD(tm->buckets + i);
+
+	prefetch_init(&tm->prefetches);
 
 	return tm;
 }
···
 			     struct dm_block_validator *v,
 			     struct dm_block **blk)
 {
-	if (tm->is_clone)
-		return dm_bm_read_try_lock(tm->real->bm, b, v, blk);
+	if (tm->is_clone) {
+		int r = dm_bm_read_try_lock(tm->real->bm, b, v, blk);
+
+		if (r == -EWOULDBLOCK)
+			prefetch_add(&tm->real->prefetches, b);
+
+		return r;
+	}
 
 	return dm_bm_read_lock(tm->bm, b, v, blk);
 }
···
 {
 	return tm->bm;
 }
+
+void dm_tm_issue_prefetches(struct dm_transaction_manager *tm)
+{
+	prefetch_issue(&tm->prefetches, tm->bm);
+}
+EXPORT_SYMBOL_GPL(dm_tm_issue_prefetches);
 
 /*----------------------------------------------------------------*/
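The prefetch_set added above is a fixed-size, hash-indexed "at most one pending block per slot" set: reads on the non-blocking clone that miss with -EWOULDBLOCK record the block, and dm_tm_issue_prefetches() later kicks off the reads and empties the slots. A single-threaded userspace model of that dedup-and-drain behavior (mutex dropped, dm_bm_prefetch() replaced by a counter, modulo in place of hash_64()):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef uint64_t dm_block_t;

#define PREFETCH_SIZE 128
#define PREFETCH_SENTINEL ((dm_block_t) -1ULL)

/* Single-threaded model: the kernel version guards this with a mutex. */
static dm_block_t blocks[PREFETCH_SIZE];

static unsigned prefetch_hash(dm_block_t b)
{
	return (unsigned) (b % PREFETCH_SIZE);	/* kernel uses hash_64() */
}

static void prefetch_wipe(void)
{
	for (size_t i = 0; i < PREFETCH_SIZE; i++)
		blocks[i] = PREFETCH_SENTINEL;
}

/* Record b unless its slot is already taken; collisions just drop the hint,
 * which is fine for a best-effort prefetch. */
static void prefetch_add(dm_block_t b)
{
	unsigned h = prefetch_hash(b);

	if (blocks[h] == PREFETCH_SENTINEL)
		blocks[h] = b;
}

/* "Issue" every recorded block once, then empty the set; returns how many
 * prefetches would have been submitted. */
static unsigned prefetch_issue(void)
{
	unsigned n = 0;

	for (size_t i = 0; i < PREFETCH_SIZE; i++)
		if (blocks[i] != PREFETCH_SENTINEL) {
			n++;
			blocks[i] = PREFETCH_SENTINEL;
		}
	return n;
}

/* Duplicate adds and hash collisions collapse into one issued prefetch. */
static unsigned demo_add_and_issue(void)
{
	prefetch_wipe();
	prefetch_add(7);
	prefetch_add(7);			/* duplicate: slot already taken */
	prefetch_add(7 + PREFETCH_SIZE);	/* collides with 7: dropped */
	prefetch_add(9);
	return prefetch_issue();
}
```

Draining on issue is what lets callers repeatedly hit -EWOULDBLOCK on the same block without flooding the block manager with duplicate prefetch requests.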
+7
drivers/md/persistent-data/dm-transaction-manager.h
···
 struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
 
 /*
+ * If you're using a non-blocking clone the tm will build up a list of
+ * requested blocks that weren't in core. This call will request those
+ * blocks to be prefetched.
+ */
+void dm_tm_issue_prefetches(struct dm_transaction_manager *tm);
+
+/*
  * A little utility that ties the knot by producing a transaction manager
  * that has a space map managed by the transaction manager...
  *
+2
include/linux/device-mapper.h
···
 			  union map_info *map_context);
 
 typedef void (*dm_presuspend_fn) (struct dm_target *ti);
+typedef void (*dm_presuspend_undo_fn) (struct dm_target *ti);
 typedef void (*dm_postsuspend_fn) (struct dm_target *ti);
 typedef int (*dm_preresume_fn) (struct dm_target *ti);
 typedef void (*dm_resume_fn) (struct dm_target *ti);
···
 	dm_endio_fn end_io;
 	dm_request_endio_fn rq_end_io;
 	dm_presuspend_fn presuspend;
+	dm_presuspend_undo_fn presuspend_undo;
 	dm_postsuspend_fn postsuspend;
 	dm_preresume_fn preresume;
 	dm_resume_fn resume;
+7 -2
include/uapi/linux/dm-ioctl.h
···
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	28
+#define DM_VERSION_MINOR	29
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2014-09-17)"
+#define DM_VERSION_EXTRA	"-ioctl (2014-10-28)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
···
  * gets closed.
  */
 #define DM_DEFERRED_REMOVE		(1 << 17) /* In/Out */
+
+/*
+ * If set, the device is suspended internally.
+ */
+#define DM_INTERNAL_SUSPEND_FLAG	(1 << 18) /* Out */
 
 #endif /* _LINUX_DM_IOCTL_H */