Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm cache: significant rework to leverage dm-bio-prison-v2

The cache policy interfaces have been updated to work well with the new
bio-prison v2 interface's ability to queue work immediately (promotion,
demotion, etc) -- the overriding benefit being reduced latency on processing
IO through the cache. Previously such work would be left for the DM
cache core to queue on various lists and then process in batches later
-- this caused a serious delay in latency for IO driven by the cache.

The background tracker code was factored out so that all cache policies
can make use of it.

Also, the "cleaner" policy has been removed and is now a variant of the
smq policy that simply disallows migrations.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

Authored by Joe Thornber; committed by Mike Snitzer.
b29d4986 742c8fdc

+2087 -2564
-8
drivers/md/Kconfig
··· 325 325 of less memory utilization, improved performance and increased 326 326 adaptability in the face of changing workloads. 327 327 328 - config DM_CACHE_CLEANER 329 - tristate "Cleaner Cache Policy (EXPERIMENTAL)" 330 - depends on DM_CACHE 331 - default y 332 - ---help--- 333 - A simple cache policy that writes back all data to the 334 - origin. Used when decommissioning a dm-cache. 335 - 336 328 config DM_ERA 337 329 tristate "Era target (EXPERIMENTAL)" 338 330 depends on BLK_DEV_DM
+2 -3
drivers/md/Makefile
··· 13 13 += dm-log-userspace-base.o dm-log-userspace-transfer.o 14 14 dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o 15 15 dm-thin-pool-y += dm-thin.o dm-thin-metadata.o 16 - dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o 16 + dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ 17 + dm-cache-background-tracker.o 17 18 dm-cache-smq-y += dm-cache-policy-smq.o 18 - dm-cache-cleaner-y += dm-cache-policy-cleaner.o 19 19 dm-era-y += dm-era-target.o 20 20 dm-verity-y += dm-verity-target.o 21 21 md-mod-y += md.o bitmap.o ··· 57 57 obj-$(CONFIG_DM_VERITY) += dm-verity.o 58 58 obj-$(CONFIG_DM_CACHE) += dm-cache.o 59 59 obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o 60 - obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o 61 60 obj-$(CONFIG_DM_ERA) += dm-era.o 62 61 obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o 63 62
+238
drivers/md/dm-cache-background-tracker.c
··· 1 + /* 2 + * Copyright (C) 2017 Red Hat. All rights reserved. 3 + * 4 + * This file is released under the GPL. 5 + */ 6 + 7 + #include "dm-cache-background-tracker.h" 8 + 9 + /*----------------------------------------------------------------*/ 10 + 11 + #define DM_MSG_PREFIX "dm-background-tracker" 12 + 13 + struct bt_work { 14 + struct list_head list; 15 + struct rb_node node; 16 + struct policy_work work; 17 + }; 18 + 19 + struct background_tracker { 20 + unsigned max_work; 21 + atomic_t pending_promotes; 22 + atomic_t pending_writebacks; 23 + atomic_t pending_demotes; 24 + 25 + struct list_head issued; 26 + struct list_head queued; 27 + struct rb_root pending; 28 + 29 + struct kmem_cache *work_cache; 30 + }; 31 + 32 + struct background_tracker *btracker_create(unsigned max_work) 33 + { 34 + struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL); 35 + 36 + b->max_work = max_work; 37 + atomic_set(&b->pending_promotes, 0); 38 + atomic_set(&b->pending_writebacks, 0); 39 + atomic_set(&b->pending_demotes, 0); 40 + 41 + INIT_LIST_HEAD(&b->issued); 42 + INIT_LIST_HEAD(&b->queued); 43 + 44 + b->pending = RB_ROOT; 45 + b->work_cache = KMEM_CACHE(bt_work, 0); 46 + if (!b->work_cache) { 47 + DMERR("couldn't create mempool for background work items"); 48 + kfree(b); 49 + b = NULL; 50 + } 51 + 52 + return b; 53 + } 54 + EXPORT_SYMBOL_GPL(btracker_create); 55 + 56 + void btracker_destroy(struct background_tracker *b) 57 + { 58 + kmem_cache_destroy(b->work_cache); 59 + kfree(b); 60 + } 61 + EXPORT_SYMBOL_GPL(btracker_destroy); 62 + 63 + static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs) 64 + { 65 + if (from_oblock(lhs) < from_oblock(rhs)) 66 + return -1; 67 + 68 + if (from_oblock(rhs) < from_oblock(lhs)) 69 + return 1; 70 + 71 + return 0; 72 + } 73 + 74 + static bool __insert_pending(struct background_tracker *b, 75 + struct bt_work *nw) 76 + { 77 + int cmp; 78 + struct bt_work *w; 79 + struct rb_node **new = &b->pending.rb_node, *parent = NULL; 80 + 81 + while 
(*new) { 82 + w = container_of(*new, struct bt_work, node); 83 + 84 + parent = *new; 85 + cmp = cmp_oblock(w->work.oblock, nw->work.oblock); 86 + if (cmp < 0) 87 + new = &((*new)->rb_left); 88 + 89 + else if (cmp > 0) 90 + new = &((*new)->rb_right); 91 + 92 + else 93 + /* already present */ 94 + return false; 95 + } 96 + 97 + rb_link_node(&nw->node, parent, new); 98 + rb_insert_color(&nw->node, &b->pending); 99 + 100 + return true; 101 + } 102 + 103 + static struct bt_work *__find_pending(struct background_tracker *b, 104 + dm_oblock_t oblock) 105 + { 106 + int cmp; 107 + struct bt_work *w; 108 + struct rb_node **new = &b->pending.rb_node; 109 + 110 + while (*new) { 111 + w = container_of(*new, struct bt_work, node); 112 + 113 + cmp = cmp_oblock(w->work.oblock, oblock); 114 + if (cmp < 0) 115 + new = &((*new)->rb_left); 116 + 117 + else if (cmp > 0) 118 + new = &((*new)->rb_right); 119 + 120 + else 121 + break; 122 + } 123 + 124 + return *new ? w : NULL; 125 + } 126 + 127 + 128 + static void update_stats(struct background_tracker *b, struct policy_work *w, int delta) 129 + { 130 + switch (w->op) { 131 + case POLICY_PROMOTE: 132 + atomic_add(delta, &b->pending_promotes); 133 + break; 134 + 135 + case POLICY_DEMOTE: 136 + atomic_add(delta, &b->pending_demotes); 137 + break; 138 + 139 + case POLICY_WRITEBACK: 140 + atomic_add(delta, &b->pending_writebacks); 141 + break; 142 + } 143 + } 144 + 145 + unsigned btracker_nr_writebacks_queued(struct background_tracker *b) 146 + { 147 + return atomic_read(&b->pending_writebacks); 148 + } 149 + EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued); 150 + 151 + unsigned btracker_nr_demotions_queued(struct background_tracker *b) 152 + { 153 + return atomic_read(&b->pending_demotes); 154 + } 155 + EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued); 156 + 157 + static bool max_work_reached(struct background_tracker *b) 158 + { 159 + // FIXME: finish 160 + return false; 161 + } 162 + 163 + int btracker_queue(struct background_tracker *b, 
164 + struct policy_work *work, 165 + struct policy_work **pwork) 166 + { 167 + struct bt_work *w; 168 + 169 + if (pwork) 170 + *pwork = NULL; 171 + 172 + if (max_work_reached(b)) 173 + return -ENOMEM; 174 + 175 + w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT); 176 + if (!w) 177 + return -ENOMEM; 178 + 179 + memcpy(&w->work, work, sizeof(*work)); 180 + 181 + if (!__insert_pending(b, w)) { 182 + /* 183 + * There was a race, we'll just ignore this second 184 + * bit of work for the same oblock. 185 + */ 186 + kmem_cache_free(b->work_cache, w); 187 + return -EINVAL; 188 + } 189 + 190 + if (pwork) { 191 + *pwork = &w->work; 192 + list_add(&w->list, &b->issued); 193 + } else 194 + list_add(&w->list, &b->queued); 195 + update_stats(b, &w->work, 1); 196 + 197 + return 0; 198 + } 199 + EXPORT_SYMBOL_GPL(btracker_queue); 200 + 201 + /* 202 + * Returns -ENODATA if there's no work. 203 + */ 204 + int btracker_issue(struct background_tracker *b, struct policy_work **work) 205 + { 206 + struct bt_work *w; 207 + 208 + if (list_empty(&b->queued)) 209 + return -ENODATA; 210 + 211 + w = list_first_entry(&b->queued, struct bt_work, list); 212 + list_move(&w->list, &b->issued); 213 + *work = &w->work; 214 + 215 + return 0; 216 + } 217 + EXPORT_SYMBOL_GPL(btracker_issue); 218 + 219 + void btracker_complete(struct background_tracker *b, 220 + struct policy_work *op) 221 + { 222 + struct bt_work *w = container_of(op, struct bt_work, work); 223 + 224 + update_stats(b, &w->work, -1); 225 + rb_erase(&w->node, &b->pending); 226 + list_del(&w->list); 227 + kmem_cache_free(b->work_cache, w); 228 + } 229 + EXPORT_SYMBOL_GPL(btracker_complete); 230 + 231 + bool btracker_promotion_already_present(struct background_tracker *b, 232 + dm_oblock_t oblock) 233 + { 234 + return __find_pending(b, oblock) != NULL; 235 + } 236 + EXPORT_SYMBOL_GPL(btracker_promotion_already_present); 237 + 238 + /*----------------------------------------------------------------*/
+46
drivers/md/dm-cache-background-tracker.h
··· 1 + /* 2 + * Copyright (C) 2017 Red Hat. All rights reserved. 3 + * 4 + * This file is released under the GPL. 5 + */ 6 + 7 + #ifndef DM_CACHE_BACKGROUND_WORK_H 8 + #define DM_CACHE_BACKGROUND_WORK_H 9 + 10 + #include <linux/vmalloc.h> 11 + #include "dm-cache-policy.h" 12 + 13 + /*----------------------------------------------------------------*/ 14 + 15 + struct background_work; 16 + struct background_tracker; 17 + 18 + /* 19 + * FIXME: discuss lack of locking in all methods. 20 + */ 21 + struct background_tracker *btracker_create(unsigned max_work); 22 + void btracker_destroy(struct background_tracker *b); 23 + 24 + unsigned btracker_nr_writebacks_queued(struct background_tracker *b); 25 + unsigned btracker_nr_demotions_queued(struct background_tracker *b); 26 + 27 + /* 28 + * returns -EINVAL iff the work is already queued. -ENOMEM if the work 29 + * couldn't be queued for another reason. 30 + */ 31 + int btracker_queue(struct background_tracker *b, 32 + struct policy_work *work, 33 + struct policy_work **pwork); 34 + 35 + /* 36 + * Returns -ENODATA if there's no work. 37 + */ 38 + int btracker_issue(struct background_tracker *b, struct policy_work **work); 39 + void btracker_complete(struct background_tracker *b, 40 + struct policy_work *op); 41 + bool btracker_promotion_already_present(struct background_tracker *b, 42 + dm_oblock_t oblock); 43 + 44 + /*----------------------------------------------------------------*/ 45 + 46 + #endif
+2
drivers/md/dm-cache-metadata.h
··· 50 50 #define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL 51 51 #define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL 52 52 53 + struct dm_cache_metadata; 54 + 53 55 /* 54 56 * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on 55 57 * failure. If reopening then features must match.
-469
drivers/md/dm-cache-policy-cleaner.c
··· 1 - /* 2 - * Copyright (C) 2012 Red Hat. All rights reserved. 3 - * 4 - * writeback cache policy supporting flushing out dirty cache blocks. 5 - * 6 - * This file is released under the GPL. 7 - */ 8 - 9 - #include "dm-cache-policy.h" 10 - #include "dm.h" 11 - 12 - #include <linux/hash.h> 13 - #include <linux/module.h> 14 - #include <linux/slab.h> 15 - #include <linux/vmalloc.h> 16 - 17 - /*----------------------------------------------------------------*/ 18 - 19 - #define DM_MSG_PREFIX "cache cleaner" 20 - 21 - /* Cache entry struct. */ 22 - struct wb_cache_entry { 23 - struct list_head list; 24 - struct hlist_node hlist; 25 - 26 - dm_oblock_t oblock; 27 - dm_cblock_t cblock; 28 - bool dirty:1; 29 - bool pending:1; 30 - }; 31 - 32 - struct hash { 33 - struct hlist_head *table; 34 - dm_block_t hash_bits; 35 - unsigned nr_buckets; 36 - }; 37 - 38 - struct policy { 39 - struct dm_cache_policy policy; 40 - spinlock_t lock; 41 - 42 - struct list_head free; 43 - struct list_head clean; 44 - struct list_head clean_pending; 45 - struct list_head dirty; 46 - 47 - /* 48 - * We know exactly how many cblocks will be needed, 49 - * so we can allocate them up front. 50 - */ 51 - dm_cblock_t cache_size, nr_cblocks_allocated; 52 - struct wb_cache_entry *cblocks; 53 - struct hash chash; 54 - }; 55 - 56 - /*----------------------------------------------------------------------------*/ 57 - 58 - /* 59 - * Low-level functions. 60 - */ 61 - static unsigned next_power(unsigned n, unsigned min) 62 - { 63 - return roundup_pow_of_two(max(n, min)); 64 - } 65 - 66 - static struct policy *to_policy(struct dm_cache_policy *p) 67 - { 68 - return container_of(p, struct policy, policy); 69 - } 70 - 71 - static struct list_head *list_pop(struct list_head *q) 72 - { 73 - struct list_head *r = q->next; 74 - 75 - list_del(r); 76 - 77 - return r; 78 - } 79 - 80 - /*----------------------------------------------------------------------------*/ 81 - 82 - /* Allocate/free various resources. 
*/ 83 - static int alloc_hash(struct hash *hash, unsigned elts) 84 - { 85 - hash->nr_buckets = next_power(elts >> 4, 16); 86 - hash->hash_bits = __ffs(hash->nr_buckets); 87 - hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets); 88 - 89 - return hash->table ? 0 : -ENOMEM; 90 - } 91 - 92 - static void free_hash(struct hash *hash) 93 - { 94 - vfree(hash->table); 95 - } 96 - 97 - static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size) 98 - { 99 - int r = -ENOMEM; 100 - 101 - p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size)); 102 - if (p->cblocks) { 103 - unsigned u = from_cblock(cache_size); 104 - 105 - while (u--) 106 - list_add(&p->cblocks[u].list, &p->free); 107 - 108 - p->nr_cblocks_allocated = 0; 109 - 110 - /* Cache entries hash. */ 111 - r = alloc_hash(&p->chash, from_cblock(cache_size)); 112 - if (r) 113 - vfree(p->cblocks); 114 - } 115 - 116 - return r; 117 - } 118 - 119 - static void free_cache_blocks_and_hash(struct policy *p) 120 - { 121 - free_hash(&p->chash); 122 - vfree(p->cblocks); 123 - } 124 - 125 - static struct wb_cache_entry *alloc_cache_entry(struct policy *p) 126 - { 127 - struct wb_cache_entry *e; 128 - 129 - BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size)); 130 - 131 - e = list_entry(list_pop(&p->free), struct wb_cache_entry, list); 132 - p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1); 133 - 134 - return e; 135 - } 136 - 137 - /*----------------------------------------------------------------------------*/ 138 - 139 - /* Hash functions (lookup, insert, remove). 
*/ 140 - static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock) 141 - { 142 - struct hash *hash = &p->chash; 143 - unsigned h = hash_64(from_oblock(oblock), hash->hash_bits); 144 - struct wb_cache_entry *cur; 145 - struct hlist_head *bucket = &hash->table[h]; 146 - 147 - hlist_for_each_entry(cur, bucket, hlist) { 148 - if (cur->oblock == oblock) { 149 - /* Move upfront bucket for faster access. */ 150 - hlist_del(&cur->hlist); 151 - hlist_add_head(&cur->hlist, bucket); 152 - return cur; 153 - } 154 - } 155 - 156 - return NULL; 157 - } 158 - 159 - static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e) 160 - { 161 - unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits); 162 - 163 - hlist_add_head(&e->hlist, &p->chash.table[h]); 164 - } 165 - 166 - static void remove_cache_hash_entry(struct wb_cache_entry *e) 167 - { 168 - hlist_del(&e->hlist); 169 - } 170 - 171 - /* Public interface (see dm-cache-policy.h */ 172 - static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock, 173 - bool can_block, bool can_migrate, bool discarded_oblock, 174 - struct bio *bio, struct policy_locker *locker, 175 - struct policy_result *result) 176 - { 177 - struct policy *p = to_policy(pe); 178 - struct wb_cache_entry *e; 179 - unsigned long flags; 180 - 181 - result->op = POLICY_MISS; 182 - 183 - if (can_block) 184 - spin_lock_irqsave(&p->lock, flags); 185 - 186 - else if (!spin_trylock_irqsave(&p->lock, flags)) 187 - return -EWOULDBLOCK; 188 - 189 - e = lookup_cache_entry(p, oblock); 190 - if (e) { 191 - result->op = POLICY_HIT; 192 - result->cblock = e->cblock; 193 - 194 - } 195 - 196 - spin_unlock_irqrestore(&p->lock, flags); 197 - 198 - return 0; 199 - } 200 - 201 - static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock) 202 - { 203 - int r; 204 - struct policy *p = to_policy(pe); 205 - struct wb_cache_entry *e; 206 - unsigned long flags; 207 - 208 - if 
(!spin_trylock_irqsave(&p->lock, flags)) 209 - return -EWOULDBLOCK; 210 - 211 - e = lookup_cache_entry(p, oblock); 212 - if (e) { 213 - *cblock = e->cblock; 214 - r = 0; 215 - 216 - } else 217 - r = -ENOENT; 218 - 219 - spin_unlock_irqrestore(&p->lock, flags); 220 - 221 - return r; 222 - } 223 - 224 - static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set) 225 - { 226 - struct policy *p = to_policy(pe); 227 - struct wb_cache_entry *e; 228 - 229 - e = lookup_cache_entry(p, oblock); 230 - BUG_ON(!e); 231 - 232 - if (set) { 233 - if (!e->dirty) { 234 - e->dirty = true; 235 - list_move(&e->list, &p->dirty); 236 - } 237 - 238 - } else { 239 - if (e->dirty) { 240 - e->pending = false; 241 - e->dirty = false; 242 - list_move(&e->list, &p->clean); 243 - } 244 - } 245 - } 246 - 247 - static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) 248 - { 249 - struct policy *p = to_policy(pe); 250 - unsigned long flags; 251 - 252 - spin_lock_irqsave(&p->lock, flags); 253 - __set_clear_dirty(pe, oblock, true); 254 - spin_unlock_irqrestore(&p->lock, flags); 255 - } 256 - 257 - static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) 258 - { 259 - struct policy *p = to_policy(pe); 260 - unsigned long flags; 261 - 262 - spin_lock_irqsave(&p->lock, flags); 263 - __set_clear_dirty(pe, oblock, false); 264 - spin_unlock_irqrestore(&p->lock, flags); 265 - } 266 - 267 - static void add_cache_entry(struct policy *p, struct wb_cache_entry *e) 268 - { 269 - insert_cache_hash_entry(p, e); 270 - if (e->dirty) 271 - list_add(&e->list, &p->dirty); 272 - else 273 - list_add(&e->list, &p->clean); 274 - } 275 - 276 - static int wb_load_mapping(struct dm_cache_policy *pe, 277 - dm_oblock_t oblock, dm_cblock_t cblock, 278 - uint32_t hint, bool hint_valid) 279 - { 280 - int r; 281 - struct policy *p = to_policy(pe); 282 - struct wb_cache_entry *e = alloc_cache_entry(p); 283 - 284 - if (e) { 285 - e->cblock = cblock; 286 - e->oblock = 
oblock; 287 - e->dirty = false; /* blocks default to clean */ 288 - add_cache_entry(p, e); 289 - r = 0; 290 - 291 - } else 292 - r = -ENOMEM; 293 - 294 - return r; 295 - } 296 - 297 - static void wb_destroy(struct dm_cache_policy *pe) 298 - { 299 - struct policy *p = to_policy(pe); 300 - 301 - free_cache_blocks_and_hash(p); 302 - kfree(p); 303 - } 304 - 305 - static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock) 306 - { 307 - struct wb_cache_entry *r = lookup_cache_entry(p, oblock); 308 - 309 - BUG_ON(!r); 310 - 311 - remove_cache_hash_entry(r); 312 - list_del(&r->list); 313 - 314 - return r; 315 - } 316 - 317 - static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock) 318 - { 319 - struct policy *p = to_policy(pe); 320 - struct wb_cache_entry *e; 321 - unsigned long flags; 322 - 323 - spin_lock_irqsave(&p->lock, flags); 324 - e = __wb_force_remove_mapping(p, oblock); 325 - list_add_tail(&e->list, &p->free); 326 - BUG_ON(!from_cblock(p->nr_cblocks_allocated)); 327 - p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1); 328 - spin_unlock_irqrestore(&p->lock, flags); 329 - } 330 - 331 - static void wb_force_mapping(struct dm_cache_policy *pe, 332 - dm_oblock_t current_oblock, dm_oblock_t oblock) 333 - { 334 - struct policy *p = to_policy(pe); 335 - struct wb_cache_entry *e; 336 - unsigned long flags; 337 - 338 - spin_lock_irqsave(&p->lock, flags); 339 - e = __wb_force_remove_mapping(p, current_oblock); 340 - e->oblock = oblock; 341 - add_cache_entry(p, e); 342 - spin_unlock_irqrestore(&p->lock, flags); 343 - } 344 - 345 - static struct wb_cache_entry *get_next_dirty_entry(struct policy *p) 346 - { 347 - struct list_head *l; 348 - struct wb_cache_entry *r; 349 - 350 - if (list_empty(&p->dirty)) 351 - return NULL; 352 - 353 - l = list_pop(&p->dirty); 354 - r = container_of(l, struct wb_cache_entry, list); 355 - list_add(l, &p->clean_pending); 356 - 357 - return r; 358 - } 359 - 360 - 
static int wb_writeback_work(struct dm_cache_policy *pe, 361 - dm_oblock_t *oblock, 362 - dm_cblock_t *cblock, 363 - bool critical_only) 364 - { 365 - int r = -ENOENT; 366 - struct policy *p = to_policy(pe); 367 - struct wb_cache_entry *e; 368 - unsigned long flags; 369 - 370 - spin_lock_irqsave(&p->lock, flags); 371 - 372 - e = get_next_dirty_entry(p); 373 - if (e) { 374 - *oblock = e->oblock; 375 - *cblock = e->cblock; 376 - r = 0; 377 - } 378 - 379 - spin_unlock_irqrestore(&p->lock, flags); 380 - 381 - return r; 382 - } 383 - 384 - static dm_cblock_t wb_residency(struct dm_cache_policy *pe) 385 - { 386 - return to_policy(pe)->nr_cblocks_allocated; 387 - } 388 - 389 - /* Init the policy plugin interface function pointers. */ 390 - static void init_policy_functions(struct policy *p) 391 - { 392 - p->policy.destroy = wb_destroy; 393 - p->policy.map = wb_map; 394 - p->policy.lookup = wb_lookup; 395 - p->policy.set_dirty = wb_set_dirty; 396 - p->policy.clear_dirty = wb_clear_dirty; 397 - p->policy.load_mapping = wb_load_mapping; 398 - p->policy.get_hint = NULL; 399 - p->policy.remove_mapping = wb_remove_mapping; 400 - p->policy.writeback_work = wb_writeback_work; 401 - p->policy.force_mapping = wb_force_mapping; 402 - p->policy.residency = wb_residency; 403 - p->policy.tick = NULL; 404 - } 405 - 406 - static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, 407 - sector_t origin_size, 408 - sector_t cache_block_size) 409 - { 410 - int r; 411 - struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL); 412 - 413 - if (!p) 414 - return NULL; 415 - 416 - init_policy_functions(p); 417 - INIT_LIST_HEAD(&p->free); 418 - INIT_LIST_HEAD(&p->clean); 419 - INIT_LIST_HEAD(&p->clean_pending); 420 - INIT_LIST_HEAD(&p->dirty); 421 - 422 - p->cache_size = cache_size; 423 - spin_lock_init(&p->lock); 424 - 425 - /* Allocate cache entry structs and add them to free list. 
*/ 426 - r = alloc_cache_blocks_with_hash(p, cache_size); 427 - if (!r) 428 - return &p->policy; 429 - 430 - kfree(p); 431 - 432 - return NULL; 433 - } 434 - /*----------------------------------------------------------------------------*/ 435 - 436 - static struct dm_cache_policy_type wb_policy_type = { 437 - .name = "cleaner", 438 - .version = {1, 0, 0}, 439 - .hint_size = 4, 440 - .owner = THIS_MODULE, 441 - .create = wb_create 442 - }; 443 - 444 - static int __init wb_init(void) 445 - { 446 - int r = dm_cache_policy_register(&wb_policy_type); 447 - 448 - if (r < 0) 449 - DMERR("register failed %d", r); 450 - else 451 - DMINFO("version %u.%u.%u loaded", 452 - wb_policy_type.version[0], 453 - wb_policy_type.version[1], 454 - wb_policy_type.version[2]); 455 - 456 - return r; 457 - } 458 - 459 - static void __exit wb_exit(void) 460 - { 461 - dm_cache_policy_unregister(&wb_policy_type); 462 - } 463 - 464 - module_init(wb_init); 465 - module_exit(wb_exit); 466 - 467 - MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>"); 468 - MODULE_LICENSE("GPL"); 469 - MODULE_DESCRIPTION("cleaner cache policy");
+43 -43
drivers/md/dm-cache-policy-internal.h
··· 12 12 13 13 /*----------------------------------------------------------------*/ 14 14 15 - /* 16 - * Little inline functions that simplify calling the policy methods. 17 - */ 18 - static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock, 19 - bool can_block, bool can_migrate, bool discarded_oblock, 20 - struct bio *bio, struct policy_locker *locker, 21 - struct policy_result *result) 15 + static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, 16 + int data_dir, bool fast_copy, bool *background_queued) 22 17 { 23 - return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result); 18 + return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued); 24 19 } 25 20 26 - static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) 21 + static inline int policy_lookup_with_work(struct dm_cache_policy *p, 22 + dm_oblock_t oblock, dm_cblock_t *cblock, 23 + int data_dir, bool fast_copy, 24 + struct policy_work **work) 27 25 { 28 - BUG_ON(!p->lookup); 29 - return p->lookup(p, oblock, cblock); 26 + if (!p->lookup_with_work) { 27 + *work = NULL; 28 + return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL); 29 + } 30 + 31 + return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work); 30 32 } 31 33 32 - static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 34 + static inline int policy_get_background_work(struct dm_cache_policy *p, 35 + bool idle, struct policy_work **result) 33 36 { 34 - if (p->set_dirty) 35 - p->set_dirty(p, oblock); 37 + return p->get_background_work(p, idle, result); 36 38 } 37 39 38 - static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 40 + static inline void policy_complete_background_work(struct dm_cache_policy *p, 41 + struct policy_work *work, 42 + bool success) 39 43 { 40 - if (p->clear_dirty) 41 - p->clear_dirty(p, oblock); 44 + 
return p->complete_background_work(p, work, success); 45 + } 46 + 47 + static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) 48 + { 49 + p->set_dirty(p, cblock); 50 + } 51 + 52 + static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) 53 + { 54 + p->clear_dirty(p, cblock); 42 55 } 43 56 44 57 static inline int policy_load_mapping(struct dm_cache_policy *p, 45 58 dm_oblock_t oblock, dm_cblock_t cblock, 46 - uint32_t hint, bool hint_valid) 59 + bool dirty, uint32_t hint, bool hint_valid) 47 60 { 48 - return p->load_mapping(p, oblock, cblock, hint, hint_valid); 61 + return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid); 62 + } 63 + 64 + static inline int policy_invalidate_mapping(struct dm_cache_policy *p, 65 + dm_cblock_t cblock) 66 + { 67 + return p->invalidate_mapping(p, cblock); 49 68 } 50 69 51 70 static inline uint32_t policy_get_hint(struct dm_cache_policy *p, 52 71 dm_cblock_t cblock) 53 72 { 54 73 return p->get_hint ? p->get_hint(p, cblock) : 0; 55 - } 56 - 57 - static inline int policy_writeback_work(struct dm_cache_policy *p, 58 - dm_oblock_t *oblock, 59 - dm_cblock_t *cblock, 60 - bool critical_only) 61 - { 62 - return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT; 63 - } 64 - 65 - static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 66 - { 67 - p->remove_mapping(p, oblock); 68 - } 69 - 70 - static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) 71 - { 72 - return p->remove_cblock(p, cblock); 73 - } 74 - 75 - static inline void policy_force_mapping(struct dm_cache_policy *p, 76 - dm_oblock_t current_oblock, dm_oblock_t new_oblock) 77 - { 78 - return p->force_mapping(p, current_oblock, new_oblock); 79 74 } 80 75 81 76 static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) ··· 100 105 const char *key, const char *value) 101 106 { 102 107 return p->set_config_value ? 
p->set_config_value(p, key, value) : -EINVAL; 108 + } 109 + 110 + static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow) 111 + { 112 + return p->allow_migrations(p, allow); 103 113 } 104 114 105 115 /*----------------------------------------------------------------*/
+489 -388
drivers/md/dm-cache-policy-smq.c
··· 4 4 * This file is released under the GPL. 5 5 */ 6 6 7 - #include "dm-cache-policy.h" 7 + #include "dm-cache-background-tracker.h" 8 8 #include "dm-cache-policy-internal.h" 9 + #include "dm-cache-policy.h" 9 10 #include "dm.h" 10 11 11 12 #include <linux/hash.h> ··· 39 38 unsigned hash_next:28; 40 39 unsigned prev:28; 41 40 unsigned next:28; 42 - unsigned level:7; 41 + unsigned level:6; 43 42 bool dirty:1; 44 43 bool allocated:1; 45 44 bool sentinel:1; 45 + bool pending_work:1; 46 46 47 47 dm_oblock_t oblock; 48 48 }; ··· 281 279 */ 282 280 static void q_push(struct queue *q, struct entry *e) 283 281 { 282 + BUG_ON(e->pending_work); 283 + 284 284 if (!e->sentinel) 285 285 q->nr_elts++; 286 286 287 287 l_add_tail(q->es, q->qs + e->level, e); 288 288 } 289 289 290 + static void q_push_front(struct queue *q, struct entry *e) 291 + { 292 + BUG_ON(e->pending_work); 293 + 294 + if (!e->sentinel) 295 + q->nr_elts++; 296 + 297 + l_add_head(q->es, q->qs + e->level, e); 298 + } 299 + 290 300 static void q_push_before(struct queue *q, struct entry *old, struct entry *e) 291 301 { 302 + BUG_ON(e->pending_work); 303 + 292 304 if (!e->sentinel) 293 305 q->nr_elts++; 294 306 ··· 344 328 static struct entry *q_pop(struct queue *q) 345 329 { 346 330 struct entry *e = q_peek(q, q->nr_levels, true); 347 - 348 - if (e) 349 - q_del(q, e); 350 - 351 - return e; 352 - } 353 - 354 - /* 355 - * Pops an entry from a level that is not past a sentinel. 
356 - */ 357 - static struct entry *q_pop_old(struct queue *q, unsigned max_level) 358 - { 359 - struct entry *e = q_peek(q, max_level, false); 360 331 361 332 if (e) 362 333 q_del(q, e); ··· 449 446 break; 450 447 451 448 e->level = level + 1u; 452 - l_add_head(q->es, l_above, e); 449 + l_add_tail(q->es, l_above, e); 453 450 } 454 451 } 455 452 } 456 453 457 - static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels) 454 + static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels, 455 + struct entry *s1, struct entry *s2) 458 456 { 459 457 struct entry *de; 460 - unsigned new_level; 458 + unsigned sentinels_passed = 0; 459 + unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels); 461 460 462 - q_del(q, e); 463 - 461 + /* try and find an entry to swap with */ 464 462 if (extra_levels && (e->level < q->nr_levels - 1u)) { 465 - new_level = min(q->nr_levels - 1u, e->level + extra_levels); 466 - for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) { 467 - if (de->sentinel) 468 - continue; 463 + for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de)) 464 + sentinels_passed++; 469 465 466 + if (de) { 470 467 q_del(q, de); 471 468 de->level = e->level; 469 + if (s1) { 470 + switch (sentinels_passed) { 471 + case 0: 472 + q_push_before(q, s1, de); 473 + break; 472 474 473 - if (dest) 474 - q_push_before(q, dest, de); 475 - else 475 + case 1: 476 + q_push_before(q, s2, de); 477 + break; 478 + 479 + default: 480 + q_push(q, de); 481 + } 482 + } else 476 483 q_push(q, de); 477 - break; 478 484 } 479 - 480 - e->level = new_level; 481 485 } 482 486 487 + q_del(q, e); 488 + e->level = new_level; 483 489 q_push(q, e); 484 - } 485 - 486 - static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels) 487 - { 488 - q_requeue_before(q, NULL, e, extra_levels); 489 490 } 490 491 491 492 
/*----------------------------------------------------------------*/ ··· 557 550 558 551 /*----------------------------------------------------------------*/ 559 552 560 - struct hash_table { 553 + struct smq_hash_table { 561 554 struct entry_space *es; 562 555 unsigned long long hash_bits; 563 556 unsigned *buckets; ··· 567 560 * All cache entries are stored in a chained hash table. To save space we 568 561 * use indexing again, and only store indexes to the next entry. 569 562 */ 570 - static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries) 563 + static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries) 571 564 { 572 565 unsigned i, nr_buckets; 573 566 ··· 585 578 return 0; 586 579 } 587 580 588 - static void h_exit(struct hash_table *ht) 581 + static void h_exit(struct smq_hash_table *ht) 589 582 { 590 583 vfree(ht->buckets); 591 584 } 592 585 593 - static struct entry *h_head(struct hash_table *ht, unsigned bucket) 586 + static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket) 594 587 { 595 588 return to_entry(ht->es, ht->buckets[bucket]); 596 589 } 597 590 598 - static struct entry *h_next(struct hash_table *ht, struct entry *e) 591 + static struct entry *h_next(struct smq_hash_table *ht, struct entry *e) 599 592 { 600 593 return to_entry(ht->es, e->hash_next); 601 594 } 602 595 603 - static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e) 596 + static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e) 604 597 { 605 598 e->hash_next = ht->buckets[bucket]; 606 599 ht->buckets[bucket] = to_index(ht->es, e); 607 600 } 608 601 609 - static void h_insert(struct hash_table *ht, struct entry *e) 602 + static void h_insert(struct smq_hash_table *ht, struct entry *e) 610 603 { 611 604 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); 612 605 __h_insert(ht, h, e); 613 606 } 614 607 615 - static struct entry *__h_lookup(struct 
hash_table *ht, unsigned h, dm_oblock_t oblock, 608 + static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock, 616 609 struct entry **prev) 617 610 { 618 611 struct entry *e; ··· 628 621 return NULL; 629 622 } 630 623 631 - static void __h_unlink(struct hash_table *ht, unsigned h, 624 + static void __h_unlink(struct smq_hash_table *ht, unsigned h, 632 625 struct entry *e, struct entry *prev) 633 626 { 634 627 if (prev) ··· 640 633 /* 641 634 * Also moves each entry to the front of the bucket. 642 635 */ 643 - static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) 636 + static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock) 644 637 { 645 638 struct entry *e, *prev; 646 639 unsigned h = hash_64(from_oblock(oblock), ht->hash_bits); ··· 658 651 return e; 659 652 } 660 653 661 - static void h_remove(struct hash_table *ht, struct entry *e) 654 + static void h_remove(struct smq_hash_table *ht, struct entry *e) 662 655 { 663 656 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); 664 657 struct entry *prev; ··· 706 699 e->next = INDEXER_NULL; 707 700 e->prev = INDEXER_NULL; 708 701 e->level = 0u; 702 + e->dirty = true; /* FIXME: audit */ 709 703 e->allocated = true; 704 + e->sentinel = false; 705 + e->pending_work = false; 710 706 } 711 707 712 708 static struct entry *alloc_entry(struct entry_alloc *ea) ··· 772 762 #define NR_HOTSPOT_LEVELS 64u 773 763 #define NR_CACHE_LEVELS 64u 774 764 775 - #define WRITEBACK_PERIOD (10 * HZ) 776 - #define DEMOTE_PERIOD (60 * HZ) 765 + #define WRITEBACK_PERIOD (10ul * HZ) 766 + #define DEMOTE_PERIOD (60ul * HZ) 777 767 778 768 #define HOTSPOT_UPDATE_PERIOD (HZ) 779 - #define CACHE_UPDATE_PERIOD (10u * HZ) 769 + #define CACHE_UPDATE_PERIOD (60ul * HZ) 780 770 781 771 struct smq_policy { 782 772 struct dm_cache_policy policy; ··· 824 814 * The hash tables allows us to quickly find an entry by origin 825 815 * block. 
826 816 */ 827 - struct hash_table table; 828 - struct hash_table hotspot_table; 817 + struct smq_hash_table table; 818 + struct smq_hash_table hotspot_table; 829 819 830 820 bool current_writeback_sentinels; 831 821 unsigned long next_writeback_period; ··· 838 828 839 829 unsigned long next_hotspot_period; 840 830 unsigned long next_cache_period; 831 + 832 + struct background_tracker *bg_work; 833 + 834 + bool migrations_allowed; 841 835 }; 842 836 843 837 /*----------------------------------------------------------------*/ ··· 890 876 static void update_sentinels(struct smq_policy *mq) 891 877 { 892 878 if (time_after(jiffies, mq->next_writeback_period)) { 893 - __update_writeback_sentinels(mq); 894 879 mq->next_writeback_period = jiffies + WRITEBACK_PERIOD; 895 880 mq->current_writeback_sentinels = !mq->current_writeback_sentinels; 881 + __update_writeback_sentinels(mq); 896 882 } 897 883 898 884 if (time_after(jiffies, mq->next_demote_period)) { 899 - __update_demote_sentinels(mq); 900 885 mq->next_demote_period = jiffies + DEMOTE_PERIOD; 901 886 mq->current_demote_sentinels = !mq->current_demote_sentinels; 887 + __update_demote_sentinels(mq); 902 888 } 903 889 } 904 890 ··· 934 920 935 921 /*----------------------------------------------------------------*/ 936 922 937 - /* 938 - * These methods tie together the dirty queue, clean queue and hash table. 939 - */ 940 - static void push_new(struct smq_policy *mq, struct entry *e) 923 + static void del_queue(struct smq_policy *mq, struct entry *e) 941 924 { 942 - struct queue *q = e->dirty ? &mq->dirty : &mq->clean; 943 - h_insert(&mq->table, e); 944 - q_push(q, e); 925 + q_del(e->dirty ? 
&mq->dirty : &mq->clean, e); 945 926 } 946 927 928 + static void push_queue(struct smq_policy *mq, struct entry *e) 929 + { 930 + if (e->dirty) 931 + q_push(&mq->dirty, e); 932 + else 933 + q_push(&mq->clean, e); 934 + } 935 + 936 + // !h, !q, a -> h, q, a 947 937 static void push(struct smq_policy *mq, struct entry *e) 948 938 { 949 - struct entry *sentinel; 950 - 951 939 h_insert(&mq->table, e); 952 - 953 - /* 954 - * Punch this into the queue just in front of the sentinel, to 955 - * ensure it's cleaned straight away. 956 - */ 957 - if (e->dirty) { 958 - sentinel = writeback_sentinel(mq, e->level); 959 - q_push_before(&mq->dirty, sentinel, e); 960 - } else { 961 - sentinel = demote_sentinel(mq, e->level); 962 - q_push_before(&mq->clean, sentinel, e); 963 - } 940 + if (!e->pending_work) 941 + push_queue(mq, e); 964 942 } 965 943 966 - /* 967 - * Removes an entry from cache. Removes from the hash table. 968 - */ 969 - static void __del(struct smq_policy *mq, struct queue *q, struct entry *e) 944 + static void push_queue_front(struct smq_policy *mq, struct entry *e) 970 945 { 971 - q_del(q, e); 972 - h_remove(&mq->table, e); 946 + if (e->dirty) 947 + q_push_front(&mq->dirty, e); 948 + else 949 + q_push_front(&mq->clean, e); 973 950 } 974 951 975 - static void del(struct smq_policy *mq, struct entry *e) 952 + static void push_front(struct smq_policy *mq, struct entry *e) 976 953 { 977 - __del(mq, e->dirty ? 
&mq->dirty : &mq->clean, e); 978 - } 979 - 980 - static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level) 981 - { 982 - struct entry *e = q_pop_old(q, max_level); 983 - if (e) 984 - h_remove(&mq->table, e); 985 - return e; 954 + h_insert(&mq->table, e); 955 + if (!e->pending_work) 956 + push_queue_front(mq, e); 986 957 } 987 958 988 959 static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) ··· 977 978 978 979 static void requeue(struct smq_policy *mq, struct entry *e) 979 980 { 980 - struct entry *sentinel; 981 + /* 982 + * Pending work has temporarily been taken out of the queues. 983 + */ 984 + if (e->pending_work) 985 + return; 981 986 982 987 if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) { 983 - if (e->dirty) { 984 - sentinel = writeback_sentinel(mq, e->level); 985 - q_requeue_before(&mq->dirty, sentinel, e, 1u); 986 - } else { 987 - sentinel = demote_sentinel(mq, e->level); 988 - q_requeue_before(&mq->clean, sentinel, e, 1u); 988 + if (!e->dirty) { 989 + q_requeue(&mq->clean, e, 1u, NULL, NULL); 990 + return; 989 991 } 992 + 993 + q_requeue(&mq->dirty, e, 1u, 994 + get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels), 995 + get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels)); 990 996 } 991 997 } 992 998 ··· 1030 1026 unsigned threshold_level = allocator_empty(&mq->cache_alloc) ? 1031 1027 default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u); 1032 1028 1029 + threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS); 1030 + 1033 1031 /* 1034 1032 * If the hotspot queue is performing badly then we have little 1035 1033 * confidence that we know which blocks to promote. 
So we cut down ··· 1051 1045 } 1052 1046 1053 1047 mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level; 1054 - mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u; 1048 + mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level); 1055 1049 } 1056 1050 1057 1051 /* ··· 1101 1095 } 1102 1096 } 1103 1097 1104 - static int demote_cblock(struct smq_policy *mq, 1105 - struct policy_locker *locker, 1106 - dm_oblock_t *oblock) 1098 + /*----------------------------------------------------------------*/ 1099 + 1100 + /* 1101 + * Targets are given as a percentage. 1102 + */ 1103 + #define CLEAN_TARGET 25u 1104 + #define FREE_TARGET 25u 1105 + 1106 + static unsigned percent_to_target(struct smq_policy *mq, unsigned p) 1107 1107 { 1108 - struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false); 1109 - if (!demoted) 1110 - /* 1111 - * We could get a block from mq->dirty, but that 1112 - * would add extra latency to the triggering bio as it 1113 - * waits for the writeback. Better to not promote this 1114 - * time and hope there's a clean block next time this block 1115 - * is hit. 1116 - */ 1117 - return -ENOSPC; 1118 - 1119 - if (locker->fn(locker, demoted->oblock)) 1120 - /* 1121 - * We couldn't lock this block. 1122 - */ 1123 - return -EBUSY; 1124 - 1125 - del(mq, demoted); 1126 - *oblock = demoted->oblock; 1127 - free_entry(&mq->cache_alloc, demoted); 1128 - 1129 - return 0; 1108 + return from_cblock(mq->cache_size) * p / 100u; 1130 1109 } 1110 + 1111 + static bool clean_target_met(struct smq_policy *mq, bool idle) 1112 + { 1113 + /* 1114 + * Cache entries may not be populated. So we cannot rely on the 1115 + * size of the clean queue. 1116 + */ 1117 + unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); 1118 + 1119 + if (idle) 1120 + /* 1121 + * We'd like to clean everything. 
1122 + */ 1123 + return q_size(&mq->dirty) == 0u; 1124 + else 1125 + return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >= 1126 + percent_to_target(mq, CLEAN_TARGET); 1127 + } 1128 + 1129 + static bool free_target_met(struct smq_policy *mq, bool idle) 1130 + { 1131 + unsigned nr_free = from_cblock(mq->cache_size) - 1132 + mq->cache_alloc.nr_allocated; 1133 + 1134 + if (idle) 1135 + return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >= 1136 + percent_to_target(mq, FREE_TARGET); 1137 + else 1138 + return true; 1139 + } 1140 + 1141 + /*----------------------------------------------------------------*/ 1142 + 1143 + static void mark_pending(struct smq_policy *mq, struct entry *e) 1144 + { 1145 + BUG_ON(e->sentinel); 1146 + BUG_ON(!e->allocated); 1147 + BUG_ON(e->pending_work); 1148 + e->pending_work = true; 1149 + } 1150 + 1151 + static void clear_pending(struct smq_policy *mq, struct entry *e) 1152 + { 1153 + BUG_ON(!e->pending_work); 1154 + e->pending_work = false; 1155 + } 1156 + 1157 + static void queue_writeback(struct smq_policy *mq) 1158 + { 1159 + int r; 1160 + struct policy_work work; 1161 + struct entry *e; 1162 + 1163 + e = q_peek(&mq->dirty, mq->dirty.nr_levels, false); 1164 + if (e) { 1165 + mark_pending(mq, e); 1166 + q_del(&mq->dirty, e); 1167 + 1168 + work.op = POLICY_WRITEBACK; 1169 + work.oblock = e->oblock; 1170 + work.cblock = infer_cblock(mq, e); 1171 + 1172 + r = btracker_queue(mq->bg_work, &work, NULL); 1173 + WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race. 
1174 + } 1175 + } 1176 + 1177 + static void queue_demotion(struct smq_policy *mq) 1178 + { 1179 + struct policy_work work; 1180 + struct entry *e; 1181 + 1182 + if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed))) 1183 + return; 1184 + 1185 + e = q_peek(&mq->clean, mq->clean.nr_levels, true); 1186 + if (!e) { 1187 + if (!clean_target_met(mq, false)) 1188 + queue_writeback(mq); 1189 + return; 1190 + } 1191 + 1192 + mark_pending(mq, e); 1193 + q_del(&mq->clean, e); 1194 + 1195 + work.op = POLICY_DEMOTE; 1196 + work.oblock = e->oblock; 1197 + work.cblock = infer_cblock(mq, e); 1198 + btracker_queue(mq->bg_work, &work, NULL); 1199 + } 1200 + 1201 + static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, 1202 + struct policy_work **workp) 1203 + { 1204 + struct entry *e; 1205 + struct policy_work work; 1206 + 1207 + if (!mq->migrations_allowed) 1208 + return; 1209 + 1210 + if (allocator_empty(&mq->cache_alloc)) { 1211 + if (!free_target_met(mq, false)) 1212 + queue_demotion(mq); 1213 + return; 1214 + } 1215 + 1216 + if (btracker_promotion_already_present(mq->bg_work, oblock)) 1217 + return; 1218 + 1219 + /* 1220 + * We allocate the entry now to reserve the cblock. If the 1221 + * background work is aborted we must remember to free it. 1222 + */ 1223 + e = alloc_entry(&mq->cache_alloc); 1224 + BUG_ON(!e); 1225 + e->pending_work = true; 1226 + work.op = POLICY_PROMOTE; 1227 + work.oblock = oblock; 1228 + work.cblock = infer_cblock(mq, e); 1229 + btracker_queue(mq->bg_work, &work, workp); 1230 + } 1231 + 1232 + /*----------------------------------------------------------------*/ 1131 1233 1132 1234 enum promote_result { 1133 1235 PROMOTE_NOT, ··· 1251 1137 return promote ? 
PROMOTE_PERMANENT : PROMOTE_NOT; 1252 1138 } 1253 1139 1254 - static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio, 1255 - bool fast_promote) 1140 + static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, 1141 + int data_dir, bool fast_promote) 1256 1142 { 1257 - if (bio_data_dir(bio) == WRITE) { 1143 + if (data_dir == WRITE) { 1258 1144 if (!allocator_empty(&mq->cache_alloc) && fast_promote) 1259 1145 return PROMOTE_TEMPORARY; 1260 1146 1261 - else 1262 - return maybe_promote(hs_e->level >= mq->write_promote_level); 1147 + return maybe_promote(hs_e->level >= mq->write_promote_level); 1263 1148 } else 1264 1149 return maybe_promote(hs_e->level >= mq->read_promote_level); 1265 - } 1266 - 1267 - static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock, 1268 - struct policy_locker *locker, 1269 - struct policy_result *result, enum promote_result pr) 1270 - { 1271 - int r; 1272 - struct entry *e; 1273 - 1274 - if (allocator_empty(&mq->cache_alloc)) { 1275 - result->op = POLICY_REPLACE; 1276 - r = demote_cblock(mq, locker, &result->old_oblock); 1277 - if (r) { 1278 - result->op = POLICY_MISS; 1279 - return; 1280 - } 1281 - 1282 - } else 1283 - result->op = POLICY_NEW; 1284 - 1285 - e = alloc_entry(&mq->cache_alloc); 1286 - BUG_ON(!e); 1287 - e->oblock = oblock; 1288 - 1289 - if (pr == PROMOTE_TEMPORARY) 1290 - push(mq, e); 1291 - else 1292 - push_new(mq, e); 1293 - 1294 - result->cblock = infer_cblock(mq, e); 1295 1150 } 1296 1151 1297 1152 static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) ··· 1270 1187 return to_oblock(r); 1271 1188 } 1272 1189 1273 - static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio) 1190 + static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b) 1274 1191 { 1275 1192 unsigned hi; 1276 1193 dm_oblock_t hb = to_hblock(mq, b); ··· 1282 1199 hi = get_index(&mq->hotspot_alloc, e); 
1283 1200 q_requeue(&mq->hotspot, e, 1284 1201 test_and_set_bit(hi, mq->hotspot_hit_bits) ? 1285 - 0u : mq->hotspot_level_jump); 1202 + 0u : mq->hotspot_level_jump, 1203 + NULL, NULL); 1286 1204 1287 1205 } else { 1288 1206 stats_miss(&mq->hotspot_stats); ··· 1309 1225 return e; 1310 1226 } 1311 1227 1312 - /* 1313 - * Looks the oblock up in the hash table, then decides whether to put in 1314 - * pre_cache, or cache etc. 1315 - */ 1316 - static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock, 1317 - bool can_migrate, bool fast_promote, 1318 - struct policy_locker *locker, struct policy_result *result) 1319 - { 1320 - struct entry *e, *hs_e; 1321 - enum promote_result pr; 1322 - 1323 - hs_e = update_hotspot_queue(mq, oblock, bio); 1324 - 1325 - e = h_lookup(&mq->table, oblock); 1326 - if (e) { 1327 - stats_level_accessed(&mq->cache_stats, e->level); 1328 - 1329 - requeue(mq, e); 1330 - result->op = POLICY_HIT; 1331 - result->cblock = infer_cblock(mq, e); 1332 - 1333 - } else { 1334 - stats_miss(&mq->cache_stats); 1335 - 1336 - pr = should_promote(mq, hs_e, bio, fast_promote); 1337 - if (pr == PROMOTE_NOT) 1338 - result->op = POLICY_MISS; 1339 - 1340 - else { 1341 - if (!can_migrate) { 1342 - result->op = POLICY_MISS; 1343 - return -EWOULDBLOCK; 1344 - } 1345 - 1346 - insert_in_cache(mq, oblock, locker, result, pr); 1347 - } 1348 - } 1349 - 1350 - return 0; 1351 - } 1352 - 1353 1228 /*----------------------------------------------------------------*/ 1354 1229 1355 1230 /* ··· 1325 1282 { 1326 1283 struct smq_policy *mq = to_smq_policy(p); 1327 1284 1285 + btracker_destroy(mq->bg_work); 1328 1286 h_exit(&mq->hotspot_table); 1329 1287 h_exit(&mq->table); 1330 1288 free_bitset(mq->hotspot_hit_bits); ··· 1334 1290 kfree(mq); 1335 1291 } 1336 1292 1337 - static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock, 1338 - bool can_block, bool can_migrate, bool fast_promote, 1339 - struct bio *bio, struct policy_locker *locker, 1340 - struct 
policy_result *result) 1293 + /*----------------------------------------------------------------*/ 1294 + 1295 + static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock, 1296 + int data_dir, bool fast_copy, 1297 + struct policy_work **work, bool *background_work) 1341 1298 { 1342 - int r; 1343 - unsigned long flags; 1344 - struct smq_policy *mq = to_smq_policy(p); 1299 + struct entry *e, *hs_e; 1300 + enum promote_result pr; 1345 1301 1346 - result->op = POLICY_MISS; 1302 + *background_work = false; 1347 1303 1348 - spin_lock_irqsave(&mq->lock, flags); 1349 - r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result); 1350 - spin_unlock_irqrestore(&mq->lock, flags); 1351 - 1352 - return r; 1353 - } 1354 - 1355 - static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) 1356 - { 1357 - int r; 1358 - unsigned long flags; 1359 - struct smq_policy *mq = to_smq_policy(p); 1360 - struct entry *e; 1361 - 1362 - spin_lock_irqsave(&mq->lock, flags); 1363 1304 e = h_lookup(&mq->table, oblock); 1364 1305 if (e) { 1306 + stats_level_accessed(&mq->cache_stats, e->level); 1307 + 1308 + requeue(mq, e); 1365 1309 *cblock = infer_cblock(mq, e); 1366 - r = 0; 1367 - } else 1368 - r = -ENOENT; 1310 + return 0; 1311 + 1312 + } else { 1313 + stats_miss(&mq->cache_stats); 1314 + 1315 + /* 1316 + * The hotspot queue only gets updated with misses. 
1317 + */ 1318 + hs_e = update_hotspot_queue(mq, oblock); 1319 + 1320 + pr = should_promote(mq, hs_e, data_dir, fast_copy); 1321 + if (pr != PROMOTE_NOT) { 1322 + queue_promotion(mq, oblock, work); 1323 + *background_work = true; 1324 + } 1325 + 1326 + return -ENOENT; 1327 + } 1328 + } 1329 + 1330 + static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, 1331 + int data_dir, bool fast_copy, 1332 + bool *background_work) 1333 + { 1334 + int r; 1335 + unsigned long flags; 1336 + struct smq_policy *mq = to_smq_policy(p); 1337 + 1338 + spin_lock_irqsave(&mq->lock, flags); 1339 + r = __lookup(mq, oblock, cblock, 1340 + data_dir, fast_copy, 1341 + NULL, background_work); 1369 1342 spin_unlock_irqrestore(&mq->lock, flags); 1370 1343 1371 1344 return r; 1372 1345 } 1373 1346 1374 - static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set) 1347 + static int smq_lookup_with_work(struct dm_cache_policy *p, 1348 + dm_oblock_t oblock, dm_cblock_t *cblock, 1349 + int data_dir, bool fast_copy, 1350 + struct policy_work **work) 1375 1351 { 1376 - struct entry *e; 1352 + int r; 1353 + bool background_queued; 1354 + unsigned long flags; 1355 + struct smq_policy *mq = to_smq_policy(p); 1377 1356 1378 - e = h_lookup(&mq->table, oblock); 1379 - BUG_ON(!e); 1357 + spin_lock_irqsave(&mq->lock, flags); 1358 + r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued); 1359 + spin_unlock_irqrestore(&mq->lock, flags); 1380 1360 1381 - del(mq, e); 1382 - e->dirty = set; 1383 - push(mq, e); 1361 + return r; 1384 1362 } 1385 1363 1386 - static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 1364 + static int smq_get_background_work(struct dm_cache_policy *p, bool idle, 1365 + struct policy_work **result) 1366 + { 1367 + int r; 1368 + unsigned long flags; 1369 + struct smq_policy *mq = to_smq_policy(p); 1370 + 1371 + spin_lock_irqsave(&mq->lock, flags); 1372 + r = btracker_issue(mq->bg_work, 
result); 1373 + if (r == -ENODATA) { 1374 + /* find some writeback work to do */ 1375 + if (mq->migrations_allowed && !free_target_met(mq, idle)) 1376 + queue_demotion(mq); 1377 + 1378 + else if (!clean_target_met(mq, idle)) 1379 + queue_writeback(mq); 1380 + 1381 + r = btracker_issue(mq->bg_work, result); 1382 + } 1383 + spin_unlock_irqrestore(&mq->lock, flags); 1384 + 1385 + return r; 1386 + } 1387 + 1388 + /* 1389 + * We need to clear any pending work flags that have been set, and in the 1390 + * case of promotion free the entry for the destination cblock. 1391 + */ 1392 + static void __complete_background_work(struct smq_policy *mq, 1393 + struct policy_work *work, 1394 + bool success) 1395 + { 1396 + struct entry *e = get_entry(&mq->cache_alloc, 1397 + from_cblock(work->cblock)); 1398 + 1399 + switch (work->op) { 1400 + case POLICY_PROMOTE: 1401 + // !h, !q, a 1402 + clear_pending(mq, e); 1403 + if (success) { 1404 + e->oblock = work->oblock; 1405 + push(mq, e); 1406 + // h, q, a 1407 + } else { 1408 + free_entry(&mq->cache_alloc, e); 1409 + // !h, !q, !a 1410 + } 1411 + break; 1412 + 1413 + case POLICY_DEMOTE: 1414 + // h, !q, a 1415 + if (success) { 1416 + h_remove(&mq->table, e); 1417 + free_entry(&mq->cache_alloc, e); 1418 + // !h, !q, !a 1419 + } else { 1420 + clear_pending(mq, e); 1421 + push_queue(mq, e); 1422 + // h, q, a 1423 + } 1424 + break; 1425 + 1426 + case POLICY_WRITEBACK: 1427 + // h, !q, a 1428 + clear_pending(mq, e); 1429 + push_queue(mq, e); 1430 + // h, q, a 1431 + break; 1432 + } 1433 + 1434 + btracker_complete(mq->bg_work, work); 1435 + } 1436 + 1437 + static void smq_complete_background_work(struct dm_cache_policy *p, 1438 + struct policy_work *work, 1439 + bool success) 1387 1440 { 1388 1441 unsigned long flags; 1389 1442 struct smq_policy *mq = to_smq_policy(p); 1390 1443 1391 1444 spin_lock_irqsave(&mq->lock, flags); 1392 - __smq_set_clear_dirty(mq, oblock, true); 1445 + __complete_background_work(mq, work, success); 1393 1446 
spin_unlock_irqrestore(&mq->lock, flags); 1394 1447 } 1395 1448 1396 - static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 1449 + // in_hash(oblock) -> in_hash(oblock) 1450 + static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set) 1451 + { 1452 + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); 1453 + 1454 + if (e->pending_work) 1455 + e->dirty = set; 1456 + else { 1457 + del_queue(mq, e); 1458 + e->dirty = set; 1459 + push_queue(mq, e); 1460 + } 1461 + } 1462 + 1463 + static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) 1464 + { 1465 + unsigned long flags; 1466 + struct smq_policy *mq = to_smq_policy(p); 1467 + 1468 + spin_lock_irqsave(&mq->lock, flags); 1469 + __smq_set_clear_dirty(mq, cblock, true); 1470 + spin_unlock_irqrestore(&mq->lock, flags); 1471 + } 1472 + 1473 + static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) 1397 1474 { 1398 1475 struct smq_policy *mq = to_smq_policy(p); 1399 1476 unsigned long flags; 1400 1477 1401 1478 spin_lock_irqsave(&mq->lock, flags); 1402 - __smq_set_clear_dirty(mq, oblock, false); 1479 + __smq_set_clear_dirty(mq, cblock, false); 1403 1480 spin_unlock_irqrestore(&mq->lock, flags); 1404 1481 } 1405 1482 ··· 1531 1366 1532 1367 static int smq_load_mapping(struct dm_cache_policy *p, 1533 1368 dm_oblock_t oblock, dm_cblock_t cblock, 1534 - uint32_t hint, bool hint_valid) 1369 + bool dirty, uint32_t hint, bool hint_valid) 1535 1370 { 1536 1371 struct smq_policy *mq = to_smq_policy(p); 1537 1372 struct entry *e; 1538 1373 1539 1374 e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); 1540 1375 e->oblock = oblock; 1541 - e->dirty = false; /* this gets corrected in a minute */ 1376 + e->dirty = dirty; 1542 1377 e->level = hint_valid ? 
min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); 1543 - push(mq, e); 1378 + e->pending_work = false; 1544 1379 1380 + /* 1381 + * When we load mappings we push ahead of both sentinels in order to 1382 + * allow demotions and cleaning to occur immediately. 1383 + */ 1384 + push_front(mq, e); 1385 + 1386 + return 0; 1387 + } 1388 + 1389 + static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock) 1390 + { 1391 + struct smq_policy *mq = to_smq_policy(p); 1392 + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); 1393 + 1394 + if (!e->allocated) 1395 + return -ENODATA; 1396 + 1397 + // FIXME: what if this block has pending background work? 1398 + del_queue(mq, e); 1399 + h_remove(&mq->table, e); 1400 + free_entry(&mq->cache_alloc, e); 1545 1401 return 0; 1546 1402 } 1547 1403 ··· 1575 1389 return 0; 1576 1390 1577 1391 return e->level; 1578 - } 1579 - 1580 - static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock) 1581 - { 1582 - struct entry *e; 1583 - 1584 - e = h_lookup(&mq->table, oblock); 1585 - BUG_ON(!e); 1586 - 1587 - del(mq, e); 1588 - free_entry(&mq->cache_alloc, e); 1589 - } 1590 - 1591 - static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 1592 - { 1593 - struct smq_policy *mq = to_smq_policy(p); 1594 - unsigned long flags; 1595 - 1596 - spin_lock_irqsave(&mq->lock, flags); 1597 - __remove_mapping(mq, oblock); 1598 - spin_unlock_irqrestore(&mq->lock, flags); 1599 - } 1600 - 1601 - static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock) 1602 - { 1603 - struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); 1604 - 1605 - if (!e || !e->allocated) 1606 - return -ENODATA; 1607 - 1608 - del(mq, e); 1609 - free_entry(&mq->cache_alloc, e); 1610 - 1611 - return 0; 1612 - } 1613 - 1614 - static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) 1615 - { 1616 - int r; 1617 - unsigned long flags; 1618 - struct smq_policy *mq = 
to_smq_policy(p); 1619 - 1620 - spin_lock_irqsave(&mq->lock, flags); 1621 - r = __remove_cblock(mq, cblock); 1622 - spin_unlock_irqrestore(&mq->lock, flags); 1623 - 1624 - return r; 1625 - } 1626 - 1627 - 1628 - #define CLEAN_TARGET_CRITICAL 5u /* percent */ 1629 - 1630 - static bool clean_target_met(struct smq_policy *mq, bool critical) 1631 - { 1632 - if (critical) { 1633 - /* 1634 - * Cache entries may not be populated. So we're cannot rely on the 1635 - * size of the clean queue. 1636 - */ 1637 - unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); 1638 - unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u; 1639 - 1640 - return nr_clean >= target; 1641 - } else 1642 - return !q_size(&mq->dirty); 1643 - } 1644 - 1645 - static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock, 1646 - dm_cblock_t *cblock, bool critical_only) 1647 - { 1648 - struct entry *e = NULL; 1649 - bool target_met = clean_target_met(mq, critical_only); 1650 - 1651 - if (critical_only) 1652 - /* 1653 - * Always try and keep the bottom level clean. 1654 - */ 1655 - e = pop_old(mq, &mq->dirty, target_met ? 
1u : mq->dirty.nr_levels); 1656 - 1657 - else 1658 - e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels); 1659 - 1660 - if (!e) 1661 - return -ENODATA; 1662 - 1663 - *oblock = e->oblock; 1664 - *cblock = infer_cblock(mq, e); 1665 - e->dirty = false; 1666 - push_new(mq, e); 1667 - 1668 - return 0; 1669 - } 1670 - 1671 - static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, 1672 - dm_cblock_t *cblock, bool critical_only) 1673 - { 1674 - int r; 1675 - unsigned long flags; 1676 - struct smq_policy *mq = to_smq_policy(p); 1677 - 1678 - spin_lock_irqsave(&mq->lock, flags); 1679 - r = __smq_writeback_work(mq, oblock, cblock, critical_only); 1680 - spin_unlock_irqrestore(&mq->lock, flags); 1681 - 1682 - return r; 1683 - } 1684 - 1685 - static void __force_mapping(struct smq_policy *mq, 1686 - dm_oblock_t current_oblock, dm_oblock_t new_oblock) 1687 - { 1688 - struct entry *e = h_lookup(&mq->table, current_oblock); 1689 - 1690 - if (e) { 1691 - del(mq, e); 1692 - e->oblock = new_oblock; 1693 - e->dirty = true; 1694 - push(mq, e); 1695 - } 1696 - } 1697 - 1698 - static void smq_force_mapping(struct dm_cache_policy *p, 1699 - dm_oblock_t current_oblock, dm_oblock_t new_oblock) 1700 - { 1701 - unsigned long flags; 1702 - struct smq_policy *mq = to_smq_policy(p); 1703 - 1704 - spin_lock_irqsave(&mq->lock, flags); 1705 - __force_mapping(mq, current_oblock, new_oblock); 1706 - spin_unlock_irqrestore(&mq->lock, flags); 1707 1392 } 1708 1393 1709 1394 static dm_cblock_t smq_residency(struct dm_cache_policy *p) ··· 1601 1544 end_hotspot_period(mq); 1602 1545 end_cache_period(mq); 1603 1546 spin_unlock_irqrestore(&mq->lock, flags); 1547 + } 1548 + 1549 + static void smq_allow_migrations(struct dm_cache_policy *p, bool allow) 1550 + { 1551 + struct smq_policy *mq = to_smq_policy(p); 1552 + mq->migrations_allowed = allow; 1604 1553 } 1605 1554 1606 1555 /* ··· 1653 1590 static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) 1654 1591 { 1655 1592 
mq->policy.destroy = smq_destroy; 1656 - mq->policy.map = smq_map; 1657 1593 mq->policy.lookup = smq_lookup; 1594 + mq->policy.lookup_with_work = smq_lookup_with_work; 1595 + mq->policy.get_background_work = smq_get_background_work; 1596 + mq->policy.complete_background_work = smq_complete_background_work; 1658 1597 mq->policy.set_dirty = smq_set_dirty; 1659 1598 mq->policy.clear_dirty = smq_clear_dirty; 1660 1599 mq->policy.load_mapping = smq_load_mapping; 1600 + mq->policy.invalidate_mapping = smq_invalidate_mapping; 1661 1601 mq->policy.get_hint = smq_get_hint; 1662 - mq->policy.remove_mapping = smq_remove_mapping; 1663 - mq->policy.remove_cblock = smq_remove_cblock; 1664 - mq->policy.writeback_work = smq_writeback_work; 1665 - mq->policy.force_mapping = smq_force_mapping; 1666 1602 mq->policy.residency = smq_residency; 1667 1603 mq->policy.tick = smq_tick; 1604 + mq->policy.allow_migrations = smq_allow_migrations; 1668 1605 1669 1606 if (mimic_mq) { 1670 1607 mq->policy.set_config_value = mq_set_config_value; ··· 1696 1633 static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, 1697 1634 sector_t origin_size, 1698 1635 sector_t cache_block_size, 1699 - bool mimic_mq) 1636 + bool mimic_mq, 1637 + bool migrations_allowed) 1700 1638 { 1701 1639 unsigned i; 1702 1640 unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; ··· 1722 1658 } 1723 1659 1724 1660 init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue); 1725 - for (i = 0; i < nr_sentinels_per_queue; i++) 1661 + for (i = 0; i < nr_sentinels_per_queue; i++) 1726 1662 get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true; 1727 1663 1728 1664 init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels); 1729 - for (i = 0; i < nr_sentinels_per_queue; i++) 1665 + for (i = 0; i < nr_sentinels_per_queue; i++) 1730 1666 get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true; 1731 1667 1732 1668 init_allocator(&mq->hotspot_alloc, 
&mq->es, total_sentinels, ··· 1779 1715 mq->next_hotspot_period = jiffies; 1780 1716 mq->next_cache_period = jiffies; 1781 1717 1718 + mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */ 1719 + if (!mq->bg_work) 1720 + goto bad_btracker; 1721 + 1722 + mq->migrations_allowed = migrations_allowed; 1723 + 1782 1724 return &mq->policy; 1783 1725 1726 + bad_btracker: 1727 + h_exit(&mq->hotspot_table); 1784 1728 bad_alloc_hotspot_table: 1785 1729 h_exit(&mq->table); 1786 1730 bad_alloc_table: ··· 1807 1735 sector_t origin_size, 1808 1736 sector_t cache_block_size) 1809 1737 { 1810 - return __smq_create(cache_size, origin_size, cache_block_size, false); 1738 + return __smq_create(cache_size, origin_size, cache_block_size, false, true); 1811 1739 } 1812 1740 1813 1741 static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, 1814 1742 sector_t origin_size, 1815 1743 sector_t cache_block_size) 1816 1744 { 1817 - return __smq_create(cache_size, origin_size, cache_block_size, true); 1745 + return __smq_create(cache_size, origin_size, cache_block_size, true, true); 1746 + } 1747 + 1748 + static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size, 1749 + sector_t origin_size, 1750 + sector_t cache_block_size) 1751 + { 1752 + return __smq_create(cache_size, origin_size, cache_block_size, false, false); 1818 1753 } 1819 1754 1820 1755 /*----------------------------------------------------------------*/ 1821 1756 1822 1757 static struct dm_cache_policy_type smq_policy_type = { 1823 1758 .name = "smq", 1824 - .version = {1, 5, 0}, 1759 + .version = {2, 0, 0}, 1825 1760 .hint_size = 4, 1826 1761 .owner = THIS_MODULE, 1827 1762 .create = smq_create ··· 1836 1757 1837 1758 static struct dm_cache_policy_type mq_policy_type = { 1838 1759 .name = "mq", 1839 - .version = {1, 5, 0}, 1760 + .version = {2, 0, 0}, 1840 1761 .hint_size = 4, 1841 1762 .owner = THIS_MODULE, 1842 1763 .create = mq_create, 1843 1764 }; 1844 1765 1766 + static struct 
dm_cache_policy_type cleaner_policy_type = { 1767 + .name = "cleaner", 1768 + .version = {2, 0, 0}, 1769 + .hint_size = 4, 1770 + .owner = THIS_MODULE, 1771 + .create = cleaner_create, 1772 + }; 1773 + 1845 1774 static struct dm_cache_policy_type default_policy_type = { 1846 1775 .name = "default", 1847 - .version = {1, 5, 0}, 1776 + .version = {2, 0, 0}, 1848 1777 .hint_size = 4, 1849 1778 .owner = THIS_MODULE, 1850 1779 .create = smq_create, ··· 1872 1785 r = dm_cache_policy_register(&mq_policy_type); 1873 1786 if (r) { 1874 1787 DMERR("register failed (as mq) %d", r); 1875 - dm_cache_policy_unregister(&smq_policy_type); 1876 - return -ENOMEM; 1788 + goto out_mq; 1789 + } 1790 + 1791 + r = dm_cache_policy_register(&cleaner_policy_type); 1792 + if (r) { 1793 + DMERR("register failed (as cleaner) %d", r); 1794 + goto out_cleaner; 1877 1795 } 1878 1796 1879 1797 r = dm_cache_policy_register(&default_policy_type); 1880 1798 if (r) { 1881 1799 DMERR("register failed (as default) %d", r); 1882 - dm_cache_policy_unregister(&mq_policy_type); 1883 - dm_cache_policy_unregister(&smq_policy_type); 1884 - return -ENOMEM; 1800 + goto out_default; 1885 1801 } 1886 1802 1887 1803 return 0; 1804 + 1805 + out_default: 1806 + dm_cache_policy_unregister(&cleaner_policy_type); 1807 + out_cleaner: 1808 + dm_cache_policy_unregister(&mq_policy_type); 1809 + out_mq: 1810 + dm_cache_policy_unregister(&smq_policy_type); 1811 + 1812 + return -ENOMEM; 1888 1813 } 1889 1814 1890 1815 static void __exit smq_exit(void) 1891 1816 { 1817 + dm_cache_policy_unregister(&cleaner_policy_type); 1892 1818 dm_cache_policy_unregister(&smq_policy_type); 1893 1819 dm_cache_policy_unregister(&mq_policy_type); 1894 1820 dm_cache_policy_unregister(&default_policy_type); ··· 1916 1816 1917 1817 MODULE_ALIAS("dm-cache-default"); 1918 1818 MODULE_ALIAS("dm-cache-mq"); 1819 + MODULE_ALIAS("dm-cache-cleaner");
+59 -138
drivers/md/dm-cache-policy.h
··· 13 13 14 14 /*----------------------------------------------------------------*/ 15 15 16 - /* FIXME: make it clear which methods are optional. Get debug policy to 17 - * double check this at start. 18 - */ 19 - 20 16 /* 21 17 * The cache policy makes the important decisions about which blocks get to 22 18 * live on the faster cache device. 23 - * 24 - * When the core target has to remap a bio it calls the 'map' method of the 25 - * policy. This returns an instruction telling the core target what to do. 26 - * 27 - * POLICY_HIT: 28 - * That block is in the cache. Remap to the cache and carry on. 29 - * 30 - * POLICY_MISS: 31 - * This block is on the origin device. Remap and carry on. 32 - * 33 - * POLICY_NEW: 34 - * This block is currently on the origin device, but the policy wants to 35 - * move it. The core should: 36 - * 37 - * - hold any further io to this origin block 38 - * - copy the origin to the given cache block 39 - * - release all the held blocks 40 - * - remap the original block to the cache 41 - * 42 - * POLICY_REPLACE: 43 - * This block is currently on the origin device. The policy wants to 44 - * move it to the cache, with the added complication that the destination 45 - * cache block needs a writeback first. The core should: 46 - * 47 - * - hold any further io to this origin block 48 - * - hold any further io to the origin block that's being written back 49 - * - writeback 50 - * - copy new block to cache 51 - * - release held blocks 52 - * - remap bio to cache and reissue. 53 - * 54 - * Should the core run into trouble while processing a POLICY_NEW or 55 - * POLICY_REPLACE instruction it will roll back the policies mapping using 56 - * remove_mapping() or force_mapping(). These methods must not fail. This 57 - * approach avoids having transactional semantics in the policy (ie, the 58 - * core informing the policy when a migration is complete), and hence makes 59 - * it easier to write new policies. 
60 - * 61 - * In general policy methods should never block, except in the case of the 62 - * map function when can_migrate is set. So be careful to implement using 63 - * bounded, preallocated memory. 64 19 */ 65 20 enum policy_operation { 66 - POLICY_HIT, 67 - POLICY_MISS, 68 - POLICY_NEW, 69 - POLICY_REPLACE 70 - }; 71 - 72 - /* 73 - * When issuing a POLICY_REPLACE the policy needs to make a callback to 74 - * lock the block being demoted. This doesn't need to occur during a 75 - * writeback operation since the block remains in the cache. 76 - */ 77 - struct policy_locker; 78 - typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock); 79 - 80 - struct policy_locker { 81 - policy_lock_fn fn; 21 + POLICY_PROMOTE, 22 + POLICY_DEMOTE, 23 + POLICY_WRITEBACK 82 24 }; 83 25 84 26 /* 85 27 * This is the instruction passed back to the core target. 86 28 */ 87 - struct policy_result { 29 + struct policy_work { 88 30 enum policy_operation op; 89 - dm_oblock_t old_oblock; /* POLICY_REPLACE */ 90 - dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ 31 + dm_oblock_t oblock; 32 + dm_cblock_t cblock; 91 33 }; 92 34 93 35 /* 94 - * The cache policy object. Just a bunch of methods. It is envisaged that 95 - * this structure will be embedded in a bigger, policy specific structure 96 - * (ie. use container_of()). 36 + * The cache policy object. It is envisaged that this structure will be 37 + * embedded in a bigger, policy specific structure (ie. use container_of()). 97 38 */ 98 39 struct dm_cache_policy { 99 - 100 - /* 101 - * FIXME: make it clear which methods are optional, and which may 102 - * block. 103 - */ 104 - 105 40 /* 106 41 * Destroys this object. 107 42 */ 108 43 void (*destroy)(struct dm_cache_policy *p); 109 44 110 45 /* 111 - * See large comment above. 112 - * 113 - * oblock - the origin block we're interested in. 114 - * 115 - * can_block - indicates whether the current thread is allowed to 116 - * block. 
-EWOULDBLOCK returned if it can't and would. 117 - * 118 - * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE 119 - * instructions. If denied and the policy would have 120 - * returned one of these instructions it should 121 - * return -EWOULDBLOCK. 122 - * 123 - * discarded_oblock - indicates whether the whole origin block is 124 - * in a discarded state (FIXME: better to tell the 125 - * policy about this sooner, so it can recycle that 126 - * cache block if it wants.) 127 - * bio - the bio that triggered this call. 128 - * result - gets filled in with the instruction. 129 - * 130 - * May only return 0, or -EWOULDBLOCK (if !can_migrate) 131 - */ 132 - int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock, 133 - bool can_block, bool can_migrate, bool discarded_oblock, 134 - struct bio *bio, struct policy_locker *locker, 135 - struct policy_result *result); 136 - 137 - /* 138 - * Sometimes we want to see if a block is in the cache, without 139 - * triggering any update of stats. (ie. it's not a real hit). 46 + * Find the location of a block. 140 47 * 141 48 * Must not block. 142 49 * 143 - * Returns 0 if in cache, -ENOENT if not, < 0 for other errors 144 - * (-EWOULDBLOCK would be typical). 50 + * Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for 51 + * other errors (-EWOULDBLOCK would be typical). data_dir should be 52 + * READ or WRITE. fast_copy should be set if migrating this block would 53 + * be 'cheap' somehow (eg, discarded data). background_queued will be set 54 + * if a migration has just been queued. 
145 55 */ 146 - int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); 56 + int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, 57 + int data_dir, bool fast_copy, bool *background_queued); 147 58 148 - void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 149 - void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 59 + /* 60 + * Sometimes the core target can optimise a migration, eg, the 61 + * block may be discarded, or the bio may cover an entire block. 62 + * In order to optimise it needs the migration immediately though 63 + * so it knows to do something different with the bio. 64 + * 65 + * This method is optional (policy-internal will fallback to using 66 + * lookup). 67 + */ 68 + int (*lookup_with_work)(struct dm_cache_policy *p, 69 + dm_oblock_t oblock, dm_cblock_t *cblock, 70 + int data_dir, bool fast_copy, 71 + struct policy_work **work); 72 + 73 + /* 74 + * Retrieves background work. Returns -ENODATA when there's no 75 + * background work. 76 + */ 77 + int (*get_background_work)(struct dm_cache_policy *p, bool idle, 78 + struct policy_work **result); 79 + 80 + /* 81 + * You must pass in the same work pointer that you were given, not 82 + * a copy. 83 + */ 84 + void (*complete_background_work)(struct dm_cache_policy *p, 85 + struct policy_work *work, 86 + bool success); 87 + 88 + void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); 89 + void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); 150 90 151 91 /* 152 92 * Called when a cache target is first created. Used to load a 153 93 * mapping from the metadata device into the policy. 154 94 */ 155 95 int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, 156 - dm_cblock_t cblock, uint32_t hint, bool hint_valid); 96 + dm_cblock_t cblock, bool dirty, 97 + uint32_t hint, bool hint_valid); 98 + 99 + /* 100 + * Drops the mapping, irrespective of whether it's clean or dirty. 
101 + * Returns -ENODATA if cblock is not mapped. 102 + */ 103 + int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock); 157 104 158 105 /* 159 106 * Gets the hint for a given cblock. Called in a single threaded 160 107 * context. So no locking required. 161 108 */ 162 109 uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); 163 - 164 - /* 165 - * Override functions used on the error paths of the core target. 166 - * They must succeed. 167 - */ 168 - void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock); 169 - void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, 170 - dm_oblock_t new_oblock); 171 - 172 - /* 173 - * This is called via the invalidate_cblocks message. It is 174 - * possible the particular cblock has already been removed due to a 175 - * write io in passthrough mode. In which case this should return 176 - * -ENODATA. 177 - */ 178 - int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); 179 - 180 - /* 181 - * Provide a dirty block to be written back by the core target. If 182 - * critical_only is set then the policy should only provide work if 183 - * it urgently needs it. 184 - * 185 - * Returns: 186 - * 187 - * 0 and @cblock,@oblock: block to write back provided 188 - * 189 - * -ENODATA: no dirty blocks available 190 - */ 191 - int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock, 192 - bool critical_only); 193 110 194 111 /* 195 112 * How full is the cache? ··· 119 202 * queue merging has occurred). To stop the policy being fooled by 120 203 * these, the core target sends regular tick() calls to the policy. 121 204 * The policy should only count an entry as hit once per tick. 205 + * 206 + * This method is optional. 
122 207 */ 123 208 void (*tick)(struct dm_cache_policy *p, bool can_block); 124 209 ··· 131 212 unsigned maxlen, ssize_t *sz_ptr); 132 213 int (*set_config_value)(struct dm_cache_policy *p, 133 214 const char *key, const char *value); 215 + 216 + void (*allow_migrations)(struct dm_cache_policy *p, bool allow); 134 217 135 218 /* 136 219 * Book keeping ptr for the policy register, not for general use.
+1208 -1515
drivers/md/dm-cache-target.c
··· 5 5 */ 6 6 7 7 #include "dm.h" 8 - #include "dm-bio-prison-v1.h" 8 + #include "dm-bio-prison-v2.h" 9 9 #include "dm-bio-record.h" 10 10 #include "dm-cache-metadata.h" 11 11 ··· 15 15 #include <linux/init.h> 16 16 #include <linux/mempool.h> 17 17 #include <linux/module.h> 18 + #include <linux/rwsem.h> 18 19 #include <linux/slab.h> 19 20 #include <linux/vmalloc.h> 20 21 ··· 26 25 27 26 /*----------------------------------------------------------------*/ 28 27 29 - #define IOT_RESOLUTION 4 28 + /* 29 + * Glossary: 30 + * 31 + * oblock: index of an origin block 32 + * cblock: index of a cache block 33 + * promotion: movement of a block from origin to cache 34 + * demotion: movement of a block from cache to origin 35 + * migration: movement of a block between the origin and cache device, 36 + * either direction 37 + */ 38 + 39 + /*----------------------------------------------------------------*/ 30 40 31 41 struct io_tracker { 32 42 spinlock_t lock; ··· 111 99 /*----------------------------------------------------------------*/ 112 100 113 101 /* 114 - * Glossary: 115 - * 116 - * oblock: index of an origin block 117 - * cblock: index of a cache block 118 - * promotion: movement of a block from origin to cache 119 - * demotion: movement of a block from cache to origin 120 - * migration: movement of a block between the origin and cache device, 121 - * either direction 102 + * Represents a chunk of future work. 'input' allows continuations to pass 103 + * values between themselves, typically error values. 
122 104 */ 105 + struct continuation { 106 + struct work_struct ws; 107 + int input; 108 + }; 109 + 110 + static inline void init_continuation(struct continuation *k, 111 + void (*fn)(struct work_struct *)) 112 + { 113 + INIT_WORK(&k->ws, fn); 114 + k->input = 0; 115 + } 116 + 117 + static inline void queue_continuation(struct workqueue_struct *wq, 118 + struct continuation *k) 119 + { 120 + queue_work(wq, &k->ws); 121 + } 123 122 124 123 /*----------------------------------------------------------------*/ 124 + 125 + /* 126 + * The batcher collects together pieces of work that need a particular 127 + * operation to occur before they can proceed (typically a commit). 128 + */ 129 + struct batcher { 130 + /* 131 + * The operation that everyone is waiting for. 132 + */ 133 + int (*commit_op)(void *context); 134 + void *commit_context; 135 + 136 + /* 137 + * This is how bios should be issued once the commit op is complete 138 + * (accounted_request). 139 + */ 140 + void (*issue_op)(struct bio *bio, void *context); 141 + void *issue_context; 142 + 143 + /* 144 + * Queued work gets put on here after commit. 145 + */ 146 + struct workqueue_struct *wq; 147 + 148 + spinlock_t lock; 149 + struct list_head work_items; 150 + struct bio_list bios; 151 + struct work_struct commit_work; 152 + 153 + bool commit_scheduled; 154 + }; 155 + 156 + static void __commit(struct work_struct *_ws) 157 + { 158 + struct batcher *b = container_of(_ws, struct batcher, commit_work); 159 + 160 + int r; 161 + unsigned long flags; 162 + struct list_head work_items; 163 + struct work_struct *ws, *tmp; 164 + struct continuation *k; 165 + struct bio *bio; 166 + struct bio_list bios; 167 + 168 + INIT_LIST_HEAD(&work_items); 169 + bio_list_init(&bios); 170 + 171 + /* 172 + * We have to grab these before the commit_op to avoid a race 173 + * condition. 
174 + */ 175 + spin_lock_irqsave(&b->lock, flags); 176 + list_splice_init(&b->work_items, &work_items); 177 + bio_list_merge(&bios, &b->bios); 178 + bio_list_init(&b->bios); 179 + b->commit_scheduled = false; 180 + spin_unlock_irqrestore(&b->lock, flags); 181 + 182 + r = b->commit_op(b->commit_context); 183 + 184 + list_for_each_entry_safe(ws, tmp, &work_items, entry) { 185 + k = container_of(ws, struct continuation, ws); 186 + k->input = r; 187 + INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 188 + queue_work(b->wq, ws); 189 + } 190 + 191 + while ((bio = bio_list_pop(&bios))) { 192 + if (r) { 193 + bio->bi_error = r; 194 + bio_endio(bio); 195 + } else 196 + b->issue_op(bio, b->issue_context); 197 + } 198 + } 199 + 200 + static void batcher_init(struct batcher *b, 201 + int (*commit_op)(void *), 202 + void *commit_context, 203 + void (*issue_op)(struct bio *bio, void *), 204 + void *issue_context, 205 + struct workqueue_struct *wq) 206 + { 207 + b->commit_op = commit_op; 208 + b->commit_context = commit_context; 209 + b->issue_op = issue_op; 210 + b->issue_context = issue_context; 211 + b->wq = wq; 212 + 213 + spin_lock_init(&b->lock); 214 + INIT_LIST_HEAD(&b->work_items); 215 + bio_list_init(&b->bios); 216 + INIT_WORK(&b->commit_work, __commit); 217 + b->commit_scheduled = false; 218 + } 219 + 220 + static void async_commit(struct batcher *b) 221 + { 222 + queue_work(b->wq, &b->commit_work); 223 + } 224 + 225 + static void continue_after_commit(struct batcher *b, struct continuation *k) 226 + { 227 + unsigned long flags; 228 + bool commit_scheduled; 229 + 230 + spin_lock_irqsave(&b->lock, flags); 231 + commit_scheduled = b->commit_scheduled; 232 + list_add_tail(&k->ws.entry, &b->work_items); 233 + spin_unlock_irqrestore(&b->lock, flags); 234 + 235 + if (commit_scheduled) 236 + async_commit(b); 237 + } 238 + 239 + /* 240 + * Bios are errored if commit failed. 
241 + */ 242 + static void issue_after_commit(struct batcher *b, struct bio *bio) 243 + { 244 + unsigned long flags; 245 + bool commit_scheduled; 246 + 247 + spin_lock_irqsave(&b->lock, flags); 248 + commit_scheduled = b->commit_scheduled; 249 + bio_list_add(&b->bios, bio); 250 + spin_unlock_irqrestore(&b->lock, flags); 251 + 252 + if (commit_scheduled) 253 + async_commit(b); 254 + } 255 + 256 + /* 257 + * Call this if some urgent work is waiting for the commit to complete. 258 + */ 259 + static void schedule_commit(struct batcher *b) 260 + { 261 + bool immediate; 262 + unsigned long flags; 263 + 264 + spin_lock_irqsave(&b->lock, flags); 265 + immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); 266 + b->commit_scheduled = true; 267 + spin_unlock_irqrestore(&b->lock, flags); 268 + 269 + if (immediate) 270 + async_commit(b); 271 + } 125 272 126 273 /* 127 274 * There are a couple of places where we let a bio run, but want to do some ··· 360 189 atomic_t write_miss; 361 190 atomic_t demotion; 362 191 atomic_t promotion; 192 + atomic_t writeback; 363 193 atomic_t copies_avoided; 364 194 atomic_t cache_cell_clash; 365 195 atomic_t commit_count; 366 196 atomic_t discard_count; 367 - }; 368 - 369 - /* 370 - * Defines a range of cblocks, begin to (end - 1) are in the range. end is 371 - * the one-past-the-end value. 
372 - */ 373 - struct cblock_range { 374 - dm_cblock_t begin; 375 - dm_cblock_t end; 376 - }; 377 - 378 - struct invalidation_request { 379 - struct list_head list; 380 - struct cblock_range *cblocks; 381 - 382 - atomic_t complete; 383 - int err; 384 - 385 - wait_queue_head_t result_wait; 386 197 }; 387 198 388 199 struct cache { ··· 408 255 spinlock_t lock; 409 256 struct list_head deferred_cells; 410 257 struct bio_list deferred_bios; 411 - struct bio_list deferred_flush_bios; 412 258 struct bio_list deferred_writethrough_bios; 413 - struct list_head quiesced_migrations; 414 - struct list_head completed_migrations; 415 - struct list_head need_commit_migrations; 416 259 sector_t migration_threshold; 417 260 wait_queue_head_t migration_wait; 418 261 atomic_t nr_allocated_migrations; ··· 419 270 */ 420 271 atomic_t nr_io_migrations; 421 272 422 - wait_queue_head_t quiescing_wait; 423 - atomic_t quiescing; 424 - atomic_t quiescing_ack; 273 + struct rw_semaphore quiesce_lock; 425 274 426 275 /* 427 276 * cache_size entries, dirty if set ··· 443 296 444 297 struct dm_kcopyd_client *copier; 445 298 struct workqueue_struct *wq; 446 - struct work_struct worker; 447 - 299 + struct work_struct deferred_bio_worker; 300 + struct work_struct deferred_writethrough_worker; 301 + struct work_struct migration_worker; 448 302 struct delayed_work waker; 449 - unsigned long last_commit_jiffies; 450 - 451 - struct dm_bio_prison *prison; 452 - struct dm_deferred_set *all_io_ds; 303 + struct dm_bio_prison_v2 *prison; 453 304 454 305 mempool_t *migration_pool; 455 306 ··· 475 330 struct list_head invalidation_requests; 476 331 477 332 struct io_tracker origin_tracker; 333 + 334 + struct work_struct commit_ws; 335 + struct batcher committer; 336 + 337 + struct rw_semaphore background_work_lock; 478 338 }; 479 339 480 340 struct per_bio_data { 481 341 bool tick:1; 482 342 unsigned req_nr:2; 483 - struct dm_deferred_entry *all_io_entry; 343 + struct dm_bio_prison_cell_v2 *cell; 484 344 
struct dm_hook_info hook_info; 485 345 sector_t len; 486 346 ··· 500 350 }; 501 351 502 352 struct dm_cache_migration { 503 - struct list_head list; 353 + struct continuation k; 504 354 struct cache *cache; 505 355 506 - unsigned long start_jiffies; 507 - dm_oblock_t old_oblock; 508 - dm_oblock_t new_oblock; 509 - dm_cblock_t cblock; 356 + struct policy_work *op; 357 + struct bio *overwrite_bio; 358 + struct dm_bio_prison_cell_v2 *cell; 510 359 511 - bool err:1; 512 - bool discard:1; 513 - bool writeback:1; 514 - bool demote:1; 515 - bool promote:1; 516 - bool requeue_holder:1; 517 - bool invalidate:1; 518 - 519 - struct dm_bio_prison_cell *old_ocell; 520 - struct dm_bio_prison_cell *new_ocell; 360 + dm_cblock_t invalidate_cblock; 361 + dm_oblock_t invalidate_oblock; 521 362 }; 522 363 523 - /* 524 - * Processing a bio in the worker thread may require these memory 525 - * allocations. We prealloc to avoid deadlocks (the same worker thread 526 - * frees them back to the mempool). 527 - */ 528 - struct prealloc { 529 - struct dm_cache_migration *mg; 530 - struct dm_bio_prison_cell *cell1; 531 - struct dm_bio_prison_cell *cell2; 532 - }; 364 + /*----------------------------------------------------------------*/ 533 365 534 - static enum cache_metadata_mode get_cache_mode(struct cache *cache); 535 - 536 - static void wake_worker(struct cache *cache) 366 + static bool writethrough_mode(struct cache_features *f) 537 367 { 538 - queue_work(cache->wq, &cache->worker); 368 + return f->io_mode == CM_IO_WRITETHROUGH; 369 + } 370 + 371 + static bool writeback_mode(struct cache_features *f) 372 + { 373 + return f->io_mode == CM_IO_WRITEBACK; 374 + } 375 + 376 + static inline bool passthrough_mode(struct cache_features *f) 377 + { 378 + return unlikely(f->io_mode == CM_IO_PASSTHROUGH); 539 379 } 540 380 541 381 /*----------------------------------------------------------------*/ 542 382 543 - static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) 383 + static 
void wake_deferred_bio_worker(struct cache *cache) 544 384 { 545 - /* FIXME: change to use a local slab. */ 546 - return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); 385 + queue_work(cache->wq, &cache->deferred_bio_worker); 547 386 } 548 387 549 - static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) 388 + static void wake_deferred_writethrough_worker(struct cache *cache) 550 389 { 551 - dm_bio_prison_free_cell(cache->prison, cell); 390 + queue_work(cache->wq, &cache->deferred_writethrough_worker); 391 + } 392 + 393 + static void wake_migration_worker(struct cache *cache) 394 + { 395 + if (passthrough_mode(&cache->features)) 396 + return; 397 + 398 + queue_work(cache->wq, &cache->migration_worker); 399 + } 400 + 401 + /*----------------------------------------------------------------*/ 402 + 403 + static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 404 + { 405 + return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); 406 + } 407 + 408 + static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 409 + { 410 + dm_bio_prison_free_cell_v2(cache->prison, cell); 552 411 } 553 412 554 413 static struct dm_cache_migration *alloc_migration(struct cache *cache) ··· 583 424 mempool_free(mg, cache->migration_pool); 584 425 } 585 426 586 - static int prealloc_data_structs(struct cache *cache, struct prealloc *p) 587 - { 588 - if (!p->mg) { 589 - p->mg = alloc_migration(cache); 590 - if (!p->mg) 591 - return -ENOMEM; 592 - } 593 - 594 - if (!p->cell1) { 595 - p->cell1 = alloc_prison_cell(cache); 596 - if (!p->cell1) 597 - return -ENOMEM; 598 - } 599 - 600 - if (!p->cell2) { 601 - p->cell2 = alloc_prison_cell(cache); 602 - if (!p->cell2) 603 - return -ENOMEM; 604 - } 605 - 606 - return 0; 607 - } 608 - 609 - static void prealloc_free_structs(struct cache *cache, struct prealloc *p) 610 - { 611 - if (p->cell2) 612 - free_prison_cell(cache, p->cell2); 613 - 614 - if (p->cell1) 615 - 
free_prison_cell(cache, p->cell1); 616 - 617 - if (p->mg) 618 - free_migration(p->mg); 619 - } 620 - 621 - static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) 622 - { 623 - struct dm_cache_migration *mg = p->mg; 624 - 625 - BUG_ON(!mg); 626 - p->mg = NULL; 627 - 628 - return mg; 629 - } 630 - 631 - /* 632 - * You must have a cell within the prealloc struct to return. If not this 633 - * function will BUG() rather than returning NULL. 634 - */ 635 - static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) 636 - { 637 - struct dm_bio_prison_cell *r = NULL; 638 - 639 - if (p->cell1) { 640 - r = p->cell1; 641 - p->cell1 = NULL; 642 - 643 - } else if (p->cell2) { 644 - r = p->cell2; 645 - p->cell2 = NULL; 646 - } else 647 - BUG(); 648 - 649 - return r; 650 - } 651 - 652 - /* 653 - * You can't have more than two cells in a prealloc struct. BUG() will be 654 - * called if you try and overfill. 655 - */ 656 - static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) 657 - { 658 - if (!p->cell2) 659 - p->cell2 = cell; 660 - 661 - else if (!p->cell1) 662 - p->cell1 = cell; 663 - 664 - else 665 - BUG(); 666 - } 667 - 668 427 /*----------------------------------------------------------------*/ 669 428 670 - static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) 429 + static inline dm_oblock_t oblock_succ(dm_oblock_t b) 430 + { 431 + return to_oblock(from_oblock(b) + 1ull); 432 + } 433 + 434 + static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) 671 435 { 672 436 key->virtual = 0; 673 437 key->dev = 0; ··· 599 517 } 600 518 601 519 /* 602 - * The caller hands in a preallocated cell, and a free function for it. 603 - * The cell will be freed if there's an error, or if it wasn't used because 604 - * a cell with that key already exists. 520 + * We have two lock levels. 
Level 0, which is used to prevent WRITEs, and 521 + * level 1 which prevents *both* READs and WRITEs. 605 522 */ 606 - typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); 523 + #define WRITE_LOCK_LEVEL 0 524 + #define READ_WRITE_LOCK_LEVEL 1 607 525 608 - static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, 609 - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 610 - cell_free_fn free_fn, void *free_context, 611 - struct dm_bio_prison_cell **cell_result) 526 + static unsigned lock_level(struct bio *bio) 612 527 { 613 - int r; 614 - struct dm_cell_key key; 615 - 616 - build_key(oblock_begin, oblock_end, &key); 617 - r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 618 - if (r) 619 - free_fn(free_context, cell_prealloc); 620 - 621 - return r; 528 + return bio_data_dir(bio) == WRITE ? 529 + WRITE_LOCK_LEVEL : 530 + READ_WRITE_LOCK_LEVEL; 622 531 } 623 532 624 - static int bio_detain(struct cache *cache, dm_oblock_t oblock, 625 - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 626 - cell_free_fn free_fn, void *free_context, 627 - struct dm_bio_prison_cell **cell_result) 533 + /*---------------------------------------------------------------- 534 + * Per bio data 535 + *--------------------------------------------------------------*/ 536 + 537 + /* 538 + * If using writeback, leave out struct per_bio_data's writethrough fields. 539 + */ 540 + #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) 541 + #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) 542 + 543 + static size_t get_per_bio_data_size(struct cache *cache) 628 544 { 545 + return writethrough_mode(&cache->features) ? 
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 546 + } 547 + 548 + static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 549 + { 550 + struct per_bio_data *pb = dm_per_bio_data(bio, data_size); 551 + BUG_ON(!pb); 552 + return pb; 553 + } 554 + 555 + static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) 556 + { 557 + struct per_bio_data *pb = get_per_bio_data(bio, data_size); 558 + 559 + pb->tick = false; 560 + pb->req_nr = dm_bio_get_target_bio_nr(bio); 561 + pb->cell = NULL; 562 + pb->len = 0; 563 + 564 + return pb; 565 + } 566 + 567 + /*----------------------------------------------------------------*/ 568 + 569 + static void defer_bio(struct cache *cache, struct bio *bio) 570 + { 571 + unsigned long flags; 572 + 573 + spin_lock_irqsave(&cache->lock, flags); 574 + bio_list_add(&cache->deferred_bios, bio); 575 + spin_unlock_irqrestore(&cache->lock, flags); 576 + 577 + wake_deferred_bio_worker(cache); 578 + } 579 + 580 + static void defer_bios(struct cache *cache, struct bio_list *bios) 581 + { 582 + unsigned long flags; 583 + 584 + spin_lock_irqsave(&cache->lock, flags); 585 + bio_list_merge(&cache->deferred_bios, bios); 586 + bio_list_init(bios); 587 + spin_unlock_irqrestore(&cache->lock, flags); 588 + 589 + wake_deferred_bio_worker(cache); 590 + } 591 + 592 + /*----------------------------------------------------------------*/ 593 + 594 + static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 595 + { 596 + bool r; 597 + size_t pb_size; 598 + struct per_bio_data *pb; 599 + struct dm_cell_key_v2 key; 629 600 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 630 - return bio_detain_range(cache, oblock, end, bio, 631 - cell_prealloc, free_fn, free_context, cell_result); 632 - } 601 + struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 633 602 634 - static int get_cell(struct cache *cache, 635 - dm_oblock_t oblock, 636 - struct prealloc *structs, 637 - struct dm_bio_prison_cell 
**cell_result) 638 - { 639 - int r; 640 - struct dm_cell_key key; 641 - struct dm_bio_prison_cell *cell_prealloc; 603 + cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 604 + if (!cell_prealloc) { 605 + defer_bio(cache, bio); 606 + return false; 607 + } 642 608 643 - cell_prealloc = prealloc_get_cell(structs); 609 + build_key(oblock, end, &key); 610 + r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 611 + if (!r) { 612 + /* 613 + * Failed to get the lock. 614 + */ 615 + free_prison_cell(cache, cell_prealloc); 616 + return r; 617 + } 644 618 645 - build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); 646 - r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 647 - if (r) 648 - prealloc_put_cell(structs, cell_prealloc); 619 + if (cell != cell_prealloc) 620 + free_prison_cell(cache, cell_prealloc); 621 + 622 + pb_size = get_per_bio_data_size(cache); 623 + pb = get_per_bio_data(bio, pb_size); 624 + pb->cell = cell; 649 625 650 626 return r; 651 627 } ··· 715 575 return test_bit(from_cblock(b), cache->dirty_bitset); 716 576 } 717 577 718 - static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 578 + static void set_dirty(struct cache *cache, dm_cblock_t cblock) 719 579 { 720 580 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 721 581 atomic_inc(&cache->nr_dirty); 722 - policy_set_dirty(cache->policy, oblock); 582 + policy_set_dirty(cache->policy, cblock); 723 583 } 724 584 } 725 585 726 - static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 586 + /* 587 + * These two are called when setting after migrations to force the policy 588 + * and dirty bitset to be in sync. 
589 + */ 590 + static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 591 + { 592 + if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 593 + atomic_inc(&cache->nr_dirty); 594 + policy_set_dirty(cache->policy, cblock); 595 + } 596 + 597 + static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 727 598 { 728 599 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 729 - policy_clear_dirty(cache->policy, oblock); 730 600 if (atomic_dec_return(&cache->nr_dirty) == 0) 731 601 dm_table_event(cache->ti->table); 732 602 } 603 + 604 + policy_clear_dirty(cache->policy, cblock); 733 605 } 734 606 735 607 /*----------------------------------------------------------------*/ ··· 778 626 { 779 627 return to_dblock(block_div(from_oblock(oblock), 780 628 oblocks_per_dblock(cache))); 781 - } 782 - 783 - static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock) 784 - { 785 - return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache)); 786 629 } 787 630 788 631 static void set_discard(struct cache *cache, dm_dblock_t b) ··· 826 679 return r; 827 680 } 828 681 829 - /*----------------------------------------------------------------*/ 830 - 831 - static void load_stats(struct cache *cache) 832 - { 833 - struct dm_cache_statistics stats; 834 - 835 - dm_cache_metadata_get_stats(cache->cmd, &stats); 836 - atomic_set(&cache->stats.read_hit, stats.read_hits); 837 - atomic_set(&cache->stats.read_miss, stats.read_misses); 838 - atomic_set(&cache->stats.write_hit, stats.write_hits); 839 - atomic_set(&cache->stats.write_miss, stats.write_misses); 840 - } 841 - 842 - static void save_stats(struct cache *cache) 843 - { 844 - struct dm_cache_statistics stats; 845 - 846 - if (get_cache_mode(cache) >= CM_READ_ONLY) 847 - return; 848 - 849 - stats.read_hits = atomic_read(&cache->stats.read_hit); 850 - stats.read_misses = atomic_read(&cache->stats.read_miss); 851 - stats.write_hits = atomic_read(&cache->stats.write_hit); 
852 - stats.write_misses = atomic_read(&cache->stats.write_miss); 853 - 854 - dm_cache_metadata_set_stats(cache->cmd, &stats); 855 - } 856 - 857 - /*---------------------------------------------------------------- 858 - * Per bio data 859 - *--------------------------------------------------------------*/ 860 - 861 - /* 862 - * If using writeback, leave out struct per_bio_data's writethrough fields. 863 - */ 864 - #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) 865 - #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) 866 - 867 - static bool writethrough_mode(struct cache_features *f) 868 - { 869 - return f->io_mode == CM_IO_WRITETHROUGH; 870 - } 871 - 872 - static bool writeback_mode(struct cache_features *f) 873 - { 874 - return f->io_mode == CM_IO_WRITEBACK; 875 - } 876 - 877 - static bool passthrough_mode(struct cache_features *f) 878 - { 879 - return f->io_mode == CM_IO_PASSTHROUGH; 880 - } 881 - 882 - static size_t get_per_bio_data_size(struct cache *cache) 883 - { 884 - return writethrough_mode(&cache->features) ? 
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 885 - } 886 - 887 - static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 888 - { 889 - struct per_bio_data *pb = dm_per_bio_data(bio, data_size); 890 - BUG_ON(!pb); 891 - return pb; 892 - } 893 - 894 - static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) 895 - { 896 - struct per_bio_data *pb = get_per_bio_data(bio, data_size); 897 - 898 - pb->tick = false; 899 - pb->req_nr = dm_bio_get_target_bio_nr(bio); 900 - pb->all_io_entry = NULL; 901 - pb->len = 0; 902 - 903 - return pb; 904 - } 905 - 906 682 /*---------------------------------------------------------------- 907 683 * Remapping 908 684 *--------------------------------------------------------------*/ ··· 867 797 } 868 798 869 799 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 870 - dm_oblock_t oblock) 800 + dm_oblock_t oblock) 871 801 { 802 + // FIXME: this is called way too much. 872 803 check_if_tick_bio_needed(cache, bio); 873 804 remap_to_origin(cache, bio); 874 805 if (bio_data_dir(bio) == WRITE) ··· 882 811 check_if_tick_bio_needed(cache, bio); 883 812 remap_to_cache(cache, bio, cblock); 884 813 if (bio_data_dir(bio) == WRITE) { 885 - set_dirty(cache, oblock, cblock); 814 + set_dirty(cache, cblock); 886 815 clear_discard(cache, oblock_to_dblock(cache, oblock)); 887 816 } 888 817 } ··· 897 826 block_nr >>= cache->sectors_per_block_shift; 898 827 899 828 return to_oblock(block_nr); 900 - } 901 - 902 - /* 903 - * You must increment the deferred set whilst the prison cell is held. To 904 - * encourage this, we ask for 'cell' to be passed in. 
905 - */ 906 - static void inc_ds(struct cache *cache, struct bio *bio, 907 - struct dm_bio_prison_cell *cell) 908 - { 909 - size_t pb_data_size = get_per_bio_data_size(cache); 910 - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 911 - 912 - BUG_ON(!cell); 913 - BUG_ON(pb->all_io_entry); 914 - 915 - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 916 829 } 917 830 918 831 static bool accountable_bio(struct cache *cache, struct bio *bio) ··· 930 875 generic_make_request(bio); 931 876 } 932 877 933 - static void issue(struct cache *cache, struct bio *bio) 878 + static void issue_op(struct bio *bio, void *context) 934 879 { 935 - unsigned long flags; 936 - 937 - if (!op_is_flush(bio->bi_opf)) { 938 - accounted_request(cache, bio); 939 - return; 940 - } 941 - 942 - /* 943 - * Batch together any bios that trigger commits and then issue a 944 - * single commit for them in do_worker(). 945 - */ 946 - spin_lock_irqsave(&cache->lock, flags); 947 - cache->commit_requested = true; 948 - bio_list_add(&cache->deferred_flush_bios, bio); 949 - spin_unlock_irqrestore(&cache->lock, flags); 950 - } 951 - 952 - static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) 953 - { 954 - inc_ds(cache, bio, cell); 955 - issue(cache, bio); 880 + struct cache *cache = context; 881 + accounted_request(cache, bio); 956 882 } 957 883 958 884 static void defer_writethrough_bio(struct cache *cache, struct bio *bio) ··· 944 908 bio_list_add(&cache->deferred_writethrough_bios, bio); 945 909 spin_unlock_irqrestore(&cache->lock, flags); 946 910 947 - wake_worker(cache); 911 + wake_deferred_writethrough_worker(cache); 948 912 } 949 913 950 914 static void writethrough_endio(struct bio *bio) ··· 970 934 } 971 935 972 936 /* 937 + * FIXME: send in parallel, huge latency as is. 973 938 * When running in writethrough mode we need to send writes to clean blocks 974 939 * to both the cache and origin devices. 
In future we'd like to clone the 975 940 * bio and send them in parallel, but for now we're doing them in ··· 1083 1046 set_cache_mode(cache, CM_READ_ONLY); 1084 1047 } 1085 1048 1049 + /*----------------------------------------------------------------*/ 1050 + 1051 + static void load_stats(struct cache *cache) 1052 + { 1053 + struct dm_cache_statistics stats; 1054 + 1055 + dm_cache_metadata_get_stats(cache->cmd, &stats); 1056 + atomic_set(&cache->stats.read_hit, stats.read_hits); 1057 + atomic_set(&cache->stats.read_miss, stats.read_misses); 1058 + atomic_set(&cache->stats.write_hit, stats.write_hits); 1059 + atomic_set(&cache->stats.write_miss, stats.write_misses); 1060 + } 1061 + 1062 + static void save_stats(struct cache *cache) 1063 + { 1064 + struct dm_cache_statistics stats; 1065 + 1066 + if (get_cache_mode(cache) >= CM_READ_ONLY) 1067 + return; 1068 + 1069 + stats.read_hits = atomic_read(&cache->stats.read_hit); 1070 + stats.read_misses = atomic_read(&cache->stats.read_miss); 1071 + stats.write_hits = atomic_read(&cache->stats.write_hit); 1072 + stats.write_misses = atomic_read(&cache->stats.write_miss); 1073 + 1074 + dm_cache_metadata_set_stats(cache->cmd, &stats); 1075 + } 1076 + 1077 + static void update_stats(struct cache_stats *stats, enum policy_operation op) 1078 + { 1079 + switch (op) { 1080 + case POLICY_PROMOTE: 1081 + atomic_inc(&stats->promotion); 1082 + break; 1083 + 1084 + case POLICY_DEMOTE: 1085 + atomic_inc(&stats->demotion); 1086 + break; 1087 + 1088 + case POLICY_WRITEBACK: 1089 + atomic_inc(&stats->writeback); 1090 + break; 1091 + } 1092 + } 1093 + 1086 1094 /*---------------------------------------------------------------- 1087 1095 * Migration processing 1088 1096 * 1089 1097 * Migration covers moving data from the origin device to the cache, or 1090 1098 * vice versa. 
1091 1099 *--------------------------------------------------------------*/ 1100 + 1092 1101 static void inc_io_migrations(struct cache *cache) 1093 1102 { 1094 1103 atomic_inc(&cache->nr_io_migrations); ··· 1148 1065 static bool discard_or_flush(struct bio *bio) 1149 1066 { 1150 1067 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1151 - } 1152 - 1153 - static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) 1154 - { 1155 - if (discard_or_flush(cell->holder)) { 1156 - /* 1157 - * We have to handle these bios individually. 1158 - */ 1159 - dm_cell_release(cache->prison, cell, &cache->deferred_bios); 1160 - free_prison_cell(cache, cell); 1161 - } else 1162 - list_add_tail(&cell->user_list, &cache->deferred_cells); 1163 - } 1164 - 1165 - static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder) 1166 - { 1167 - unsigned long flags; 1168 - 1169 - if (!holder && dm_cell_promote_or_release(cache->prison, cell)) { 1170 - /* 1171 - * There was no prisoner to promote to holder, the 1172 - * cell has been released. 
1173 - */ 1174 - free_prison_cell(cache, cell); 1175 - return; 1176 - } 1177 - 1178 - spin_lock_irqsave(&cache->lock, flags); 1179 - __cell_defer(cache, cell); 1180 - spin_unlock_irqrestore(&cache->lock, flags); 1181 - 1182 - wake_worker(cache); 1183 - } 1184 - 1185 - static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) 1186 - { 1187 - dm_cell_error(cache->prison, cell, err); 1188 - free_prison_cell(cache, cell); 1189 - } 1190 - 1191 - static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) 1192 - { 1193 - cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); 1194 - } 1195 - 1196 - static void free_io_migration(struct dm_cache_migration *mg) 1197 - { 1198 - struct cache *cache = mg->cache; 1199 - 1200 - dec_io_migrations(cache); 1201 - free_migration(mg); 1202 - wake_worker(cache); 1203 - } 1204 - 1205 - static void migration_failure(struct dm_cache_migration *mg) 1206 - { 1207 - struct cache *cache = mg->cache; 1208 - const char *dev_name = cache_device_name(cache); 1209 - 1210 - if (mg->writeback) { 1211 - DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name); 1212 - set_dirty(cache, mg->old_oblock, mg->cblock); 1213 - cell_defer(cache, mg->old_ocell, false); 1214 - 1215 - } else if (mg->demote) { 1216 - DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name); 1217 - policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); 1218 - 1219 - cell_defer(cache, mg->old_ocell, mg->promote ? 
false : true); 1220 - if (mg->promote) 1221 - cell_defer(cache, mg->new_ocell, true); 1222 - } else { 1223 - DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name); 1224 - policy_remove_mapping(cache->policy, mg->new_oblock); 1225 - cell_defer(cache, mg->new_ocell, true); 1226 - } 1227 - 1228 - free_io_migration(mg); 1229 - } 1230 - 1231 - static void migration_success_pre_commit(struct dm_cache_migration *mg) 1232 - { 1233 - int r; 1234 - unsigned long flags; 1235 - struct cache *cache = mg->cache; 1236 - 1237 - if (mg->writeback) { 1238 - clear_dirty(cache, mg->old_oblock, mg->cblock); 1239 - cell_defer(cache, mg->old_ocell, false); 1240 - free_io_migration(mg); 1241 - return; 1242 - 1243 - } else if (mg->demote) { 1244 - r = dm_cache_remove_mapping(cache->cmd, mg->cblock); 1245 - if (r) { 1246 - DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata", 1247 - cache_device_name(cache)); 1248 - metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1249 - policy_force_mapping(cache->policy, mg->new_oblock, 1250 - mg->old_oblock); 1251 - if (mg->promote) 1252 - cell_defer(cache, mg->new_ocell, true); 1253 - free_io_migration(mg); 1254 - return; 1255 - } 1256 - } else { 1257 - r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock); 1258 - if (r) { 1259 - DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata", 1260 - cache_device_name(cache)); 1261 - metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1262 - policy_remove_mapping(cache->policy, mg->new_oblock); 1263 - free_io_migration(mg); 1264 - return; 1265 - } 1266 - } 1267 - 1268 - spin_lock_irqsave(&cache->lock, flags); 1269 - list_add_tail(&mg->list, &cache->need_commit_migrations); 1270 - cache->commit_requested = true; 1271 - spin_unlock_irqrestore(&cache->lock, flags); 1272 - } 1273 - 1274 - static void migration_success_post_commit(struct dm_cache_migration *mg) 1275 - { 1276 - unsigned long flags; 1277 - struct cache *cache = 
mg->cache; 1278 - 1279 - if (mg->writeback) { 1280 - DMWARN_LIMIT("%s: writeback unexpectedly triggered commit", 1281 - cache_device_name(cache)); 1282 - return; 1283 - 1284 - } else if (mg->demote) { 1285 - cell_defer(cache, mg->old_ocell, mg->promote ? false : true); 1286 - 1287 - if (mg->promote) { 1288 - mg->demote = false; 1289 - 1290 - spin_lock_irqsave(&cache->lock, flags); 1291 - list_add_tail(&mg->list, &cache->quiesced_migrations); 1292 - spin_unlock_irqrestore(&cache->lock, flags); 1293 - 1294 - } else { 1295 - if (mg->invalidate) 1296 - policy_remove_mapping(cache->policy, mg->old_oblock); 1297 - free_io_migration(mg); 1298 - } 1299 - 1300 - } else { 1301 - if (mg->requeue_holder) { 1302 - clear_dirty(cache, mg->new_oblock, mg->cblock); 1303 - cell_defer(cache, mg->new_ocell, true); 1304 - } else { 1305 - /* 1306 - * The block was promoted via an overwrite, so it's dirty. 1307 - */ 1308 - set_dirty(cache, mg->new_oblock, mg->cblock); 1309 - bio_endio(mg->new_ocell->holder); 1310 - cell_defer(cache, mg->new_ocell, false); 1311 - } 1312 - free_io_migration(mg); 1313 - } 1314 - } 1315 - 1316 - static void copy_complete(int read_err, unsigned long write_err, void *context) 1317 - { 1318 - unsigned long flags; 1319 - struct dm_cache_migration *mg = (struct dm_cache_migration *) context; 1320 - struct cache *cache = mg->cache; 1321 - 1322 - if (read_err || write_err) 1323 - mg->err = true; 1324 - 1325 - spin_lock_irqsave(&cache->lock, flags); 1326 - list_add_tail(&mg->list, &cache->completed_migrations); 1327 - spin_unlock_irqrestore(&cache->lock, flags); 1328 - 1329 - wake_worker(cache); 1330 - } 1331 - 1332 - static void issue_copy(struct dm_cache_migration *mg) 1333 - { 1334 - int r; 1335 - struct dm_io_region o_region, c_region; 1336 - struct cache *cache = mg->cache; 1337 - sector_t cblock = from_cblock(mg->cblock); 1338 - 1339 - o_region.bdev = cache->origin_dev->bdev; 1340 - o_region.count = cache->sectors_per_block; 1341 - 1342 - c_region.bdev = 
cache->cache_dev->bdev; 1343 - c_region.sector = cblock * cache->sectors_per_block; 1344 - c_region.count = cache->sectors_per_block; 1345 - 1346 - if (mg->writeback || mg->demote) { 1347 - /* demote */ 1348 - o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; 1349 - r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); 1350 - } else { 1351 - /* promote */ 1352 - o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; 1353 - r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); 1354 - } 1355 - 1356 - if (r < 0) { 1357 - DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); 1358 - migration_failure(mg); 1359 - } 1360 - } 1361 - 1362 - static void overwrite_endio(struct bio *bio) 1363 - { 1364 - struct dm_cache_migration *mg = bio->bi_private; 1365 - struct cache *cache = mg->cache; 1366 - size_t pb_data_size = get_per_bio_data_size(cache); 1367 - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1368 - unsigned long flags; 1369 - 1370 - dm_unhook_bio(&pb->hook_info, bio); 1371 - 1372 - if (bio->bi_error) 1373 - mg->err = true; 1374 - 1375 - mg->requeue_holder = false; 1376 - 1377 - spin_lock_irqsave(&cache->lock, flags); 1378 - list_add_tail(&mg->list, &cache->completed_migrations); 1379 - spin_unlock_irqrestore(&cache->lock, flags); 1380 - 1381 - wake_worker(cache); 1382 - } 1383 - 1384 - static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) 1385 - { 1386 - size_t pb_data_size = get_per_bio_data_size(mg->cache); 1387 - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1388 - 1389 - dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1390 - remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); 1391 - 1392 - /* 1393 - * No need to inc_ds() here, since the cell will be held for the 1394 - * duration of the io. 
1395 - */ 1396 - accounted_request(mg->cache, bio); 1397 - } 1398 - 1399 - static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1400 - { 1401 - return (bio_data_dir(bio) == WRITE) && 1402 - (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1403 - } 1404 - 1405 - static void avoid_copy(struct dm_cache_migration *mg) 1406 - { 1407 - atomic_inc(&mg->cache->stats.copies_avoided); 1408 - migration_success_pre_commit(mg); 1409 1068 } 1410 1069 1411 1070 static void calc_discard_block_range(struct cache *cache, struct bio *bio, ··· 1164 1339 *e = to_dblock(block_div(se, cache->discard_block_size)); 1165 1340 } 1166 1341 1167 - static void issue_discard(struct dm_cache_migration *mg) 1342 + /*----------------------------------------------------------------*/ 1343 + 1344 + static void prevent_background_work(struct cache *cache) 1168 1345 { 1169 - dm_dblock_t b, e; 1170 - struct bio *bio = mg->new_ocell->holder; 1171 - struct cache *cache = mg->cache; 1172 - 1173 - calc_discard_block_range(cache, bio, &b, &e); 1174 - while (b != e) { 1175 - set_discard(cache, b); 1176 - b = to_dblock(from_dblock(b) + 1); 1177 - } 1178 - 1179 - bio_endio(bio); 1180 - cell_defer(cache, mg->new_ocell, false); 1181 - free_migration(mg); 1182 - wake_worker(cache); 1346 + lockdep_off(); 1347 + down_write(&cache->background_work_lock); 1348 + lockdep_on(); 1183 1349 } 1184 1350 1185 - static void issue_copy_or_discard(struct dm_cache_migration *mg) 1351 + static void allow_background_work(struct cache *cache) 1186 1352 { 1187 - bool avoid; 1188 - struct cache *cache = mg->cache; 1189 - 1190 - if (mg->discard) { 1191 - issue_discard(mg); 1192 - return; 1193 - } 1194 - 1195 - if (mg->writeback || mg->demote) 1196 - avoid = !is_dirty(cache, mg->cblock) || 1197 - is_discarded_oblock(cache, mg->old_oblock); 1198 - else { 1199 - struct bio *bio = mg->new_ocell->holder; 1200 - 1201 - avoid = is_discarded_oblock(cache, mg->new_oblock); 1202 - 1203 - if 
(writeback_mode(&cache->features) && 1204 - !avoid && bio_writes_complete_block(cache, bio)) { 1205 - issue_overwrite(mg, bio); 1206 - return; 1207 - } 1208 - } 1209 - 1210 - avoid ? avoid_copy(mg) : issue_copy(mg); 1353 + lockdep_off(); 1354 + up_write(&cache->background_work_lock); 1355 + lockdep_on(); 1211 1356 } 1212 1357 1213 - static void complete_migration(struct dm_cache_migration *mg) 1358 + static bool background_work_begin(struct cache *cache) 1214 1359 { 1215 - if (mg->err) 1216 - migration_failure(mg); 1360 + bool r; 1361 + 1362 + lockdep_off(); 1363 + r = down_read_trylock(&cache->background_work_lock); 1364 + lockdep_on(); 1365 + 1366 + return r; 1367 + } 1368 + 1369 + static void background_work_end(struct cache *cache) 1370 + { 1371 + lockdep_off(); 1372 + up_read(&cache->background_work_lock); 1373 + lockdep_on(); 1374 + } 1375 + 1376 + /*----------------------------------------------------------------*/ 1377 + 1378 + static void quiesce(struct dm_cache_migration *mg, 1379 + void (*continuation)(struct work_struct *)) 1380 + { 1381 + init_continuation(&mg->k, continuation); 1382 + dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); 1383 + } 1384 + 1385 + static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1386 + { 1387 + struct continuation *k = container_of(ws, struct continuation, ws); 1388 + return container_of(k, struct dm_cache_migration, k); 1389 + } 1390 + 1391 + static void copy_complete(int read_err, unsigned long write_err, void *context) 1392 + { 1393 + struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1394 + 1395 + if (read_err || write_err) 1396 + mg->k.input = -EIO; 1397 + 1398 + queue_continuation(mg->cache->wq, &mg->k); 1399 + } 1400 + 1401 + static int copy(struct dm_cache_migration *mg, bool promote) 1402 + { 1403 + int r; 1404 + struct dm_io_region o_region, c_region; 1405 + struct cache *cache = mg->cache; 1406 + 1407 + o_region.bdev = cache->origin_dev->bdev; 1408 + 
o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1409 + o_region.count = cache->sectors_per_block; 1410 + 1411 + c_region.bdev = cache->cache_dev->bdev; 1412 + c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1413 + c_region.count = cache->sectors_per_block; 1414 + 1415 + if (promote) 1416 + r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1217 1417 else 1218 - migration_success_pre_commit(mg); 1418 + r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1419 + 1420 + return r; 1219 1421 } 1220 1422 1221 - static void process_migrations(struct cache *cache, struct list_head *head, 1222 - void (*fn)(struct dm_cache_migration *)) 1423 + static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1223 1424 { 1224 - unsigned long flags; 1225 - struct list_head list; 1226 - struct dm_cache_migration *mg, *tmp; 1425 + size_t pb_data_size = get_per_bio_data_size(cache); 1426 + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1227 1427 1228 - INIT_LIST_HEAD(&list); 1229 - spin_lock_irqsave(&cache->lock, flags); 1230 - list_splice_init(head, &list); 1231 - spin_unlock_irqrestore(&cache->lock, flags); 1232 - 1233 - list_for_each_entry_safe(mg, tmp, &list, list) 1234 - fn(mg); 1428 + if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1429 + free_prison_cell(cache, pb->cell); 1430 + pb->cell = NULL; 1235 1431 } 1236 1432 1237 - static void __queue_quiesced_migration(struct dm_cache_migration *mg) 1433 + static void overwrite_endio(struct bio *bio) 1238 1434 { 1239 - list_add_tail(&mg->list, &mg->cache->quiesced_migrations); 1240 - } 1241 - 1242 - static void queue_quiesced_migration(struct dm_cache_migration *mg) 1243 - { 1244 - unsigned long flags; 1435 + struct dm_cache_migration *mg = bio->bi_private; 1245 1436 struct cache *cache = mg->cache; 1437 + size_t pb_data_size = get_per_bio_data_size(cache); 1438 + struct 
per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1246 1439 1247 - spin_lock_irqsave(&cache->lock, flags); 1248 - __queue_quiesced_migration(mg); 1249 - spin_unlock_irqrestore(&cache->lock, flags); 1440 + dm_unhook_bio(&pb->hook_info, bio); 1250 1441 1251 - wake_worker(cache); 1442 + if (bio->bi_error) 1443 + mg->k.input = bio->bi_error; 1444 + 1445 + queue_continuation(mg->cache->wq, &mg->k); 1252 1446 } 1253 1447 1254 - static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) 1448 + static void overwrite(struct dm_cache_migration *mg, 1449 + void (*continuation)(struct work_struct *)) 1255 1450 { 1256 - unsigned long flags; 1257 - struct dm_cache_migration *mg, *tmp; 1451 + struct bio *bio = mg->overwrite_bio; 1452 + size_t pb_data_size = get_per_bio_data_size(mg->cache); 1453 + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1258 1454 1259 - spin_lock_irqsave(&cache->lock, flags); 1260 - list_for_each_entry_safe(mg, tmp, work, list) 1261 - __queue_quiesced_migration(mg); 1262 - spin_unlock_irqrestore(&cache->lock, flags); 1455 + dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1263 1456 1264 - wake_worker(cache); 1265 - } 1457 + /* 1458 + * The overwrite bio is part of the copy operation, as such it does 1459 + * not set/clear discard or dirty flags. 
1460 + */ 1461 + if (mg->op->op == POLICY_PROMOTE) 1462 + remap_to_cache(mg->cache, bio, mg->op->cblock); 1463 + else 1464 + remap_to_origin(mg->cache, bio); 1266 1465 1267 - static void check_for_quiesced_migrations(struct cache *cache, 1268 - struct per_bio_data *pb) 1269 - { 1270 - struct list_head work; 1271 - 1272 - if (!pb->all_io_entry) 1273 - return; 1274 - 1275 - INIT_LIST_HEAD(&work); 1276 - dm_deferred_entry_dec(pb->all_io_entry, &work); 1277 - 1278 - if (!list_empty(&work)) 1279 - queue_quiesced_migrations(cache, &work); 1280 - } 1281 - 1282 - static void quiesce_migration(struct dm_cache_migration *mg) 1283 - { 1284 - if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) 1285 - queue_quiesced_migration(mg); 1286 - } 1287 - 1288 - static void promote(struct cache *cache, struct prealloc *structs, 1289 - dm_oblock_t oblock, dm_cblock_t cblock, 1290 - struct dm_bio_prison_cell *cell) 1291 - { 1292 - struct dm_cache_migration *mg = prealloc_get_migration(structs); 1293 - 1294 - mg->err = false; 1295 - mg->discard = false; 1296 - mg->writeback = false; 1297 - mg->demote = false; 1298 - mg->promote = true; 1299 - mg->requeue_holder = true; 1300 - mg->invalidate = false; 1301 - mg->cache = cache; 1302 - mg->new_oblock = oblock; 1303 - mg->cblock = cblock; 1304 - mg->old_ocell = NULL; 1305 - mg->new_ocell = cell; 1306 - mg->start_jiffies = jiffies; 1307 - 1308 - inc_io_migrations(cache); 1309 - quiesce_migration(mg); 1310 - } 1311 - 1312 - static void writeback(struct cache *cache, struct prealloc *structs, 1313 - dm_oblock_t oblock, dm_cblock_t cblock, 1314 - struct dm_bio_prison_cell *cell) 1315 - { 1316 - struct dm_cache_migration *mg = prealloc_get_migration(structs); 1317 - 1318 - mg->err = false; 1319 - mg->discard = false; 1320 - mg->writeback = true; 1321 - mg->demote = false; 1322 - mg->promote = false; 1323 - mg->requeue_holder = true; 1324 - mg->invalidate = false; 1325 - mg->cache = cache; 1326 - mg->old_oblock = oblock; 1327 - mg->cblock 
= cblock; 1328 - mg->old_ocell = cell; 1329 - mg->new_ocell = NULL; 1330 - mg->start_jiffies = jiffies; 1331 - 1332 - inc_io_migrations(cache); 1333 - quiesce_migration(mg); 1334 - } 1335 - 1336 - static void demote_then_promote(struct cache *cache, struct prealloc *structs, 1337 - dm_oblock_t old_oblock, dm_oblock_t new_oblock, 1338 - dm_cblock_t cblock, 1339 - struct dm_bio_prison_cell *old_ocell, 1340 - struct dm_bio_prison_cell *new_ocell) 1341 - { 1342 - struct dm_cache_migration *mg = prealloc_get_migration(structs); 1343 - 1344 - mg->err = false; 1345 - mg->discard = false; 1346 - mg->writeback = false; 1347 - mg->demote = true; 1348 - mg->promote = true; 1349 - mg->requeue_holder = true; 1350 - mg->invalidate = false; 1351 - mg->cache = cache; 1352 - mg->old_oblock = old_oblock; 1353 - mg->new_oblock = new_oblock; 1354 - mg->cblock = cblock; 1355 - mg->old_ocell = old_ocell; 1356 - mg->new_ocell = new_ocell; 1357 - mg->start_jiffies = jiffies; 1358 - 1359 - inc_io_migrations(cache); 1360 - quiesce_migration(mg); 1466 + init_continuation(&mg->k, continuation); 1467 + accounted_request(mg->cache, bio); 1361 1468 } 1362 1469 1363 1470 /* 1364 - * Invalidate a cache entry. No writeback occurs; any changes in the cache 1365 - * block are thrown away. 
1471 + * Migration steps: 1472 + * 1473 + * 1) exclusive lock preventing WRITEs 1474 + * 2) quiesce 1475 + * 3) copy or issue overwrite bio 1476 + * 4) upgrade to exclusive lock preventing READs and WRITEs 1477 + * 5) quiesce 1478 + * 6) update metadata and commit 1479 + * 7) unlock 1366 1480 */ 1367 - static void invalidate(struct cache *cache, struct prealloc *structs, 1368 - dm_oblock_t oblock, dm_cblock_t cblock, 1369 - struct dm_bio_prison_cell *cell) 1481 + static void mg_complete(struct dm_cache_migration *mg, bool success) 1370 1482 { 1371 - struct dm_cache_migration *mg = prealloc_get_migration(structs); 1483 + struct bio_list bios; 1484 + struct cache *cache = mg->cache; 1485 + struct policy_work *op = mg->op; 1486 + dm_cblock_t cblock = op->cblock; 1372 1487 1373 - mg->err = false; 1374 - mg->discard = false; 1375 - mg->writeback = false; 1376 - mg->demote = true; 1377 - mg->promote = false; 1378 - mg->requeue_holder = true; 1379 - mg->invalidate = true; 1380 - mg->cache = cache; 1381 - mg->old_oblock = oblock; 1382 - mg->cblock = cblock; 1383 - mg->old_ocell = cell; 1384 - mg->new_ocell = NULL; 1385 - mg->start_jiffies = jiffies; 1488 + if (success) 1489 + update_stats(&cache->stats, op->op); 1386 1490 1387 - inc_io_migrations(cache); 1388 - quiesce_migration(mg); 1491 + switch (op->op) { 1492 + case POLICY_PROMOTE: 1493 + clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1494 + policy_complete_background_work(cache->policy, op, success); 1495 + 1496 + if (mg->overwrite_bio) { 1497 + if (success) 1498 + force_set_dirty(cache, cblock); 1499 + else 1500 + mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO); 1501 + bio_endio(mg->overwrite_bio); 1502 + } else { 1503 + if (success) 1504 + force_clear_dirty(cache, cblock); 1505 + dec_io_migrations(cache); 1506 + } 1507 + break; 1508 + 1509 + case POLICY_DEMOTE: 1510 + /* 1511 + * We clear dirty here to update the nr_dirty counter. 
1512 + */ 1513 + if (success) 1514 + force_clear_dirty(cache, cblock); 1515 + policy_complete_background_work(cache->policy, op, success); 1516 + dec_io_migrations(cache); 1517 + break; 1518 + 1519 + case POLICY_WRITEBACK: 1520 + if (success) 1521 + force_clear_dirty(cache, cblock); 1522 + policy_complete_background_work(cache->policy, op, success); 1523 + dec_io_migrations(cache); 1524 + break; 1525 + } 1526 + 1527 + bio_list_init(&bios); 1528 + if (mg->cell) { 1529 + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1530 + free_prison_cell(cache, mg->cell); 1531 + } 1532 + 1533 + free_migration(mg); 1534 + defer_bios(cache, &bios); 1535 + wake_migration_worker(cache); 1536 + 1537 + background_work_end(cache); 1389 1538 } 1390 1539 1391 - static void discard(struct cache *cache, struct prealloc *structs, 1392 - struct dm_bio_prison_cell *cell) 1540 + static void mg_success(struct work_struct *ws) 1393 1541 { 1394 - struct dm_cache_migration *mg = prealloc_get_migration(structs); 1542 + struct dm_cache_migration *mg = ws_to_mg(ws); 1543 + mg_complete(mg, mg->k.input == 0); 1544 + } 1395 1545 1396 - mg->err = false; 1397 - mg->discard = true; 1398 - mg->writeback = false; 1399 - mg->demote = false; 1400 - mg->promote = false; 1401 - mg->requeue_holder = false; 1402 - mg->invalidate = false; 1546 + static void mg_update_metadata(struct work_struct *ws) 1547 + { 1548 + int r; 1549 + struct dm_cache_migration *mg = ws_to_mg(ws); 1550 + struct cache *cache = mg->cache; 1551 + struct policy_work *op = mg->op; 1552 + 1553 + switch (op->op) { 1554 + case POLICY_PROMOTE: 1555 + r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1556 + if (r) { 1557 + DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1558 + cache_device_name(cache)); 1559 + metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1560 + 1561 + mg_complete(mg, false); 1562 + return; 1563 + } 1564 + mg_complete(mg, true); 1565 + break; 1566 + 1567 + case POLICY_DEMOTE: 1568 
+ r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1569 + if (r) { 1570 + DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1571 + cache_device_name(cache)); 1572 + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1573 + 1574 + mg_complete(mg, false); 1575 + return; 1576 + } 1577 + 1578 + /* 1579 + * It would be nice if we only had to commit when a REQ_FLUSH 1580 + * comes through. But there's one scenario that we have to 1581 + * look out for: 1582 + * 1583 + * - vblock x in a cache block 1584 + * - domotion occurs 1585 + * - cache block gets reallocated and over written 1586 + * - crash 1587 + * 1588 + * When we recover, because there was no commit the cache will 1589 + * rollback to having the data for vblock x in the cache block. 1590 + * But the cache block has since been overwritten, so it'll end 1591 + * up pointing to data that was never in 'x' during the history 1592 + * of the device. 1593 + * 1594 + * To avoid this issue we require a commit as part of the 1595 + * demotion operation. 1596 + */ 1597 + init_continuation(&mg->k, mg_success); 1598 + continue_after_commit(&cache->committer, &mg->k); 1599 + schedule_commit(&cache->committer); 1600 + break; 1601 + 1602 + case POLICY_WRITEBACK: 1603 + mg_complete(mg, true); 1604 + break; 1605 + } 1606 + } 1607 + 1608 + static void mg_update_metadata_after_copy(struct work_struct *ws) 1609 + { 1610 + struct dm_cache_migration *mg = ws_to_mg(ws); 1611 + 1612 + /* 1613 + * Did the copy succeed? 1614 + */ 1615 + if (mg->k.input) 1616 + mg_complete(mg, false); 1617 + else 1618 + mg_update_metadata(ws); 1619 + } 1620 + 1621 + static void mg_upgrade_lock(struct work_struct *ws) 1622 + { 1623 + int r; 1624 + struct dm_cache_migration *mg = ws_to_mg(ws); 1625 + 1626 + /* 1627 + * Did the copy succeed? 1628 + */ 1629 + if (mg->k.input) 1630 + mg_complete(mg, false); 1631 + 1632 + else { 1633 + /* 1634 + * Now we want the lock to prevent both reads and writes. 
1635 + */ 1636 + r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell, 1637 + READ_WRITE_LOCK_LEVEL); 1638 + if (r < 0) 1639 + mg_complete(mg, false); 1640 + 1641 + else if (r) 1642 + quiesce(mg, mg_update_metadata); 1643 + 1644 + else 1645 + mg_update_metadata(ws); 1646 + } 1647 + } 1648 + 1649 + static void mg_copy(struct work_struct *ws) 1650 + { 1651 + int r; 1652 + struct dm_cache_migration *mg = ws_to_mg(ws); 1653 + 1654 + if (mg->overwrite_bio) { 1655 + /* 1656 + * It's safe to do this here, even though it's new data 1657 + * because all IO has been locked out of the block. 1658 + * 1659 + * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL 1660 + * so _not_ using mg_upgrade_lock() as continutation. 1661 + */ 1662 + overwrite(mg, mg_update_metadata_after_copy); 1663 + 1664 + } else { 1665 + struct cache *cache = mg->cache; 1666 + struct policy_work *op = mg->op; 1667 + bool is_policy_promote = (op->op == POLICY_PROMOTE); 1668 + 1669 + if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || 1670 + is_discarded_oblock(cache, op->oblock)) { 1671 + mg_upgrade_lock(ws); 1672 + return; 1673 + } 1674 + 1675 + init_continuation(&mg->k, mg_upgrade_lock); 1676 + 1677 + r = copy(mg, is_policy_promote); 1678 + if (r) { 1679 + DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); 1680 + mg->k.input = -EIO; 1681 + mg_complete(mg, false); 1682 + } 1683 + } 1684 + } 1685 + 1686 + static int mg_lock_writes(struct dm_cache_migration *mg) 1687 + { 1688 + int r; 1689 + struct dm_cell_key_v2 key; 1690 + struct cache *cache = mg->cache; 1691 + struct dm_bio_prison_cell_v2 *prealloc; 1692 + 1693 + prealloc = alloc_prison_cell(cache); 1694 + if (!prealloc) { 1695 + DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache)); 1696 + mg_complete(mg, false); 1697 + return -ENOMEM; 1698 + } 1699 + 1700 + /* 1701 + * Prevent writes to the block, but allow reads to continue. 
1702 + * Unless we're using an overwrite bio, in which case we lock 1703 + * everything. 1704 + */ 1705 + build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); 1706 + r = dm_cell_lock_v2(cache->prison, &key, 1707 + mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1708 + prealloc, &mg->cell); 1709 + if (r < 0) { 1710 + free_prison_cell(cache, prealloc); 1711 + mg_complete(mg, false); 1712 + return r; 1713 + } 1714 + 1715 + if (mg->cell != prealloc) 1716 + free_prison_cell(cache, prealloc); 1717 + 1718 + if (r == 0) 1719 + mg_copy(&mg->k.ws); 1720 + else 1721 + quiesce(mg, mg_copy); 1722 + 1723 + return 0; 1724 + } 1725 + 1726 + static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1727 + { 1728 + struct dm_cache_migration *mg; 1729 + 1730 + if (!background_work_begin(cache)) { 1731 + policy_complete_background_work(cache->policy, op, false); 1732 + return -EPERM; 1733 + } 1734 + 1735 + mg = alloc_migration(cache); 1736 + if (!mg) { 1737 + policy_complete_background_work(cache->policy, op, false); 1738 + background_work_end(cache); 1739 + return -ENOMEM; 1740 + } 1741 + 1742 + memset(mg, 0, sizeof(*mg)); 1743 + 1403 1744 mg->cache = cache; 1404 - mg->old_ocell = NULL; 1405 - mg->new_ocell = cell; 1406 - mg->start_jiffies = jiffies; 1745 + mg->op = op; 1746 + mg->overwrite_bio = bio; 1407 1747 1408 - quiesce_migration(mg); 1748 + if (!bio) 1749 + inc_io_migrations(cache); 1750 + 1751 + return mg_lock_writes(mg); 1752 + } 1753 + 1754 + /*---------------------------------------------------------------- 1755 + * invalidation processing 1756 + *--------------------------------------------------------------*/ 1757 + 1758 + static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1759 + { 1760 + struct bio_list bios; 1761 + struct cache *cache = mg->cache; 1762 + 1763 + bio_list_init(&bios); 1764 + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1765 + free_prison_cell(cache, mg->cell); 1766 + 1767 
+ if (!success && mg->overwrite_bio) 1768 + bio_io_error(mg->overwrite_bio); 1769 + 1770 + free_migration(mg); 1771 + defer_bios(cache, &bios); 1772 + 1773 + background_work_end(cache); 1774 + } 1775 + 1776 + static void invalidate_completed(struct work_struct *ws) 1777 + { 1778 + struct dm_cache_migration *mg = ws_to_mg(ws); 1779 + invalidate_complete(mg, !mg->k.input); 1780 + } 1781 + 1782 + static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1783 + { 1784 + int r = policy_invalidate_mapping(cache->policy, cblock); 1785 + if (!r) { 1786 + r = dm_cache_remove_mapping(cache->cmd, cblock); 1787 + if (r) { 1788 + DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1789 + cache_device_name(cache)); 1790 + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1791 + } 1792 + 1793 + } else if (r == -ENODATA) { 1794 + /* 1795 + * Harmless, already unmapped. 1796 + */ 1797 + r = 0; 1798 + 1799 + } else 1800 + DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1801 + 1802 + return r; 1803 + } 1804 + 1805 + static void invalidate_remove(struct work_struct *ws) 1806 + { 1807 + int r; 1808 + struct dm_cache_migration *mg = ws_to_mg(ws); 1809 + struct cache *cache = mg->cache; 1810 + 1811 + r = invalidate_cblock(cache, mg->invalidate_cblock); 1812 + if (r) { 1813 + invalidate_complete(mg, false); 1814 + return; 1815 + } 1816 + 1817 + init_continuation(&mg->k, invalidate_completed); 1818 + continue_after_commit(&cache->committer, &mg->k); 1819 + remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1820 + mg->overwrite_bio = NULL; 1821 + schedule_commit(&cache->committer); 1822 + } 1823 + 1824 + static int invalidate_lock(struct dm_cache_migration *mg) 1825 + { 1826 + int r; 1827 + struct dm_cell_key_v2 key; 1828 + struct cache *cache = mg->cache; 1829 + struct dm_bio_prison_cell_v2 *prealloc; 1830 + 1831 + prealloc = alloc_prison_cell(cache); 1832 + if (!prealloc) { 1833 + 
invalidate_complete(mg, false); 1834 + return -ENOMEM; 1835 + } 1836 + 1837 + build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1838 + r = dm_cell_lock_v2(cache->prison, &key, 1839 + READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1840 + if (r < 0) { 1841 + free_prison_cell(cache, prealloc); 1842 + invalidate_complete(mg, false); 1843 + return r; 1844 + } 1845 + 1846 + if (mg->cell != prealloc) 1847 + free_prison_cell(cache, prealloc); 1848 + 1849 + if (r) 1850 + quiesce(mg, invalidate_remove); 1851 + 1852 + else { 1853 + /* 1854 + * We can't call invalidate_remove() directly here because we 1855 + * might still be in request context. 1856 + */ 1857 + init_continuation(&mg->k, invalidate_remove); 1858 + queue_work(cache->wq, &mg->k.ws); 1859 + } 1860 + 1861 + return 0; 1862 + } 1863 + 1864 + static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1865 + dm_oblock_t oblock, struct bio *bio) 1866 + { 1867 + struct dm_cache_migration *mg; 1868 + 1869 + if (!background_work_begin(cache)) 1870 + return -EPERM; 1871 + 1872 + mg = alloc_migration(cache); 1873 + if (!mg) { 1874 + background_work_end(cache); 1875 + return -ENOMEM; 1876 + } 1877 + 1878 + memset(mg, 0, sizeof(*mg)); 1879 + 1880 + mg->cache = cache; 1881 + mg->overwrite_bio = bio; 1882 + mg->invalidate_cblock = cblock; 1883 + mg->invalidate_oblock = oblock; 1884 + 1885 + return invalidate_lock(mg); 1409 1886 } 1410 1887 1411 1888 /*---------------------------------------------------------------- 1412 1889 * bio processing 1413 1890 *--------------------------------------------------------------*/ 1414 - static void defer_bio(struct cache *cache, struct bio *bio) 1891 + 1892 + enum busy { 1893 + IDLE, 1894 + MODERATE, 1895 + BUSY 1896 + }; 1897 + 1898 + static enum busy spare_migration_bandwidth(struct cache *cache) 1415 1899 { 1416 - unsigned long flags; 1417 - 1418 - spin_lock_irqsave(&cache->lock, flags); 1419 - bio_list_add(&cache->deferred_bios, bio); 1420 - 
spin_unlock_irqrestore(&cache->lock, flags); 1421 - 1422 - wake_worker(cache); 1423 - } 1424 - 1425 - static void process_flush_bio(struct cache *cache, struct bio *bio) 1426 - { 1427 - size_t pb_data_size = get_per_bio_data_size(cache); 1428 - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1429 - 1430 - BUG_ON(bio->bi_iter.bi_size); 1431 - if (!pb->req_nr) 1432 - remap_to_origin(cache, bio); 1433 - else 1434 - remap_to_cache(cache, bio, 0); 1435 - 1436 - /* 1437 - * REQ_PREFLUSH is not directed at any particular block so we don't 1438 - * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH 1439 - * by dm-core. 1440 - */ 1441 - issue(cache, bio); 1442 - } 1443 - 1444 - static void process_discard_bio(struct cache *cache, struct prealloc *structs, 1445 - struct bio *bio) 1446 - { 1447 - int r; 1448 - dm_dblock_t b, e; 1449 - struct dm_bio_prison_cell *cell_prealloc, *new_ocell; 1450 - 1451 - calc_discard_block_range(cache, bio, &b, &e); 1452 - if (b == e) { 1453 - bio_endio(bio); 1454 - return; 1455 - } 1456 - 1457 - cell_prealloc = prealloc_get_cell(structs); 1458 - r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, 1459 - (cell_free_fn) prealloc_put_cell, 1460 - structs, &new_ocell); 1461 - if (r > 0) 1462 - return; 1463 - 1464 - discard(cache, structs, new_ocell); 1465 - } 1466 - 1467 - static bool spare_migration_bandwidth(struct cache *cache) 1468 - { 1900 + bool idle = iot_idle_for(&cache->origin_tracker, HZ); 1469 1901 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1470 1902 cache->sectors_per_block; 1471 - return current_volume < cache->migration_threshold; 1903 + 1904 + if (current_volume <= cache->migration_threshold) 1905 + return idle ? IDLE : MODERATE; 1906 + else 1907 + return idle ? 
MODERATE : BUSY; 1472 1908 } 1473 1909 1474 1910 static void inc_hit_counter(struct cache *cache, struct bio *bio) ··· 1746 1660 1747 1661 /*----------------------------------------------------------------*/ 1748 1662 1749 - struct inc_detail { 1750 - struct cache *cache; 1751 - struct bio_list bios_for_issue; 1752 - struct bio_list unhandled_bios; 1753 - bool any_writes; 1754 - }; 1755 - 1756 - static void inc_fn(void *context, struct dm_bio_prison_cell *cell) 1663 + static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1757 1664 { 1758 - struct bio *bio; 1759 - struct inc_detail *detail = context; 1760 - struct cache *cache = detail->cache; 1665 + return (bio_data_dir(bio) == WRITE) && 1666 + (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1667 + } 1761 1668 1762 - inc_ds(cache, cell->holder, cell); 1763 - if (bio_data_dir(cell->holder) == WRITE) 1764 - detail->any_writes = true; 1669 + static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1670 + { 1671 + return writeback_mode(&cache->features) && 1672 + (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1673 + } 1765 1674 1766 - while ((bio = bio_list_pop(&cell->bios))) { 1767 - if (discard_or_flush(bio)) { 1768 - bio_list_add(&detail->unhandled_bios, bio); 1769 - continue; 1675 + static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1676 + bool *commit_needed) 1677 + { 1678 + int r, data_dir; 1679 + bool rb, background_queued; 1680 + dm_cblock_t cblock; 1681 + size_t pb_data_size = get_per_bio_data_size(cache); 1682 + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1683 + 1684 + *commit_needed = false; 1685 + 1686 + rb = bio_detain_shared(cache, block, bio); 1687 + if (!rb) { 1688 + /* 1689 + * An exclusive lock is held for this block, so we have to 1690 + * wait. 
We set the commit_needed flag so the current 1691 + * transaction will be committed asap, allowing this lock 1692 + * to be dropped. 1693 + */ 1694 + *commit_needed = true; 1695 + return DM_MAPIO_SUBMITTED; 1696 + } 1697 + 1698 + data_dir = bio_data_dir(bio); 1699 + 1700 + if (optimisable_bio(cache, bio, block)) { 1701 + struct policy_work *op = NULL; 1702 + 1703 + r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1704 + if (unlikely(r && r != -ENOENT)) { 1705 + DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1706 + cache_device_name(cache), r); 1707 + bio_io_error(bio); 1708 + return DM_MAPIO_SUBMITTED; 1770 1709 } 1771 1710 1772 - if (bio_data_dir(bio) == WRITE) 1773 - detail->any_writes = true; 1774 - 1775 - bio_list_add(&detail->bios_for_issue, bio); 1776 - inc_ds(cache, bio, cell); 1777 - } 1778 - } 1779 - 1780 - // FIXME: refactor these two 1781 - static void remap_cell_to_origin_clear_discard(struct cache *cache, 1782 - struct dm_bio_prison_cell *cell, 1783 - dm_oblock_t oblock, bool issue_holder) 1784 - { 1785 - struct bio *bio; 1786 - unsigned long flags; 1787 - struct inc_detail detail; 1788 - 1789 - detail.cache = cache; 1790 - bio_list_init(&detail.bios_for_issue); 1791 - bio_list_init(&detail.unhandled_bios); 1792 - detail.any_writes = false; 1793 - 1794 - spin_lock_irqsave(&cache->lock, flags); 1795 - dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); 1796 - bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); 1797 - spin_unlock_irqrestore(&cache->lock, flags); 1798 - 1799 - remap_to_origin(cache, cell->holder); 1800 - if (issue_holder) 1801 - issue(cache, cell->holder); 1802 - else 1803 - accounted_begin(cache, cell->holder); 1804 - 1805 - if (detail.any_writes) 1806 - clear_discard(cache, oblock_to_dblock(cache, oblock)); 1807 - 1808 - while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1809 - remap_to_origin(cache, bio); 1810 - issue(cache, bio); 1811 - } 1812 - 1813 - 
free_prison_cell(cache, cell); 1814 - } 1815 - 1816 - static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, 1817 - dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) 1818 - { 1819 - struct bio *bio; 1820 - unsigned long flags; 1821 - struct inc_detail detail; 1822 - 1823 - detail.cache = cache; 1824 - bio_list_init(&detail.bios_for_issue); 1825 - bio_list_init(&detail.unhandled_bios); 1826 - detail.any_writes = false; 1827 - 1828 - spin_lock_irqsave(&cache->lock, flags); 1829 - dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); 1830 - bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); 1831 - spin_unlock_irqrestore(&cache->lock, flags); 1832 - 1833 - remap_to_cache(cache, cell->holder, cblock); 1834 - if (issue_holder) 1835 - issue(cache, cell->holder); 1836 - else 1837 - accounted_begin(cache, cell->holder); 1838 - 1839 - if (detail.any_writes) { 1840 - set_dirty(cache, oblock, cblock); 1841 - clear_discard(cache, oblock_to_dblock(cache, oblock)); 1842 - } 1843 - 1844 - while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1845 - remap_to_cache(cache, bio, cblock); 1846 - issue(cache, bio); 1847 - } 1848 - 1849 - free_prison_cell(cache, cell); 1850 - } 1851 - 1852 - /*----------------------------------------------------------------*/ 1853 - 1854 - struct old_oblock_lock { 1855 - struct policy_locker locker; 1856 - struct cache *cache; 1857 - struct prealloc *structs; 1858 - struct dm_bio_prison_cell *cell; 1859 - }; 1860 - 1861 - static int null_locker(struct policy_locker *locker, dm_oblock_t b) 1862 - { 1863 - /* This should never be called */ 1864 - BUG(); 1865 - return 0; 1866 - } 1867 - 1868 - static int cell_locker(struct policy_locker *locker, dm_oblock_t b) 1869 - { 1870 - struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); 1871 - struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); 1872 - 1873 - return bio_detain(l->cache, b, NULL, 
cell_prealloc, 1874 - (cell_free_fn) prealloc_put_cell, 1875 - l->structs, &l->cell); 1876 - } 1877 - 1878 - static void process_cell(struct cache *cache, struct prealloc *structs, 1879 - struct dm_bio_prison_cell *new_ocell) 1880 - { 1881 - int r; 1882 - bool release_cell = true; 1883 - struct bio *bio = new_ocell->holder; 1884 - dm_oblock_t block = get_bio_block(cache, bio); 1885 - struct policy_result lookup_result; 1886 - bool passthrough = passthrough_mode(&cache->features); 1887 - bool fast_promotion, can_migrate; 1888 - struct old_oblock_lock ool; 1889 - 1890 - fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); 1891 - can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache)); 1892 - 1893 - ool.locker.fn = cell_locker; 1894 - ool.cache = cache; 1895 - ool.structs = structs; 1896 - ool.cell = NULL; 1897 - r = policy_map(cache->policy, block, true, can_migrate, fast_promotion, 1898 - bio, &ool.locker, &lookup_result); 1899 - 1900 - if (r == -EWOULDBLOCK) 1901 - /* migration has been denied */ 1902 - lookup_result.op = POLICY_MISS; 1903 - 1904 - switch (lookup_result.op) { 1905 - case POLICY_HIT: 1906 - if (passthrough) { 1907 - inc_miss_counter(cache, bio); 1908 - 1909 - /* 1910 - * Passthrough always maps to the origin, 1911 - * invalidating any cache blocks that are written 1912 - * to. 
1913 - */ 1914 - 1915 - if (bio_data_dir(bio) == WRITE) { 1916 - atomic_inc(&cache->stats.demotion); 1917 - invalidate(cache, structs, block, lookup_result.cblock, new_ocell); 1918 - release_cell = false; 1919 - 1920 - } else { 1921 - /* FIXME: factor out issue_origin() */ 1922 - remap_to_origin_clear_discard(cache, bio, block); 1923 - inc_and_issue(cache, bio, new_ocell); 1924 - } 1925 - } else { 1926 - inc_hit_counter(cache, bio); 1927 - 1928 - if (bio_data_dir(bio) == WRITE && 1929 - writethrough_mode(&cache->features) && 1930 - !is_dirty(cache, lookup_result.cblock)) { 1931 - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1932 - inc_and_issue(cache, bio, new_ocell); 1933 - 1934 - } else { 1935 - remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true); 1936 - release_cell = false; 1937 - } 1711 + if (r == -ENOENT && op) { 1712 + bio_drop_shared_lock(cache, bio); 1713 + BUG_ON(op->op != POLICY_PROMOTE); 1714 + mg_start(cache, op, bio); 1715 + return DM_MAPIO_SUBMITTED; 1716 + } 1717 + } else { 1718 + r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1719 + if (unlikely(r && r != -ENOENT)) { 1720 + DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1721 + cache_device_name(cache), r); 1722 + bio_io_error(bio); 1723 + return DM_MAPIO_SUBMITTED; 1938 1724 } 1939 1725 1940 - break; 1726 + if (background_queued) 1727 + wake_migration_worker(cache); 1728 + } 1941 1729 1942 - case POLICY_MISS: 1730 + if (r == -ENOENT) { 1731 + /* 1732 + * Miss. 
1733 + */ 1943 1734 inc_miss_counter(cache, bio); 1944 - remap_cell_to_origin_clear_discard(cache, new_ocell, block, true); 1945 - release_cell = false; 1946 - break; 1735 + if (pb->req_nr == 0) { 1736 + accounted_begin(cache, bio); 1737 + remap_to_origin_clear_discard(cache, bio, block); 1947 1738 1948 - case POLICY_NEW: 1949 - atomic_inc(&cache->stats.promotion); 1950 - promote(cache, structs, block, lookup_result.cblock, new_ocell); 1951 - release_cell = false; 1952 - break; 1739 + } else { 1740 + /* 1741 + * This is a duplicate writethrough io that is no 1742 + * longer needed because the block has been demoted. 1743 + */ 1744 + bio_endio(bio); 1745 + return DM_MAPIO_SUBMITTED; 1746 + } 1747 + } else { 1748 + /* 1749 + * Hit. 1750 + */ 1751 + inc_hit_counter(cache, bio); 1953 1752 1954 - case POLICY_REPLACE: 1955 - atomic_inc(&cache->stats.demotion); 1956 - atomic_inc(&cache->stats.promotion); 1957 - demote_then_promote(cache, structs, lookup_result.old_oblock, 1958 - block, lookup_result.cblock, 1959 - ool.cell, new_ocell); 1960 - release_cell = false; 1961 - break; 1753 + /* 1754 + * Passthrough always maps to the origin, invalidating any 1755 + * cache blocks that are written to. 
1756 + */ 1757 + if (passthrough_mode(&cache->features)) { 1758 + if (bio_data_dir(bio) == WRITE) { 1759 + bio_drop_shared_lock(cache, bio); 1760 + atomic_inc(&cache->stats.demotion); 1761 + invalidate_start(cache, cblock, block, bio); 1762 + } else 1763 + remap_to_origin_clear_discard(cache, bio, block); 1962 1764 1963 - default: 1964 - DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u", 1965 - cache_device_name(cache), __func__, 1966 - (unsigned) lookup_result.op); 1967 - bio_io_error(bio); 1765 + } else { 1766 + if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 1767 + !is_dirty(cache, cblock)) { 1768 + remap_to_origin_then_cache(cache, bio, block, cblock); 1769 + accounted_begin(cache, bio); 1770 + } else 1771 + remap_to_cache_dirty(cache, bio, block, cblock); 1772 + } 1968 1773 } 1969 - 1970 - if (release_cell) 1971 - cell_defer(cache, new_ocell, false); 1972 - } 1973 - 1974 - static void process_bio(struct cache *cache, struct prealloc *structs, 1975 - struct bio *bio) 1976 - { 1977 - int r; 1978 - dm_oblock_t block = get_bio_block(cache, bio); 1979 - struct dm_bio_prison_cell *cell_prealloc, *new_ocell; 1980 1774 1981 1775 /* 1982 - * Check to see if that block is currently migrating. 1776 + * dm core turns FUA requests into a separate payload and FLUSH req. 1983 1777 */ 1984 - cell_prealloc = prealloc_get_cell(structs); 1985 - r = bio_detain(cache, block, bio, cell_prealloc, 1986 - (cell_free_fn) prealloc_put_cell, 1987 - structs, &new_ocell); 1988 - if (r > 0) 1989 - return; 1778 + if (bio->bi_opf & REQ_FUA) { 1779 + /* 1780 + * issue_after_commit will call accounted_begin a second time. So 1781 + * we call accounted_complete() to avoid double accounting. 
1782 + */ 1783 + accounted_complete(cache, bio); 1784 + issue_after_commit(&cache->committer, bio); 1785 + *commit_needed = true; 1786 + return DM_MAPIO_SUBMITTED; 1787 + } 1990 1788 1991 - process_cell(cache, structs, new_ocell); 1789 + return DM_MAPIO_REMAPPED; 1992 1790 } 1993 1791 1994 - static int need_commit_due_to_time(struct cache *cache) 1792 + static bool process_bio(struct cache *cache, struct bio *bio) 1995 1793 { 1996 - return jiffies < cache->last_commit_jiffies || 1997 - jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; 1794 + bool commit_needed; 1795 + 1796 + if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1797 + generic_make_request(bio); 1798 + 1799 + return commit_needed; 1998 1800 } 1999 1801 2000 1802 /* ··· 1903 1929 return r; 1904 1930 } 1905 1931 1906 - static int commit_if_needed(struct cache *cache) 1932 + /* 1933 + * Used by the batcher. 1934 + */ 1935 + static int commit_op(void *context) 1907 1936 { 1908 - int r = 0; 1937 + struct cache *cache = context; 1909 1938 1910 - if ((cache->commit_requested || need_commit_due_to_time(cache)) && 1911 - dm_cache_changed_this_transaction(cache->cmd)) { 1912 - r = commit(cache, false); 1913 - cache->commit_requested = false; 1914 - cache->last_commit_jiffies = jiffies; 1915 - } 1939 + if (dm_cache_changed_this_transaction(cache->cmd)) 1940 + return commit(cache, false); 1916 1941 1917 - return r; 1942 + return 0; 1918 1943 } 1919 1944 1920 - static void process_deferred_bios(struct cache *cache) 1945 + /*----------------------------------------------------------------*/ 1946 + 1947 + static bool process_flush_bio(struct cache *cache, struct bio *bio) 1921 1948 { 1922 - bool prealloc_used = false; 1949 + size_t pb_data_size = get_per_bio_data_size(cache); 1950 + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1951 + 1952 + if (!pb->req_nr) 1953 + remap_to_origin(cache, bio); 1954 + else 1955 + remap_to_cache(cache, bio, 0); 1956 + 1957 + 
issue_after_commit(&cache->committer, bio); 1958 + return true; 1959 + } 1960 + 1961 + static bool process_discard_bio(struct cache *cache, struct bio *bio) 1962 + { 1963 + dm_dblock_t b, e; 1964 + 1965 + // FIXME: do we need to lock the region? Or can we just assume the 1966 + // user wont be so foolish as to issue discard concurrently with 1967 + // other IO? 1968 + calc_discard_block_range(cache, bio, &b, &e); 1969 + while (b != e) { 1970 + set_discard(cache, b); 1971 + b = to_dblock(from_dblock(b) + 1); 1972 + } 1973 + 1974 + bio_endio(bio); 1975 + 1976 + return false; 1977 + } 1978 + 1979 + static void process_deferred_bios(struct work_struct *ws) 1980 + { 1981 + struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); 1982 + 1923 1983 unsigned long flags; 1984 + bool commit_needed = false; 1924 1985 struct bio_list bios; 1925 1986 struct bio *bio; 1926 - struct prealloc structs; 1927 1987 1928 - memset(&structs, 0, sizeof(structs)); 1929 1988 bio_list_init(&bios); 1930 1989 1931 1990 spin_lock_irqsave(&cache->lock, flags); ··· 1966 1959 bio_list_init(&cache->deferred_bios); 1967 1960 spin_unlock_irqrestore(&cache->lock, flags); 1968 1961 1969 - while (!bio_list_empty(&bios)) { 1970 - /* 1971 - * If we've got no free migration structs, and processing 1972 - * this bio might require one, we pause until there are some 1973 - * prepared mappings to process. 
1974 - */ 1975 - prealloc_used = true; 1976 - if (prealloc_data_structs(cache, &structs)) { 1977 - spin_lock_irqsave(&cache->lock, flags); 1978 - bio_list_merge(&cache->deferred_bios, &bios); 1979 - spin_unlock_irqrestore(&cache->lock, flags); 1980 - break; 1981 - } 1982 - 1983 - bio = bio_list_pop(&bios); 1984 - 1962 + while ((bio = bio_list_pop(&bios))) { 1985 1963 if (bio->bi_opf & REQ_PREFLUSH) 1986 - process_flush_bio(cache, bio); 1964 + commit_needed = process_flush_bio(cache, bio) || commit_needed; 1965 + 1987 1966 else if (bio_op(bio) == REQ_OP_DISCARD) 1988 - process_discard_bio(cache, &structs, bio); 1967 + commit_needed = process_discard_bio(cache, bio) || commit_needed; 1968 + 1989 1969 else 1990 - process_bio(cache, &structs, bio); 1970 + commit_needed = process_bio(cache, bio) || commit_needed; 1991 1971 } 1992 1972 1993 - if (prealloc_used) 1994 - prealloc_free_structs(cache, &structs); 1973 + if (commit_needed) 1974 + schedule_commit(&cache->committer); 1995 1975 } 1996 1976 1997 - static void process_deferred_cells(struct cache *cache) 1977 + static void process_deferred_writethrough_bios(struct work_struct *ws) 1998 1978 { 1999 - bool prealloc_used = false; 2000 - unsigned long flags; 2001 - struct dm_bio_prison_cell *cell, *tmp; 2002 - struct list_head cells; 2003 - struct prealloc structs; 1979 + struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker); 2004 1980 2005 - memset(&structs, 0, sizeof(structs)); 2006 - 2007 - INIT_LIST_HEAD(&cells); 2008 - 2009 - spin_lock_irqsave(&cache->lock, flags); 2010 - list_splice_init(&cache->deferred_cells, &cells); 2011 - spin_unlock_irqrestore(&cache->lock, flags); 2012 - 2013 - list_for_each_entry_safe(cell, tmp, &cells, user_list) { 2014 - /* 2015 - * If we've got no free migration structs, and processing 2016 - * this bio might require one, we pause until there are some 2017 - * prepared mappings to process. 
2018 - */ 2019 - prealloc_used = true; 2020 - if (prealloc_data_structs(cache, &structs)) { 2021 - spin_lock_irqsave(&cache->lock, flags); 2022 - list_splice(&cells, &cache->deferred_cells); 2023 - spin_unlock_irqrestore(&cache->lock, flags); 2024 - break; 2025 - } 2026 - 2027 - process_cell(cache, &structs, cell); 2028 - } 2029 - 2030 - if (prealloc_used) 2031 - prealloc_free_structs(cache, &structs); 2032 - } 2033 - 2034 - static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 2035 - { 2036 - unsigned long flags; 2037 - struct bio_list bios; 2038 - struct bio *bio; 2039 - 2040 - bio_list_init(&bios); 2041 - 2042 - spin_lock_irqsave(&cache->lock, flags); 2043 - bio_list_merge(&bios, &cache->deferred_flush_bios); 2044 - bio_list_init(&cache->deferred_flush_bios); 2045 - spin_unlock_irqrestore(&cache->lock, flags); 2046 - 2047 - /* 2048 - * These bios have already been through inc_ds() 2049 - */ 2050 - while ((bio = bio_list_pop(&bios))) 2051 - submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); 2052 - } 2053 - 2054 - static void process_deferred_writethrough_bios(struct cache *cache) 2055 - { 2056 1981 unsigned long flags; 2057 1982 struct bio_list bios; 2058 1983 struct bio *bio; ··· 1997 2058 spin_unlock_irqrestore(&cache->lock, flags); 1998 2059 1999 2060 /* 2000 - * These bios have already been through inc_ds() 2061 + * These bios have already been through accounted_begin() 2001 2062 */ 2002 2063 while ((bio = bio_list_pop(&bios))) 2003 - accounted_request(cache, bio); 2004 - } 2005 - 2006 - static void writeback_some_dirty_blocks(struct cache *cache) 2007 - { 2008 - bool prealloc_used = false; 2009 - dm_oblock_t oblock; 2010 - dm_cblock_t cblock; 2011 - struct prealloc structs; 2012 - struct dm_bio_prison_cell *old_ocell; 2013 - bool busy = !iot_idle_for(&cache->origin_tracker, HZ); 2014 - 2015 - memset(&structs, 0, sizeof(structs)); 2016 - 2017 - while (spare_migration_bandwidth(cache)) { 2018 - if 
(policy_writeback_work(cache->policy, &oblock, &cblock, busy)) 2019 - break; /* no work to do */ 2020 - 2021 - prealloc_used = true; 2022 - if (prealloc_data_structs(cache, &structs) || 2023 - get_cell(cache, oblock, &structs, &old_ocell)) { 2024 - policy_set_dirty(cache->policy, oblock); 2025 - break; 2026 - } 2027 - 2028 - writeback(cache, &structs, oblock, cblock, old_ocell); 2029 - } 2030 - 2031 - if (prealloc_used) 2032 - prealloc_free_structs(cache, &structs); 2033 - } 2034 - 2035 - /*---------------------------------------------------------------- 2036 - * Invalidations. 2037 - * Dropping something from the cache *without* writing back. 2038 - *--------------------------------------------------------------*/ 2039 - 2040 - static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) 2041 - { 2042 - int r = 0; 2043 - uint64_t begin = from_cblock(req->cblocks->begin); 2044 - uint64_t end = from_cblock(req->cblocks->end); 2045 - 2046 - while (begin != end) { 2047 - r = policy_remove_cblock(cache->policy, to_cblock(begin)); 2048 - if (!r) { 2049 - r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); 2050 - if (r) { 2051 - metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 2052 - break; 2053 - } 2054 - 2055 - } else if (r == -ENODATA) { 2056 - /* harmless, already unmapped */ 2057 - r = 0; 2058 - 2059 - } else { 2060 - DMERR("%s: policy_remove_cblock failed", cache_device_name(cache)); 2061 - break; 2062 - } 2063 - 2064 - begin++; 2065 - } 2066 - 2067 - cache->commit_requested = true; 2068 - 2069 - req->err = r; 2070 - atomic_set(&req->complete, 1); 2071 - 2072 - wake_up(&req->result_wait); 2073 - } 2074 - 2075 - static void process_invalidation_requests(struct cache *cache) 2076 - { 2077 - struct list_head list; 2078 - struct invalidation_request *req, *tmp; 2079 - 2080 - INIT_LIST_HEAD(&list); 2081 - spin_lock(&cache->invalidation_lock); 2082 - list_splice_init(&cache->invalidation_requests, &list); 2083 - 
spin_unlock(&cache->invalidation_lock); 2084 - 2085 - list_for_each_entry_safe (req, tmp, &list, list) 2086 - process_invalidation_request(cache, req); 2064 + generic_make_request(bio); 2087 2065 } 2088 2066 2089 2067 /*---------------------------------------------------------------- 2090 2068 * Main worker loop 2091 2069 *--------------------------------------------------------------*/ 2092 - static bool is_quiescing(struct cache *cache) 2093 - { 2094 - return atomic_read(&cache->quiescing); 2095 - } 2096 - 2097 - static void ack_quiescing(struct cache *cache) 2098 - { 2099 - if (is_quiescing(cache)) { 2100 - atomic_inc(&cache->quiescing_ack); 2101 - wake_up(&cache->quiescing_wait); 2102 - } 2103 - } 2104 - 2105 - static void wait_for_quiescing_ack(struct cache *cache) 2106 - { 2107 - wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); 2108 - } 2109 - 2110 - static void start_quiescing(struct cache *cache) 2111 - { 2112 - atomic_inc(&cache->quiescing); 2113 - wait_for_quiescing_ack(cache); 2114 - } 2115 - 2116 - static void stop_quiescing(struct cache *cache) 2117 - { 2118 - atomic_set(&cache->quiescing, 0); 2119 - atomic_set(&cache->quiescing_ack, 0); 2120 - } 2121 - 2122 - static void wait_for_migrations(struct cache *cache) 2123 - { 2124 - wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); 2125 - } 2126 - 2127 - static void stop_worker(struct cache *cache) 2128 - { 2129 - cancel_delayed_work(&cache->waker); 2130 - flush_workqueue(cache->wq); 2131 - } 2132 - 2133 - static void requeue_deferred_cells(struct cache *cache) 2134 - { 2135 - unsigned long flags; 2136 - struct list_head cells; 2137 - struct dm_bio_prison_cell *cell, *tmp; 2138 - 2139 - INIT_LIST_HEAD(&cells); 2140 - spin_lock_irqsave(&cache->lock, flags); 2141 - list_splice_init(&cache->deferred_cells, &cells); 2142 - spin_unlock_irqrestore(&cache->lock, flags); 2143 - 2144 - list_for_each_entry_safe(cell, tmp, &cells, user_list) 2145 - 
cell_requeue(cache, cell); 2146 - } 2147 2070 2148 2071 static void requeue_deferred_bios(struct cache *cache) 2149 2072 { ··· 2022 2221 } 2023 2222 } 2024 2223 2025 - static int more_work(struct cache *cache) 2026 - { 2027 - if (is_quiescing(cache)) 2028 - return !list_empty(&cache->quiesced_migrations) || 2029 - !list_empty(&cache->completed_migrations) || 2030 - !list_empty(&cache->need_commit_migrations); 2031 - else 2032 - return !bio_list_empty(&cache->deferred_bios) || 2033 - !list_empty(&cache->deferred_cells) || 2034 - !bio_list_empty(&cache->deferred_flush_bios) || 2035 - !bio_list_empty(&cache->deferred_writethrough_bios) || 2036 - !list_empty(&cache->quiesced_migrations) || 2037 - !list_empty(&cache->completed_migrations) || 2038 - !list_empty(&cache->need_commit_migrations) || 2039 - cache->invalidate; 2040 - } 2041 - 2042 - static void do_worker(struct work_struct *ws) 2043 - { 2044 - struct cache *cache = container_of(ws, struct cache, worker); 2045 - 2046 - do { 2047 - if (!is_quiescing(cache)) { 2048 - writeback_some_dirty_blocks(cache); 2049 - process_deferred_writethrough_bios(cache); 2050 - process_deferred_bios(cache); 2051 - process_deferred_cells(cache); 2052 - process_invalidation_requests(cache); 2053 - } 2054 - 2055 - process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); 2056 - process_migrations(cache, &cache->completed_migrations, complete_migration); 2057 - 2058 - if (commit_if_needed(cache)) { 2059 - process_deferred_flush_bios(cache, false); 2060 - process_migrations(cache, &cache->need_commit_migrations, migration_failure); 2061 - } else { 2062 - process_deferred_flush_bios(cache, true); 2063 - process_migrations(cache, &cache->need_commit_migrations, 2064 - migration_success_post_commit); 2065 - } 2066 - 2067 - ack_quiescing(cache); 2068 - 2069 - } while (more_work(cache)); 2070 - } 2071 - 2072 2224 /* 2073 2225 * We want to commit periodically so that not too much 2074 2226 * unwritten metadata builds up. 
··· 2029 2275 static void do_waker(struct work_struct *ws) 2030 2276 { 2031 2277 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2278 + 2032 2279 policy_tick(cache->policy, true); 2033 - wake_worker(cache); 2280 + wake_migration_worker(cache); 2281 + schedule_commit(&cache->committer); 2034 2282 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2035 2283 } 2036 2284 2037 - /*----------------------------------------------------------------*/ 2038 - 2039 - static int is_congested(struct dm_dev *dev, int bdi_bits) 2285 + static void check_migrations(struct work_struct *ws) 2040 2286 { 2041 - struct request_queue *q = bdev_get_queue(dev->bdev); 2042 - return bdi_congested(q->backing_dev_info, bdi_bits); 2043 - } 2287 + int r; 2288 + struct policy_work *op; 2289 + struct cache *cache = container_of(ws, struct cache, migration_worker); 2290 + enum busy b; 2044 2291 2045 - static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2046 - { 2047 - struct cache *cache = container_of(cb, struct cache, callbacks); 2292 + for (;;) { 2293 + b = spare_migration_bandwidth(cache); 2294 + if (b == BUSY) 2295 + break; 2048 2296 2049 - return is_congested(cache->origin_dev, bdi_bits) || 2050 - is_congested(cache->cache_dev, bdi_bits); 2297 + r = policy_get_background_work(cache->policy, b == IDLE, &op); 2298 + if (r == -ENODATA) 2299 + break; 2300 + 2301 + if (r) { 2302 + DMERR_LIMIT("%s: policy_background_work failed", 2303 + cache_device_name(cache)); 2304 + break; 2305 + } 2306 + 2307 + r = mg_start(cache, op, NULL); 2308 + if (r) 2309 + break; 2310 + } 2051 2311 } 2052 2312 2053 2313 /*---------------------------------------------------------------- ··· 2078 2310 2079 2311 mempool_destroy(cache->migration_pool); 2080 2312 2081 - if (cache->all_io_ds) 2082 - dm_deferred_set_destroy(cache->all_io_ds); 2083 - 2084 2313 if (cache->prison) 2085 - dm_bio_prison_destroy(cache->prison); 2314 + 
dm_bio_prison_destroy_v2(cache->prison); 2086 2315 2087 2316 if (cache->wq) 2088 2317 destroy_workqueue(cache->wq); ··· 2472 2707 return PTR_ERR(p); 2473 2708 } 2474 2709 cache->policy = p; 2710 + BUG_ON(!cache->policy); 2475 2711 2476 2712 return 0; 2477 2713 } ··· 2516 2750 cache->cache_size = size; 2517 2751 } 2518 2752 2753 + static int is_congested(struct dm_dev *dev, int bdi_bits) 2754 + { 2755 + struct request_queue *q = bdev_get_queue(dev->bdev); 2756 + return bdi_congested(q->backing_dev_info, bdi_bits); 2757 + } 2758 + 2759 + static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2760 + { 2761 + struct cache *cache = container_of(cb, struct cache, callbacks); 2762 + 2763 + return is_congested(cache->origin_dev, bdi_bits) || 2764 + is_congested(cache->cache_dev, bdi_bits); 2765 + } 2766 + 2519 2767 #define DEFAULT_MIGRATION_THRESHOLD 2048 2520 2768 2521 2769 static int cache_create(struct cache_args *ca, struct cache **result) ··· 2568 2788 2569 2789 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2570 2790 2571 - /* FIXME: factor out this whole section */ 2572 2791 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2573 2792 origin_blocks = block_div(origin_blocks, ca->block_size); 2574 2793 cache->origin_blocks = to_oblock(origin_blocks); ··· 2633 2854 r = -EINVAL; 2634 2855 goto bad; 2635 2856 } 2857 + 2858 + policy_allow_migrations(cache->policy, false); 2636 2859 } 2637 2860 2638 2861 spin_lock_init(&cache->lock); 2639 2862 INIT_LIST_HEAD(&cache->deferred_cells); 2640 2863 bio_list_init(&cache->deferred_bios); 2641 - bio_list_init(&cache->deferred_flush_bios); 2642 2864 bio_list_init(&cache->deferred_writethrough_bios); 2643 - INIT_LIST_HEAD(&cache->quiesced_migrations); 2644 - INIT_LIST_HEAD(&cache->completed_migrations); 2645 - INIT_LIST_HEAD(&cache->need_commit_migrations); 2646 2865 atomic_set(&cache->nr_allocated_migrations, 0); 2647 2866 atomic_set(&cache->nr_io_migrations, 0); 2648 2867 
init_waitqueue_head(&cache->migration_wait); 2649 - 2650 - init_waitqueue_head(&cache->quiescing_wait); 2651 - atomic_set(&cache->quiescing, 0); 2652 - atomic_set(&cache->quiescing_ack, 0); 2653 2868 2654 2869 r = -ENOMEM; 2655 2870 atomic_set(&cache->nr_dirty, 0); ··· 2673 2900 goto bad; 2674 2901 } 2675 2902 2676 - cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2903 + cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); 2677 2904 if (!cache->wq) { 2678 2905 *error = "could not create workqueue for metadata object"; 2679 2906 goto bad; 2680 2907 } 2681 - INIT_WORK(&cache->worker, do_worker); 2908 + INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2909 + INIT_WORK(&cache->deferred_writethrough_worker, 2910 + process_deferred_writethrough_bios); 2911 + INIT_WORK(&cache->migration_worker, check_migrations); 2682 2912 INIT_DELAYED_WORK(&cache->waker, do_waker); 2683 - cache->last_commit_jiffies = jiffies; 2684 2913 2685 - cache->prison = dm_bio_prison_create(); 2914 + cache->prison = dm_bio_prison_create_v2(cache->wq); 2686 2915 if (!cache->prison) { 2687 2916 *error = "could not create bio prison"; 2688 - goto bad; 2689 - } 2690 - 2691 - cache->all_io_ds = dm_deferred_set_create(); 2692 - if (!cache->all_io_ds) { 2693 - *error = "could not create all_io deferred set"; 2694 2917 goto bad; 2695 2918 } 2696 2919 ··· 2716 2947 spin_lock_init(&cache->invalidation_lock); 2717 2948 INIT_LIST_HEAD(&cache->invalidation_requests); 2718 2949 2950 + batcher_init(&cache->committer, commit_op, cache, 2951 + issue_op, cache, cache->wq); 2719 2952 iot_init(&cache->origin_tracker); 2953 + 2954 + init_rwsem(&cache->background_work_lock); 2955 + prevent_background_work(cache); 2720 2956 2721 2957 *result = cache; 2722 2958 return 0; 2723 - 2724 2959 bad: 2725 2960 destroy(cache); 2726 2961 return r; ··· 2782 3009 } 2783 3010 2784 3011 ti->private = cache; 2785 - 2786 3012 out: 2787 3013 destroy_cache_args(ca); 2788 3014 return r; 
··· 2794 3022 struct cache *cache = ti->private; 2795 3023 2796 3024 int r; 2797 - struct dm_bio_prison_cell *cell = NULL; 3025 + bool commit_needed; 2798 3026 dm_oblock_t block = get_bio_block(cache, bio); 2799 3027 size_t pb_data_size = get_per_bio_data_size(cache); 2800 - bool can_migrate = false; 2801 - bool fast_promotion; 2802 - struct policy_result lookup_result; 2803 - struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); 2804 - struct old_oblock_lock ool; 2805 3028 2806 - ool.locker.fn = null_locker; 2807 - 3029 + init_per_bio_data(bio, pb_data_size); 2808 3030 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2809 3031 /* 2810 3032 * This can only occur if the io goes to a partial block at ··· 2815 3049 return DM_MAPIO_SUBMITTED; 2816 3050 } 2817 3051 2818 - /* 2819 - * Check to see if that block is currently migrating. 2820 - */ 2821 - cell = alloc_prison_cell(cache); 2822 - if (!cell) { 2823 - defer_bio(cache, bio); 2824 - return DM_MAPIO_SUBMITTED; 2825 - } 2826 - 2827 - r = bio_detain(cache, block, bio, cell, 2828 - (cell_free_fn) free_prison_cell, 2829 - cache, &cell); 2830 - if (r) { 2831 - if (r < 0) 2832 - defer_bio(cache, bio); 2833 - 2834 - return DM_MAPIO_SUBMITTED; 2835 - } 2836 - 2837 - fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); 2838 - 2839 - r = policy_map(cache->policy, block, false, can_migrate, fast_promotion, 2840 - bio, &ool.locker, &lookup_result); 2841 - if (r == -EWOULDBLOCK) { 2842 - cell_defer(cache, cell, true); 2843 - return DM_MAPIO_SUBMITTED; 2844 - 2845 - } else if (r) { 2846 - DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d", 2847 - cache_device_name(cache), r); 2848 - cell_defer(cache, cell, false); 2849 - bio_io_error(bio); 2850 - return DM_MAPIO_SUBMITTED; 2851 - } 2852 - 2853 - r = DM_MAPIO_REMAPPED; 2854 - switch (lookup_result.op) { 2855 - case POLICY_HIT: 2856 - if (passthrough_mode(&cache->features)) { 2857 - if 
(bio_data_dir(bio) == WRITE) { 2858 - /* 2859 - * We need to invalidate this block, so 2860 - * defer for the worker thread. 2861 - */ 2862 - cell_defer(cache, cell, true); 2863 - r = DM_MAPIO_SUBMITTED; 2864 - 2865 - } else { 2866 - inc_miss_counter(cache, bio); 2867 - remap_to_origin_clear_discard(cache, bio, block); 2868 - accounted_begin(cache, bio); 2869 - inc_ds(cache, bio, cell); 2870 - // FIXME: we want to remap hits or misses straight 2871 - // away rather than passing over to the worker. 2872 - cell_defer(cache, cell, false); 2873 - } 2874 - 2875 - } else { 2876 - inc_hit_counter(cache, bio); 2877 - if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 2878 - !is_dirty(cache, lookup_result.cblock)) { 2879 - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 2880 - accounted_begin(cache, bio); 2881 - inc_ds(cache, bio, cell); 2882 - cell_defer(cache, cell, false); 2883 - 2884 - } else 2885 - remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false); 2886 - } 2887 - break; 2888 - 2889 - case POLICY_MISS: 2890 - inc_miss_counter(cache, bio); 2891 - if (pb->req_nr != 0) { 2892 - /* 2893 - * This is a duplicate writethrough io that is no 2894 - * longer needed because the block has been demoted. 
2895 - */ 2896 - bio_endio(bio); 2897 - // FIXME: remap everything as a miss 2898 - cell_defer(cache, cell, false); 2899 - r = DM_MAPIO_SUBMITTED; 2900 - 2901 - } else 2902 - remap_cell_to_origin_clear_discard(cache, cell, block, false); 2903 - break; 2904 - 2905 - default: 2906 - DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u", 2907 - cache_device_name(cache), __func__, 2908 - (unsigned) lookup_result.op); 2909 - cell_defer(cache, cell, false); 2910 - bio_io_error(bio); 2911 - r = DM_MAPIO_SUBMITTED; 2912 - } 3052 + r = map_bio(cache, bio, block, &commit_needed); 3053 + if (commit_needed) 3054 + schedule_commit(&cache->committer); 2913 3055 2914 3056 return r; 2915 3057 } ··· 2837 3163 spin_unlock_irqrestore(&cache->lock, flags); 2838 3164 } 2839 3165 2840 - check_for_quiesced_migrations(cache, pb); 3166 + bio_drop_shared_lock(cache, bio); 2841 3167 accounted_complete(cache, bio); 2842 3168 2843 3169 return 0; ··· 2937 3263 { 2938 3264 struct cache *cache = ti->private; 2939 3265 2940 - start_quiescing(cache); 2941 - wait_for_migrations(cache); 2942 - stop_worker(cache); 3266 + prevent_background_work(cache); 3267 + BUG_ON(atomic_read(&cache->nr_io_migrations)); 3268 + 3269 + cancel_delayed_work(&cache->waker); 3270 + flush_workqueue(cache->wq); 3271 + WARN_ON(cache->origin_tracker.in_flight); 3272 + 3273 + /* 3274 + * If it's a flush suspend there won't be any deferred bios, so this 3275 + * call is harmless. 
3276 + */ 2943 3277 requeue_deferred_bios(cache); 2944 - requeue_deferred_cells(cache); 2945 - stop_quiescing(cache); 2946 3278 2947 3279 if (get_cache_mode(cache) == CM_WRITE) 2948 3280 (void) sync_metadata(cache); ··· 2960 3280 int r; 2961 3281 struct cache *cache = context; 2962 3282 2963 - r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 3283 + r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2964 3284 if (r) 2965 3285 return r; 2966 - 2967 - if (dirty) 2968 - set_dirty(cache, oblock, cblock); 2969 - else 2970 - clear_dirty(cache, oblock, cblock); 2971 3286 2972 3287 return 0; 2973 3288 } ··· 3162 3487 struct cache *cache = ti->private; 3163 3488 3164 3489 cache->need_tick_bio = true; 3490 + allow_background_work(cache); 3165 3491 do_waker(&cache->waker.work); 3166 3492 } 3167 3493 ··· 3297 3621 } 3298 3622 3299 3623 /* 3624 + * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3625 + * the one-past-the-end value. 3626 + */ 3627 + struct cblock_range { 3628 + dm_cblock_t begin; 3629 + dm_cblock_t end; 3630 + }; 3631 + 3632 + /* 3300 3633 * A cache block range can take two forms: 3301 3634 * 3302 3635 * i) A single cblock, eg. '3456' 3303 - * ii) A begin and end cblock with dots between, eg. 123-234 3636 + * ii) A begin and end cblock with a dash between, eg. 
123-234 3304 3637 */ 3305 3638 static int parse_cblock_range(struct cache *cache, const char *str, 3306 3639 struct cblock_range *result) ··· 3375 3690 return 0; 3376 3691 } 3377 3692 3693 + static inline dm_cblock_t cblock_succ(dm_cblock_t b) 3694 + { 3695 + return to_cblock(from_cblock(b) + 1); 3696 + } 3697 + 3378 3698 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3379 3699 { 3380 - struct invalidation_request req; 3700 + int r = 0; 3381 3701 3382 - INIT_LIST_HEAD(&req.list); 3383 - req.cblocks = range; 3384 - atomic_set(&req.complete, 0); 3385 - req.err = 0; 3386 - init_waitqueue_head(&req.result_wait); 3702 + /* 3703 + * We don't need to do any locking here because we know we're in 3704 + * passthrough mode. There's is potential for a race between an 3705 + * invalidation triggered by an io and an invalidation message. This 3706 + * is harmless, we must not worry if the policy call fails. 3707 + */ 3708 + while (range->begin != range->end) { 3709 + r = invalidate_cblock(cache, range->begin); 3710 + if (r) 3711 + return r; 3387 3712 3388 - spin_lock(&cache->invalidation_lock); 3389 - list_add(&req.list, &cache->invalidation_requests); 3390 - spin_unlock(&cache->invalidation_lock); 3391 - wake_worker(cache); 3713 + range->begin = cblock_succ(range->begin); 3714 + } 3392 3715 3393 - wait_event(req.result_wait, atomic_read(&req.complete)); 3394 - return req.err; 3716 + cache->commit_requested = true; 3717 + return r; 3395 3718 } 3396 3719 3397 3720 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, ··· 3509 3816 3510 3817 static struct target_type cache_target = { 3511 3818 .name = "cache", 3512 - .version = {1, 10, 0}, 3819 + .version = {2, 0, 0}, 3513 3820 .module = THIS_MODULE, 3514 3821 .ctr = cache_ctr, 3515 3822 .dtr = cache_dtr,