Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm cache: significant rework to leverage dm-bio-prison-v2

The cache policy interfaces have been updated to work well with the new
bio-prison v2 interface's ability to queue work immediately (promotion,
demotion, etc) -- the overriding benefit being reduced latency on processing
IO through the cache. Previously such work would be left for the DM
cache core to queue on various lists and then process in batches later
-- this caused a serious delay in latency for IO driven by the cache.

The background tracker code was factored out so that all cache policies
can make use of it.

Also, the "cleaner" policy has been removed and is now a variant of the
smq policy that simply disallows migrations.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

Authored by Joe Thornber; committed by Mike Snitzer.
b29d4986 742c8fdc

+2087 -2564
-8
drivers/md/Kconfig
··· 325 325 of less memory utilization, improved performance and increased 326 326 adaptability in the face of changing workloads. 327 327 328 - config DM_CACHE_CLEANER 329 - tristate "Cleaner Cache Policy (EXPERIMENTAL)" 330 - depends on DM_CACHE 331 - default y 332 - ---help--- 333 - A simple cache policy that writes back all data to the 334 - origin. Used when decommissioning a dm-cache. 335 - 336 328 config DM_ERA 337 329 tristate "Era target (EXPERIMENTAL)" 338 330 depends on BLK_DEV_DM
+2 -3
drivers/md/Makefile
··· 13 13 += dm-log-userspace-base.o dm-log-userspace-transfer.o 14 14 dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o 15 15 dm-thin-pool-y += dm-thin.o dm-thin-metadata.o 16 - dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o 16 + dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ 17 + dm-cache-background-tracker.o 17 18 dm-cache-smq-y += dm-cache-policy-smq.o 18 - dm-cache-cleaner-y += dm-cache-policy-cleaner.o 19 19 dm-era-y += dm-era-target.o 20 20 dm-verity-y += dm-verity-target.o 21 21 md-mod-y += md.o bitmap.o ··· 57 57 obj-$(CONFIG_DM_VERITY) += dm-verity.o 58 58 obj-$(CONFIG_DM_CACHE) += dm-cache.o 59 59 obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o 60 - obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o 61 60 obj-$(CONFIG_DM_ERA) += dm-era.o 62 61 obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o 63 62
+238
drivers/md/dm-cache-background-tracker.c
··· 1 + /* 2 + * Copyright (C) 2017 Red Hat. All rights reserved. 3 + * 4 + * This file is released under the GPL. 5 + */ 6 + 7 + #include "dm-cache-background-tracker.h" 8 + 9 + /*----------------------------------------------------------------*/ 10 + 11 + #define DM_MSG_PREFIX "dm-background-tracker" 12 + 13 + struct bt_work { 14 + struct list_head list; 15 + struct rb_node node; 16 + struct policy_work work; 17 + }; 18 + 19 + struct background_tracker { 20 + unsigned max_work; 21 + atomic_t pending_promotes; 22 + atomic_t pending_writebacks; 23 + atomic_t pending_demotes; 24 + 25 + struct list_head issued; 26 + struct list_head queued; 27 + struct rb_root pending; 28 + 29 + struct kmem_cache *work_cache; 30 + }; 31 + 32 + struct background_tracker *btracker_create(unsigned max_work) 33 + { 34 + struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL); 35 + 36 + b->max_work = max_work; 37 + atomic_set(&b->pending_promotes, 0); 38 + atomic_set(&b->pending_writebacks, 0); 39 + atomic_set(&b->pending_demotes, 0); 40 + 41 + INIT_LIST_HEAD(&b->issued); 42 + INIT_LIST_HEAD(&b->queued); 43 + 44 + b->pending = RB_ROOT; 45 + b->work_cache = KMEM_CACHE(bt_work, 0); 46 + if (!b->work_cache) { 47 + DMERR("couldn't create mempool for background work items"); 48 + kfree(b); 49 + b = NULL; 50 + } 51 + 52 + return b; 53 + } 54 + EXPORT_SYMBOL_GPL(btracker_create); 55 + 56 + void btracker_destroy(struct background_tracker *b) 57 + { 58 + kmem_cache_destroy(b->work_cache); 59 + kfree(b); 60 + } 61 + EXPORT_SYMBOL_GPL(btracker_destroy); 62 + 63 + static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs) 64 + { 65 + if (from_oblock(lhs) < from_oblock(rhs)) 66 + return -1; 67 + 68 + if (from_oblock(rhs) < from_oblock(lhs)) 69 + return 1; 70 + 71 + return 0; 72 + } 73 + 74 + static bool __insert_pending(struct background_tracker *b, 75 + struct bt_work *nw) 76 + { 77 + int cmp; 78 + struct bt_work *w; 79 + struct rb_node **new = &b->pending.rb_node, *parent = NULL; 80 + 81 + while 
(*new) { 82 + w = container_of(*new, struct bt_work, node); 83 + 84 + parent = *new; 85 + cmp = cmp_oblock(w->work.oblock, nw->work.oblock); 86 + if (cmp < 0) 87 + new = &((*new)->rb_left); 88 + 89 + else if (cmp > 0) 90 + new = &((*new)->rb_right); 91 + 92 + else 93 + /* already present */ 94 + return false; 95 + } 96 + 97 + rb_link_node(&nw->node, parent, new); 98 + rb_insert_color(&nw->node, &b->pending); 99 + 100 + return true; 101 + } 102 + 103 + static struct bt_work *__find_pending(struct background_tracker *b, 104 + dm_oblock_t oblock) 105 + { 106 + int cmp; 107 + struct bt_work *w; 108 + struct rb_node **new = &b->pending.rb_node; 109 + 110 + while (*new) { 111 + w = container_of(*new, struct bt_work, node); 112 + 113 + cmp = cmp_oblock(w->work.oblock, oblock); 114 + if (cmp < 0) 115 + new = &((*new)->rb_left); 116 + 117 + else if (cmp > 0) 118 + new = &((*new)->rb_right); 119 + 120 + else 121 + break; 122 + } 123 + 124 + return *new ? w : NULL; 125 + } 126 + 127 + 128 + static void update_stats(struct background_tracker *b, struct policy_work *w, int delta) 129 + { 130 + switch (w->op) { 131 + case POLICY_PROMOTE: 132 + atomic_add(delta, &b->pending_promotes); 133 + break; 134 + 135 + case POLICY_DEMOTE: 136 + atomic_add(delta, &b->pending_demotes); 137 + break; 138 + 139 + case POLICY_WRITEBACK: 140 + atomic_add(delta, &b->pending_writebacks); 141 + break; 142 + } 143 + } 144 + 145 + unsigned btracker_nr_writebacks_queued(struct background_tracker *b) 146 + { 147 + return atomic_read(&b->pending_writebacks); 148 + } 149 + EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued); 150 + 151 + unsigned btracker_nr_demotions_queued(struct background_tracker *b) 152 + { 153 + return atomic_read(&b->pending_demotes); 154 + } 155 + EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued); 156 + 157 + static bool max_work_reached(struct background_tracker *b) 158 + { 159 + // FIXME: finish 160 + return false; 161 + } 162 + 163 + int btracker_queue(struct background_tracker *b, 
164 + struct policy_work *work, 165 + struct policy_work **pwork) 166 + { 167 + struct bt_work *w; 168 + 169 + if (pwork) 170 + *pwork = NULL; 171 + 172 + if (max_work_reached(b)) 173 + return -ENOMEM; 174 + 175 + w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT); 176 + if (!w) 177 + return -ENOMEM; 178 + 179 + memcpy(&w->work, work, sizeof(*work)); 180 + 181 + if (!__insert_pending(b, w)) { 182 + /* 183 + * There was a race, we'll just ignore this second 184 + * bit of work for the same oblock. 185 + */ 186 + kmem_cache_free(b->work_cache, w); 187 + return -EINVAL; 188 + } 189 + 190 + if (pwork) { 191 + *pwork = &w->work; 192 + list_add(&w->list, &b->issued); 193 + } else 194 + list_add(&w->list, &b->queued); 195 + update_stats(b, &w->work, 1); 196 + 197 + return 0; 198 + } 199 + EXPORT_SYMBOL_GPL(btracker_queue); 200 + 201 + /* 202 + * Returns -ENODATA if there's no work. 203 + */ 204 + int btracker_issue(struct background_tracker *b, struct policy_work **work) 205 + { 206 + struct bt_work *w; 207 + 208 + if (list_empty(&b->queued)) 209 + return -ENODATA; 210 + 211 + w = list_first_entry(&b->queued, struct bt_work, list); 212 + list_move(&w->list, &b->issued); 213 + *work = &w->work; 214 + 215 + return 0; 216 + } 217 + EXPORT_SYMBOL_GPL(btracker_issue); 218 + 219 + void btracker_complete(struct background_tracker *b, 220 + struct policy_work *op) 221 + { 222 + struct bt_work *w = container_of(op, struct bt_work, work); 223 + 224 + update_stats(b, &w->work, -1); 225 + rb_erase(&w->node, &b->pending); 226 + list_del(&w->list); 227 + kmem_cache_free(b->work_cache, w); 228 + } 229 + EXPORT_SYMBOL_GPL(btracker_complete); 230 + 231 + bool btracker_promotion_already_present(struct background_tracker *b, 232 + dm_oblock_t oblock) 233 + { 234 + return __find_pending(b, oblock) != NULL; 235 + } 236 + EXPORT_SYMBOL_GPL(btracker_promotion_already_present); 237 + 238 + /*----------------------------------------------------------------*/
+46
drivers/md/dm-cache-background-tracker.h
··· 1 + /* 2 + * Copyright (C) 2017 Red Hat. All rights reserved. 3 + * 4 + * This file is released under the GPL. 5 + */ 6 + 7 + #ifndef DM_CACHE_BACKGROUND_WORK_H 8 + #define DM_CACHE_BACKGROUND_WORK_H 9 + 10 + #include <linux/vmalloc.h> 11 + #include "dm-cache-policy.h" 12 + 13 + /*----------------------------------------------------------------*/ 14 + 15 + struct background_work; 16 + struct background_tracker; 17 + 18 + /* 19 + * FIXME: discuss lack of locking in all methods. 20 + */ 21 + struct background_tracker *btracker_create(unsigned max_work); 22 + void btracker_destroy(struct background_tracker *b); 23 + 24 + unsigned btracker_nr_writebacks_queued(struct background_tracker *b); 25 + unsigned btracker_nr_demotions_queued(struct background_tracker *b); 26 + 27 + /* 28 + * returns -EINVAL iff the work is already queued. -ENOMEM if the work 29 + * couldn't be queued for another reason. 30 + */ 31 + int btracker_queue(struct background_tracker *b, 32 + struct policy_work *work, 33 + struct policy_work **pwork); 34 + 35 + /* 36 + * Returns -ENODATA if there's no work. 37 + */ 38 + int btracker_issue(struct background_tracker *b, struct policy_work **work); 39 + void btracker_complete(struct background_tracker *b, 40 + struct policy_work *op); 41 + bool btracker_promotion_already_present(struct background_tracker *b, 42 + dm_oblock_t oblock); 43 + 44 + /*----------------------------------------------------------------*/ 45 + 46 + #endif
+2
drivers/md/dm-cache-metadata.h
··· 50 50 #define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL 51 51 #define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL 52 52 53 + struct dm_cache_metadata; 54 + 53 55 /* 54 56 * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on 55 57 * failure. If reopening then features must match.
-469
drivers/md/dm-cache-policy-cleaner.c
··· 1 - /* 2 - * Copyright (C) 2012 Red Hat. All rights reserved. 3 - * 4 - * writeback cache policy supporting flushing out dirty cache blocks. 5 - * 6 - * This file is released under the GPL. 7 - */ 8 - 9 - #include "dm-cache-policy.h" 10 - #include "dm.h" 11 - 12 - #include <linux/hash.h> 13 - #include <linux/module.h> 14 - #include <linux/slab.h> 15 - #include <linux/vmalloc.h> 16 - 17 - /*----------------------------------------------------------------*/ 18 - 19 - #define DM_MSG_PREFIX "cache cleaner" 20 - 21 - /* Cache entry struct. */ 22 - struct wb_cache_entry { 23 - struct list_head list; 24 - struct hlist_node hlist; 25 - 26 - dm_oblock_t oblock; 27 - dm_cblock_t cblock; 28 - bool dirty:1; 29 - bool pending:1; 30 - }; 31 - 32 - struct hash { 33 - struct hlist_head *table; 34 - dm_block_t hash_bits; 35 - unsigned nr_buckets; 36 - }; 37 - 38 - struct policy { 39 - struct dm_cache_policy policy; 40 - spinlock_t lock; 41 - 42 - struct list_head free; 43 - struct list_head clean; 44 - struct list_head clean_pending; 45 - struct list_head dirty; 46 - 47 - /* 48 - * We know exactly how many cblocks will be needed, 49 - * so we can allocate them up front. 50 - */ 51 - dm_cblock_t cache_size, nr_cblocks_allocated; 52 - struct wb_cache_entry *cblocks; 53 - struct hash chash; 54 - }; 55 - 56 - /*----------------------------------------------------------------------------*/ 57 - 58 - /* 59 - * Low-level functions. 60 - */ 61 - static unsigned next_power(unsigned n, unsigned min) 62 - { 63 - return roundup_pow_of_two(max(n, min)); 64 - } 65 - 66 - static struct policy *to_policy(struct dm_cache_policy *p) 67 - { 68 - return container_of(p, struct policy, policy); 69 - } 70 - 71 - static struct list_head *list_pop(struct list_head *q) 72 - { 73 - struct list_head *r = q->next; 74 - 75 - list_del(r); 76 - 77 - return r; 78 - } 79 - 80 - /*----------------------------------------------------------------------------*/ 81 - 82 - /* Allocate/free various resources. 
*/ 83 - static int alloc_hash(struct hash *hash, unsigned elts) 84 - { 85 - hash->nr_buckets = next_power(elts >> 4, 16); 86 - hash->hash_bits = __ffs(hash->nr_buckets); 87 - hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets); 88 - 89 - return hash->table ? 0 : -ENOMEM; 90 - } 91 - 92 - static void free_hash(struct hash *hash) 93 - { 94 - vfree(hash->table); 95 - } 96 - 97 - static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size) 98 - { 99 - int r = -ENOMEM; 100 - 101 - p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size)); 102 - if (p->cblocks) { 103 - unsigned u = from_cblock(cache_size); 104 - 105 - while (u--) 106 - list_add(&p->cblocks[u].list, &p->free); 107 - 108 - p->nr_cblocks_allocated = 0; 109 - 110 - /* Cache entries hash. */ 111 - r = alloc_hash(&p->chash, from_cblock(cache_size)); 112 - if (r) 113 - vfree(p->cblocks); 114 - } 115 - 116 - return r; 117 - } 118 - 119 - static void free_cache_blocks_and_hash(struct policy *p) 120 - { 121 - free_hash(&p->chash); 122 - vfree(p->cblocks); 123 - } 124 - 125 - static struct wb_cache_entry *alloc_cache_entry(struct policy *p) 126 - { 127 - struct wb_cache_entry *e; 128 - 129 - BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size)); 130 - 131 - e = list_entry(list_pop(&p->free), struct wb_cache_entry, list); 132 - p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1); 133 - 134 - return e; 135 - } 136 - 137 - /*----------------------------------------------------------------------------*/ 138 - 139 - /* Hash functions (lookup, insert, remove). 
*/ 140 - static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock) 141 - { 142 - struct hash *hash = &p->chash; 143 - unsigned h = hash_64(from_oblock(oblock), hash->hash_bits); 144 - struct wb_cache_entry *cur; 145 - struct hlist_head *bucket = &hash->table[h]; 146 - 147 - hlist_for_each_entry(cur, bucket, hlist) { 148 - if (cur->oblock == oblock) { 149 - /* Move upfront bucket for faster access. */ 150 - hlist_del(&cur->hlist); 151 - hlist_add_head(&cur->hlist, bucket); 152 - return cur; 153 - } 154 - } 155 - 156 - return NULL; 157 - } 158 - 159 - static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e) 160 - { 161 - unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits); 162 - 163 - hlist_add_head(&e->hlist, &p->chash.table[h]); 164 - } 165 - 166 - static void remove_cache_hash_entry(struct wb_cache_entry *e) 167 - { 168 - hlist_del(&e->hlist); 169 - } 170 - 171 - /* Public interface (see dm-cache-policy.h */ 172 - static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock, 173 - bool can_block, bool can_migrate, bool discarded_oblock, 174 - struct bio *bio, struct policy_locker *locker, 175 - struct policy_result *result) 176 - { 177 - struct policy *p = to_policy(pe); 178 - struct wb_cache_entry *e; 179 - unsigned long flags; 180 - 181 - result->op = POLICY_MISS; 182 - 183 - if (can_block) 184 - spin_lock_irqsave(&p->lock, flags); 185 - 186 - else if (!spin_trylock_irqsave(&p->lock, flags)) 187 - return -EWOULDBLOCK; 188 - 189 - e = lookup_cache_entry(p, oblock); 190 - if (e) { 191 - result->op = POLICY_HIT; 192 - result->cblock = e->cblock; 193 - 194 - } 195 - 196 - spin_unlock_irqrestore(&p->lock, flags); 197 - 198 - return 0; 199 - } 200 - 201 - static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock) 202 - { 203 - int r; 204 - struct policy *p = to_policy(pe); 205 - struct wb_cache_entry *e; 206 - unsigned long flags; 207 - 208 - if 
(!spin_trylock_irqsave(&p->lock, flags)) 209 - return -EWOULDBLOCK; 210 - 211 - e = lookup_cache_entry(p, oblock); 212 - if (e) { 213 - *cblock = e->cblock; 214 - r = 0; 215 - 216 - } else 217 - r = -ENOENT; 218 - 219 - spin_unlock_irqrestore(&p->lock, flags); 220 - 221 - return r; 222 - } 223 - 224 - static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set) 225 - { 226 - struct policy *p = to_policy(pe); 227 - struct wb_cache_entry *e; 228 - 229 - e = lookup_cache_entry(p, oblock); 230 - BUG_ON(!e); 231 - 232 - if (set) { 233 - if (!e->dirty) { 234 - e->dirty = true; 235 - list_move(&e->list, &p->dirty); 236 - } 237 - 238 - } else { 239 - if (e->dirty) { 240 - e->pending = false; 241 - e->dirty = false; 242 - list_move(&e->list, &p->clean); 243 - } 244 - } 245 - } 246 - 247 - static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) 248 - { 249 - struct policy *p = to_policy(pe); 250 - unsigned long flags; 251 - 252 - spin_lock_irqsave(&p->lock, flags); 253 - __set_clear_dirty(pe, oblock, true); 254 - spin_unlock_irqrestore(&p->lock, flags); 255 - } 256 - 257 - static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) 258 - { 259 - struct policy *p = to_policy(pe); 260 - unsigned long flags; 261 - 262 - spin_lock_irqsave(&p->lock, flags); 263 - __set_clear_dirty(pe, oblock, false); 264 - spin_unlock_irqrestore(&p->lock, flags); 265 - } 266 - 267 - static void add_cache_entry(struct policy *p, struct wb_cache_entry *e) 268 - { 269 - insert_cache_hash_entry(p, e); 270 - if (e->dirty) 271 - list_add(&e->list, &p->dirty); 272 - else 273 - list_add(&e->list, &p->clean); 274 - } 275 - 276 - static int wb_load_mapping(struct dm_cache_policy *pe, 277 - dm_oblock_t oblock, dm_cblock_t cblock, 278 - uint32_t hint, bool hint_valid) 279 - { 280 - int r; 281 - struct policy *p = to_policy(pe); 282 - struct wb_cache_entry *e = alloc_cache_entry(p); 283 - 284 - if (e) { 285 - e->cblock = cblock; 286 - e->oblock = 
oblock; 287 - e->dirty = false; /* blocks default to clean */ 288 - add_cache_entry(p, e); 289 - r = 0; 290 - 291 - } else 292 - r = -ENOMEM; 293 - 294 - return r; 295 - } 296 - 297 - static void wb_destroy(struct dm_cache_policy *pe) 298 - { 299 - struct policy *p = to_policy(pe); 300 - 301 - free_cache_blocks_and_hash(p); 302 - kfree(p); 303 - } 304 - 305 - static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock) 306 - { 307 - struct wb_cache_entry *r = lookup_cache_entry(p, oblock); 308 - 309 - BUG_ON(!r); 310 - 311 - remove_cache_hash_entry(r); 312 - list_del(&r->list); 313 - 314 - return r; 315 - } 316 - 317 - static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock) 318 - { 319 - struct policy *p = to_policy(pe); 320 - struct wb_cache_entry *e; 321 - unsigned long flags; 322 - 323 - spin_lock_irqsave(&p->lock, flags); 324 - e = __wb_force_remove_mapping(p, oblock); 325 - list_add_tail(&e->list, &p->free); 326 - BUG_ON(!from_cblock(p->nr_cblocks_allocated)); 327 - p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1); 328 - spin_unlock_irqrestore(&p->lock, flags); 329 - } 330 - 331 - static void wb_force_mapping(struct dm_cache_policy *pe, 332 - dm_oblock_t current_oblock, dm_oblock_t oblock) 333 - { 334 - struct policy *p = to_policy(pe); 335 - struct wb_cache_entry *e; 336 - unsigned long flags; 337 - 338 - spin_lock_irqsave(&p->lock, flags); 339 - e = __wb_force_remove_mapping(p, current_oblock); 340 - e->oblock = oblock; 341 - add_cache_entry(p, e); 342 - spin_unlock_irqrestore(&p->lock, flags); 343 - } 344 - 345 - static struct wb_cache_entry *get_next_dirty_entry(struct policy *p) 346 - { 347 - struct list_head *l; 348 - struct wb_cache_entry *r; 349 - 350 - if (list_empty(&p->dirty)) 351 - return NULL; 352 - 353 - l = list_pop(&p->dirty); 354 - r = container_of(l, struct wb_cache_entry, list); 355 - list_add(l, &p->clean_pending); 356 - 357 - return r; 358 - } 359 - 360 - 
static int wb_writeback_work(struct dm_cache_policy *pe, 361 - dm_oblock_t *oblock, 362 - dm_cblock_t *cblock, 363 - bool critical_only) 364 - { 365 - int r = -ENOENT; 366 - struct policy *p = to_policy(pe); 367 - struct wb_cache_entry *e; 368 - unsigned long flags; 369 - 370 - spin_lock_irqsave(&p->lock, flags); 371 - 372 - e = get_next_dirty_entry(p); 373 - if (e) { 374 - *oblock = e->oblock; 375 - *cblock = e->cblock; 376 - r = 0; 377 - } 378 - 379 - spin_unlock_irqrestore(&p->lock, flags); 380 - 381 - return r; 382 - } 383 - 384 - static dm_cblock_t wb_residency(struct dm_cache_policy *pe) 385 - { 386 - return to_policy(pe)->nr_cblocks_allocated; 387 - } 388 - 389 - /* Init the policy plugin interface function pointers. */ 390 - static void init_policy_functions(struct policy *p) 391 - { 392 - p->policy.destroy = wb_destroy; 393 - p->policy.map = wb_map; 394 - p->policy.lookup = wb_lookup; 395 - p->policy.set_dirty = wb_set_dirty; 396 - p->policy.clear_dirty = wb_clear_dirty; 397 - p->policy.load_mapping = wb_load_mapping; 398 - p->policy.get_hint = NULL; 399 - p->policy.remove_mapping = wb_remove_mapping; 400 - p->policy.writeback_work = wb_writeback_work; 401 - p->policy.force_mapping = wb_force_mapping; 402 - p->policy.residency = wb_residency; 403 - p->policy.tick = NULL; 404 - } 405 - 406 - static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, 407 - sector_t origin_size, 408 - sector_t cache_block_size) 409 - { 410 - int r; 411 - struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL); 412 - 413 - if (!p) 414 - return NULL; 415 - 416 - init_policy_functions(p); 417 - INIT_LIST_HEAD(&p->free); 418 - INIT_LIST_HEAD(&p->clean); 419 - INIT_LIST_HEAD(&p->clean_pending); 420 - INIT_LIST_HEAD(&p->dirty); 421 - 422 - p->cache_size = cache_size; 423 - spin_lock_init(&p->lock); 424 - 425 - /* Allocate cache entry structs and add them to free list. 
*/ 426 - r = alloc_cache_blocks_with_hash(p, cache_size); 427 - if (!r) 428 - return &p->policy; 429 - 430 - kfree(p); 431 - 432 - return NULL; 433 - } 434 - /*----------------------------------------------------------------------------*/ 435 - 436 - static struct dm_cache_policy_type wb_policy_type = { 437 - .name = "cleaner", 438 - .version = {1, 0, 0}, 439 - .hint_size = 4, 440 - .owner = THIS_MODULE, 441 - .create = wb_create 442 - }; 443 - 444 - static int __init wb_init(void) 445 - { 446 - int r = dm_cache_policy_register(&wb_policy_type); 447 - 448 - if (r < 0) 449 - DMERR("register failed %d", r); 450 - else 451 - DMINFO("version %u.%u.%u loaded", 452 - wb_policy_type.version[0], 453 - wb_policy_type.version[1], 454 - wb_policy_type.version[2]); 455 - 456 - return r; 457 - } 458 - 459 - static void __exit wb_exit(void) 460 - { 461 - dm_cache_policy_unregister(&wb_policy_type); 462 - } 463 - 464 - module_init(wb_init); 465 - module_exit(wb_exit); 466 - 467 - MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>"); 468 - MODULE_LICENSE("GPL"); 469 - MODULE_DESCRIPTION("cleaner cache policy");
+43 -43
drivers/md/dm-cache-policy-internal.h
··· 12 12 13 13 /*----------------------------------------------------------------*/ 14 14 15 - /* 16 - * Little inline functions that simplify calling the policy methods. 17 - */ 18 - static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock, 19 - bool can_block, bool can_migrate, bool discarded_oblock, 20 - struct bio *bio, struct policy_locker *locker, 21 - struct policy_result *result) 15 + static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, 16 + int data_dir, bool fast_copy, bool *background_queued) 22 17 { 23 - return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result); 18 + return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued); 24 19 } 25 20 26 - static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) 21 + static inline int policy_lookup_with_work(struct dm_cache_policy *p, 22 + dm_oblock_t oblock, dm_cblock_t *cblock, 23 + int data_dir, bool fast_copy, 24 + struct policy_work **work) 27 25 { 28 - BUG_ON(!p->lookup); 29 - return p->lookup(p, oblock, cblock); 26 + if (!p->lookup_with_work) { 27 + *work = NULL; 28 + return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL); 29 + } 30 + 31 + return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work); 30 32 } 31 33 32 - static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 34 + static inline int policy_get_background_work(struct dm_cache_policy *p, 35 + bool idle, struct policy_work **result) 33 36 { 34 - if (p->set_dirty) 35 - p->set_dirty(p, oblock); 37 + return p->get_background_work(p, idle, result); 36 38 } 37 39 38 - static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 40 + static inline void policy_complete_background_work(struct dm_cache_policy *p, 41 + struct policy_work *work, 42 + bool success) 39 43 { 40 - if (p->clear_dirty) 41 - p->clear_dirty(p, oblock); 44 + 
return p->complete_background_work(p, work, success); 45 + } 46 + 47 + static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) 48 + { 49 + p->set_dirty(p, cblock); 50 + } 51 + 52 + static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) 53 + { 54 + p->clear_dirty(p, cblock); 42 55 } 43 56 44 57 static inline int policy_load_mapping(struct dm_cache_policy *p, 45 58 dm_oblock_t oblock, dm_cblock_t cblock, 46 - uint32_t hint, bool hint_valid) 59 + bool dirty, uint32_t hint, bool hint_valid) 47 60 { 48 - return p->load_mapping(p, oblock, cblock, hint, hint_valid); 61 + return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid); 62 + } 63 + 64 + static inline int policy_invalidate_mapping(struct dm_cache_policy *p, 65 + dm_cblock_t cblock) 66 + { 67 + return p->invalidate_mapping(p, cblock); 49 68 } 50 69 51 70 static inline uint32_t policy_get_hint(struct dm_cache_policy *p, 52 71 dm_cblock_t cblock) 53 72 { 54 73 return p->get_hint ? p->get_hint(p, cblock) : 0; 55 - } 56 - 57 - static inline int policy_writeback_work(struct dm_cache_policy *p, 58 - dm_oblock_t *oblock, 59 - dm_cblock_t *cblock, 60 - bool critical_only) 61 - { 62 - return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT; 63 - } 64 - 65 - static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 66 - { 67 - p->remove_mapping(p, oblock); 68 - } 69 - 70 - static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) 71 - { 72 - return p->remove_cblock(p, cblock); 73 - } 74 - 75 - static inline void policy_force_mapping(struct dm_cache_policy *p, 76 - dm_oblock_t current_oblock, dm_oblock_t new_oblock) 77 - { 78 - return p->force_mapping(p, current_oblock, new_oblock); 79 74 } 80 75 81 76 static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) ··· 100 105 const char *key, const char *value) 101 106 { 102 107 return p->set_config_value ? 
p->set_config_value(p, key, value) : -EINVAL; 108 + } 109 + 110 + static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow) 111 + { 112 + return p->allow_migrations(p, allow); 103 113 } 104 114 105 115 /*----------------------------------------------------------------*/
+489 -388
drivers/md/dm-cache-policy-smq.c
··· 4 4 * This file is released under the GPL. 5 5 */ 6 6 7 - #include "dm-cache-policy.h" 7 + #include "dm-cache-background-tracker.h" 8 8 #include "dm-cache-policy-internal.h" 9 + #include "dm-cache-policy.h" 9 10 #include "dm.h" 10 11 11 12 #include <linux/hash.h> ··· 39 38 unsigned hash_next:28; 40 39 unsigned prev:28; 41 40 unsigned next:28; 42 - unsigned level:7; 41 + unsigned level:6; 43 42 bool dirty:1; 44 43 bool allocated:1; 45 44 bool sentinel:1; 45 + bool pending_work:1; 46 46 47 47 dm_oblock_t oblock; 48 48 }; ··· 281 279 */ 282 280 static void q_push(struct queue *q, struct entry *e) 283 281 { 282 + BUG_ON(e->pending_work); 283 + 284 284 if (!e->sentinel) 285 285 q->nr_elts++; 286 286 287 287 l_add_tail(q->es, q->qs + e->level, e); 288 288 } 289 289 290 + static void q_push_front(struct queue *q, struct entry *e) 291 + { 292 + BUG_ON(e->pending_work); 293 + 294 + if (!e->sentinel) 295 + q->nr_elts++; 296 + 297 + l_add_head(q->es, q->qs + e->level, e); 298 + } 299 + 290 300 static void q_push_before(struct queue *q, struct entry *old, struct entry *e) 291 301 { 302 + BUG_ON(e->pending_work); 303 + 292 304 if (!e->sentinel) 293 305 q->nr_elts++; 294 306 ··· 344 328 static struct entry *q_pop(struct queue *q) 345 329 { 346 330 struct entry *e = q_peek(q, q->nr_levels, true); 347 - 348 - if (e) 349 - q_del(q, e); 350 - 351 - return e; 352 - } 353 - 354 - /* 355 - * Pops an entry from a level that is not past a sentinel. 
356 - */ 357 - static struct entry *q_pop_old(struct queue *q, unsigned max_level) 358 - { 359 - struct entry *e = q_peek(q, max_level, false); 360 331 361 332 if (e) 362 333 q_del(q, e); ··· 449 446 break; 450 447 451 448 e->level = level + 1u; 452 - l_add_head(q->es, l_above, e); 449 + l_add_tail(q->es, l_above, e); 453 450 } 454 451 } 455 452 } 456 453 457 - static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels) 454 + static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels, 455 + struct entry *s1, struct entry *s2) 458 456 { 459 457 struct entry *de; 460 - unsigned new_level; 458 + unsigned sentinels_passed = 0; 459 + unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels); 461 460 462 - q_del(q, e); 463 - 461 + /* try and find an entry to swap with */ 464 462 if (extra_levels && (e->level < q->nr_levels - 1u)) { 465 - new_level = min(q->nr_levels - 1u, e->level + extra_levels); 466 - for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) { 467 - if (de->sentinel) 468 - continue; 463 + for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de)) 464 + sentinels_passed++; 469 465 466 + if (de) { 470 467 q_del(q, de); 471 468 de->level = e->level; 469 + if (s1) { 470 + switch (sentinels_passed) { 471 + case 0: 472 + q_push_before(q, s1, de); 473 + break; 472 474 473 - if (dest) 474 - q_push_before(q, dest, de); 475 - else 475 + case 1: 476 + q_push_before(q, s2, de); 477 + break; 478 + 479 + default: 480 + q_push(q, de); 481 + } 482 + } else 476 483 q_push(q, de); 477 - break; 478 484 } 479 - 480 - e->level = new_level; 481 485 } 482 486 487 + q_del(q, e); 488 + e->level = new_level; 483 489 q_push(q, e); 484 - } 485 - 486 - static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels) 487 - { 488 - q_requeue_before(q, NULL, e, extra_levels); 489 490 } 490 491 491 492 
/*----------------------------------------------------------------*/ ··· 557 550 558 551 /*----------------------------------------------------------------*/ 559 552 560 - struct hash_table { 553 + struct smq_hash_table { 561 554 struct entry_space *es; 562 555 unsigned long long hash_bits; 563 556 unsigned *buckets; ··· 567 560 * All cache entries are stored in a chained hash table. To save space we 568 561 * use indexing again, and only store indexes to the next entry. 569 562 */ 570 - static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries) 563 + static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries) 571 564 { 572 565 unsigned i, nr_buckets; 573 566 ··· 585 578 return 0; 586 579 } 587 580 588 - static void h_exit(struct hash_table *ht) 581 + static void h_exit(struct smq_hash_table *ht) 589 582 { 590 583 vfree(ht->buckets); 591 584 } 592 585 593 - static struct entry *h_head(struct hash_table *ht, unsigned bucket) 586 + static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket) 594 587 { 595 588 return to_entry(ht->es, ht->buckets[bucket]); 596 589 } 597 590 598 - static struct entry *h_next(struct hash_table *ht, struct entry *e) 591 + static struct entry *h_next(struct smq_hash_table *ht, struct entry *e) 599 592 { 600 593 return to_entry(ht->es, e->hash_next); 601 594 } 602 595 603 - static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e) 596 + static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e) 604 597 { 605 598 e->hash_next = ht->buckets[bucket]; 606 599 ht->buckets[bucket] = to_index(ht->es, e); 607 600 } 608 601 609 - static void h_insert(struct hash_table *ht, struct entry *e) 602 + static void h_insert(struct smq_hash_table *ht, struct entry *e) 610 603 { 611 604 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); 612 605 __h_insert(ht, h, e); 613 606 } 614 607 615 - static struct entry *__h_lookup(struct 
hash_table *ht, unsigned h, dm_oblock_t oblock, 608 + static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock, 616 609 struct entry **prev) 617 610 { 618 611 struct entry *e; ··· 628 621 return NULL; 629 622 } 630 623 631 - static void __h_unlink(struct hash_table *ht, unsigned h, 624 + static void __h_unlink(struct smq_hash_table *ht, unsigned h, 632 625 struct entry *e, struct entry *prev) 633 626 { 634 627 if (prev) ··· 640 633 /* 641 634 * Also moves each entry to the front of the bucket. 642 635 */ 643 - static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) 636 + static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock) 644 637 { 645 638 struct entry *e, *prev; 646 639 unsigned h = hash_64(from_oblock(oblock), ht->hash_bits); ··· 658 651 return e; 659 652 } 660 653 661 - static void h_remove(struct hash_table *ht, struct entry *e) 654 + static void h_remove(struct smq_hash_table *ht, struct entry *e) 662 655 { 663 656 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); 664 657 struct entry *prev; ··· 706 699 e->next = INDEXER_NULL; 707 700 e->prev = INDEXER_NULL; 708 701 e->level = 0u; 702 + e->dirty = true; /* FIXME: audit */ 709 703 e->allocated = true; 704 + e->sentinel = false; 705 + e->pending_work = false; 710 706 } 711 707 712 708 static struct entry *alloc_entry(struct entry_alloc *ea) ··· 772 762 #define NR_HOTSPOT_LEVELS 64u 773 763 #define NR_CACHE_LEVELS 64u 774 764 775 - #define WRITEBACK_PERIOD (10 * HZ) 776 - #define DEMOTE_PERIOD (60 * HZ) 765 + #define WRITEBACK_PERIOD (10ul * HZ) 766 + #define DEMOTE_PERIOD (60ul * HZ) 777 767 778 768 #define HOTSPOT_UPDATE_PERIOD (HZ) 779 - #define CACHE_UPDATE_PERIOD (10u * HZ) 769 + #define CACHE_UPDATE_PERIOD (60ul * HZ) 780 770 781 771 struct smq_policy { 782 772 struct dm_cache_policy policy; ··· 824 814 * The hash tables allows us to quickly find an entry by origin 825 815 * block. 
826 816 */ 827 - struct hash_table table; 828 - struct hash_table hotspot_table; 817 + struct smq_hash_table table; 818 + struct smq_hash_table hotspot_table; 829 819 830 820 bool current_writeback_sentinels; 831 821 unsigned long next_writeback_period; ··· 838 828 839 829 unsigned long next_hotspot_period; 840 830 unsigned long next_cache_period; 831 + 832 + struct background_tracker *bg_work; 833 + 834 + bool migrations_allowed; 841 835 }; 842 836 843 837 /*----------------------------------------------------------------*/ ··· 890 876 static void update_sentinels(struct smq_policy *mq) 891 877 { 892 878 if (time_after(jiffies, mq->next_writeback_period)) { 893 - __update_writeback_sentinels(mq); 894 879 mq->next_writeback_period = jiffies + WRITEBACK_PERIOD; 895 880 mq->current_writeback_sentinels = !mq->current_writeback_sentinels; 881 + __update_writeback_sentinels(mq); 896 882 } 897 883 898 884 if (time_after(jiffies, mq->next_demote_period)) { 899 - __update_demote_sentinels(mq); 900 885 mq->next_demote_period = jiffies + DEMOTE_PERIOD; 901 886 mq->current_demote_sentinels = !mq->current_demote_sentinels; 887 + __update_demote_sentinels(mq); 902 888 } 903 889 } 904 890 ··· 934 920 935 921 /*----------------------------------------------------------------*/ 936 922 937 - /* 938 - * These methods tie together the dirty queue, clean queue and hash table. 939 - */ 940 - static void push_new(struct smq_policy *mq, struct entry *e) 923 + static void del_queue(struct smq_policy *mq, struct entry *e) 941 924 { 942 - struct queue *q = e->dirty ? &mq->dirty : &mq->clean; 943 - h_insert(&mq->table, e); 944 - q_push(q, e); 925 + q_del(e->dirty ? 
&mq->dirty : &mq->clean, e); 945 926 } 946 927 928 + static void push_queue(struct smq_policy *mq, struct entry *e) 929 + { 930 + if (e->dirty) 931 + q_push(&mq->dirty, e); 932 + else 933 + q_push(&mq->clean, e); 934 + } 935 + 936 + // !h, !q, a -> h, q, a 947 937 static void push(struct smq_policy *mq, struct entry *e) 948 938 { 949 - struct entry *sentinel; 950 - 951 939 h_insert(&mq->table, e); 952 - 953 - /* 954 - * Punch this into the queue just in front of the sentinel, to 955 - * ensure it's cleaned straight away. 956 - */ 957 - if (e->dirty) { 958 - sentinel = writeback_sentinel(mq, e->level); 959 - q_push_before(&mq->dirty, sentinel, e); 960 - } else { 961 - sentinel = demote_sentinel(mq, e->level); 962 - q_push_before(&mq->clean, sentinel, e); 963 - } 940 + if (!e->pending_work) 941 + push_queue(mq, e); 964 942 } 965 943 966 - /* 967 - * Removes an entry from cache. Removes from the hash table. 968 - */ 969 - static void __del(struct smq_policy *mq, struct queue *q, struct entry *e) 944 + static void push_queue_front(struct smq_policy *mq, struct entry *e) 970 945 { 971 - q_del(q, e); 972 - h_remove(&mq->table, e); 946 + if (e->dirty) 947 + q_push_front(&mq->dirty, e); 948 + else 949 + q_push_front(&mq->clean, e); 973 950 } 974 951 975 - static void del(struct smq_policy *mq, struct entry *e) 952 + static void push_front(struct smq_policy *mq, struct entry *e) 976 953 { 977 - __del(mq, e->dirty ? 
&mq->dirty : &mq->clean, e); 978 - } 979 - 980 - static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level) 981 - { 982 - struct entry *e = q_pop_old(q, max_level); 983 - if (e) 984 - h_remove(&mq->table, e); 985 - return e; 954 + h_insert(&mq->table, e); 955 + if (!e->pending_work) 956 + push_queue_front(mq, e); 986 957 } 987 958 988 959 static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) ··· 977 978 978 979 static void requeue(struct smq_policy *mq, struct entry *e) 979 980 { 980 - struct entry *sentinel; 981 + /* 982 + * Pending work has temporarily been taken out of the queues. 983 + */ 984 + if (e->pending_work) 985 + return; 981 986 982 987 if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) { 983 - if (e->dirty) { 984 - sentinel = writeback_sentinel(mq, e->level); 985 - q_requeue_before(&mq->dirty, sentinel, e, 1u); 986 - } else { 987 - sentinel = demote_sentinel(mq, e->level); 988 - q_requeue_before(&mq->clean, sentinel, e, 1u); 988 + if (!e->dirty) { 989 + q_requeue(&mq->clean, e, 1u, NULL, NULL); 990 + return; 989 991 } 992 + 993 + q_requeue(&mq->dirty, e, 1u, 994 + get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels), 995 + get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels)); 990 996 } 991 997 } 992 998 ··· 1030 1026 unsigned threshold_level = allocator_empty(&mq->cache_alloc) ? 1031 1027 default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u); 1032 1028 1029 + threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS); 1030 + 1033 1031 /* 1034 1032 * If the hotspot queue is performing badly then we have little 1035 1033 * confidence that we know which blocks to promote. 
So we cut down ··· 1051 1045 } 1052 1046 1053 1047 mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level; 1054 - mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u; 1048 + mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level); 1055 1049 } 1056 1050 1057 1051 /* ··· 1101 1095 } 1102 1096 } 1103 1097 1104 - static int demote_cblock(struct smq_policy *mq, 1105 - struct policy_locker *locker, 1106 - dm_oblock_t *oblock) 1098 + /*----------------------------------------------------------------*/ 1099 + 1100 + /* 1101 + * Targets are given as a percentage. 1102 + */ 1103 + #define CLEAN_TARGET 25u 1104 + #define FREE_TARGET 25u 1105 + 1106 + static unsigned percent_to_target(struct smq_policy *mq, unsigned p) 1107 1107 { 1108 - struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false); 1109 - if (!demoted) 1110 - /* 1111 - * We could get a block from mq->dirty, but that 1112 - * would add extra latency to the triggering bio as it 1113 - * waits for the writeback. Better to not promote this 1114 - * time and hope there's a clean block next time this block 1115 - * is hit. 1116 - */ 1117 - return -ENOSPC; 1118 - 1119 - if (locker->fn(locker, demoted->oblock)) 1120 - /* 1121 - * We couldn't lock this block. 1122 - */ 1123 - return -EBUSY; 1124 - 1125 - del(mq, demoted); 1126 - *oblock = demoted->oblock; 1127 - free_entry(&mq->cache_alloc, demoted); 1128 - 1129 - return 0; 1108 + return from_cblock(mq->cache_size) * p / 100u; 1130 1109 } 1110 + 1111 + static bool clean_target_met(struct smq_policy *mq, bool idle) 1112 + { 1113 + /* 1114 + * Cache entries may not be populated. So we cannot rely on the 1115 + * size of the clean queue. 1116 + */ 1117 + unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); 1118 + 1119 + if (idle) 1120 + /* 1121 + * We'd like to clean everything. 
1122 + */ 1123 + return q_size(&mq->dirty) == 0u; 1124 + else 1125 + return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >= 1126 + percent_to_target(mq, CLEAN_TARGET); 1127 + } 1128 + 1129 + static bool free_target_met(struct smq_policy *mq, bool idle) 1130 + { 1131 + unsigned nr_free = from_cblock(mq->cache_size) - 1132 + mq->cache_alloc.nr_allocated; 1133 + 1134 + if (idle) 1135 + return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >= 1136 + percent_to_target(mq, FREE_TARGET); 1137 + else 1138 + return true; 1139 + } 1140 + 1141 + /*----------------------------------------------------------------*/ 1142 + 1143 + static void mark_pending(struct smq_policy *mq, struct entry *e) 1144 + { 1145 + BUG_ON(e->sentinel); 1146 + BUG_ON(!e->allocated); 1147 + BUG_ON(e->pending_work); 1148 + e->pending_work = true; 1149 + } 1150 + 1151 + static void clear_pending(struct smq_policy *mq, struct entry *e) 1152 + { 1153 + BUG_ON(!e->pending_work); 1154 + e->pending_work = false; 1155 + } 1156 + 1157 + static void queue_writeback(struct smq_policy *mq) 1158 + { 1159 + int r; 1160 + struct policy_work work; 1161 + struct entry *e; 1162 + 1163 + e = q_peek(&mq->dirty, mq->dirty.nr_levels, false); 1164 + if (e) { 1165 + mark_pending(mq, e); 1166 + q_del(&mq->dirty, e); 1167 + 1168 + work.op = POLICY_WRITEBACK; 1169 + work.oblock = e->oblock; 1170 + work.cblock = infer_cblock(mq, e); 1171 + 1172 + r = btracker_queue(mq->bg_work, &work, NULL); 1173 + WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race. 
1174 + } 1175 + } 1176 + 1177 + static void queue_demotion(struct smq_policy *mq) 1178 + { 1179 + struct policy_work work; 1180 + struct entry *e; 1181 + 1182 + if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed))) 1183 + return; 1184 + 1185 + e = q_peek(&mq->clean, mq->clean.nr_levels, true); 1186 + if (!e) { 1187 + if (!clean_target_met(mq, false)) 1188 + queue_writeback(mq); 1189 + return; 1190 + } 1191 + 1192 + mark_pending(mq, e); 1193 + q_del(&mq->clean, e); 1194 + 1195 + work.op = POLICY_DEMOTE; 1196 + work.oblock = e->oblock; 1197 + work.cblock = infer_cblock(mq, e); 1198 + btracker_queue(mq->bg_work, &work, NULL); 1199 + } 1200 + 1201 + static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, 1202 + struct policy_work **workp) 1203 + { 1204 + struct entry *e; 1205 + struct policy_work work; 1206 + 1207 + if (!mq->migrations_allowed) 1208 + return; 1209 + 1210 + if (allocator_empty(&mq->cache_alloc)) { 1211 + if (!free_target_met(mq, false)) 1212 + queue_demotion(mq); 1213 + return; 1214 + } 1215 + 1216 + if (btracker_promotion_already_present(mq->bg_work, oblock)) 1217 + return; 1218 + 1219 + /* 1220 + * We allocate the entry now to reserve the cblock. If the 1221 + * background work is aborted we must remember to free it. 1222 + */ 1223 + e = alloc_entry(&mq->cache_alloc); 1224 + BUG_ON(!e); 1225 + e->pending_work = true; 1226 + work.op = POLICY_PROMOTE; 1227 + work.oblock = oblock; 1228 + work.cblock = infer_cblock(mq, e); 1229 + btracker_queue(mq->bg_work, &work, workp); 1230 + } 1231 + 1232 + /*----------------------------------------------------------------*/ 1131 1233 1132 1234 enum promote_result { 1133 1235 PROMOTE_NOT, ··· 1251 1137 return promote ? 
PROMOTE_PERMANENT : PROMOTE_NOT; 1252 1138 } 1253 1139 1254 - static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio, 1255 - bool fast_promote) 1140 + static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, 1141 + int data_dir, bool fast_promote) 1256 1142 { 1257 - if (bio_data_dir(bio) == WRITE) { 1143 + if (data_dir == WRITE) { 1258 1144 if (!allocator_empty(&mq->cache_alloc) && fast_promote) 1259 1145 return PROMOTE_TEMPORARY; 1260 1146 1261 - else 1262 - return maybe_promote(hs_e->level >= mq->write_promote_level); 1147 + return maybe_promote(hs_e->level >= mq->write_promote_level); 1263 1148 } else 1264 1149 return maybe_promote(hs_e->level >= mq->read_promote_level); 1265 - } 1266 - 1267 - static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock, 1268 - struct policy_locker *locker, 1269 - struct policy_result *result, enum promote_result pr) 1270 - { 1271 - int r; 1272 - struct entry *e; 1273 - 1274 - if (allocator_empty(&mq->cache_alloc)) { 1275 - result->op = POLICY_REPLACE; 1276 - r = demote_cblock(mq, locker, &result->old_oblock); 1277 - if (r) { 1278 - result->op = POLICY_MISS; 1279 - return; 1280 - } 1281 - 1282 - } else 1283 - result->op = POLICY_NEW; 1284 - 1285 - e = alloc_entry(&mq->cache_alloc); 1286 - BUG_ON(!e); 1287 - e->oblock = oblock; 1288 - 1289 - if (pr == PROMOTE_TEMPORARY) 1290 - push(mq, e); 1291 - else 1292 - push_new(mq, e); 1293 - 1294 - result->cblock = infer_cblock(mq, e); 1295 1150 } 1296 1151 1297 1152 static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) ··· 1270 1187 return to_oblock(r); 1271 1188 } 1272 1189 1273 - static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio) 1190 + static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b) 1274 1191 { 1275 1192 unsigned hi; 1276 1193 dm_oblock_t hb = to_hblock(mq, b); ··· 1282 1199 hi = get_index(&mq->hotspot_alloc, e); 
1283 1200 q_requeue(&mq->hotspot, e, 1284 1201 test_and_set_bit(hi, mq->hotspot_hit_bits) ? 1285 - 0u : mq->hotspot_level_jump); 1202 + 0u : mq->hotspot_level_jump, 1203 + NULL, NULL); 1286 1204 1287 1205 } else { 1288 1206 stats_miss(&mq->hotspot_stats); ··· 1309 1225 return e; 1310 1226 } 1311 1227 1312 - /* 1313 - * Looks the oblock up in the hash table, then decides whether to put in 1314 - * pre_cache, or cache etc. 1315 - */ 1316 - static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock, 1317 - bool can_migrate, bool fast_promote, 1318 - struct policy_locker *locker, struct policy_result *result) 1319 - { 1320 - struct entry *e, *hs_e; 1321 - enum promote_result pr; 1322 - 1323 - hs_e = update_hotspot_queue(mq, oblock, bio); 1324 - 1325 - e = h_lookup(&mq->table, oblock); 1326 - if (e) { 1327 - stats_level_accessed(&mq->cache_stats, e->level); 1328 - 1329 - requeue(mq, e); 1330 - result->op = POLICY_HIT; 1331 - result->cblock = infer_cblock(mq, e); 1332 - 1333 - } else { 1334 - stats_miss(&mq->cache_stats); 1335 - 1336 - pr = should_promote(mq, hs_e, bio, fast_promote); 1337 - if (pr == PROMOTE_NOT) 1338 - result->op = POLICY_MISS; 1339 - 1340 - else { 1341 - if (!can_migrate) { 1342 - result->op = POLICY_MISS; 1343 - return -EWOULDBLOCK; 1344 - } 1345 - 1346 - insert_in_cache(mq, oblock, locker, result, pr); 1347 - } 1348 - } 1349 - 1350 - return 0; 1351 - } 1352 - 1353 1228 /*----------------------------------------------------------------*/ 1354 1229 1355 1230 /* ··· 1325 1282 { 1326 1283 struct smq_policy *mq = to_smq_policy(p); 1327 1284 1285 + btracker_destroy(mq->bg_work); 1328 1286 h_exit(&mq->hotspot_table); 1329 1287 h_exit(&mq->table); 1330 1288 free_bitset(mq->hotspot_hit_bits); ··· 1334 1290 kfree(mq); 1335 1291 } 1336 1292 1337 - static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock, 1338 - bool can_block, bool can_migrate, bool fast_promote, 1339 - struct bio *bio, struct policy_locker *locker, 1340 - struct 
policy_result *result) 1293 + /*----------------------------------------------------------------*/ 1294 + 1295 + static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock, 1296 + int data_dir, bool fast_copy, 1297 + struct policy_work **work, bool *background_work) 1341 1298 { 1342 - int r; 1343 - unsigned long flags; 1344 - struct smq_policy *mq = to_smq_policy(p); 1299 + struct entry *e, *hs_e; 1300 + enum promote_result pr; 1345 1301 1346 - result->op = POLICY_MISS; 1302 + *background_work = false; 1347 1303 1348 - spin_lock_irqsave(&mq->lock, flags); 1349 - r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result); 1350 - spin_unlock_irqrestore(&mq->lock, flags); 1351 - 1352 - return r; 1353 - } 1354 - 1355 - static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) 1356 - { 1357 - int r; 1358 - unsigned long flags; 1359 - struct smq_policy *mq = to_smq_policy(p); 1360 - struct entry *e; 1361 - 1362 - spin_lock_irqsave(&mq->lock, flags); 1363 1304 e = h_lookup(&mq->table, oblock); 1364 1305 if (e) { 1306 + stats_level_accessed(&mq->cache_stats, e->level); 1307 + 1308 + requeue(mq, e); 1365 1309 *cblock = infer_cblock(mq, e); 1366 - r = 0; 1367 - } else 1368 - r = -ENOENT; 1310 + return 0; 1311 + 1312 + } else { 1313 + stats_miss(&mq->cache_stats); 1314 + 1315 + /* 1316 + * The hotspot queue only gets updated with misses. 
1317 + */ 1318 + hs_e = update_hotspot_queue(mq, oblock); 1319 + 1320 + pr = should_promote(mq, hs_e, data_dir, fast_copy); 1321 + if (pr != PROMOTE_NOT) { 1322 + queue_promotion(mq, oblock, work); 1323 + *background_work = true; 1324 + } 1325 + 1326 + return -ENOENT; 1327 + } 1328 + } 1329 + 1330 + static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, 1331 + int data_dir, bool fast_copy, 1332 + bool *background_work) 1333 + { 1334 + int r; 1335 + unsigned long flags; 1336 + struct smq_policy *mq = to_smq_policy(p); 1337 + 1338 + spin_lock_irqsave(&mq->lock, flags); 1339 + r = __lookup(mq, oblock, cblock, 1340 + data_dir, fast_copy, 1341 + NULL, background_work); 1369 1342 spin_unlock_irqrestore(&mq->lock, flags); 1370 1343 1371 1344 return r; 1372 1345 } 1373 1346 1374 - static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set) 1347 + static int smq_lookup_with_work(struct dm_cache_policy *p, 1348 + dm_oblock_t oblock, dm_cblock_t *cblock, 1349 + int data_dir, bool fast_copy, 1350 + struct policy_work **work) 1375 1351 { 1376 - struct entry *e; 1352 + int r; 1353 + bool background_queued; 1354 + unsigned long flags; 1355 + struct smq_policy *mq = to_smq_policy(p); 1377 1356 1378 - e = h_lookup(&mq->table, oblock); 1379 - BUG_ON(!e); 1357 + spin_lock_irqsave(&mq->lock, flags); 1358 + r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued); 1359 + spin_unlock_irqrestore(&mq->lock, flags); 1380 1360 1381 - del(mq, e); 1382 - e->dirty = set; 1383 - push(mq, e); 1361 + return r; 1384 1362 } 1385 1363 1386 - static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 1364 + static int smq_get_background_work(struct dm_cache_policy *p, bool idle, 1365 + struct policy_work **result) 1366 + { 1367 + int r; 1368 + unsigned long flags; 1369 + struct smq_policy *mq = to_smq_policy(p); 1370 + 1371 + spin_lock_irqsave(&mq->lock, flags); 1372 + r = btracker_issue(mq->bg_work, 
result); 1373 + if (r == -ENODATA) { 1374 + /* find some writeback work to do */ 1375 + if (mq->migrations_allowed && !free_target_met(mq, idle)) 1376 + queue_demotion(mq); 1377 + 1378 + else if (!clean_target_met(mq, idle)) 1379 + queue_writeback(mq); 1380 + 1381 + r = btracker_issue(mq->bg_work, result); 1382 + } 1383 + spin_unlock_irqrestore(&mq->lock, flags); 1384 + 1385 + return r; 1386 + } 1387 + 1388 + /* 1389 + * We need to clear any pending work flags that have been set, and in the 1390 + * case of promotion free the entry for the destination cblock. 1391 + */ 1392 + static void __complete_background_work(struct smq_policy *mq, 1393 + struct policy_work *work, 1394 + bool success) 1395 + { 1396 + struct entry *e = get_entry(&mq->cache_alloc, 1397 + from_cblock(work->cblock)); 1398 + 1399 + switch (work->op) { 1400 + case POLICY_PROMOTE: 1401 + // !h, !q, a 1402 + clear_pending(mq, e); 1403 + if (success) { 1404 + e->oblock = work->oblock; 1405 + push(mq, e); 1406 + // h, q, a 1407 + } else { 1408 + free_entry(&mq->cache_alloc, e); 1409 + // !h, !q, !a 1410 + } 1411 + break; 1412 + 1413 + case POLICY_DEMOTE: 1414 + // h, !q, a 1415 + if (success) { 1416 + h_remove(&mq->table, e); 1417 + free_entry(&mq->cache_alloc, e); 1418 + // !h, !q, !a 1419 + } else { 1420 + clear_pending(mq, e); 1421 + push_queue(mq, e); 1422 + // h, q, a 1423 + } 1424 + break; 1425 + 1426 + case POLICY_WRITEBACK: 1427 + // h, !q, a 1428 + clear_pending(mq, e); 1429 + push_queue(mq, e); 1430 + // h, q, a 1431 + break; 1432 + } 1433 + 1434 + btracker_complete(mq->bg_work, work); 1435 + } 1436 + 1437 + static void smq_complete_background_work(struct dm_cache_policy *p, 1438 + struct policy_work *work, 1439 + bool success) 1387 1440 { 1388 1441 unsigned long flags; 1389 1442 struct smq_policy *mq = to_smq_policy(p); 1390 1443 1391 1444 spin_lock_irqsave(&mq->lock, flags); 1392 - __smq_set_clear_dirty(mq, oblock, true); 1445 + __complete_background_work(mq, work, success); 1393 1446 
spin_unlock_irqrestore(&mq->lock, flags); 1394 1447 } 1395 1448 1396 - static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 1449 + // in_hash(oblock) -> in_hash(oblock) 1450 + static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set) 1451 + { 1452 + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); 1453 + 1454 + if (e->pending_work) 1455 + e->dirty = set; 1456 + else { 1457 + del_queue(mq, e); 1458 + e->dirty = set; 1459 + push_queue(mq, e); 1460 + } 1461 + } 1462 + 1463 + static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) 1464 + { 1465 + unsigned long flags; 1466 + struct smq_policy *mq = to_smq_policy(p); 1467 + 1468 + spin_lock_irqsave(&mq->lock, flags); 1469 + __smq_set_clear_dirty(mq, cblock, true); 1470 + spin_unlock_irqrestore(&mq->lock, flags); 1471 + } 1472 + 1473 + static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) 1397 1474 { 1398 1475 struct smq_policy *mq = to_smq_policy(p); 1399 1476 unsigned long flags; 1400 1477 1401 1478 spin_lock_irqsave(&mq->lock, flags); 1402 - __smq_set_clear_dirty(mq, oblock, false); 1479 + __smq_set_clear_dirty(mq, cblock, false); 1403 1480 spin_unlock_irqrestore(&mq->lock, flags); 1404 1481 } 1405 1482 ··· 1531 1366 1532 1367 static int smq_load_mapping(struct dm_cache_policy *p, 1533 1368 dm_oblock_t oblock, dm_cblock_t cblock, 1534 - uint32_t hint, bool hint_valid) 1369 + bool dirty, uint32_t hint, bool hint_valid) 1535 1370 { 1536 1371 struct smq_policy *mq = to_smq_policy(p); 1537 1372 struct entry *e; 1538 1373 1539 1374 e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); 1540 1375 e->oblock = oblock; 1541 - e->dirty = false; /* this gets corrected in a minute */ 1376 + e->dirty = dirty; 1542 1377 e->level = hint_valid ? 
min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); 1543 - push(mq, e); 1378 + e->pending_work = false; 1544 1379 1380 + /* 1381 + * When we load mappings we push ahead of both sentinels in order to 1382 + * allow demotions and cleaning to occur immediately. 1383 + */ 1384 + push_front(mq, e); 1385 + 1386 + return 0; 1387 + } 1388 + 1389 + static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock) 1390 + { 1391 + struct smq_policy *mq = to_smq_policy(p); 1392 + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); 1393 + 1394 + if (!e->allocated) 1395 + return -ENODATA; 1396 + 1397 + // FIXME: what if this block has pending background work? 1398 + del_queue(mq, e); 1399 + h_remove(&mq->table, e); 1400 + free_entry(&mq->cache_alloc, e); 1545 1401 return 0; 1546 1402 } 1547 1403 ··· 1575 1389 return 0; 1576 1390 1577 1391 return e->level; 1578 - } 1579 - 1580 - static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock) 1581 - { 1582 - struct entry *e; 1583 - 1584 - e = h_lookup(&mq->table, oblock); 1585 - BUG_ON(!e); 1586 - 1587 - del(mq, e); 1588 - free_entry(&mq->cache_alloc, e); 1589 - } 1590 - 1591 - static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 1592 - { 1593 - struct smq_policy *mq = to_smq_policy(p); 1594 - unsigned long flags; 1595 - 1596 - spin_lock_irqsave(&mq->lock, flags); 1597 - __remove_mapping(mq, oblock); 1598 - spin_unlock_irqrestore(&mq->lock, flags); 1599 - } 1600 - 1601 - static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock) 1602 - { 1603 - struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); 1604 - 1605 - if (!e || !e->allocated) 1606 - return -ENODATA; 1607 - 1608 - del(mq, e); 1609 - free_entry(&mq->cache_alloc, e); 1610 - 1611 - return 0; 1612 - } 1613 - 1614 - static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) 1615 - { 1616 - int r; 1617 - unsigned long flags; 1618 - struct smq_policy *mq = 
to_smq_policy(p); 1619 - 1620 - spin_lock_irqsave(&mq->lock, flags); 1621 - r = __remove_cblock(mq, cblock); 1622 - spin_unlock_irqrestore(&mq->lock, flags); 1623 - 1624 - return r; 1625 - } 1626 - 1627 - 1628 - #define CLEAN_TARGET_CRITICAL 5u /* percent */ 1629 - 1630 - static bool clean_target_met(struct smq_policy *mq, bool critical) 1631 - { 1632 - if (critical) { 1633 - /* 1634 - * Cache entries may not be populated. So we're cannot rely on the 1635 - * size of the clean queue. 1636 - */ 1637 - unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); 1638 - unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u; 1639 - 1640 - return nr_clean >= target; 1641 - } else 1642 - return !q_size(&mq->dirty); 1643 - } 1644 - 1645 - static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock, 1646 - dm_cblock_t *cblock, bool critical_only) 1647 - { 1648 - struct entry *e = NULL; 1649 - bool target_met = clean_target_met(mq, critical_only); 1650 - 1651 - if (critical_only) 1652 - /* 1653 - * Always try and keep the bottom level clean. 1654 - */ 1655 - e = pop_old(mq, &mq->dirty, target_met ? 
1u : mq->dirty.nr_levels); 1656 - 1657 - else 1658 - e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels); 1659 - 1660 - if (!e) 1661 - return -ENODATA; 1662 - 1663 - *oblock = e->oblock; 1664 - *cblock = infer_cblock(mq, e); 1665 - e->dirty = false; 1666 - push_new(mq, e); 1667 - 1668 - return 0; 1669 - } 1670 - 1671 - static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, 1672 - dm_cblock_t *cblock, bool critical_only) 1673 - { 1674 - int r; 1675 - unsigned long flags; 1676 - struct smq_policy *mq = to_smq_policy(p); 1677 - 1678 - spin_lock_irqsave(&mq->lock, flags); 1679 - r = __smq_writeback_work(mq, oblock, cblock, critical_only); 1680 - spin_unlock_irqrestore(&mq->lock, flags); 1681 - 1682 - return r; 1683 - } 1684 - 1685 - static void __force_mapping(struct smq_policy *mq, 1686 - dm_oblock_t current_oblock, dm_oblock_t new_oblock) 1687 - { 1688 - struct entry *e = h_lookup(&mq->table, current_oblock); 1689 - 1690 - if (e) { 1691 - del(mq, e); 1692 - e->oblock = new_oblock; 1693 - e->dirty = true; 1694 - push(mq, e); 1695 - } 1696 - } 1697 - 1698 - static void smq_force_mapping(struct dm_cache_policy *p, 1699 - dm_oblock_t current_oblock, dm_oblock_t new_oblock) 1700 - { 1701 - unsigned long flags; 1702 - struct smq_policy *mq = to_smq_policy(p); 1703 - 1704 - spin_lock_irqsave(&mq->lock, flags); 1705 - __force_mapping(mq, current_oblock, new_oblock); 1706 - spin_unlock_irqrestore(&mq->lock, flags); 1707 1392 } 1708 1393 1709 1394 static dm_cblock_t smq_residency(struct dm_cache_policy *p) ··· 1601 1544 end_hotspot_period(mq); 1602 1545 end_cache_period(mq); 1603 1546 spin_unlock_irqrestore(&mq->lock, flags); 1547 + } 1548 + 1549 + static void smq_allow_migrations(struct dm_cache_policy *p, bool allow) 1550 + { 1551 + struct smq_policy *mq = to_smq_policy(p); 1552 + mq->migrations_allowed = allow; 1604 1553 } 1605 1554 1606 1555 /* ··· 1653 1590 static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) 1654 1591 { 1655 1592 
mq->policy.destroy = smq_destroy; 1656 - mq->policy.map = smq_map; 1657 1593 mq->policy.lookup = smq_lookup; 1594 + mq->policy.lookup_with_work = smq_lookup_with_work; 1595 + mq->policy.get_background_work = smq_get_background_work; 1596 + mq->policy.complete_background_work = smq_complete_background_work; 1658 1597 mq->policy.set_dirty = smq_set_dirty; 1659 1598 mq->policy.clear_dirty = smq_clear_dirty; 1660 1599 mq->policy.load_mapping = smq_load_mapping; 1600 + mq->policy.invalidate_mapping = smq_invalidate_mapping; 1661 1601 mq->policy.get_hint = smq_get_hint; 1662 - mq->policy.remove_mapping = smq_remove_mapping; 1663 - mq->policy.remove_cblock = smq_remove_cblock; 1664 - mq->policy.writeback_work = smq_writeback_work; 1665 - mq->policy.force_mapping = smq_force_mapping; 1666 1602 mq->policy.residency = smq_residency; 1667 1603 mq->policy.tick = smq_tick; 1604 + mq->policy.allow_migrations = smq_allow_migrations; 1668 1605 1669 1606 if (mimic_mq) { 1670 1607 mq->policy.set_config_value = mq_set_config_value; ··· 1696 1633 static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, 1697 1634 sector_t origin_size, 1698 1635 sector_t cache_block_size, 1699 - bool mimic_mq) 1636 + bool mimic_mq, 1637 + bool migrations_allowed) 1700 1638 { 1701 1639 unsigned i; 1702 1640 unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; ··· 1722 1658 } 1723 1659 1724 1660 init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue); 1725 - for (i = 0; i < nr_sentinels_per_queue; i++) 1661 + for (i = 0; i < nr_sentinels_per_queue; i++) 1726 1662 get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true; 1727 1663 1728 1664 init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels); 1729 - for (i = 0; i < nr_sentinels_per_queue; i++) 1665 + for (i = 0; i < nr_sentinels_per_queue; i++) 1730 1666 get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true; 1731 1667 1732 1668 init_allocator(&mq->hotspot_alloc, 
&mq->es, total_sentinels, ··· 1779 1715 mq->next_hotspot_period = jiffies; 1780 1716 mq->next_cache_period = jiffies; 1781 1717 1718 + mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */ 1719 + if (!mq->bg_work) 1720 + goto bad_btracker; 1721 + 1722 + mq->migrations_allowed = migrations_allowed; 1723 + 1782 1724 return &mq->policy; 1783 1725 1726 + bad_btracker: 1727 + h_exit(&mq->hotspot_table); 1784 1728 bad_alloc_hotspot_table: 1785 1729 h_exit(&mq->table); 1786 1730 bad_alloc_table: ··· 1807 1735 sector_t origin_size, 1808 1736 sector_t cache_block_size) 1809 1737 { 1810 - return __smq_create(cache_size, origin_size, cache_block_size, false); 1738 + return __smq_create(cache_size, origin_size, cache_block_size, false, true); 1811 1739 } 1812 1740 1813 1741 static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, 1814 1742 sector_t origin_size, 1815 1743 sector_t cache_block_size) 1816 1744 { 1817 - return __smq_create(cache_size, origin_size, cache_block_size, true); 1745 + return __smq_create(cache_size, origin_size, cache_block_size, true, true); 1746 + } 1747 + 1748 + static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size, 1749 + sector_t origin_size, 1750 + sector_t cache_block_size) 1751 + { 1752 + return __smq_create(cache_size, origin_size, cache_block_size, false, false); 1818 1753 } 1819 1754 1820 1755 /*----------------------------------------------------------------*/ 1821 1756 1822 1757 static struct dm_cache_policy_type smq_policy_type = { 1823 1758 .name = "smq", 1824 - .version = {1, 5, 0}, 1759 + .version = {2, 0, 0}, 1825 1760 .hint_size = 4, 1826 1761 .owner = THIS_MODULE, 1827 1762 .create = smq_create ··· 1836 1757 1837 1758 static struct dm_cache_policy_type mq_policy_type = { 1838 1759 .name = "mq", 1839 - .version = {1, 5, 0}, 1760 + .version = {2, 0, 0}, 1840 1761 .hint_size = 4, 1841 1762 .owner = THIS_MODULE, 1842 1763 .create = mq_create, 1843 1764 }; 1844 1765 1766 + static struct 
dm_cache_policy_type cleaner_policy_type = { 1767 + .name = "cleaner", 1768 + .version = {2, 0, 0}, 1769 + .hint_size = 4, 1770 + .owner = THIS_MODULE, 1771 + .create = cleaner_create, 1772 + }; 1773 + 1845 1774 static struct dm_cache_policy_type default_policy_type = { 1846 1775 .name = "default", 1847 - .version = {1, 5, 0}, 1776 + .version = {2, 0, 0}, 1848 1777 .hint_size = 4, 1849 1778 .owner = THIS_MODULE, 1850 1779 .create = smq_create, ··· 1872 1785 r = dm_cache_policy_register(&mq_policy_type); 1873 1786 if (r) { 1874 1787 DMERR("register failed (as mq) %d", r); 1875 - dm_cache_policy_unregister(&smq_policy_type); 1876 - return -ENOMEM; 1788 + goto out_mq; 1789 + } 1790 + 1791 + r = dm_cache_policy_register(&cleaner_policy_type); 1792 + if (r) { 1793 + DMERR("register failed (as cleaner) %d", r); 1794 + goto out_cleaner; 1877 1795 } 1878 1796 1879 1797 r = dm_cache_policy_register(&default_policy_type); 1880 1798 if (r) { 1881 1799 DMERR("register failed (as default) %d", r); 1882 - dm_cache_policy_unregister(&mq_policy_type); 1883 - dm_cache_policy_unregister(&smq_policy_type); 1884 - return -ENOMEM; 1800 + goto out_default; 1885 1801 } 1886 1802 1887 1803 return 0; 1804 + 1805 + out_default: 1806 + dm_cache_policy_unregister(&cleaner_policy_type); 1807 + out_cleaner: 1808 + dm_cache_policy_unregister(&mq_policy_type); 1809 + out_mq: 1810 + dm_cache_policy_unregister(&smq_policy_type); 1811 + 1812 + return -ENOMEM; 1888 1813 } 1889 1814 1890 1815 static void __exit smq_exit(void) 1891 1816 { 1817 + dm_cache_policy_unregister(&cleaner_policy_type); 1892 1818 dm_cache_policy_unregister(&smq_policy_type); 1893 1819 dm_cache_policy_unregister(&mq_policy_type); 1894 1820 dm_cache_policy_unregister(&default_policy_type); ··· 1916 1816 1917 1817 MODULE_ALIAS("dm-cache-default"); 1918 1818 MODULE_ALIAS("dm-cache-mq"); 1819 + MODULE_ALIAS("dm-cache-cleaner");
+59 -138
drivers/md/dm-cache-policy.h
··· 13 13 14 14 /*----------------------------------------------------------------*/ 15 15 16 - /* FIXME: make it clear which methods are optional. Get debug policy to 17 - * double check this at start. 18 - */ 19 - 20 16 /* 21 17 * The cache policy makes the important decisions about which blocks get to 22 18 * live on the faster cache device. 23 - * 24 - * When the core target has to remap a bio it calls the 'map' method of the 25 - * policy. This returns an instruction telling the core target what to do. 26 - * 27 - * POLICY_HIT: 28 - * That block is in the cache. Remap to the cache and carry on. 29 - * 30 - * POLICY_MISS: 31 - * This block is on the origin device. Remap and carry on. 32 - * 33 - * POLICY_NEW: 34 - * This block is currently on the origin device, but the policy wants to 35 - * move it. The core should: 36 - * 37 - * - hold any further io to this origin block 38 - * - copy the origin to the given cache block 39 - * - release all the held blocks 40 - * - remap the original block to the cache 41 - * 42 - * POLICY_REPLACE: 43 - * This block is currently on the origin device. The policy wants to 44 - * move it to the cache, with the added complication that the destination 45 - * cache block needs a writeback first. The core should: 46 - * 47 - * - hold any further io to this origin block 48 - * - hold any further io to the origin block that's being written back 49 - * - writeback 50 - * - copy new block to cache 51 - * - release held blocks 52 - * - remap bio to cache and reissue. 53 - * 54 - * Should the core run into trouble while processing a POLICY_NEW or 55 - * POLICY_REPLACE instruction it will roll back the policies mapping using 56 - * remove_mapping() or force_mapping(). These methods must not fail. This 57 - * approach avoids having transactional semantics in the policy (ie, the 58 - * core informing the policy when a migration is complete), and hence makes 59 - * it easier to write new policies. 
60 - * 61 - * In general policy methods should never block, except in the case of the 62 - * map function when can_migrate is set. So be careful to implement using 63 - * bounded, preallocated memory. 64 19 */ 65 20 enum policy_operation { 66 - POLICY_HIT, 67 - POLICY_MISS, 68 - POLICY_NEW, 69 - POLICY_REPLACE 70 - }; 71 - 72 - /* 73 - * When issuing a POLICY_REPLACE the policy needs to make a callback to 74 - * lock the block being demoted. This doesn't need to occur during a 75 - * writeback operation since the block remains in the cache. 76 - */ 77 - struct policy_locker; 78 - typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock); 79 - 80 - struct policy_locker { 81 - policy_lock_fn fn; 21 + POLICY_PROMOTE, 22 + POLICY_DEMOTE, 23 + POLICY_WRITEBACK 82 24 }; 83 25 84 26 /* 85 27 * This is the instruction passed back to the core target. 86 28 */ 87 - struct policy_result { 29 + struct policy_work { 88 30 enum policy_operation op; 89 - dm_oblock_t old_oblock; /* POLICY_REPLACE */ 90 - dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ 31 + dm_oblock_t oblock; 32 + dm_cblock_t cblock; 91 33 }; 92 34 93 35 /* 94 - * The cache policy object. Just a bunch of methods. It is envisaged that 95 - * this structure will be embedded in a bigger, policy specific structure 96 - * (ie. use container_of()). 36 + * The cache policy object. It is envisaged that this structure will be 37 + * embedded in a bigger, policy specific structure (ie. use container_of()). 97 38 */ 98 39 struct dm_cache_policy { 99 - 100 - /* 101 - * FIXME: make it clear which methods are optional, and which may 102 - * block. 103 - */ 104 - 105 40 /* 106 41 * Destroys this object. 107 42 */ 108 43 void (*destroy)(struct dm_cache_policy *p); 109 44 110 45 /* 111 - * See large comment above. 112 - * 113 - * oblock - the origin block we're interested in. 114 - * 115 - * can_block - indicates whether the current thread is allowed to 116 - * block. 
-EWOULDBLOCK returned if it can't and would. 117 - * 118 - * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE 119 - * instructions. If denied and the policy would have 120 - * returned one of these instructions it should 121 - * return -EWOULDBLOCK. 122 - * 123 - * discarded_oblock - indicates whether the whole origin block is 124 - * in a discarded state (FIXME: better to tell the 125 - * policy about this sooner, so it can recycle that 126 - * cache block if it wants.) 127 - * bio - the bio that triggered this call. 128 - * result - gets filled in with the instruction. 129 - * 130 - * May only return 0, or -EWOULDBLOCK (if !can_migrate) 131 - */ 132 - int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock, 133 - bool can_block, bool can_migrate, bool discarded_oblock, 134 - struct bio *bio, struct policy_locker *locker, 135 - struct policy_result *result); 136 - 137 - /* 138 - * Sometimes we want to see if a block is in the cache, without 139 - * triggering any update of stats. (ie. it's not a real hit). 46 + * Find the location of a block. 140 47 * 141 48 * Must not block. 142 49 * 143 - * Returns 0 if in cache, -ENOENT if not, < 0 for other errors 144 - * (-EWOULDBLOCK would be typical). 50 + * Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for 51 + * other errors (-EWOULDBLOCK would be typical). data_dir should be 52 + * READ or WRITE. fast_copy should be set if migrating this block would 53 + * be 'cheap' somehow (eg, discarded data). background_queued will be set 54 + * if a migration has just been queued. 
145 55 */ 146 - int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); 56 + int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, 57 + int data_dir, bool fast_copy, bool *background_queued); 147 58 148 - void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 149 - void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 59 + /* 60 + * Sometimes the core target can optimise a migration, eg, the 61 + * block may be discarded, or the bio may cover an entire block. 62 + * In order to optimise it needs the migration immediately though 63 + * so it knows to do something different with the bio. 64 + * 65 + * This method is optional (policy-internal will fallback to using 66 + * lookup). 67 + */ 68 + int (*lookup_with_work)(struct dm_cache_policy *p, 69 + dm_oblock_t oblock, dm_cblock_t *cblock, 70 + int data_dir, bool fast_copy, 71 + struct policy_work **work); 72 + 73 + /* 74 + * Retrieves background work. Returns -ENODATA when there's no 75 + * background work. 76 + */ 77 + int (*get_background_work)(struct dm_cache_policy *p, bool idle, 78 + struct policy_work **result); 79 + 80 + /* 81 + * You must pass in the same work pointer that you were given, not 82 + * a copy. 83 + */ 84 + void (*complete_background_work)(struct dm_cache_policy *p, 85 + struct policy_work *work, 86 + bool success); 87 + 88 + void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); 89 + void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); 150 90 151 91 /* 152 92 * Called when a cache target is first created. Used to load a 153 93 * mapping from the metadata device into the policy. 154 94 */ 155 95 int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, 156 - dm_cblock_t cblock, uint32_t hint, bool hint_valid); 96 + dm_cblock_t cblock, bool dirty, 97 + uint32_t hint, bool hint_valid); 98 + 99 + /* 100 + * Drops the mapping, irrespective of whether it's clean or dirty. 
101 + * Returns -ENODATA if cblock is not mapped. 102 + */ 103 + int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock); 157 104 158 105 /* 159 106 * Gets the hint for a given cblock. Called in a single threaded 160 107 * context. So no locking required. 161 108 */ 162 109 uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); 163 - 164 - /* 165 - * Override functions used on the error paths of the core target. 166 - * They must succeed. 167 - */ 168 - void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock); 169 - void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, 170 - dm_oblock_t new_oblock); 171 - 172 - /* 173 - * This is called via the invalidate_cblocks message. It is 174 - * possible the particular cblock has already been removed due to a 175 - * write io in passthrough mode. In which case this should return 176 - * -ENODATA. 177 - */ 178 - int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); 179 - 180 - /* 181 - * Provide a dirty block to be written back by the core target. If 182 - * critical_only is set then the policy should only provide work if 183 - * it urgently needs it. 184 - * 185 - * Returns: 186 - * 187 - * 0 and @cblock,@oblock: block to write back provided 188 - * 189 - * -ENODATA: no dirty blocks available 190 - */ 191 - int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock, 192 - bool critical_only); 193 110 194 111 /* 195 112 * How full is the cache? ··· 119 202 * queue merging has occurred). To stop the policy being fooled by 120 203 * these, the core target sends regular tick() calls to the policy. 121 204 * The policy should only count an entry as hit once per tick. 205 + * 206 + * This method is optional. 
122 207 */ 123 208 void (*tick)(struct dm_cache_policy *p, bool can_block); 124 209 ··· 131 212 unsigned maxlen, ssize_t *sz_ptr); 132 213 int (*set_config_value)(struct dm_cache_policy *p, 133 214 const char *key, const char *value); 215 + 216 + void (*allow_migrations)(struct dm_cache_policy *p, bool allow); 134 217 135 218 /* 136 219 * Book keeping ptr for the policy register, not for general use.
+1208 -1515
drivers/md/dm-cache-target.c
··· 5 5 */ 6 6 7 7 #include "dm.h" 8 - #include "dm-bio-prison-v1.h" 8 + #include "dm-bio-prison-v2.h" 9 9 #include "dm-bio-record.h" 10 10 #include "dm-cache-metadata.h" 11 11 ··· 15 15 #include <linux/init.h> 16 16 #include <linux/mempool.h> 17 17 #include <linux/module.h> 18 + #include <linux/rwsem.h> 18 19 #include <linux/slab.h> 19 20 #include <linux/vmalloc.h> 20 21 ··· 26 25 27 26 /*----------------------------------------------------------------*/ 28 27 29 - #define IOT_RESOLUTION 4 28 + /* 29 + * Glossary: 30 + * 31 + * oblock: index of an origin block 32 + * cblock: index of a cache block 33 + * promotion: movement of a block from origin to cache 34 + * demotion: movement of a block from cache to origin 35 + * migration: movement of a block between the origin and cache device, 36 + * either direction 37 + */ 38 + 39 + /*----------------------------------------------------------------*/ 30 40 31 41 struct io_tracker { 32 42 spinlock_t lock; ··· 111 99 /*----------------------------------------------------------------*/ 112 100 113 101 /* 114 - * Glossary: 115 - * 116 - * oblock: index of an origin block 117 - * cblock: index of a cache block 118 - * promotion: movement of a block from origin to cache 119 - * demotion: movement of a block from cache to origin 120 - * migration: movement of a block between the origin and cache device, 121 - * either direction 102 + * Represents a chunk of future work. 'input' allows continuations to pass 103 + * values between themselves, typically error values. 
122 104 */ 105 + struct continuation { 106 + struct work_struct ws; 107 + int input; 108 + }; 109 + 110 + static inline void init_continuation(struct continuation *k, 111 + void (*fn)(struct work_struct *)) 112 + { 113 + INIT_WORK(&k->ws, fn); 114 + k->input = 0; 115 + } 116 + 117 + static inline void queue_continuation(struct workqueue_struct *wq, 118 + struct continuation *k) 119 + { 120 + queue_work(wq, &k->ws); 121 + } 123 122 124 123 /*----------------------------------------------------------------*/ 124 + 125 + /* 126 + * The batcher collects together pieces of work that need a particular 127 + * operation to occur before they can proceed (typically a commit). 128 + */ 129 + struct batcher { 130 + /* 131 + * The operation that everyone is waiting for. 132 + */ 133 + int (*commit_op)(void *context); 134 + void *commit_context; 135 + 136 + /* 137 + * This is how bios should be issued once the commit op is complete 138 + * (accounted_request). 139 + */ 140 + void (*issue_op)(struct bio *bio, void *context); 141 + void *issue_context; 142 + 143 + /* 144 + * Queued work gets put on here after commit. 145 + */ 146 + struct workqueue_struct *wq; 147 + 148 + spinlock_t lock; 149 + struct list_head work_items; 150 + struct bio_list bios; 151 + struct work_struct commit_work; 152 + 153 + bool commit_scheduled; 154 + }; 155 + 156 + static void __commit(struct work_struct *_ws) 157 + { 158 + struct batcher *b = container_of(_ws, struct batcher, commit_work); 159 + 160 + int r; 161 + unsigned long flags; 162 + struct list_head work_items; 163 + struct work_struct *ws, *tmp; 164 + struct continuation *k; 165 + struct bio *bio; 166 + struct bio_list bios; 167 + 168 + INIT_LIST_HEAD(&work_items); 169 + bio_list_init(&bios); 170 + 171 + /* 172 + * We have to grab these before the commit_op to avoid a race 173 + * condition. 
174 + */ 175 + spin_lock_irqsave(&b->lock, flags); 176 + list_splice_init(&b->work_items, &work_items); 177 + bio_list_merge(&bios, &b->bios); 178 + bio_list_init(&b->bios); 179 + b->commit_scheduled = false; 180 + spin_unlock_irqrestore(&b->lock, flags); 181 + 182 + r = b->commit_op(b->commit_context); 183 + 184 + list_for_each_entry_safe(ws, tmp, &work_items, entry) { 185 + k = container_of(ws, struct continuation, ws); 186 + k->input = r; 187 + INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 188 + queue_work(b->wq, ws); 189 + } 190 + 191 + while ((bio = bio_list_pop(&bios))) { 192 + if (r) { 193 + bio->bi_error = r; 194 + bio_endio(bio); 195 + } else 196 + b->issue_op(bio, b->issue_context); 197 + } 198 + } 199 + 200 + static void batcher_init(struct batcher *b, 201 + int (*commit_op)(void *), 202 + void *commit_context, 203 + void (*issue_op)(struct bio *bio, void *), 204 + void *issue_context, 205 + struct workqueue_struct *wq) 206 + { 207 + b->commit_op = commit_op; 208 + b->commit_context = commit_context; 209 + b->issue_op = issue_op; 210 + b->issue_context = issue_context; 211 + b->wq = wq; 212 + 213 + spin_lock_init(&b->lock); 214 + INIT_LIST_HEAD(&b->work_items); 215 + bio_list_init(&b->bios); 216 + INIT_WORK(&b->commit_work, __commit); 217 + b->commit_scheduled = false; 218 + } 219 + 220 + static void async_commit(struct batcher *b) 221 + { 222 + queue_work(b->wq, &b->commit_work); 223 + } 224 + 225 + static void continue_after_commit(struct batcher *b, struct continuation *k) 226 + { 227 + unsigned long flags; 228 + bool commit_scheduled; 229 + 230 + spin_lock_irqsave(&b->lock, flags); 231 + commit_scheduled = b->commit_scheduled; 232 + list_add_tail(&k->ws.entry, &b->work_items); 233 + spin_unlock_irqrestore(&b->lock, flags); 234 + 235 + if (commit_scheduled) 236 + async_commit(b); 237 + } 238 + 239 + /* 240 + * Bios are errored if commit failed. 
241 + */ 242 + static void issue_after_commit(struct batcher *b, struct bio *bio) 243 + { 244 + unsigned long flags; 245 + bool commit_scheduled; 246 + 247 + spin_lock_irqsave(&b->lock, flags); 248 + commit_scheduled = b->commit_scheduled; 249 + bio_list_add(&b->bios, bio); 250 + spin_unlock_irqrestore(&b->lock, flags); 251 + 252 + if (commit_scheduled) 253 + async_commit(b); 254 + } 255 + 256 + /* 257 + * Call this if some urgent work is waiting for the commit to complete. 258 + */ 259 + static void schedule_commit(struct batcher *b) 260 + { 261 + bool immediate; 262 + unsigned long flags; 263 + 264 + spin_lock_irqsave(&b->lock, flags); 265 + immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); 266 + b->commit_scheduled = true; 267 + spin_unlock_irqrestore(&b->lock, flags); 268 + 269 + if (immediate) 270 + async_commit(b); 271 + } 125 272 126 273 /* 127 274 * There are a couple of places where we let a bio run, but want to do some ··· 360 189 atomic_t write_miss; 361 190 atomic_t demotion; 362 191 atomic_t promotion; 192 + atomic_t writeback; 363 193 atomic_t copies_avoided; 364 194 atomic_t cache_cell_clash; 365 195 atomic_t commit_count; 366 196 atomic_t discard_count; 367 - }; 368 - 369 - /* 370 - * Defines a range of cblocks, begin to (end - 1) are in the range. end is 371 - * the one-past-the-end value. 
372 - */ 373 - struct cblock_range { 374 - dm_cblock_t begin; 375 - dm_cblock_t end; 376 - }; 377 - 378 - struct invalidation_request { 379 - struct list_head list; 380 - struct cblock_range *cblocks; 381 - 382 - atomic_t complete; 383 - int err; 384 - 385 - wait_queue_head_t result_wait; 386 197 }; 387 198 388 199 struct cache { ··· 408 255 spinlock_t lock; 409 256 struct list_head deferred_cells; 410 257 struct bio_list deferred_bios; 411 - struct bio_list deferred_flush_bios; 412 258 struct bio_list deferred_writethrough_bios; 413 - struct list_head quiesced_migrations; 414 - struct list_head completed_migrations; 415 - struct list_head need_commit_migrations; 416 259 sector_t migration_threshold; 417 260 wait_queue_head_t migration_wait; 418 261 atomic_t nr_allocated_migrations; ··· 419 270 */ 420 271 atomic_t nr_io_migrations; 421 272 422 - wait_queue_head_t quiescing_wait; 423 - atomic_t quiescing; 424 - atomic_t quiescing_ack; 273 + struct rw_semaphore quiesce_lock; 425 274 426 275 /* 427 276 * cache_size entries, dirty if set ··· 443 296 444 297 struct dm_kcopyd_client *copier; 445 298 struct workqueue_struct *wq; 446 - struct work_struct worker; 447 - 299 + struct work_struct deferred_bio_worker; 300 + struct work_struct deferred_writethrough_worker; 301 + struct work_struct migration_worker; 448 302 struct delayed_work waker; 449 - unsigned long last_commit_jiffies; 450 - 451 - struct dm_bio_prison *prison; 452 - struct dm_deferred_set *all_io_ds; 303 + struct dm_bio_prison_v2 *prison; 453 304 454 305 mempool_t *migration_pool; 455 306 ··· 475 330 struct list_head invalidation_requests; 476 331 477 332 struct io_tracker origin_tracker; 333 + 334 + struct work_struct commit_ws; 335 + struct batcher committer; 336 + 337 + struct rw_semaphore background_work_lock; 478 338 }; 479 339 480 340 struct per_bio_data { 481 341 bool tick:1; 482 342 unsigned req_nr:2; 483 - struct dm_deferred_entry *all_io_entry; 343 + struct dm_bio_prison_cell_v2 *cell; 484 344 
struct dm_hook_info hook_info; 485 345 sector_t len; 486 346 ··· 500 350 }; 501 351 502 352 struct dm_cache_migration { 503 - struct list_head list; 353 + struct continuation k; 504 354 struct cache *cache; 505 355 506 - unsigned long start_jiffies; 507 - dm_oblock_t old_oblock; 508 - dm_oblock_t new_oblock; 509 - dm_cblock_t cblock; 356 + struct policy_work *op; 357 + struct bio *overwrite_bio; 358 + struct dm_bio_prison_cell_v2 *cell; 510 359 511 - bool err:1; 512 - bool discard:1; 513 - bool writeback:1; 514 - bool demote:1; 515 - bool promote:1; 516 - bool requeue_holder:1; 517 - bool invalidate:1; 518 - 519 - struct dm_bio_prison_cell *old_ocell; 520 - struct dm_bio_prison_cell *new_ocell; 360 + dm_cblock_t invalidate_cblock; 361 + dm_oblock_t invalidate_oblock; 521 362 }; 522 363 523 - /* 524 - * Processing a bio in the worker thread may require these memory 525 - * allocations. We prealloc to avoid deadlocks (the same worker thread 526 - * frees them back to the mempool). 527 - */ 528 - struct prealloc { 529 - struct dm_cache_migration *mg; 530 - struct dm_bio_prison_cell *cell1; 531 - struct dm_bio_prison_cell *cell2; 532 - }; 364 + /*----------------------------------------------------------------*/ 533 365 534 - static enum cache_metadata_mode get_cache_mode(struct cache *cache); 535 - 536 - static void wake_worker(struct cache *cache) 366 + static bool writethrough_mode(struct cache_features *f) 537 367 { 538 - queue_work(cache->wq, &cache->worker); 368 + return f->io_mode == CM_IO_WRITETHROUGH; 369 + } 370 + 371 + static bool writeback_mode(struct cache_features *f) 372 + { 373 + return f->io_mode == CM_IO_WRITEBACK; 374 + } 375 + 376 + static inline bool passthrough_mode(struct cache_features *f) 377 + { 378 + return unlikely(f->io_mode == CM_IO_PASSTHROUGH); 539 379 } 540 380 541 381 /*----------------------------------------------------------------*/ 542 382 543 - static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) 383 + static 
void wake_deferred_bio_worker(struct cache *cache) 544 384 { 545 - /* FIXME: change to use a local slab. */ 546 - return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); 385 + queue_work(cache->wq, &cache->deferred_bio_worker); 547 386 } 548 387 549 - static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) 388 + static void wake_deferred_writethrough_worker(struct cache *cache) 550 389 { 551 - dm_bio_prison_free_cell(cache->prison, cell); 390 + queue_work(cache->wq, &cache->deferred_writethrough_worker); 391 + } 392 + 393 + static void wake_migration_worker(struct cache *cache) 394 + { 395 + if (passthrough_mode(&cache->features)) 396 + return; 397 + 398 + queue_work(cache->wq, &cache->migration_worker); 399 + } 400 + 401 + /*----------------------------------------------------------------*/ 402 + 403 + static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 404 + { 405 + return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); 406 + } 407 + 408 + static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 409 + { 410 + dm_bio_prison_free_cell_v2(cache->prison, cell); 552 411 } 553 412 554 413 static struct dm_cache_migration *alloc_migration(struct cache *cache) ··· 583 424 mempool_free(mg, cache->migration_pool); 584 425 } 585 426 586 - static int prealloc_data_structs(struct cache *cache, struct prealloc *p) 587 - { 588 - if (!p->mg) { 589 - p->mg = alloc_migration(cache); 590 - if (!p->mg) 591 - return -ENOMEM; 592 - } 593 - 594 - if (!p->cell1) { 595 - p->cell1 = alloc_prison_cell(cache); 596 - if (!p->cell1) 597 - return -ENOMEM; 598 - } 599 - 600 - if (!p->cell2) { 601 - p->cell2 = alloc_prison_cell(cache); 602 - if (!p->cell2) 603 - return -ENOMEM; 604 - } 605 - 606 - return 0; 607 - } 608 - 609 - static void prealloc_free_structs(struct cache *cache, struct prealloc *p) 610 - { 611 - if (p->cell2) 612 - free_prison_cell(cache, p->cell2); 613 - 614 - if (p->cell1) 615 - 
free_prison_cell(cache, p->cell1); 616 - 617 - if (p->mg) 618 - free_migration(p->mg); 619 - } 620 - 621 - static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) 622 - { 623 - struct dm_cache_migration *mg = p->mg; 624 - 625 - BUG_ON(!mg); 626 - p->mg = NULL; 627 - 628 - return mg; 629 - } 630 - 631 - /* 632 - * You must have a cell within the prealloc struct to return. If not this 633 - * function will BUG() rather than returning NULL. 634 - */ 635 - static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) 636 - { 637 - struct dm_bio_prison_cell *r = NULL; 638 - 639 - if (p->cell1) { 640 - r = p->cell1; 641 - p->cell1 = NULL; 642 - 643 - } else if (p->cell2) { 644 - r = p->cell2; 645 - p->cell2 = NULL; 646 - } else 647 - BUG(); 648 - 649 - return r; 650 - } 651 - 652 - /* 653 - * You can't have more than two cells in a prealloc struct. BUG() will be 654 - * called if you try and overfill. 655 - */ 656 - static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) 657 - { 658 - if (!p->cell2) 659 - p->cell2 = cell; 660 - 661 - else if (!p->cell1) 662 - p->cell1 = cell; 663 - 664 - else 665 - BUG(); 666 - } 667 - 668 427 /*----------------------------------------------------------------*/ 669 428 670 - static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) 429 + static inline dm_oblock_t oblock_succ(dm_oblock_t b) 430 + { 431 + return to_oblock(from_oblock(b) + 1ull); 432 + } 433 + 434 + static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) 671 435 { 672 436 key->virtual = 0; 673 437 key->dev = 0; ··· 599 517 } 600 518 601 519 /* 602 - * The caller hands in a preallocated cell, and a free function for it. 603 - * The cell will be freed if there's an error, or if it wasn't used because 604 - * a cell with that key already exists. 520 + * We have two lock levels. 
Level 0, which is used to prevent WRITEs, and 521 + * level 1 which prevents *both* READs and WRITEs. 605 522 */ 606 - typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); 523 + #define WRITE_LOCK_LEVEL 0 524 + #define READ_WRITE_LOCK_LEVEL 1 607 525 608 - static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, 609 - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 610 - cell_free_fn free_fn, void *free_context, 611 - struct dm_bio_prison_cell **cell_result) 526 + static unsigned lock_level(struct bio *bio) 612 527 { 613 - int r; 614 - struct dm_cell_key key; 615 - 616 - build_key(oblock_begin, oblock_end, &key); 617 - r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 618 - if (r) 619 - free_fn(free_context, cell_prealloc); 620 - 621 - return r; 528 + return bio_data_dir(bio) == WRITE ? 529 + WRITE_LOCK_LEVEL : 530 + READ_WRITE_LOCK_LEVEL; 622 531 } 623 532 624 - static int bio_detain(struct cache *cache, dm_oblock_t oblock, 625 - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 626 - cell_free_fn free_fn, void *free_context, 627 - struct dm_bio_prison_cell **cell_result) 533 + /*---------------------------------------------------------------- 534 + * Per bio data 535 + *--------------------------------------------------------------*/ 536 + 537 + /* 538 + * If using writeback, leave out struct per_bio_data's writethrough fields. 539 + */ 540 + #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) 541 + #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) 542 + 543 + static size_t get_per_bio_data_size(struct cache *cache) 628 544 { 545 + return writethrough_mode(&cache->features) ? 
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 546 + } 547 + 548 + static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 549 + { 550 + struct per_bio_data *pb = dm_per_bio_data(bio, data_size); 551 + BUG_ON(!pb); 552 + return pb; 553 + } 554 + 555 + static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) 556 + { 557 + struct per_bio_data *pb = get_per_bio_data(bio, data_size); 558 + 559 + pb->tick = false; 560 + pb->req_nr = dm_bio_get_target_bio_nr(bio); 561 + pb->cell = NULL; 562 + pb->len = 0; 563 + 564 + return pb; 565 + } 566 + 567 + /*----------------------------------------------------------------*/ 568 + 569 + static void defer_bio(struct cache *cache, struct bio *bio) 570 + { 571 + unsigned long flags; 572 + 573 + spin_lock_irqsave(&cache->lock, flags); 574 + bio_list_add(&cache->deferred_bios, bio); 575 + spin_unlock_irqrestore(&cache->lock, flags); 576 + 577 + wake_deferred_bio_worker(cache); 578 + } 579 + 580 + static void defer_bios(struct cache *cache, struct bio_list *bios) 581 + { 582 + unsigned long flags; 583 + 584 + spin_lock_irqsave(&cache->lock, flags); 585 + bio_list_merge(&cache->deferred_bios, bios); 586 + bio_list_init(bios); 587 + spin_unlock_irqrestore(&cache->lock, flags); 588 + 589 + wake_deferred_bio_worker(cache); 590 + } 591 + 592 + /*----------------------------------------------------------------*/ 593 + 594 + static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 595 + { 596 + bool r; 597 + size_t pb_size; 598 + struct per_bio_data *pb; 599 + struct dm_cell_key_v2 key; 629 600 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 630 - return bio_detain_range(cache, oblock, end, bio, 631 - cell_prealloc, free_fn, free_context, cell_result); 632 - } 601 + struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 633 602 634 - static int get_cell(struct cache *cache, 635 - dm_oblock_t oblock, 636 - struct prealloc *structs, 637 - struct dm_bio_prison_cell 
**cell_result) 638 - { 639 - int r; 640 - struct dm_cell_key key; 641 - struct dm_bio_prison_cell *cell_prealloc; 603 + cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 604 + if (!cell_prealloc) { 605 + defer_bio(cache, bio); 606 + return false; 607 + } 642 608 643 - cell_prealloc = prealloc_get_cell(structs); 609 + build_key(oblock, end, &key); 610 + r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 611 + if (!r) { 612 + /* 613 + * Failed to get the lock. 614 + */ 615 + free_prison_cell(cache, cell_prealloc); 616 + return r; 617 + } 644 618 645 - build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); 646 - r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 647 - if (r) 648 - prealloc_put_cell(structs, cell_prealloc); 619 + if (cell != cell_prealloc) 620 + free_prison_cell(cache, cell_prealloc); 621 + 622 + pb_size = get_per_bio_data_size(cache); 623 + pb = get_per_bio_data(bio, pb_size); 624 + pb->cell = cell; 649 625 650 626 return r; 651 627 } ··· 715 575 return test_bit(from_cblock(b), cache->dirty_bitset); 716 576 } 717 577 718 - static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 578 + static void set_dirty(struct cache *cache, dm_cblock_t cblock) 719 579 { 720 580 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 721 581 atomic_inc(&cache->nr_dirty); 722 - policy_set_dirty(cache->policy, oblock); 582 + policy_set_dirty(cache->policy, cblock); 723 583 } 724 584 } 725 585 726 - static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 586 + /* 587 + * These two are called when setting after migrations to force the policy 588 + * and dirty bitset to be in sync. 
589 + */ 590 + static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 591 + { 592 + if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 593 + atomic_inc(&cache->nr_dirty); 594 + policy_set_dirty(cache->policy, cblock); 595 + } 596 + 597 + static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 727 598 { 728 599 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 729 - policy_clear_dirty(cache->policy, oblock); 730 600 if (atomic_dec_return(&cache->nr_dirty) == 0) 731 601 dm_table_event(cache->ti->table); 732 602 } 603 + 604 + policy_clear_dirty(cache->policy, cblock); 733 605 } 734 606 735 607 /*----------------------------------------------------------------*/ ··· 778 626 { 779 627 return to_dblock(block_div(from_oblock(oblock), 780 628 oblocks_per_dblock(cache))); 781 - } 782 - 783 - static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock) 784 - { 785 - return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache)); 786 629 } 787 630 788 631 static void set_discard(struct cache *cache, dm_dblock_t b) ··· 826 679 return r; 827 680 } 828 681 829 - /*----------------------------------------------------------------*/ 830 - 831 - static void load_stats(struct cache *cache) 832 - { 833 - struct dm_cache_statistics stats; 834 - 835 - dm_cache_metadata_get_stats(cache->cmd, &stats); 836 - atomic_set(&cache->stats.read_hit, stats.read_hits); 837 - atomic_set(&cache->stats.read_miss, stats.read_misses); 838 - atomic_set(&cache->stats.write_hit, stats.write_hits); 839 - atomic_set(&cache->stats.write_miss, stats.write_misses); 840 - } 841 - 842 - static void save_stats(struct cache *cache) 843 - { 844 - struct dm_cache_statistics stats; 845 - 846 - if (get_cache_mode(cache) >= CM_READ_ONLY) 847 - return; 848 - 849 - stats.read_hits = atomic_read(&cache->stats.read_hit); 850 - stats.read_misses = atomic_read(&cache->stats.read_miss); 851 - stats.write_hits = atomic_read(&cache->stats.write_hit); 
852 - stats.write_misses = atomic_read(&cache->stats.write_miss); 853 - 854 - dm_cache_metadata_set_stats(cache->cmd, &stats); 855 - } 856 - 857 - /*---------------------------------------------------------------- 858 - * Per bio data 859 - *--------------------------------------------------------------*/ 860 - 861 - /* 862 - * If using writeback, leave out struct per_bio_data's writethrough fields. 863 - */ 864 - #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) 865 - #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) 866 - 867 - static bool writethrough_mode(struct cache_features *f) 868 - { 869 - return f->io_mode == CM_IO_WRITETHROUGH; 870 - } 871 - 872 - static bool writeback_mode(struct cache_features *f) 873 - { 874 - return f->io_mode == CM_IO_WRITEBACK; 875 - } 876 - 877 - static bool passthrough_mode(struct cache_features *f) 878 - { 879 - return f->io_mode == CM_IO_PASSTHROUGH; 880 - } 881 - 882 - static size_t get_per_bio_data_size(struct cache *cache) 883 - { 884 - return writethrough_mode(&cache->features) ? 
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 885 - } 886 - 887 - static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 888 - { 889 - struct per_bio_data *pb = dm_per_bio_data(bio, data_size); 890 - BUG_ON(!pb); 891 - return pb; 892 - } 893 - 894 - static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) 895 - { 896 - struct per_bio_data *pb = get_per_bio_data(bio, data_size); 897 - 898 - pb->tick = false; 899 - pb->req_nr = dm_bio_get_target_bio_nr(bio); 900 - pb->all_io_entry = NULL; 901 - pb->len = 0; 902 - 903 - return pb; 904 - } 905 - 906 682 /*---------------------------------------------------------------- 907 683 * Remapping 908 684 *--------------------------------------------------------------*/ ··· 867 797 } 868 798 869 799 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 870 - dm_oblock_t oblock) 800 + dm_oblock_t oblock) 871 801 { 802 + // FIXME: this is called way too much. 872 803 check_if_tick_bio_needed(cache, bio); 873 804 remap_to_origin(cache, bio); 874 805 if (bio_data_dir(bio) == WRITE) ··· 882 811 check_if_tick_bio_needed(cache, bio); 883 812 remap_to_cache(cache, bio, cblock); 884 813 if (bio_data_dir(bio) == WRITE) { 885 - set_dirty(cache, oblock, cblock); 814 + set_dirty(cache, cblock); 886 815 clear_discard(cache, oblock_to_dblock(cache, oblock)); 887 816 } 888 817 } ··· 897 826 block_nr >>= cache->sectors_per_block_shift; 898 827 899 828 return to_oblock(block_nr); 900 - } 901 - 902 - /* 903 - * You must increment the deferred set whilst the prison cell is held. To 904 - * encourage this, we ask for 'cell' to be passed in. 
905 - */ 906 - static void inc_ds(struct cache *cache, struct bio *bio, 907 - struct dm_bio_prison_cell *cell) 908 - { 909 - size_t pb_data_size = get_per_bio_data_size(cache); 910 - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 911 - 912 - BUG_ON(!cell); 913 - BUG_ON(pb->all_io_entry); 914 - 915 - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 916 829 } 917 830 918 831 static bool accountable_bio(struct cache *cache, struct bio *bio) ··· 930 875 generic_make_request(bio); 931 876 } 932 877 933 - static void issue(struct cache *cache, struct bio *bio) 878 + static void issue_op(struct bio *bio, void *context) 934 879 { 935 - unsigned long flags; 936 - 937 - if (!op_is_flush(bio->bi_opf)) { 938 - accounted_request(cache, bio); 939 - return; 940 - } 941 - 942 - /* 943 - * Batch together any bios that trigger commits and then issue a 944 - * single commit for them in do_worker(). 945 - */ 946 - spin_lock_irqsave(&cache->lock, flags); 947 - cache->commit_requested = true; 948 - bio_list_add(&cache->deferred_flush_bios, bio); 949 - spin_unlock_irqrestore(&cache->lock, flags); 950 - } 951 - 952 - static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) 953 - { 954 - inc_ds(cache, bio, cell); 955 - issue(cache, bio); 880 + struct cache *cache = context; 881 + accounted_request(cache, bio); 956 882 } 957 883 958 884 static void defer_writethrough_bio(struct cache *cache, struct bio *bio) ··· 944 908 bio_list_add(&cache->deferred_writethrough_bios, bio); 945 909 spin_unlock_irqrestore(&cache->lock, flags); 946 910 947 - wake_worker(cache); 911 + wake_deferred_writethrough_worker(cache); 948 912 } 949 913 950 914 static void writethrough_endio(struct bio *bio) ··· 970 934 } 971 935 972 936 /* 937 + * FIXME: send in parallel, huge latency as is. 973 938 * When running in writethrough mode we need to send writes to clean blocks 974 939 * to both the cache and origin devices. 
In future we'd like to clone the 975 940 * bio and send them in parallel, but for now we're doing them in ··· 1083 1046 set_cache_mode(cache, CM_READ_ONLY); 1084 1047 } 1085 1048 1049 + /*----------------------------------------------------------------*/ 1050 + 1051 + static void load_stats(struct cache *cache) 1052 + { 1053 + struct dm_cache_statistics stats; 1054 + 1055 + dm_cache_metadata_get_stats(cache->cmd, &stats); 1056 + atomic_set(&cache->stats.read_hit, stats.read_hits); 1057 + atomic_set(&cache->stats.read_miss, stats.read_misses); 1058 + atomic_set(&cache->stats.write_hit, stats.write_hits); 1059 + atomic_set(&cache->stats.write_miss, stats.write_misses); 1060 + } 1061 + 1062 + static void save_stats(struct cache *cache) 1063 + { 1064 + struct dm_cache_statistics stats; 1065 + 1066 + if (get_cache_mode(cache) >= CM_READ_ONLY) 1067 + return; 1068 + 1069 + stats.read_hits = atomic_read(&cache->stats.read_hit); 1070 + stats.read_misses = atomic_read(&cache->stats.read_miss); 1071 + stats.write_hits = atomic_read(&cache->stats.write_hit); 1072 + stats.write_misses = atomic_read(&cache->stats.write_miss); 1073 + 1074 + dm_cache_metadata_set_stats(cache->cmd, &stats); 1075 + } 1076 + 1077 + static void update_stats(struct cache_stats *stats, enum policy_operation op) 1078 + { 1079 + switch (op) { 1080 + case POLICY_PROMOTE: 1081 + atomic_inc(&stats->promotion); 1082 + break; 1083 + 1084 + case POLICY_DEMOTE: 1085 + atomic_inc(&stats->demotion); 1086 + break; 1087 + 1088 + case POLICY_WRITEBACK: 1089 + atomic_inc(&stats->writeback); 1090 + break; 1091 + } 1092 + } 1093 + 1086 1094 /*---------------------------------------------------------------- 1087 1095 * Migration processing 1088 1096 * 1089 1097 * Migration covers moving data from the origin device to the cache, or 1090 1098 * vice versa. 
1091 1099 *--------------------------------------------------------------*/ 1100 + 1092 1101 static void inc_io_migrations(struct cache *cache) 1093 1102 { 1094 1103 atomic_inc(&cache->nr_io_migrations); ··· 1148 1065 static bool discard_or_flush(struct bio *bio) 1149 1066 { 1150 1067 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1151 - } 1152 - 1153 - static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) 1154 - { 1155 - if (discard_or_flush(cell->holder)) { 1156 - /* 1157 - * We have to handle these bios individually. 1158 - */ 1159 - dm_cell_release(cache->prison, cell, &cache->deferred_bios); 1160 - free_prison_cell(cache, cell); 1161 - } else 1162 - list_add_tail(&cell->user_list, &cache->deferred_cells); 1163 - } 1164 - 1165 - static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder) 1166 - { 1167 - unsigned long flags; 1168 - 1169 - if (!holder && dm_cell_promote_or_release(cache->prison, cell)) { 1170 - /* 1171 - * There was no prisoner to promote to holder, the 1172 - * cell has been released. 
1173 - */ 1174 - free_prison_cell(cache, cell); 1175 - return; 1176 - } 1177 - 1178 - spin_lock_irqsave(&cache->lock, flags); 1179 - __cell_defer(cache, cell); 1180 - spin_unlock_irqrestore(&cache->lock, flags); 1181 - 1182 - wake_worker(cache); 1183 - } 1184 - 1185 - static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) 1186 - { 1187 - dm_cell_error(cache->prison, cell, err); 1188 - free_prison_cell(cache, cell); 1189 - } 1190 - 1191 - static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) 1192 - { 1193 - cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); 1194 - } 1195 - 1196 - static void free_io_migration(struct dm_cache_migration *mg) 1197 - { 1198 - struct cache *cache = mg->cache; 1199 - 1200 - dec_io_migrations(cache); 1201 - free_migration(mg); 1202 - wake_worker(cache); 1203 - } 1204 - 1205 - static void migration_failure(struct dm_cache_migration *mg) 1206 - { 1207 - struct cache *cache = mg->cache; 1208 - const char *dev_name = cache_device_name(cache); 1209 - 1210 - if (mg->writeback) { 1211 - DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name); 1212 - set_dirty(cache, mg->old_oblock, mg->cblock); 1213 - cell_defer(cache, mg->old_ocell, false); 1214 - 1215 - } else if (mg->demote) { 1216 - DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name); 1217 - policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); 1218 - 1219 - cell_defer(cache, mg->old_ocell, mg->promote ? 
false : true); 1220 - if (mg->promote) 1221 - cell_defer(cache, mg->new_ocell, true); 1222 - } else { 1223 - DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name); 1224 - policy_remove_mapping(cache->policy, mg->new_oblock); 1225 - cell_defer(cache, mg->new_ocell, true); 1226 - } 1227 - 1228 - free_io_migration(mg); 1229 - } 1230 - 1231 - static void migration_success_pre_commit(struct dm_cache_migration *mg) 1232 - { 1233 - int r; 1234 - unsigned long flags; 1235 - struct cache *cache = mg->cache; 1236 - 1237 - if (mg->writeback) { 1238 - clear_dirty(cache, mg->old_oblock, mg->cblock); 1239 - cell_defer(cache, mg->old_ocell, false); 1240 - free_io_migration(mg); 1241 - return; 1242 - 1243 - } else if (mg->demote) { 1244 - r = dm_cache_remove_mapping(cache->cmd, mg->cblock); 1245 - if (r) { 1246 - DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata", 1247 - cache_device_name(cache)); 1248 - metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1249 - policy_force_mapping(cache->policy, mg->new_oblock, 1250 - mg->old_oblock); 1251 - if (mg->promote) 1252 - cell_defer(cache, mg->new_ocell, true); 1253 - free_io_migration(mg); 1254 - return; 1255 - } 1256 - } else { 1257 - r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock); 1258 - if (r) { 1259 - DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata", 1260 - cache_device_name(cache)); 1261 - metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1262 - policy_remove_mapping(cache->policy, mg->new_oblock); 1263 - free_io_migration(mg); 1264 - return; 1265 - } 1266 - } 1267 - 1268 - spin_lock_irqsave(&cache->lock, flags); 1269 - list_add_tail(&mg->list, &cache->need_commit_migrations); 1270 - cache->commit_requested = true; 1271 - spin_unlock_irqrestore(&cache->lock, flags); 1272 - } 1273 - 1274 - static void migration_success_post_commit(struct dm_cache_migration *mg) 1275 - { 1276 - unsigned long flags; 1277 - struct cache *cache = 
mg->cache; 1278 - 1279 - if (mg->writeback) { 1280 - DMWARN_LIMIT("%s: writeback unexpectedly triggered commit", 1281 - cache_device_name(cache)); 1282 - return; 1283 - 1284 - } else if (mg->demote) { 1285 - cell_defer(cache, mg->old_ocell, mg->promote ? false : true); 1286 - 1287 - if (mg->promote) { 1288 - mg->demote = false; 1289 - 1290 - spin_lock_irqsave(&cache->lock, flags); 1291 - list_add_tail(&mg->list, &cache->quiesced_migrations); 1292 - spin_unlock_irqrestore(&cache->lock, flags); 1293 - 1294 - } else { 1295 - if (mg->invalidate) 1296 - policy_remove_mapping(cache->policy, mg->old_oblock); 1297 - free_io_migration(mg); 1298 - } 1299 - 1300 - } else { 1301 - if (mg->requeue_holder) { 1302 - clear_dirty(cache, mg->new_oblock, mg->cblock); 1303 - cell_defer(cache, mg->new_ocell, true); 1304 - } else { 1305 - /* 1306 - * The block was promoted via an overwrite, so it's dirty. 1307 - */ 1308 - set_dirty(cache, mg->new_oblock, mg->cblock); 1309 - bio_endio(mg->new_ocell->holder); 1310 - cell_defer(cache, mg->new_ocell, false); 1311 - } 1312 - free_io_migration(mg); 1313 - } 1314 - } 1315 - 1316 - static void copy_complete(int read_err, unsigned long write_err, void *context) 1317 - { 1318 - unsigned long flags; 1319 - struct dm_cache_migration *mg = (struct dm_cache_migration *) context; 1320 - struct cache *cache = mg->cache; 1321 - 1322 - if (read_err || write_err) 1323 - mg->err = true; 1324 - 1325 - spin_lock_irqsave(&cache->lock, flags); 1326 - list_add_tail(&mg->list, &cache->completed_migrations); 1327 - spin_unlock_irqrestore(&cache->lock, flags); 1328 - 1329 - wake_worker(cache); 1330 - } 1331 - 1332 - static void issue_copy(struct dm_cache_migration *mg) 1333 - { 1334 - int r; 1335 - struct dm_io_region o_region, c_region; 1336 - struct cache *cache = mg->cache; 1337 - sector_t cblock = from_cblock(mg->cblock); 1338 - 1339 - o_region.bdev = cache->origin_dev->bdev; 1340 - o_region.count = cache->sectors_per_block; 1341 - 1342 - c_region.bdev = 
cache->cache_dev->bdev; 1343 - c_region.sector = cblock * cache->sectors_per_block; 1344 - c_region.count = cache->sectors_per_block; 1345 - 1346 - if (mg->writeback || mg->demote) { 1347 - /* demote */ 1348 - o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; 1349 - r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); 1350 - } else { 1351 - /* promote */ 1352 - o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; 1353 - r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); 1354 - } 1355 - 1356 - if (r < 0) { 1357 - DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); 1358 - migration_failure(mg); 1359 - } 1360 - } 1361 - 1362 - static void overwrite_endio(struct bio *bio) 1363 - { 1364 - struct dm_cache_migration *mg = bio->bi_private; 1365 - struct cache *cache = mg->cache; 1366 - size_t pb_data_size = get_per_bio_data_size(cache); 1367 - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1368 - unsigned long flags; 1369 - 1370 - dm_unhook_bio(&pb->hook_info, bio); 1371 - 1372 - if (bio->bi_error) 1373 - mg->err = true; 1374 - 1375 - mg->requeue_holder = false; 1376 - 1377 - spin_lock_irqsave(&cache->lock, flags); 1378 - list_add_tail(&mg->list, &cache->completed_migrations); 1379 - spin_unlock_irqrestore(&cache->lock, flags); 1380 - 1381 - wake_worker(cache); 1382 - } 1383 - 1384 - static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) 1385 - { 1386 - size_t pb_data_size = get_per_bio_data_size(mg->cache); 1387 - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1388 - 1389 - dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1390 - remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); 1391 - 1392 - /* 1393 - * No need to inc_ds() here, since the cell will be held for the 1394 - * duration of the io. 
1395 - */ 1396 - accounted_request(mg->cache, bio); 1397 - } 1398 - 1399 - static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1400 - { 1401 - return (bio_data_dir(bio) == WRITE) && 1402 - (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1403 - } 1404 - 1405 - static void avoid_copy(struct dm_cache_migration *mg) 1406 - { 1407 - atomic_inc(&mg->cache->stats.copies_avoided); 1408 - migration_success_pre_commit(mg); 1409 1068 } 1410 1069 1411 1070 static void calc_discard_block_range(struct cache *cache, struct bio *bio, ··· 1164 1339 *e = to_dblock(block_div(se, cache->discard_block_size)); 1165 1340 } 1166 1341 1167 - static void issue_discard(struct dm_cache_migration *mg) 1342 + /*----------------------------------------------------------------*/ 1343 + 1344 + static void prevent_background_work(struct cache *cache) 1168 1345 { 1169 - dm_dblock_t b, e; 1170 - struct bio *bio = mg->new_ocell->holder; 1171 - struct cache *cache = mg->cache; 1172 - 1173 - calc_discard_block_range(cache, bio, &b, &e); 1174 - while (b != e) { 1175 - set_discard(cache, b); 1176 - b = to_dblock(from_dblock(b) + 1); 1177 - } 1178 - 1179 - bio_endio(bio); 1180 - cell_defer(cache, mg->new_ocell, false); 1181 - free_migration(mg); 1182 - wake_worker(cache); 1346 + lockdep_off(); 1347 + down_write(&cache->background_work_lock); 1348 + lockdep_on(); 1183 1349 } 1184 1350 1185 - static void issue_copy_or_discard(struct dm_cache_migration *mg) 1351 + static void allow_background_work(struct cache *cache) 1186 1352 { 1187 - bool avoid; 1188 - struct cache *cache = mg->cache; 1189 - 1190 - if (mg->discard) { 1191 - issue_discard(mg); 1192 - return; 1193 - } 1194 - 1195 - if (mg->writeback || mg->demote) 1196 - avoid = !is_dirty(cache, mg->cblock) || 1197 - is_discarded_oblock(cache, mg->old_oblock); 1198 - else { 1199 - struct bio *bio = mg->new_ocell->holder; 1200 - 1201 - avoid = is_discarded_oblock(cache, mg->new_oblock); 1202 - 1203 - if 
(writeback_mode(&cache->features) && 1204 - !avoid && bio_writes_complete_block(cache, bio)) { 1205 - issue_overwrite(mg, bio); 1206 - return; 1207 - } 1208 - } 1209 - 1210 - avoid ? avoid_copy(mg) : issue_copy(mg); 1353 + lockdep_off(); 1354 + up_write(&cache->background_work_lock); 1355 + lockdep_on(); 1211 1356 } 1212 1357 1213 - static void complete_migration(struct dm_cache_migration *mg) 1358 + static bool background_work_begin(struct cache *cache) 1214 1359 { 1215 - if (mg->err) 1216 - migration_failure(mg); 1360 + bool r; 1361 + 1362 + lockdep_off(); 1363 + r = down_read_trylock(&cache->background_work_lock); 1364 + lockdep_on(); 1365 + 1366 + return r; 1367 + } 1368 + 1369 + static void background_work_end(struct cache *cache) 1370 + { 1371 + lockdep_off(); 1372 + up_read(&cache->background_work_lock); 1373 + lockdep_on(); 1374 + } 1375 + 1376 + /*----------------------------------------------------------------*/ 1377 + 1378 + static void quiesce(struct dm_cache_migration *mg, 1379 + void (*continuation)(struct work_struct *)) 1380 + { 1381 + init_continuation(&mg->k, continuation); 1382 + dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); 1383 + } 1384 + 1385 + static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1386 + { 1387 + struct continuation *k = container_of(ws, struct continuation, ws); 1388 + return container_of(k, struct dm_cache_migration, k); 1389 + } 1390 + 1391 + static void copy_complete(int read_err, unsigned long write_err, void *context) 1392 + { 1393 + struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1394 + 1395 + if (read_err || write_err) 1396 + mg->k.input = -EIO; 1397 + 1398 + queue_continuation(mg->cache->wq, &mg->k); 1399 + } 1400 + 1401 + static int copy(struct dm_cache_migration *mg, bool promote) 1402 + { 1403 + int r; 1404 + struct dm_io_region o_region, c_region; 1405 + struct cache *cache = mg->cache; 1406 + 1407 + o_region.bdev = cache->origin_dev->bdev; 1408 + 
o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1409 + o_region.count = cache->sectors_per_block; 1410 + 1411 + c_region.bdev = cache->cache_dev->bdev; 1412 + c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1413 + c_region.count = cache->sectors_per_block; 1414 + 1415 + if (promote) 1416 + r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1217 1417 else 1218 - migration_success_pre_commit(mg); 1418 + r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1419 + 1420 + return r; 1219 1421 } 1220 1422 1221 - static void process_migrations(struct cache *cache, struct list_head *head, 1222 - void (*fn)(struct dm_cache_migration *)) 1423 + static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1223 1424 { 1224 - unsigned long flags; 1225 - struct list_head list; 1226 - struct dm_cache_migration *mg, *tmp; 1425 + size_t pb_data_size = get_per_bio_data_size(cache); 1426 + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1227 1427 1228 - INIT_LIST_HEAD(&list); 1229 - spin_lock_irqsave(&cache->lock, flags); 1230 - list_splice_init(head, &list); 1231 - spin_unlock_irqrestore(&cache->lock, flags); 1232 - 1233 - list_for_each_entry_safe(mg, tmp, &list, list) 1234 - fn(mg); 1428 + if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1429 + free_prison_cell(cache, pb->cell); 1430 + pb->cell = NULL; 1235 1431 } 1236 1432 1237 - static void __queue_quiesced_migration(struct dm_cache_migration *mg) 1433 + static void overwrite_endio(struct bio *bio) 1238 1434 { 1239 - list_add_tail(&mg->list, &mg->cache->quiesced_migrations); 1240 - } 1241 - 1242 - static void queue_quiesced_migration(struct dm_cache_migration *mg) 1243 - { 1244 - unsigned long flags; 1435 + struct dm_cache_migration *mg = bio->bi_private; 1245 1436 struct cache *cache = mg->cache; 1437 + size_t pb_data_size = get_per_bio_data_size(cache); 1438 + struct 
per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1246 1439 1247 - spin_lock_irqsave(&cache->lock, flags); 1248 - __queue_quiesced_migration(mg); 1249 - spin_unlock_irqrestore(&cache->lock, flags); 1440 + dm_unhook_bio(&pb->hook_info, bio); 1250 1441 1251 - wake_worker(cache); 1442 + if (bio->bi_error) 1443 + mg->k.input = bio->bi_error; 1444 + 1445 + queue_continuation(mg->cache->wq, &mg->k); 1252 1446 } 1253 1447 1254 - static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) 1448 + static void overwrite(struct dm_cache_migration *mg, 1449 + void (*continuation)(struct work_struct *)) 1255 1450 { 1256 - unsigned long flags; 1257 - struct dm_cache_migration *mg, *tmp; 1451 + struct bio *bio = mg->overwrite_bio; 1452 + size_t pb_data_size = get_per_bio_data_size(mg->cache); 1453 + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1258 1454 1259 - spin_lock_irqsave(&cache->lock, flags); 1260 - list_for_each_entry_safe(mg, tmp, work, list) 1261 - __queue_quiesced_migration(mg); 1262 - spin_unlock_irqrestore(&cache->lock, flags); 1455 + dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1263 1456 1264 - wake_worker(cache); 1265 - } 1457 + /* 1458 + * The overwrite bio is part of the copy operation, as such it does 1459 + * not set/clear discard or dirty flags. 
1460 + */ 1461 + if (mg->op->op == POLICY_PROMOTE) 1462 + remap_to_cache(mg->cache, bio, mg->op->cblock); 1463 + else 1464 + remap_to_origin(mg->cache, bio); 1266 1465 1267 - static void check_for_quiesced_migrations(struct cache *cache, 1268 - struct per_bio_data *pb) 1269 - { 1270 - struct list_head work; 1271 - 1272 - if (!pb->all_io_entry) 1273 - return; 1274 - 1275 - INIT_LIST_HEAD(&work); 1276 - dm_deferred_entry_dec(pb->all_io_entry, &work); 1277 - 1278 - if (!list_empty(&work)) 1279 - queue_quiesced_migrations(cache, &work); 1280 - } 1281 - 1282 - static void quiesce_migration(struct dm_cache_migration *mg) 1283 - { 1284 - if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) 1285 - queue_quiesced_migration(mg); 1286 - } 1287 - 1288 - static void promote(struct cache *cache, struct prealloc *structs, 1289 - dm_oblock_t oblock, dm_cblock_t cblock, 1290 - struct dm_bio_prison_cell *cell) 1291 - { 1292 - struct dm_cache_migration *mg = prealloc_get_migration(structs); 1293 - 1294 - mg->err = false; 1295 - mg->discard = false; 1296 - mg->writeback = false; 1297 - mg->demote = false; 1298 - mg->promote = true; 1299 - mg->requeue_holder = true; 1300 - mg->invalidate = false; 1301 - mg->cache = cache; 1302 - mg->new_oblock = oblock; 1303 - mg->cblock = cblock; 1304 - mg->old_ocell = NULL; 1305 - mg->new_ocell = cell; 1306 - mg->start_jiffies = jiffies; 1307 - 1308 - inc_io_migrations(cache); 1309 - quiesce_migration(mg); 1310 - } 1311 - 1312 - static void writeback(struct cache *cache, struct prealloc *structs, 1313 - dm_oblock_t oblock, dm_cblock_t cblock, 1314 - struct dm_bio_prison_cell *cell) 1315 - { 1316 - struct dm_cache_migration *mg = prealloc_get_migration(structs); 1317 - 1318 - mg->err = false; 1319 - mg->discard = false; 1320 - mg->writeback = true; 1321 - mg->demote = false; 1322 - mg->promote = false; 1323 - mg->requeue_holder = true; 1324 - mg->invalidate = false; 1325 - mg->cache = cache; 1326 - mg->old_oblock = oblock; 1327 - mg->cblock 
= cblock; 1328 - mg->old_ocell = cell; 1329 - mg->new_ocell = NULL; 1330 - mg->start_jiffies = jiffies; 1331 - 1332 - inc_io_migrations(cache); 1333 - quiesce_migration(mg); 1334 - } 1335 - 1336 - static void demote_then_promote(struct cache *cache, struct prealloc *structs, 1337 - dm_oblock_t old_oblock, dm_oblock_t new_oblock, 1338 - dm_cblock_t cblock, 1339 - struct dm_bio_prison_cell *old_ocell, 1340 - struct dm_bio_prison_cell *new_ocell) 1341 - { 1342 - struct dm_cache_migration *mg = prealloc_get_migration(structs); 1343 - 1344 - mg->err = false; 1345 - mg->discard = false; 1346 - mg->writeback = false; 1347 - mg->demote = true; 1348 - mg->promote = true; 1349 - mg->requeue_holder = true; 1350 - mg->invalidate = false; 1351 - mg->cache = cache; 1352 - mg->old_oblock = old_oblock; 1353 - mg->new_oblock = new_oblock; 1354 - mg->cblock = cblock; 1355 - mg->old_ocell = old_ocell; 1356 - mg->new_ocell = new_ocell; 1357 - mg->start_jiffies = jiffies; 1358 - 1359 - inc_io_migrations(cache); 1360 - quiesce_migration(mg); 1466 + init_continuation(&mg->k, continuation); 1467 + accounted_request(mg->cache, bio); 1361 1468 } 1362 1469 1363 1470 /* 1364 - * Invalidate a cache entry. No writeback occurs; any changes in the cache 1365 - * block are thrown away. 
1471 + * Migration steps: 1472 + * 1473 + * 1) exclusive lock preventing WRITEs 1474 + * 2) quiesce 1475 + * 3) copy or issue overwrite bio 1476 + * 4) upgrade to exclusive lock preventing READs and WRITEs 1477 + * 5) quiesce 1478 + * 6) update metadata and commit 1479 + * 7) unlock 1366 1480 */ 1367 - static void invalidate(struct cache *cache, struct prealloc *structs, 1368 - dm_oblock_t oblock, dm_cblock_t cblock, 1369 - struct dm_bio_prison_cell *cell) 1481 + static void mg_complete(struct dm_cache_migration *mg, bool success) 1370 1482 { 1371 - struct dm_cache_migration *mg = prealloc_get_migration(structs); 1483 + struct bio_list bios; 1484 + struct cache *cache = mg->cache; 1485 + struct policy_work *op = mg->op; 1486 + dm_cblock_t cblock = op->cblock; 1372 1487 1373 - mg->err = false; 1374 - mg->discard = false; 1375 - mg->writeback = false; 1376 - mg->demote = true; 1377 - mg->promote = false; 1378 - mg->requeue_holder = true; 1379 - mg->invalidate = true; 1380 - mg->cache = cache; 1381 - mg->old_oblock = oblock; 1382 - mg->cblock = cblock; 1383 - mg->old_ocell = cell; 1384 - mg->new_ocell = NULL; 1385 - mg->start_jiffies = jiffies; 1488 + if (success) 1489 + update_stats(&cache->stats, op->op); 1386 1490 1387 - inc_io_migrations(cache); 1388 - quiesce_migration(mg); 1491 + switch (op->op) { 1492 + case POLICY_PROMOTE: 1493 + clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1494 + policy_complete_background_work(cache->policy, op, success); 1495 + 1496 + if (mg->overwrite_bio) { 1497 + if (success) 1498 + force_set_dirty(cache, cblock); 1499 + else 1500 + mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO); 1501 + bio_endio(mg->overwrite_bio); 1502 + } else { 1503 + if (success) 1504 + force_clear_dirty(cache, cblock); 1505 + dec_io_migrations(cache); 1506 + } 1507 + break; 1508 + 1509 + case POLICY_DEMOTE: 1510 + /* 1511 + * We clear dirty here to update the nr_dirty counter. 
1512 + */ 1513 + if (success) 1514 + force_clear_dirty(cache, cblock); 1515 + policy_complete_background_work(cache->policy, op, success); 1516 + dec_io_migrations(cache); 1517 + break; 1518 + 1519 + case POLICY_WRITEBACK: 1520 + if (success) 1521 + force_clear_dirty(cache, cblock); 1522 + policy_complete_background_work(cache->policy, op, success); 1523 + dec_io_migrations(cache); 1524 + break; 1525 + } 1526 + 1527 + bio_list_init(&bios); 1528 + if (mg->cell) { 1529 + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1530 + free_prison_cell(cache, mg->cell); 1531 + } 1532 + 1533 + free_migration(mg); 1534 + defer_bios(cache, &bios); 1535 + wake_migration_worker(cache); 1536 + 1537 + background_work_end(cache); 1389 1538 } 1390 1539 1391 - static void discard(struct cache *cache, struct prealloc *structs, 1392 - struct dm_bio_prison_cell *cell) 1540 + static void mg_success(struct work_struct *ws) 1393 1541 { 1394 - struct dm_cache_migration *mg = prealloc_get_migration(structs); 1542 + struct dm_cache_migration *mg = ws_to_mg(ws); 1543 + mg_complete(mg, mg->k.input == 0); 1544 + } 1395 1545 1396 - mg->err = false; 1397 - mg->discard = true; 1398 - mg->writeback = false; 1399 - mg->demote = false; 1400 - mg->promote = false; 1401 - mg->requeue_holder = false; 1402 - mg->invalidate = false; 1546 + static void mg_update_metadata(struct work_struct *ws) 1547 + { 1548 + int r; 1549 + struct dm_cache_migration *mg = ws_to_mg(ws); 1550 + struct cache *cache = mg->cache; 1551 + struct policy_work *op = mg->op; 1552 + 1553 + switch (op->op) { 1554 + case POLICY_PROMOTE: 1555 + r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1556 + if (r) { 1557 + DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1558 + cache_device_name(cache)); 1559 + metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1560 + 1561 + mg_complete(mg, false); 1562 + return; 1563 + } 1564 + mg_complete(mg, true); 1565 + break; 1566 + 1567 + case POLICY_DEMOTE: 1568 
+ r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1569 + if (r) { 1570 + DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1571 + cache_device_name(cache)); 1572 + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1573 + 1574 + mg_complete(mg, false); 1575 + return; 1576 + } 1577 + 1578 + /* 1579 + * It would be nice if we only had to commit when a REQ_FLUSH 1580 + * comes through. But there's one scenario that we have to 1581 + * look out for: 1582 + * 1583 + * - vblock x in a cache block 1584 + * - domotion occurs 1585 + * - cache block gets reallocated and over written 1586 + * - crash 1587 + * 1588 + * When we recover, because there was no commit the cache will 1589 + * rollback to having the data for vblock x in the cache block. 1590 + * But the cache block has since been overwritten, so it'll end 1591 + * up pointing to data that was never in 'x' during the history 1592 + * of the device. 1593 + * 1594 + * To avoid this issue we require a commit as part of the 1595 + * demotion operation. 1596 + */ 1597 + init_continuation(&mg->k, mg_success); 1598 + continue_after_commit(&cache->committer, &mg->k); 1599 + schedule_commit(&cache->committer); 1600 + break; 1601 + 1602 + case POLICY_WRITEBACK: 1603 + mg_complete(mg, true); 1604 + break; 1605 + } 1606 + } 1607 + 1608 + static void mg_update_metadata_after_copy(struct work_struct *ws) 1609 + { 1610 + struct dm_cache_migration *mg = ws_to_mg(ws); 1611 + 1612 + /* 1613 + * Did the copy succeed? 1614 + */ 1615 + if (mg->k.input) 1616 + mg_complete(mg, false); 1617 + else 1618 + mg_update_metadata(ws); 1619 + } 1620 + 1621 + static void mg_upgrade_lock(struct work_struct *ws) 1622 + { 1623 + int r; 1624 + struct dm_cache_migration *mg = ws_to_mg(ws); 1625 + 1626 + /* 1627 + * Did the copy succeed? 1628 + */ 1629 + if (mg->k.input) 1630 + mg_complete(mg, false); 1631 + 1632 + else { 1633 + /* 1634 + * Now we want the lock to prevent both reads and writes. 
1635 + */ 1636 + r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell, 1637 + READ_WRITE_LOCK_LEVEL); 1638 + if (r < 0) 1639 + mg_complete(mg, false); 1640 + 1641 + else if (r) 1642 + quiesce(mg, mg_update_metadata); 1643 + 1644 + else 1645 + mg_update_metadata(ws); 1646 + } 1647 + } 1648 + 1649 + static void mg_copy(struct work_struct *ws) 1650 + { 1651 + int r; 1652 + struct dm_cache_migration *mg = ws_to_mg(ws); 1653 + 1654 + if (mg->overwrite_bio) { 1655 + /* 1656 + * It's safe to do this here, even though it's new data 1657 + * because all IO has been locked out of the block. 1658 + * 1659 + * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL 1660 + * so _not_ using mg_upgrade_lock() as continutation. 1661 + */ 1662 + overwrite(mg, mg_update_metadata_after_copy); 1663 + 1664 + } else { 1665 + struct cache *cache = mg->cache; 1666 + struct policy_work *op = mg->op; 1667 + bool is_policy_promote = (op->op == POLICY_PROMOTE); 1668 + 1669 + if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || 1670 + is_discarded_oblock(cache, op->oblock)) { 1671 + mg_upgrade_lock(ws); 1672 + return; 1673 + } 1674 + 1675 + init_continuation(&mg->k, mg_upgrade_lock); 1676 + 1677 + r = copy(mg, is_policy_promote); 1678 + if (r) { 1679 + DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); 1680 + mg->k.input = -EIO; 1681 + mg_complete(mg, false); 1682 + } 1683 + } 1684 + } 1685 + 1686 + static int mg_lock_writes(struct dm_cache_migration *mg) 1687 + { 1688 + int r; 1689 + struct dm_cell_key_v2 key; 1690 + struct cache *cache = mg->cache; 1691 + struct dm_bio_prison_cell_v2 *prealloc; 1692 + 1693 + prealloc = alloc_prison_cell(cache); 1694 + if (!prealloc) { 1695 + DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache)); 1696 + mg_complete(mg, false); 1697 + return -ENOMEM; 1698 + } 1699 + 1700 + /* 1701 + * Prevent writes to the block, but allow reads to continue. 
1702 + * Unless we're using an overwrite bio, in which case we lock 1703 + * everything. 1704 + */ 1705 + build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); 1706 + r = dm_cell_lock_v2(cache->prison, &key, 1707 + mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1708 + prealloc, &mg->cell); 1709 + if (r < 0) { 1710 + free_prison_cell(cache, prealloc); 1711 + mg_complete(mg, false); 1712 + return r; 1713 + } 1714 + 1715 + if (mg->cell != prealloc) 1716 + free_prison_cell(cache, prealloc); 1717 + 1718 + if (r == 0) 1719 + mg_copy(&mg->k.ws); 1720 + else 1721 + quiesce(mg, mg_copy); 1722 + 1723 + return 0; 1724 + } 1725 + 1726 + static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1727 + { 1728 + struct dm_cache_migration *mg; 1729 + 1730 + if (!background_work_begin(cache)) { 1731 + policy_complete_background_work(cache->policy, op, false); 1732 + return -EPERM; 1733 + } 1734 + 1735 + mg = alloc_migration(cache); 1736 + if (!mg) { 1737 + policy_complete_background_work(cache->policy, op, false); 1738 + background_work_end(cache); 1739 + return -ENOMEM; 1740 + } 1741 + 1742 + memset(mg, 0, sizeof(*mg)); 1743 + 1403 1744 mg->cache = cache; 1404 - mg->old_ocell = NULL; 1405 - mg->new_ocell = cell; 1406 - mg->start_jiffies = jiffies; 1745 + mg->op = op; 1746 + mg->overwrite_bio = bio; 1407 1747 1408 - quiesce_migration(mg); 1748 + if (!bio) 1749 + inc_io_migrations(cache); 1750 + 1751 + return mg_lock_writes(mg); 1752 + } 1753 + 1754 + /*---------------------------------------------------------------- 1755 + * invalidation processing 1756 + *--------------------------------------------------------------*/ 1757 + 1758 + static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1759 + { 1760 + struct bio_list bios; 1761 + struct cache *cache = mg->cache; 1762 + 1763 + bio_list_init(&bios); 1764 + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1765 + free_prison_cell(cache, mg->cell); 1766 + 1767 
+ if (!success && mg->overwrite_bio) 1768 + bio_io_error(mg->overwrite_bio); 1769 + 1770 + free_migration(mg); 1771 + defer_bios(cache, &bios); 1772 + 1773 + background_work_end(cache); 1774 + } 1775 + 1776 + static void invalidate_completed(struct work_struct *ws) 1777 + { 1778 + struct dm_cache_migration *mg = ws_to_mg(ws); 1779 + invalidate_complete(mg, !mg->k.input); 1780 + } 1781 + 1782 + static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1783 + { 1784 + int r = policy_invalidate_mapping(cache->policy, cblock); 1785 + if (!r) { 1786 + r = dm_cache_remove_mapping(cache->cmd, cblock); 1787 + if (r) { 1788 + DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1789 + cache_device_name(cache)); 1790 + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1791 + } 1792 + 1793 + } else if (r == -ENODATA) { 1794 + /* 1795 + * Harmless, already unmapped. 1796 + */ 1797 + r = 0; 1798 + 1799 + } else 1800 + DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1801 + 1802 + return r; 1803 + } 1804 + 1805 + static void invalidate_remove(struct work_struct *ws) 1806 + { 1807 + int r; 1808 + struct dm_cache_migration *mg = ws_to_mg(ws); 1809 + struct cache *cache = mg->cache; 1810 + 1811 + r = invalidate_cblock(cache, mg->invalidate_cblock); 1812 + if (r) { 1813 + invalidate_complete(mg, false); 1814 + return; 1815 + } 1816 + 1817 + init_continuation(&mg->k, invalidate_completed); 1818 + continue_after_commit(&cache->committer, &mg->k); 1819 + remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1820 + mg->overwrite_bio = NULL; 1821 + schedule_commit(&cache->committer); 1822 + } 1823 + 1824 + static int invalidate_lock(struct dm_cache_migration *mg) 1825 + { 1826 + int r; 1827 + struct dm_cell_key_v2 key; 1828 + struct cache *cache = mg->cache; 1829 + struct dm_bio_prison_cell_v2 *prealloc; 1830 + 1831 + prealloc = alloc_prison_cell(cache); 1832 + if (!prealloc) { 1833 + 
invalidate_complete(mg, false); 1834 + return -ENOMEM; 1835 + } 1836 + 1837 + build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1838 + r = dm_cell_lock_v2(cache->prison, &key, 1839 + READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1840 + if (r < 0) { 1841 + free_prison_cell(cache, prealloc); 1842 + invalidate_complete(mg, false); 1843 + return r; 1844 + } 1845 + 1846 + if (mg->cell != prealloc) 1847 + free_prison_cell(cache, prealloc); 1848 + 1849 + if (r) 1850 + quiesce(mg, invalidate_remove); 1851 + 1852 + else { 1853 + /* 1854 + * We can't call invalidate_remove() directly here because we 1855 + * might still be in request context. 1856 + */ 1857 + init_continuation(&mg->k, invalidate_remove); 1858 + queue_work(cache->wq, &mg->k.ws); 1859 + } 1860 + 1861 + return 0; 1862 + } 1863 + 1864 + static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1865 + dm_oblock_t oblock, struct bio *bio) 1866 + { 1867 + struct dm_cache_migration *mg; 1868 + 1869 + if (!background_work_begin(cache)) 1870 + return -EPERM; 1871 + 1872 + mg = alloc_migration(cache); 1873 + if (!mg) { 1874 + background_work_end(cache); 1875 + return -ENOMEM; 1876 + } 1877 + 1878 + memset(mg, 0, sizeof(*mg)); 1879 + 1880 + mg->cache = cache; 1881 + mg->overwrite_bio = bio; 1882 + mg->invalidate_cblock = cblock; 1883 + mg->invalidate_oblock = oblock; 1884 + 1885 + return invalidate_lock(mg); 1409 1886 } 1410 1887 1411 1888 /*---------------------------------------------------------------- 1412 1889 * bio processing 1413 1890 *--------------------------------------------------------------*/ 1414 - static void defer_bio(struct cache *cache, struct bio *bio) 1891 + 1892 + enum busy { 1893 + IDLE, 1894 + MODERATE, 1895 + BUSY 1896 + }; 1897 + 1898 + static enum busy spare_migration_bandwidth(struct cache *cache) 1415 1899 { 1416 - unsigned long flags; 1417 - 1418 - spin_lock_irqsave(&cache->lock, flags); 1419 - bio_list_add(&cache->deferred_bios, bio); 1420 - 
spin_unlock_irqrestore(&cache->lock, flags); 1421 - 1422 - wake_worker(cache); 1423 - } 1424 - 1425 - static void process_flush_bio(struct cache *cache, struct bio *bio) 1426 - { 1427 - size_t pb_data_size = get_per_bio_data_size(cache); 1428 - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1429 - 1430 - BUG_ON(bio->bi_iter.bi_size); 1431 - if (!pb->req_nr) 1432 - remap_to_origin(cache, bio); 1433 - else 1434 - remap_to_cache(cache, bio, 0); 1435 - 1436 - /* 1437 - * REQ_PREFLUSH is not directed at any particular block so we don't 1438 - * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH 1439 - * by dm-core. 1440 - */ 1441 - issue(cache, bio); 1442 - } 1443 - 1444 - static void process_discard_bio(struct cache *cache, struct prealloc *structs, 1445 - struct bio *bio) 1446 - { 1447 - int r; 1448 - dm_dblock_t b, e; 1449 - struct dm_bio_prison_cell *cell_prealloc, *new_ocell; 1450 - 1451 - calc_discard_block_range(cache, bio, &b, &e); 1452 - if (b == e) { 1453 - bio_endio(bio); 1454 - return; 1455 - } 1456 - 1457 - cell_prealloc = prealloc_get_cell(structs); 1458 - r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, 1459 - (cell_free_fn) prealloc_put_cell, 1460 - structs, &new_ocell); 1461 - if (r > 0) 1462 - return; 1463 - 1464 - discard(cache, structs, new_ocell); 1465 - } 1466 - 1467 - static bool spare_migration_bandwidth(struct cache *cache) 1468 - { 1900 + bool idle = iot_idle_for(&cache->origin_tracker, HZ); 1469 1901 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1470 1902 cache->sectors_per_block; 1471 - return current_volume < cache->migration_threshold; 1903 + 1904 + if (current_volume <= cache->migration_threshold) 1905 + return idle ? IDLE : MODERATE; 1906 + else 1907 + return idle ? 
MODERATE : BUSY; 1472 1908 } 1473 1909 1474 1910 static void inc_hit_counter(struct cache *cache, struct bio *bio) ··· 1746 1660 1747 1661 /*----------------------------------------------------------------*/ 1748 1662 1749 - struct inc_detail { 1750 - struct cache *cache; 1751 - struct bio_list bios_for_issue; 1752 - struct bio_list unhandled_bios; 1753 - bool any_writes; 1754 - }; 1755 - 1756 - static void inc_fn(void *context, struct dm_bio_prison_cell *cell) 1663 + static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1757 1664 { 1758 - struct bio *bio; 1759 - struct inc_detail *detail = context; 1760 - struct cache *cache = detail->cache; 1665 + return (bio_data_dir(bio) == WRITE) && 1666 + (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1667 + } 1761 1668 1762 - inc_ds(cache, cell->holder, cell); 1763 - if (bio_data_dir(cell->holder) == WRITE) 1764 - detail->any_writes = true; 1669 + static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1670 + { 1671 + return writeback_mode(&cache->features) && 1672 + (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1673 + } 1765 1674 1766 - while ((bio = bio_list_pop(&cell->bios))) { 1767 - if (discard_or_flush(bio)) { 1768 - bio_list_add(&detail->unhandled_bios, bio); 1769 - continue; 1675 + static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1676 + bool *commit_needed) 1677 + { 1678 + int r, data_dir; 1679 + bool rb, background_queued; 1680 + dm_cblock_t cblock; 1681 + size_t pb_data_size = get_per_bio_data_size(cache); 1682 + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1683 + 1684 + *commit_needed = false; 1685 + 1686 + rb = bio_detain_shared(cache, block, bio); 1687 + if (!rb) { 1688 + /* 1689 + * An exclusive lock is held for this block, so we have to 1690 + * wait. 
We set the commit_needed flag so the current 1691 + * transaction will be committed asap, allowing this lock 1692 + * to be dropped. 1693 + */ 1694 + *commit_needed = true; 1695 + return DM_MAPIO_SUBMITTED; 1696 + } 1697 + 1698 + data_dir = bio_data_dir(bio); 1699 + 1700 + if (optimisable_bio(cache, bio, block)) { 1701 + struct policy_work *op = NULL; 1702 + 1703 + r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1704 + if (unlikely(r && r != -ENOENT)) { 1705 + DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1706 + cache_device_name(cache), r); 1707 + bio_io_error(bio); 1708 + return DM_MAPIO_SUBMITTED; 1770 1709 } 1771 1710 1772 - if (bio_data_dir(bio) == WRITE) 1773 - detail->any_writes = true; 1774 - 1775 - bio_list_add(&detail->bios_for_issue, bio); 1776 - inc_ds(cache, bio, cell); 1777 - } 1778 - } 1779 - 1780 - // FIXME: refactor these two 1781 - static void remap_cell_to_origin_clear_discard(struct cache *cache, 1782 - struct dm_bio_prison_cell *cell, 1783 - dm_oblock_t oblock, bool issue_holder) 1784 - { 1785 - struct bio *bio; 1786 - unsigned long flags; 1787 - struct inc_detail detail; 1788 - 1789 - detail.cache = cache; 1790 - bio_list_init(&detail.bios_for_issue); 1791 - bio_list_init(&detail.unhandled_bios); 1792 - detail.any_writes = false; 1793 - 1794 - spin_lock_irqsave(&cache->lock, flags); 1795 - dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); 1796 - bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); 1797 - spin_unlock_irqrestore(&cache->lock, flags); 1798 - 1799 - remap_to_origin(cache, cell->holder); 1800 - if (issue_holder) 1801 - issue(cache, cell->holder); 1802 - else 1803 - accounted_begin(cache, cell->holder); 1804 - 1805 - if (detail.any_writes) 1806 - clear_discard(cache, oblock_to_dblock(cache, oblock)); 1807 - 1808 - while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1809 - remap_to_origin(cache, bio); 1810 - issue(cache, bio); 1811 - } 1812 - 1813 - 
free_prison_cell(cache, cell); 1814 - } 1815 - 1816 - static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, 1817 - dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) 1818 - { 1819 - struct bio *bio; 1820 - unsigned long flags; 1821 - struct inc_detail detail; 1822 - 1823 - detail.cache = cache; 1824 - bio_list_init(&detail.bios_for_issue); 1825 - bio_list_init(&detail.unhandled_bios); 1826 - detail.any_writes = false; 1827 - 1828 - spin_lock_irqsave(&cache->lock, flags); 1829 - dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); 1830 - bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); 1831 - spin_unlock_irqrestore(&cache->lock, flags); 1832 - 1833 - remap_to_cache(cache, cell->holder, cblock); 1834 - if (issue_holder) 1835 - issue(cache, cell->holder); 1836 - else 1837 - accounted_begin(cache, cell->holder); 1838 - 1839 - if (detail.any_writes) { 1840 - set_dirty(cache, oblock, cblock); 1841 - clear_discard(cache, oblock_to_dblock(cache, oblock)); 1842 - } 1843 - 1844 - while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1845 - remap_to_cache(cache, bio, cblock); 1846 - issue(cache, bio); 1847 - } 1848 - 1849 - free_prison_cell(cache, cell); 1850 - } 1851 - 1852 - /*----------------------------------------------------------------*/ 1853 - 1854 - struct old_oblock_lock { 1855 - struct policy_locker locker; 1856 - struct cache *cache; 1857 - struct prealloc *structs; 1858 - struct dm_bio_prison_cell *cell; 1859 - }; 1860 - 1861 - static int null_locker(struct policy_locker *locker, dm_oblock_t b) 1862 - { 1863 - /* This should never be called */ 1864 - BUG(); 1865 - return 0; 1866 - } 1867 - 1868 - static int cell_locker(struct policy_locker *locker, dm_oblock_t b) 1869 - { 1870 - struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); 1871 - struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); 1872 - 1873 - return bio_detain(l->cache, b, NULL, 
cell_prealloc, 1874 - (cell_free_fn) prealloc_put_cell, 1875 - l->structs, &l->cell); 1876 - } 1877 - 1878 - static void process_cell(struct cache *cache, struct prealloc *structs, 1879 - struct dm_bio_prison_cell *new_ocell) 1880 - { 1881 - int r; 1882 - bool release_cell = true; 1883 - struct bio *bio = new_ocell->holder; 1884 - dm_oblock_t block = get_bio_block(cache, bio); 1885 - struct policy_result lookup_result; 1886 - bool passthrough = passthrough_mode(&cache->features); 1887 - bool fast_promotion, can_migrate; 1888 - struct old_oblock_lock ool; 1889 - 1890 - fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); 1891 - can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache)); 1892 - 1893 - ool.locker.fn = cell_locker; 1894 - ool.cache = cache; 1895 - ool.structs = structs; 1896 - ool.cell = NULL; 1897 - r = policy_map(cache->policy, block, true, can_migrate, fast_promotion, 1898 - bio, &ool.locker, &lookup_result); 1899 - 1900 - if (r == -EWOULDBLOCK) 1901 - /* migration has been denied */ 1902 - lookup_result.op = POLICY_MISS; 1903 - 1904 - switch (lookup_result.op) { 1905 - case POLICY_HIT: 1906 - if (passthrough) { 1907 - inc_miss_counter(cache, bio); 1908 - 1909 - /* 1910 - * Passthrough always maps to the origin, 1911 - * invalidating any cache blocks that are written 1912 - * to. 
1913 - */ 1914 - 1915 - if (bio_data_dir(bio) == WRITE) { 1916 - atomic_inc(&cache->stats.demotion); 1917 - invalidate(cache, structs, block, lookup_result.cblock, new_ocell); 1918 - release_cell = false; 1919 - 1920 - } else { 1921 - /* FIXME: factor out issue_origin() */ 1922 - remap_to_origin_clear_discard(cache, bio, block); 1923 - inc_and_issue(cache, bio, new_ocell); 1924 - } 1925 - } else { 1926 - inc_hit_counter(cache, bio); 1927 - 1928 - if (bio_data_dir(bio) == WRITE && 1929 - writethrough_mode(&cache->features) && 1930 - !is_dirty(cache, lookup_result.cblock)) { 1931 - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1932 - inc_and_issue(cache, bio, new_ocell); 1933 - 1934 - } else { 1935 - remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true); 1936 - release_cell = false; 1937 - } 1711 + if (r == -ENOENT && op) { 1712 + bio_drop_shared_lock(cache, bio); 1713 + BUG_ON(op->op != POLICY_PROMOTE); 1714 + mg_start(cache, op, bio); 1715 + return DM_MAPIO_SUBMITTED; 1716 + } 1717 + } else { 1718 + r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1719 + if (unlikely(r && r != -ENOENT)) { 1720 + DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1721 + cache_device_name(cache), r); 1722 + bio_io_error(bio); 1723 + return DM_MAPIO_SUBMITTED; 1938 1724 } 1939 1725 1940 - break; 1726 + if (background_queued) 1727 + wake_migration_worker(cache); 1728 + } 1941 1729 1942 - case POLICY_MISS: 1730 + if (r == -ENOENT) { 1731 + /* 1732 + * Miss. 
1733 + */ 1943 1734 inc_miss_counter(cache, bio); 1944 - remap_cell_to_origin_clear_discard(cache, new_ocell, block, true); 1945 - release_cell = false; 1946 - break; 1735 + if (pb->req_nr == 0) { 1736 + accounted_begin(cache, bio); 1737 + remap_to_origin_clear_discard(cache, bio, block); 1947 1738 1948 - case POLICY_NEW: 1949 - atomic_inc(&cache->stats.promotion); 1950 - promote(cache, structs, block, lookup_result.cblock, new_ocell); 1951 - release_cell = false; 1952 - break; 1739 + } else { 1740 + /* 1741 + * This is a duplicate writethrough io that is no 1742 + * longer needed because the block has been demoted. 1743 + */ 1744 + bio_endio(bio); 1745 + return DM_MAPIO_SUBMITTED; 1746 + } 1747 + } else { 1748 + /* 1749 + * Hit. 1750 + */ 1751 + inc_hit_counter(cache, bio); 1953 1752 1954 - case POLICY_REPLACE: 1955 - atomic_inc(&cache->stats.demotion); 1956 - atomic_inc(&cache->stats.promotion); 1957 - demote_then_promote(cache, structs, lookup_result.old_oblock, 1958 - block, lookup_result.cblock, 1959 - ool.cell, new_ocell); 1960 - release_cell = false; 1961 - break; 1753 + /* 1754 + * Passthrough always maps to the origin, invalidating any 1755 + * cache blocks that are written to. 
1756 + */ 1757 + if (passthrough_mode(&cache->features)) { 1758 + if (bio_data_dir(bio) == WRITE) { 1759 + bio_drop_shared_lock(cache, bio); 1760 + atomic_inc(&cache->stats.demotion); 1761 + invalidate_start(cache, cblock, block, bio); 1762 + } else 1763 + remap_to_origin_clear_discard(cache, bio, block); 1962 1764 1963 - default: 1964 - DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u", 1965 - cache_device_name(cache), __func__, 1966 - (unsigned) lookup_result.op); 1967 - bio_io_error(bio); 1765 + } else { 1766 + if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 1767 + !is_dirty(cache, cblock)) { 1768 + remap_to_origin_then_cache(cache, bio, block, cblock); 1769 + accounted_begin(cache, bio); 1770 + } else 1771 + remap_to_cache_dirty(cache, bio, block, cblock); 1772 + } 1968 1773 } 1969 - 1970 - if (release_cell) 1971 - cell_defer(cache, new_ocell, false); 1972 - } 1973 - 1974 - static void process_bio(struct cache *cache, struct prealloc *structs, 1975 - struct bio *bio) 1976 - { 1977 - int r; 1978 - dm_oblock_t block = get_bio_block(cache, bio); 1979 - struct dm_bio_prison_cell *cell_prealloc, *new_ocell; 1980 1774 1981 1775 /* 1982 - * Check to see if that block is currently migrating. 1776 + * dm core turns FUA requests into a separate payload and FLUSH req. 1983 1777 */ 1984 - cell_prealloc = prealloc_get_cell(structs); 1985 - r = bio_detain(cache, block, bio, cell_prealloc, 1986 - (cell_free_fn) prealloc_put_cell, 1987 - structs, &new_ocell); 1988 - if (r > 0) 1989 - return; 1778 + if (bio->bi_opf & REQ_FUA) { 1779 + /* 1780 + * issue_after_commit will call accounted_begin a second time. So 1781 + * we call accounted_complete() to avoid double accounting. 
1782 + */ 1783 + accounted_complete(cache, bio); 1784 + issue_after_commit(&cache->committer, bio); 1785 + *commit_needed = true; 1786 + return DM_MAPIO_SUBMITTED; 1787 + } 1990 1788 1991 - process_cell(cache, structs, new_ocell); 1789 + return DM_MAPIO_REMAPPED; 1992 1790 } 1993 1791 1994 - static int need_commit_due_to_time(struct cache *cache) 1792 + static bool process_bio(struct cache *cache, struct bio *bio) 1995 1793 { 1996 - return jiffies < cache->last_commit_jiffies || 1997 - jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; 1794 + bool commit_needed; 1795 + 1796 + if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1797 + generic_make_request(bio); 1798 + 1799 + return commit_needed; 1998 1800 } 1999 1801 2000 1802 /* ··· 1903 1929 return r; 1904 1930 } 1905 1931 1906 - static int commit_if_needed(struct cache *cache) 1932 + /* 1933 + * Used by the batcher. 1934 + */ 1935 + static int commit_op(void *context) 1907 1936 { 1908 - int r = 0; 1937 + struct cache *cache = context; 1909 1938 1910 - if ((cache->commit_requested || need_commit_due_to_time(cache)) && 1911 - dm_cache_changed_this_transaction(cache->cmd)) { 1912 - r = commit(cache, false); 1913 - cache->commit_requested = false; 1914 - cache->last_commit_jiffies = jiffies; 1915 - } 1939 + if (dm_cache_changed_this_transaction(cache->cmd)) 1940 + return commit(cache, false); 1916 1941 1917 - return r; 1942 + return 0; 1918 1943 } 1919 1944 1920 - static void process_deferred_bios(struct cache *cache) 1945 + /*----------------------------------------------------------------*/ 1946 + 1947 + static bool process_flush_bio(struct cache *cache, struct bio *bio) 1921 1948 { 1922 - bool prealloc_used = false; 1949 + size_t pb_data_size = get_per_bio_data_size(cache); 1950 + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1951 + 1952 + if (!pb->req_nr) 1953 + remap_to_origin(cache, bio); 1954 + else 1955 + remap_to_cache(cache, bio, 0); 1956 + 1957 + 
issue_after_commit(&cache->committer, bio); 1958 + return true; 1959 + } 1960 + 1961 + static bool process_discard_bio(struct cache *cache, struct bio *bio) 1962 + { 1963 + dm_dblock_t b, e; 1964 + 1965 + // FIXME: do we need to lock the region? Or can we just assume the 1966 + // user wont be so foolish as to issue discard concurrently with 1967 + // other IO? 1968 + calc_discard_block_range(cache, bio, &b, &e); 1969 + while (b != e) { 1970 + set_discard(cache, b); 1971 + b = to_dblock(from_dblock(b) + 1); 1972 + } 1973 + 1974 + bio_endio(bio); 1975 + 1976 + return false; 1977 + } 1978 + 1979 + static void process_deferred_bios(struct work_struct *ws) 1980 + { 1981 + struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); 1982 + 1923 1983 unsigned long flags; 1984 + bool commit_needed = false; 1924 1985 struct bio_list bios; 1925 1986 struct bio *bio; 1926 - struct prealloc structs; 1927 1987 1928 - memset(&structs, 0, sizeof(structs)); 1929 1988 bio_list_init(&bios); 1930 1989 1931 1990 spin_lock_irqsave(&cache->lock, flags); ··· 1966 1959 bio_list_init(&cache->deferred_bios); 1967 1960 spin_unlock_irqrestore(&cache->lock, flags); 1968 1961 1969 - while (!bio_list_empty(&bios)) { 1970 - /* 1971 - * If we've got no free migration structs, and processing 1972 - * this bio might require one, we pause until there are some 1973 - * prepared mappings to process. 
1974 - */ 1975 - prealloc_used = true; 1976 - if (prealloc_data_structs(cache, &structs)) { 1977 - spin_lock_irqsave(&cache->lock, flags); 1978 - bio_list_merge(&cache->deferred_bios, &bios); 1979 - spin_unlock_irqrestore(&cache->lock, flags); 1980 - break; 1981 - } 1982 - 1983 - bio = bio_list_pop(&bios); 1984 - 1962 + while ((bio = bio_list_pop(&bios))) { 1985 1963 if (bio->bi_opf & REQ_PREFLUSH) 1986 - process_flush_bio(cache, bio); 1964 + commit_needed = process_flush_bio(cache, bio) || commit_needed; 1965 + 1987 1966 else if (bio_op(bio) == REQ_OP_DISCARD) 1988 - process_discard_bio(cache, &structs, bio); 1967 + commit_needed = process_discard_bio(cache, bio) || commit_needed; 1968 + 1989 1969 else 1990 - process_bio(cache, &structs, bio); 1970 + commit_needed = process_bio(cache, bio) || commit_needed; 1991 1971 } 1992 1972 1993 - if (prealloc_used) 1994 - prealloc_free_structs(cache, &structs); 1973 + if (commit_needed) 1974 + schedule_commit(&cache->committer); 1995 1975 } 1996 1976 1997 - static void process_deferred_cells(struct cache *cache) 1977 + static void process_deferred_writethrough_bios(struct work_struct *ws) 1998 1978 { 1999 - bool prealloc_used = false; 2000 - unsigned long flags; 2001 - struct dm_bio_prison_cell *cell, *tmp; 2002 - struct list_head cells; 2003 - struct prealloc structs; 1979 + struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker); 2004 1980 2005 - memset(&structs, 0, sizeof(structs)); 2006 - 2007 - INIT_LIST_HEAD(&cells); 2008 - 2009 - spin_lock_irqsave(&cache->lock, flags); 2010 - list_splice_init(&cache->deferred_cells, &cells); 2011 - spin_unlock_irqrestore(&cache->lock, flags); 2012 - 2013 - list_for_each_entry_safe(cell, tmp, &cells, user_list) { 2014 - /* 2015 - * If we've got no free migration structs, and processing 2016 - * this bio might require one, we pause until there are some 2017 - * prepared mappings to process. 
2018 - */ 2019 - prealloc_used = true; 2020 - if (prealloc_data_structs(cache, &structs)) { 2021 - spin_lock_irqsave(&cache->lock, flags); 2022 - list_splice(&cells, &cache->deferred_cells); 2023 - spin_unlock_irqrestore(&cache->lock, flags); 2024 - break; 2025 - } 2026 - 2027 - process_cell(cache, &structs, cell); 2028 - } 2029 - 2030 - if (prealloc_used) 2031 - prealloc_free_structs(cache, &structs); 2032 - } 2033 - 2034 - static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 2035 - { 2036 - unsigned long flags; 2037 - struct bio_list bios; 2038 - struct bio *bio; 2039 - 2040 - bio_list_init(&bios); 2041 - 2042 - spin_lock_irqsave(&cache->lock, flags); 2043 - bio_list_merge(&bios, &cache->deferred_flush_bios); 2044 - bio_list_init(&cache->deferred_flush_bios); 2045 - spin_unlock_irqrestore(&cache->lock, flags); 2046 - 2047 - /* 2048 - * These bios have already been through inc_ds() 2049 - */ 2050 - while ((bio = bio_list_pop(&bios))) 2051 - submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); 2052 - } 2053 - 2054 - static void process_deferred_writethrough_bios(struct cache *cache) 2055 - { 2056 1981 unsigned long flags; 2057 1982 struct bio_list bios; 2058 1983 struct bio *bio; ··· 1997 2058 spin_unlock_irqrestore(&cache->lock, flags); 1998 2059 1999 2060 /* 2000 - * These bios have already been through inc_ds() 2061 + * These bios have already been through accounted_begin() 2001 2062 */ 2002 2063 while ((bio = bio_list_pop(&bios))) 2003 - accounted_request(cache, bio); 2004 - } 2005 - 2006 - static void writeback_some_dirty_blocks(struct cache *cache) 2007 - { 2008 - bool prealloc_used = false; 2009 - dm_oblock_t oblock; 2010 - dm_cblock_t cblock; 2011 - struct prealloc structs; 2012 - struct dm_bio_prison_cell *old_ocell; 2013 - bool busy = !iot_idle_for(&cache->origin_tracker, HZ); 2014 - 2015 - memset(&structs, 0, sizeof(structs)); 2016 - 2017 - while (spare_migration_bandwidth(cache)) { 2018 - if 
(policy_writeback_work(cache->policy, &oblock, &cblock, busy)) 2019 - break; /* no work to do */ 2020 - 2021 - prealloc_used = true; 2022 - if (prealloc_data_structs(cache, &structs) || 2023 - get_cell(cache, oblock, &structs, &old_ocell)) { 2024 - policy_set_dirty(cache->policy, oblock); 2025 - break; 2026 - } 2027 - 2028 - writeback(cache, &structs, oblock, cblock, old_ocell); 2029 - } 2030 - 2031 - if (prealloc_used) 2032 - prealloc_free_structs(cache, &structs); 2033 - } 2034 - 2035 - /*---------------------------------------------------------------- 2036 - * Invalidations. 2037 - * Dropping something from the cache *without* writing back. 2038 - *--------------------------------------------------------------*/ 2039 - 2040 - static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) 2041 - { 2042 - int r = 0; 2043 - uint64_t begin = from_cblock(req->cblocks->begin); 2044 - uint64_t end = from_cblock(req->cblocks->end); 2045 - 2046 - while (begin != end) { 2047 - r = policy_remove_cblock(cache->policy, to_cblock(begin)); 2048 - if (!r) { 2049 - r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); 2050 - if (r) { 2051 - metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 2052 - break; 2053 - } 2054 - 2055 - } else if (r == -ENODATA) { 2056 - /* harmless, already unmapped */ 2057 - r = 0; 2058 - 2059 - } else { 2060 - DMERR("%s: policy_remove_cblock failed", cache_device_name(cache)); 2061 - break; 2062 - } 2063 - 2064 - begin++; 2065 - } 2066 - 2067 - cache->commit_requested = true; 2068 - 2069 - req->err = r; 2070 - atomic_set(&req->complete, 1); 2071 - 2072 - wake_up(&req->result_wait); 2073 - } 2074 - 2075 - static void process_invalidation_requests(struct cache *cache) 2076 - { 2077 - struct list_head list; 2078 - struct invalidation_request *req, *tmp; 2079 - 2080 - INIT_LIST_HEAD(&list); 2081 - spin_lock(&cache->invalidation_lock); 2082 - list_splice_init(&cache->invalidation_requests, &list); 2083 - 
spin_unlock(&cache->invalidation_lock); 2084 - 2085 - list_for_each_entry_safe (req, tmp, &list, list) 2086 - process_invalidation_request(cache, req); 2064 + generic_make_request(bio); 2087 2065 } 2088 2066 2089 2067 /*---------------------------------------------------------------- 2090 2068 * Main worker loop 2091 2069 *--------------------------------------------------------------*/ 2092 - static bool is_quiescing(struct cache *cache) 2093 - { 2094 - return atomic_read(&cache->quiescing); 2095 - } 2096 - 2097 - static void ack_quiescing(struct cache *cache) 2098 - { 2099 - if (is_quiescing(cache)) { 2100 - atomic_inc(&cache->quiescing_ack); 2101 - wake_up(&cache->quiescing_wait); 2102 - } 2103 - } 2104 - 2105 - static void wait_for_quiescing_ack(struct cache *cache) 2106 - { 2107 - wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); 2108 - } 2109 - 2110 - static void start_quiescing(struct cache *cache) 2111 - { 2112 - atomic_inc(&cache->quiescing); 2113 - wait_for_quiescing_ack(cache); 2114 - } 2115 - 2116 - static void stop_quiescing(struct cache *cache) 2117 - { 2118 - atomic_set(&cache->quiescing, 0); 2119 - atomic_set(&cache->quiescing_ack, 0); 2120 - } 2121 - 2122 - static void wait_for_migrations(struct cache *cache) 2123 - { 2124 - wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); 2125 - } 2126 - 2127 - static void stop_worker(struct cache *cache) 2128 - { 2129 - cancel_delayed_work(&cache->waker); 2130 - flush_workqueue(cache->wq); 2131 - } 2132 - 2133 - static void requeue_deferred_cells(struct cache *cache) 2134 - { 2135 - unsigned long flags; 2136 - struct list_head cells; 2137 - struct dm_bio_prison_cell *cell, *tmp; 2138 - 2139 - INIT_LIST_HEAD(&cells); 2140 - spin_lock_irqsave(&cache->lock, flags); 2141 - list_splice_init(&cache->deferred_cells, &cells); 2142 - spin_unlock_irqrestore(&cache->lock, flags); 2143 - 2144 - list_for_each_entry_safe(cell, tmp, &cells, user_list) 2145 - 
cell_requeue(cache, cell); 2146 - } 2147 2070 2148 2071 static void requeue_deferred_bios(struct cache *cache) 2149 2072 { ··· 2022 2221 } 2023 2222 } 2024 2223 2025 - static int more_work(struct cache *cache) 2026 - { 2027 - if (is_quiescing(cache)) 2028 - return !list_empty(&cache->quiesced_migrations) || 2029 - !list_empty(&cache->completed_migrations) || 2030 - !list_empty(&cache->need_commit_migrations); 2031 - else 2032 - return !bio_list_empty(&cache->deferred_bios) || 2033 - !list_empty(&cache->deferred_cells) || 2034 - !bio_list_empty(&cache->deferred_flush_bios) || 2035 - !bio_list_empty(&cache->deferred_writethrough_bios) || 2036 - !list_empty(&cache->quiesced_migrations) || 2037 - !list_empty(&cache->completed_migrations) || 2038 - !list_empty(&cache->need_commit_migrations) || 2039 - cache->invalidate; 2040 - } 2041 - 2042 - static void do_worker(struct work_struct *ws) 2043 - { 2044 - struct cache *cache = container_of(ws, struct cache, worker); 2045 - 2046 - do { 2047 - if (!is_quiescing(cache)) { 2048 - writeback_some_dirty_blocks(cache); 2049 - process_deferred_writethrough_bios(cache); 2050 - process_deferred_bios(cache); 2051 - process_deferred_cells(cache); 2052 - process_invalidation_requests(cache); 2053 - } 2054 - 2055 - process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); 2056 - process_migrations(cache, &cache->completed_migrations, complete_migration); 2057 - 2058 - if (commit_if_needed(cache)) { 2059 - process_deferred_flush_bios(cache, false); 2060 - process_migrations(cache, &cache->need_commit_migrations, migration_failure); 2061 - } else { 2062 - process_deferred_flush_bios(cache, true); 2063 - process_migrations(cache, &cache->need_commit_migrations, 2064 - migration_success_post_commit); 2065 - } 2066 - 2067 - ack_quiescing(cache); 2068 - 2069 - } while (more_work(cache)); 2070 - } 2071 - 2072 2224 /* 2073 2225 * We want to commit periodically so that not too much 2074 2226 * unwritten metadata builds up. 
··· 2029 2275 static void do_waker(struct work_struct *ws) 2030 2276 { 2031 2277 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2278 + 2032 2279 policy_tick(cache->policy, true); 2033 - wake_worker(cache); 2280 + wake_migration_worker(cache); 2281 + schedule_commit(&cache->committer); 2034 2282 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2035 2283 } 2036 2284 2037 - /*----------------------------------------------------------------*/ 2038 - 2039 - static int is_congested(struct dm_dev *dev, int bdi_bits) 2285 + static void check_migrations(struct work_struct *ws) 2040 2286 { 2041 - struct request_queue *q = bdev_get_queue(dev->bdev); 2042 - return bdi_congested(q->backing_dev_info, bdi_bits); 2043 - } 2287 + int r; 2288 + struct policy_work *op; 2289 + struct cache *cache = container_of(ws, struct cache, migration_worker); 2290 + enum busy b; 2044 2291 2045 - static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2046 - { 2047 - struct cache *cache = container_of(cb, struct cache, callbacks); 2292 + for (;;) { 2293 + b = spare_migration_bandwidth(cache); 2294 + if (b == BUSY) 2295 + break; 2048 2296 2049 - return is_congested(cache->origin_dev, bdi_bits) || 2050 - is_congested(cache->cache_dev, bdi_bits); 2297 + r = policy_get_background_work(cache->policy, b == IDLE, &op); 2298 + if (r == -ENODATA) 2299 + break; 2300 + 2301 + if (r) { 2302 + DMERR_LIMIT("%s: policy_background_work failed", 2303 + cache_device_name(cache)); 2304 + break; 2305 + } 2306 + 2307 + r = mg_start(cache, op, NULL); 2308 + if (r) 2309 + break; 2310 + } 2051 2311 } 2052 2312 2053 2313 /*---------------------------------------------------------------- ··· 2078 2310 2079 2311 mempool_destroy(cache->migration_pool); 2080 2312 2081 - if (cache->all_io_ds) 2082 - dm_deferred_set_destroy(cache->all_io_ds); 2083 - 2084 2313 if (cache->prison) 2085 - dm_bio_prison_destroy(cache->prison); 2314 + 
dm_bio_prison_destroy_v2(cache->prison); 2086 2315 2087 2316 if (cache->wq) 2088 2317 destroy_workqueue(cache->wq); ··· 2472 2707 return PTR_ERR(p); 2473 2708 } 2474 2709 cache->policy = p; 2710 + BUG_ON(!cache->policy); 2475 2711 2476 2712 return 0; 2477 2713 } ··· 2516 2750 cache->cache_size = size; 2517 2751 } 2518 2752 2753 + static int is_congested(struct dm_dev *dev, int bdi_bits) 2754 + { 2755 + struct request_queue *q = bdev_get_queue(dev->bdev); 2756 + return bdi_congested(q->backing_dev_info, bdi_bits); 2757 + } 2758 + 2759 + static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2760 + { 2761 + struct cache *cache = container_of(cb, struct cache, callbacks); 2762 + 2763 + return is_congested(cache->origin_dev, bdi_bits) || 2764 + is_congested(cache->cache_dev, bdi_bits); 2765 + } 2766 + 2519 2767 #define DEFAULT_MIGRATION_THRESHOLD 2048 2520 2768 2521 2769 static int cache_create(struct cache_args *ca, struct cache **result) ··· 2568 2788 2569 2789 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2570 2790 2571 - /* FIXME: factor out this whole section */ 2572 2791 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2573 2792 origin_blocks = block_div(origin_blocks, ca->block_size); 2574 2793 cache->origin_blocks = to_oblock(origin_blocks); ··· 2633 2854 r = -EINVAL; 2634 2855 goto bad; 2635 2856 } 2857 + 2858 + policy_allow_migrations(cache->policy, false); 2636 2859 } 2637 2860 2638 2861 spin_lock_init(&cache->lock); 2639 2862 INIT_LIST_HEAD(&cache->deferred_cells); 2640 2863 bio_list_init(&cache->deferred_bios); 2641 - bio_list_init(&cache->deferred_flush_bios); 2642 2864 bio_list_init(&cache->deferred_writethrough_bios); 2643 - INIT_LIST_HEAD(&cache->quiesced_migrations); 2644 - INIT_LIST_HEAD(&cache->completed_migrations); 2645 - INIT_LIST_HEAD(&cache->need_commit_migrations); 2646 2865 atomic_set(&cache->nr_allocated_migrations, 0); 2647 2866 atomic_set(&cache->nr_io_migrations, 0); 2648 2867 
init_waitqueue_head(&cache->migration_wait); 2649 - 2650 - init_waitqueue_head(&cache->quiescing_wait); 2651 - atomic_set(&cache->quiescing, 0); 2652 - atomic_set(&cache->quiescing_ack, 0); 2653 2868 2654 2869 r = -ENOMEM; 2655 2870 atomic_set(&cache->nr_dirty, 0); ··· 2673 2900 goto bad; 2674 2901 } 2675 2902 2676 - cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2903 + cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); 2677 2904 if (!cache->wq) { 2678 2905 *error = "could not create workqueue for metadata object"; 2679 2906 goto bad; 2680 2907 } 2681 - INIT_WORK(&cache->worker, do_worker); 2908 + INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2909 + INIT_WORK(&cache->deferred_writethrough_worker, 2910 + process_deferred_writethrough_bios); 2911 + INIT_WORK(&cache->migration_worker, check_migrations); 2682 2912 INIT_DELAYED_WORK(&cache->waker, do_waker); 2683 - cache->last_commit_jiffies = jiffies; 2684 2913 2685 - cache->prison = dm_bio_prison_create(); 2914 + cache->prison = dm_bio_prison_create_v2(cache->wq); 2686 2915 if (!cache->prison) { 2687 2916 *error = "could not create bio prison"; 2688 - goto bad; 2689 - } 2690 - 2691 - cache->all_io_ds = dm_deferred_set_create(); 2692 - if (!cache->all_io_ds) { 2693 - *error = "could not create all_io deferred set"; 2694 2917 goto bad; 2695 2918 } 2696 2919 ··· 2716 2947 spin_lock_init(&cache->invalidation_lock); 2717 2948 INIT_LIST_HEAD(&cache->invalidation_requests); 2718 2949 2950 + batcher_init(&cache->committer, commit_op, cache, 2951 + issue_op, cache, cache->wq); 2719 2952 iot_init(&cache->origin_tracker); 2953 + 2954 + init_rwsem(&cache->background_work_lock); 2955 + prevent_background_work(cache); 2720 2956 2721 2957 *result = cache; 2722 2958 return 0; 2723 - 2724 2959 bad: 2725 2960 destroy(cache); 2726 2961 return r; ··· 2782 3009 } 2783 3010 2784 3011 ti->private = cache; 2785 - 2786 3012 out: 2787 3013 destroy_cache_args(ca); 2788 3014 return r; 
··· 2794 3022 struct cache *cache = ti->private; 2795 3023 2796 3024 int r; 2797 - struct dm_bio_prison_cell *cell = NULL; 3025 + bool commit_needed; 2798 3026 dm_oblock_t block = get_bio_block(cache, bio); 2799 3027 size_t pb_data_size = get_per_bio_data_size(cache); 2800 - bool can_migrate = false; 2801 - bool fast_promotion; 2802 - struct policy_result lookup_result; 2803 - struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); 2804 - struct old_oblock_lock ool; 2805 3028 2806 - ool.locker.fn = null_locker; 2807 - 3029 + init_per_bio_data(bio, pb_data_size); 2808 3030 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2809 3031 /* 2810 3032 * This can only occur if the io goes to a partial block at ··· 2815 3049 return DM_MAPIO_SUBMITTED; 2816 3050 } 2817 3051 2818 - /* 2819 - * Check to see if that block is currently migrating. 2820 - */ 2821 - cell = alloc_prison_cell(cache); 2822 - if (!cell) { 2823 - defer_bio(cache, bio); 2824 - return DM_MAPIO_SUBMITTED; 2825 - } 2826 - 2827 - r = bio_detain(cache, block, bio, cell, 2828 - (cell_free_fn) free_prison_cell, 2829 - cache, &cell); 2830 - if (r) { 2831 - if (r < 0) 2832 - defer_bio(cache, bio); 2833 - 2834 - return DM_MAPIO_SUBMITTED; 2835 - } 2836 - 2837 - fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); 2838 - 2839 - r = policy_map(cache->policy, block, false, can_migrate, fast_promotion, 2840 - bio, &ool.locker, &lookup_result); 2841 - if (r == -EWOULDBLOCK) { 2842 - cell_defer(cache, cell, true); 2843 - return DM_MAPIO_SUBMITTED; 2844 - 2845 - } else if (r) { 2846 - DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d", 2847 - cache_device_name(cache), r); 2848 - cell_defer(cache, cell, false); 2849 - bio_io_error(bio); 2850 - return DM_MAPIO_SUBMITTED; 2851 - } 2852 - 2853 - r = DM_MAPIO_REMAPPED; 2854 - switch (lookup_result.op) { 2855 - case POLICY_HIT: 2856 - if (passthrough_mode(&cache->features)) { 2857 - if 
(bio_data_dir(bio) == WRITE) { 2858 - /* 2859 - * We need to invalidate this block, so 2860 - * defer for the worker thread. 2861 - */ 2862 - cell_defer(cache, cell, true); 2863 - r = DM_MAPIO_SUBMITTED; 2864 - 2865 - } else { 2866 - inc_miss_counter(cache, bio); 2867 - remap_to_origin_clear_discard(cache, bio, block); 2868 - accounted_begin(cache, bio); 2869 - inc_ds(cache, bio, cell); 2870 - // FIXME: we want to remap hits or misses straight 2871 - // away rather than passing over to the worker. 2872 - cell_defer(cache, cell, false); 2873 - } 2874 - 2875 - } else { 2876 - inc_hit_counter(cache, bio); 2877 - if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 2878 - !is_dirty(cache, lookup_result.cblock)) { 2879 - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 2880 - accounted_begin(cache, bio); 2881 - inc_ds(cache, bio, cell); 2882 - cell_defer(cache, cell, false); 2883 - 2884 - } else 2885 - remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false); 2886 - } 2887 - break; 2888 - 2889 - case POLICY_MISS: 2890 - inc_miss_counter(cache, bio); 2891 - if (pb->req_nr != 0) { 2892 - /* 2893 - * This is a duplicate writethrough io that is no 2894 - * longer needed because the block has been demoted. 
2895 - */ 2896 - bio_endio(bio); 2897 - // FIXME: remap everything as a miss 2898 - cell_defer(cache, cell, false); 2899 - r = DM_MAPIO_SUBMITTED; 2900 - 2901 - } else 2902 - remap_cell_to_origin_clear_discard(cache, cell, block, false); 2903 - break; 2904 - 2905 - default: 2906 - DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u", 2907 - cache_device_name(cache), __func__, 2908 - (unsigned) lookup_result.op); 2909 - cell_defer(cache, cell, false); 2910 - bio_io_error(bio); 2911 - r = DM_MAPIO_SUBMITTED; 2912 - } 3052 + r = map_bio(cache, bio, block, &commit_needed); 3053 + if (commit_needed) 3054 + schedule_commit(&cache->committer); 2913 3055 2914 3056 return r; 2915 3057 } ··· 2837 3163 spin_unlock_irqrestore(&cache->lock, flags); 2838 3164 } 2839 3165 2840 - check_for_quiesced_migrations(cache, pb); 3166 + bio_drop_shared_lock(cache, bio); 2841 3167 accounted_complete(cache, bio); 2842 3168 2843 3169 return 0; ··· 2937 3263 { 2938 3264 struct cache *cache = ti->private; 2939 3265 2940 - start_quiescing(cache); 2941 - wait_for_migrations(cache); 2942 - stop_worker(cache); 3266 + prevent_background_work(cache); 3267 + BUG_ON(atomic_read(&cache->nr_io_migrations)); 3268 + 3269 + cancel_delayed_work(&cache->waker); 3270 + flush_workqueue(cache->wq); 3271 + WARN_ON(cache->origin_tracker.in_flight); 3272 + 3273 + /* 3274 + * If it's a flush suspend there won't be any deferred bios, so this 3275 + * call is harmless. 
3276 + */ 2943 3277 requeue_deferred_bios(cache); 2944 - requeue_deferred_cells(cache); 2945 - stop_quiescing(cache); 2946 3278 2947 3279 if (get_cache_mode(cache) == CM_WRITE) 2948 3280 (void) sync_metadata(cache); ··· 2960 3280 int r; 2961 3281 struct cache *cache = context; 2962 3282 2963 - r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 3283 + r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2964 3284 if (r) 2965 3285 return r; 2966 - 2967 - if (dirty) 2968 - set_dirty(cache, oblock, cblock); 2969 - else 2970 - clear_dirty(cache, oblock, cblock); 2971 3286 2972 3287 return 0; 2973 3288 } ··· 3162 3487 struct cache *cache = ti->private; 3163 3488 3164 3489 cache->need_tick_bio = true; 3490 + allow_background_work(cache); 3165 3491 do_waker(&cache->waker.work); 3166 3492 } 3167 3493 ··· 3297 3621 } 3298 3622 3299 3623 /* 3624 + * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3625 + * the one-past-the-end value. 3626 + */ 3627 + struct cblock_range { 3628 + dm_cblock_t begin; 3629 + dm_cblock_t end; 3630 + }; 3631 + 3632 + /* 3300 3633 * A cache block range can take two forms: 3301 3634 * 3302 3635 * i) A single cblock, eg. '3456' 3303 - * ii) A begin and end cblock with dots between, eg. 123-234 3636 + * ii) A begin and end cblock with a dash between, eg. 
123-234 3304 3637 */ 3305 3638 static int parse_cblock_range(struct cache *cache, const char *str, 3306 3639 struct cblock_range *result) ··· 3375 3690 return 0; 3376 3691 } 3377 3692 3693 + static inline dm_cblock_t cblock_succ(dm_cblock_t b) 3694 + { 3695 + return to_cblock(from_cblock(b) + 1); 3696 + } 3697 + 3378 3698 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3379 3699 { 3380 - struct invalidation_request req; 3700 + int r = 0; 3381 3701 3382 - INIT_LIST_HEAD(&req.list); 3383 - req.cblocks = range; 3384 - atomic_set(&req.complete, 0); 3385 - req.err = 0; 3386 - init_waitqueue_head(&req.result_wait); 3702 + /* 3703 + * We don't need to do any locking here because we know we're in 3704 + * passthrough mode. There's is potential for a race between an 3705 + * invalidation triggered by an io and an invalidation message. This 3706 + * is harmless, we must not worry if the policy call fails. 3707 + */ 3708 + while (range->begin != range->end) { 3709 + r = invalidate_cblock(cache, range->begin); 3710 + if (r) 3711 + return r; 3387 3712 3388 - spin_lock(&cache->invalidation_lock); 3389 - list_add(&req.list, &cache->invalidation_requests); 3390 - spin_unlock(&cache->invalidation_lock); 3391 - wake_worker(cache); 3713 + range->begin = cblock_succ(range->begin); 3714 + } 3392 3715 3393 - wait_event(req.result_wait, atomic_read(&req.complete)); 3394 - return req.err; 3716 + cache->commit_requested = true; 3717 + return r; 3395 3718 } 3396 3719 3397 3720 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, ··· 3509 3816 3510 3817 static struct target_type cache_target = { 3511 3818 .name = "cache", 3512 - .version = {1, 10, 0}, 3819 + .version = {2, 0, 0}, 3513 3820 .module = THIS_MODULE, 3514 3821 .ctr = cache_ctr, 3515 3822 .dtr = cache_dtr,