Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

drivers/md/dm-cache-policy-mq.c at v3.9-rc6 (1199 lines, 28 kB)
/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-cache-policy.h"
#include "dm.h"

#include <linux/hash.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache-policy-mq"

static struct kmem_cache *mq_entry_cache;

/*----------------------------------------------------------------*/

static unsigned next_power(unsigned n, unsigned min)
{
        return roundup_pow_of_two(max(n, min));
}

/*----------------------------------------------------------------*/

static unsigned long *alloc_bitset(unsigned nr_entries)
{
        size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
        return vzalloc(s);
}

static void free_bitset(unsigned long *bits)
{
        vfree(bits);
}

/*----------------------------------------------------------------*/

/*
 * Large, sequential ios are probably better left on the origin device since
 * spindles tend to have good bandwidth.
 *
 * The io_tracker tries to spot when the io is in one of these sequential
 * modes.
 *
 * The two thresholds for switching between random and sequential io mode
 * default as follows, and can be adjusted via the constructor and message
 * interfaces.
 */
#define RANDOM_THRESHOLD_DEFAULT 4
#define SEQUENTIAL_THRESHOLD_DEFAULT 512

enum io_pattern {
        PATTERN_SEQUENTIAL,
        PATTERN_RANDOM
};

struct io_tracker {
        enum io_pattern pattern;

        unsigned nr_seq_samples;
        unsigned nr_rand_samples;
        unsigned thresholds[2];

        dm_oblock_t last_end_oblock;
};

static void iot_init(struct io_tracker *t,
                     int sequential_threshold, int random_threshold)
{
        t->pattern = PATTERN_RANDOM;
        t->nr_seq_samples = 0;
        t->nr_rand_samples = 0;
        t->last_end_oblock = 0;
        t->thresholds[PATTERN_RANDOM] = random_threshold;
        t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
}

static enum io_pattern iot_pattern(struct io_tracker *t)
{
        return t->pattern;
}

static void iot_update_stats(struct io_tracker *t, struct bio *bio)
{
        if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1)
                t->nr_seq_samples++;
        else {
                /*
                 * Just one non-sequential IO is enough to reset the
                 * counters.
                 */
                if (t->nr_seq_samples) {
                        t->nr_seq_samples = 0;
                        t->nr_rand_samples = 0;
                }

                t->nr_rand_samples++;
        }

        t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1);
}

static void iot_check_for_pattern_switch(struct io_tracker *t)
{
        switch (t->pattern) {
        case PATTERN_SEQUENTIAL:
                if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
                        t->pattern = PATTERN_RANDOM;
                        t->nr_seq_samples = t->nr_rand_samples = 0;
                }
                break;

        case PATTERN_RANDOM:
                if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
                        t->pattern = PATTERN_SEQUENTIAL;
                        t->nr_seq_samples = t->nr_rand_samples = 0;
                }
                break;
        }
}

static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
{
        iot_update_stats(t, bio);
        iot_check_for_pattern_switch(t);
}

/*----------------------------------------------------------------*/
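/*
 * Worked example (illustrative note, not in the original source): a bio
 * covering sectors 0-7 records last_end_oblock = 7, so a following bio
 * starting at sector 8 counts as sequential.  With the defaults above,
 * 512 such back-to-back samples flip the tracker to PATTERN_SEQUENTIAL,
 * after which map() below short-circuits to POLICY_MISS and sequential io
 * stays on the origin device.  Just 4 random samples flip it back.
 */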
/*
 * This queue is divided up into different levels, allowing us to push
 * entries to the back of any of the levels.  Think of it as a partially
 * sorted queue.
 */
#define NR_QUEUE_LEVELS 16u

struct queue {
        struct list_head qs[NR_QUEUE_LEVELS];
};

static void queue_init(struct queue *q)
{
        unsigned i;

        for (i = 0; i < NR_QUEUE_LEVELS; i++)
                INIT_LIST_HEAD(q->qs + i);
}

/*
 * Insert an entry to the back of the given level.
 */
static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
{
        list_add_tail(elt, q->qs + level);
}

static void queue_remove(struct list_head *elt)
{
        list_del(elt);
}

/*
 * Shifts all entries down one level.  This has no effect on the order of
 * the queue.
 */
static void queue_shift_down(struct queue *q)
{
        unsigned level;

        for (level = 1; level < NR_QUEUE_LEVELS; level++)
                list_splice_init(q->qs + level, q->qs + level - 1);
}

/*
 * Gives us the oldest entry of the lowest populated level.  If the first
 * level is emptied then we shift down one level.
 */
static struct list_head *queue_pop(struct queue *q)
{
        unsigned level;
        struct list_head *r;

        for (level = 0; level < NR_QUEUE_LEVELS; level++)
                if (!list_empty(q->qs + level)) {
                        r = q->qs[level].next;
                        list_del(r);

                        /* have we just emptied the bottom level? */
                        if (level == 0 && list_empty(q->qs))
                                queue_shift_down(q);

                        return r;
                }

        return NULL;
}

static struct list_head *list_pop(struct list_head *lh)
{
        struct list_head *r = lh->next;

        BUG_ON(!r);
        list_del_init(r);

        return r;
}

/*----------------------------------------------------------------*/
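/*
 * Illustrative note (not in the original source): entries with hit counts
 * 1, 2-3, 4-7, 8-15, ... land in levels 0, 1, 2, 3, ... (see queue_level()
 * below), so queue_pop() evicts from the coldest populated level first,
 * and queue_shift_down() slowly ages every entry by one level once the
 * bottom level drains.
 */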
/*
 * Describes a cache entry.  Used in both the cache and the pre_cache.
 */
struct entry {
        struct hlist_node hlist;
        struct list_head list;
        dm_oblock_t oblock;
        dm_cblock_t cblock;     /* valid iff in_cache */

        /*
         * FIXME: pack these better
         */
        bool in_cache:1;
        unsigned hit_count;
        unsigned generation;
        unsigned tick;
};

struct mq_policy {
        struct dm_cache_policy policy;

        /* protects everything */
        struct mutex lock;
        dm_cblock_t cache_size;
        struct io_tracker tracker;

        /*
         * We maintain two queues of entries.  The cache proper contains
         * the currently active mappings, whereas the pre_cache tracks
         * blocks that are being hit frequently and are potential
         * candidates for promotion to the cache.
         */
        struct queue pre_cache;
        struct queue cache;

        /*
         * Keeps track of time, incremented by the core.  We use this to
         * avoid attributing multiple hits within the same tick.
         *
         * Access to tick_protected should be done with the spin lock held.
         * It's copied to tick at the start of the map function (within the
         * mutex).
         */
        spinlock_t tick_lock;
        unsigned tick_protected;
        unsigned tick;

        /*
         * A count of the number of times the map function has been called
         * and found an entry in the pre_cache or cache.  Currently used to
         * calculate the generation.
         */
        unsigned hit_count;

        /*
         * A generation is a longish period that is used to trigger some
         * bookkeeping effects, e.g. decrementing hit counts on entries.
         * This is needed to allow the cache to evolve as io patterns
         * change.
         */
        unsigned generation;
        unsigned generation_period; /* in lookups (will probably change) */

        /*
         * Entries in the pre_cache whose hit count passes the promotion
         * threshold move to the cache proper.  Working out the correct
         * value for the promote_threshold is crucial to this policy.
         */
        unsigned promote_threshold;

        /*
         * We need cache_size entries for the cache, and choose to have
         * cache_size entries for the pre_cache too.  One motivation for
         * using the same size is to make the hit counts directly
         * comparable between pre_cache and cache.
         */
        unsigned nr_entries;
        unsigned nr_entries_allocated;
        struct list_head free;

        /*
         * Cache blocks may be unallocated.  We store this info in a
         * bitset.
         */
        unsigned long *allocation_bitset;
        unsigned nr_cblocks_allocated;
        unsigned find_free_nr_words;
        unsigned find_free_last_word;

        /*
         * The hash table allows us to quickly find an entry by origin
         * block.  Both pre_cache and cache entries are in here.
         */
        unsigned nr_buckets;
        dm_block_t hash_bits;
        struct hlist_head *table;
};

/*----------------------------------------------------------------*/

/* Free/alloc mq cache entry structures. */
static void takeout_queue(struct list_head *lh, struct queue *q)
{
        unsigned level;

        for (level = 0; level < NR_QUEUE_LEVELS; level++)
                list_splice(q->qs + level, lh);
}

static void free_entries(struct mq_policy *mq)
{
        struct entry *e, *tmp;

        takeout_queue(&mq->free, &mq->pre_cache);
        takeout_queue(&mq->free, &mq->cache);

        list_for_each_entry_safe(e, tmp, &mq->free, list)
                kmem_cache_free(mq_entry_cache, e);
}

static int alloc_entries(struct mq_policy *mq, unsigned elts)
{
        unsigned u = mq->nr_entries;

        INIT_LIST_HEAD(&mq->free);
        mq->nr_entries_allocated = 0;

        while (u--) {
                struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL);

                if (!e) {
                        free_entries(mq);
                        return -ENOMEM;
                }

                list_add(&e->list, &mq->free);
        }

        return 0;
}

/*----------------------------------------------------------------*/

/*
 * Simple hash table implementation.  Should be replaced with the standard
 * hash table that's making its way upstream.
 */
static void hash_insert(struct mq_policy *mq, struct entry *e)
{
        unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);

        hlist_add_head(&e->hlist, mq->table + h);
}

static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
{
        unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
        struct hlist_head *bucket = mq->table + h;
        struct entry *e;

        hlist_for_each_entry(e, bucket, hlist)
                if (e->oblock == oblock) {
                        hlist_del(&e->hlist);
                        hlist_add_head(&e->hlist, bucket);
                        return e;
                }

        return NULL;
}

static void hash_remove(struct entry *e)
{
        hlist_del(&e->hlist);
}

/*----------------------------------------------------------------*/
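/*
 * Illustrative note (not in the original source): hash_lookup() above is
 * deliberately not read-only; it re-inserts a found entry at the head of
 * its bucket, so frequently accessed origin blocks are found after a
 * one-element chain walk on subsequent lookups.
 */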
/*
 * Allocates a new entry structure.  The memory is allocated in one lump,
 * so we just hand it out here.  Returns NULL if all entries have
 * already been allocated.  Cannot fail otherwise.
 */
static struct entry *alloc_entry(struct mq_policy *mq)
{
        struct entry *e;

        if (mq->nr_entries_allocated >= mq->nr_entries) {
                BUG_ON(!list_empty(&mq->free));
                return NULL;
        }

        e = list_entry(list_pop(&mq->free), struct entry, list);
        INIT_LIST_HEAD(&e->list);
        INIT_HLIST_NODE(&e->hlist);

        mq->nr_entries_allocated++;
        return e;
}

/*----------------------------------------------------------------*/

/*
 * Mark cache blocks allocated or not in the bitset.
 */
static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock)
{
        BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
        BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset));

        set_bit(from_cblock(cblock), mq->allocation_bitset);
        mq->nr_cblocks_allocated++;
}

static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock)
{
        BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
        BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset));

        clear_bit(from_cblock(cblock), mq->allocation_bitset);
        mq->nr_cblocks_allocated--;
}

static bool any_free_cblocks(struct mq_policy *mq)
{
        return mq->nr_cblocks_allocated < from_cblock(mq->cache_size);
}

/*
 * Fills result out with a cache block that isn't in use, or returns
 * -ENOSPC.  This does _not_ mark the cblock as allocated, the caller is
 * responsible for that.
 */
static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end,
                              dm_cblock_t *result, unsigned *last_word)
{
        int r = -ENOSPC;
        unsigned w;

        for (w = begin; w < end; w++) {
                /*
                 * ffz is undefined if no zero exists
                 */
                if (mq->allocation_bitset[w] != ~0UL) {
                        *last_word = w;
                        *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w]));
                        if (from_cblock(*result) < from_cblock(mq->cache_size))
                                r = 0;

                        break;
                }
        }

        return r;
}

static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result)
{
        int r;

        if (!any_free_cblocks(mq))
                return -ENOSPC;

        r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word);
        if (r == -ENOSPC && mq->find_free_last_word)
                r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word);

        return r;
}

/*----------------------------------------------------------------*/
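/*
 * Illustrative note (not in the original source): the search resumes from
 * find_free_last_word rather than word 0 each time.  E.g. on a 64-bit
 * machine with a 256-block cache (4 bitset words), if word 2 yielded the
 * previous allocation, the next search scans words 2-3 first and only
 * wraps around to words 0-1 when those are full.
 */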
/*
 * Now we get to the meat of the policy.  This section deals with deciding
 * when to add entries to the pre_cache and cache, and when to move between
 * them.
 */

/*
 * The queue level is based on the log2 of the hit count.
 */
static unsigned queue_level(struct entry *e)
{
        return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
}

/*
 * Inserts the entry into the pre_cache or the cache.  Ensures the cache
 * block is marked as allocated if necessary.  Inserts into the hash table.
 * Sets the tick which records when the entry was last moved about.
 */
static void push(struct mq_policy *mq, struct entry *e)
{
        e->tick = mq->tick;
        hash_insert(mq, e);

        if (e->in_cache) {
                alloc_cblock(mq, e->cblock);
                queue_push(&mq->cache, queue_level(e), &e->list);
        } else
                queue_push(&mq->pre_cache, queue_level(e), &e->list);
}

/*
 * Removes an entry from the pre_cache or cache.  Removes from the hash
 * table.  Frees the cache block if necessary.
 */
static void del(struct mq_policy *mq, struct entry *e)
{
        queue_remove(&e->list);
        hash_remove(e);
        if (e->in_cache)
                free_cblock(mq, e->cblock);
}

/*
 * Like del, except it removes the first entry in the queue (ie. the least
 * recently used).
 */
static struct entry *pop(struct mq_policy *mq, struct queue *q)
{
        struct entry *e = container_of(queue_pop(q), struct entry, list);

        if (e) {
                hash_remove(e);

                if (e->in_cache)
                        free_cblock(mq, e->cblock);
        }

        return e;
}

/*
 * Has this entry already been updated?
 */
static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
{
        return mq->tick == e->tick;
}

/*
 * The promotion threshold is adjusted every generation.  As are the counts
 * of the entries.
 *
 * At the moment the threshold is taken by averaging the hit counts of some
 * of the entries in the cache (up to the first 20 entries, starting from
 * the lowest populated level).
 *
 * We can be much cleverer than this though.  For example, each promotion
 * could bump up the threshold, helping to prevent churn.  Much more to do
 * here.
 */

#define MAX_TO_AVERAGE 20

static void check_generation(struct mq_policy *mq)
{
        unsigned total = 0, nr = 0, count = 0, level;
        struct list_head *head;
        struct entry *e;

        if ((mq->hit_count >= mq->generation_period) &&
            (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) {

                mq->hit_count = 0;
                mq->generation++;

                for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
                        head = mq->cache.qs + level;
                        list_for_each_entry(e, head, list) {
                                nr++;
                                total += e->hit_count;

                                if (++count >= MAX_TO_AVERAGE)
                                        break;
                        }
                }

                mq->promote_threshold = nr ? total / nr : 1;
                if (mq->promote_threshold * nr < total)
                        mq->promote_threshold++;
        }
}

/*
 * Whenever we use an entry we bump up its hit counter, and push it to the
 * back of its current level.
 */
static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
{
        if (updated_this_tick(mq, e))
                return;

        e->hit_count++;
        mq->hit_count++;
        check_generation(mq);

        /* generation adjustment, to stop the counts increasing forever. */
        /* FIXME: divide? */
        /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
        e->generation = mq->generation;

        del(mq, e);
        push(mq, e);
}
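/*
 * Worked example (illustrative, not in the original source): if the 20
 * sampled entries have hit counts summing to 47, check_generation() sets
 * promote_threshold = 47 / 20 = 2, then bumps it to 3 because 2 * 20 < 47;
 * i.e. the average is rounded up, never down.
 */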
/*
 * Demote the least recently used entry from the cache to the pre_cache.
 * Returns the new cache entry to use, and the old origin block it was
 * mapped to.
 *
 * We drop the hit count on the demoted entry back to 1 to stop it bouncing
 * straight back into the cache if it's subsequently hit.  There are
 * various options here, and more experimentation would be good:
 *
 * - just forget about the demoted entry completely (ie. don't insert it
 *   into the pre_cache).
 * - divide the hit count rather than setting it to some hard coded value.
 * - set the hit count to a hard coded value other than 1, eg, is it better
 *   if it goes in at level 2?
 */
static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
{
        dm_cblock_t result;
        struct entry *demoted = pop(mq, &mq->cache);

        BUG_ON(!demoted);
        result = demoted->cblock;
        *oblock = demoted->oblock;
        demoted->in_cache = false;
        demoted->hit_count = 1;
        push(mq, demoted);

        return result;
}

/*
 * We modify the basic promote_threshold depending on the specific io.
 *
 * If the origin block has been discarded then there's no cost to copy it
 * to the cache.
 *
 * We bias towards reads, since they can be demoted at no cost if they
 * haven't been dirtied.
 */
#define DISCARDED_PROMOTE_THRESHOLD 1
#define READ_PROMOTE_THRESHOLD 4
#define WRITE_PROMOTE_THRESHOLD 8

static unsigned adjusted_promote_threshold(struct mq_policy *mq,
                                           bool discarded_oblock, int data_dir)
{
        if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE)
                /*
                 * We don't need to do any copying at all, so give this a
                 * very low threshold.  In practice this only triggers
                 * during initial population after a format.
                 */
                return DISCARDED_PROMOTE_THRESHOLD;

        return data_dir == READ ?
                (mq->promote_threshold + READ_PROMOTE_THRESHOLD) :
                (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD);
}

static bool should_promote(struct mq_policy *mq, struct entry *e,
                           bool discarded_oblock, int data_dir)
{
        return e->hit_count >=
                adjusted_promote_threshold(mq, discarded_oblock, data_dir);
}
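/*
 * Worked example (illustrative, not in the original source): with
 * promote_threshold = 3, a pre_cache entry needs a hit count of 7 for
 * reads (3 + 4) or 11 for writes (3 + 8) before should_promote()
 * succeeds.  A write to a discarded origin block while free cache blocks
 * remain promotes after a single hit, which is what populates a freshly
 * formatted cache quickly.
 */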
static int cache_entry_found(struct mq_policy *mq,
                             struct entry *e,
                             struct policy_result *result)
{
        requeue_and_update_tick(mq, e);

        if (e->in_cache) {
                result->op = POLICY_HIT;
                result->cblock = e->cblock;
        }

        return 0;
}

/*
 * Moves an entry from the pre_cache to the cache.  The main work is
 * finding which cache block to use.
 */
static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
                              struct policy_result *result)
{
        dm_cblock_t cblock;

        if (find_free_cblock(mq, &cblock) == -ENOSPC) {
                result->op = POLICY_REPLACE;
                cblock = demote_cblock(mq, &result->old_oblock);
        } else
                result->op = POLICY_NEW;

        result->cblock = e->cblock = cblock;

        del(mq, e);
        e->in_cache = true;
        push(mq, e);

        return 0;
}

static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
                                 bool can_migrate, bool discarded_oblock,
                                 int data_dir, struct policy_result *result)
{
        int r = 0;
        bool updated = updated_this_tick(mq, e);

        requeue_and_update_tick(mq, e);

        if ((!discarded_oblock && updated) ||
            !should_promote(mq, e, discarded_oblock, data_dir))
                result->op = POLICY_MISS;
        else if (!can_migrate)
                r = -EWOULDBLOCK;
        else
                r = pre_cache_to_cache(mq, e, result);

        return r;
}

static void insert_in_pre_cache(struct mq_policy *mq,
                                dm_oblock_t oblock)
{
        struct entry *e = alloc_entry(mq);

        if (!e)
                /*
                 * There's no spare entry structure, so we grab the least
                 * used one from the pre_cache.
                 */
                e = pop(mq, &mq->pre_cache);

        if (unlikely(!e)) {
                DMWARN("couldn't pop from pre cache");
                return;
        }

        e->in_cache = false;
        e->oblock = oblock;
        e->hit_count = 1;
        e->generation = mq->generation;
        push(mq, e);
}

static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
                            struct policy_result *result)
{
        struct entry *e;
        dm_cblock_t cblock;

        if (find_free_cblock(mq, &cblock) == -ENOSPC) {
                result->op = POLICY_MISS;
                insert_in_pre_cache(mq, oblock);
                return;
        }

        e = alloc_entry(mq);
        if (unlikely(!e)) {
                result->op = POLICY_MISS;
                return;
        }

        e->oblock = oblock;
        e->cblock = cblock;
        e->in_cache = true;
        e->hit_count = 1;
        e->generation = mq->generation;
        push(mq, e);

        result->op = POLICY_NEW;
        result->cblock = e->cblock;
}

static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
                          bool can_migrate, bool discarded_oblock,
                          int data_dir, struct policy_result *result)
{
        if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) {
                if (can_migrate)
                        insert_in_cache(mq, oblock, result);
                else
                        return -EWOULDBLOCK;
        } else {
                insert_in_pre_cache(mq, oblock);
                result->op = POLICY_MISS;
        }

        return 0;
}

/*
 * Looks the oblock up in the hash table, then decides whether to put it
 * in the pre_cache or the cache, etc.
 */
static int map(struct mq_policy *mq, dm_oblock_t oblock,
               bool can_migrate, bool discarded_oblock,
               int data_dir, struct policy_result *result)
{
        int r = 0;
        struct entry *e = hash_lookup(mq, oblock);

        if (e && e->in_cache)
                r = cache_entry_found(mq, e, result);
        else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
                result->op = POLICY_MISS;
        else if (e)
                r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
                                          data_dir, result);
        else
                r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
                                   data_dir, result);

        if (r == -EWOULDBLOCK)
                result->op = POLICY_MISS;

        return r;
}

/*----------------------------------------------------------------*/
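/*
 * Decision summary for map() (illustrative note, not in the original
 * source):
 *
 *   entry in cache          -> POLICY_HIT
 *   sequential io pattern   -> POLICY_MISS (stay on the origin device)
 *   entry in pre_cache      -> promote (POLICY_NEW/POLICY_REPLACE) once
 *                              should_promote() passes, else POLICY_MISS
 *   no entry                -> straight into the cache only when the
 *                              adjusted threshold is 1, else pre_cache
 */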
/*
 * Public interface, via the policy struct.  See dm-cache-policy.h for a
 * description of these.
 */

static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
{
        return container_of(p, struct mq_policy, policy);
}

static void mq_destroy(struct dm_cache_policy *p)
{
        struct mq_policy *mq = to_mq_policy(p);

        free_bitset(mq->allocation_bitset);
        kfree(mq->table);
        free_entries(mq);
        kfree(mq);
}

static void copy_tick(struct mq_policy *mq)
{
        unsigned long flags;

        spin_lock_irqsave(&mq->tick_lock, flags);
        mq->tick = mq->tick_protected;
        spin_unlock_irqrestore(&mq->tick_lock, flags);
}

static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
                  bool can_block, bool can_migrate, bool discarded_oblock,
                  struct bio *bio, struct policy_result *result)
{
        int r;
        struct mq_policy *mq = to_mq_policy(p);

        result->op = POLICY_MISS;

        if (can_block)
                mutex_lock(&mq->lock);
        else if (!mutex_trylock(&mq->lock))
                return -EWOULDBLOCK;

        copy_tick(mq);

        iot_examine_bio(&mq->tracker, bio);
        r = map(mq, oblock, can_migrate, discarded_oblock,
                bio_data_dir(bio), result);

        mutex_unlock(&mq->lock);

        return r;
}

static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
{
        int r;
        struct mq_policy *mq = to_mq_policy(p);
        struct entry *e;

        if (!mutex_trylock(&mq->lock))
                return -EWOULDBLOCK;

        e = hash_lookup(mq, oblock);
        if (e && e->in_cache) {
                *cblock = e->cblock;
                r = 0;
        } else
                r = -ENOENT;

        mutex_unlock(&mq->lock);

        return r;
}
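/*
 * Illustrative note (not in the original source): mq_map() honours the
 * caller's can_block flag, while mq_lookup() never blocks and simply
 * returns -EWOULDBLOCK on lock contention, so callers that must not sleep
 * get an immediate answer and can retry later.
 */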
static int mq_load_mapping(struct dm_cache_policy *p,
                           dm_oblock_t oblock, dm_cblock_t cblock,
                           uint32_t hint, bool hint_valid)
{
        struct mq_policy *mq = to_mq_policy(p);
        struct entry *e;

        e = alloc_entry(mq);
        if (!e)
                return -ENOMEM;

        e->cblock = cblock;
        e->oblock = oblock;
        e->in_cache = true;
        e->hit_count = hint_valid ? hint : 1;
        e->generation = mq->generation;
        push(mq, e);

        return 0;
}

static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
                            void *context)
{
        struct mq_policy *mq = to_mq_policy(p);
        int r = 0;
        struct entry *e;
        unsigned level;

        mutex_lock(&mq->lock);

        for (level = 0; level < NR_QUEUE_LEVELS; level++)
                list_for_each_entry(e, &mq->cache.qs[level], list) {
                        r = fn(context, e->cblock, e->oblock, e->hit_count);
                        if (r)
                                goto out;
                }

out:
        mutex_unlock(&mq->lock);

        return r;
}

static void remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
{
        struct entry *e = hash_lookup(mq, oblock);

        BUG_ON(!e || !e->in_cache);

        del(mq, e);
        e->in_cache = false;
        push(mq, e);
}

static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
{
        struct mq_policy *mq = to_mq_policy(p);

        mutex_lock(&mq->lock);
        remove_mapping(mq, oblock);
        mutex_unlock(&mq->lock);
}

static void force_mapping(struct mq_policy *mq,
                          dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
        struct entry *e = hash_lookup(mq, current_oblock);

        BUG_ON(!e || !e->in_cache);

        del(mq, e);
        e->oblock = new_oblock;
        push(mq, e);
}

static void mq_force_mapping(struct dm_cache_policy *p,
                             dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
        struct mq_policy *mq = to_mq_policy(p);

        mutex_lock(&mq->lock);
        force_mapping(mq, current_oblock, new_oblock);
        mutex_unlock(&mq->lock);
}

static dm_cblock_t mq_residency(struct dm_cache_policy *p)
{
        struct mq_policy *mq = to_mq_policy(p);

        /* FIXME: lock mutex, not sure we can block here */
        return to_cblock(mq->nr_cblocks_allocated);
}

static void mq_tick(struct dm_cache_policy *p)
{
        struct mq_policy *mq = to_mq_policy(p);
        unsigned long flags;

        spin_lock_irqsave(&mq->tick_lock, flags);
        mq->tick_protected++;
        spin_unlock_irqrestore(&mq->tick_lock, flags);
}

static int mq_set_config_value(struct dm_cache_policy *p,
                               const char *key, const char *value)
{
        struct mq_policy *mq = to_mq_policy(p);
        enum io_pattern pattern;
        unsigned long tmp;

        if (!strcasecmp(key, "random_threshold"))
                pattern = PATTERN_RANDOM;
        else if (!strcasecmp(key, "sequential_threshold"))
                pattern = PATTERN_SEQUENTIAL;
        else
                return -EINVAL;

        if (kstrtoul(value, 10, &tmp))
                return -EINVAL;

        mq->tracker.thresholds[pattern] = tmp;

        return 0;
}

static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
{
        ssize_t sz = 0;
        struct mq_policy *mq = to_mq_policy(p);

        DMEMIT("4 random_threshold %u sequential_threshold %u",
               mq->tracker.thresholds[PATTERN_RANDOM],
               mq->tracker.thresholds[PATTERN_SEQUENTIAL]);

        return 0;
}
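/*
 * Usage sketch (illustrative, not in the original source): the two config
 * keys above are reachable from userspace through the dm-cache target's
 * message interface, along the lines of
 *
 *   dmsetup message my-cache 0 sequential_threshold 1024
 *
 * where "my-cache" is a hypothetical device name; the target forwards the
 * key/value pair to mq_set_config_value() above.
 */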
/* Init the policy plugin interface function pointers. */
static void init_policy_functions(struct mq_policy *mq)
{
        mq->policy.destroy = mq_destroy;
        mq->policy.map = mq_map;
        mq->policy.lookup = mq_lookup;
        mq->policy.load_mapping = mq_load_mapping;
        mq->policy.walk_mappings = mq_walk_mappings;
        mq->policy.remove_mapping = mq_remove_mapping;
        mq->policy.writeback_work = NULL;
        mq->policy.force_mapping = mq_force_mapping;
        mq->policy.residency = mq_residency;
        mq->policy.tick = mq_tick;
        mq->policy.emit_config_values = mq_emit_config_values;
        mq->policy.set_config_value = mq_set_config_value;
}

static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
                                         sector_t origin_size,
                                         sector_t cache_block_size)
{
        int r;
        struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);

        if (!mq)
                return NULL;

        init_policy_functions(mq);
        iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);

        mq->cache_size = cache_size;
        mq->tick_protected = 0;
        mq->tick = 0;
        mq->hit_count = 0;
        mq->generation = 0;
        mq->promote_threshold = 0;
        mutex_init(&mq->lock);
        spin_lock_init(&mq->tick_lock);
        mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG);
        mq->find_free_last_word = 0;

        queue_init(&mq->pre_cache);
        queue_init(&mq->cache);
        mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);

        mq->nr_entries = 2 * from_cblock(cache_size);
        r = alloc_entries(mq, mq->nr_entries);
        if (r)
                goto bad_cache_alloc;

        mq->nr_entries_allocated = 0;
        mq->nr_cblocks_allocated = 0;

        mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
        mq->hash_bits = ffs(mq->nr_buckets) - 1;
        mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL);
        if (!mq->table)
                goto bad_alloc_table;

        mq->allocation_bitset = alloc_bitset(from_cblock(cache_size));
        if (!mq->allocation_bitset)
                goto bad_alloc_bitset;

        return &mq->policy;

bad_alloc_bitset:
        kfree(mq->table);
bad_alloc_table:
        free_entries(mq);
bad_cache_alloc:
        kfree(mq);

        return NULL;
}

/*----------------------------------------------------------------*/
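/*
 * Worked sizing example (illustrative, not in the original source): for a
 * 10000-block cache, mq_create() allocates 20000 entry structures (cache
 * plus the equally sized pre_cache), rounds 10000 / 2 = 5000 up to 8192
 * hash buckets (so hash_bits = 13), and sizes the allocation bitset at
 * 10000 bits.
 */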
static struct dm_cache_policy_type mq_policy_type = {
        .name = "mq",
        .version = {1, 0, 0},
        .hint_size = 4,
        .owner = THIS_MODULE,
        .create = mq_create
};

static struct dm_cache_policy_type default_policy_type = {
        .name = "default",
        .version = {1, 0, 0},
        .hint_size = 4,
        .owner = THIS_MODULE,
        .create = mq_create
};

static int __init mq_init(void)
{
        int r;

        mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
                                           sizeof(struct entry),
                                           __alignof__(struct entry),
                                           0, NULL);
        if (!mq_entry_cache)
                goto bad;

        r = dm_cache_policy_register(&mq_policy_type);
        if (r) {
                DMERR("register failed %d", r);
                goto bad_register_mq;
        }

        r = dm_cache_policy_register(&default_policy_type);
        if (!r) {
                DMINFO("version %u.%u.%u loaded",
                       mq_policy_type.version[0],
                       mq_policy_type.version[1],
                       mq_policy_type.version[2]);
                return 0;
        }

        DMERR("register failed (as default) %d", r);

        dm_cache_policy_unregister(&mq_policy_type);
bad_register_mq:
        kmem_cache_destroy(mq_entry_cache);
bad:
        return -ENOMEM;
}

static void __exit mq_exit(void)
{
        dm_cache_policy_unregister(&mq_policy_type);
        dm_cache_policy_unregister(&default_policy_type);

        kmem_cache_destroy(mq_entry_cache);
}

module_init(mq_init);
module_exit(mq_exit);

MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("mq cache policy");

MODULE_ALIAS("dm-cache-default");
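/*
 * Usage sketch (illustrative, not in the original source): the module
 * registers the same create function under both "mq" and "default", so a
 * dm-cache table line such as
 *
 *   0 <origin sectors> cache <metadata dev> <cache dev> <origin dev> \
 *       512 1 writeback default 0
 *
 * (device paths and the 512-sector block size are hypothetical values)
 * reaches mq_create() either way, via the "dm-cache-default" alias above.
 */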