Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v3.2-rc3 1700 lines 42 kB view raw
/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include "dm-bufio.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/version.h>
#include <linux/shrinker.h>
#include <linux/module.h>

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_PERCENT	75

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	10

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	60

/*
 * The number of bvec entries that are embedded directly in the buffer.
 * If the chunk size is larger, dm-io is used to do the io.
 */
#define DM_BUFIO_INLINE_VECS		16

/*
 * Buffer hash: folds the high bits of the block number onto the low
 * bits so that the table index stays within 2^DM_BUFIO_HASH_BITS.
 */
#define DM_BUFIO_HASH_BITS	20
#define DM_BUFIO_HASH(block) \
	((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
	 ((1 << DM_BUFIO_HASH_BITS) - 1))

/*
 * Don't try to use kmem_cache_alloc for blocks larger than this.
 * For explanation, see alloc_buffer_data below.
 */
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*
 * Linking of buffers:
 *	All buffers are linked to cache_hash with their hash_list field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field.  When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	dirty_lru too.  They are later added to lru in the process
 *	context.
 */
struct dm_bufio_client {
	/* Serializes access to the lists, hash and counters below. */
	struct mutex lock;

	struct list_head lru[LIST_SIZE];
	unsigned long n_buffers[LIST_SIZE];

	struct block_device *bdev;
	unsigned block_size;
	unsigned char sectors_per_block_bits;
	unsigned char pages_per_block_bits;
	unsigned char blocks_per_page_bits;
	/* Bytes of per-buffer auxiliary space handed to the client. */
	unsigned aux_size;
	void (*alloc_callback)(struct dm_buffer *);
	void (*write_callback)(struct dm_buffer *);

	struct dm_io_client *dm_io;

	/* Buffers kept back so allocation can never fail completely. */
	struct list_head reserved_buffers;
	unsigned need_reserved_buffers;

	struct hlist_head *cache_hash;
	wait_queue_head_t free_buffer_wait;

	/* First error seen by write_endio; reported by write_dirty_buffers. */
	int async_write_error;

	struct list_head client_list;
	struct shrinker shrinker;
};

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	struct hlist_node hash_list;
	struct list_head lru_list;
	sector_t block;
	void *data;
	enum data_mode data_mode;
	unsigned char list_mode;		/* LIST_* */
	/* Number of current users; buffer may only be reclaimed at zero. */
	unsigned hold_count;
	int read_error;
	int write_error;
	/* Bitmask of B_READING / B_WRITING / B_DIRTY. */
	unsigned long state;
	unsigned long last_accessed;
	struct dm_bufio_client *c;
	/* Embedded bio + vecs used for the fast inline-bio I/O path. */
	struct bio bio;
	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
};

/*----------------------------------------------------------------*/

/*
 * One slab cache per supported block size (sub-page sizes only);
 * indexed by dm_bufio_cache_index().
 */
static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];

static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
{
	unsigned ret = c->blocks_per_page_bits - 1;

	BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));

	return ret;
}

#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])

/* True when called from the device-mapper request path. */
#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	/*
	 * Use a different lockdep subclass when nesting inside a request,
	 * so lockdep doesn't flag the legitimate nested acquisition.
	 */
	mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static int dm_bufio_trylock(struct dm_bufio_client *c)
{
	return mutex_trylock(&c->lock);
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	mutex_unlock(&c->lock);
}

/*
 * FIXME Move to sched.h?
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY
#  define dm_bufio_cond_resched()		\
do {						\
	if (unlikely(need_resched()))		\
		_cond_resched();		\
} while (0)
#else
#  define dm_bufio_cond_resched()		do { } while (0)
#endif

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time.  If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

/* Protects the dm_bufio_allocated_* / peak / current counters below. */
static DEFINE_SPINLOCK(param_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 */
static unsigned long dm_bufio_cache_size_per_client;

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch,
 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

/*----------------------------------------------------------------*/

/*
 * Account 'diff' bytes (may be negative) against the counter for the
 * given allocation class and against the global total; tracks the peak.
 */
static void adjust_total_allocated(enum data_mode data_mode, long diff)
{
	/* Indexed by enum data_mode; DATA_MODE_LIMIT entries. */
	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	spin_lock(&param_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	spin_unlock(&param_spinlock);
}

/*
 * Change the number of clients and recalculate per-client limit.
 */
static void __cache_size_refresh(void)
{
	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
	BUG_ON(dm_bufio_client_count < 0);

	dm_bufio_cache_size_latch = dm_bufio_cache_size;

	barrier();

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}

	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
					 (dm_bufio_client_count ? : 1);
}

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages.  Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed.  Don't use __get_free_pages if it
 * always fails (i.e. order >= MAX_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc.  This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       enum data_mode *data_mode)
{
	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
	}

	/* __GFP_NORETRY marks an allocation that is allowed to fail. */
	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->pages_per_block_bits);
	}

	*data_mode = DATA_MODE_VMALLOC;
	return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
}

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, enum data_mode data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(DM_BUFIO_CACHE(c), data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data, c->pages_per_block_bits);
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	/* aux_size bytes of client-private space trail the struct. */
	struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
				      gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kfree(b);
		return NULL;
	}

	adjust_total_allocated(b->data_mode, (long)c->block_size);

	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b->data_mode, -(long)c->block_size);

	free_buffer_data(c, b->data, b->data_mode);
	kfree(b);
}

/*
 * Link buffer to the hash list and clean or dirty queue.
 */
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
	struct dm_bufio_client *c = b->c;

	c->n_buffers[dirty]++;
	b->block = block;
	b->list_mode = dirty;
	list_add(&b->lru_list, &c->lru[dirty]);
	hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
	b->last_accessed = jiffies;
}

/*
 * Unlink buffer from the hash list and dirty or clean queue.
 */
static void __unlink_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	hlist_del(&b->hash_list);
	list_del(&b->lru_list);
}

/*
 * Place the buffer to the head of dirty or clean LRU queue.
 */
static void __relink_lru(struct dm_buffer *b, int dirty)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	c->n_buffers[dirty]++;
	b->list_mode = dirty;
	list_del(&b->lru_list);
	list_add(&b->lru_list, &c->lru[dirty]);
}

/*----------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * Bio interface is faster but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory-consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 *	the I/O driver can reject requests spuriously if it thinks that
 *	the requests are too big for the device or if they cross a
 *	controller-defined memory boundary.
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use dm-io layer to do the I/O.
 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 * shortcomings.
 *--------------------------------------------------------------*/
/*
 * dm-io completion routine.  It just calls b->bio.bi_end_io, pretending
 * that the request was handled directly with bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
}

/*
 * Do the I/O through the dm-io layer (works for vmalloced data and
 * arbitrarily large buffers; dm-io splits the request as needed).
 */
static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
		     bio_end_io_t *end_io)
{
	int r;
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = block << b->c->sectors_per_block_bits,
		.count = b->c->block_size >> SECTOR_SHIFT,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = b->data;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = b->data;
	}

	b->bio.bi_end_io = end_io;

	r = dm_io(&io_req, 1, &region, NULL);
	if (r)
		end_io(&b->bio, r);
}

/*
 * Issue the I/O with the embedded bio and bio_vec array.  If
 * bio_add_page refuses a page (device/boundary restrictions), fall
 * back to the dm-io path.
 */
static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
			   bio_end_io_t *end_io)
{
	char *ptr;
	int len;

	bio_init(&b->bio);
	b->bio.bi_io_vec = b->bio_vec;
	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
	b->bio.bi_sector = block << b->c->sectors_per_block_bits;
	b->bio.bi_bdev = b->c->bdev;
	b->bio.bi_end_io = end_io;

	/*
	 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
	 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
	 */
	ptr = b->data;
	len = b->c->block_size;

	if (len >= PAGE_SIZE)
		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
	else
		BUG_ON((unsigned long)ptr & (len - 1));

	do {
		if (!bio_add_page(&b->bio, virt_to_page(ptr),
				  len < PAGE_SIZE ? len : PAGE_SIZE,
				  virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
			/* A sub-page bio must always be accepted. */
			BUG_ON(b->c->block_size <= PAGE_SIZE);
			use_dmio(b, rw, block, end_io);
			return;
		}

		len -= PAGE_SIZE;
		ptr += PAGE_SIZE;
	} while (len > 0);

	submit_bio(rw, &b->bio);
}

/*
 * Dispatch I/O on the buffer, picking the inline-bio or dm-io path.
 * For writes, let the client patch the data first via write_callback.
 */
static void submit_io(struct dm_buffer *b, int rw, sector_t block,
		      bio_end_io_t *end_io)
{
	if (rw == WRITE && b->c->write_callback)
		b->c->write_callback(b);

	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
	    b->data_mode != DATA_MODE_VMALLOC)
		use_inline_bio(b, rw, block, end_io);
	else
		use_dmio(b, rw, block, end_io);
}

/*----------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------*/

/*
 * The endio routine for write.
 *
 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 * it.
 */
static void write_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->write_error = error;
	if (error) {
		struct dm_bufio_client *c = b->c;
		/* Remember only the first asynchronous write error. */
		(void)cmpxchg(&c->async_write_error, 0, error);
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * This function is called when wait_on_bit is actually waiting.
 */
static int do_io_schedule(void *word)
{
	io_schedule();

	return 0;
}

/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is some previous write going on, wait for it to finish (we
 *   can't have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it.  We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock(&b->state, B_WRITING,
			 do_io_schedule, TASK_UNINTERRUPTIBLE);

	submit_io(b, WRITE, b->block, write_endio);
}

/*
 * Wait until any activity on the buffer finishes.  Possibly write the
 * buffer if it is dirty.  When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(b->hold_count);

	if (!b->state)	/* fast case */
		return;

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b);
	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.  Scans the clean LRU first (cheap), then the dirty LRU,
 * both from the least-recently-used end.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
		BUG_ON(test_bit(B_WRITING, &b->state));
		BUG_ON(test_bit(B_DIRTY, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	return NULL;
}

/*
 * Wait until some other threads free some buffer or release hold count on
 * some buffer.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_task_state(current, TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	io_schedule();

	set_task_state(current, TASK_RUNNING);
	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

/*
 * Allocate a new buffer.  If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in cases all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOIO: don't recurse into the I/O layer
	 *	__GFP_NORETRY: don't retry and rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY |
					 __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		/* Fall back to the reserve kept for exactly this case. */
		if (!list_empty(&c->reserved_buffers)) {
			b = list_entry(c->reserved_buffers.next,
				       struct dm_buffer, lru_list);
			list_del(&b->lru_list);
			c->need_reserved_buffers++;

			return b;
		}

		/* Try to reclaim an unheld cached buffer instead. */
		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c);

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	/* Replenish the reserve before actually freeing memory. */
	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru_list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	wake_up(&c->free_buffer_wait);
}

/*
 * Start writeback on all dirty buffers.  With no_wait set, stop at the
 * first buffer that already has a write in flight instead of waiting.
 */
static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
{
	struct dm_buffer *b, *tmp;

	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		/* Write finished earlier; move it to the clean list now. */
		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state)) {
			__relink_lru(b, LIST_CLEAN);
			continue;
		}

		if (no_wait && test_bit(B_WRITING, &b->state))
			return;

		__write_dirty_buffer(b);
		dm_bufio_cond_resched();
	}
}

/*
 * Get writeback threshold and buffer limit for a given client.
 */
static void __get_memory_limit(struct dm_bufio_client *c,
			       unsigned long *threshold_buffers,
			       unsigned long *limit_buffers)
{
	unsigned long buffers;

	/* User changed the cache size; recompute the per-client share. */
	if (dm_bufio_cache_size != dm_bufio_cache_size_latch) {
		mutex_lock(&dm_bufio_clients_lock);
		__cache_size_refresh();
		mutex_unlock(&dm_bufio_clients_lock);
	}

	buffers = dm_bufio_cache_size_per_client >>
		  (c->sectors_per_block_bits + SECTOR_SHIFT);

	if (buffers < DM_BUFIO_MIN_BUFFERS)
		buffers = DM_BUFIO_MIN_BUFFERS;

	*limit_buffers = buffers;
	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}

/*
 * Check if we're over watermark.
 * If we are over "limit_buffers", free unclaimed buffers until we get
 * under the limit (or until no unclaimed buffer is left).
 * If we are over "threshold_buffers", start background writeback.
 */
static void __check_watermark(struct dm_bufio_client *c)
{
	unsigned long threshold_buffers, limit_buffers;

	__get_memory_limit(c, &threshold_buffers, &limit_buffers);

	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
	       limit_buffers) {

		struct dm_buffer *b = __get_unclaimed_buffer(c);

		if (!b)
			return;

		__free_buffer_wake(b);
		dm_bufio_cond_resched();
	}

	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
		__write_dirty_buffers_async(c, 1);
}

/*
 * Find a buffer in the hash.
 */
static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;
	struct hlist_node *hn;

	hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)],
			     hash_list) {
		dm_bufio_cond_resched();
		if (b->block == block)
			return b;
	}

	return NULL;
}

/*----------------------------------------------------------------
 * Getting a buffer
 *--------------------------------------------------------------*/

enum new_flag {
	NF_FRESH = 0,	/* new buffer; caller supplies the contents */
	NF_READ = 1,	/* read the block from disk if not cached */
	NF_GET = 2	/* only return the buffer if already cached */
};

static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, struct dm_buffer **bp,
				     int *need_submit)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	b = __find(c, block);
	if (b) {
		b->hold_count++;
		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
			     test_bit(B_WRITING, &b->state));
		return b;
	}

	if (nf == NF_GET)
		return NULL;

	new_b = __alloc_buffer_wait(c);

	/*
	 * We've had a period where the mutex was unlocked, so need to
	 * recheck the hash table.
	 */
	b = __find(c, block);
	if (b) {
		__free_buffer_wake(new_b);
		b->hold_count++;
		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
			     test_bit(B_WRITING, &b->state));
		return b;
	}

	__check_watermark(c);

	b = new_b;
	b->hold_count = 1;
	b->read_error = 0;
	b->write_error = 0;
	__link_buffer(b, block, LIST_CLEAN);

	if (nf == NF_FRESH) {
		b->state = 0;
		return b;
	}

	/* Caller must submit the read (outside the lock). */
	b->state = 1 << B_READING;
	*need_submit = 1;

	return b;
}

/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 */
static void read_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->read_error = error;

	BUG_ON(!test_bit(B_READING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_READING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_READING);
}

/*
 * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit;
	struct dm_buffer *b;

	dm_bufio_lock(c);
	b = __bufio_new(c, block, nf, bp, &need_submit);
	dm_bufio_unlock(c);

	if (!b || IS_ERR(b))
		return b;

	if (need_submit)
		submit_io(b, READ, b->block, read_endio);

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = b->read_error;

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}

/*
 * Return the data only if the block is already cached (NF_GET);
 * never performs I/O.  Returns NULL when the block is not cached.
 */
void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

/*
 * Read the block (from cache or disk) and return its data.
 */
void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);

/*
 * Return a buffer for the block without reading it from disk; the
 * caller is expected to overwrite the whole block.
 */
void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);

/*
 * Drop one hold on the buffer.  Erroneous buffers are freed once idle.
 */
void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(test_bit(B_READING, &b->state));
	BUG_ON(!b->hold_count);

	b->hold_count--;
	if (!b->hold_count) {
		wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer, and the buffer is not
		 * to be written, free the buffer.  There is no point in caching
		 * invalid buffer.
		 */
		if ((b->read_error || b->write_error) &&
		    !test_bit(B_WRITING, &b->state) &&
		    !test_bit(B_DIRTY, &b->state)) {
			__unlink_buffer(b);
			__free_buffer_wake(b);
		}
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);

/*
 * Mark the buffer dirty and move it to the dirty LRU.
 */
void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	if (!test_and_set_bit(B_DIRTY, &b->state))
		__relink_lru(b, LIST_DIRTY);

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);

/*
 * Start writeback on all dirty buffers without waiting for completion.
 */
void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);
	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);

/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush hardware disk cache.
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);

again:
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				/*
				 * Hold the buffer so it can't be freed
				 * while we wait without the lock.
				 */
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
		}

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		dm_bufio_cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us.  In
		 * this case, stop, dropping the lock.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);

/*
 * Use dm-io to send an empty barrier to flush the device.
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_rw = REQ_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	/* Zero-length region: a pure flush, no data transfer. */
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the hash queue for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but not relink it, because that other user needs to have the buffer
 * at the same place.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b);
	if (b->hold_count == 1) {
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		wait_on_bit_lock(&b->state, B_WRITING,
				 do_io_schedule, TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as a block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done in bufio lock, so that block number
		 * change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);

unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);

/* Device size in blocks (rounded down). */
sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
	return i_size_read(c->bdev->bd_inode) >>
	       (SECTOR_SHIFT + c->sectors_per_block_bits);
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

/* The aux_size bytes of client data follow the struct dm_buffer. */
void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);

struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

/*
 * Write back and free everything in the cache; complain loudly about
 * any buffer that is still held (a leak in the client).
 */
static void drop_buffers(struct dm_bufio_client *c)
{
	struct dm_buffer *b;
	int i;

	BUG_ON(dm_bufio_in_request());

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++)
		list_for_each_entry(b, &c->lru[i], lru_list)
			DMERR("leaked buffer %llx, hold count %u, list %d",
			      (unsigned long long)b->block, b->hold_count, i);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(!list_empty(&c->lru[i]));

	dm_bufio_unlock(c);
}

/*
 * Test if the buffer is unused and too old, and commit it.
 * If noio is set, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 * rerouted to a different bufio client.
 */
static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
				unsigned long max_jiffies)
{
	/* Accessed within max_jiffies -- still young, keep it. */
	if (jiffies - b->last_accessed < max_jiffies)
		return 1;

	if (!(gfp & __GFP_IO)) {
		/* Without __GFP_IO we may not wait for or issue any I/O,
		   so skip buffers with I/O in flight or pending. */
		if (test_bit(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return 1;
	}

	if (b->hold_count)
		return 1;

	__make_buffer_clean(b);
	__unlink_buffer(b);
	__free_buffer_wake(b);

	/* 0: the buffer was reclaimed. */
	return 0;
}

/*
 * Walk both LRU lists from oldest to newest, reclaiming unused buffers
 * until nr_to_scan of them have been freed or the lists are exhausted.
 * Called with the client lock held.
 */
static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
		   struct shrink_control *sc)
{
	int l;
	struct dm_buffer *b, *tmp;

	for (l = 0; l < LIST_SIZE; l++) {
		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list)
			if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) &&
			    !--nr_to_scan)
				return;
		dm_bufio_cond_resched();
	}
}

/*
 * Memory shrinker callback.  Returns the number of buffers remaining
 * (clamped to INT_MAX), 0 for a failed trylock query, or -1 when asked
 * to scan but the lock could not be taken without risking deadlock.
 */
static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
{
	struct dm_bufio_client *c =
	    container_of(shrinker, struct dm_bufio_client, shrinker);
	unsigned long r;
	unsigned long nr_to_scan = sc->nr_to_scan;

	/* If I/O is allowed we may sleep on the lock; otherwise only try. */
	if (sc->gfp_mask & __GFP_IO)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return !nr_to_scan ? 0 : -1;

	if (nr_to_scan)
		__scan(c, nr_to_scan, sc);

	r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	if (r > INT_MAX)
		r = INT_MAX;

	dm_bufio_unlock(c);

	return r;
}

/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
					       unsigned reserved_buffers, unsigned aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *))
{
	int r;
	struct dm_bufio_client *c;
	unsigned i;

	/* block_size must be a power of two, at least one sector. */
	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
	       (block_size & (block_size - 1)));

	c = kmalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
	if (!c->cache_hash) {
		r = -ENOMEM;
		goto bad_hash;
	}

	c->bdev = bdev;
	c->block_size = block_size;
	/* block_size is a power of two, so ffs() - 1 is log2(block_size). */
	c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
	c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
				  ffs(block_size) - 1 - PAGE_SHIFT : 0;
	c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
				  PAGE_SHIFT - (ffs(block_size) - 1) : 0);

	c->aux_size = aux_size;
	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	for (i = 0; i < LIST_SIZE; i++) {
		INIT_LIST_HEAD(&c->lru[i]);
		c->n_buffers[i] = 0;
	}

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		INIT_HLIST_HEAD(&c->cache_hash[i]);

	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	mutex_lock(&dm_bufio_clients_lock);
	if (c->blocks_per_page_bits) {
		/*
		 * Sub-page block sizes use a kmem cache shared by all clients
		 * with the same block size; create name and cache on first use
		 * under dm_bufio_clients_lock.
		 */
		if (!DM_BUFIO_CACHE_NAME(c)) {
			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
			if (!DM_BUFIO_CACHE_NAME(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}

		if (!DM_BUFIO_CACHE(c)) {
			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
							      c->block_size,
							      c->block_size, 0, NULL);
			if (!DM_BUFIO_CACHE(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}
	}
	mutex_unlock(&dm_bufio_clients_lock);

	/* Pre-allocate the requested number of reserved buffers. */
	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad_buffer;
		}
		__free_buffer_wake(b);
	}

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	c->shrinker.shrink = shrink;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	register_shrinker(&c->shrinker);

	return c;

bad_buffer:
bad_cache:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	vfree(c->cache_hash);
bad_hash:
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);

/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned i;

	drop_buffers(c);

	unregister_shrinker(&c->shrinker);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	/* After drop_buffers, everything must be gone from the hash. */
	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		BUG_ON(!hlist_empty(&c->cache_hash[i]));

	BUG_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (c->n_buffers[i])
			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(c->n_buffers[i]);

	dm_io_client_destroy(c->dm_io);
	vfree(c->cache_hash);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

/*
 * Periodic work: for every client, free clean buffers that have not been
 * accessed for longer than dm_bufio_max_age seconds.
 */
static void cleanup_old_buffers(void)
{
	unsigned long max_age = dm_bufio_max_age;
	struct dm_bufio_client *c;

	/*
	 * dm_bufio_max_age is a writable module parameter; the barrier keeps
	 * the compiler from re-reading it, so one consistent snapshot is used.
	 */
	barrier();

	/* Clamp so that max_age * HZ below cannot overflow. */
	if (max_age > ULONG_MAX / HZ)
		max_age = ULONG_MAX / HZ;

	mutex_lock(&dm_bufio_clients_lock);
	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
		/* Never block on a busy client from this housekeeping path. */
		if (!dm_bufio_trylock(c))
			continue;

		/* Walk the clean list oldest-first; the first buffer that is
		   too young to free terminates the walk. */
		while (!list_empty(&c->lru[LIST_CLEAN])) {
			struct dm_buffer *b;
			b = list_entry(c->lru[LIST_CLEAN].prev,
				       struct dm_buffer, lru_list);
			if (__cleanup_old_buffer(b, 0, max_age * HZ))
				break;
			dm_bufio_cond_resched();
		}

		dm_bufio_unlock(c);
		dm_bufio_cond_resched();
	}
	mutex_unlock(&dm_bufio_clients_lock);
}

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_work;

/* Self-rearming delayed work that ages out old clean buffers. */
static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}

/*----------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------*/

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes memory limit.
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);

	/* Default cache limit: a percentage of non-highmem RAM. */
	mem = (__u64)((totalram_pages - totalhigh_pages) *
		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	/*
	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
	 * in fs/proc/internal.h
	 */
	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;
	int i;

	cancel_delayed_work_sync(&dm_bufio_work);
	destroy_workqueue(dm_bufio_wq);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
		struct kmem_cache *kc = dm_bufio_caches[i];

		if (kc)
			kmem_cache_destroy(kc);
	}

	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
		kfree(dm_bufio_cache_names[i]);

	/* Every client must be destroyed and all memory returned by now;
	   any nonzero counter here is a leak. */
	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
		       __func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
		       __func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	if (bug)
		BUG();
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");