Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at commit c9a28fa7b9ac19b676deefa0a171ce7df8755c08 (1884 lines, 43 kB)
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>

static kmem_zone_t *xfs_buf_zone;
STATIC int xfsbufd(void *);
STATIC int xfsbufd_wakeup(int, gfp_t);
STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
static struct shrinker xfs_buf_shake = {
	.shrink = xfsbufd_wakeup,
	.seeks = DEFAULT_SEEKS,
};

static struct workqueue_struct *xfslogd_workqueue;
struct workqueue_struct *xfsdatad_workqueue;

#ifdef XFS_BUF_TRACE
void
xfs_buf_trace(
	xfs_buf_t	*bp,
	char		*id,
	void		*data,
	void		*ra)
{
	ktrace_enter(xfs_buf_trace_buf,
		bp, id,
		(void *)(unsigned long)bp->b_flags,
		(void *)(unsigned long)bp->b_hold.counter,
		(void *)(unsigned long)bp->b_sema.count.counter,
		(void *)current,
		data, ra,
		(void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
		(void *)(unsigned long)(bp->b_file_offset & 0xffffffff),
		(void *)(unsigned long)bp->b_buffer_length,
		NULL, NULL, NULL, NULL, NULL);
}
ktrace_t *xfs_buf_trace_buf;
#define XFS_BUF_TRACE_SIZE	4096
#define XB_TRACE(bp, id, data)	\
	xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0))
#else
#define XB_TRACE(bp, id, data)	do { } while (0)
#endif

#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
#else
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
#endif

#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
	  ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)

#define xb_to_km(flags) \
	 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)

#define xfs_buf_allocate(flags) \
	kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
#define xfs_buf_deallocate(bp) \
	kmem_zone_free(xfs_buf_zone, (bp));

/*
 * Page Region interfaces.
 *
 * For pages in filesystems where the blocksize is smaller than the
 * pagesize, we use the page->private field (long) to hold a bitmap
 * of uptodate regions within the page.
 *
 * Each such region is "bytes per page / bits per long" bytes long.
 *
 * NBPPR == number-of-bytes-per-page-region
 * BTOPR == bytes-to-page-region (rounded up)
 * BTOPRT == bytes-to-page-region-truncated (rounded down)
 */
#if (BITS_PER_LONG == 32)
#define PRSHIFT		(PAGE_CACHE_SHIFT - 5)	/* (32 == 1<<5) */
#elif (BITS_PER_LONG == 64)
#define PRSHIFT		(PAGE_CACHE_SHIFT - 6)	/* (64 == 1<<6) */
#else
#error BITS_PER_LONG must be 32 or 64
#endif
#define NBPPR		(PAGE_CACHE_SIZE/BITS_PER_LONG)
#define BTOPR(b)	(((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
#define BTOPRT(b)	(((unsigned int)(b) >> PRSHIFT))
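/*
 * Concrete example (assuming 4 KB pages and 64-bit longs): PRSHIFT is 6
 * and NBPPR is 4096 / 64 = 64, so each bit of page->private covers a
 * 64-byte region of the page.  BTOPR(100) == 2 (100 bytes span two
 * regions, rounded up) while BTOPRT(100) == 1 (byte 100 lies in region 1).
 */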

STATIC unsigned long
page_region_mask(
	size_t		offset,
	size_t		length)
{
	unsigned long	mask;
	int		first, final;

	first = BTOPR(offset);
	final = BTOPRT(offset + length - 1);
	first = min(first, final);

	mask = ~0UL;
	mask <<= BITS_PER_LONG - (final - first);
	mask >>= BITS_PER_LONG - (final);

	ASSERT(offset + length <= PAGE_CACHE_SIZE);
	ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);

	return mask;
}

STATIC_INLINE void
set_page_region(
	struct page	*page,
	size_t		offset,
	size_t		length)
{
	set_page_private(page,
		page_private(page) | page_region_mask(offset, length));
	if (page_private(page) == ~0UL)
		SetPageUptodate(page);
}

STATIC_INLINE int
test_page_region(
	struct page	*page,
	size_t		offset,
	size_t		length)
{
	unsigned long	mask = page_region_mask(offset, length);

	return (mask && (page_private(page) & mask) == mask);
}

/*
 * Mapping of multi-page buffers into contiguous virtual space
 */

typedef struct a_list {
	void		*vm_addr;
	struct a_list	*next;
} a_list_t;

static a_list_t		*as_free_head;
static int		as_list_len;
static DEFINE_SPINLOCK(as_lock);

/*
 * Try to batch vunmaps because they are costly.
 */
STATIC void
free_address(
	void		*addr)
{
	a_list_t	*aentry;

#ifdef CONFIG_XEN
	/*
	 * Xen needs to be able to make sure it can get an exclusive
	 * RO mapping of pages it wants to turn into a pagetable.  If
	 * a newly allocated page is also still being vmap()ed by xfs,
	 * it will cause pagetable construction to fail.  This is a
	 * quick workaround to always eagerly unmap pages so that Xen
	 * is happy.
	 */
	vunmap(addr);
	return;
#endif

	aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
	if (likely(aentry)) {
		spin_lock(&as_lock);
		aentry->next = as_free_head;
		aentry->vm_addr = addr;
		as_free_head = aentry;
		as_list_len++;
		spin_unlock(&as_lock);
	} else {
		vunmap(addr);
	}
}

STATIC void
purge_addresses(void)
{
	a_list_t	*aentry, *old;

	if (as_free_head == NULL)
		return;

	spin_lock(&as_lock);
	aentry = as_free_head;
	as_free_head = NULL;
	as_list_len = 0;
	spin_unlock(&as_lock);

	while ((old = aentry) != NULL) {
		vunmap(aentry->vm_addr);
		aentry = aentry->next;
		kfree(old);
	}
}

/*
 * Internal xfs_buf_t object manipulation
 */

STATIC void
_xfs_buf_initialize(
	xfs_buf_t		*bp,
	xfs_buftarg_t		*target,
	xfs_off_t		range_base,
	size_t			range_length,
	xfs_buf_flags_t		flags)
{
	/*
	 * We don't want certain flags to appear in b_flags.
	 */
	flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);

	memset(bp, 0, sizeof(xfs_buf_t));
	atomic_set(&bp->b_hold, 1);
	init_MUTEX_LOCKED(&bp->b_iodonesema);
	INIT_LIST_HEAD(&bp->b_list);
	INIT_LIST_HEAD(&bp->b_hash_list);
	init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
	XB_SET_OWNER(bp);
	bp->b_target = target;
	bp->b_file_offset = range_base;
	/*
	 * Set buffer_length and count_desired to the same value initially.
	 * I/O routines should use count_desired, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	bp->b_buffer_length = bp->b_count_desired = range_length;
	bp->b_flags = flags;
	bp->b_bn = XFS_BUF_DADDR_NULL;
	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(xb_create);
	XB_TRACE(bp, "initialize", target);
}

/*
 * Allocate a page array capable of holding a specified number
 * of pages, and point the page buf at it.
 */
STATIC int
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
	int			page_count,
	xfs_buf_flags_t		flags)
{
	/* Make sure that we have a page list */
	if (bp->b_pages == NULL) {
		bp->b_offset = xfs_buf_poff(bp->b_file_offset);
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
		} else {
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
					page_count, xb_to_km(flags));
			if (bp->b_pages == NULL)
				return -ENOMEM;
		}
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 * Frees b_pages if it was allocated.
 */
STATIC void
_xfs_buf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->b_pages != bp->b_page_array) {
		kmem_free(bp->b_pages,
			  bp->b_page_count * sizeof(struct page *));
	}
}

/*
 * Releases the specified buffer.
 *
 * The modification state of any associated pages is left unchanged.
 * The buffer must not be on any hash - use xfs_buf_rele instead for
 * hashed and refcounted buffers.
 */
void
xfs_buf_free(
	xfs_buf_t		*bp)
{
	XB_TRACE(bp, "free", 0);

	ASSERT(list_empty(&bp->b_hash_list));

	if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
		uint		i;

		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
			free_address(bp->b_addr - bp->b_offset);

		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

			if (bp->b_flags & _XBF_PAGE_CACHE)
				ASSERT(!PagePrivate(page));
			page_cache_release(page);
		}
		_xfs_buf_free_pages(bp);
	}

	xfs_buf_deallocate(bp);
}

/*
 * Finds all pages for the buffer in question and builds its page list.
 */
STATIC int
_xfs_buf_lookup_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	struct address_space	*mapping = bp->b_target->bt_mapping;
	size_t			blocksize = bp->b_target->bt_bsize;
	size_t			size = bp->b_count_desired;
	size_t			nbytes, offset;
	gfp_t			gfp_mask = xb_to_gfp(flags);
	unsigned short		page_count, i;
	pgoff_t			first;
	xfs_off_t		end;
	int			error;

	end = bp->b_file_offset + bp->b_buffer_length;
	page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);

	error = _xfs_buf_get_pages(bp, page_count, flags);
	if (unlikely(error))
		return error;
	bp->b_flags |= _XBF_PAGE_CACHE;

	offset = bp->b_offset;
	first = bp->b_file_offset >> PAGE_CACHE_SHIFT;

	for (i = 0; i < bp->b_page_count; i++) {
		struct page	*page;
		uint		retries = 0;

	      retry:
		page = find_or_create_page(mapping, first + i, gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
				for (i = 0; i < bp->b_page_count; i++)
					unlock_page(bp->b_pages[i]);
				return -ENOMEM;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				printk(KERN_ERR
					"XFS: possible memory allocation "
					"deadlock in %s (mode:0x%x)\n",
					__FUNCTION__, gfp_mask);

			XFS_STATS_INC(xb_page_retries);
			xfsbufd_wakeup(0, gfp_mask);
			congestion_wait(WRITE, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(xb_page_found);

		nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
		size -= nbytes;

		ASSERT(!PagePrivate(page));
		if (!PageUptodate(page)) {
			page_count--;
			if (blocksize >= PAGE_CACHE_SIZE) {
				if (flags & XBF_READ)
					bp->b_locked = 1;
			} else if (!PagePrivate(page)) {
				if (test_page_region(page, offset, nbytes))
					page_count++;
			}
		}

		bp->b_pages[i] = page;
		offset = 0;
	}

	if (!bp->b_locked) {
		for (i = 0; i < bp->b_page_count; i++)
			unlock_page(bp->b_pages[i]);
	}

	if (page_count == bp->b_page_count)
		bp->b_flags |= XBF_DONE;

	XB_TRACE(bp, "lookup_pages", (long)page_count);
	return error;
}

/*
 * Map buffer into kernel address-space if necessary.
 */
STATIC int
_xfs_buf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	/* A single page buffer is always mappable */
	if (bp->b_page_count == 1) {
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
	} else if (flags & XBF_MAPPED) {
		if (as_list_len > 64)
			purge_addresses();
		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
					VM_MAP, PAGE_KERNEL);
		if (unlikely(bp->b_addr == NULL))
			return -ENOMEM;
		bp->b_addr += bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
	}

	return 0;
}

/*
 * Finding and Reading Buffers
 */

/*
 * Look up, and create if absent, a lockable buffer for
 * a given range of an inode.  The buffer is returned
 * locked.  If other overlapping buffers exist, they are
 * released before the new buffer is created and locked,
 * which may imply that this call will block until those buffers
 * are unlocked.  No I/O is implied by this call.
 */
xfs_buf_t *
_xfs_buf_find(
	xfs_buftarg_t		*btp,	/* block device target */
	xfs_off_t		ioff,	/* starting offset of range */
	size_t			isize,	/* length of range */
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
{
	xfs_off_t		range_base;
	size_t			range_length;
	xfs_bufhash_t		*hash;
	xfs_buf_t		*bp, *n;

	range_base = (ioff << BBSHIFT);
	range_length = (isize << BBSHIFT);
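	/*
	 * ioff and isize are counted in 512-byte basic blocks, so the
	 * BBSHIFT (== 9) shifts above convert them to a byte offset and
	 * a byte length for the cache lookup below.
	 */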

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(range_length < (1 << btp->bt_sshift)));
	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));

	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];

	spin_lock(&hash->bh_lock);

	list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
		ASSERT(btp == bp->b_target);
		if (bp->b_file_offset == range_base &&
		    bp->b_buffer_length == range_length) {
			/*
			 * If we look at something, bring it to the
			 * front of the list for next time.
			 */
			atomic_inc(&bp->b_hold);
			list_move(&bp->b_hash_list, &hash->bh_list);
			goto found;
		}
	}

	/* No match found */
	if (new_bp) {
		_xfs_buf_initialize(new_bp, btp, range_base,
				range_length, flags);
		new_bp->b_hash = hash;
		list_add(&new_bp->b_hash_list, &hash->bh_list);
	} else {
		XFS_STATS_INC(xb_miss_locked);
	}

	spin_unlock(&hash->bh_lock);
	return new_bp;

found:
	spin_unlock(&hash->bh_lock);

	/* Attempt to get the semaphore without sleeping,
	 * if this does not work then we need to drop the
	 * spinlock and do a hard attempt on the semaphore.
	 */
	if (down_trylock(&bp->b_sema)) {
		if (!(flags & XBF_TRYLOCK)) {
			/* wait for buffer ownership */
			XB_TRACE(bp, "get_lock", 0);
			xfs_buf_lock(bp);
			XFS_STATS_INC(xb_get_locked_waited);
		} else {
			/* We asked for a trylock and failed, no need
			 * to look at file offset and length here, we
			 * know that this buffer at least overlaps our
			 * buffer and is locked, therefore our buffer
			 * either does not exist, or is this buffer.
			 */
			xfs_buf_rele(bp);
			XFS_STATS_INC(xb_busy_locked);
			return NULL;
		}
	} else {
		/* trylock worked */
		XB_SET_OWNER(bp);
	}

	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		bp->b_flags &= XBF_MAPPED;
	}
	XB_TRACE(bp, "got_lock", 0);
	XFS_STATS_INC(xb_get_locked);
	return bp;
}

/*
 * Assembles a buffer covering the specified range.
 * Storage in memory for all portions of the buffer will be allocated,
 * although backing storage may not be.
 */
xfs_buf_t *
xfs_buf_get_flags(
	xfs_buftarg_t		*target,/* target for buffer */
	xfs_off_t		ioff,	/* starting offset of range */
	size_t			isize,	/* length of range */
	xfs_buf_flags_t		flags)
{
	xfs_buf_t		*bp, *new_bp;
	int			error = 0, i;

	new_bp = xfs_buf_allocate(flags);
	if (unlikely(!new_bp))
		return NULL;

	bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
	if (bp == new_bp) {
		error = _xfs_buf_lookup_pages(bp, flags);
		if (error)
			goto no_buffer;
	} else {
		xfs_buf_deallocate(new_bp);
		if (unlikely(bp == NULL))
			return NULL;
	}

	for (i = 0; i < bp->b_page_count; i++)
		mark_page_accessed(bp->b_pages[i]);

	if (!(bp->b_flags & XBF_MAPPED)) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			printk(KERN_WARNING "%s: failed to map pages\n",
					__FUNCTION__);
			goto no_buffer;
		}
	}

	XFS_STATS_INC(xb_get);

	/*
	 * Always fill in the block number now, the mapped cases can do
	 * their own overlay of this later.
	 */
	bp->b_bn = ioff;
	bp->b_count_desired = bp->b_buffer_length;

	XB_TRACE(bp, "get", (unsigned long)flags);
	return bp;

 no_buffer:
	if (flags & (XBF_LOCK | XBF_TRYLOCK))
		xfs_buf_unlock(bp);
	xfs_buf_rele(bp);
	return NULL;
}

xfs_buf_t *
xfs_buf_read_flags(
	xfs_buftarg_t		*target,
	xfs_off_t		ioff,
	size_t			isize,
	xfs_buf_flags_t		flags)
{
	xfs_buf_t		*bp;

	flags |= XBF_READ;

	bp = xfs_buf_get_flags(target, ioff, isize, flags);
	if (bp) {
		if (!XFS_BUF_ISDONE(bp)) {
			XB_TRACE(bp, "read", (unsigned long)flags);
			XFS_STATS_INC(xb_get_read);
			xfs_buf_iostart(bp, flags);
		} else if (flags & XBF_ASYNC) {
			XB_TRACE(bp, "read_async", (unsigned long)flags);
			/*
			 * Read ahead call which is already satisfied,
			 * drop the buffer
			 */
			goto no_buffer;
		} else {
			XB_TRACE(bp, "read_done", (unsigned long)flags);
			/* We do not want read in the flags */
			bp->b_flags &= ~XBF_READ;
		}
	}

	return bp;

 no_buffer:
	if (flags & (XBF_LOCK | XBF_TRYLOCK))
		xfs_buf_unlock(bp);
	xfs_buf_rele(bp);
	return NULL;
}

/*
 * If we are not low on memory then do the readahead in a
 * deadlock safe manner.
 */
void
xfs_buf_readahead(
	xfs_buftarg_t		*target,
	xfs_off_t		ioff,
	size_t			isize,
	xfs_buf_flags_t		flags)
{
	struct backing_dev_info *bdi;

	bdi = target->bt_mapping->backing_dev_info;
	if (bdi_read_congested(bdi))
		return;

	flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
	xfs_buf_read_flags(target, ioff, isize, flags);
}

xfs_buf_t *
xfs_buf_get_empty(
	size_t			len,
	xfs_buftarg_t		*target)
{
	xfs_buf_t		*bp;

	bp = xfs_buf_allocate(0);
	if (bp)
		_xfs_buf_initialize(bp, target, 0, len, 0);
	return bp;
}

static inline struct page *
mem_to_page(
	void			*addr)
{
	if (((unsigned long)addr < VMALLOC_START) ||
	    ((unsigned long)addr >= VMALLOC_END)) {
		return virt_to_page(addr);
	} else {
		return vmalloc_to_page(addr);
	}
}

int
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
	int			page_count;

	pageaddr = (unsigned long)mem & PAGE_CACHE_MASK;
	offset = (unsigned long)mem - pageaddr;
	buflen = PAGE_CACHE_ALIGN(len + offset);
	page_count = buflen >> PAGE_CACHE_SHIFT;
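	/*
	 * For example, with 4 KB pages, mem sitting 0x200 bytes into a
	 * page and len = 7000: pageaddr is the page-aligned base,
	 * offset = 512, buflen = PAGE_CACHE_ALIGN(7512) = 8192 and
	 * page_count = 2.
	 */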

	/* Free any previous set of page pointers */
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_addr = mem;

	rval = _xfs_buf_get_pages(bp, page_count, 0);
	if (rval)
		return rval;

	bp->b_offset = offset;

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
		pageaddr += PAGE_CACHE_SIZE;
	}
	bp->b_locked = 0;

	bp->b_count_desired = len;
	bp->b_buffer_length = buflen;
	bp->b_flags |= XBF_MAPPED;

	return 0;
}

xfs_buf_t *
xfs_buf_get_noaddr(
	size_t			len,
	xfs_buftarg_t		*target)
{
	unsigned long		page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
	int			error, i;
	xfs_buf_t		*bp;

	bp = xfs_buf_allocate(0);
	if (unlikely(bp == NULL))
		goto fail;
	_xfs_buf_initialize(bp, target, 0, len, 0);

	error = _xfs_buf_get_pages(bp, page_count, 0);
	if (error)
		goto fail_free_buf;

	for (i = 0; i < page_count; i++) {
		bp->b_pages[i] = alloc_page(GFP_KERNEL);
		if (!bp->b_pages[i])
			goto fail_free_mem;
	}
	bp->b_flags |= _XBF_PAGES;

	error = _xfs_buf_map_pages(bp, XBF_MAPPED);
	if (unlikely(error)) {
		printk(KERN_WARNING "%s: failed to map pages\n",
				__FUNCTION__);
		goto fail_free_mem;
	}

	xfs_buf_unlock(bp);

	XB_TRACE(bp, "no_daddr", len);
	return bp;

 fail_free_mem:
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
	_xfs_buf_free_pages(bp);
 fail_free_buf:
	xfs_buf_deallocate(bp);
 fail:
	return NULL;
}

/*
 * Increment reference count on buffer, to hold the buffer concurrently
 * with another thread which may release (free) the buffer asynchronously.
 * Must hold the buffer already to call this function.
 */
void
xfs_buf_hold(
	xfs_buf_t		*bp)
{
	atomic_inc(&bp->b_hold);
	XB_TRACE(bp, "hold", 0);
}

/*
 * Releases a hold on the specified buffer.  If the
 * hold count is 1, calls xfs_buf_free.
 */
void
xfs_buf_rele(
	xfs_buf_t		*bp)
{
	xfs_bufhash_t		*hash = bp->b_hash;

	XB_TRACE(bp, "rele", bp->b_relse);

	if (unlikely(!hash)) {
		ASSERT(!bp->b_relse);
		if (atomic_dec_and_test(&bp->b_hold))
			xfs_buf_free(bp);
		return;
	}

	if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
		if (bp->b_relse) {
			atomic_inc(&bp->b_hold);
			spin_unlock(&hash->bh_lock);
			(*(bp->b_relse)) (bp);
		} else if (bp->b_flags & XBF_FS_MANAGED) {
			spin_unlock(&hash->bh_lock);
		} else {
			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
			list_del_init(&bp->b_hash_list);
			spin_unlock(&hash->bh_lock);
			xfs_buf_free(bp);
		}
	} else {
		/*
		 * Catch reference count leaks
		 */
		ASSERT(atomic_read(&bp->b_hold) >= 0);
	}
}


/*
 * Mutual exclusion on buffers.  Locking model:
 *
 * Buffers associated with inodes for which buffer locking
 * is not enabled are not protected by semaphores, and are
 * assumed to be exclusively owned by the caller.  There is a
 * spinlock in the buffer, used by the caller when concurrent
 * access is possible.
 */

/*
 * Locks a buffer object, if it is not already locked.
 * Note that this in no way locks the underlying pages, so it is only
 * useful for synchronizing concurrent use of buffer objects, not for
 * synchronizing independent access to the underlying pages.
 */
int
xfs_buf_cond_lock(
	xfs_buf_t		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked) {
		XB_SET_OWNER(bp);
	}
	XB_TRACE(bp, "cond_lock", (long)locked);
	return locked ? 0 : -EBUSY;
}

#if defined(DEBUG) || defined(XFS_BLI_TRACE)
int
xfs_buf_lock_value(
	xfs_buf_t		*bp)
{
	return atomic_read(&bp->b_sema.count);
}
#endif

/*
 * Locks a buffer object.
 * Note that this in no way locks the underlying pages, so it is only
 * useful for synchronizing concurrent use of buffer objects, not for
 * synchronizing independent access to the underlying pages.
 */
void
xfs_buf_lock(
	xfs_buf_t		*bp)
{
	XB_TRACE(bp, "lock", 0);
	if (atomic_read(&bp->b_io_remaining))
		blk_run_address_space(bp->b_target->bt_mapping);
	down(&bp->b_sema);
	XB_SET_OWNER(bp);
	XB_TRACE(bp, "locked", 0);
}

/*
 * Releases the lock on the buffer object.
 * If the buffer is marked delwri but is not queued, do so before we
 * unlock the buffer as we need to set flags correctly.  We also need to
 * take a reference for the delwri queue because the unlocker is going to
 * drop theirs and they don't know we just queued it.
 */
void
xfs_buf_unlock(
	xfs_buf_t		*bp)
{
	if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
		atomic_inc(&bp->b_hold);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_delwri_queue(bp, 0);
	}

	XB_CLEAR_OWNER(bp);
	up(&bp->b_sema);
	XB_TRACE(bp, "unlock", 0);
}


/*
 * Pinning Buffer Storage in Memory
 * Ensure that no attempt to force a buffer to disk will succeed.
 */
void
xfs_buf_pin(
	xfs_buf_t		*bp)
{
	atomic_inc(&bp->b_pin_count);
	XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter);
}

void
xfs_buf_unpin(
	xfs_buf_t		*bp)
{
	if (atomic_dec_and_test(&bp->b_pin_count))
		wake_up_all(&bp->b_waiters);
	XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter);
}

int
xfs_buf_ispin(
	xfs_buf_t		*bp)
{
	return atomic_read(&bp->b_pin_count);
}

STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		if (atomic_read(&bp->b_io_remaining))
			blk_run_address_space(bp->b_target->bt_mapping);
		schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 * Buffer Utility Routines
 */

STATIC void
xfs_buf_iodone_work(
	struct work_struct	*work)
{
	xfs_buf_t		*bp =
		container_of(work, xfs_buf_t, b_iodone_work);

	/*
	 * We can get an EOPNOTSUPP to ordered writes.  Here we clear the
	 * ordered flag and reissue them.  Because we can't tell the higher
	 * layers directly that they should not issue ordered I/O anymore, they
	 * need to check if the ordered flag was cleared during I/O completion.
	 */
	if ((bp->b_error == EOPNOTSUPP) &&
	    (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
		XB_TRACE(bp, "ordered_retry", bp->b_iodone);
		bp->b_flags &= ~XBF_ORDERED;
		xfs_buf_iorequest(bp);
	} else if (bp->b_iodone)
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
}

void
xfs_buf_ioend(
	xfs_buf_t		*bp,
	int			schedule)
{
	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
	if (bp->b_error == 0)
		bp->b_flags |= XBF_DONE;

	XB_TRACE(bp, "iodone", bp->b_iodone);

	if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
		if (schedule) {
			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
		} else {
			xfs_buf_iodone_work(&bp->b_iodone_work);
		}
	} else {
		up(&bp->b_iodonesema);
	}
}

void
xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error)
{
	ASSERT(error >= 0 && error <= 0xffff);
	bp->b_error = (unsigned short)error;
	XB_TRACE(bp, "ioerror", (unsigned long)error);
}

/*
 * Initiate I/O on a buffer, based on the flags supplied.
 * The b_iodone routine in the buffer supplied will only be called
 * when all of the subsidiary I/O requests, if any, have been completed.
 */
int
xfs_buf_iostart(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	int			status = 0;

	XB_TRACE(bp, "iostart", (unsigned long)flags);

	if (flags & XBF_DELWRI) {
		bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
		bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
		xfs_buf_delwri_queue(bp, 1);
		return status;
	}

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
			XBF_READ_AHEAD | _XBF_RUN_QUEUES);
	bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \
			XBF_READ_AHEAD | _XBF_RUN_QUEUES);

	BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL);

	/* For writes allow an alternate strategy routine to precede
	 * the actual I/O request (which may not be issued at all in
	 * a shutdown situation, for example).
	 */
	status = (flags & XBF_WRITE) ?
		xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp);

	/* Wait for I/O if we are not an async request.
	 * Note: async I/O request completion will release the buffer,
	 * and that can already be done by this point.  So using the
	 * buffer pointer from here on, after async I/O, is invalid.
	 */
	if (!status && !(flags & XBF_ASYNC))
		status = xfs_buf_iowait(bp);

	return status;
}

STATIC_INLINE int
_xfs_buf_iolocked(
	xfs_buf_t		*bp)
{
	ASSERT(bp->b_flags & (XBF_READ | XBF_WRITE));
	if (bp->b_flags & XBF_READ)
		return bp->b_locked;
	return 0;
}

STATIC_INLINE void
_xfs_buf_ioend(
	xfs_buf_t		*bp,
	int			schedule)
{
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
		bp->b_locked = 0;
		xfs_buf_ioend(bp, schedule);
	}
}

STATIC void
xfs_buf_bio_end_io(
	struct bio		*bio,
	int			error)
{
	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;
	unsigned int		blocksize = bp->b_target->bt_bsize;
	struct bio_vec		*bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		bp->b_error = EIO;

	do {
		struct page	*page = bvec->bv_page;

		ASSERT(!PagePrivate(page));
		if (unlikely(bp->b_error)) {
			if (bp->b_flags & XBF_READ)
				ClearPageUptodate(page);
		} else if (blocksize >= PAGE_CACHE_SIZE) {
			SetPageUptodate(page);
		} else if (!PagePrivate(page) &&
				(bp->b_flags & _XBF_PAGE_CACHE)) {
			set_page_region(page, bvec->bv_offset, bvec->bv_len);
		}

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (_xfs_buf_iolocked(bp)) {
			unlock_page(page);
		}
	} while (bvec >= bio->bi_io_vec);

	_xfs_buf_ioend(bp, 1);
	bio_put(bio);
}

STATIC void
_xfs_buf_ioapply(
	xfs_buf_t		*bp)
{
	int			i, rw, map_i, total_nr_pages, nr_pages;
	struct bio		*bio;
	int			offset = bp->b_offset;
	int			size = bp->b_count_desired;
	sector_t		sector = bp->b_bn;
	unsigned int		blocksize = bp->b_target->bt_bsize;
	int			locking = _xfs_buf_iolocked(bp);

	total_nr_pages = bp->b_page_count;
	map_i = 0;

	if (bp->b_flags & XBF_ORDERED) {
		ASSERT(!(bp->b_flags & XBF_READ));
		rw = WRITE_BARRIER;
	} else if (bp->b_flags & _XBF_RUN_QUEUES) {
		ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
		bp->b_flags &= ~_XBF_RUN_QUEUES;
		rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
	} else {
		rw = (bp->b_flags & XBF_WRITE) ? WRITE :
		     (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
	}

	/* Special code path for reading a sub page size buffer in --
	 * we populate up the whole page, and hence the other metadata
	 * in the same page.  This optimization is only valid when the
	 * filesystem block size is not smaller than the page size.
	 */
	if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
	    (bp->b_flags & XBF_READ) && locking &&
	    (blocksize >= PAGE_CACHE_SIZE)) {
		bio = bio_alloc(GFP_NOIO, 1);

		bio->bi_bdev = bp->b_target->bt_bdev;
		bio->bi_sector = sector - (offset >> BBSHIFT);
		bio->bi_end_io = xfs_buf_bio_end_io;
		bio->bi_private = bp;

		bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
		size = 0;

		atomic_inc(&bp->b_io_remaining);

		goto submit_io;
	}

	/* Lock down the pages which we need to for the request */
	if (locking && (bp->b_flags & XBF_WRITE) && (bp->b_locked == 0)) {
		for (i = 0; size; i++) {
			int		nbytes = PAGE_CACHE_SIZE - offset;
			struct page	*page = bp->b_pages[i];

			if (nbytes > size)
				nbytes = size;

			lock_page(page);

			size -= nbytes;
			offset = 0;
		}
		offset = bp->b_offset;
		size = bp->b_count_desired;
	}

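	/*
	 * Each chunk built below becomes one bio.  BIO_MAX_SECTORS >>
	 * (PAGE_SHIFT - BBSHIFT) is the bio's page capacity; with 4 KB
	 * pages that works out to BIO_MAX_PAGES (256) pages, roughly
	 * 1 MB per bio, so larger buffers are split across several bios
	 * by looping back to next_chunk until size is exhausted.
	 */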
next_chunk:
	atomic_inc(&bp->b_io_remaining);
	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
	if (nr_pages > total_nr_pages)
		nr_pages = total_nr_pages;

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio->bi_bdev = bp->b_target->bt_bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;

	for (; size && nr_pages; nr_pages--, map_i++) {
		int	rbytes, nbytes = PAGE_CACHE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
		if (rbytes < nbytes)
			break;

		offset = 0;
		sector += nbytes >> BBSHIFT;
		size -= nbytes;
		total_nr_pages--;
	}

submit_io:
	if (likely(bio->bi_size)) {
		submit_bio(rw, bio);
		if (size)
			goto next_chunk;
	} else {
		bio_put(bio);
		xfs_buf_ioerror(bp, EIO);
	}
}

int
xfs_buf_iorequest(
	xfs_buf_t		*bp)
{
	XB_TRACE(bp, "iorequest", 0);

	if (bp->b_flags & XBF_DELWRI) {
		xfs_buf_delwri_queue(bp, 1);
		return 0;
	}

	if (bp->b_flags & XBF_WRITE) {
		xfs_buf_wait_unpin(bp);
	}

	xfs_buf_hold(bp);

	/* Set the count to 1 initially, this will stop an I/O
	 * completion callout which happens before we have started
	 * all the I/O from calling xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	_xfs_buf_ioapply(bp);
	_xfs_buf_ioend(bp, 0);

	xfs_buf_rele(bp);
	return 0;
}

/*
 * Waits for I/O to complete on the buffer supplied.
 * It returns immediately if no I/O is pending.
 * It returns the I/O error code, if any, or 0 if there was no error.
 */
int
xfs_buf_iowait(
	xfs_buf_t		*bp)
{
	XB_TRACE(bp, "iowait", 0);
	if (atomic_read(&bp->b_io_remaining))
		blk_run_address_space(bp->b_target->bt_mapping);
	down(&bp->b_iodonesema);
	XB_TRACE(bp, "iowaited", (long)bp->b_error);
	return bp->b_error;
}

xfs_caddr_t
xfs_buf_offset(
	xfs_buf_t		*bp,
	size_t			offset)
{
	struct page		*page;

	if (bp->b_flags & XBF_MAPPED)
		return XFS_BUF_PTR(bp) + offset;

	offset += bp->b_offset;
	page = bp->b_pages[offset >> PAGE_CACHE_SHIFT];
	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1));
}

/*
 * Move data into or out of a buffer.
 */
void
xfs_buf_iomove(
	xfs_buf_t		*bp,	/* buffer to process */
	size_t			boff,	/* starting buffer offset */
	size_t			bsize,	/* length to copy */
	caddr_t			data,	/* data address */
	xfs_buf_rw_t		mode)	/* read/write/zero flag */
{
	size_t			bend, cpoff, csize;
	struct page		*page;

	bend = boff + bsize;
	while (boff < bend) {
		page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
		cpoff = xfs_buf_poff(boff + bp->b_offset);
		csize = min_t(size_t,
			      PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff);

		ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));

		switch (mode) {
		case XBRW_ZERO:
			memset(page_address(page) + cpoff, 0, csize);
			break;
		case XBRW_READ:
			memcpy(data, page_address(page) + cpoff, csize);
			break;
		case XBRW_WRITE:
			memcpy(page_address(page) + cpoff, data, csize);
		}

		boff += csize;
		data += csize;
	}
}

/*
 * Handling of buffer targets (buftargs).
 */

/*
 * Wait for any bufs with callbacks that have been submitted but
 * have not yet returned... walk the hash list for the target.
 */
void
xfs_wait_buftarg(
	xfs_buftarg_t	*btp)
{
	xfs_buf_t	*bp, *n;
	xfs_bufhash_t	*hash;
	uint		i;

	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
		hash = &btp->bt_hash[i];
again:
		spin_lock(&hash->bh_lock);
		list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
			ASSERT(btp == bp->b_target);
			if (!(bp->b_flags & XBF_FS_MANAGED)) {
				spin_unlock(&hash->bh_lock);
				/*
				 * Catch superblock reference count leaks
				 * immediately
				 */
				BUG_ON(bp->b_bn == 0);
				delay(100);
				goto again;
			}
		}
		spin_unlock(&hash->bh_lock);
	}
}

/*
 * Allocate buffer hash table for a given target.
 * For devices containing metadata (i.e. not the log/realtime devices)
 * we need to allocate a much larger hash table.
 */
STATIC void
xfs_alloc_bufhash(
	xfs_buftarg_t		*btp,
	int			external)
{
	unsigned int		i;

	btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */
	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
					sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE);
	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
		spin_lock_init(&btp->bt_hash[i].bh_lock);
		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
	}
}

STATIC void
xfs_free_bufhash(
	xfs_buftarg_t		*btp)
{
	kmem_free(btp->bt_hash, (1<<btp->bt_hashshift) * sizeof(xfs_bufhash_t));
	btp->bt_hash = NULL;
}

/*
 * buftarg list for delwrite queue processing
 */
static LIST_HEAD(xfs_buftarg_list);
static DEFINE_SPINLOCK(xfs_buftarg_lock);

STATIC void
xfs_register_buftarg(
	xfs_buftarg_t	*btp)
{
	spin_lock(&xfs_buftarg_lock);
	list_add(&btp->bt_list, &xfs_buftarg_list);
	spin_unlock(&xfs_buftarg_lock);
}

STATIC void
xfs_unregister_buftarg(
	xfs_buftarg_t	*btp)
{
	spin_lock(&xfs_buftarg_lock);
	list_del(&btp->bt_list);
	spin_unlock(&xfs_buftarg_lock);
}

void
xfs_free_buftarg(
	xfs_buftarg_t		*btp,
	int			external)
{
	xfs_flush_buftarg(btp, 1);
	xfs_blkdev_issue_flush(btp);
	if (external)
		xfs_blkdev_put(btp->bt_bdev);
	xfs_free_bufhash(btp);
	iput(btp->bt_mapping->host);

	/* Unregister the buftarg first so that we don't get a
	 * wakeup finding a non-existent task
	 */
	xfs_unregister_buftarg(btp);
	kthread_stop(btp->bt_task);

	kmem_free(btp, sizeof(*btp));
}

STATIC int
xfs_setsize_buftarg_flags(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize,
	int			verbose)
{
	btp->bt_bsize = blocksize;
	btp->bt_sshift = ffs(sectorsize) - 1;
	btp->bt_smask = sectorsize - 1;

	if (set_blocksize(btp->bt_bdev, sectorsize)) {
		printk(KERN_WARNING
			"XFS: Cannot set_blocksize to %u on device %s\n",
			sectorsize, XFS_BUFTARG_NAME(btp));
		return EINVAL;
	}

	if (verbose &&
	    (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
		printk(KERN_WARNING
			"XFS: %u byte sectors in use on device %s.  "
			"This is suboptimal; %u or greater is ideal.\n",
			sectorsize, XFS_BUFTARG_NAME(btp),
			(unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
	}

	return 0;
}

/*
 * When allocating the initial buffer target we have not yet
 * read in the superblock, so we don't know what size sectors
 * are being used at this early stage.  Play safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg_flags(btp,
			PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
}

int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize)
{
	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
}

STATIC int
xfs_mapping_buftarg(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	struct backing_dev_info	*bdi;
	struct inode		*inode;
	struct address_space	*mapping;
	static const struct address_space_operations mapping_aops = {
		.sync_page = block_sync_page,
		.migratepage = fail_migrate_page,
	};

	inode = new_inode(bdev->bd_inode->i_sb);
	if (!inode) {
		printk(KERN_WARNING
			"XFS: Cannot allocate mapping inode for device %s\n",
			XFS_BUFTARG_NAME(btp));
		return ENOMEM;
	}
	inode->i_mode = S_IFBLK;
	inode->i_bdev = bdev;
	inode->i_rdev = bdev->bd_dev;
	bdi = blk_get_backing_dev_info(bdev);
	if (!bdi)
		bdi = &default_backing_dev_info;
	mapping = &inode->i_data;
	mapping->a_ops = &mapping_aops;
	mapping->backing_dev_info = bdi;
	mapping_set_gfp_mask(mapping, GFP_NOFS);
	btp->bt_mapping = mapping;
	return 0;
}

STATIC int
xfs_alloc_delwrite_queue(
	xfs_buftarg_t		*btp)
{
	int	error = 0;

	INIT_LIST_HEAD(&btp->bt_list);
	INIT_LIST_HEAD(&btp->bt_delwrite_queue);
	spinlock_init(&btp->bt_delwrite_lock, "delwri_lock");
	btp->bt_flags = 0;
	btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd");
	if (IS_ERR(btp->bt_task)) {
		error = PTR_ERR(btp->bt_task);
		goto out_error;
	}
	xfs_register_buftarg(btp);
out_error:
	return error;
}

xfs_buftarg_t *
xfs_alloc_buftarg(
	struct block_device	*bdev,
	int			external)
{
	xfs_buftarg_t		*btp;

	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);

	btp->bt_dev = bdev->bd_dev;
	btp->bt_bdev = bdev;
	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error;
	if (xfs_mapping_buftarg(btp, bdev))
		goto error;
	if (xfs_alloc_delwrite_queue(btp))
		goto error;
	xfs_alloc_bufhash(btp, external);
	return btp;

error:
	kmem_free(btp, sizeof(*btp));
	return NULL;
}


/*
 * Delayed write buffer handling
 */
STATIC void
xfs_buf_delwri_queue(
	xfs_buf_t		*bp,
	int			unlock)
{
	struct list_head	*dwq = &bp->b_target->bt_delwrite_queue;
	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;

	XB_TRACE(bp, "delwri_q", (long)unlock);
	ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));

	spin_lock(dwlk);
	/* If already in the queue, dequeue and place at tail */
	if (!list_empty(&bp->b_list)) {
		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
		if (unlock)
			atomic_dec(&bp->b_hold);
		list_del(&bp->b_list);
	}

	bp->b_flags |= _XBF_DELWRI_Q;
	list_add_tail(&bp->b_list, dwq);
	bp->b_queuetime = jiffies;
	spin_unlock(dwlk);

	if (unlock)
		xfs_buf_unlock(bp);
}

void
xfs_buf_delwri_dequeue(
	xfs_buf_t		*bp)
{
	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
	int			dequeued = 0;

	spin_lock(dwlk);
	if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
		list_del_init(&bp->b_list);
		dequeued = 1;
	}
	bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
	spin_unlock(dwlk);

	if (dequeued)
		xfs_buf_rele(bp);

	XB_TRACE(bp, "delwri_dq", (long)dequeued);
}

STATIC void
xfs_buf_runall_queues(
	struct workqueue_struct	*queue)
{
	flush_workqueue(queue);
}

STATIC int
xfsbufd_wakeup(
	int			priority,
	gfp_t			mask)
{
	xfs_buftarg_t		*btp;

	spin_lock(&xfs_buftarg_lock);
	list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
		if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
			continue;
		set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
		wake_up_process(btp->bt_task);
	}
	spin_unlock(&xfs_buftarg_lock);
	return 0;
}

/*
 * Move as many buffers as specified to the supplied list,
 * indicating if we skipped any buffers to prevent deadlocks.
 */
STATIC int
xfs_buf_delwri_split(
	xfs_buftarg_t	*target,
	struct list_head *list,
	unsigned long	age)
{
	xfs_buf_t	*bp, *n;
	struct list_head *dwq = &target->bt_delwrite_queue;
	spinlock_t	*dwlk = &target->bt_delwrite_lock;
	int		skipped = 0;
	int		force;

	force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
	INIT_LIST_HEAD(list);
	spin_lock(dwlk);
	list_for_each_entry_safe(bp, n, dwq, b_list) {
		XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp));
		ASSERT(bp->b_flags & XBF_DELWRI);

		if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
			if (!force &&
			    time_before(jiffies, bp->b_queuetime + age)) {
				xfs_buf_unlock(bp);
				break;
			}

			bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|
					 _XBF_RUN_QUEUES);
			bp->b_flags |= XBF_WRITE;
			list_move_tail(&bp->b_list, list);
		} else
			skipped++;
	}
	spin_unlock(dwlk);

	return skipped;

}

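/*
 * xfsbufd wakes every xfs_buf_timer_centisecs (the xfsbufd_centisecs
 * sysctl) and pushes out delayed-write buffers that have been queued for
 * longer than xfs_buf_age_centisecs (the age_buffer_centisecs sysctl).
 * Both tunables are in hundredths of a second, hence the
 * msecs_to_jiffies(10) scaling below.
 */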
STATIC int
xfsbufd(
	void		*data)
{
	struct list_head tmp;
	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;
	int		count;
	xfs_buf_t	*bp;

	current->flags |= PF_MEMALLOC;

	set_freezable();

	do {
		if (unlikely(freezing(current))) {
			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
			refrigerator();
		} else {
			clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
		}

		schedule_timeout_interruptible(
			xfs_buf_timer_centisecs * msecs_to_jiffies(10));

		xfs_buf_delwri_split(target, &tmp,
				xfs_buf_age_centisecs * msecs_to_jiffies(10));

		count = 0;
		while (!list_empty(&tmp)) {
			bp = list_entry(tmp.next, xfs_buf_t, b_list);
			ASSERT(target == bp->b_target);

			list_del_init(&bp->b_list);
			xfs_buf_iostrategy(bp);
			count++;
		}

		if (as_list_len > 0)
			purge_addresses();
		if (count)
			blk_run_address_space(target->bt_mapping);

	} while (!kthread_should_stop());

	return 0;
}

/*
 * Go through all incore buffers, and release buffers if they belong to
 * the given device.  This is used in filesystem error handling to
 * preserve the consistency of its metadata.
 */
int
xfs_flush_buftarg(
	xfs_buftarg_t	*target,
	int		wait)
{
	struct list_head tmp;
	xfs_buf_t	*bp, *n;
	int		pincount = 0;

	xfs_buf_runall_queues(xfsdatad_workqueue);
	xfs_buf_runall_queues(xfslogd_workqueue);

	set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
	pincount = xfs_buf_delwri_split(target, &tmp, 0);

	/*
	 * Dropped the delayed write list lock, now walk the temporary list
	 */
	list_for_each_entry_safe(bp, n, &tmp, b_list) {
		ASSERT(target == bp->b_target);
		if (wait)
			bp->b_flags &= ~XBF_ASYNC;
		else
			list_del_init(&bp->b_list);

		xfs_buf_iostrategy(bp);
	}

	if (wait)
		blk_run_address_space(target->bt_mapping);

	/*
	 * Remaining list items must be flushed before returning
	 */
	while (!list_empty(&tmp)) {
		bp = list_entry(tmp.next, xfs_buf_t, b_list);

		list_del_init(&bp->b_list);
		xfs_iowait(bp);
		xfs_buf_relse(bp);
	}

	return pincount;
}

int __init
xfs_buf_init(void)
{
#ifdef XFS_BUF_TRACE
	xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP);
#endif

	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
						KM_ZONE_HWALIGN, NULL);
	if (!xfs_buf_zone)
		goto out_free_trace_buf;

	xfslogd_workqueue = create_workqueue("xfslogd");
	if (!xfslogd_workqueue)
		goto out_free_buf_zone;

	xfsdatad_workqueue = create_workqueue("xfsdatad");
	if (!xfsdatad_workqueue)
		goto out_destroy_xfslogd_workqueue;

	register_shrinker(&xfs_buf_shake);
	return 0;

 out_destroy_xfslogd_workqueue:
	destroy_workqueue(xfslogd_workqueue);
 out_free_buf_zone:
	kmem_zone_destroy(xfs_buf_zone);
 out_free_trace_buf:
#ifdef XFS_BUF_TRACE
	ktrace_free(xfs_buf_trace_buf);
#endif
	return -ENOMEM;
}

void
xfs_buf_terminate(void)
{
	unregister_shrinker(&xfs_buf_shake);
	destroy_workqueue(xfsdatad_workqueue);
	destroy_workqueue(xfslogd_workqueue);
	kmem_zone_destroy(xfs_buf_zone);
#ifdef XFS_BUF_TRACE
	ktrace_free(xfs_buf_trace_buf);
#endif
}

#ifdef CONFIG_KDB_MODULES
struct list_head *
xfs_get_buftarg_list(void)
{
	return &xfs_buftarg_list;
}
#endif