Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v2.6.23-rc2, 1870 lines, 42 kB
1/* 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18#include "xfs.h" 19#include <linux/stddef.h> 20#include <linux/errno.h> 21#include <linux/slab.h> 22#include <linux/pagemap.h> 23#include <linux/init.h> 24#include <linux/vmalloc.h> 25#include <linux/bio.h> 26#include <linux/sysctl.h> 27#include <linux/proc_fs.h> 28#include <linux/workqueue.h> 29#include <linux/percpu.h> 30#include <linux/blkdev.h> 31#include <linux/hash.h> 32#include <linux/kthread.h> 33#include <linux/migrate.h> 34#include <linux/backing-dev.h> 35#include <linux/freezer.h> 36 37static kmem_zone_t *xfs_buf_zone; 38STATIC int xfsbufd(void *); 39STATIC int xfsbufd_wakeup(int, gfp_t); 40STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 41static struct shrinker xfs_buf_shake = { 42 .shrink = xfsbufd_wakeup, 43 .seeks = DEFAULT_SEEKS, 44}; 45 46static struct workqueue_struct *xfslogd_workqueue; 47struct workqueue_struct *xfsdatad_workqueue; 48 49#ifdef XFS_BUF_TRACE 50void 51xfs_buf_trace( 52 xfs_buf_t *bp, 53 char *id, 54 void *data, 55 void *ra) 56{ 57 ktrace_enter(xfs_buf_trace_buf, 58 bp, id, 59 (void *)(unsigned long)bp->b_flags, 60 (void *)(unsigned long)bp->b_hold.counter, 61 (void *)(unsigned long)bp->b_sema.count.counter, 62 (void *)current, 63 data, ra, 64 (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), 65 (void *)(unsigned long)(bp->b_file_offset & 0xffffffff), 66 (void *)(unsigned long)bp->b_buffer_length, 67 NULL, NULL, NULL, NULL, NULL); 68} 69ktrace_t *xfs_buf_trace_buf; 70#define XFS_BUF_TRACE_SIZE 4096 71#define XB_TRACE(bp, id, data) \ 72 xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0)) 73#else 74#define XB_TRACE(bp, id, data) do { } while (0) 75#endif 76 77#ifdef XFS_BUF_LOCK_TRACKING 78# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) 79# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) 80# define XB_GET_OWNER(bp) ((bp)->b_last_holder) 81#else 82# define XB_SET_OWNER(bp) do { } while (0) 83# define XB_CLEAR_OWNER(bp) do { } while (0) 84# define XB_GET_OWNER(bp) do { } while (0) 85#endif 86 87#define xb_to_gfp(flags) \ 88 ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ 89 ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) 90 91#define xb_to_km(flags) \ 92 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) 93 94#define xfs_buf_allocate(flags) \ 95 kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) 96#define xfs_buf_deallocate(bp) \ 97 kmem_zone_free(xfs_buf_zone, (bp)); 98 99/* 100 * Page Region interfaces. 101 * 102 * For pages in filesystems where the blocksize is smaller than the 103 * pagesize, we use the page->private field (long) to hold a bitmap 104 * of uptodate regions within the page. 105 * 106 * Each such region is "bytes per page / bits per long" bytes long. 
107 * 108 * NBPPR == number-of-bytes-per-page-region 109 * BTOPR == bytes-to-page-region (rounded up) 110 * BTOPRT == bytes-to-page-region-truncated (rounded down) 111 */ 112#if (BITS_PER_LONG == 32) 113#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ 114#elif (BITS_PER_LONG == 64) 115#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */ 116#else 117#error BITS_PER_LONG must be 32 or 64 118#endif 119#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG) 120#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT) 121#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT)) 122 123STATIC unsigned long 124page_region_mask( 125 size_t offset, 126 size_t length) 127{ 128 unsigned long mask; 129 int first, final; 130 131 first = BTOPR(offset); 132 final = BTOPRT(offset + length - 1); 133 first = min(first, final); 134 135 mask = ~0UL; 136 mask <<= BITS_PER_LONG - (final - first); 137 mask >>= BITS_PER_LONG - (final); 138 139 ASSERT(offset + length <= PAGE_CACHE_SIZE); 140 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0); 141 142 return mask; 143} 144 145STATIC_INLINE void 146set_page_region( 147 struct page *page, 148 size_t offset, 149 size_t length) 150{ 151 set_page_private(page, 152 page_private(page) | page_region_mask(offset, length)); 153 if (page_private(page) == ~0UL) 154 SetPageUptodate(page); 155} 156 157STATIC_INLINE int 158test_page_region( 159 struct page *page, 160 size_t offset, 161 size_t length) 162{ 163 unsigned long mask = page_region_mask(offset, length); 164 165 return (mask && (page_private(page) & mask) == mask); 166} 167 168/* 169 * Mapping of multi-page buffers into contiguous virtual space 170 */ 171 172typedef struct a_list { 173 void *vm_addr; 174 struct a_list *next; 175} a_list_t; 176 177static a_list_t *as_free_head; 178static int as_list_len; 179static DEFINE_SPINLOCK(as_lock); 180 181/* 182 * Try to batch vunmaps because they are costly. 183 */ 184STATIC void 185free_address( 186 void *addr) 187{ 188 a_list_t *aentry; 189 190 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); 191 if (likely(aentry)) { 192 spin_lock(&as_lock); 193 aentry->next = as_free_head; 194 aentry->vm_addr = addr; 195 as_free_head = aentry; 196 as_list_len++; 197 spin_unlock(&as_lock); 198 } else { 199 vunmap(addr); 200 } 201} 202 203STATIC void 204purge_addresses(void) 205{ 206 a_list_t *aentry, *old; 207 208 if (as_free_head == NULL) 209 return; 210 211 spin_lock(&as_lock); 212 aentry = as_free_head; 213 as_free_head = NULL; 214 as_list_len = 0; 215 spin_unlock(&as_lock); 216 217 while ((old = aentry) != NULL) { 218 vunmap(aentry->vm_addr); 219 aentry = aentry->next; 220 kfree(old); 221 } 222} 223 224/* 225 * Internal xfs_buf_t object manipulation 226 */ 227 228STATIC void 229_xfs_buf_initialize( 230 xfs_buf_t *bp, 231 xfs_buftarg_t *target, 232 xfs_off_t range_base, 233 size_t range_length, 234 xfs_buf_flags_t flags) 235{ 236 /* 237 * We don't want certain flags to appear in b_flags. 238 */ 239 flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); 240 241 memset(bp, 0, sizeof(xfs_buf_t)); 242 atomic_set(&bp->b_hold, 1); 243 init_MUTEX_LOCKED(&bp->b_iodonesema); 244 INIT_LIST_HEAD(&bp->b_list); 245 INIT_LIST_HEAD(&bp->b_hash_list); 246 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ 247 XB_SET_OWNER(bp); 248 bp->b_target = target; 249 bp->b_file_offset = range_base; 250 /* 251 * Set buffer_length and count_desired to the same value initially. 252 * I/O routines should use count_desired, which will be the same in 253 * most cases but may be reset (e.g. 
XFS recovery). 254 */ 255 bp->b_buffer_length = bp->b_count_desired = range_length; 256 bp->b_flags = flags; 257 bp->b_bn = XFS_BUF_DADDR_NULL; 258 atomic_set(&bp->b_pin_count, 0); 259 init_waitqueue_head(&bp->b_waiters); 260 261 XFS_STATS_INC(xb_create); 262 XB_TRACE(bp, "initialize", target); 263} 264 265/* 266 * Allocate a page array capable of holding a specified number 267 * of pages, and point the page buf at it. 268 */ 269STATIC int 270_xfs_buf_get_pages( 271 xfs_buf_t *bp, 272 int page_count, 273 xfs_buf_flags_t flags) 274{ 275 /* Make sure that we have a page list */ 276 if (bp->b_pages == NULL) { 277 bp->b_offset = xfs_buf_poff(bp->b_file_offset); 278 bp->b_page_count = page_count; 279 if (page_count <= XB_PAGES) { 280 bp->b_pages = bp->b_page_array; 281 } else { 282 bp->b_pages = kmem_alloc(sizeof(struct page *) * 283 page_count, xb_to_km(flags)); 284 if (bp->b_pages == NULL) 285 return -ENOMEM; 286 } 287 memset(bp->b_pages, 0, sizeof(struct page *) * page_count); 288 } 289 return 0; 290} 291 292/* 293 * Frees b_pages if it was allocated. 294 */ 295STATIC void 296_xfs_buf_free_pages( 297 xfs_buf_t *bp) 298{ 299 if (bp->b_pages != bp->b_page_array) { 300 kmem_free(bp->b_pages, 301 bp->b_page_count * sizeof(struct page *)); 302 } 303} 304 305/* 306 * Releases the specified buffer. 307 * 308 * The modification state of any associated pages is left unchanged. 309 * The buffer most not be on any hash - use xfs_buf_rele instead for 310 * hashed and refcounted buffers 311 */ 312void 313xfs_buf_free( 314 xfs_buf_t *bp) 315{ 316 XB_TRACE(bp, "free", 0); 317 318 ASSERT(list_empty(&bp->b_hash_list)); 319 320 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 321 uint i; 322 323 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 324 free_address(bp->b_addr - bp->b_offset); 325 326 for (i = 0; i < bp->b_page_count; i++) { 327 struct page *page = bp->b_pages[i]; 328 329 if (bp->b_flags & _XBF_PAGE_CACHE) 330 ASSERT(!PagePrivate(page)); 331 page_cache_release(page); 332 } 333 _xfs_buf_free_pages(bp); 334 } 335 336 xfs_buf_deallocate(bp); 337} 338 339/* 340 * Finds all pages for buffer in question and builds it's page list. 341 */ 342STATIC int 343_xfs_buf_lookup_pages( 344 xfs_buf_t *bp, 345 uint flags) 346{ 347 struct address_space *mapping = bp->b_target->bt_mapping; 348 size_t blocksize = bp->b_target->bt_bsize; 349 size_t size = bp->b_count_desired; 350 size_t nbytes, offset; 351 gfp_t gfp_mask = xb_to_gfp(flags); 352 unsigned short page_count, i; 353 pgoff_t first; 354 xfs_off_t end; 355 int error; 356 357 end = bp->b_file_offset + bp->b_buffer_length; 358 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 359 360 error = _xfs_buf_get_pages(bp, page_count, flags); 361 if (unlikely(error)) 362 return error; 363 bp->b_flags |= _XBF_PAGE_CACHE; 364 365 offset = bp->b_offset; 366 first = bp->b_file_offset >> PAGE_CACHE_SHIFT; 367 368 for (i = 0; i < bp->b_page_count; i++) { 369 struct page *page; 370 uint retries = 0; 371 372 retry: 373 page = find_or_create_page(mapping, first + i, gfp_mask); 374 if (unlikely(page == NULL)) { 375 if (flags & XBF_READ_AHEAD) { 376 bp->b_page_count = i; 377 for (i = 0; i < bp->b_page_count; i++) 378 unlock_page(bp->b_pages[i]); 379 return -ENOMEM; 380 } 381 382 /* 383 * This could deadlock. 384 * 385 * But until all the XFS lowlevel code is revamped to 386 * handle buffer allocation failures we can't do much. 
387 */ 388 if (!(++retries % 100)) 389 printk(KERN_ERR 390 "XFS: possible memory allocation " 391 "deadlock in %s (mode:0x%x)\n", 392 __FUNCTION__, gfp_mask); 393 394 XFS_STATS_INC(xb_page_retries); 395 xfsbufd_wakeup(0, gfp_mask); 396 congestion_wait(WRITE, HZ/50); 397 goto retry; 398 } 399 400 XFS_STATS_INC(xb_page_found); 401 402 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 403 size -= nbytes; 404 405 ASSERT(!PagePrivate(page)); 406 if (!PageUptodate(page)) { 407 page_count--; 408 if (blocksize >= PAGE_CACHE_SIZE) { 409 if (flags & XBF_READ) 410 bp->b_locked = 1; 411 } else if (!PagePrivate(page)) { 412 if (test_page_region(page, offset, nbytes)) 413 page_count++; 414 } 415 } 416 417 bp->b_pages[i] = page; 418 offset = 0; 419 } 420 421 if (!bp->b_locked) { 422 for (i = 0; i < bp->b_page_count; i++) 423 unlock_page(bp->b_pages[i]); 424 } 425 426 if (page_count == bp->b_page_count) 427 bp->b_flags |= XBF_DONE; 428 429 XB_TRACE(bp, "lookup_pages", (long)page_count); 430 return error; 431} 432 433/* 434 * Map buffer into kernel address-space if nessecary. 435 */ 436STATIC int 437_xfs_buf_map_pages( 438 xfs_buf_t *bp, 439 uint flags) 440{ 441 /* A single page buffer is always mappable */ 442 if (bp->b_page_count == 1) { 443 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 444 bp->b_flags |= XBF_MAPPED; 445 } else if (flags & XBF_MAPPED) { 446 if (as_list_len > 64) 447 purge_addresses(); 448 bp->b_addr = vmap(bp->b_pages, bp->b_page_count, 449 VM_MAP, PAGE_KERNEL); 450 if (unlikely(bp->b_addr == NULL)) 451 return -ENOMEM; 452 bp->b_addr += bp->b_offset; 453 bp->b_flags |= XBF_MAPPED; 454 } 455 456 return 0; 457} 458 459/* 460 * Finding and Reading Buffers 461 */ 462 463/* 464 * Look up, and creates if absent, a lockable buffer for 465 * a given range of an inode. The buffer is returned 466 * locked. If other overlapping buffers exist, they are 467 * released before the new buffer is created and locked, 468 * which may imply that this call will block until those buffers 469 * are unlocked. No I/O is implied by this call. 470 */ 471xfs_buf_t * 472_xfs_buf_find( 473 xfs_buftarg_t *btp, /* block device target */ 474 xfs_off_t ioff, /* starting offset of range */ 475 size_t isize, /* length of range */ 476 xfs_buf_flags_t flags, 477 xfs_buf_t *new_bp) 478{ 479 xfs_off_t range_base; 480 size_t range_length; 481 xfs_bufhash_t *hash; 482 xfs_buf_t *bp, *n; 483 484 range_base = (ioff << BBSHIFT); 485 range_length = (isize << BBSHIFT); 486 487 /* Check for IOs smaller than the sector size / not sector aligned */ 488 ASSERT(!(range_length < (1 << btp->bt_sshift))); 489 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); 490 491 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; 492 493 spin_lock(&hash->bh_lock); 494 495 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 496 ASSERT(btp == bp->b_target); 497 if (bp->b_file_offset == range_base && 498 bp->b_buffer_length == range_length) { 499 /* 500 * If we look at something, bring it to the 501 * front of the list for next time. 
502 */ 503 atomic_inc(&bp->b_hold); 504 list_move(&bp->b_hash_list, &hash->bh_list); 505 goto found; 506 } 507 } 508 509 /* No match found */ 510 if (new_bp) { 511 _xfs_buf_initialize(new_bp, btp, range_base, 512 range_length, flags); 513 new_bp->b_hash = hash; 514 list_add(&new_bp->b_hash_list, &hash->bh_list); 515 } else { 516 XFS_STATS_INC(xb_miss_locked); 517 } 518 519 spin_unlock(&hash->bh_lock); 520 return new_bp; 521 522found: 523 spin_unlock(&hash->bh_lock); 524 525 /* Attempt to get the semaphore without sleeping, 526 * if this does not work then we need to drop the 527 * spinlock and do a hard attempt on the semaphore. 528 */ 529 if (down_trylock(&bp->b_sema)) { 530 if (!(flags & XBF_TRYLOCK)) { 531 /* wait for buffer ownership */ 532 XB_TRACE(bp, "get_lock", 0); 533 xfs_buf_lock(bp); 534 XFS_STATS_INC(xb_get_locked_waited); 535 } else { 536 /* We asked for a trylock and failed, no need 537 * to look at file offset and length here, we 538 * know that this buffer at least overlaps our 539 * buffer and is locked, therefore our buffer 540 * either does not exist, or is this buffer. 541 */ 542 xfs_buf_rele(bp); 543 XFS_STATS_INC(xb_busy_locked); 544 return NULL; 545 } 546 } else { 547 /* trylock worked */ 548 XB_SET_OWNER(bp); 549 } 550 551 if (bp->b_flags & XBF_STALE) { 552 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 553 bp->b_flags &= XBF_MAPPED; 554 } 555 XB_TRACE(bp, "got_lock", 0); 556 XFS_STATS_INC(xb_get_locked); 557 return bp; 558} 559 560/* 561 * Assembles a buffer covering the specified range. 562 * Storage in memory for all portions of the buffer will be allocated, 563 * although backing storage may not be. 564 */ 565xfs_buf_t * 566xfs_buf_get_flags( 567 xfs_buftarg_t *target,/* target for buffer */ 568 xfs_off_t ioff, /* starting offset of range */ 569 size_t isize, /* length of range */ 570 xfs_buf_flags_t flags) 571{ 572 xfs_buf_t *bp, *new_bp; 573 int error = 0, i; 574 575 new_bp = xfs_buf_allocate(flags); 576 if (unlikely(!new_bp)) 577 return NULL; 578 579 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 580 if (bp == new_bp) { 581 error = _xfs_buf_lookup_pages(bp, flags); 582 if (error) 583 goto no_buffer; 584 } else { 585 xfs_buf_deallocate(new_bp); 586 if (unlikely(bp == NULL)) 587 return NULL; 588 } 589 590 for (i = 0; i < bp->b_page_count; i++) 591 mark_page_accessed(bp->b_pages[i]); 592 593 if (!(bp->b_flags & XBF_MAPPED)) { 594 error = _xfs_buf_map_pages(bp, flags); 595 if (unlikely(error)) { 596 printk(KERN_WARNING "%s: failed to map pages\n", 597 __FUNCTION__); 598 goto no_buffer; 599 } 600 } 601 602 XFS_STATS_INC(xb_get); 603 604 /* 605 * Always fill in the block number now, the mapped cases can do 606 * their own overlay of this later. 
607 */ 608 bp->b_bn = ioff; 609 bp->b_count_desired = bp->b_buffer_length; 610 611 XB_TRACE(bp, "get", (unsigned long)flags); 612 return bp; 613 614 no_buffer: 615 if (flags & (XBF_LOCK | XBF_TRYLOCK)) 616 xfs_buf_unlock(bp); 617 xfs_buf_rele(bp); 618 return NULL; 619} 620 621xfs_buf_t * 622xfs_buf_read_flags( 623 xfs_buftarg_t *target, 624 xfs_off_t ioff, 625 size_t isize, 626 xfs_buf_flags_t flags) 627{ 628 xfs_buf_t *bp; 629 630 flags |= XBF_READ; 631 632 bp = xfs_buf_get_flags(target, ioff, isize, flags); 633 if (bp) { 634 if (!XFS_BUF_ISDONE(bp)) { 635 XB_TRACE(bp, "read", (unsigned long)flags); 636 XFS_STATS_INC(xb_get_read); 637 xfs_buf_iostart(bp, flags); 638 } else if (flags & XBF_ASYNC) { 639 XB_TRACE(bp, "read_async", (unsigned long)flags); 640 /* 641 * Read ahead call which is already satisfied, 642 * drop the buffer 643 */ 644 goto no_buffer; 645 } else { 646 XB_TRACE(bp, "read_done", (unsigned long)flags); 647 /* We do not want read in the flags */ 648 bp->b_flags &= ~XBF_READ; 649 } 650 } 651 652 return bp; 653 654 no_buffer: 655 if (flags & (XBF_LOCK | XBF_TRYLOCK)) 656 xfs_buf_unlock(bp); 657 xfs_buf_rele(bp); 658 return NULL; 659} 660 661/* 662 * If we are not low on memory then do the readahead in a deadlock 663 * safe manner. 664 */ 665void 666xfs_buf_readahead( 667 xfs_buftarg_t *target, 668 xfs_off_t ioff, 669 size_t isize, 670 xfs_buf_flags_t flags) 671{ 672 struct backing_dev_info *bdi; 673 674 bdi = target->bt_mapping->backing_dev_info; 675 if (bdi_read_congested(bdi)) 676 return; 677 678 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 679 xfs_buf_read_flags(target, ioff, isize, flags); 680} 681 682xfs_buf_t * 683xfs_buf_get_empty( 684 size_t len, 685 xfs_buftarg_t *target) 686{ 687 xfs_buf_t *bp; 688 689 bp = xfs_buf_allocate(0); 690 if (bp) 691 _xfs_buf_initialize(bp, target, 0, len, 0); 692 return bp; 693} 694 695static inline struct page * 696mem_to_page( 697 void *addr) 698{ 699 if (((unsigned long)addr < VMALLOC_START) || 700 ((unsigned long)addr >= VMALLOC_END)) { 701 return virt_to_page(addr); 702 } else { 703 return vmalloc_to_page(addr); 704 } 705} 706 707int 708xfs_buf_associate_memory( 709 xfs_buf_t *bp, 710 void *mem, 711 size_t len) 712{ 713 int rval; 714 int i = 0; 715 size_t ptr; 716 size_t end, end_cur; 717 off_t offset; 718 int page_count; 719 720 page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; 721 offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK); 722 if (offset && (len > PAGE_CACHE_SIZE)) 723 page_count++; 724 725 /* Free any previous set of page pointers */ 726 if (bp->b_pages) 727 _xfs_buf_free_pages(bp); 728 729 bp->b_pages = NULL; 730 bp->b_addr = mem; 731 732 rval = _xfs_buf_get_pages(bp, page_count, 0); 733 if (rval) 734 return rval; 735 736 bp->b_offset = offset; 737 ptr = (size_t) mem & PAGE_CACHE_MASK; 738 end = PAGE_CACHE_ALIGN((size_t) mem + len); 739 end_cur = end; 740 /* set up first page */ 741 bp->b_pages[0] = mem_to_page(mem); 742 743 ptr += PAGE_CACHE_SIZE; 744 bp->b_page_count = ++i; 745 while (ptr < end) { 746 bp->b_pages[i] = mem_to_page((void *)ptr); 747 bp->b_page_count = ++i; 748 ptr += PAGE_CACHE_SIZE; 749 } 750 bp->b_locked = 0; 751 752 bp->b_count_desired = bp->b_buffer_length = len; 753 bp->b_flags |= XBF_MAPPED; 754 755 return 0; 756} 757 758xfs_buf_t * 759xfs_buf_get_noaddr( 760 size_t len, 761 xfs_buftarg_t *target) 762{ 763 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; 764 int error, i; 765 xfs_buf_t *bp; 766 767 bp = xfs_buf_allocate(0); 768 if (unlikely(bp == NULL)) 769 goto fail; 770 
_xfs_buf_initialize(bp, target, 0, len, 0); 771 772 error = _xfs_buf_get_pages(bp, page_count, 0); 773 if (error) 774 goto fail_free_buf; 775 776 for (i = 0; i < page_count; i++) { 777 bp->b_pages[i] = alloc_page(GFP_KERNEL); 778 if (!bp->b_pages[i]) 779 goto fail_free_mem; 780 } 781 bp->b_flags |= _XBF_PAGES; 782 783 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 784 if (unlikely(error)) { 785 printk(KERN_WARNING "%s: failed to map pages\n", 786 __FUNCTION__); 787 goto fail_free_mem; 788 } 789 790 xfs_buf_unlock(bp); 791 792 XB_TRACE(bp, "no_daddr", len); 793 return bp; 794 795 fail_free_mem: 796 while (--i >= 0) 797 __free_page(bp->b_pages[i]); 798 _xfs_buf_free_pages(bp); 799 fail_free_buf: 800 xfs_buf_deallocate(bp); 801 fail: 802 return NULL; 803} 804 805/* 806 * Increment reference count on buffer, to hold the buffer concurrently 807 * with another thread which may release (free) the buffer asynchronously. 808 * Must hold the buffer already to call this function. 809 */ 810void 811xfs_buf_hold( 812 xfs_buf_t *bp) 813{ 814 atomic_inc(&bp->b_hold); 815 XB_TRACE(bp, "hold", 0); 816} 817 818/* 819 * Releases a hold on the specified buffer. If the 820 * the hold count is 1, calls xfs_buf_free. 821 */ 822void 823xfs_buf_rele( 824 xfs_buf_t *bp) 825{ 826 xfs_bufhash_t *hash = bp->b_hash; 827 828 XB_TRACE(bp, "rele", bp->b_relse); 829 830 if (unlikely(!hash)) { 831 ASSERT(!bp->b_relse); 832 if (atomic_dec_and_test(&bp->b_hold)) 833 xfs_buf_free(bp); 834 return; 835 } 836 837 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { 838 if (bp->b_relse) { 839 atomic_inc(&bp->b_hold); 840 spin_unlock(&hash->bh_lock); 841 (*(bp->b_relse)) (bp); 842 } else if (bp->b_flags & XBF_FS_MANAGED) { 843 spin_unlock(&hash->bh_lock); 844 } else { 845 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 846 list_del_init(&bp->b_hash_list); 847 spin_unlock(&hash->bh_lock); 848 xfs_buf_free(bp); 849 } 850 } else { 851 /* 852 * Catch reference count leaks 853 */ 854 ASSERT(atomic_read(&bp->b_hold) >= 0); 855 } 856} 857 858 859/* 860 * Mutual exclusion on buffers. Locking model: 861 * 862 * Buffers associated with inodes for which buffer locking 863 * is not enabled are not protected by semaphores, and are 864 * assumed to be exclusively owned by the caller. There is a 865 * spinlock in the buffer, used by the caller when concurrent 866 * access is possible. 867 */ 868 869/* 870 * Locks a buffer object, if it is not already locked. 871 * Note that this in no way locks the underlying pages, so it is only 872 * useful for synchronizing concurrent use of buffer objects, not for 873 * synchronizing independent access to the underlying pages. 874 */ 875int 876xfs_buf_cond_lock( 877 xfs_buf_t *bp) 878{ 879 int locked; 880 881 locked = down_trylock(&bp->b_sema) == 0; 882 if (locked) { 883 XB_SET_OWNER(bp); 884 } 885 XB_TRACE(bp, "cond_lock", (long)locked); 886 return locked ? 0 : -EBUSY; 887} 888 889#if defined(DEBUG) || defined(XFS_BLI_TRACE) 890int 891xfs_buf_lock_value( 892 xfs_buf_t *bp) 893{ 894 return atomic_read(&bp->b_sema.count); 895} 896#endif 897 898/* 899 * Locks a buffer object. 900 * Note that this in no way locks the underlying pages, so it is only 901 * useful for synchronizing concurrent use of buffer objects, not for 902 * synchronizing independent access to the underlying pages. 
903 */ 904void 905xfs_buf_lock( 906 xfs_buf_t *bp) 907{ 908 XB_TRACE(bp, "lock", 0); 909 if (atomic_read(&bp->b_io_remaining)) 910 blk_run_address_space(bp->b_target->bt_mapping); 911 down(&bp->b_sema); 912 XB_SET_OWNER(bp); 913 XB_TRACE(bp, "locked", 0); 914} 915 916/* 917 * Releases the lock on the buffer object. 918 * If the buffer is marked delwri but is not queued, do so before we 919 * unlock the buffer as we need to set flags correctly. We also need to 920 * take a reference for the delwri queue because the unlocker is going to 921 * drop their's and they don't know we just queued it. 922 */ 923void 924xfs_buf_unlock( 925 xfs_buf_t *bp) 926{ 927 if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { 928 atomic_inc(&bp->b_hold); 929 bp->b_flags |= XBF_ASYNC; 930 xfs_buf_delwri_queue(bp, 0); 931 } 932 933 XB_CLEAR_OWNER(bp); 934 up(&bp->b_sema); 935 XB_TRACE(bp, "unlock", 0); 936} 937 938 939/* 940 * Pinning Buffer Storage in Memory 941 * Ensure that no attempt to force a buffer to disk will succeed. 942 */ 943void 944xfs_buf_pin( 945 xfs_buf_t *bp) 946{ 947 atomic_inc(&bp->b_pin_count); 948 XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter); 949} 950 951void 952xfs_buf_unpin( 953 xfs_buf_t *bp) 954{ 955 if (atomic_dec_and_test(&bp->b_pin_count)) 956 wake_up_all(&bp->b_waiters); 957 XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter); 958} 959 960int 961xfs_buf_ispin( 962 xfs_buf_t *bp) 963{ 964 return atomic_read(&bp->b_pin_count); 965} 966 967STATIC void 968xfs_buf_wait_unpin( 969 xfs_buf_t *bp) 970{ 971 DECLARE_WAITQUEUE (wait, current); 972 973 if (atomic_read(&bp->b_pin_count) == 0) 974 return; 975 976 add_wait_queue(&bp->b_waiters, &wait); 977 for (;;) { 978 set_current_state(TASK_UNINTERRUPTIBLE); 979 if (atomic_read(&bp->b_pin_count) == 0) 980 break; 981 if (atomic_read(&bp->b_io_remaining)) 982 blk_run_address_space(bp->b_target->bt_mapping); 983 schedule(); 984 } 985 remove_wait_queue(&bp->b_waiters, &wait); 986 set_current_state(TASK_RUNNING); 987} 988 989/* 990 * Buffer Utility Routines 991 */ 992 993STATIC void 994xfs_buf_iodone_work( 995 struct work_struct *work) 996{ 997 xfs_buf_t *bp = 998 container_of(work, xfs_buf_t, b_iodone_work); 999 1000 if (bp->b_iodone) 1001 (*(bp->b_iodone))(bp); 1002 else if (bp->b_flags & XBF_ASYNC) 1003 xfs_buf_relse(bp); 1004} 1005 1006void 1007xfs_buf_ioend( 1008 xfs_buf_t *bp, 1009 int schedule) 1010{ 1011 bp->b_flags &= ~(XBF_READ | XBF_WRITE); 1012 if (bp->b_error == 0) 1013 bp->b_flags |= XBF_DONE; 1014 1015 XB_TRACE(bp, "iodone", bp->b_iodone); 1016 1017 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { 1018 if (schedule) { 1019 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); 1020 queue_work(xfslogd_workqueue, &bp->b_iodone_work); 1021 } else { 1022 xfs_buf_iodone_work(&bp->b_iodone_work); 1023 } 1024 } else { 1025 up(&bp->b_iodonesema); 1026 } 1027} 1028 1029void 1030xfs_buf_ioerror( 1031 xfs_buf_t *bp, 1032 int error) 1033{ 1034 ASSERT(error >= 0 && error <= 0xffff); 1035 bp->b_error = (unsigned short)error; 1036 XB_TRACE(bp, "ioerror", (unsigned long)error); 1037} 1038 1039/* 1040 * Initiate I/O on a buffer, based on the flags supplied. 1041 * The b_iodone routine in the buffer supplied will only be called 1042 * when all of the subsidiary I/O requests, if any, have been completed. 
1043 */ 1044int 1045xfs_buf_iostart( 1046 xfs_buf_t *bp, 1047 xfs_buf_flags_t flags) 1048{ 1049 int status = 0; 1050 1051 XB_TRACE(bp, "iostart", (unsigned long)flags); 1052 1053 if (flags & XBF_DELWRI) { 1054 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC); 1055 bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC); 1056 xfs_buf_delwri_queue(bp, 1); 1057 return status; 1058 } 1059 1060 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ 1061 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 1062 bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \ 1063 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 1064 1065 BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL); 1066 1067 /* For writes allow an alternate strategy routine to precede 1068 * the actual I/O request (which may not be issued at all in 1069 * a shutdown situation, for example). 1070 */ 1071 status = (flags & XBF_WRITE) ? 1072 xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp); 1073 1074 /* Wait for I/O if we are not an async request. 1075 * Note: async I/O request completion will release the buffer, 1076 * and that can already be done by this point. So using the 1077 * buffer pointer from here on, after async I/O, is invalid. 1078 */ 1079 if (!status && !(flags & XBF_ASYNC)) 1080 status = xfs_buf_iowait(bp); 1081 1082 return status; 1083} 1084 1085STATIC_INLINE int 1086_xfs_buf_iolocked( 1087 xfs_buf_t *bp) 1088{ 1089 ASSERT(bp->b_flags & (XBF_READ | XBF_WRITE)); 1090 if (bp->b_flags & XBF_READ) 1091 return bp->b_locked; 1092 return 0; 1093} 1094 1095STATIC_INLINE void 1096_xfs_buf_ioend( 1097 xfs_buf_t *bp, 1098 int schedule) 1099{ 1100 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1101 bp->b_locked = 0; 1102 xfs_buf_ioend(bp, schedule); 1103 } 1104} 1105 1106STATIC int 1107xfs_buf_bio_end_io( 1108 struct bio *bio, 1109 unsigned int bytes_done, 1110 int error) 1111{ 1112 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1113 unsigned int blocksize = bp->b_target->bt_bsize; 1114 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1115 1116 if (bio->bi_size) 1117 return 1; 1118 1119 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1120 bp->b_error = EIO; 1121 1122 do { 1123 struct page *page = bvec->bv_page; 1124 1125 ASSERT(!PagePrivate(page)); 1126 if (unlikely(bp->b_error)) { 1127 if (bp->b_flags & XBF_READ) 1128 ClearPageUptodate(page); 1129 } else if (blocksize >= PAGE_CACHE_SIZE) { 1130 SetPageUptodate(page); 1131 } else if (!PagePrivate(page) && 1132 (bp->b_flags & _XBF_PAGE_CACHE)) { 1133 set_page_region(page, bvec->bv_offset, bvec->bv_len); 1134 } 1135 1136 if (--bvec >= bio->bi_io_vec) 1137 prefetchw(&bvec->bv_page->flags); 1138 1139 if (_xfs_buf_iolocked(bp)) { 1140 unlock_page(page); 1141 } 1142 } while (bvec >= bio->bi_io_vec); 1143 1144 _xfs_buf_ioend(bp, 1); 1145 bio_put(bio); 1146 return 0; 1147} 1148 1149STATIC void 1150_xfs_buf_ioapply( 1151 xfs_buf_t *bp) 1152{ 1153 int i, rw, map_i, total_nr_pages, nr_pages; 1154 struct bio *bio; 1155 int offset = bp->b_offset; 1156 int size = bp->b_count_desired; 1157 sector_t sector = bp->b_bn; 1158 unsigned int blocksize = bp->b_target->bt_bsize; 1159 int locking = _xfs_buf_iolocked(bp); 1160 1161 total_nr_pages = bp->b_page_count; 1162 map_i = 0; 1163 1164 if (bp->b_flags & XBF_ORDERED) { 1165 ASSERT(!(bp->b_flags & XBF_READ)); 1166 rw = WRITE_BARRIER; 1167 } else if (bp->b_flags & _XBF_RUN_QUEUES) { 1168 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1169 bp->b_flags &= ~_XBF_RUN_QUEUES; 1170 rw = (bp->b_flags & XBF_WRITE) ? 
WRITE_SYNC : READ_SYNC; 1171 } else { 1172 rw = (bp->b_flags & XBF_WRITE) ? WRITE : 1173 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1174 } 1175 1176 /* Special code path for reading a sub page size buffer in -- 1177 * we populate up the whole page, and hence the other metadata 1178 * in the same page. This optimization is only valid when the 1179 * filesystem block size is not smaller than the page size. 1180 */ 1181 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && 1182 (bp->b_flags & XBF_READ) && locking && 1183 (blocksize >= PAGE_CACHE_SIZE)) { 1184 bio = bio_alloc(GFP_NOIO, 1); 1185 1186 bio->bi_bdev = bp->b_target->bt_bdev; 1187 bio->bi_sector = sector - (offset >> BBSHIFT); 1188 bio->bi_end_io = xfs_buf_bio_end_io; 1189 bio->bi_private = bp; 1190 1191 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0); 1192 size = 0; 1193 1194 atomic_inc(&bp->b_io_remaining); 1195 1196 goto submit_io; 1197 } 1198 1199 /* Lock down the pages which we need to for the request */ 1200 if (locking && (bp->b_flags & XBF_WRITE) && (bp->b_locked == 0)) { 1201 for (i = 0; size; i++) { 1202 int nbytes = PAGE_CACHE_SIZE - offset; 1203 struct page *page = bp->b_pages[i]; 1204 1205 if (nbytes > size) 1206 nbytes = size; 1207 1208 lock_page(page); 1209 1210 size -= nbytes; 1211 offset = 0; 1212 } 1213 offset = bp->b_offset; 1214 size = bp->b_count_desired; 1215 } 1216 1217next_chunk: 1218 atomic_inc(&bp->b_io_remaining); 1219 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); 1220 if (nr_pages > total_nr_pages) 1221 nr_pages = total_nr_pages; 1222 1223 bio = bio_alloc(GFP_NOIO, nr_pages); 1224 bio->bi_bdev = bp->b_target->bt_bdev; 1225 bio->bi_sector = sector; 1226 bio->bi_end_io = xfs_buf_bio_end_io; 1227 bio->bi_private = bp; 1228 1229 for (; size && nr_pages; nr_pages--, map_i++) { 1230 int rbytes, nbytes = PAGE_CACHE_SIZE - offset; 1231 1232 if (nbytes > size) 1233 nbytes = size; 1234 1235 rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset); 1236 if (rbytes < nbytes) 1237 break; 1238 1239 offset = 0; 1240 sector += nbytes >> BBSHIFT; 1241 size -= nbytes; 1242 total_nr_pages--; 1243 } 1244 1245submit_io: 1246 if (likely(bio->bi_size)) { 1247 submit_bio(rw, bio); 1248 if (size) 1249 goto next_chunk; 1250 } else { 1251 bio_put(bio); 1252 xfs_buf_ioerror(bp, EIO); 1253 } 1254} 1255 1256int 1257xfs_buf_iorequest( 1258 xfs_buf_t *bp) 1259{ 1260 XB_TRACE(bp, "iorequest", 0); 1261 1262 if (bp->b_flags & XBF_DELWRI) { 1263 xfs_buf_delwri_queue(bp, 1); 1264 return 0; 1265 } 1266 1267 if (bp->b_flags & XBF_WRITE) { 1268 xfs_buf_wait_unpin(bp); 1269 } 1270 1271 xfs_buf_hold(bp); 1272 1273 /* Set the count to 1 initially, this will stop an I/O 1274 * completion callout which happens before we have started 1275 * all the I/O from calling xfs_buf_ioend too early. 1276 */ 1277 atomic_set(&bp->b_io_remaining, 1); 1278 _xfs_buf_ioapply(bp); 1279 _xfs_buf_ioend(bp, 0); 1280 1281 xfs_buf_rele(bp); 1282 return 0; 1283} 1284 1285/* 1286 * Waits for I/O to complete on the buffer supplied. 1287 * It returns immediately if no I/O is pending. 1288 * It returns the I/O error code, if any, or 0 if there was no error. 
1289 */ 1290int 1291xfs_buf_iowait( 1292 xfs_buf_t *bp) 1293{ 1294 XB_TRACE(bp, "iowait", 0); 1295 if (atomic_read(&bp->b_io_remaining)) 1296 blk_run_address_space(bp->b_target->bt_mapping); 1297 down(&bp->b_iodonesema); 1298 XB_TRACE(bp, "iowaited", (long)bp->b_error); 1299 return bp->b_error; 1300} 1301 1302xfs_caddr_t 1303xfs_buf_offset( 1304 xfs_buf_t *bp, 1305 size_t offset) 1306{ 1307 struct page *page; 1308 1309 if (bp->b_flags & XBF_MAPPED) 1310 return XFS_BUF_PTR(bp) + offset; 1311 1312 offset += bp->b_offset; 1313 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; 1314 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); 1315} 1316 1317/* 1318 * Move data into or out of a buffer. 1319 */ 1320void 1321xfs_buf_iomove( 1322 xfs_buf_t *bp, /* buffer to process */ 1323 size_t boff, /* starting buffer offset */ 1324 size_t bsize, /* length to copy */ 1325 caddr_t data, /* data address */ 1326 xfs_buf_rw_t mode) /* read/write/zero flag */ 1327{ 1328 size_t bend, cpoff, csize; 1329 struct page *page; 1330 1331 bend = boff + bsize; 1332 while (boff < bend) { 1333 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1334 cpoff = xfs_buf_poff(boff + bp->b_offset); 1335 csize = min_t(size_t, 1336 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); 1337 1338 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); 1339 1340 switch (mode) { 1341 case XBRW_ZERO: 1342 memset(page_address(page) + cpoff, 0, csize); 1343 break; 1344 case XBRW_READ: 1345 memcpy(data, page_address(page) + cpoff, csize); 1346 break; 1347 case XBRW_WRITE: 1348 memcpy(page_address(page) + cpoff, data, csize); 1349 } 1350 1351 boff += csize; 1352 data += csize; 1353 } 1354} 1355 1356/* 1357 * Handling of buffer targets (buftargs). 1358 */ 1359 1360/* 1361 * Wait for any bufs with callbacks that have been submitted but 1362 * have not yet returned... walk the hash list for the target. 1363 */ 1364void 1365xfs_wait_buftarg( 1366 xfs_buftarg_t *btp) 1367{ 1368 xfs_buf_t *bp, *n; 1369 xfs_bufhash_t *hash; 1370 uint i; 1371 1372 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1373 hash = &btp->bt_hash[i]; 1374again: 1375 spin_lock(&hash->bh_lock); 1376 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 1377 ASSERT(btp == bp->b_target); 1378 if (!(bp->b_flags & XBF_FS_MANAGED)) { 1379 spin_unlock(&hash->bh_lock); 1380 /* 1381 * Catch superblock reference count leaks 1382 * immediately 1383 */ 1384 BUG_ON(bp->b_bn == 0); 1385 delay(100); 1386 goto again; 1387 } 1388 } 1389 spin_unlock(&hash->bh_lock); 1390 } 1391} 1392 1393/* 1394 * Allocate buffer hash table for a given target. 1395 * For devices containing metadata (i.e. not the log/realtime devices) 1396 * we need to allocate a much larger hash table. 1397 */ 1398STATIC void 1399xfs_alloc_bufhash( 1400 xfs_buftarg_t *btp, 1401 int external) 1402{ 1403 unsigned int i; 1404 1405 btp->bt_hashshift = external ? 
3 : 8; /* 8 or 256 buckets */ 1406 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; 1407 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * 1408 sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE); 1409 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1410 spin_lock_init(&btp->bt_hash[i].bh_lock); 1411 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); 1412 } 1413} 1414 1415STATIC void 1416xfs_free_bufhash( 1417 xfs_buftarg_t *btp) 1418{ 1419 kmem_free(btp->bt_hash, (1<<btp->bt_hashshift) * sizeof(xfs_bufhash_t)); 1420 btp->bt_hash = NULL; 1421} 1422 1423/* 1424 * buftarg list for delwrite queue processing 1425 */ 1426static LIST_HEAD(xfs_buftarg_list); 1427static DEFINE_SPINLOCK(xfs_buftarg_lock); 1428 1429STATIC void 1430xfs_register_buftarg( 1431 xfs_buftarg_t *btp) 1432{ 1433 spin_lock(&xfs_buftarg_lock); 1434 list_add(&btp->bt_list, &xfs_buftarg_list); 1435 spin_unlock(&xfs_buftarg_lock); 1436} 1437 1438STATIC void 1439xfs_unregister_buftarg( 1440 xfs_buftarg_t *btp) 1441{ 1442 spin_lock(&xfs_buftarg_lock); 1443 list_del(&btp->bt_list); 1444 spin_unlock(&xfs_buftarg_lock); 1445} 1446 1447void 1448xfs_free_buftarg( 1449 xfs_buftarg_t *btp, 1450 int external) 1451{ 1452 xfs_flush_buftarg(btp, 1); 1453 xfs_blkdev_issue_flush(btp); 1454 if (external) 1455 xfs_blkdev_put(btp->bt_bdev); 1456 xfs_free_bufhash(btp); 1457 iput(btp->bt_mapping->host); 1458 1459 /* Unregister the buftarg first so that we don't get a 1460 * wakeup finding a non-existent task 1461 */ 1462 xfs_unregister_buftarg(btp); 1463 kthread_stop(btp->bt_task); 1464 1465 kmem_free(btp, sizeof(*btp)); 1466} 1467 1468STATIC int 1469xfs_setsize_buftarg_flags( 1470 xfs_buftarg_t *btp, 1471 unsigned int blocksize, 1472 unsigned int sectorsize, 1473 int verbose) 1474{ 1475 btp->bt_bsize = blocksize; 1476 btp->bt_sshift = ffs(sectorsize) - 1; 1477 btp->bt_smask = sectorsize - 1; 1478 1479 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1480 printk(KERN_WARNING 1481 "XFS: Cannot set_blocksize to %u on device %s\n", 1482 sectorsize, XFS_BUFTARG_NAME(btp)); 1483 return EINVAL; 1484 } 1485 1486 if (verbose && 1487 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) { 1488 printk(KERN_WARNING 1489 "XFS: %u byte sectors in use on device %s. " 1490 "This is suboptimal; %u or greater is ideal.\n", 1491 sectorsize, XFS_BUFTARG_NAME(btp), 1492 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG); 1493 } 1494 1495 return 0; 1496} 1497 1498/* 1499 * When allocating the initial buffer target we have not yet 1500 * read in the superblock, so don't know what sized sectors 1501 * are being used is at this early stage. Play safe. 
1502 */ 1503STATIC int 1504xfs_setsize_buftarg_early( 1505 xfs_buftarg_t *btp, 1506 struct block_device *bdev) 1507{ 1508 return xfs_setsize_buftarg_flags(btp, 1509 PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); 1510} 1511 1512int 1513xfs_setsize_buftarg( 1514 xfs_buftarg_t *btp, 1515 unsigned int blocksize, 1516 unsigned int sectorsize) 1517{ 1518 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); 1519} 1520 1521STATIC int 1522xfs_mapping_buftarg( 1523 xfs_buftarg_t *btp, 1524 struct block_device *bdev) 1525{ 1526 struct backing_dev_info *bdi; 1527 struct inode *inode; 1528 struct address_space *mapping; 1529 static const struct address_space_operations mapping_aops = { 1530 .sync_page = block_sync_page, 1531 .migratepage = fail_migrate_page, 1532 }; 1533 1534 inode = new_inode(bdev->bd_inode->i_sb); 1535 if (!inode) { 1536 printk(KERN_WARNING 1537 "XFS: Cannot allocate mapping inode for device %s\n", 1538 XFS_BUFTARG_NAME(btp)); 1539 return ENOMEM; 1540 } 1541 inode->i_mode = S_IFBLK; 1542 inode->i_bdev = bdev; 1543 inode->i_rdev = bdev->bd_dev; 1544 bdi = blk_get_backing_dev_info(bdev); 1545 if (!bdi) 1546 bdi = &default_backing_dev_info; 1547 mapping = &inode->i_data; 1548 mapping->a_ops = &mapping_aops; 1549 mapping->backing_dev_info = bdi; 1550 mapping_set_gfp_mask(mapping, GFP_NOFS); 1551 btp->bt_mapping = mapping; 1552 return 0; 1553} 1554 1555STATIC int 1556xfs_alloc_delwrite_queue( 1557 xfs_buftarg_t *btp) 1558{ 1559 int error = 0; 1560 1561 INIT_LIST_HEAD(&btp->bt_list); 1562 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1563 spinlock_init(&btp->bt_delwrite_lock, "delwri_lock"); 1564 btp->bt_flags = 0; 1565 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); 1566 if (IS_ERR(btp->bt_task)) { 1567 error = PTR_ERR(btp->bt_task); 1568 goto out_error; 1569 } 1570 xfs_register_buftarg(btp); 1571out_error: 1572 return error; 1573} 1574 1575xfs_buftarg_t * 1576xfs_alloc_buftarg( 1577 struct block_device *bdev, 1578 int external) 1579{ 1580 xfs_buftarg_t *btp; 1581 1582 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); 1583 1584 btp->bt_dev = bdev->bd_dev; 1585 btp->bt_bdev = bdev; 1586 if (xfs_setsize_buftarg_early(btp, bdev)) 1587 goto error; 1588 if (xfs_mapping_buftarg(btp, bdev)) 1589 goto error; 1590 if (xfs_alloc_delwrite_queue(btp)) 1591 goto error; 1592 xfs_alloc_bufhash(btp, external); 1593 return btp; 1594 1595error: 1596 kmem_free(btp, sizeof(*btp)); 1597 return NULL; 1598} 1599 1600 1601/* 1602 * Delayed write buffer handling 1603 */ 1604STATIC void 1605xfs_buf_delwri_queue( 1606 xfs_buf_t *bp, 1607 int unlock) 1608{ 1609 struct list_head *dwq = &bp->b_target->bt_delwrite_queue; 1610 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; 1611 1612 XB_TRACE(bp, "delwri_q", (long)unlock); 1613 ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); 1614 1615 spin_lock(dwlk); 1616 /* If already in the queue, dequeue and place at tail */ 1617 if (!list_empty(&bp->b_list)) { 1618 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 1619 if (unlock) 1620 atomic_dec(&bp->b_hold); 1621 list_del(&bp->b_list); 1622 } 1623 1624 bp->b_flags |= _XBF_DELWRI_Q; 1625 list_add_tail(&bp->b_list, dwq); 1626 bp->b_queuetime = jiffies; 1627 spin_unlock(dwlk); 1628 1629 if (unlock) 1630 xfs_buf_unlock(bp); 1631} 1632 1633void 1634xfs_buf_delwri_dequeue( 1635 xfs_buf_t *bp) 1636{ 1637 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; 1638 int dequeued = 0; 1639 1640 spin_lock(dwlk); 1641 if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { 1642 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 1643 
list_del_init(&bp->b_list); 1644 dequeued = 1; 1645 } 1646 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); 1647 spin_unlock(dwlk); 1648 1649 if (dequeued) 1650 xfs_buf_rele(bp); 1651 1652 XB_TRACE(bp, "delwri_dq", (long)dequeued); 1653} 1654 1655STATIC void 1656xfs_buf_runall_queues( 1657 struct workqueue_struct *queue) 1658{ 1659 flush_workqueue(queue); 1660} 1661 1662STATIC int 1663xfsbufd_wakeup( 1664 int priority, 1665 gfp_t mask) 1666{ 1667 xfs_buftarg_t *btp; 1668 1669 spin_lock(&xfs_buftarg_lock); 1670 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { 1671 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) 1672 continue; 1673 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); 1674 wake_up_process(btp->bt_task); 1675 } 1676 spin_unlock(&xfs_buftarg_lock); 1677 return 0; 1678} 1679 1680/* 1681 * Move as many buffers as specified to the supplied list 1682 * idicating if we skipped any buffers to prevent deadlocks. 1683 */ 1684STATIC int 1685xfs_buf_delwri_split( 1686 xfs_buftarg_t *target, 1687 struct list_head *list, 1688 unsigned long age) 1689{ 1690 xfs_buf_t *bp, *n; 1691 struct list_head *dwq = &target->bt_delwrite_queue; 1692 spinlock_t *dwlk = &target->bt_delwrite_lock; 1693 int skipped = 0; 1694 int force; 1695 1696 force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1697 INIT_LIST_HEAD(list); 1698 spin_lock(dwlk); 1699 list_for_each_entry_safe(bp, n, dwq, b_list) { 1700 XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); 1701 ASSERT(bp->b_flags & XBF_DELWRI); 1702 1703 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { 1704 if (!force && 1705 time_before(jiffies, bp->b_queuetime + age)) { 1706 xfs_buf_unlock(bp); 1707 break; 1708 } 1709 1710 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q| 1711 _XBF_RUN_QUEUES); 1712 bp->b_flags |= XBF_WRITE; 1713 list_move_tail(&bp->b_list, list); 1714 } else 1715 skipped++; 1716 } 1717 spin_unlock(dwlk); 1718 1719 return skipped; 1720 1721} 1722 1723STATIC int 1724xfsbufd( 1725 void *data) 1726{ 1727 struct list_head tmp; 1728 xfs_buftarg_t *target = (xfs_buftarg_t *)data; 1729 int count; 1730 xfs_buf_t *bp; 1731 1732 current->flags |= PF_MEMALLOC; 1733 1734 do { 1735 if (unlikely(freezing(current))) { 1736 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1737 refrigerator(); 1738 } else { 1739 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1740 } 1741 1742 schedule_timeout_interruptible( 1743 xfs_buf_timer_centisecs * msecs_to_jiffies(10)); 1744 1745 xfs_buf_delwri_split(target, &tmp, 1746 xfs_buf_age_centisecs * msecs_to_jiffies(10)); 1747 1748 count = 0; 1749 while (!list_empty(&tmp)) { 1750 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1751 ASSERT(target == bp->b_target); 1752 1753 list_del_init(&bp->b_list); 1754 xfs_buf_iostrategy(bp); 1755 count++; 1756 } 1757 1758 if (as_list_len > 0) 1759 purge_addresses(); 1760 if (count) 1761 blk_run_address_space(target->bt_mapping); 1762 1763 } while (!kthread_should_stop()); 1764 1765 return 0; 1766} 1767 1768/* 1769 * Go through all incore buffers, and release buffers if they belong to 1770 * the given device. This is used in filesystem error handling to 1771 * preserve the consistency of its metadata. 
1772 */ 1773int 1774xfs_flush_buftarg( 1775 xfs_buftarg_t *target, 1776 int wait) 1777{ 1778 struct list_head tmp; 1779 xfs_buf_t *bp, *n; 1780 int pincount = 0; 1781 1782 xfs_buf_runall_queues(xfsdatad_workqueue); 1783 xfs_buf_runall_queues(xfslogd_workqueue); 1784 1785 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1786 pincount = xfs_buf_delwri_split(target, &tmp, 0); 1787 1788 /* 1789 * Dropped the delayed write list lock, now walk the temporary list 1790 */ 1791 list_for_each_entry_safe(bp, n, &tmp, b_list) { 1792 ASSERT(target == bp->b_target); 1793 if (wait) 1794 bp->b_flags &= ~XBF_ASYNC; 1795 else 1796 list_del_init(&bp->b_list); 1797 1798 xfs_buf_iostrategy(bp); 1799 } 1800 1801 if (wait) 1802 blk_run_address_space(target->bt_mapping); 1803 1804 /* 1805 * Remaining list items must be flushed before returning 1806 */ 1807 while (!list_empty(&tmp)) { 1808 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1809 1810 list_del_init(&bp->b_list); 1811 xfs_iowait(bp); 1812 xfs_buf_relse(bp); 1813 } 1814 1815 return pincount; 1816} 1817 1818int __init 1819xfs_buf_init(void) 1820{ 1821#ifdef XFS_BUF_TRACE 1822 xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP); 1823#endif 1824 1825 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", 1826 KM_ZONE_HWALIGN, NULL); 1827 if (!xfs_buf_zone) 1828 goto out_free_trace_buf; 1829 1830 xfslogd_workqueue = create_workqueue("xfslogd"); 1831 if (!xfslogd_workqueue) 1832 goto out_free_buf_zone; 1833 1834 xfsdatad_workqueue = create_workqueue("xfsdatad"); 1835 if (!xfsdatad_workqueue) 1836 goto out_destroy_xfslogd_workqueue; 1837 1838 register_shrinker(&xfs_buf_shake); 1839 return 0; 1840 1841 out_destroy_xfslogd_workqueue: 1842 destroy_workqueue(xfslogd_workqueue); 1843 out_free_buf_zone: 1844 kmem_zone_destroy(xfs_buf_zone); 1845 out_free_trace_buf: 1846#ifdef XFS_BUF_TRACE 1847 ktrace_free(xfs_buf_trace_buf); 1848#endif 1849 return -ENOMEM; 1850} 1851 1852void 1853xfs_buf_terminate(void) 1854{ 1855 unregister_shrinker(&xfs_buf_shake); 1856 destroy_workqueue(xfsdatad_workqueue); 1857 destroy_workqueue(xfslogd_workqueue); 1858 kmem_zone_destroy(xfs_buf_zone); 1859#ifdef XFS_BUF_TRACE 1860 ktrace_free(xfs_buf_trace_buf); 1861#endif 1862} 1863 1864#ifdef CONFIG_KDB_MODULES 1865struct list_head * 1866xfs_get_buftarg_list(void) 1867{ 1868 return &xfs_buftarg_list; 1869} 1870#endif
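
The file above is the complete buffer-cache layer; the rest of XFS drives it through xfs_buf_get_flags(), xfs_buf_read_flags(), xfs_buf_relse() and friends. What follows is a minimal usage sketch, not part of xfs_buf.c: it assumes a buffer target already set up elsewhere with xfs_alloc_buftarg(), and the function name and error handling are illustrative only.

/*
 * Illustrative sketch only -- not part of xfs_buf.c.  It uses just the
 * interfaces defined or referenced above; offsets and lengths are in
 * 512-byte basic blocks, as implied by the BBSHIFT conversion in
 * _xfs_buf_find().
 */
static int
example_read_and_release(
	xfs_buftarg_t	*target,
	xfs_off_t	ioff,	/* starting offset, in basic blocks */
	size_t		isize)	/* length, in basic blocks */
{
	xfs_buf_t	*bp;
	int		error;

	/* Hash lookup (or allocation), page setup, and a synchronous
	 * read, since XBF_ASYNC is not passed.  The buffer is returned
	 * locked and mapped. */
	bp = xfs_buf_read_flags(target, ioff, isize,
				XBF_LOCK | XBF_MAPPED);
	if (!bp)
		return ENOMEM;	/* positive errno, as this code uses */

	error = bp->b_error;
	if (!error) {
		/* ... inspect bp->b_addr, valid because XBF_MAPPED ... */
	}

	xfs_buf_relse(bp);	/* unlock and drop our hold */
	return error;
}

The same pattern with XBF_TRYLOCK, XBF_ASYNC and XBF_READ_AHEAD added is what xfs_buf_readahead() above uses, so that read-ahead neither blocks on a busy buffer nor waits for I/O completion.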