Linux kernel mirror (for testing) — git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
xfs_buf.c at v2.6.14-rc2 — 1989 lines, 46 kB
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */

/*
 * The xfs_buf.c code provides an abstract buffer cache model on top
 * of the Linux page cache. Cached metadata blocks for a file system
 * are hashed to the inode for the block device. xfs_buf.c assembles
 * buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O.
 *
 * Written by Steve Lord, Jim Mostek, Russell Cattelan
 * and Rajagopal Ananthanarayanan ("ananth") at SGI.
 *
 */

#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>

#include "xfs_linux.h"

/*
 * File wide globals
 */

STATIC kmem_cache_t *pagebuf_zone;
STATIC kmem_shaker_t pagebuf_shake;
STATIC int xfsbufd_wakeup(int, unsigned int);
STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);

STATIC struct workqueue_struct *xfslogd_workqueue;
struct workqueue_struct *xfsdatad_workqueue;

/*
 * Pagebuf debugging
 */

#ifdef PAGEBUF_TRACE
void
pagebuf_trace(
	xfs_buf_t	*pb,
	char		*id,
	void		*data,
	void		*ra)
{
	ktrace_enter(pagebuf_trace_buf,
		pb, id,
		(void *)(unsigned long)pb->pb_flags,
		(void *)(unsigned long)pb->pb_hold.counter,
		(void *)(unsigned long)pb->pb_sema.count.counter,
		(void *)current,
		data, ra,
		(void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
		(void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
		(void *)(unsigned long)pb->pb_buffer_length,
		NULL, NULL, NULL, NULL, NULL);
}
ktrace_t *pagebuf_trace_buf;
#define PAGEBUF_TRACE_SIZE	4096
#define PB_TRACE(pb, id, data)	\
	pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
#else
#define PB_TRACE(pb, id, data)	do { } while (0)
#endif

#ifdef PAGEBUF_LOCK_TRACKING
# define PB_SET_OWNER(pb)	((pb)->pb_last_holder = current->pid)
# define PB_CLEAR_OWNER(pb)	((pb)->pb_last_holder = -1)
# define PB_GET_OWNER(pb)	((pb)->pb_last_holder)
#else
# define PB_SET_OWNER(pb)	do { } while (0)
# define PB_CLEAR_OWNER(pb)	do { } while (0)
# define PB_GET_OWNER(pb)	do { } while (0)
#endif

/*
 * Pagebuf allocation / freeing.
 */

#define pb_to_gfp(flags) \
	((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \
	  ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)

#define pb_to_km(flags) \
	 (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)


#define pagebuf_allocate(flags) \
	kmem_zone_alloc(pagebuf_zone, pb_to_km(flags))
#define pagebuf_deallocate(pb) \
	kmem_zone_free(pagebuf_zone, (pb));

/*
 * Page Region interfaces.
 *
 * For pages in filesystems where the blocksize is smaller than the
 * pagesize, we use the page->private field (long) to hold a bitmap
 * of uptodate regions within the page.
 *
 * Each such region is "bytes per page / bits per long" bytes long.
 *
 * NBPPR == number-of-bytes-per-page-region
 * BTOPR == bytes-to-page-region (rounded up)
 * BTOPRT == bytes-to-page-region-truncated (rounded down)
 */
#if (BITS_PER_LONG == 32)
#define PRSHIFT		(PAGE_CACHE_SHIFT - 5)	/* (32 == 1<<5) */
#elif (BITS_PER_LONG == 64)
#define PRSHIFT		(PAGE_CACHE_SHIFT - 6)	/* (64 == 1<<6) */
#else
#error BITS_PER_LONG must be 32 or 64
#endif
#define NBPPR		(PAGE_CACHE_SIZE/BITS_PER_LONG)
#define BTOPR(b)	(((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
#define BTOPRT(b)	(((unsigned int)(b) >> PRSHIFT))

STATIC unsigned long
page_region_mask(
	size_t		offset,
	size_t		length)
{
	unsigned long	mask;
	int		first, final;

	first = BTOPR(offset);
	final = BTOPRT(offset + length - 1);
	first = min(first, final);

	mask = ~0UL;
	mask <<= BITS_PER_LONG - (final - first);
	mask >>= BITS_PER_LONG - (final);

	ASSERT(offset + length <= PAGE_CACHE_SIZE);
	ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);

	return mask;
}

STATIC inline void
set_page_region(
	struct page	*page,
	size_t		offset,
	size_t		length)
{
	page->private |= page_region_mask(offset, length);
	if (page->private == ~0UL)
		SetPageUptodate(page);
}

STATIC inline int
test_page_region(
	struct page	*page,
	size_t		offset,
	size_t		length)
{
	unsigned long	mask = page_region_mask(offset, length);

	return (mask && (page->private & mask) == mask);
}

/*
 * Mapping of multi-page buffers into contiguous virtual space
 */

typedef struct a_list {
	void		*vm_addr;
	struct a_list	*next;
} a_list_t;

STATIC a_list_t		*as_free_head;
STATIC int		as_list_len;
STATIC DEFINE_SPINLOCK(as_lock);

/*
 * Try to batch vunmaps because they are costly.
 */
STATIC void
free_address(
	void		*addr)
{
	a_list_t	*aentry;

	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
	if (likely(aentry)) {
		spin_lock(&as_lock);
		aentry->next = as_free_head;
		aentry->vm_addr = addr;
		as_free_head = aentry;
		as_list_len++;
		spin_unlock(&as_lock);
	} else {
		vunmap(addr);
	}
}

STATIC void
purge_addresses(void)
{
	a_list_t	*aentry, *old;

	if (as_free_head == NULL)
		return;

	spin_lock(&as_lock);
	aentry = as_free_head;
	as_free_head = NULL;
	as_list_len = 0;
	spin_unlock(&as_lock);

	while ((old = aentry) != NULL) {
		vunmap(aentry->vm_addr);
		aentry = aentry->next;
		kfree(old);
	}
}

/*
 * Internal pagebuf object manipulation
 */

STATIC void
_pagebuf_initialize(
	xfs_buf_t		*pb,
	xfs_buftarg_t		*target,
	loff_t			range_base,
	size_t			range_length,
	page_buf_flags_t	flags)
{
	/*
	 * We don't want certain flags to appear in pb->pb_flags.
	 */
	flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);

	memset(pb, 0, sizeof(xfs_buf_t));
	atomic_set(&pb->pb_hold, 1);
	init_MUTEX_LOCKED(&pb->pb_iodonesema);
	INIT_LIST_HEAD(&pb->pb_list);
	INIT_LIST_HEAD(&pb->pb_hash_list);
	init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
	PB_SET_OWNER(pb);
	pb->pb_target = target;
	pb->pb_file_offset = range_base;
	/*
	 * Set buffer_length and count_desired to the same value initially.
	 * I/O routines should use count_desired, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	pb->pb_buffer_length = pb->pb_count_desired = range_length;
	pb->pb_flags = flags | PBF_NONE;
	pb->pb_bn = XFS_BUF_DADDR_NULL;
	atomic_set(&pb->pb_pin_count, 0);
	init_waitqueue_head(&pb->pb_waiters);

	XFS_STATS_INC(pb_create);
	PB_TRACE(pb, "initialize", target);
}

/*
 * Allocate a page array capable of holding a specified number
 * of pages, and point the page buf at it.
 */
STATIC int
_pagebuf_get_pages(
	xfs_buf_t		*pb,
	int			page_count,
	page_buf_flags_t	flags)
{
	/* Make sure that we have a page list */
	if (pb->pb_pages == NULL) {
		pb->pb_offset = page_buf_poff(pb->pb_file_offset);
		pb->pb_page_count = page_count;
		if (page_count <= PB_PAGES) {
			pb->pb_pages = pb->pb_page_array;
		} else {
			pb->pb_pages = kmem_alloc(sizeof(struct page *) *
					page_count, pb_to_km(flags));
			if (pb->pb_pages == NULL)
				return -ENOMEM;
		}
		memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 * Frees pb_pages if it was malloced.
 */
STATIC void
_pagebuf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->pb_pages != bp->pb_page_array) {
		kmem_free(bp->pb_pages,
			  bp->pb_page_count * sizeof(struct page *));
	}
}

/*
 * Releases the specified buffer.
 *
 * The modification state of any associated pages is left unchanged.
 * The buffer must not be on any hash - use pagebuf_rele instead for
 * hashed and refcounted buffers.
 */
void
pagebuf_free(
	xfs_buf_t		*bp)
{
	PB_TRACE(bp, "free", 0);

	ASSERT(list_empty(&bp->pb_hash_list));

	if (bp->pb_flags & _PBF_PAGE_CACHE) {
		uint		i;

		if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
			free_address(bp->pb_addr - bp->pb_offset);

		for (i = 0; i < bp->pb_page_count; i++)
			page_cache_release(bp->pb_pages[i]);
		_pagebuf_free_pages(bp);
	} else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
		/*
		 * XXX(hch): bp->pb_count_desired might be incorrect (see
		 * pagebuf_associate_memory for details), but fortunately
		 * the Linux version of kmem_free ignores the len argument..
		 */
		kmem_free(bp->pb_addr, bp->pb_count_desired);
		_pagebuf_free_pages(bp);
	}

	pagebuf_deallocate(bp);
}

/*
 * Finds all pages for buffer in question and builds its page list.
 */
STATIC int
_pagebuf_lookup_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	struct address_space	*mapping = bp->pb_target->pbr_mapping;
	size_t			blocksize = bp->pb_target->pbr_bsize;
	size_t			size = bp->pb_count_desired;
	size_t			nbytes, offset;
	int			gfp_mask = pb_to_gfp(flags);
	unsigned short		page_count, i;
	pgoff_t			first;
	loff_t			end;
	int			error;

	end = bp->pb_file_offset + bp->pb_buffer_length;
	page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);

	error = _pagebuf_get_pages(bp, page_count, flags);
	if (unlikely(error))
		return error;
	bp->pb_flags |= _PBF_PAGE_CACHE;

	offset = bp->pb_offset;
	first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;

	for (i = 0; i < bp->pb_page_count; i++) {
		struct page	*page;
		uint		retries = 0;

	retry:
		page = find_or_create_page(mapping, first + i, gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & PBF_READ_AHEAD) {
				bp->pb_page_count = i;
				for (i = 0; i < bp->pb_page_count; i++)
					unlock_page(bp->pb_pages[i]);
				return -ENOMEM;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				printk(KERN_ERR
					"XFS: possible memory allocation "
					"deadlock in %s (mode:0x%x)\n",
					__FUNCTION__, gfp_mask);

			XFS_STATS_INC(pb_page_retries);
			xfsbufd_wakeup(0, gfp_mask);
			blk_congestion_wait(WRITE, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(pb_page_found);

		nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
		size -= nbytes;

		if (!PageUptodate(page)) {
			page_count--;
			if (blocksize >= PAGE_CACHE_SIZE) {
				if (flags & PBF_READ)
					bp->pb_locked = 1;
			} else if (!PagePrivate(page)) {
				if (test_page_region(page, offset, nbytes))
					page_count++;
			}
		}

		bp->pb_pages[i] = page;
		offset = 0;
	}

	if (!bp->pb_locked) {
		for (i = 0; i < bp->pb_page_count; i++)
			unlock_page(bp->pb_pages[i]);
	}

	if (page_count) {
		/* if we have any uptodate pages, mark that in the buffer */
		bp->pb_flags &= ~PBF_NONE;

		/* if some pages aren't uptodate, mark that in the buffer */
		if (page_count != bp->pb_page_count)
			bp->pb_flags |= PBF_PARTIAL;
	}

	PB_TRACE(bp, "lookup_pages", (long)page_count);
	return error;
}

/*
 * Map buffer into kernel address-space if necessary.
 */
STATIC int
_pagebuf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	/* A single page buffer is always mappable */
	if (bp->pb_page_count == 1) {
		bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
		bp->pb_flags |= PBF_MAPPED;
	} else if (flags & PBF_MAPPED) {
		if (as_list_len > 64)
			purge_addresses();
		bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
				VM_MAP, PAGE_KERNEL);
		if (unlikely(bp->pb_addr == NULL))
			return -ENOMEM;
		bp->pb_addr += bp->pb_offset;
		bp->pb_flags |= PBF_MAPPED;
	}

	return 0;
}

/*
 * Finding and Reading Buffers
 */

/*
 * _pagebuf_find
 *
 * Looks up, and creates if absent, a lockable buffer for
 * a given range of an inode. The buffer is returned
 * locked. If other overlapping buffers exist, they are
 * released before the new buffer is created and locked,
 * which may imply that this call will block until those buffers
 * are unlocked. No I/O is implied by this call.
 */
xfs_buf_t *
_pagebuf_find(
	xfs_buftarg_t		*btp,	/* block device target		*/
	loff_t			ioff,	/* starting offset of range	*/
	size_t			isize,	/* length of range		*/
	page_buf_flags_t	flags,	/* PBF_TRYLOCK			*/
	xfs_buf_t		*new_pb)/* newly allocated buffer	*/
{
	loff_t			range_base;
	size_t			range_length;
	xfs_bufhash_t		*hash;
	xfs_buf_t		*pb, *n;

	range_base = (ioff << BBSHIFT);
	range_length = (isize << BBSHIFT);

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(range_length < (1 << btp->pbr_sshift)));
	ASSERT(!(range_base & (loff_t)btp->pbr_smask));

	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];

	spin_lock(&hash->bh_lock);

	list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) {
		ASSERT(btp == pb->pb_target);
		if (pb->pb_file_offset == range_base &&
		    pb->pb_buffer_length == range_length) {
			/*
			 * If we look at something bring it to the
			 * front of the list for next time.
			 */
			atomic_inc(&pb->pb_hold);
			list_move(&pb->pb_hash_list, &hash->bh_list);
			goto found;
		}
	}

	/* No match found */
	if (new_pb) {
		_pagebuf_initialize(new_pb, btp, range_base,
				range_length, flags);
		new_pb->pb_hash = hash;
		list_add(&new_pb->pb_hash_list, &hash->bh_list);
	} else {
		XFS_STATS_INC(pb_miss_locked);
	}

	spin_unlock(&hash->bh_lock);
	return new_pb;

found:
	spin_unlock(&hash->bh_lock);

	/* Attempt to get the semaphore without sleeping,
	 * if this does not work then we need to drop the
	 * spinlock and do a hard attempt on the semaphore.
	 */
	if (down_trylock(&pb->pb_sema)) {
		if (!(flags & PBF_TRYLOCK)) {
			/* wait for buffer ownership */
			PB_TRACE(pb, "get_lock", 0);
			pagebuf_lock(pb);
			XFS_STATS_INC(pb_get_locked_waited);
		} else {
			/* We asked for a trylock and failed, no need
			 * to look at file offset and length here, we
			 * know that this pagebuf at least overlaps our
			 * pagebuf and is locked, therefore our buffer
			 * either does not exist, or is this buffer
			 */

			pagebuf_rele(pb);
			XFS_STATS_INC(pb_busy_locked);
			return (NULL);
		}
	} else {
		/* trylock worked */
		PB_SET_OWNER(pb);
	}

	if (pb->pb_flags & PBF_STALE) {
		ASSERT((pb->pb_flags & _PBF_DELWRI_Q) == 0);
		pb->pb_flags &= PBF_MAPPED;
	}
	PB_TRACE(pb, "got_lock", 0);
	XFS_STATS_INC(pb_get_locked);
	return (pb);
}

/*
 * xfs_buf_get_flags assembles a buffer covering the specified range.
 *
 * Storage in memory for all portions of the buffer will be allocated,
 * although backing storage may not be.
 */
xfs_buf_t *
xfs_buf_get_flags(			/* allocate a buffer		*/
	xfs_buftarg_t		*target,/* target for buffer		*/
	loff_t			ioff,	/* starting offset of range	*/
	size_t			isize,	/* length of range		*/
	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
{
	xfs_buf_t		*pb, *new_pb;
	int			error = 0, i;

	new_pb = pagebuf_allocate(flags);
	if (unlikely(!new_pb))
		return NULL;

	pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
	if (pb == new_pb) {
		error = _pagebuf_lookup_pages(pb, flags);
		if (error)
			goto no_buffer;
	} else {
		pagebuf_deallocate(new_pb);
		if (unlikely(pb == NULL))
			return NULL;
	}

	for (i = 0; i < pb->pb_page_count; i++)
		mark_page_accessed(pb->pb_pages[i]);

	if (!(pb->pb_flags & PBF_MAPPED)) {
		error = _pagebuf_map_pages(pb, flags);
		if (unlikely(error)) {
			printk(KERN_WARNING "%s: failed to map pages\n",
					__FUNCTION__);
			goto no_buffer;
		}
	}

	XFS_STATS_INC(pb_get);

	/*
	 * Always fill in the block number now, the mapped cases can do
	 * their own overlay of this later.
	 */
	pb->pb_bn = ioff;
	pb->pb_count_desired = pb->pb_buffer_length;

	PB_TRACE(pb, "get", (unsigned long)flags);
	return pb;

 no_buffer:
	if (flags & (PBF_LOCK | PBF_TRYLOCK))
		pagebuf_unlock(pb);
	pagebuf_rele(pb);
	return NULL;
}

xfs_buf_t *
xfs_buf_read_flags(
	xfs_buftarg_t		*target,
	loff_t			ioff,
	size_t			isize,
	page_buf_flags_t	flags)
{
	xfs_buf_t		*pb;

	flags |= PBF_READ;

	pb = xfs_buf_get_flags(target, ioff, isize, flags);
	if (pb) {
		if (PBF_NOT_DONE(pb)) {
			PB_TRACE(pb, "read", (unsigned long)flags);
			XFS_STATS_INC(pb_get_read);
			pagebuf_iostart(pb, flags);
		} else if (flags & PBF_ASYNC) {
			PB_TRACE(pb, "read_async", (unsigned long)flags);
			/*
			 * Read ahead call which is already satisfied,
			 * drop the buffer
			 */
			goto no_buffer;
		} else {
			PB_TRACE(pb, "read_done", (unsigned long)flags);
			/* We do not want read in the flags */
			pb->pb_flags &= ~PBF_READ;
		}
	}

	return pb;

 no_buffer:
	if (flags & (PBF_LOCK | PBF_TRYLOCK))
		pagebuf_unlock(pb);
	pagebuf_rele(pb);
	return NULL;
}

/*
 * If we are not low on memory then do the readahead in a deadlock
 * safe manner.
 */
void
pagebuf_readahead(
	xfs_buftarg_t		*target,
	loff_t			ioff,
	size_t			isize,
	page_buf_flags_t	flags)
{
	struct backing_dev_info *bdi;

	bdi = target->pbr_mapping->backing_dev_info;
	if (bdi_read_congested(bdi))
		return;

	flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD);
	xfs_buf_read_flags(target, ioff, isize, flags);
}

xfs_buf_t *
pagebuf_get_empty(
	size_t			len,
	xfs_buftarg_t		*target)
{
	xfs_buf_t		*pb;

	pb = pagebuf_allocate(0);
	if (pb)
		_pagebuf_initialize(pb, target, 0, len, 0);
	return pb;
}

static inline struct page *
mem_to_page(
	void			*addr)
{
	if (((unsigned long)addr < VMALLOC_START) ||
	    ((unsigned long)addr >= VMALLOC_END)) {
		return virt_to_page(addr);
	} else {
		return vmalloc_to_page(addr);
	}
}

int
pagebuf_associate_memory(
	xfs_buf_t		*pb,
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
	size_t			ptr;
	size_t			end, end_cur;
	off_t			offset;
	int			page_count;

	page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
	if (offset && (len > PAGE_CACHE_SIZE))
		page_count++;

	/* Free any previous set of page pointers */
	if (pb->pb_pages)
		_pagebuf_free_pages(pb);

	pb->pb_pages = NULL;
	pb->pb_addr = mem;

	rval = _pagebuf_get_pages(pb, page_count, 0);
	if (rval)
		return rval;

	pb->pb_offset = offset;
	ptr = (size_t) mem & PAGE_CACHE_MASK;
	end = PAGE_CACHE_ALIGN((size_t) mem + len);
	end_cur = end;
	/* set up first page */
	pb->pb_pages[0] = mem_to_page(mem);

	ptr += PAGE_CACHE_SIZE;
	pb->pb_page_count = ++i;
	while (ptr < end) {
		pb->pb_pages[i] = mem_to_page((void *)ptr);
		pb->pb_page_count = ++i;
		ptr += PAGE_CACHE_SIZE;
	}
	pb->pb_locked = 0;

	pb->pb_count_desired = pb->pb_buffer_length = len;
	pb->pb_flags |= PBF_MAPPED;

	return 0;
}

xfs_buf_t *
pagebuf_get_no_daddr(
	size_t			len,
	xfs_buftarg_t		*target)
{
	size_t			malloc_len = len;
	xfs_buf_t		*bp;
	void			*data;
	int			error;

	bp = pagebuf_allocate(0);
	if (unlikely(bp == NULL))
		goto fail;
	_pagebuf_initialize(bp, target, 0, len, PBF_FORCEIO);

 try_again:
	data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL);
	if (unlikely(data == NULL))
		goto fail_free_buf;

	/* check whether alignment matches.. */
	if ((__psunsigned_t)data !=
	    ((__psunsigned_t)data & ~target->pbr_smask)) {
		/* .. else double the size and try again */
		kmem_free(data, malloc_len);
		malloc_len <<= 1;
		goto try_again;
	}

	error = pagebuf_associate_memory(bp, data, len);
	if (error)
		goto fail_free_mem;
	bp->pb_flags |= _PBF_KMEM_ALLOC;

	pagebuf_unlock(bp);

	PB_TRACE(bp, "no_daddr", data);
	return bp;
 fail_free_mem:
	kmem_free(data, malloc_len);
 fail_free_buf:
	pagebuf_free(bp);
 fail:
	return NULL;
}

/*
 * pagebuf_hold
 *
 * Increment reference count on buffer, to hold the buffer concurrently
 * with another thread which may release (free) the buffer asynchronously.
 *
 * Must hold the buffer already to call this function.
 */
void
pagebuf_hold(
	xfs_buf_t		*pb)
{
	atomic_inc(&pb->pb_hold);
	PB_TRACE(pb, "hold", 0);
}

/*
 * pagebuf_rele
 *
 * pagebuf_rele releases a hold on the specified buffer. If the
 * hold count is 1, pagebuf_rele calls pagebuf_free.
 */
void
pagebuf_rele(
	xfs_buf_t		*pb)
{
	xfs_bufhash_t		*hash = pb->pb_hash;

	PB_TRACE(pb, "rele", pb->pb_relse);

	/*
	 * pagebuf_lookup buffers are not hashed, not delayed write,
	 * and don't have their own release routines. Special case.
	 */
	if (unlikely(!hash)) {
		ASSERT(!pb->pb_relse);
		if (atomic_dec_and_test(&pb->pb_hold))
			xfs_buf_free(pb);
		return;
	}

	if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) {
		int		do_free = 1;

		if (pb->pb_relse) {
			atomic_inc(&pb->pb_hold);
			spin_unlock(&hash->bh_lock);
			(*(pb->pb_relse)) (pb);
			spin_lock(&hash->bh_lock);
			do_free = 0;
		}

		if (pb->pb_flags & PBF_FS_MANAGED) {
			do_free = 0;
		}

		if (do_free) {
			ASSERT((pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)) == 0);
			list_del_init(&pb->pb_hash_list);
			spin_unlock(&hash->bh_lock);
			pagebuf_free(pb);
		} else {
			spin_unlock(&hash->bh_lock);
		}
	} else {
		/*
		 * Catch reference count leaks
		 */
		ASSERT(atomic_read(&pb->pb_hold) >= 0);
	}
}


/*
 * Mutual exclusion on buffers. Locking model:
 *
 * Buffers associated with inodes for which buffer locking
 * is not enabled are not protected by semaphores, and are
 * assumed to be exclusively owned by the caller. There is a
 * spinlock in the buffer, used by the caller when concurrent
 * access is possible.
 */

/*
 * pagebuf_cond_lock
 *
 * pagebuf_cond_lock locks a buffer object, if it is not already locked.
 * Note that this in no way
 * locks the underlying pages, so it is only useful for synchronizing
 * concurrent use of page buffer objects, not for synchronizing independent
 * access to the underlying pages.
 */
int
pagebuf_cond_lock(			/* lock buffer, if not locked	*/
					/* (returns -EBUSY if locked)	*/
	xfs_buf_t		*pb)
{
	int			locked;

	locked = down_trylock(&pb->pb_sema) == 0;
	if (locked) {
		PB_SET_OWNER(pb);
	}
	PB_TRACE(pb, "cond_lock", (long)locked);
	return(locked ? 0 : -EBUSY);
}

#if defined(DEBUG) || defined(XFS_BLI_TRACE)
/*
 * pagebuf_lock_value
 *
 * Return lock value for a pagebuf
 */
int
pagebuf_lock_value(
	xfs_buf_t		*pb)
{
	return(atomic_read(&pb->pb_sema.count));
}
#endif

/*
 * pagebuf_lock
 *
 * pagebuf_lock locks a buffer object. Note that this in no way
 * locks the underlying pages, so it is only useful for synchronizing
 * concurrent use of page buffer objects, not for synchronizing independent
 * access to the underlying pages.
 */
int
pagebuf_lock(
	xfs_buf_t		*pb)
{
	PB_TRACE(pb, "lock", 0);
	if (atomic_read(&pb->pb_io_remaining))
		blk_run_address_space(pb->pb_target->pbr_mapping);
	down(&pb->pb_sema);
	PB_SET_OWNER(pb);
	PB_TRACE(pb, "locked", 0);
	return 0;
}

/*
 * pagebuf_unlock
 *
 * pagebuf_unlock releases the lock on the buffer object created by
 * pagebuf_lock or pagebuf_cond_lock (not any pinning of underlying pages
 * created by pagebuf_pin).
 *
 * If the buffer is marked delwri but is not queued, do so before we
 * unlock the buffer as we need to set flags correctly. We also need to
 * take a reference for the delwri queue because the unlocker is going to
 * drop theirs and they don't know we just queued it.
 */
void
pagebuf_unlock(				/* unlock buffer		*/
	xfs_buf_t		*pb)	/* buffer to unlock		*/
{
	if ((pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)) == PBF_DELWRI) {
		atomic_inc(&pb->pb_hold);
		pb->pb_flags |= PBF_ASYNC;
		pagebuf_delwri_queue(pb, 0);
	}

	PB_CLEAR_OWNER(pb);
	up(&pb->pb_sema);
	PB_TRACE(pb, "unlock", 0);
}


/*
 * Pinning Buffer Storage in Memory
 */

/*
 * pagebuf_pin
 *
 * pagebuf_pin locks all of the memory represented by a buffer in
 * memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for
 * the same or different buffers affecting a given page, will
 * properly count the number of outstanding "pin" requests. The
 * buffer may be released after the pagebuf_pin and a different
 * buffer used when calling pagebuf_unpin, if desired.
 * pagebuf_pin should be used by the file system when it wants to be
 * assured that no attempt will be made to force the affected
 * memory to disk. It does not assure that a given logical page
 * will not be moved to a different physical page.
 */
void
pagebuf_pin(
	xfs_buf_t		*pb)
{
	atomic_inc(&pb->pb_pin_count);
	PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
}

/*
 * pagebuf_unpin
 *
 * pagebuf_unpin reverses the locking of memory performed by
 * pagebuf_pin. Note that both functions affect the logical
 * pages associated with the buffer, not the buffer itself.
 */
void
pagebuf_unpin(
	xfs_buf_t		*pb)
{
	if (atomic_dec_and_test(&pb->pb_pin_count)) {
		wake_up_all(&pb->pb_waiters);
	}
	PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
}

int
pagebuf_ispin(
	xfs_buf_t		*pb)
{
	return atomic_read(&pb->pb_pin_count);
}

/*
 * pagebuf_wait_unpin
 *
 * pagebuf_wait_unpin waits until all of the memory associated
 * with the buffer is no longer locked in memory. It returns
 * immediately if none of the affected pages are locked.
 */
static inline void
_pagebuf_wait_unpin(
	xfs_buf_t		*pb)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&pb->pb_pin_count) == 0)
		return;

	add_wait_queue(&pb->pb_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&pb->pb_pin_count) == 0)
			break;
		if (atomic_read(&pb->pb_io_remaining))
			blk_run_address_space(pb->pb_target->pbr_mapping);
		schedule();
	}
	remove_wait_queue(&pb->pb_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 * Buffer Utility Routines
 */

/*
 * pagebuf_iodone
 *
 * pagebuf_iodone marks a buffer for which I/O is in progress
 * done with respect to that I/O. The pb_iodone routine, if
 * present, will be called as a side-effect.
 */
STATIC void
pagebuf_iodone_work(
	void			*v)
{
	xfs_buf_t		*bp = (xfs_buf_t *)v;

	if (bp->pb_iodone)
		(*(bp->pb_iodone))(bp);
	else if (bp->pb_flags & PBF_ASYNC)
		xfs_buf_relse(bp);
}

void
pagebuf_iodone(
	xfs_buf_t		*pb,
	int			dataio,
	int			schedule)
{
	pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
	if (pb->pb_error == 0) {
		pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
	}

	PB_TRACE(pb, "iodone", pb->pb_iodone);

	if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
		if (schedule) {
			INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb);
			queue_work(dataio ? xfsdatad_workqueue :
				xfslogd_workqueue, &pb->pb_iodone_work);
		} else {
			pagebuf_iodone_work(pb);
		}
	} else {
		up(&pb->pb_iodonesema);
	}
}

/*
 * pagebuf_ioerror
 *
 * pagebuf_ioerror sets the error code for a buffer.
 */
void
pagebuf_ioerror(			/* mark/clear buffer error flag */
	xfs_buf_t		*pb,	/* buffer to mark		*/
	int			error)	/* error to store (0 if none)	*/
{
	ASSERT(error >= 0 && error <= 0xffff);
	pb->pb_error = (unsigned short)error;
	PB_TRACE(pb, "ioerror", (unsigned long)error);
}

/*
 * pagebuf_iostart
 *
 * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
 * If necessary, it will arrange for any disk space allocation required,
 * and it will break up the request if the block mappings require it.
 * The pb_iodone routine in the buffer supplied will only be called
 * when all of the subsidiary I/O requests, if any, have been completed.
 * pagebuf_iostart calls the pagebuf_ioinitiate routine or
 * pagebuf_iorequest, if the former routine is not defined, to start
 * the I/O on a given low-level request.
 */
int
pagebuf_iostart(			/* start I/O on a buffer	*/
	xfs_buf_t		*pb,	/* buffer to start		*/
	page_buf_flags_t	flags)	/* PBF_LOCK, PBF_ASYNC, PBF_READ, */
					/* PBF_WRITE, PBF_DELWRI,	*/
					/* PBF_DONT_BLOCK		*/
{
	int			status = 0;

	PB_TRACE(pb, "iostart", (unsigned long)flags);

	if (flags & PBF_DELWRI) {
		pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
		pb->pb_flags |= flags & (PBF_DELWRI | PBF_ASYNC);
		pagebuf_delwri_queue(pb, 1);
		return status;
	}

	pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI | \
			PBF_READ_AHEAD | _PBF_RUN_QUEUES);
	pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \
			PBF_READ_AHEAD | _PBF_RUN_QUEUES);

	BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL);

	/* For writes allow an alternate strategy routine to precede
	 * the actual I/O request (which may not be issued at all in
	 * a shutdown situation, for example).
	 */
	status = (flags & PBF_WRITE) ?
		pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);

	/* Wait for I/O if we are not an async request.
	 * Note: async I/O request completion will release the buffer,
	 * and that can already be done by this point. So using the
	 * buffer pointer from here on, after async I/O, is invalid.
	 */
	if (!status && !(flags & PBF_ASYNC))
		status = pagebuf_iowait(pb);

	return status;
}

/*
 * Helper routine for pagebuf_iorequest
 */

STATIC __inline__ int
_pagebuf_iolocked(
	xfs_buf_t		*pb)
{
	ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
	if (pb->pb_flags & PBF_READ)
		return pb->pb_locked;
	return 0;
}

STATIC __inline__ void
_pagebuf_iodone(
	xfs_buf_t		*pb,
	int			schedule)
{
	if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
		pb->pb_locked = 0;
		pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
	}
}

STATIC int
bio_end_io_pagebuf(
	struct bio		*bio,
	unsigned int		bytes_done,
	int			error)
{
	xfs_buf_t		*pb = (xfs_buf_t *)bio->bi_private;
	unsigned int		blocksize = pb->pb_target->pbr_bsize;
	struct bio_vec		*bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

	if (bio->bi_size)
		return 1;

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		pb->pb_error = EIO;

	do {
		struct page	*page = bvec->bv_page;

		if (unlikely(pb->pb_error)) {
			if (pb->pb_flags & PBF_READ)
				ClearPageUptodate(page);
			SetPageError(page);
		} else if (blocksize == PAGE_CACHE_SIZE) {
			SetPageUptodate(page);
		} else if (!PagePrivate(page) &&
				(pb->pb_flags & _PBF_PAGE_CACHE)) {
			set_page_region(page, bvec->bv_offset, bvec->bv_len);
		}

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (_pagebuf_iolocked(pb)) {
			unlock_page(page);
		}
	} while (bvec >= bio->bi_io_vec);

	_pagebuf_iodone(pb, 1);
	bio_put(bio);
	return 0;
}

STATIC void
_pagebuf_ioapply(
	xfs_buf_t		*pb)
{
	int			i, rw, map_i, total_nr_pages, nr_pages;
	struct bio		*bio;
	int			offset = pb->pb_offset;
	int			size = pb->pb_count_desired;
	sector_t		sector = pb->pb_bn;
	unsigned int		blocksize = pb->pb_target->pbr_bsize;
	int			locking = _pagebuf_iolocked(pb);

	total_nr_pages = pb->pb_page_count;
	map_i = 0;

	if (pb->pb_flags & _PBF_RUN_QUEUES) {
		pb->pb_flags &= ~_PBF_RUN_QUEUES;
		rw = (pb->pb_flags & PBF_READ) ? READ_SYNC : WRITE_SYNC;
	} else {
		rw = (pb->pb_flags & PBF_READ) ? READ : WRITE;
	}

	/* Special code path for reading a sub page size pagebuf in --
	 * we populate up the whole page, and hence the other metadata
	 * in the same page. This optimization is only valid when the
	 * filesystem block size and the page size are equal.
	 */
	if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
	    (pb->pb_flags & PBF_READ) && locking &&
	    (blocksize == PAGE_CACHE_SIZE)) {
		bio = bio_alloc(GFP_NOIO, 1);

		bio->bi_bdev = pb->pb_target->pbr_bdev;
		bio->bi_sector = sector - (offset >> BBSHIFT);
		bio->bi_end_io = bio_end_io_pagebuf;
		bio->bi_private = pb;

		bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0);
		size = 0;

		atomic_inc(&pb->pb_io_remaining);

		goto submit_io;
	}

	/* Lock down the pages which we need to for the request */
	if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) {
		for (i = 0; size; i++) {
			int		nbytes = PAGE_CACHE_SIZE - offset;
			struct page	*page = pb->pb_pages[i];

			if (nbytes > size)
				nbytes = size;

			lock_page(page);

			size -= nbytes;
			offset = 0;
		}
		offset = pb->pb_offset;
		size = pb->pb_count_desired;
	}

next_chunk:
	atomic_inc(&pb->pb_io_remaining);
	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
	if (nr_pages > total_nr_pages)
		nr_pages = total_nr_pages;

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio->bi_bdev = pb->pb_target->pbr_bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = bio_end_io_pagebuf;
	bio->bi_private = pb;

	for (; size && nr_pages; nr_pages--, map_i++) {
		int	nbytes = PAGE_CACHE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		if (bio_add_page(bio, pb->pb_pages[map_i],
					nbytes, offset) < nbytes)
			break;

		offset = 0;
		sector += nbytes >> BBSHIFT;
		size -= nbytes;
		total_nr_pages--;
	}

submit_io:
	if (likely(bio->bi_size)) {
		submit_bio(rw, bio);
		if (size)
			goto next_chunk;
	} else {
		bio_put(bio);
		pagebuf_ioerror(pb, EIO);
	}
}

/*
 * pagebuf_iorequest -- the core I/O request routine.
 */
int
pagebuf_iorequest(			/* start real I/O		*/
	xfs_buf_t		*pb)	/* buffer to convey to device	*/
{
	PB_TRACE(pb, "iorequest", 0);

	if (pb->pb_flags & PBF_DELWRI) {
		pagebuf_delwri_queue(pb, 1);
		return 0;
	}

	if (pb->pb_flags & PBF_WRITE) {
		_pagebuf_wait_unpin(pb);
	}

	pagebuf_hold(pb);

	/* Set the count to 1 initially, this will stop an I/O
	 * completion callout which happens before we have started
	 * all the I/O from calling pagebuf_iodone too early.
	 */
	atomic_set(&pb->pb_io_remaining, 1);
	_pagebuf_ioapply(pb);
	_pagebuf_iodone(pb, 0);

	pagebuf_rele(pb);
	return 0;
}

/*
 * pagebuf_iowait
 *
 * pagebuf_iowait waits for I/O to complete on the buffer supplied.
 * It returns immediately if no I/O is pending. In any case, it returns
 * the error code, if any, or 0 if there is no error.
 */
int
pagebuf_iowait(
	xfs_buf_t		*pb)
{
	PB_TRACE(pb, "iowait", 0);
	if (atomic_read(&pb->pb_io_remaining))
		blk_run_address_space(pb->pb_target->pbr_mapping);
	down(&pb->pb_iodonesema);
	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
	return pb->pb_error;
}

caddr_t
pagebuf_offset(
	xfs_buf_t		*pb,
	size_t			offset)
{
	struct page		*page;

	offset += pb->pb_offset;

	page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
	return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
}

/*
 * pagebuf_iomove
 *
 * Move data into or out of a buffer.
 */
void
pagebuf_iomove(
	xfs_buf_t		*pb,	/* buffer to process		*/
	size_t			boff,	/* starting buffer offset	*/
	size_t			bsize,	/* length to copy		*/
	caddr_t			data,	/* data address			*/
	page_buf_rw_t		mode)	/* read/write flag		*/
{
	size_t			bend, cpoff, csize;
	struct page		*page;

	bend = boff + bsize;
	while (boff < bend) {
		page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
		cpoff = page_buf_poff(boff + pb->pb_offset);
		csize = min_t(size_t,
			      PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);

		ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));

		switch (mode) {
		case PBRW_ZERO:
			memset(page_address(page) + cpoff, 0, csize);
			break;
		case PBRW_READ:
			memcpy(data, page_address(page) + cpoff, csize);
			break;
		case PBRW_WRITE:
			memcpy(page_address(page) + cpoff, data, csize);
		}

		boff += csize;
		data += csize;
	}
}

/*
 * Handling of buftargs.
 */

/*
 * Wait for any bufs with callbacks that have been submitted but
 * have not yet returned... walk the hash list for the target.
 */
void
xfs_wait_buftarg(
	xfs_buftarg_t	*btp)
{
	xfs_buf_t	*bp, *n;
	xfs_bufhash_t	*hash;
	uint		i;

	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
		hash = &btp->bt_hash[i];
again:
		spin_lock(&hash->bh_lock);
		list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) {
			ASSERT(btp == bp->pb_target);
			if (!(bp->pb_flags & PBF_FS_MANAGED)) {
				spin_unlock(&hash->bh_lock);
				/*
				 * Catch superblock reference count leaks
				 * immediately
				 */
				BUG_ON(bp->pb_bn == 0);
				delay(100);
				goto again;
			}
		}
		spin_unlock(&hash->bh_lock);
	}
}

/*
 * Allocate buffer hash table for a given target.
 * For devices containing metadata (i.e. not the log/realtime devices)
 * we need to allocate a much larger hash table.
 */
STATIC void
xfs_alloc_bufhash(
	xfs_buftarg_t		*btp,
	int			external)
{
	unsigned int		i;

	btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */
	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
					sizeof(xfs_bufhash_t), KM_SLEEP);
	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
		spin_lock_init(&btp->bt_hash[i].bh_lock);
		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
	}
}

STATIC void
xfs_free_bufhash(
	xfs_buftarg_t		*btp)
{
	kmem_free(btp->bt_hash,
		  (1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t));
	btp->bt_hash = NULL;
}

void
xfs_free_buftarg(
	xfs_buftarg_t		*btp,
	int			external)
{
	xfs_flush_buftarg(btp, 1);
	if (external)
		xfs_blkdev_put(btp->pbr_bdev);
	xfs_free_bufhash(btp);
	iput(btp->pbr_mapping->host);
	kmem_free(btp, sizeof(*btp));
}

STATIC int
xfs_setsize_buftarg_flags(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize,
	int			verbose)
{
	btp->pbr_bsize = blocksize;
	btp->pbr_sshift = ffs(sectorsize) - 1;
	btp->pbr_smask = sectorsize - 1;

	if (set_blocksize(btp->pbr_bdev, sectorsize)) {
		printk(KERN_WARNING
			"XFS: Cannot set_blocksize to %u on device %s\n",
			sectorsize, XFS_BUFTARG_NAME(btp));
		return EINVAL;
	}

	if (verbose &&
	    (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
		printk(KERN_WARNING
			"XFS: %u byte sectors in use on device %s. "
			"This is suboptimal; %u or greater is ideal.\n",
			sectorsize, XFS_BUFTARG_NAME(btp),
			(unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
	}

	return 0;
}

/*
 * When allocating the initial buffer target we have not yet
 * read in the superblock, so we don't know what size sectors
 * are being used at this early stage. Play safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg_flags(btp,
			PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
}

int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize)
{
	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
}

STATIC int
xfs_mapping_buftarg(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	struct backing_dev_info	*bdi;
	struct inode		*inode;
	struct address_space	*mapping;
	static struct address_space_operations mapping_aops = {
		.sync_page = block_sync_page,
	};

	inode = new_inode(bdev->bd_inode->i_sb);
	if (!inode) {
		printk(KERN_WARNING
			"XFS: Cannot allocate mapping inode for device %s\n",
			XFS_BUFTARG_NAME(btp));
		return ENOMEM;
	}
	inode->i_mode = S_IFBLK;
	inode->i_bdev = bdev;
	inode->i_rdev = bdev->bd_dev;
	bdi = blk_get_backing_dev_info(bdev);
	if (!bdi)
		bdi = &default_backing_dev_info;
	mapping = &inode->i_data;
	mapping->a_ops = &mapping_aops;
	mapping->backing_dev_info = bdi;
	mapping_set_gfp_mask(mapping, GFP_NOFS);
	btp->pbr_mapping = mapping;
	return 0;
}

xfs_buftarg_t *
xfs_alloc_buftarg(
	struct block_device	*bdev,
	int			external)
{
	xfs_buftarg_t		*btp;

	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);

	btp->pbr_dev = bdev->bd_dev;
	btp->pbr_bdev = bdev;
	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error;
	if (xfs_mapping_buftarg(btp, bdev))
		goto error;
	xfs_alloc_bufhash(btp, external);
	return btp;

error:
	kmem_free(btp, sizeof(*btp));
	return NULL;
}


/*
 * Pagebuf delayed write buffer handling
 */

STATIC LIST_HEAD(pbd_delwrite_queue);
STATIC DEFINE_SPINLOCK(pbd_delwrite_lock);

STATIC void
pagebuf_delwri_queue(
	xfs_buf_t		*pb,
	int			unlock)
{
	PB_TRACE(pb, "delwri_q", (long)unlock);
	ASSERT((pb->pb_flags & (PBF_DELWRI|PBF_ASYNC)) ==
					(PBF_DELWRI|PBF_ASYNC));

	spin_lock(&pbd_delwrite_lock);
	/* If already in the queue, dequeue and place at tail */
	if (!list_empty(&pb->pb_list)) {
		ASSERT(pb->pb_flags & _PBF_DELWRI_Q);
		if (unlock) {
			atomic_dec(&pb->pb_hold);
		}
		list_del(&pb->pb_list);
	}

	pb->pb_flags |= _PBF_DELWRI_Q;
	list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
	pb->pb_queuetime = jiffies;
	spin_unlock(&pbd_delwrite_lock);

	if (unlock)
		pagebuf_unlock(pb);
}

void
pagebuf_delwri_dequeue(
	xfs_buf_t		*pb)
{
	int			dequeued = 0;

	spin_lock(&pbd_delwrite_lock);
	if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
		ASSERT(pb->pb_flags & _PBF_DELWRI_Q);
		list_del_init(&pb->pb_list);
		dequeued = 1;
	}
	pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q);
	spin_unlock(&pbd_delwrite_lock);

	if (dequeued)
		pagebuf_rele(pb);

	PB_TRACE(pb, "delwri_dq", (long)dequeued);
}

STATIC void
pagebuf_runall_queues(
	struct workqueue_struct	*queue)
{
	flush_workqueue(queue);
}

/* Defines for pagebuf daemon */
STATIC struct task_struct *xfsbufd_task;
STATIC int xfsbufd_force_flush;
STATIC int xfsbufd_force_sleep;

STATIC int
xfsbufd_wakeup(
	int			priority,
	unsigned int		mask)
{
	if (xfsbufd_force_sleep)
		return 0;
	xfsbufd_force_flush = 1;
	barrier();
	wake_up_process(xfsbufd_task);
	return 0;
}

STATIC int
xfsbufd(
	void			*data)
{
	struct list_head	tmp;
	unsigned long		age;
	xfs_buftarg_t		*target;
	xfs_buf_t		*pb, *n;

	current->flags |= PF_MEMALLOC;

	INIT_LIST_HEAD(&tmp);
	do {
		if (unlikely(freezing(current))) {
			xfsbufd_force_sleep = 1;
			refrigerator();
		} else {
			xfsbufd_force_sleep = 0;
		}

		schedule_timeout_interruptible
			(xfs_buf_timer_centisecs * msecs_to_jiffies(10));

		age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
		spin_lock(&pbd_delwrite_lock);
		list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
			PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
			ASSERT(pb->pb_flags & PBF_DELWRI);

			if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) {
				if (!xfsbufd_force_flush &&
				    time_before(jiffies,
						pb->pb_queuetime + age)) {
					pagebuf_unlock(pb);
					break;
				}

				pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q);
				pb->pb_flags |= PBF_WRITE;
				list_move(&pb->pb_list, &tmp);
			}
		}
		spin_unlock(&pbd_delwrite_lock);

		while (!list_empty(&tmp)) {
			pb = list_entry(tmp.next, xfs_buf_t, pb_list);
			target = pb->pb_target;

			list_del_init(&pb->pb_list);
			pagebuf_iostrategy(pb);

			blk_run_address_space(target->pbr_mapping);
		}

		if (as_list_len > 0)
			purge_addresses();

		xfsbufd_force_flush = 0;
	} while (!kthread_should_stop());

	return 0;
}

/*
 * Go through all incore buffers, and release buffers if they belong to
 * the given device. This is used in filesystem error handling to
 * preserve the consistency of its metadata.
 */
int
xfs_flush_buftarg(
	xfs_buftarg_t		*target,
	int			wait)
{
	struct list_head	tmp;
	xfs_buf_t		*pb, *n;
	int			pincount = 0;

	pagebuf_runall_queues(xfsdatad_workqueue);
	pagebuf_runall_queues(xfslogd_workqueue);

	INIT_LIST_HEAD(&tmp);
	spin_lock(&pbd_delwrite_lock);
	list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {

		if (pb->pb_target != target)
			continue;

		ASSERT(pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q));
		PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
		if (pagebuf_ispin(pb)) {
			pincount++;
			continue;
		}

		list_move(&pb->pb_list, &tmp);
	}
	spin_unlock(&pbd_delwrite_lock);

	/*
	 * Dropped the delayed write list lock, now walk the temporary list
	 */
	list_for_each_entry_safe(pb, n, &tmp, pb_list) {
		pagebuf_lock(pb);
		pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q);
		pb->pb_flags |= PBF_WRITE;
		if (wait)
			pb->pb_flags &= ~PBF_ASYNC;
		else
			list_del_init(&pb->pb_list);

		pagebuf_iostrategy(pb);
	}

	/*
	 * Remaining list items must be flushed before returning
	 */
	while (!list_empty(&tmp)) {
		pb = list_entry(tmp.next, xfs_buf_t, pb_list);

		list_del_init(&pb->pb_list);
		xfs_iowait(pb);
		xfs_buf_relse(pb);
	}

	if (wait)
		blk_run_address_space(target->pbr_mapping);

	return pincount;
}

STATIC int
xfs_buf_daemons_start(void)
{
	int		error = -ENOMEM;

	xfslogd_workqueue = create_workqueue("xfslogd");
	if (!xfslogd_workqueue)
		goto out;

	xfsdatad_workqueue = create_workqueue("xfsdatad");
	if (!xfsdatad_workqueue)
		goto out_destroy_xfslogd_workqueue;

	xfsbufd_task = kthread_run(xfsbufd, NULL, "xfsbufd");
	if (IS_ERR(xfsbufd_task)) {
		error = PTR_ERR(xfsbufd_task);
		goto out_destroy_xfsdatad_workqueue;
	}
	return 0;

 out_destroy_xfsdatad_workqueue:
	destroy_workqueue(xfsdatad_workqueue);
 out_destroy_xfslogd_workqueue:
	destroy_workqueue(xfslogd_workqueue);
 out:
	return error;
}

/*
 * Note: do not mark as __exit, it is called from pagebuf_terminate.
 */
STATIC void
xfs_buf_daemons_stop(void)
{
	kthread_stop(xfsbufd_task);
	destroy_workqueue(xfslogd_workqueue);
	destroy_workqueue(xfsdatad_workqueue);
}

/*
 * Initialization and Termination
 */

int __init
pagebuf_init(void)
{
	int		error = -ENOMEM;

	pagebuf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf");
	if (!pagebuf_zone)
		goto out;

#ifdef PAGEBUF_TRACE
	pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
#endif

	error = xfs_buf_daemons_start();
	if (error)
		goto out_free_buf_zone;

	pagebuf_shake = kmem_shake_register(xfsbufd_wakeup);
	if (!pagebuf_shake) {
		error = -ENOMEM;
		goto out_stop_daemons;
	}

	return 0;

 out_stop_daemons:
	xfs_buf_daemons_stop();
 out_free_buf_zone:
#ifdef PAGEBUF_TRACE
	ktrace_free(pagebuf_trace_buf);
#endif
	kmem_zone_destroy(pagebuf_zone);
 out:
	return error;
}


/*
 * pagebuf_terminate.
 *
 * Note: do not mark as __exit, this is also called from the __init code.
 */
void
pagebuf_terminate(void)
{
	xfs_buf_daemons_stop();

#ifdef PAGEBUF_TRACE
	ktrace_free(pagebuf_trace_buf);
#endif

	kmem_zone_destroy(pagebuf_zone);
	kmem_shake_deregister(pagebuf_shake);
}
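
Editor's note: for orientation, a minimal usage sketch follows. It is not part of xfs_buf.c above; the helper name, the buftarg pointer, and the disk address are hypothetical, and it merely strings together the read, unlock, and release interfaces the file defines.

/*
 * Illustrative sketch only -- not part of the listing above. Assumes the
 * caller already has an xfs_buftarg_t and a valid disk address (both
 * hypothetical here); error handling is reduced to the bare minimum.
 */
static int
example_read_block(
	xfs_buftarg_t		*target,	/* hypothetical buffer target */
	loff_t			blkno,		/* disk address, in basic blocks */
	size_t			numblks)	/* length, in basic blocks */
{
	xfs_buf_t		*bp;

	/* Look up or create the buffer, issue the read, and wait for it. */
	bp = xfs_buf_read_flags(target, blkno, numblks, PBF_LOCK | PBF_MAPPED);
	if (!bp)
		return ENOMEM;
	if (bp->pb_error) {
		int		error = bp->pb_error;

		pagebuf_unlock(bp);
		pagebuf_rele(bp);
		return error;
	}

	/* bp->pb_addr points at the mapped contents while the hold is kept. */

	pagebuf_unlock(bp);	/* drop the lock taken by _pagebuf_find() */
	pagebuf_rele(bp);	/* drop the reference */
	return 0;
}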