Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v2.6.23-rc2, 1870 lines, 42 kB
1/* 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18#include "xfs.h" 19#include <linux/stddef.h> 20#include <linux/errno.h> 21#include <linux/slab.h> 22#include <linux/pagemap.h> 23#include <linux/init.h> 24#include <linux/vmalloc.h> 25#include <linux/bio.h> 26#include <linux/sysctl.h> 27#include <linux/proc_fs.h> 28#include <linux/workqueue.h> 29#include <linux/percpu.h> 30#include <linux/blkdev.h> 31#include <linux/hash.h> 32#include <linux/kthread.h> 33#include <linux/migrate.h> 34#include <linux/backing-dev.h> 35#include <linux/freezer.h> 36 37static kmem_zone_t *xfs_buf_zone; 38STATIC int xfsbufd(void *); 39STATIC int xfsbufd_wakeup(int, gfp_t); 40STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 41static struct shrinker xfs_buf_shake = { 42 .shrink = xfsbufd_wakeup, 43 .seeks = DEFAULT_SEEKS, 44}; 45 46static struct workqueue_struct *xfslogd_workqueue; 47struct workqueue_struct *xfsdatad_workqueue; 48 49#ifdef XFS_BUF_TRACE 50void 51xfs_buf_trace( 52 xfs_buf_t *bp, 53 char *id, 54 void *data, 55 void *ra) 56{ 57 ktrace_enter(xfs_buf_trace_buf, 58 bp, id, 59 (void *)(unsigned long)bp->b_flags, 60 (void *)(unsigned long)bp->b_hold.counter, 61 (void *)(unsigned long)bp->b_sema.count.counter, 62 (void *)current, 63 data, ra, 64 (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), 65 (void *)(unsigned long)(bp->b_file_offset & 0xffffffff), 66 (void *)(unsigned long)bp->b_buffer_length, 67 NULL, NULL, NULL, NULL, NULL); 68} 69ktrace_t *xfs_buf_trace_buf; 70#define XFS_BUF_TRACE_SIZE 4096 71#define XB_TRACE(bp, id, data) \ 72 xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0)) 73#else 74#define XB_TRACE(bp, id, data) do { } while (0) 75#endif 76 77#ifdef XFS_BUF_LOCK_TRACKING 78# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) 79# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) 80# define XB_GET_OWNER(bp) ((bp)->b_last_holder) 81#else 82# define XB_SET_OWNER(bp) do { } while (0) 83# define XB_CLEAR_OWNER(bp) do { } while (0) 84# define XB_GET_OWNER(bp) do { } while (0) 85#endif 86 87#define xb_to_gfp(flags) \ 88 ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ 89 ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) 90 91#define xb_to_km(flags) \ 92 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) 93 94#define xfs_buf_allocate(flags) \ 95 kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) 96#define xfs_buf_deallocate(bp) \ 97 kmem_zone_free(xfs_buf_zone, (bp)); 98 99/* 100 * Page Region interfaces. 101 * 102 * For pages in filesystems where the blocksize is smaller than the 103 * pagesize, we use the page->private field (long) to hold a bitmap 104 * of uptodate regions within the page. 105 * 106 * Each such region is "bytes per page / bits per long" bytes long. 
107 * 108 * NBPPR == number-of-bytes-per-page-region 109 * BTOPR == bytes-to-page-region (rounded up) 110 * BTOPRT == bytes-to-page-region-truncated (rounded down) 111 */ 112#if (BITS_PER_LONG == 32) 113#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ 114#elif (BITS_PER_LONG == 64) 115#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */ 116#else 117#error BITS_PER_LONG must be 32 or 64 118#endif 119#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG) 120#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT) 121#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT)) 122 123STATIC unsigned long 124page_region_mask( 125 size_t offset, 126 size_t length) 127{ 128 unsigned long mask; 129 int first, final; 130 131 first = BTOPR(offset); 132 final = BTOPRT(offset + length - 1); 133 first = min(first, final); 134 135 mask = ~0UL; 136 mask <<= BITS_PER_LONG - (final - first); 137 mask >>= BITS_PER_LONG - (final); 138 139 ASSERT(offset + length <= PAGE_CACHE_SIZE); 140 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0); 141 142 return mask; 143} 144 145STATIC_INLINE void 146set_page_region( 147 struct page *page, 148 size_t offset, 149 size_t length) 150{ 151 set_page_private(page, 152 page_private(page) | page_region_mask(offset, length)); 153 if (page_private(page) == ~0UL) 154 SetPageUptodate(page); 155} 156 157STATIC_INLINE int 158test_page_region( 159 struct page *page, 160 size_t offset, 161 size_t length) 162{ 163 unsigned long mask = page_region_mask(offset, length); 164 165 return (mask && (page_private(page) & mask) == mask); 166} 167 168/* 169 * Mapping of multi-page buffers into contiguous virtual space 170 */ 171 172typedef struct a_list { 173 void *vm_addr; 174 struct a_list *next; 175} a_list_t; 176 177static a_list_t *as_free_head; 178static int as_list_len; 179static DEFINE_SPINLOCK(as_lock); 180 181/* 182 * Try to batch vunmaps because they are costly. 183 */ 184STATIC void 185free_address( 186 void *addr) 187{ 188 a_list_t *aentry; 189 190 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); 191 if (likely(aentry)) { 192 spin_lock(&as_lock); 193 aentry->next = as_free_head; 194 aentry->vm_addr = addr; 195 as_free_head = aentry; 196 as_list_len++; 197 spin_unlock(&as_lock); 198 } else { 199 vunmap(addr); 200 } 201} 202 203STATIC void 204purge_addresses(void) 205{ 206 a_list_t *aentry, *old; 207 208 if (as_free_head == NULL) 209 return; 210 211 spin_lock(&as_lock); 212 aentry = as_free_head; 213 as_free_head = NULL; 214 as_list_len = 0; 215 spin_unlock(&as_lock); 216 217 while ((old = aentry) != NULL) { 218 vunmap(aentry->vm_addr); 219 aentry = aentry->next; 220 kfree(old); 221 } 222} 223 224/* 225 * Internal xfs_buf_t object manipulation 226 */ 227 228STATIC void 229_xfs_buf_initialize( 230 xfs_buf_t *bp, 231 xfs_buftarg_t *target, 232 xfs_off_t range_base, 233 size_t range_length, 234 xfs_buf_flags_t flags) 235{ 236 /* 237 * We don't want certain flags to appear in b_flags. 238 */ 239 flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); 240 241 memset(bp, 0, sizeof(xfs_buf_t)); 242 atomic_set(&bp->b_hold, 1); 243 init_MUTEX_LOCKED(&bp->b_iodonesema); 244 INIT_LIST_HEAD(&bp->b_list); 245 INIT_LIST_HEAD(&bp->b_hash_list); 246 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ 247 XB_SET_OWNER(bp); 248 bp->b_target = target; 249 bp->b_file_offset = range_base; 250 /* 251 * Set buffer_length and count_desired to the same value initially. 252 * I/O routines should use count_desired, which will be the same in 253 * most cases but may be reset (e.g. 
XFS recovery). 254 */ 255 bp->b_buffer_length = bp->b_count_desired = range_length; 256 bp->b_flags = flags; 257 bp->b_bn = XFS_BUF_DADDR_NULL; 258 atomic_set(&bp->b_pin_count, 0); 259 init_waitqueue_head(&bp->b_waiters); 260 261 XFS_STATS_INC(xb_create); 262 XB_TRACE(bp, "initialize", target); 263} 264 265/* 266 * Allocate a page array capable of holding a specified number 267 * of pages, and point the page buf at it. 268 */ 269STATIC int 270_xfs_buf_get_pages( 271 xfs_buf_t *bp, 272 int page_count, 273 xfs_buf_flags_t flags) 274{ 275 /* Make sure that we have a page list */ 276 if (bp->b_pages == NULL) { 277 bp->b_offset = xfs_buf_poff(bp->b_file_offset); 278 bp->b_page_count = page_count; 279 if (page_count <= XB_PAGES) { 280 bp->b_pages = bp->b_page_array; 281 } else { 282 bp->b_pages = kmem_alloc(sizeof(struct page *) * 283 page_count, xb_to_km(flags)); 284 if (bp->b_pages == NULL) 285 return -ENOMEM; 286 } 287 memset(bp->b_pages, 0, sizeof(struct page *) * page_count); 288 } 289 return 0; 290} 291 292/* 293 * Frees b_pages if it was allocated. 294 */ 295STATIC void 296_xfs_buf_free_pages( 297 xfs_buf_t *bp) 298{ 299 if (bp->b_pages != bp->b_page_array) { 300 kmem_free(bp->b_pages, 301 bp->b_page_count * sizeof(struct page *)); 302 } 303} 304 305/* 306 * Releases the specified buffer. 307 * 308 * The modification state of any associated pages is left unchanged. 309 * The buffer most not be on any hash - use xfs_buf_rele instead for 310 * hashed and refcounted buffers 311 */ 312void 313xfs_buf_free( 314 xfs_buf_t *bp) 315{ 316 XB_TRACE(bp, "free", 0); 317 318 ASSERT(list_empty(&bp->b_hash_list)); 319 320 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 321 uint i; 322 323 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 324 free_address(bp->b_addr - bp->b_offset); 325 326 for (i = 0; i < bp->b_page_count; i++) { 327 struct page *page = bp->b_pages[i]; 328 329 if (bp->b_flags & _XBF_PAGE_CACHE) 330 ASSERT(!PagePrivate(page)); 331 page_cache_release(page); 332 } 333 _xfs_buf_free_pages(bp); 334 } 335 336 xfs_buf_deallocate(bp); 337} 338 339/* 340 * Finds all pages for buffer in question and builds it's page list. 341 */ 342STATIC int 343_xfs_buf_lookup_pages( 344 xfs_buf_t *bp, 345 uint flags) 346{ 347 struct address_space *mapping = bp->b_target->bt_mapping; 348 size_t blocksize = bp->b_target->bt_bsize; 349 size_t size = bp->b_count_desired; 350 size_t nbytes, offset; 351 gfp_t gfp_mask = xb_to_gfp(flags); 352 unsigned short page_count, i; 353 pgoff_t first; 354 xfs_off_t end; 355 int error; 356 357 end = bp->b_file_offset + bp->b_buffer_length; 358 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 359 360 error = _xfs_buf_get_pages(bp, page_count, flags); 361 if (unlikely(error)) 362 return error; 363 bp->b_flags |= _XBF_PAGE_CACHE; 364 365 offset = bp->b_offset; 366 first = bp->b_file_offset >> PAGE_CACHE_SHIFT; 367 368 for (i = 0; i < bp->b_page_count; i++) { 369 struct page *page; 370 uint retries = 0; 371 372 retry: 373 page = find_or_create_page(mapping, first + i, gfp_mask); 374 if (unlikely(page == NULL)) { 375 if (flags & XBF_READ_AHEAD) { 376 bp->b_page_count = i; 377 for (i = 0; i < bp->b_page_count; i++) 378 unlock_page(bp->b_pages[i]); 379 return -ENOMEM; 380 } 381 382 /* 383 * This could deadlock. 384 * 385 * But until all the XFS lowlevel code is revamped to 386 * handle buffer allocation failures we can't do much. 
387 */ 388 if (!(++retries % 100)) 389 printk(KERN_ERR 390 "XFS: possible memory allocation " 391 "deadlock in %s (mode:0x%x)\n", 392 __FUNCTION__, gfp_mask); 393 394 XFS_STATS_INC(xb_page_retries); 395 xfsbufd_wakeup(0, gfp_mask); 396 congestion_wait(WRITE, HZ/50); 397 goto retry; 398 } 399 400 XFS_STATS_INC(xb_page_found); 401 402 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 403 size -= nbytes; 404 405 ASSERT(!PagePrivate(page)); 406 if (!PageUptodate(page)) { 407 page_count--; 408 if (blocksize >= PAGE_CACHE_SIZE) { 409 if (flags & XBF_READ) 410 bp->b_locked = 1; 411 } else if (!PagePrivate(page)) { 412 if (test_page_region(page, offset, nbytes)) 413 page_count++; 414 } 415 } 416 417 bp->b_pages[i] = page; 418 offset = 0; 419 } 420 421 if (!bp->b_locked) { 422 for (i = 0; i < bp->b_page_count; i++) 423 unlock_page(bp->b_pages[i]); 424 } 425 426 if (page_count == bp->b_page_count) 427 bp->b_flags |= XBF_DONE; 428 429 XB_TRACE(bp, "lookup_pages", (long)page_count); 430 return error; 431} 432 433/* 434 * Map buffer into kernel address-space if nessecary. 435 */ 436STATIC int 437_xfs_buf_map_pages( 438 xfs_buf_t *bp, 439 uint flags) 440{ 441 /* A single page buffer is always mappable */ 442 if (bp->b_page_count == 1) { 443 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 444 bp->b_flags |= XBF_MAPPED; 445 } else if (flags & XBF_MAPPED) { 446 if (as_list_len > 64) 447 purge_addresses(); 448 bp->b_addr = vmap(bp->b_pages, bp->b_page_count, 449 VM_MAP, PAGE_KERNEL); 450 if (unlikely(bp->b_addr == NULL)) 451 return -ENOMEM; 452 bp->b_addr += bp->b_offset; 453 bp->b_flags |= XBF_MAPPED; 454 } 455 456 return 0; 457} 458 459/* 460 * Finding and Reading Buffers 461 */ 462 463/* 464 * Look up, and creates if absent, a lockable buffer for 465 * a given range of an inode. The buffer is returned 466 * locked. If other overlapping buffers exist, they are 467 * released before the new buffer is created and locked, 468 * which may imply that this call will block until those buffers 469 * are unlocked. No I/O is implied by this call. 470 */ 471xfs_buf_t * 472_xfs_buf_find( 473 xfs_buftarg_t *btp, /* block device target */ 474 xfs_off_t ioff, /* starting offset of range */ 475 size_t isize, /* length of range */ 476 xfs_buf_flags_t flags, 477 xfs_buf_t *new_bp) 478{ 479 xfs_off_t range_base; 480 size_t range_length; 481 xfs_bufhash_t *hash; 482 xfs_buf_t *bp, *n; 483 484 range_base = (ioff << BBSHIFT); 485 range_length = (isize << BBSHIFT); 486 487 /* Check for IOs smaller than the sector size / not sector aligned */ 488 ASSERT(!(range_length < (1 << btp->bt_sshift))); 489 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); 490 491 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; 492 493 spin_lock(&hash->bh_lock); 494 495 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 496 ASSERT(btp == bp->b_target); 497 if (bp->b_file_offset == range_base && 498 bp->b_buffer_length == range_length) { 499 /* 500 * If we look at something, bring it to the 501 * front of the list for next time. 
502 */ 503 atomic_inc(&bp->b_hold); 504 list_move(&bp->b_hash_list, &hash->bh_list); 505 goto found; 506 } 507 } 508 509 /* No match found */ 510 if (new_bp) { 511 _xfs_buf_initialize(new_bp, btp, range_base, 512 range_length, flags); 513 new_bp->b_hash = hash; 514 list_add(&new_bp->b_hash_list, &hash->bh_list); 515 } else { 516 XFS_STATS_INC(xb_miss_locked); 517 } 518 519 spin_unlock(&hash->bh_lock); 520 return new_bp; 521 522found: 523 spin_unlock(&hash->bh_lock); 524 525 /* Attempt to get the semaphore without sleeping, 526 * if this does not work then we need to drop the 527 * spinlock and do a hard attempt on the semaphore. 528 */ 529 if (down_trylock(&bp->b_sema)) { 530 if (!(flags & XBF_TRYLOCK)) { 531 /* wait for buffer ownership */ 532 XB_TRACE(bp, "get_lock", 0); 533 xfs_buf_lock(bp); 534 XFS_STATS_INC(xb_get_locked_waited); 535 } else { 536 /* We asked for a trylock and failed, no need 537 * to look at file offset and length here, we 538 * know that this buffer at least overlaps our 539 * buffer and is locked, therefore our buffer 540 * either does not exist, or is this buffer. 541 */ 542 xfs_buf_rele(bp); 543 XFS_STATS_INC(xb_busy_locked); 544 return NULL; 545 } 546 } else { 547 /* trylock worked */ 548 XB_SET_OWNER(bp); 549 } 550 551 if (bp->b_flags & XBF_STALE) { 552 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 553 bp->b_flags &= XBF_MAPPED; 554 } 555 XB_TRACE(bp, "got_lock", 0); 556 XFS_STATS_INC(xb_get_locked); 557 return bp; 558} 559 560/* 561 * Assembles a buffer covering the specified range. 562 * Storage in memory for all portions of the buffer will be allocated, 563 * although backing storage may not be. 564 */ 565xfs_buf_t * 566xfs_buf_get_flags( 567 xfs_buftarg_t *target,/* target for buffer */ 568 xfs_off_t ioff, /* starting offset of range */ 569 size_t isize, /* length of range */ 570 xfs_buf_flags_t flags) 571{ 572 xfs_buf_t *bp, *new_bp; 573 int error = 0, i; 574 575 new_bp = xfs_buf_allocate(flags); 576 if (unlikely(!new_bp)) 577 return NULL; 578 579 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 580 if (bp == new_bp) { 581 error = _xfs_buf_lookup_pages(bp, flags); 582 if (error) 583 goto no_buffer; 584 } else { 585 xfs_buf_deallocate(new_bp); 586 if (unlikely(bp == NULL)) 587 return NULL; 588 } 589 590 for (i = 0; i < bp->b_page_count; i++) 591 mark_page_accessed(bp->b_pages[i]); 592 593 if (!(bp->b_flags & XBF_MAPPED)) { 594 error = _xfs_buf_map_pages(bp, flags); 595 if (unlikely(error)) { 596 printk(KERN_WARNING "%s: failed to map pages\n", 597 __FUNCTION__); 598 goto no_buffer; 599 } 600 } 601 602 XFS_STATS_INC(xb_get); 603 604 /* 605 * Always fill in the block number now, the mapped cases can do 606 * their own overlay of this later. 
607 */ 608 bp->b_bn = ioff; 609 bp->b_count_desired = bp->b_buffer_length; 610 611 XB_TRACE(bp, "get", (unsigned long)flags); 612 return bp; 613 614 no_buffer: 615 if (flags & (XBF_LOCK | XBF_TRYLOCK)) 616 xfs_buf_unlock(bp); 617 xfs_buf_rele(bp); 618 return NULL; 619} 620 621xfs_buf_t * 622xfs_buf_read_flags( 623 xfs_buftarg_t *target, 624 xfs_off_t ioff, 625 size_t isize, 626 xfs_buf_flags_t flags) 627{ 628 xfs_buf_t *bp; 629 630 flags |= XBF_READ; 631 632 bp = xfs_buf_get_flags(target, ioff, isize, flags); 633 if (bp) { 634 if (!XFS_BUF_ISDONE(bp)) { 635 XB_TRACE(bp, "read", (unsigned long)flags); 636 XFS_STATS_INC(xb_get_read); 637 xfs_buf_iostart(bp, flags); 638 } else if (flags & XBF_ASYNC) { 639 XB_TRACE(bp, "read_async", (unsigned long)flags); 640 /* 641 * Read ahead call which is already satisfied, 642 * drop the buffer 643 */ 644 goto no_buffer; 645 } else { 646 XB_TRACE(bp, "read_done", (unsigned long)flags); 647 /* We do not want read in the flags */ 648 bp->b_flags &= ~XBF_READ; 649 } 650 } 651 652 return bp; 653 654 no_buffer: 655 if (flags & (XBF_LOCK | XBF_TRYLOCK)) 656 xfs_buf_unlock(bp); 657 xfs_buf_rele(bp); 658 return NULL; 659} 660 661/* 662 * If we are not low on memory then do the readahead in a deadlock 663 * safe manner. 664 */ 665void 666xfs_buf_readahead( 667 xfs_buftarg_t *target, 668 xfs_off_t ioff, 669 size_t isize, 670 xfs_buf_flags_t flags) 671{ 672 struct backing_dev_info *bdi; 673 674 bdi = target->bt_mapping->backing_dev_info; 675 if (bdi_read_congested(bdi)) 676 return; 677 678 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 679 xfs_buf_read_flags(target, ioff, isize, flags); 680} 681 682xfs_buf_t * 683xfs_buf_get_empty( 684 size_t len, 685 xfs_buftarg_t *target) 686{ 687 xfs_buf_t *bp; 688 689 bp = xfs_buf_allocate(0); 690 if (bp) 691 _xfs_buf_initialize(bp, target, 0, len, 0); 692 return bp; 693} 694 695static inline struct page * 696mem_to_page( 697 void *addr) 698{ 699 if (((unsigned long)addr < VMALLOC_START) || 700 ((unsigned long)addr >= VMALLOC_END)) { 701 return virt_to_page(addr); 702 } else { 703 return vmalloc_to_page(addr); 704 } 705} 706 707int 708xfs_buf_associate_memory( 709 xfs_buf_t *bp, 710 void *mem, 711 size_t len) 712{ 713 int rval; 714 int i = 0; 715 size_t ptr; 716 size_t end, end_cur; 717 off_t offset; 718 int page_count; 719 720 page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; 721 offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK); 722 if (offset && (len > PAGE_CACHE_SIZE)) 723 page_count++; 724 725 /* Free any previous set of page pointers */ 726 if (bp->b_pages) 727 _xfs_buf_free_pages(bp); 728 729 bp->b_pages = NULL; 730 bp->b_addr = mem; 731 732 rval = _xfs_buf_get_pages(bp, page_count, 0); 733 if (rval) 734 return rval; 735 736 bp->b_offset = offset; 737 ptr = (size_t) mem & PAGE_CACHE_MASK; 738 end = PAGE_CACHE_ALIGN((size_t) mem + len); 739 end_cur = end; 740 /* set up first page */ 741 bp->b_pages[0] = mem_to_page(mem); 742 743 ptr += PAGE_CACHE_SIZE; 744 bp->b_page_count = ++i; 745 while (ptr < end) { 746 bp->b_pages[i] = mem_to_page((void *)ptr); 747 bp->b_page_count = ++i; 748 ptr += PAGE_CACHE_SIZE; 749 } 750 bp->b_locked = 0; 751 752 bp->b_count_desired = bp->b_buffer_length = len; 753 bp->b_flags |= XBF_MAPPED; 754 755 return 0; 756} 757 758xfs_buf_t * 759xfs_buf_get_noaddr( 760 size_t len, 761 xfs_buftarg_t *target) 762{ 763 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; 764 int error, i; 765 xfs_buf_t *bp; 766 767 bp = xfs_buf_allocate(0); 768 if (unlikely(bp == NULL)) 769 goto fail; 770 
_xfs_buf_initialize(bp, target, 0, len, 0); 771 772 error = _xfs_buf_get_pages(bp, page_count, 0); 773 if (error) 774 goto fail_free_buf; 775 776 for (i = 0; i < page_count; i++) { 777 bp->b_pages[i] = alloc_page(GFP_KERNEL); 778 if (!bp->b_pages[i]) 779 goto fail_free_mem; 780 } 781 bp->b_flags |= _XBF_PAGES; 782 783 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 784 if (unlikely(error)) { 785 printk(KERN_WARNING "%s: failed to map pages\n", 786 __FUNCTION__); 787 goto fail_free_mem; 788 } 789 790 xfs_buf_unlock(bp); 791 792 XB_TRACE(bp, "no_daddr", len); 793 return bp; 794 795 fail_free_mem: 796 while (--i >= 0) 797 __free_page(bp->b_pages[i]); 798 _xfs_buf_free_pages(bp); 799 fail_free_buf: 800 xfs_buf_deallocate(bp); 801 fail: 802 return NULL; 803} 804 805/* 806 * Increment reference count on buffer, to hold the buffer concurrently 807 * with another thread which may release (free) the buffer asynchronously. 808 * Must hold the buffer already to call this function. 809 */ 810void 811xfs_buf_hold( 812 xfs_buf_t *bp) 813{ 814 atomic_inc(&bp->b_hold); 815 XB_TRACE(bp, "hold", 0); 816} 817 818/* 819 * Releases a hold on the specified buffer. If the 820 * the hold count is 1, calls xfs_buf_free. 821 */ 822void 823xfs_buf_rele( 824 xfs_buf_t *bp) 825{ 826 xfs_bufhash_t *hash = bp->b_hash; 827 828 XB_TRACE(bp, "rele", bp->b_relse); 829 830 if (unlikely(!hash)) { 831 ASSERT(!bp->b_relse); 832 if (atomic_dec_and_test(&bp->b_hold)) 833 xfs_buf_free(bp); 834 return; 835 } 836 837 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { 838 if (bp->b_relse) { 839 atomic_inc(&bp->b_hold); 840 spin_unlock(&hash->bh_lock); 841 (*(bp->b_relse)) (bp); 842 } else if (bp->b_flags & XBF_FS_MANAGED) { 843 spin_unlock(&hash->bh_lock); 844 } else { 845 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 846 list_del_init(&bp->b_hash_list); 847 spin_unlock(&hash->bh_lock); 848 xfs_buf_free(bp); 849 } 850 } else { 851 /* 852 * Catch reference count leaks 853 */ 854 ASSERT(atomic_read(&bp->b_hold) >= 0); 855 } 856} 857 858 859/* 860 * Mutual exclusion on buffers. Locking model: 861 * 862 * Buffers associated with inodes for which buffer locking 863 * is not enabled are not protected by semaphores, and are 864 * assumed to be exclusively owned by the caller. There is a 865 * spinlock in the buffer, used by the caller when concurrent 866 * access is possible. 867 */ 868 869/* 870 * Locks a buffer object, if it is not already locked. 871 * Note that this in no way locks the underlying pages, so it is only 872 * useful for synchronizing concurrent use of buffer objects, not for 873 * synchronizing independent access to the underlying pages. 874 */ 875int 876xfs_buf_cond_lock( 877 xfs_buf_t *bp) 878{ 879 int locked; 880 881 locked = down_trylock(&bp->b_sema) == 0; 882 if (locked) { 883 XB_SET_OWNER(bp); 884 } 885 XB_TRACE(bp, "cond_lock", (long)locked); 886 return locked ? 0 : -EBUSY; 887} 888 889#if defined(DEBUG) || defined(XFS_BLI_TRACE) 890int 891xfs_buf_lock_value( 892 xfs_buf_t *bp) 893{ 894 return atomic_read(&bp->b_sema.count); 895} 896#endif 897 898/* 899 * Locks a buffer object. 900 * Note that this in no way locks the underlying pages, so it is only 901 * useful for synchronizing concurrent use of buffer objects, not for 902 * synchronizing independent access to the underlying pages. 
903 */ 904void 905xfs_buf_lock( 906 xfs_buf_t *bp) 907{ 908 XB_TRACE(bp, "lock", 0); 909 if (atomic_read(&bp->b_io_remaining)) 910 blk_run_address_space(bp->b_target->bt_mapping); 911 down(&bp->b_sema); 912 XB_SET_OWNER(bp); 913 XB_TRACE(bp, "locked", 0); 914} 915 916/* 917 * Releases the lock on the buffer object. 918 * If the buffer is marked delwri but is not queued, do so before we 919 * unlock the buffer as we need to set flags correctly. We also need to 920 * take a reference for the delwri queue because the unlocker is going to 921 * drop their's and they don't know we just queued it. 922 */ 923void 924xfs_buf_unlock( 925 xfs_buf_t *bp) 926{ 927 if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { 928 atomic_inc(&bp->b_hold); 929 bp->b_flags |= XBF_ASYNC; 930 xfs_buf_delwri_queue(bp, 0); 931 } 932 933 XB_CLEAR_OWNER(bp); 934 up(&bp->b_sema); 935 XB_TRACE(bp, "unlock", 0); 936} 937 938 939/* 940 * Pinning Buffer Storage in Memory 941 * Ensure that no attempt to force a buffer to disk will succeed. 942 */ 943void 944xfs_buf_pin( 945 xfs_buf_t *bp) 946{ 947 atomic_inc(&bp->b_pin_count); 948 XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter); 949} 950 951void 952xfs_buf_unpin( 953 xfs_buf_t *bp) 954{ 955 if (atomic_dec_and_test(&bp->b_pin_count)) 956 wake_up_all(&bp->b_waiters); 957 XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter); 958} 959 960int 961xfs_buf_ispin( 962 xfs_buf_t *bp) 963{ 964 return atomic_read(&bp->b_pin_count); 965} 966 967STATIC void 968xfs_buf_wait_unpin( 969 xfs_buf_t *bp) 970{ 971 DECLARE_WAITQUEUE (wait, current); 972 973 if (atomic_read(&bp->b_pin_count) == 0) 974 return; 975 976 add_wait_queue(&bp->b_waiters, &wait); 977 for (;;) { 978 set_current_state(TASK_UNINTERRUPTIBLE); 979 if (atomic_read(&bp->b_pin_count) == 0) 980 break; 981 if (atomic_read(&bp->b_io_remaining)) 982 blk_run_address_space(bp->b_target->bt_mapping); 983 schedule(); 984 } 985 remove_wait_queue(&bp->b_waiters, &wait); 986 set_current_state(TASK_RUNNING); 987} 988 989/* 990 * Buffer Utility Routines 991 */ 992 993STATIC void 994xfs_buf_iodone_work( 995 struct work_struct *work) 996{ 997 xfs_buf_t *bp = 998 container_of(work, xfs_buf_t, b_iodone_work); 999 1000 if (bp->b_iodone) 1001 (*(bp->b_iodone))(bp); 1002 else if (bp->b_flags & XBF_ASYNC) 1003 xfs_buf_relse(bp); 1004} 1005 1006void 1007xfs_buf_ioend( 1008 xfs_buf_t *bp, 1009 int schedule) 1010{ 1011 bp->b_flags &= ~(XBF_READ | XBF_WRITE); 1012 if (bp->b_error == 0) 1013 bp->b_flags |= XBF_DONE; 1014 1015 XB_TRACE(bp, "iodone", bp->b_iodone); 1016 1017 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { 1018 if (schedule) { 1019 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); 1020 queue_work(xfslogd_workqueue, &bp->b_iodone_work); 1021 } else { 1022 xfs_buf_iodone_work(&bp->b_iodone_work); 1023 } 1024 } else { 1025 up(&bp->b_iodonesema); 1026 } 1027} 1028 1029void 1030xfs_buf_ioerror( 1031 xfs_buf_t *bp, 1032 int error) 1033{ 1034 ASSERT(error >= 0 && error <= 0xffff); 1035 bp->b_error = (unsigned short)error; 1036 XB_TRACE(bp, "ioerror", (unsigned long)error); 1037} 1038 1039/* 1040 * Initiate I/O on a buffer, based on the flags supplied. 1041 * The b_iodone routine in the buffer supplied will only be called 1042 * when all of the subsidiary I/O requests, if any, have been completed. 
1043 */ 1044int 1045xfs_buf_iostart( 1046 xfs_buf_t *bp, 1047 xfs_buf_flags_t flags) 1048{ 1049 int status = 0; 1050 1051 XB_TRACE(bp, "iostart", (unsigned long)flags); 1052 1053 if (flags & XBF_DELWRI) { 1054 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC); 1055 bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC); 1056 xfs_buf_delwri_queue(bp, 1); 1057 return status; 1058 } 1059 1060 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ 1061 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 1062 bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \ 1063 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 1064 1065 BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL); 1066 1067 /* For writes allow an alternate strategy routine to precede 1068 * the actual I/O request (which may not be issued at all in 1069 * a shutdown situation, for example). 1070 */ 1071 status = (flags & XBF_WRITE) ? 1072 xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp); 1073 1074 /* Wait for I/O if we are not an async request. 1075 * Note: async I/O request completion will release the buffer, 1076 * and that can already be done by this point. So using the 1077 * buffer pointer from here on, after async I/O, is invalid. 1078 */ 1079 if (!status && !(flags & XBF_ASYNC)) 1080 status = xfs_buf_iowait(bp); 1081 1082 return status; 1083} 1084 1085STATIC_INLINE int 1086_xfs_buf_iolocked( 1087 xfs_buf_t *bp) 1088{ 1089 ASSERT(bp->b_flags & (XBF_READ | XBF_WRITE)); 1090 if (bp->b_flags & XBF_READ) 1091 return bp->b_locked; 1092 return 0; 1093} 1094 1095STATIC_INLINE void 1096_xfs_buf_ioend( 1097 xfs_buf_t *bp, 1098 int schedule) 1099{ 1100 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1101 bp->b_locked = 0; 1102 xfs_buf_ioend(bp, schedule); 1103 } 1104} 1105 1106STATIC int 1107xfs_buf_bio_end_io( 1108 struct bio *bio, 1109 unsigned int bytes_done, 1110 int error) 1111{ 1112 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1113 unsigned int blocksize = bp->b_target->bt_bsize; 1114 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1115 1116 if (bio->bi_size) 1117 return 1; 1118 1119 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1120 bp->b_error = EIO; 1121 1122 do { 1123 struct page *page = bvec->bv_page; 1124 1125 ASSERT(!PagePrivate(page)); 1126 if (unlikely(bp->b_error)) { 1127 if (bp->b_flags & XBF_READ) 1128 ClearPageUptodate(page); 1129 } else if (blocksize >= PAGE_CACHE_SIZE) { 1130 SetPageUptodate(page); 1131 } else if (!PagePrivate(page) && 1132 (bp->b_flags & _XBF_PAGE_CACHE)) { 1133 set_page_region(page, bvec->bv_offset, bvec->bv_len); 1134 } 1135 1136 if (--bvec >= bio->bi_io_vec) 1137 prefetchw(&bvec->bv_page->flags); 1138 1139 if (_xfs_buf_iolocked(bp)) { 1140 unlock_page(page); 1141 } 1142 } while (bvec >= bio->bi_io_vec); 1143 1144 _xfs_buf_ioend(bp, 1); 1145 bio_put(bio); 1146 return 0; 1147} 1148 1149STATIC void 1150_xfs_buf_ioapply( 1151 xfs_buf_t *bp) 1152{ 1153 int i, rw, map_i, total_nr_pages, nr_pages; 1154 struct bio *bio; 1155 int offset = bp->b_offset; 1156 int size = bp->b_count_desired; 1157 sector_t sector = bp->b_bn; 1158 unsigned int blocksize = bp->b_target->bt_bsize; 1159 int locking = _xfs_buf_iolocked(bp); 1160 1161 total_nr_pages = bp->b_page_count; 1162 map_i = 0; 1163 1164 if (bp->b_flags & XBF_ORDERED) { 1165 ASSERT(!(bp->b_flags & XBF_READ)); 1166 rw = WRITE_BARRIER; 1167 } else if (bp->b_flags & _XBF_RUN_QUEUES) { 1168 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1169 bp->b_flags &= ~_XBF_RUN_QUEUES; 1170 rw = (bp->b_flags & XBF_WRITE) ? 
WRITE_SYNC : READ_SYNC; 1171 } else { 1172 rw = (bp->b_flags & XBF_WRITE) ? WRITE : 1173 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1174 } 1175 1176 /* Special code path for reading a sub page size buffer in -- 1177 * we populate up the whole page, and hence the other metadata 1178 * in the same page. This optimization is only valid when the 1179 * filesystem block size is not smaller than the page size. 1180 */ 1181 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && 1182 (bp->b_flags & XBF_READ) && locking && 1183 (blocksize >= PAGE_CACHE_SIZE)) { 1184 bio = bio_alloc(GFP_NOIO, 1); 1185 1186 bio->bi_bdev = bp->b_target->bt_bdev; 1187 bio->bi_sector = sector - (offset >> BBSHIFT); 1188 bio->bi_end_io = xfs_buf_bio_end_io; 1189 bio->bi_private = bp; 1190 1191 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0); 1192 size = 0; 1193 1194 atomic_inc(&bp->b_io_remaining); 1195 1196 goto submit_io; 1197 } 1198 1199 /* Lock down the pages which we need to for the request */ 1200 if (locking && (bp->b_flags & XBF_WRITE) && (bp->b_locked == 0)) { 1201 for (i = 0; size; i++) { 1202 int nbytes = PAGE_CACHE_SIZE - offset; 1203 struct page *page = bp->b_pages[i]; 1204 1205 if (nbytes > size) 1206 nbytes = size; 1207 1208 lock_page(page); 1209 1210 size -= nbytes; 1211 offset = 0; 1212 } 1213 offset = bp->b_offset; 1214 size = bp->b_count_desired; 1215 } 1216 1217next_chunk: 1218 atomic_inc(&bp->b_io_remaining); 1219 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); 1220 if (nr_pages > total_nr_pages) 1221 nr_pages = total_nr_pages; 1222 1223 bio = bio_alloc(GFP_NOIO, nr_pages); 1224 bio->bi_bdev = bp->b_target->bt_bdev; 1225 bio->bi_sector = sector; 1226 bio->bi_end_io = xfs_buf_bio_end_io; 1227 bio->bi_private = bp; 1228 1229 for (; size && nr_pages; nr_pages--, map_i++) { 1230 int rbytes, nbytes = PAGE_CACHE_SIZE - offset; 1231 1232 if (nbytes > size) 1233 nbytes = size; 1234 1235 rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset); 1236 if (rbytes < nbytes) 1237 break; 1238 1239 offset = 0; 1240 sector += nbytes >> BBSHIFT; 1241 size -= nbytes; 1242 total_nr_pages--; 1243 } 1244 1245submit_io: 1246 if (likely(bio->bi_size)) { 1247 submit_bio(rw, bio); 1248 if (size) 1249 goto next_chunk; 1250 } else { 1251 bio_put(bio); 1252 xfs_buf_ioerror(bp, EIO); 1253 } 1254} 1255 1256int 1257xfs_buf_iorequest( 1258 xfs_buf_t *bp) 1259{ 1260 XB_TRACE(bp, "iorequest", 0); 1261 1262 if (bp->b_flags & XBF_DELWRI) { 1263 xfs_buf_delwri_queue(bp, 1); 1264 return 0; 1265 } 1266 1267 if (bp->b_flags & XBF_WRITE) { 1268 xfs_buf_wait_unpin(bp); 1269 } 1270 1271 xfs_buf_hold(bp); 1272 1273 /* Set the count to 1 initially, this will stop an I/O 1274 * completion callout which happens before we have started 1275 * all the I/O from calling xfs_buf_ioend too early. 1276 */ 1277 atomic_set(&bp->b_io_remaining, 1); 1278 _xfs_buf_ioapply(bp); 1279 _xfs_buf_ioend(bp, 0); 1280 1281 xfs_buf_rele(bp); 1282 return 0; 1283} 1284 1285/* 1286 * Waits for I/O to complete on the buffer supplied. 1287 * It returns immediately if no I/O is pending. 1288 * It returns the I/O error code, if any, or 0 if there was no error. 
1289 */ 1290int 1291xfs_buf_iowait( 1292 xfs_buf_t *bp) 1293{ 1294 XB_TRACE(bp, "iowait", 0); 1295 if (atomic_read(&bp->b_io_remaining)) 1296 blk_run_address_space(bp->b_target->bt_mapping); 1297 down(&bp->b_iodonesema); 1298 XB_TRACE(bp, "iowaited", (long)bp->b_error); 1299 return bp->b_error; 1300} 1301 1302xfs_caddr_t 1303xfs_buf_offset( 1304 xfs_buf_t *bp, 1305 size_t offset) 1306{ 1307 struct page *page; 1308 1309 if (bp->b_flags & XBF_MAPPED) 1310 return XFS_BUF_PTR(bp) + offset; 1311 1312 offset += bp->b_offset; 1313 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; 1314 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); 1315} 1316 1317/* 1318 * Move data into or out of a buffer. 1319 */ 1320void 1321xfs_buf_iomove( 1322 xfs_buf_t *bp, /* buffer to process */ 1323 size_t boff, /* starting buffer offset */ 1324 size_t bsize, /* length to copy */ 1325 caddr_t data, /* data address */ 1326 xfs_buf_rw_t mode) /* read/write/zero flag */ 1327{ 1328 size_t bend, cpoff, csize; 1329 struct page *page; 1330 1331 bend = boff + bsize; 1332 while (boff < bend) { 1333 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1334 cpoff = xfs_buf_poff(boff + bp->b_offset); 1335 csize = min_t(size_t, 1336 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); 1337 1338 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); 1339 1340 switch (mode) { 1341 case XBRW_ZERO: 1342 memset(page_address(page) + cpoff, 0, csize); 1343 break; 1344 case XBRW_READ: 1345 memcpy(data, page_address(page) + cpoff, csize); 1346 break; 1347 case XBRW_WRITE: 1348 memcpy(page_address(page) + cpoff, data, csize); 1349 } 1350 1351 boff += csize; 1352 data += csize; 1353 } 1354} 1355 1356/* 1357 * Handling of buffer targets (buftargs). 1358 */ 1359 1360/* 1361 * Wait for any bufs with callbacks that have been submitted but 1362 * have not yet returned... walk the hash list for the target. 1363 */ 1364void 1365xfs_wait_buftarg( 1366 xfs_buftarg_t *btp) 1367{ 1368 xfs_buf_t *bp, *n; 1369 xfs_bufhash_t *hash; 1370 uint i; 1371 1372 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1373 hash = &btp->bt_hash[i]; 1374again: 1375 spin_lock(&hash->bh_lock); 1376 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 1377 ASSERT(btp == bp->b_target); 1378 if (!(bp->b_flags & XBF_FS_MANAGED)) { 1379 spin_unlock(&hash->bh_lock); 1380 /* 1381 * Catch superblock reference count leaks 1382 * immediately 1383 */ 1384 BUG_ON(bp->b_bn == 0); 1385 delay(100); 1386 goto again; 1387 } 1388 } 1389 spin_unlock(&hash->bh_lock); 1390 } 1391} 1392 1393/* 1394 * Allocate buffer hash table for a given target. 1395 * For devices containing metadata (i.e. not the log/realtime devices) 1396 * we need to allocate a much larger hash table. 1397 */ 1398STATIC void 1399xfs_alloc_bufhash( 1400 xfs_buftarg_t *btp, 1401 int external) 1402{ 1403 unsigned int i; 1404 1405 btp->bt_hashshift = external ? 
3 : 8; /* 8 or 256 buckets */ 1406 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; 1407 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * 1408 sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE); 1409 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1410 spin_lock_init(&btp->bt_hash[i].bh_lock); 1411 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); 1412 } 1413} 1414 1415STATIC void 1416xfs_free_bufhash( 1417 xfs_buftarg_t *btp) 1418{ 1419 kmem_free(btp->bt_hash, (1<<btp->bt_hashshift) * sizeof(xfs_bufhash_t)); 1420 btp->bt_hash = NULL; 1421} 1422 1423/* 1424 * buftarg list for delwrite queue processing 1425 */ 1426static LIST_HEAD(xfs_buftarg_list); 1427static DEFINE_SPINLOCK(xfs_buftarg_lock); 1428 1429STATIC void 1430xfs_register_buftarg( 1431 xfs_buftarg_t *btp) 1432{ 1433 spin_lock(&xfs_buftarg_lock); 1434 list_add(&btp->bt_list, &xfs_buftarg_list); 1435 spin_unlock(&xfs_buftarg_lock); 1436} 1437 1438STATIC void 1439xfs_unregister_buftarg( 1440 xfs_buftarg_t *btp) 1441{ 1442 spin_lock(&xfs_buftarg_lock); 1443 list_del(&btp->bt_list); 1444 spin_unlock(&xfs_buftarg_lock); 1445} 1446 1447void 1448xfs_free_buftarg( 1449 xfs_buftarg_t *btp, 1450 int external) 1451{ 1452 xfs_flush_buftarg(btp, 1); 1453 xfs_blkdev_issue_flush(btp); 1454 if (external) 1455 xfs_blkdev_put(btp->bt_bdev); 1456 xfs_free_bufhash(btp); 1457 iput(btp->bt_mapping->host); 1458 1459 /* Unregister the buftarg first so that we don't get a 1460 * wakeup finding a non-existent task 1461 */ 1462 xfs_unregister_buftarg(btp); 1463 kthread_stop(btp->bt_task); 1464 1465 kmem_free(btp, sizeof(*btp)); 1466} 1467 1468STATIC int 1469xfs_setsize_buftarg_flags( 1470 xfs_buftarg_t *btp, 1471 unsigned int blocksize, 1472 unsigned int sectorsize, 1473 int verbose) 1474{ 1475 btp->bt_bsize = blocksize; 1476 btp->bt_sshift = ffs(sectorsize) - 1; 1477 btp->bt_smask = sectorsize - 1; 1478 1479 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1480 printk(KERN_WARNING 1481 "XFS: Cannot set_blocksize to %u on device %s\n", 1482 sectorsize, XFS_BUFTARG_NAME(btp)); 1483 return EINVAL; 1484 } 1485 1486 if (verbose && 1487 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) { 1488 printk(KERN_WARNING 1489 "XFS: %u byte sectors in use on device %s. " 1490 "This is suboptimal; %u or greater is ideal.\n", 1491 sectorsize, XFS_BUFTARG_NAME(btp), 1492 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG); 1493 } 1494 1495 return 0; 1496} 1497 1498/* 1499 * When allocating the initial buffer target we have not yet 1500 * read in the superblock, so don't know what sized sectors 1501 * are being used is at this early stage. Play safe. 
1502 */ 1503STATIC int 1504xfs_setsize_buftarg_early( 1505 xfs_buftarg_t *btp, 1506 struct block_device *bdev) 1507{ 1508 return xfs_setsize_buftarg_flags(btp, 1509 PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); 1510} 1511 1512int 1513xfs_setsize_buftarg( 1514 xfs_buftarg_t *btp, 1515 unsigned int blocksize, 1516 unsigned int sectorsize) 1517{ 1518 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); 1519} 1520 1521STATIC int 1522xfs_mapping_buftarg( 1523 xfs_buftarg_t *btp, 1524 struct block_device *bdev) 1525{ 1526 struct backing_dev_info *bdi; 1527 struct inode *inode; 1528 struct address_space *mapping; 1529 static const struct address_space_operations mapping_aops = { 1530 .sync_page = block_sync_page, 1531 .migratepage = fail_migrate_page, 1532 }; 1533 1534 inode = new_inode(bdev->bd_inode->i_sb); 1535 if (!inode) { 1536 printk(KERN_WARNING 1537 "XFS: Cannot allocate mapping inode for device %s\n", 1538 XFS_BUFTARG_NAME(btp)); 1539 return ENOMEM; 1540 } 1541 inode->i_mode = S_IFBLK; 1542 inode->i_bdev = bdev; 1543 inode->i_rdev = bdev->bd_dev; 1544 bdi = blk_get_backing_dev_info(bdev); 1545 if (!bdi) 1546 bdi = &default_backing_dev_info; 1547 mapping = &inode->i_data; 1548 mapping->a_ops = &mapping_aops; 1549 mapping->backing_dev_info = bdi; 1550 mapping_set_gfp_mask(mapping, GFP_NOFS); 1551 btp->bt_mapping = mapping; 1552 return 0; 1553} 1554 1555STATIC int 1556xfs_alloc_delwrite_queue( 1557 xfs_buftarg_t *btp) 1558{ 1559 int error = 0; 1560 1561 INIT_LIST_HEAD(&btp->bt_list); 1562 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1563 spinlock_init(&btp->bt_delwrite_lock, "delwri_lock"); 1564 btp->bt_flags = 0; 1565 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); 1566 if (IS_ERR(btp->bt_task)) { 1567 error = PTR_ERR(btp->bt_task); 1568 goto out_error; 1569 } 1570 xfs_register_buftarg(btp); 1571out_error: 1572 return error; 1573} 1574 1575xfs_buftarg_t * 1576xfs_alloc_buftarg( 1577 struct block_device *bdev, 1578 int external) 1579{ 1580 xfs_buftarg_t *btp; 1581 1582 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); 1583 1584 btp->bt_dev = bdev->bd_dev; 1585 btp->bt_bdev = bdev; 1586 if (xfs_setsize_buftarg_early(btp, bdev)) 1587 goto error; 1588 if (xfs_mapping_buftarg(btp, bdev)) 1589 goto error; 1590 if (xfs_alloc_delwrite_queue(btp)) 1591 goto error; 1592 xfs_alloc_bufhash(btp, external); 1593 return btp; 1594 1595error: 1596 kmem_free(btp, sizeof(*btp)); 1597 return NULL; 1598} 1599 1600 1601/* 1602 * Delayed write buffer handling 1603 */ 1604STATIC void 1605xfs_buf_delwri_queue( 1606 xfs_buf_t *bp, 1607 int unlock) 1608{ 1609 struct list_head *dwq = &bp->b_target->bt_delwrite_queue; 1610 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; 1611 1612 XB_TRACE(bp, "delwri_q", (long)unlock); 1613 ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); 1614 1615 spin_lock(dwlk); 1616 /* If already in the queue, dequeue and place at tail */ 1617 if (!list_empty(&bp->b_list)) { 1618 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 1619 if (unlock) 1620 atomic_dec(&bp->b_hold); 1621 list_del(&bp->b_list); 1622 } 1623 1624 bp->b_flags |= _XBF_DELWRI_Q; 1625 list_add_tail(&bp->b_list, dwq); 1626 bp->b_queuetime = jiffies; 1627 spin_unlock(dwlk); 1628 1629 if (unlock) 1630 xfs_buf_unlock(bp); 1631} 1632 1633void 1634xfs_buf_delwri_dequeue( 1635 xfs_buf_t *bp) 1636{ 1637 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; 1638 int dequeued = 0; 1639 1640 spin_lock(dwlk); 1641 if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { 1642 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 1643 
list_del_init(&bp->b_list); 1644 dequeued = 1; 1645 } 1646 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); 1647 spin_unlock(dwlk); 1648 1649 if (dequeued) 1650 xfs_buf_rele(bp); 1651 1652 XB_TRACE(bp, "delwri_dq", (long)dequeued); 1653} 1654 1655STATIC void 1656xfs_buf_runall_queues( 1657 struct workqueue_struct *queue) 1658{ 1659 flush_workqueue(queue); 1660} 1661 1662STATIC int 1663xfsbufd_wakeup( 1664 int priority, 1665 gfp_t mask) 1666{ 1667 xfs_buftarg_t *btp; 1668 1669 spin_lock(&xfs_buftarg_lock); 1670 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { 1671 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) 1672 continue; 1673 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); 1674 wake_up_process(btp->bt_task); 1675 } 1676 spin_unlock(&xfs_buftarg_lock); 1677 return 0; 1678} 1679 1680/* 1681 * Move as many buffers as specified to the supplied list 1682 * idicating if we skipped any buffers to prevent deadlocks. 1683 */ 1684STATIC int 1685xfs_buf_delwri_split( 1686 xfs_buftarg_t *target, 1687 struct list_head *list, 1688 unsigned long age) 1689{ 1690 xfs_buf_t *bp, *n; 1691 struct list_head *dwq = &target->bt_delwrite_queue; 1692 spinlock_t *dwlk = &target->bt_delwrite_lock; 1693 int skipped = 0; 1694 int force; 1695 1696 force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1697 INIT_LIST_HEAD(list); 1698 spin_lock(dwlk); 1699 list_for_each_entry_safe(bp, n, dwq, b_list) { 1700 XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); 1701 ASSERT(bp->b_flags & XBF_DELWRI); 1702 1703 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { 1704 if (!force && 1705 time_before(jiffies, bp->b_queuetime + age)) { 1706 xfs_buf_unlock(bp); 1707 break; 1708 } 1709 1710 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q| 1711 _XBF_RUN_QUEUES); 1712 bp->b_flags |= XBF_WRITE; 1713 list_move_tail(&bp->b_list, list); 1714 } else 1715 skipped++; 1716 } 1717 spin_unlock(dwlk); 1718 1719 return skipped; 1720 1721} 1722 1723STATIC int 1724xfsbufd( 1725 void *data) 1726{ 1727 struct list_head tmp; 1728 xfs_buftarg_t *target = (xfs_buftarg_t *)data; 1729 int count; 1730 xfs_buf_t *bp; 1731 1732 current->flags |= PF_MEMALLOC; 1733 1734 do { 1735 if (unlikely(freezing(current))) { 1736 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1737 refrigerator(); 1738 } else { 1739 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1740 } 1741 1742 schedule_timeout_interruptible( 1743 xfs_buf_timer_centisecs * msecs_to_jiffies(10)); 1744 1745 xfs_buf_delwri_split(target, &tmp, 1746 xfs_buf_age_centisecs * msecs_to_jiffies(10)); 1747 1748 count = 0; 1749 while (!list_empty(&tmp)) { 1750 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1751 ASSERT(target == bp->b_target); 1752 1753 list_del_init(&bp->b_list); 1754 xfs_buf_iostrategy(bp); 1755 count++; 1756 } 1757 1758 if (as_list_len > 0) 1759 purge_addresses(); 1760 if (count) 1761 blk_run_address_space(target->bt_mapping); 1762 1763 } while (!kthread_should_stop()); 1764 1765 return 0; 1766} 1767 1768/* 1769 * Go through all incore buffers, and release buffers if they belong to 1770 * the given device. This is used in filesystem error handling to 1771 * preserve the consistency of its metadata. 
1772 */ 1773int 1774xfs_flush_buftarg( 1775 xfs_buftarg_t *target, 1776 int wait) 1777{ 1778 struct list_head tmp; 1779 xfs_buf_t *bp, *n; 1780 int pincount = 0; 1781 1782 xfs_buf_runall_queues(xfsdatad_workqueue); 1783 xfs_buf_runall_queues(xfslogd_workqueue); 1784 1785 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1786 pincount = xfs_buf_delwri_split(target, &tmp, 0); 1787 1788 /* 1789 * Dropped the delayed write list lock, now walk the temporary list 1790 */ 1791 list_for_each_entry_safe(bp, n, &tmp, b_list) { 1792 ASSERT(target == bp->b_target); 1793 if (wait) 1794 bp->b_flags &= ~XBF_ASYNC; 1795 else 1796 list_del_init(&bp->b_list); 1797 1798 xfs_buf_iostrategy(bp); 1799 } 1800 1801 if (wait) 1802 blk_run_address_space(target->bt_mapping); 1803 1804 /* 1805 * Remaining list items must be flushed before returning 1806 */ 1807 while (!list_empty(&tmp)) { 1808 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1809 1810 list_del_init(&bp->b_list); 1811 xfs_iowait(bp); 1812 xfs_buf_relse(bp); 1813 } 1814 1815 return pincount; 1816} 1817 1818int __init 1819xfs_buf_init(void) 1820{ 1821#ifdef XFS_BUF_TRACE 1822 xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP); 1823#endif 1824 1825 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", 1826 KM_ZONE_HWALIGN, NULL); 1827 if (!xfs_buf_zone) 1828 goto out_free_trace_buf; 1829 1830 xfslogd_workqueue = create_workqueue("xfslogd"); 1831 if (!xfslogd_workqueue) 1832 goto out_free_buf_zone; 1833 1834 xfsdatad_workqueue = create_workqueue("xfsdatad"); 1835 if (!xfsdatad_workqueue) 1836 goto out_destroy_xfslogd_workqueue; 1837 1838 register_shrinker(&xfs_buf_shake); 1839 return 0; 1840 1841 out_destroy_xfslogd_workqueue: 1842 destroy_workqueue(xfslogd_workqueue); 1843 out_free_buf_zone: 1844 kmem_zone_destroy(xfs_buf_zone); 1845 out_free_trace_buf: 1846#ifdef XFS_BUF_TRACE 1847 ktrace_free(xfs_buf_trace_buf); 1848#endif 1849 return -ENOMEM; 1850} 1851 1852void 1853xfs_buf_terminate(void) 1854{ 1855 unregister_shrinker(&xfs_buf_shake); 1856 destroy_workqueue(xfsdatad_workqueue); 1857 destroy_workqueue(xfslogd_workqueue); 1858 kmem_zone_destroy(xfs_buf_zone); 1859#ifdef XFS_BUF_TRACE 1860 ktrace_free(xfs_buf_trace_buf); 1861#endif 1862} 1863 1864#ifdef CONFIG_KDB_MODULES 1865struct list_head * 1866xfs_get_buftarg_list(void) 1867{ 1868 return &xfs_buftarg_list; 1869} 1870#endif
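
The file above is the complete buffer-cache layer; the rest of XFS drives it through xfs_buf_get_flags(), xfs_buf_read_flags(), xfs_buf_relse() and friends. What follows is a minimal usage sketch, not part of xfs_buf.c: it assumes a buffer target already set up elsewhere with xfs_alloc_buftarg(), and the function name and error handling are illustrative only.

/*
 * Illustrative sketch only -- not part of xfs_buf.c.  It uses just the
 * interfaces defined or referenced above; offsets and lengths are in
 * 512-byte basic blocks, as implied by the BBSHIFT conversion in
 * _xfs_buf_find().
 */
static int
example_read_and_release(
	xfs_buftarg_t	*target,
	xfs_off_t	ioff,	/* starting offset, in basic blocks */
	size_t		isize)	/* length, in basic blocks */
{
	xfs_buf_t	*bp;
	int		error;

	/* Hash lookup (or allocation), page setup, and a synchronous
	 * read, since XBF_ASYNC is not passed.  The buffer is returned
	 * locked and mapped. */
	bp = xfs_buf_read_flags(target, ioff, isize,
				XBF_LOCK | XBF_MAPPED);
	if (!bp)
		return ENOMEM;	/* positive errno, as this code uses */

	error = bp->b_error;
	if (!error) {
		/* ... inspect bp->b_addr, valid because XBF_MAPPED ... */
	}

	xfs_buf_relse(bp);	/* unlock and drop our hold */
	return error;
}

The same pattern with XBF_TRYLOCK, XBF_ASYNC and XBF_READ_AHEAD added is what xfs_buf_readahead() above uses, so that read-ahead neither blocks on a busy buffer nor waits for I/O completion.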