linux/fs/buffer.c at v2.6.19-rc2
/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */

#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>

static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void invalidate_bh_lrus(void);

#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

inline void
init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
	bh->b_end_io = handler;
	bh->b_private = private;
}

static int sync_buffer(void *word)
{
	struct block_device *bd;
	struct buffer_head *bh
		= container_of(word, struct buffer_head, b_state);

	smp_mb();
	bd = bh->b_bdev;
	if (bd)
		blk_run_address_space(bd->bd_inode->i_mapping);
	io_schedule();
	return 0;
}

void fastcall __lock_buffer(struct buffer_head *bh)
{
	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

void fastcall unlock_buffer(struct buffer_head *bh)
{
	clear_buffer_locked(bh);
	smp_mb__after_clear_bit();
	wake_up_bit(&bh->b_state, BH_Lock);
}

/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
}

static void
__clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page_cache_release(page);
}

static void buffer_io_error(struct buffer_head *bh)
{
	char b[BDEVNAME_SIZE];

	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
			bdevname(bh->b_bdev, b),
			(unsigned long long)bh->b_blocknr);
}

/*
 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 * unlock the buffer. This is what ll_rw_block uses too.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* This happens, due to failed READA attempts. */
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}

void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	char b[BDEVNAME_SIZE];

	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
			buffer_io_error(bh);
			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
				bdevname(bh->b_bdev, b));
		}
		set_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	int ret = 0;

	if (bdev)
		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
	return ret;
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon all dirty data associated with this
 * device.  Filesystem data as well as the underlying block
 * device.  Takes the superblock lock.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	if (sb) {
		int res = fsync_super(sb);
		drop_super(sb);
		return res;
	}
	return sync_blockdev(bdev);
}

/**
 * freeze_bdev  --  lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * This takes the block device bd_mount_mutex to make sure no new mounts
 * happen on bdev until thaw_bdev() is called.
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 */
struct super_block *freeze_bdev(struct block_device *bdev)
{
	struct super_block *sb;

	mutex_lock(&bdev->bd_mount_mutex);
	sb = get_super(bdev);
	if (sb && !(sb->s_flags & MS_RDONLY)) {
		sb->s_frozen = SB_FREEZE_WRITE;
		smp_wmb();

		__fsync_super(sb);

		sb->s_frozen = SB_FREEZE_TRANS;
		smp_wmb();

		sync_blockdev(sb->s_bdev);

		if (sb->s_op->write_super_lockfs)
			sb->s_op->write_super_lockfs(sb);
	}

	sync_blockdev(bdev);
	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
}
EXPORT_SYMBOL(freeze_bdev);

/**
 * thaw_bdev  --  unlock filesystem
 * @bdev:	blockdevice to unlock
 * @sb:		associated superblock
 *
 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 */
void thaw_bdev(struct block_device *bdev, struct super_block *sb)
{
	if (sb) {
		BUG_ON(sb->s_bdev != bdev);

		if (sb->s_op->unlockfs)
			sb->s_op->unlockfs(sb);
		sb->s_frozen = SB_UNFROZEN;
		smp_wmb();
		wake_up(&sb->s_wait_unfrozen);
		drop_super(sb);
	}

	mutex_unlock(&bdev->bd_mount_mutex);
}
EXPORT_SYMBOL(thaw_bdev);

/*
 * Various filesystems appear to want __find_get_block to be non-blocking.
 * But it's the page lock which protects the buffers.  To get around this,
 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 * private_lock.
 *
 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 * may be quite high.  This code could TryLock the page, and if that
 * succeeds, there is no need to take private_lock. (But if
 * private_lock is contended then so is mapping->tree_lock).
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
	struct inode *bd_inode = bdev->bd_inode;
	struct address_space *bd_mapping = bd_inode->i_mapping;
	struct buffer_head *ret = NULL;
	pgoff_t index;
	struct buffer_head *bh;
	struct buffer_head *head;
	struct page *page;
	int all_mapped = 1;

	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
	page = find_get_page(bd_mapping, index);
	if (!page)
		goto out;

	spin_lock(&bd_mapping->private_lock);
	if (!page_has_buffers(page))
		goto out_unlock;
	head = page_buffers(page);
	bh = head;
	do {
		if (bh->b_blocknr == block) {
			ret = bh;
			get_bh(bh);
			goto out_unlock;
		}
		if (!buffer_mapped(bh))
			all_mapped = 0;
		bh = bh->b_this_page;
	} while (bh != head);

	/* we might be here because some of the buffers on this page are
	 * not mapped.  This is due to various races between
	 * file io on the block device and getblk.  It gets dealt with
	 * elsewhere, don't buffer_error if we had some unmapped buffers
	 */
	if (all_mapped) {
		printk("__find_get_block_slow() failed. "
			"block=%llu, b_blocknr=%llu\n",
			(unsigned long long)block,
			(unsigned long long)bh->b_blocknr);
		printk("b_state=0x%08lx, b_size=%zu\n",
			bh->b_state, bh->b_size);
		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
	}
out_unlock:
	spin_unlock(&bd_mapping->private_lock);
	page_cache_release(page);
out:
	return ret;
}

/* If invalidate_buffers() will trash dirty buffers, it means some kind
   of fs corruption is going on. Trashing dirty data always implies losing
   information that was supposed to be just stored on the physical layer
   by the user.

   Thus invalidate_buffers in general usage is not allowed to trash
   dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
   be preserved.  These buffers are simply skipped.

   We also skip buffers which are still in use.  For example this can
   happen if a userspace program is reading the block device.

   NOTE: In the case where the user removed a removable-media-disk even if
   there's still dirty data not synced on disk (due to a bug in the device
   driver or due to an error of the user), by not destroying the dirty
   buffers we could generate corruption also on the next media inserted,
   thus a parameter is necessary to handle this case in the most safe way
   possible (trying to not corrupt also the new disk inserted with the data
   belonging to the old now corrupted disk). Also for the ramdisk the
   natural thing to do in order to release the ramdisk memory is to
   destroy dirty buffers.

   These are two special cases. Normal usage implies the device driver
   to issue a sync on the device (without waiting I/O completion) and
   then an invalidate_buffers call that doesn't trash dirty buffers.

   For handling cache coherency with the blkdev pagecache the 'update' case
   has been introduced. It is needed to re-read from disk any pinned
   buffer. NOTE: re-reading from disk is destructive so we can do it only
   when we assume nobody is changing the buffercache under our I/O and when
   we think the disk contains more recent information than the buffercache.
   The update == 1 pass marks the buffers we need to update, the update == 2
   pass does the actual I/O. */
void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages == 0)
		return;

	invalidate_bh_lrus();
	/*
	 * FIXME: what about destroy_dirty_buffers?
	 * We really want to use invalidate_inode_pages2() for
	 * that, but not until that's cleaned up.
	 */
	invalidate_inode_pages(mapping);
}

/*
 * Kick pdflush then try to free up some ZONE_NORMAL memory.
 */
static void free_more_memory(void)
{
	struct zone **zones;
	pg_data_t *pgdat;

	wakeup_pdflush(1024);
	yield();

	for_each_online_pgdat(pgdat) {
		zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
		if (*zones)
			try_to_free_pages(zones, GFP_NOFS);
	}
}

/*
 * I/O completion handler for block_read_full_page() - pages
 * which come unlocked at the end of I/O.
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct page *page;
	int page_uptodate = 1;

	BUG_ON(!buffer_async_read(bh));

	page = bh->b_page;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		clear_buffer_uptodate(bh);
		if (printk_ratelimit())
			buffer_io_error(bh);
		SetPageError(page);
	}

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 */
	first = page_buffers(page);
	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	tmp = bh;
	do {
		if (!buffer_uptodate(tmp))
			page_uptodate = 0;
		if (buffer_async_read(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);

	/*
	 * If none of the buffers had errors and they are all
	 * uptodate then we can set the page uptodate.
	 */
	if (page_uptodate && !PageError(page))
		SetPageUptodate(page);
	unlock_page(page);
	return;

still_busy:
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	return;
}

/*
 * Completion handler for block_write_full_page() - pages which are unlocked
 * during I/O, and which have PageWriteback cleared upon I/O completion.
 */
static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
	char b[BDEVNAME_SIZE];
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct page *page;

	BUG_ON(!buffer_async_write(bh));

	page = bh->b_page;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		if (printk_ratelimit()) {
			buffer_io_error(bh);
			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
				bdevname(bh->b_bdev, b));
		}
		set_bit(AS_EIO, &page->mapping->flags);
		clear_buffer_uptodate(bh);
		SetPageError(page);
	}

	first = page_buffers(page);
	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

	clear_buffer_async_write(bh);
	unlock_buffer(bh);
	tmp = bh->b_this_page;
	while (tmp != bh) {
		if (buffer_async_write(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	}
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	end_page_writeback(page);
	return;

still_busy:
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	return;
}

/*
 * If a page's buffers are under async readin (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async I/O reads against
 * any of the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_read;
	set_buffer_async_read(bh);
}

void mark_buffer_async_write(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_write;
	set_buffer_async_write(bh);
}
EXPORT_SYMBOL(mark_buffer_async_write);


/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed.  But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for private_list is via the private_lock in the address_space
 * which backs the buffers.  Which is different from the address_space
 * against which the buffers are listed.  So for a particular address_space,
 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 * mapping->private_list will always be protected by the backing blockdev's
 * ->private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->private_list via these
 * utility functions are free to use private_lock and private_list for
 * whatever they want.  The only requirement is that list_empty(private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 * filesystems should do that.  invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 * take an address_space, not an inode.  And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list.  Because if the buffer is on a list,
 * it *must* already be on the right one.  If not, the filesystem is being
 * silly.  This will save a ton of locking.  But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate).  That requires careful auditing of all
 * filesystems (do it inside bforget()).  It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's private_lock must be held
 */
static inline void __remove_assoc_queue(struct buffer_head *bh)
{
	list_del_init(&bh->b_assoc_buffers);
}

int inode_has_buffers(struct inode *inode)
{
	return !list_empty(&inode->i_data.private_list);
}

/*
 * osync is designed to support O_SYNC io.  It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 * you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion.  Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head *p;
	int err = 0;

	spin_lock(lock);
repeat:
	list_for_each_prev(p, list) {
		bh = BH_ENTRY(p);
		if (buffer_locked(bh)) {
			get_bh(bh);
			spin_unlock(lock);
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				err = -EIO;
			brelse(bh);
			spin_lock(lock);
			goto repeat;
		}
	}
	spin_unlock(lock);
	return err;
}

/**
 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
 *                        buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().
 * @mapping is a file or directory which needs those buffers to be written for
 * a successful fsync().
627 */ 628int sync_mapping_buffers(struct address_space *mapping) 629{ 630 struct address_space *buffer_mapping = mapping->assoc_mapping; 631 632 if (buffer_mapping == NULL || list_empty(&mapping->private_list)) 633 return 0; 634 635 return fsync_buffers_list(&buffer_mapping->private_lock, 636 &mapping->private_list); 637} 638EXPORT_SYMBOL(sync_mapping_buffers); 639 640/* 641 * Called when we've recently written block `bblock', and it is known that 642 * `bblock' was for a buffer_boundary() buffer. This means that the block at 643 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's 644 * dirty, schedule it for IO. So that indirects merge nicely with their data. 645 */ 646void write_boundary_block(struct block_device *bdev, 647 sector_t bblock, unsigned blocksize) 648{ 649 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); 650 if (bh) { 651 if (buffer_dirty(bh)) 652 ll_rw_block(WRITE, 1, &bh); 653 put_bh(bh); 654 } 655} 656 657void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) 658{ 659 struct address_space *mapping = inode->i_mapping; 660 struct address_space *buffer_mapping = bh->b_page->mapping; 661 662 mark_buffer_dirty(bh); 663 if (!mapping->assoc_mapping) { 664 mapping->assoc_mapping = buffer_mapping; 665 } else { 666 BUG_ON(mapping->assoc_mapping != buffer_mapping); 667 } 668 if (list_empty(&bh->b_assoc_buffers)) { 669 spin_lock(&buffer_mapping->private_lock); 670 list_move_tail(&bh->b_assoc_buffers, 671 &mapping->private_list); 672 spin_unlock(&buffer_mapping->private_lock); 673 } 674} 675EXPORT_SYMBOL(mark_buffer_dirty_inode); 676 677/* 678 * Add a page to the dirty page list. 679 * 680 * It is a sad fact of life that this function is called from several places 681 * deeply under spinlocking. It may not sleep. 682 * 683 * If the page has buffers, the uptodate buffers are set dirty, to preserve 684 * dirty-state coherency between the page and the buffers. It the page does 685 * not have buffers then when they are later attached they will all be set 686 * dirty. 687 * 688 * The buffers are dirtied before the page is dirtied. There's a small race 689 * window in which a writepage caller may see the page cleanness but not the 690 * buffer dirtiness. That's fine. If this code were to set the page dirty 691 * before the buffers, a concurrent writepage caller could clear the page dirty 692 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean 693 * page on the dirty page list. 694 * 695 * We use private_lock to lock against try_to_free_buffers while using the 696 * page's buffer list. Also use this to protect against clean buffers being 697 * added to the page after it was set dirty. 698 * 699 * FIXME: may need to call ->reservepage here as well. That's rather up to the 700 * address_space though. 701 */ 702int __set_page_dirty_buffers(struct page *page) 703{ 704 struct address_space * const mapping = page_mapping(page); 705 706 if (unlikely(!mapping)) 707 return !TestSetPageDirty(page); 708 709 spin_lock(&mapping->private_lock); 710 if (page_has_buffers(page)) { 711 struct buffer_head *head = page_buffers(page); 712 struct buffer_head *bh = head; 713 714 do { 715 set_buffer_dirty(bh); 716 bh = bh->b_this_page; 717 } while (bh != head); 718 } 719 spin_unlock(&mapping->private_lock); 720 721 if (!TestSetPageDirty(page)) { 722 write_lock_irq(&mapping->tree_lock); 723 if (page->mapping) { /* Race with truncate? 
*/ 724 if (mapping_cap_account_dirty(mapping)) 725 __inc_zone_page_state(page, NR_FILE_DIRTY); 726 radix_tree_tag_set(&mapping->page_tree, 727 page_index(page), 728 PAGECACHE_TAG_DIRTY); 729 } 730 write_unlock_irq(&mapping->tree_lock); 731 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 732 return 1; 733 } 734 return 0; 735} 736EXPORT_SYMBOL(__set_page_dirty_buffers); 737 738/* 739 * Write out and wait upon a list of buffers. 740 * 741 * We have conflicting pressures: we want to make sure that all 742 * initially dirty buffers get waited on, but that any subsequently 743 * dirtied buffers don't. After all, we don't want fsync to last 744 * forever if somebody is actively writing to the file. 745 * 746 * Do this in two main stages: first we copy dirty buffers to a 747 * temporary inode list, queueing the writes as we go. Then we clean 748 * up, waiting for those writes to complete. 749 * 750 * During this second stage, any subsequent updates to the file may end 751 * up refiling the buffer on the original inode's dirty list again, so 752 * there is a chance we will end up with a buffer queued for write but 753 * not yet completed on that list. So, as a final cleanup we go through 754 * the osync code to catch these locked, dirty buffers without requeuing 755 * any newly dirty buffers for write. 756 */ 757static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) 758{ 759 struct buffer_head *bh; 760 struct list_head tmp; 761 int err = 0, err2; 762 763 INIT_LIST_HEAD(&tmp); 764 765 spin_lock(lock); 766 while (!list_empty(list)) { 767 bh = BH_ENTRY(list->next); 768 list_del_init(&bh->b_assoc_buffers); 769 if (buffer_dirty(bh) || buffer_locked(bh)) { 770 list_add(&bh->b_assoc_buffers, &tmp); 771 if (buffer_dirty(bh)) { 772 get_bh(bh); 773 spin_unlock(lock); 774 /* 775 * Ensure any pending I/O completes so that 776 * ll_rw_block() actually writes the current 777 * contents - it is a noop if I/O is still in 778 * flight on potentially older contents. 779 */ 780 ll_rw_block(SWRITE, 1, &bh); 781 brelse(bh); 782 spin_lock(lock); 783 } 784 } 785 } 786 787 while (!list_empty(&tmp)) { 788 bh = BH_ENTRY(tmp.prev); 789 __remove_assoc_queue(bh); 790 get_bh(bh); 791 spin_unlock(lock); 792 wait_on_buffer(bh); 793 if (!buffer_uptodate(bh)) 794 err = -EIO; 795 brelse(bh); 796 spin_lock(lock); 797 } 798 799 spin_unlock(lock); 800 err2 = osync_buffers_list(lock, list); 801 if (err) 802 return err; 803 else 804 return err2; 805} 806 807/* 808 * Invalidate any and all dirty buffers on a given inode. We are 809 * probably unmounting the fs, but that doesn't mean we have already 810 * done a sync(). Just drop the buffers from the inode list. 811 * 812 * NOTE: we take the inode's blockdev's mapping's private_lock. Which 813 * assumes that all the buffers are against the blockdev. Not true 814 * for reiserfs. 815 */ 816void invalidate_inode_buffers(struct inode *inode) 817{ 818 if (inode_has_buffers(inode)) { 819 struct address_space *mapping = &inode->i_data; 820 struct list_head *list = &mapping->private_list; 821 struct address_space *buffer_mapping = mapping->assoc_mapping; 822 823 spin_lock(&buffer_mapping->private_lock); 824 while (!list_empty(list)) 825 __remove_assoc_queue(BH_ENTRY(list->next)); 826 spin_unlock(&buffer_mapping->private_lock); 827 } 828} 829 830/* 831 * Remove any clean buffers from the inode's buffer list. This is called 832 * when we're trying to free the inode itself. Those buffers can pin it. 833 * 834 * Returns true if all buffers were removed. 
835 */ 836int remove_inode_buffers(struct inode *inode) 837{ 838 int ret = 1; 839 840 if (inode_has_buffers(inode)) { 841 struct address_space *mapping = &inode->i_data; 842 struct list_head *list = &mapping->private_list; 843 struct address_space *buffer_mapping = mapping->assoc_mapping; 844 845 spin_lock(&buffer_mapping->private_lock); 846 while (!list_empty(list)) { 847 struct buffer_head *bh = BH_ENTRY(list->next); 848 if (buffer_dirty(bh)) { 849 ret = 0; 850 break; 851 } 852 __remove_assoc_queue(bh); 853 } 854 spin_unlock(&buffer_mapping->private_lock); 855 } 856 return ret; 857} 858 859/* 860 * Create the appropriate buffers when given a page for data area and 861 * the size of each buffer.. Use the bh->b_this_page linked list to 862 * follow the buffers created. Return NULL if unable to create more 863 * buffers. 864 * 865 * The retry flag is used to differentiate async IO (paging, swapping) 866 * which may not fail from ordinary buffer allocations. 867 */ 868struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, 869 int retry) 870{ 871 struct buffer_head *bh, *head; 872 long offset; 873 874try_again: 875 head = NULL; 876 offset = PAGE_SIZE; 877 while ((offset -= size) >= 0) { 878 bh = alloc_buffer_head(GFP_NOFS); 879 if (!bh) 880 goto no_grow; 881 882 bh->b_bdev = NULL; 883 bh->b_this_page = head; 884 bh->b_blocknr = -1; 885 head = bh; 886 887 bh->b_state = 0; 888 atomic_set(&bh->b_count, 0); 889 bh->b_private = NULL; 890 bh->b_size = size; 891 892 /* Link the buffer to its page */ 893 set_bh_page(bh, page, offset); 894 895 init_buffer(bh, NULL, NULL); 896 } 897 return head; 898/* 899 * In case anything failed, we just free everything we got. 900 */ 901no_grow: 902 if (head) { 903 do { 904 bh = head; 905 head = head->b_this_page; 906 free_buffer_head(bh); 907 } while (head); 908 } 909 910 /* 911 * Return failure for non-async IO requests. Async IO requests 912 * are not allowed to fail, so we have to wait until buffer heads 913 * become available. But we don't want tasks sleeping with 914 * partially complete buffers, so all were released above. 915 */ 916 if (!retry) 917 return NULL; 918 919 /* We're _really_ low on memory. Now we just 920 * wait for old buffer heads to become free due to 921 * finishing IO. Since this is an async request and 922 * the reserve list is empty, we're sure there are 923 * async buffer heads in use. 924 */ 925 free_more_memory(); 926 goto try_again; 927} 928EXPORT_SYMBOL_GPL(alloc_page_buffers); 929 930static inline void 931link_dev_buffers(struct page *page, struct buffer_head *head) 932{ 933 struct buffer_head *bh, *tail; 934 935 bh = head; 936 do { 937 tail = bh; 938 bh = bh->b_this_page; 939 } while (bh); 940 tail->b_this_page = head; 941 attach_page_buffers(page, head); 942} 943 944/* 945 * Initialise the state of a blockdev page's buffers. 946 */ 947static void 948init_page_buffers(struct page *page, struct block_device *bdev, 949 sector_t block, int size) 950{ 951 struct buffer_head *head = page_buffers(page); 952 struct buffer_head *bh = head; 953 int uptodate = PageUptodate(page); 954 955 do { 956 if (!buffer_mapped(bh)) { 957 init_buffer(bh, NULL, NULL); 958 bh->b_bdev = bdev; 959 bh->b_blocknr = block; 960 if (uptodate) 961 set_buffer_uptodate(bh); 962 set_buffer_mapped(bh); 963 } 964 block++; 965 bh = bh->b_this_page; 966 } while (bh != head); 967} 968 969/* 970 * Create the page-cache page that contains the requested block. 971 * 972 * This is user purely for blockdev mappings. 
973 */ 974static struct page * 975grow_dev_page(struct block_device *bdev, sector_t block, 976 pgoff_t index, int size) 977{ 978 struct inode *inode = bdev->bd_inode; 979 struct page *page; 980 struct buffer_head *bh; 981 982 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 983 if (!page) 984 return NULL; 985 986 BUG_ON(!PageLocked(page)); 987 988 if (page_has_buffers(page)) { 989 bh = page_buffers(page); 990 if (bh->b_size == size) { 991 init_page_buffers(page, bdev, block, size); 992 return page; 993 } 994 if (!try_to_free_buffers(page)) 995 goto failed; 996 } 997 998 /* 999 * Allocate some buffers for this page 1000 */ 1001 bh = alloc_page_buffers(page, size, 0); 1002 if (!bh) 1003 goto failed; 1004 1005 /* 1006 * Link the page to the buffers and initialise them. Take the 1007 * lock to be atomic wrt __find_get_block(), which does not 1008 * run under the page lock. 1009 */ 1010 spin_lock(&inode->i_mapping->private_lock); 1011 link_dev_buffers(page, bh); 1012 init_page_buffers(page, bdev, block, size); 1013 spin_unlock(&inode->i_mapping->private_lock); 1014 return page; 1015 1016failed: 1017 BUG(); 1018 unlock_page(page); 1019 page_cache_release(page); 1020 return NULL; 1021} 1022 1023/* 1024 * Create buffers for the specified block device block's page. If 1025 * that page was dirty, the buffers are set dirty also. 1026 * 1027 * Except that's a bug. Attaching dirty buffers to a dirty 1028 * blockdev's page can result in filesystem corruption, because 1029 * some of those buffers may be aliases of filesystem data. 1030 * grow_dev_page() will go BUG() if this happens. 1031 */ 1032static int 1033grow_buffers(struct block_device *bdev, sector_t block, int size) 1034{ 1035 struct page *page; 1036 pgoff_t index; 1037 int sizebits; 1038 1039 sizebits = -1; 1040 do { 1041 sizebits++; 1042 } while ((size << sizebits) < PAGE_SIZE); 1043 1044 index = block >> sizebits; 1045 1046 /* 1047 * Check for a block which wants to lie outside our maximum possible 1048 * pagecache index. (this comparison is done using sector_t types). 1049 */ 1050 if (unlikely(index != block >> sizebits)) { 1051 char b[BDEVNAME_SIZE]; 1052 1053 printk(KERN_ERR "%s: requested out-of-range block %llu for " 1054 "device %s\n", 1055 __FUNCTION__, (unsigned long long)block, 1056 bdevname(bdev, b)); 1057 return -EIO; 1058 } 1059 block = index << sizebits; 1060 /* Create a page with the proper size buffers.. 
*/ 1061 page = grow_dev_page(bdev, block, index, size); 1062 if (!page) 1063 return 0; 1064 unlock_page(page); 1065 page_cache_release(page); 1066 return 1; 1067} 1068 1069static struct buffer_head * 1070__getblk_slow(struct block_device *bdev, sector_t block, int size) 1071{ 1072 /* Size must be multiple of hard sectorsize */ 1073 if (unlikely(size & (bdev_hardsect_size(bdev)-1) || 1074 (size < 512 || size > PAGE_SIZE))) { 1075 printk(KERN_ERR "getblk(): invalid block size %d requested\n", 1076 size); 1077 printk(KERN_ERR "hardsect size: %d\n", 1078 bdev_hardsect_size(bdev)); 1079 1080 dump_stack(); 1081 return NULL; 1082 } 1083 1084 for (;;) { 1085 struct buffer_head * bh; 1086 int ret; 1087 1088 bh = __find_get_block(bdev, block, size); 1089 if (bh) 1090 return bh; 1091 1092 ret = grow_buffers(bdev, block, size); 1093 if (ret < 0) 1094 return NULL; 1095 if (ret == 0) 1096 free_more_memory(); 1097 } 1098} 1099 1100/* 1101 * The relationship between dirty buffers and dirty pages: 1102 * 1103 * Whenever a page has any dirty buffers, the page's dirty bit is set, and 1104 * the page is tagged dirty in its radix tree. 1105 * 1106 * At all times, the dirtiness of the buffers represents the dirtiness of 1107 * subsections of the page. If the page has buffers, the page dirty bit is 1108 * merely a hint about the true dirty state. 1109 * 1110 * When a page is set dirty in its entirety, all its buffers are marked dirty 1111 * (if the page has buffers). 1112 * 1113 * When a buffer is marked dirty, its page is dirtied, but the page's other 1114 * buffers are not. 1115 * 1116 * Also. When blockdev buffers are explicitly read with bread(), they 1117 * individually become uptodate. But their backing page remains not 1118 * uptodate - even if all of its buffers are uptodate. A subsequent 1119 * block_read_full_page() against that page will discover all the uptodate 1120 * buffers, will set the page uptodate and will perform no I/O. 1121 */ 1122 1123/** 1124 * mark_buffer_dirty - mark a buffer_head as needing writeout 1125 * @bh: the buffer_head to mark dirty 1126 * 1127 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its 1128 * backing page dirty, then tag the page as dirty in its address_space's radix 1129 * tree and then attach the address_space's inode to its superblock's dirty 1130 * inode list. 1131 * 1132 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, 1133 * mapping->tree_lock and the global inode_lock. 1134 */ 1135void fastcall mark_buffer_dirty(struct buffer_head *bh) 1136{ 1137 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) 1138 __set_page_dirty_nobuffers(bh->b_page); 1139} 1140 1141/* 1142 * Decrement a buffer_head's reference count. If all buffers against a page 1143 * have zero reference count, are clean and unlocked, and if the page is clean 1144 * and unlocked then try_to_free_buffers() may strip the buffers from the page 1145 * in preparation for freeing it (sometimes, rarely, buffers are removed from 1146 * a page but it ends up not being freed, and buffers may later be reattached). 1147 */ 1148void __brelse(struct buffer_head * buf) 1149{ 1150 if (atomic_read(&buf->b_count)) { 1151 put_bh(buf); 1152 return; 1153 } 1154 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); 1155 WARN_ON(1); 1156} 1157 1158/* 1159 * bforget() is like brelse(), except it discards any 1160 * potentially dirty data. 
1161 */ 1162void __bforget(struct buffer_head *bh) 1163{ 1164 clear_buffer_dirty(bh); 1165 if (!list_empty(&bh->b_assoc_buffers)) { 1166 struct address_space *buffer_mapping = bh->b_page->mapping; 1167 1168 spin_lock(&buffer_mapping->private_lock); 1169 list_del_init(&bh->b_assoc_buffers); 1170 spin_unlock(&buffer_mapping->private_lock); 1171 } 1172 __brelse(bh); 1173} 1174 1175static struct buffer_head *__bread_slow(struct buffer_head *bh) 1176{ 1177 lock_buffer(bh); 1178 if (buffer_uptodate(bh)) { 1179 unlock_buffer(bh); 1180 return bh; 1181 } else { 1182 get_bh(bh); 1183 bh->b_end_io = end_buffer_read_sync; 1184 submit_bh(READ, bh); 1185 wait_on_buffer(bh); 1186 if (buffer_uptodate(bh)) 1187 return bh; 1188 } 1189 brelse(bh); 1190 return NULL; 1191} 1192 1193/* 1194 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). 1195 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their 1196 * refcount elevated by one when they're in an LRU. A buffer can only appear 1197 * once in a particular CPU's LRU. A single buffer can be present in multiple 1198 * CPU's LRUs at the same time. 1199 * 1200 * This is a transparent caching front-end to sb_bread(), sb_getblk() and 1201 * sb_find_get_block(). 1202 * 1203 * The LRUs themselves only need locking against invalidate_bh_lrus. We use 1204 * a local interrupt disable for that. 1205 */ 1206 1207#define BH_LRU_SIZE 8 1208 1209struct bh_lru { 1210 struct buffer_head *bhs[BH_LRU_SIZE]; 1211}; 1212 1213static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; 1214 1215#ifdef CONFIG_SMP 1216#define bh_lru_lock() local_irq_disable() 1217#define bh_lru_unlock() local_irq_enable() 1218#else 1219#define bh_lru_lock() preempt_disable() 1220#define bh_lru_unlock() preempt_enable() 1221#endif 1222 1223static inline void check_irqs_on(void) 1224{ 1225#ifdef irqs_disabled 1226 BUG_ON(irqs_disabled()); 1227#endif 1228} 1229 1230/* 1231 * The LRU management algorithm is dopey-but-simple. Sorry. 1232 */ 1233static void bh_lru_install(struct buffer_head *bh) 1234{ 1235 struct buffer_head *evictee = NULL; 1236 struct bh_lru *lru; 1237 1238 check_irqs_on(); 1239 bh_lru_lock(); 1240 lru = &__get_cpu_var(bh_lrus); 1241 if (lru->bhs[0] != bh) { 1242 struct buffer_head *bhs[BH_LRU_SIZE]; 1243 int in; 1244 int out = 0; 1245 1246 get_bh(bh); 1247 bhs[out++] = bh; 1248 for (in = 0; in < BH_LRU_SIZE; in++) { 1249 struct buffer_head *bh2 = lru->bhs[in]; 1250 1251 if (bh2 == bh) { 1252 __brelse(bh2); 1253 } else { 1254 if (out >= BH_LRU_SIZE) { 1255 BUG_ON(evictee != NULL); 1256 evictee = bh2; 1257 } else { 1258 bhs[out++] = bh2; 1259 } 1260 } 1261 } 1262 while (out < BH_LRU_SIZE) 1263 bhs[out++] = NULL; 1264 memcpy(lru->bhs, bhs, sizeof(bhs)); 1265 } 1266 bh_lru_unlock(); 1267 1268 if (evictee) 1269 __brelse(evictee); 1270} 1271 1272/* 1273 * Look up the bh in this cpu's LRU. If it's there, move it to the head. 
1274 */ 1275static struct buffer_head * 1276lookup_bh_lru(struct block_device *bdev, sector_t block, int size) 1277{ 1278 struct buffer_head *ret = NULL; 1279 struct bh_lru *lru; 1280 int i; 1281 1282 check_irqs_on(); 1283 bh_lru_lock(); 1284 lru = &__get_cpu_var(bh_lrus); 1285 for (i = 0; i < BH_LRU_SIZE; i++) { 1286 struct buffer_head *bh = lru->bhs[i]; 1287 1288 if (bh && bh->b_bdev == bdev && 1289 bh->b_blocknr == block && bh->b_size == size) { 1290 if (i) { 1291 while (i) { 1292 lru->bhs[i] = lru->bhs[i - 1]; 1293 i--; 1294 } 1295 lru->bhs[0] = bh; 1296 } 1297 get_bh(bh); 1298 ret = bh; 1299 break; 1300 } 1301 } 1302 bh_lru_unlock(); 1303 return ret; 1304} 1305 1306/* 1307 * Perform a pagecache lookup for the matching buffer. If it's there, refresh 1308 * it in the LRU and mark it as accessed. If it is not present then return 1309 * NULL 1310 */ 1311struct buffer_head * 1312__find_get_block(struct block_device *bdev, sector_t block, int size) 1313{ 1314 struct buffer_head *bh = lookup_bh_lru(bdev, block, size); 1315 1316 if (bh == NULL) { 1317 bh = __find_get_block_slow(bdev, block); 1318 if (bh) 1319 bh_lru_install(bh); 1320 } 1321 if (bh) 1322 touch_buffer(bh); 1323 return bh; 1324} 1325EXPORT_SYMBOL(__find_get_block); 1326 1327/* 1328 * __getblk will locate (and, if necessary, create) the buffer_head 1329 * which corresponds to the passed block_device, block and size. The 1330 * returned buffer has its reference count incremented. 1331 * 1332 * __getblk() cannot fail - it just keeps trying. If you pass it an 1333 * illegal block number, __getblk() will happily return a buffer_head 1334 * which represents the non-existent block. Very weird. 1335 * 1336 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() 1337 * attempt is failing. FIXME, perhaps? 1338 */ 1339struct buffer_head * 1340__getblk(struct block_device *bdev, sector_t block, int size) 1341{ 1342 struct buffer_head *bh = __find_get_block(bdev, block, size); 1343 1344 might_sleep(); 1345 if (bh == NULL) 1346 bh = __getblk_slow(bdev, block, size); 1347 return bh; 1348} 1349EXPORT_SYMBOL(__getblk); 1350 1351/* 1352 * Do async read-ahead on a buffer.. 1353 */ 1354void __breadahead(struct block_device *bdev, sector_t block, int size) 1355{ 1356 struct buffer_head *bh = __getblk(bdev, block, size); 1357 if (likely(bh)) { 1358 ll_rw_block(READA, 1, &bh); 1359 brelse(bh); 1360 } 1361} 1362EXPORT_SYMBOL(__breadahead); 1363 1364/** 1365 * __bread() - reads a specified block and returns the bh 1366 * @bdev: the block_device to read from 1367 * @block: number of block 1368 * @size: size (in bytes) to read 1369 * 1370 * Reads a specified block, and returns buffer head that contains it. 1371 * It returns NULL if the block was unreadable. 1372 */ 1373struct buffer_head * 1374__bread(struct block_device *bdev, sector_t block, int size) 1375{ 1376 struct buffer_head *bh = __getblk(bdev, block, size); 1377 1378 if (likely(bh) && !buffer_uptodate(bh)) 1379 bh = __bread_slow(bh); 1380 return bh; 1381} 1382EXPORT_SYMBOL(__bread); 1383 1384/* 1385 * invalidate_bh_lrus() is called rarely - but not only at unmount. 1386 * This doesn't race because it runs in each cpu either in irq 1387 * or with preempt disabled. 
1388 */ 1389static void invalidate_bh_lru(void *arg) 1390{ 1391 struct bh_lru *b = &get_cpu_var(bh_lrus); 1392 int i; 1393 1394 for (i = 0; i < BH_LRU_SIZE; i++) { 1395 brelse(b->bhs[i]); 1396 b->bhs[i] = NULL; 1397 } 1398 put_cpu_var(bh_lrus); 1399} 1400 1401static void invalidate_bh_lrus(void) 1402{ 1403 on_each_cpu(invalidate_bh_lru, NULL, 1, 1); 1404} 1405 1406void set_bh_page(struct buffer_head *bh, 1407 struct page *page, unsigned long offset) 1408{ 1409 bh->b_page = page; 1410 BUG_ON(offset >= PAGE_SIZE); 1411 if (PageHighMem(page)) 1412 /* 1413 * This catches illegal uses and preserves the offset: 1414 */ 1415 bh->b_data = (char *)(0 + offset); 1416 else 1417 bh->b_data = page_address(page) + offset; 1418} 1419EXPORT_SYMBOL(set_bh_page); 1420 1421/* 1422 * Called when truncating a buffer on a page completely. 1423 */ 1424static void discard_buffer(struct buffer_head * bh) 1425{ 1426 lock_buffer(bh); 1427 clear_buffer_dirty(bh); 1428 bh->b_bdev = NULL; 1429 clear_buffer_mapped(bh); 1430 clear_buffer_req(bh); 1431 clear_buffer_new(bh); 1432 clear_buffer_delay(bh); 1433 unlock_buffer(bh); 1434} 1435 1436/** 1437 * block_invalidatepage - invalidate part of all of a buffer-backed page 1438 * 1439 * @page: the page which is affected 1440 * @offset: the index of the truncation point 1441 * 1442 * block_invalidatepage() is called when all or part of the page has become 1443 * invalidatedby a truncate operation. 1444 * 1445 * block_invalidatepage() does not have to release all buffers, but it must 1446 * ensure that no dirty buffer is left outside @offset and that no I/O 1447 * is underway against any of the blocks which are outside the truncation 1448 * point. Because the caller is about to free (and possibly reuse) those 1449 * blocks on-disk. 1450 */ 1451void block_invalidatepage(struct page *page, unsigned long offset) 1452{ 1453 struct buffer_head *head, *bh, *next; 1454 unsigned int curr_off = 0; 1455 1456 BUG_ON(!PageLocked(page)); 1457 if (!page_has_buffers(page)) 1458 goto out; 1459 1460 head = page_buffers(page); 1461 bh = head; 1462 do { 1463 unsigned int next_off = curr_off + bh->b_size; 1464 next = bh->b_this_page; 1465 1466 /* 1467 * is this block fully invalidated? 1468 */ 1469 if (offset <= curr_off) 1470 discard_buffer(bh); 1471 curr_off = next_off; 1472 bh = next; 1473 } while (bh != head); 1474 1475 /* 1476 * We release buffers only if the entire page is being invalidated. 1477 * The get_block cached value has been unconditionally invalidated, 1478 * so real IO is not possible anymore. 1479 */ 1480 if (offset == 0) 1481 try_to_release_page(page, 0); 1482out: 1483 return; 1484} 1485EXPORT_SYMBOL(block_invalidatepage); 1486 1487/* 1488 * We attach and possibly dirty the buffers atomically wrt 1489 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers 1490 * is already excluded via the page lock. 
1491 */ 1492void create_empty_buffers(struct page *page, 1493 unsigned long blocksize, unsigned long b_state) 1494{ 1495 struct buffer_head *bh, *head, *tail; 1496 1497 head = alloc_page_buffers(page, blocksize, 1); 1498 bh = head; 1499 do { 1500 bh->b_state |= b_state; 1501 tail = bh; 1502 bh = bh->b_this_page; 1503 } while (bh); 1504 tail->b_this_page = head; 1505 1506 spin_lock(&page->mapping->private_lock); 1507 if (PageUptodate(page) || PageDirty(page)) { 1508 bh = head; 1509 do { 1510 if (PageDirty(page)) 1511 set_buffer_dirty(bh); 1512 if (PageUptodate(page)) 1513 set_buffer_uptodate(bh); 1514 bh = bh->b_this_page; 1515 } while (bh != head); 1516 } 1517 attach_page_buffers(page, head); 1518 spin_unlock(&page->mapping->private_lock); 1519} 1520EXPORT_SYMBOL(create_empty_buffers); 1521 1522/* 1523 * We are taking a block for data and we don't want any output from any 1524 * buffer-cache aliases starting from return from that function and 1525 * until the moment when something will explicitly mark the buffer 1526 * dirty (hopefully that will not happen until we will free that block ;-) 1527 * We don't even need to mark it not-uptodate - nobody can expect 1528 * anything from a newly allocated buffer anyway. We used to used 1529 * unmap_buffer() for such invalidation, but that was wrong. We definitely 1530 * don't want to mark the alias unmapped, for example - it would confuse 1531 * anyone who might pick it with bread() afterwards... 1532 * 1533 * Also.. Note that bforget() doesn't lock the buffer. So there can 1534 * be writeout I/O going on against recently-freed buffers. We don't 1535 * wait on that I/O in bforget() - it's more efficient to wait on the I/O 1536 * only if we really need to. That happens here. 1537 */ 1538void unmap_underlying_metadata(struct block_device *bdev, sector_t block) 1539{ 1540 struct buffer_head *old_bh; 1541 1542 might_sleep(); 1543 1544 old_bh = __find_get_block_slow(bdev, block); 1545 if (old_bh) { 1546 clear_buffer_dirty(old_bh); 1547 wait_on_buffer(old_bh); 1548 clear_buffer_req(old_bh); 1549 __brelse(old_bh); 1550 } 1551} 1552EXPORT_SYMBOL(unmap_underlying_metadata); 1553 1554/* 1555 * NOTE! All mapped/uptodate combinations are valid: 1556 * 1557 * Mapped Uptodate Meaning 1558 * 1559 * No No "unknown" - must do get_block() 1560 * No Yes "hole" - zero-filled 1561 * Yes No "allocated" - allocated on disk, not read in 1562 * Yes Yes "valid" - allocated and up-to-date in memory. 1563 * 1564 * "Dirty" is valid only with the last case (mapped+uptodate). 1565 */ 1566 1567/* 1568 * While block_write_full_page is writing back the dirty buffers under 1569 * the page lock, whoever dirtied the buffers may decide to clean them 1570 * again at any time. We handle that by only looking at the buffer 1571 * state inside lock_buffer(). 1572 * 1573 * If block_write_full_page() is called for regular writeback 1574 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a 1575 * locked buffer. This only can happen if someone has written the buffer 1576 * directly, with submit_bh(). At the address_space level PageWriteback 1577 * prevents this contention from occurring. 
1578 */ 1579static int __block_write_full_page(struct inode *inode, struct page *page, 1580 get_block_t *get_block, struct writeback_control *wbc) 1581{ 1582 int err; 1583 sector_t block; 1584 sector_t last_block; 1585 struct buffer_head *bh, *head; 1586 const unsigned blocksize = 1 << inode->i_blkbits; 1587 int nr_underway = 0; 1588 1589 BUG_ON(!PageLocked(page)); 1590 1591 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; 1592 1593 if (!page_has_buffers(page)) { 1594 create_empty_buffers(page, blocksize, 1595 (1 << BH_Dirty)|(1 << BH_Uptodate)); 1596 } 1597 1598 /* 1599 * Be very careful. We have no exclusion from __set_page_dirty_buffers 1600 * here, and the (potentially unmapped) buffers may become dirty at 1601 * any time. If a buffer becomes dirty here after we've inspected it 1602 * then we just miss that fact, and the page stays dirty. 1603 * 1604 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; 1605 * handle that here by just cleaning them. 1606 */ 1607 1608 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1609 head = page_buffers(page); 1610 bh = head; 1611 1612 /* 1613 * Get all the dirty buffers mapped to disk addresses and 1614 * handle any aliases from the underlying blockdev's mapping. 1615 */ 1616 do { 1617 if (block > last_block) { 1618 /* 1619 * mapped buffers outside i_size will occur, because 1620 * this page can be outside i_size when there is a 1621 * truncate in progress. 1622 */ 1623 /* 1624 * The buffer was zeroed by block_write_full_page() 1625 */ 1626 clear_buffer_dirty(bh); 1627 set_buffer_uptodate(bh); 1628 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { 1629 WARN_ON(bh->b_size != blocksize); 1630 err = get_block(inode, block, bh, 1); 1631 if (err) 1632 goto recover; 1633 if (buffer_new(bh)) { 1634 /* blockdev mappings never come here */ 1635 clear_buffer_new(bh); 1636 unmap_underlying_metadata(bh->b_bdev, 1637 bh->b_blocknr); 1638 } 1639 } 1640 bh = bh->b_this_page; 1641 block++; 1642 } while (bh != head); 1643 1644 do { 1645 if (!buffer_mapped(bh)) 1646 continue; 1647 /* 1648 * If it's a fully non-blocking write attempt and we cannot 1649 * lock the buffer then redirty the page. Note that this can 1650 * potentially cause a busy-wait loop from pdflush and kswapd 1651 * activity, but those code paths have their own higher-level 1652 * throttling. 1653 */ 1654 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 1655 lock_buffer(bh); 1656 } else if (test_set_buffer_locked(bh)) { 1657 redirty_page_for_writepage(wbc, page); 1658 continue; 1659 } 1660 if (test_clear_buffer_dirty(bh)) { 1661 mark_buffer_async_write(bh); 1662 } else { 1663 unlock_buffer(bh); 1664 } 1665 } while ((bh = bh->b_this_page) != head); 1666 1667 /* 1668 * The page and its buffers are protected by PageWriteback(), so we can 1669 * drop the bh refcounts early. 1670 */ 1671 BUG_ON(PageWriteback(page)); 1672 set_page_writeback(page); 1673 1674 do { 1675 struct buffer_head *next = bh->b_this_page; 1676 if (buffer_async_write(bh)) { 1677 submit_bh(WRITE, bh); 1678 nr_underway++; 1679 } 1680 bh = next; 1681 } while (bh != head); 1682 unlock_page(page); 1683 1684 err = 0; 1685done: 1686 if (nr_underway == 0) { 1687 /* 1688 * The page was marked dirty, but the buffers were 1689 * clean. Someone wrote them back by hand with 1690 * ll_rw_block/submit_bh. A rare case. 
1691 */ 1692 int uptodate = 1; 1693 do { 1694 if (!buffer_uptodate(bh)) { 1695 uptodate = 0; 1696 break; 1697 } 1698 bh = bh->b_this_page; 1699 } while (bh != head); 1700 if (uptodate) 1701 SetPageUptodate(page); 1702 end_page_writeback(page); 1703 /* 1704 * The page and buffer_heads can be released at any time from 1705 * here on. 1706 */ 1707 wbc->pages_skipped++; /* We didn't write this page */ 1708 } 1709 return err; 1710 1711recover: 1712 /* 1713 * ENOSPC, or some other error. We may already have added some 1714 * blocks to the file, so we need to write these out to avoid 1715 * exposing stale data. 1716 * The page is currently locked and not marked for writeback 1717 */ 1718 bh = head; 1719 /* Recovery: lock and submit the mapped buffers */ 1720 do { 1721 if (buffer_mapped(bh) && buffer_dirty(bh)) { 1722 lock_buffer(bh); 1723 mark_buffer_async_write(bh); 1724 } else { 1725 /* 1726 * The buffer may have been set dirty during 1727 * attachment to a dirty page. 1728 */ 1729 clear_buffer_dirty(bh); 1730 } 1731 } while ((bh = bh->b_this_page) != head); 1732 SetPageError(page); 1733 BUG_ON(PageWriteback(page)); 1734 set_page_writeback(page); 1735 unlock_page(page); 1736 do { 1737 struct buffer_head *next = bh->b_this_page; 1738 if (buffer_async_write(bh)) { 1739 clear_buffer_dirty(bh); 1740 submit_bh(WRITE, bh); 1741 nr_underway++; 1742 } 1743 bh = next; 1744 } while (bh != head); 1745 goto done; 1746} 1747 1748static int __block_prepare_write(struct inode *inode, struct page *page, 1749 unsigned from, unsigned to, get_block_t *get_block) 1750{ 1751 unsigned block_start, block_end; 1752 sector_t block; 1753 int err = 0; 1754 unsigned blocksize, bbits; 1755 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; 1756 1757 BUG_ON(!PageLocked(page)); 1758 BUG_ON(from > PAGE_CACHE_SIZE); 1759 BUG_ON(to > PAGE_CACHE_SIZE); 1760 BUG_ON(from > to); 1761 1762 blocksize = 1 << inode->i_blkbits; 1763 if (!page_has_buffers(page)) 1764 create_empty_buffers(page, blocksize, 0); 1765 head = page_buffers(page); 1766 1767 bbits = inode->i_blkbits; 1768 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); 1769 1770 for(bh = head, block_start = 0; bh != head || !block_start; 1771 block++, block_start=block_end, bh = bh->b_this_page) { 1772 block_end = block_start + blocksize; 1773 if (block_end <= from || block_start >= to) { 1774 if (PageUptodate(page)) { 1775 if (!buffer_uptodate(bh)) 1776 set_buffer_uptodate(bh); 1777 } 1778 continue; 1779 } 1780 if (buffer_new(bh)) 1781 clear_buffer_new(bh); 1782 if (!buffer_mapped(bh)) { 1783 WARN_ON(bh->b_size != blocksize); 1784 err = get_block(inode, block, bh, 1); 1785 if (err) 1786 break; 1787 if (buffer_new(bh)) { 1788 unmap_underlying_metadata(bh->b_bdev, 1789 bh->b_blocknr); 1790 if (PageUptodate(page)) { 1791 set_buffer_uptodate(bh); 1792 continue; 1793 } 1794 if (block_end > to || block_start < from) { 1795 void *kaddr; 1796 1797 kaddr = kmap_atomic(page, KM_USER0); 1798 if (block_end > to) 1799 memset(kaddr+to, 0, 1800 block_end-to); 1801 if (block_start < from) 1802 memset(kaddr+block_start, 1803 0, from-block_start); 1804 flush_dcache_page(page); 1805 kunmap_atomic(kaddr, KM_USER0); 1806 } 1807 continue; 1808 } 1809 } 1810 if (PageUptodate(page)) { 1811 if (!buffer_uptodate(bh)) 1812 set_buffer_uptodate(bh); 1813 continue; 1814 } 1815 if (!buffer_uptodate(bh) && !buffer_delay(bh) && 1816 (block_start < from || block_end > to)) { 1817 ll_rw_block(READ, 1, &bh); 1818 *wait_bh++=bh; 1819 } 1820 } 1821 /* 1822 * If we issued read requests - let them 
complete. 1823 */ 1824 while(wait_bh > wait) { 1825 wait_on_buffer(*--wait_bh); 1826 if (!buffer_uptodate(*wait_bh)) 1827 err = -EIO; 1828 } 1829 if (!err) { 1830 bh = head; 1831 do { 1832 if (buffer_new(bh)) 1833 clear_buffer_new(bh); 1834 } while ((bh = bh->b_this_page) != head); 1835 return 0; 1836 } 1837 /* Error case: */ 1838 /* 1839 * Zero out any newly allocated blocks to avoid exposing stale 1840 * data. If BH_New is set, we know that the block was newly 1841 * allocated in the above loop. 1842 */ 1843 bh = head; 1844 block_start = 0; 1845 do { 1846 block_end = block_start+blocksize; 1847 if (block_end <= from) 1848 goto next_bh; 1849 if (block_start >= to) 1850 break; 1851 if (buffer_new(bh)) { 1852 void *kaddr; 1853 1854 clear_buffer_new(bh); 1855 kaddr = kmap_atomic(page, KM_USER0); 1856 memset(kaddr+block_start, 0, bh->b_size); 1857 flush_dcache_page(page); 1858 kunmap_atomic(kaddr, KM_USER0); 1859 set_buffer_uptodate(bh); 1860 mark_buffer_dirty(bh); 1861 } 1862next_bh: 1863 block_start = block_end; 1864 bh = bh->b_this_page; 1865 } while (bh != head); 1866 return err; 1867} 1868 1869static int __block_commit_write(struct inode *inode, struct page *page, 1870 unsigned from, unsigned to) 1871{ 1872 unsigned block_start, block_end; 1873 int partial = 0; 1874 unsigned blocksize; 1875 struct buffer_head *bh, *head; 1876 1877 blocksize = 1 << inode->i_blkbits; 1878 1879 for(bh = head = page_buffers(page), block_start = 0; 1880 bh != head || !block_start; 1881 block_start=block_end, bh = bh->b_this_page) { 1882 block_end = block_start + blocksize; 1883 if (block_end <= from || block_start >= to) { 1884 if (!buffer_uptodate(bh)) 1885 partial = 1; 1886 } else { 1887 set_buffer_uptodate(bh); 1888 mark_buffer_dirty(bh); 1889 } 1890 } 1891 1892 /* 1893 * If this is a partial write which happened to make all buffers 1894 * uptodate then we can optimize away a bogus readpage() for 1895 * the next read(). Here we 'discover' whether the page went 1896 * uptodate as a result of this (potentially partial) write. 1897 */ 1898 if (!partial) 1899 SetPageUptodate(page); 1900 return 0; 1901} 1902 1903/* 1904 * Generic "read page" function for block devices that have the normal 1905 * get_block functionality. This is most of the block device filesystems. 1906 * Reads the page asynchronously --- the unlock_buffer() and 1907 * set/clear_buffer_uptodate() functions propagate buffer state into the 1908 * page struct once IO has completed. 
1909 */ 1910int block_read_full_page(struct page *page, get_block_t *get_block) 1911{ 1912 struct inode *inode = page->mapping->host; 1913 sector_t iblock, lblock; 1914 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; 1915 unsigned int blocksize; 1916 int nr, i; 1917 int fully_mapped = 1; 1918 1919 BUG_ON(!PageLocked(page)); 1920 blocksize = 1 << inode->i_blkbits; 1921 if (!page_has_buffers(page)) 1922 create_empty_buffers(page, blocksize, 0); 1923 head = page_buffers(page); 1924 1925 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1926 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; 1927 bh = head; 1928 nr = 0; 1929 i = 0; 1930 1931 do { 1932 if (buffer_uptodate(bh)) 1933 continue; 1934 1935 if (!buffer_mapped(bh)) { 1936 int err = 0; 1937 1938 fully_mapped = 0; 1939 if (iblock < lblock) { 1940 WARN_ON(bh->b_size != blocksize); 1941 err = get_block(inode, iblock, bh, 0); 1942 if (err) 1943 SetPageError(page); 1944 } 1945 if (!buffer_mapped(bh)) { 1946 void *kaddr = kmap_atomic(page, KM_USER0); 1947 memset(kaddr + i * blocksize, 0, blocksize); 1948 flush_dcache_page(page); 1949 kunmap_atomic(kaddr, KM_USER0); 1950 if (!err) 1951 set_buffer_uptodate(bh); 1952 continue; 1953 } 1954 /* 1955 * get_block() might have updated the buffer 1956 * synchronously 1957 */ 1958 if (buffer_uptodate(bh)) 1959 continue; 1960 } 1961 arr[nr++] = bh; 1962 } while (i++, iblock++, (bh = bh->b_this_page) != head); 1963 1964 if (fully_mapped) 1965 SetPageMappedToDisk(page); 1966 1967 if (!nr) { 1968 /* 1969 * All buffers are uptodate - we can set the page uptodate 1970 * as well. But not if get_block() returned an error. 1971 */ 1972 if (!PageError(page)) 1973 SetPageUptodate(page); 1974 unlock_page(page); 1975 return 0; 1976 } 1977 1978 /* Stage two: lock the buffers */ 1979 for (i = 0; i < nr; i++) { 1980 bh = arr[i]; 1981 lock_buffer(bh); 1982 mark_buffer_async_read(bh); 1983 } 1984 1985 /* 1986 * Stage 3: start the IO. Check for uptodateness 1987 * inside the buffer lock in case another process reading 1988 * the underlying blockdev brought it uptodate (the sct fix). 1989 */ 1990 for (i = 0; i < nr; i++) { 1991 bh = arr[i]; 1992 if (buffer_uptodate(bh)) 1993 end_buffer_async_read(bh, 1); 1994 else 1995 submit_bh(READ, bh); 1996 } 1997 return 0; 1998} 1999 2000/* utility function for filesystems that need to do work on expanding 2001 * truncates. Uses prepare/commit_write to allow the filesystem to 2002 * deal with the hole. 2003 */ 2004static int __generic_cont_expand(struct inode *inode, loff_t size, 2005 pgoff_t index, unsigned int offset) 2006{ 2007 struct address_space *mapping = inode->i_mapping; 2008 struct page *page; 2009 unsigned long limit; 2010 int err; 2011 2012 err = -EFBIG; 2013 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 2014 if (limit != RLIM_INFINITY && size > (loff_t)limit) { 2015 send_sig(SIGXFSZ, current, 0); 2016 goto out; 2017 } 2018 if (size > inode->i_sb->s_maxbytes) 2019 goto out; 2020 2021 err = -ENOMEM; 2022 page = grab_cache_page(mapping, index); 2023 if (!page) 2024 goto out; 2025 err = mapping->a_ops->prepare_write(NULL, page, offset, offset); 2026 if (err) { 2027 /* 2028 * ->prepare_write() may have instantiated a few blocks 2029 * outside i_size. Trim these off again. 
2030	 */
2031		unlock_page(page);
2032		page_cache_release(page);
2033		vmtruncate(inode, inode->i_size);
2034		goto out;
2035	}
2036
2037	err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2038
2039	unlock_page(page);
2040	page_cache_release(page);
2041	if (err > 0)
2042		err = 0;
2043 out:
2044	return err;
2045 }
2046
2047 int generic_cont_expand(struct inode *inode, loff_t size)
2048 {
2049	pgoff_t index;
2050	unsigned int offset;
2051
2052	offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2053
2054	/* Ugh. In prepare/commit_write, if from == to == start of block, we
2055	** skip the prepare.  Make sure we never send an offset for the start
2056	** of a block.
2057	*/
2058	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2059		/* caller must handle this extra byte. */
2060		offset++;
2061	}
2062	index = size >> PAGE_CACHE_SHIFT;
2063
2064	return __generic_cont_expand(inode, size, index, offset);
2065 }
2066
2067 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2068 {
2069	loff_t pos = size - 1;
2070	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2071	unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2072
2073	/* prepare/commit_write can handle even if from == to == start of block. */
2074	return __generic_cont_expand(inode, size, index, offset);
2075 }
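#if 0
/*
 * Illustrative sketch only, not part of this file: a filesystem that
 * cannot represent holes might call generic_cont_expand_simple() from a
 * hypothetical foofs_setattr() when a truncate grows the file, so the
 * tail of the last page is zeroed before i_size moves.  Assumes the
 * usual 2.6 ->setattr() signature and inode_setattr() from fs/attr.c.
 */
static int foofs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int err = 0;

	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
		err = generic_cont_expand_simple(inode, attr->ia_size);
	if (!err)
		err = inode_setattr(inode, attr);
	return err;
}
#endif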
2076
2077 /*
2078  * For moronic filesystems that do not allow holes in files.
2079  * We may have to extend the file.
2080  */
2081
2082 int cont_prepare_write(struct page *page, unsigned offset,
2083		unsigned to, get_block_t *get_block, loff_t *bytes)
2084 {
2085	struct address_space *mapping = page->mapping;
2086	struct inode *inode = mapping->host;
2087	struct page *new_page;
2088	pgoff_t pgpos;
2089	long status;
2090	unsigned zerofrom;
2091	unsigned blocksize = 1 << inode->i_blkbits;
2092	void *kaddr;
2093
2094	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2095		status = -ENOMEM;
2096		new_page = grab_cache_page(mapping, pgpos);
2097		if (!new_page)
2098			goto out;
2099		/* we might sleep */
2100		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2101			unlock_page(new_page);
2102			page_cache_release(new_page);
2103			continue;
2104		}
2105		zerofrom = *bytes & ~PAGE_CACHE_MASK;
2106		if (zerofrom & (blocksize-1)) {
2107			*bytes |= (blocksize-1);
2108			(*bytes)++;
2109		}
2110		status = __block_prepare_write(inode, new_page, zerofrom,
2111						PAGE_CACHE_SIZE, get_block);
2112		if (status)
2113			goto out_unmap;
2114		kaddr = kmap_atomic(new_page, KM_USER0);
2115		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2116		flush_dcache_page(new_page);
2117		kunmap_atomic(kaddr, KM_USER0);
2118		generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2119		unlock_page(new_page);
2120		page_cache_release(new_page);
2121	}
2122
2123	if (page->index < pgpos) {
2124		/* completely inside the area */
2125		zerofrom = offset;
2126	} else {
2127		/* page covers the boundary, find the boundary offset */
2128		zerofrom = *bytes & ~PAGE_CACHE_MASK;
2129
2130		/* if we expand the file, the last block will be filled */
2131		if (to > zerofrom && (zerofrom & (blocksize-1))) {
2132			*bytes |= (blocksize-1);
2133			(*bytes)++;
2134		}
2135
2136		/* starting below the boundary? Nothing to zero out */
2137		if (offset <= zerofrom)
2138			zerofrom = offset;
2139	}
2140	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2141	if (status)
2142		goto out1;
2143	if (zerofrom < offset) {
2144		kaddr = kmap_atomic(page, KM_USER0);
2145		memset(kaddr+zerofrom, 0, offset-zerofrom);
2146		flush_dcache_page(page);
2147		kunmap_atomic(kaddr, KM_USER0);
2148		__block_commit_write(inode, page, zerofrom, offset);
2149	}
2150	return 0;
2151 out1:
2152	ClearPageUptodate(page);
2153	return status;
2154
2155 out_unmap:
2156	ClearPageUptodate(new_page);
2157	unlock_page(new_page);
2158	page_cache_release(new_page);
2159 out:
2160	return status;
2161 }
2162
2163 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2164			get_block_t *get_block)
2165 {
2166	struct inode *inode = page->mapping->host;
2167	int err = __block_prepare_write(inode, page, from, to, get_block);
2168	if (err)
2169		ClearPageUptodate(page);
2170	return err;
2171 }
2172
2173 int block_commit_write(struct page *page, unsigned from, unsigned to)
2174 {
2175	struct inode *inode = page->mapping->host;
2176	__block_commit_write(inode,page,from,to);
2177	return 0;
2178 }
2179
2180 int generic_commit_write(struct file *file, struct page *page,
2181		unsigned from, unsigned to)
2182 {
2183	struct inode *inode = page->mapping->host;
2184	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2185	__block_commit_write(inode,page,from,to);
2186	/*
2187	 * No need to use i_size_read() here, the i_size
2188	 * cannot change under us because we hold i_mutex.
2189	 */
2190	if (pos > inode->i_size) {
2191		i_size_write(inode, pos);
2192		mark_inode_dirty(inode);
2193	}
2194	return 0;
2195 }
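#if 0
/*
 * Illustrative sketch only, not part of this file: the generic write(2)
 * path drives the ->prepare_write()/->commit_write() pair above roughly
 * like this for each page-sized chunk.  foofs_write_one_page() is a
 * hypothetical helper; the caller is assumed to hold the page lock, and
 * page_address() is used for brevity (i.e. no highmem handling).
 */
static int foofs_write_one_page(struct file *file, struct page *page,
				unsigned offset, unsigned bytes,
				const char *src)
{
	const struct address_space_operations *a_ops = page->mapping->a_ops;
	int status;

	status = a_ops->prepare_write(file, page, offset, offset + bytes);
	if (status)
		return status;
	memcpy(page_address(page) + offset, src, bytes);
	flush_dcache_page(page);
	return a_ops->commit_write(file, page, offset, offset + bytes);
}
#endif

2196
2197
2198 /*
2199  * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2200  * immediately, while under the page lock.  So it needs a special end_io
2201  * handler which does not touch the bh after unlocking it.
2202  *
2203  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2204  * a race there is benign: unlock_buffer() only uses the bh's address for
2205  * hashing after unlocking the buffer, so it doesn't actually touch the bh
2206  * itself.
2207  */
2208 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2209 {
2210	if (uptodate) {
2211		set_buffer_uptodate(bh);
2212	} else {
2213		/* This happens, due to failed READA attempts. */
2214		clear_buffer_uptodate(bh);
2215	}
2216	unlock_buffer(bh);
2217 }
2218
2219 /*
2220  * On entry, the page is fully not uptodate.
2221  * On exit the page is fully uptodate in the areas outside (from,to)
2222  */
2223 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2224			get_block_t *get_block)
2225 {
2226	struct inode *inode = page->mapping->host;
2227	const unsigned blkbits = inode->i_blkbits;
2228	const unsigned blocksize = 1 << blkbits;
2229	struct buffer_head map_bh;
2230	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2231	unsigned block_in_page;
2232	unsigned block_start;
2233	sector_t block_in_file;
2234	char *kaddr;
2235	int nr_reads = 0;
2236	int i;
2237	int ret = 0;
2238	int is_mapped_to_disk = 1;
2239	int dirtied_it = 0;
2240
2241	if (PageMappedToDisk(page))
2242		return 0;
2243
2244	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2245	map_bh.b_page = page;
2246
2247	/*
2248	 * We loop across all blocks in the page, whether or not they are
2249	 * part of the affected region.  This is so we can discover if the
2250	 * page is fully mapped-to-disk.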
2251 */ 2252 for (block_start = 0, block_in_page = 0; 2253 block_start < PAGE_CACHE_SIZE; 2254 block_in_page++, block_start += blocksize) { 2255 unsigned block_end = block_start + blocksize; 2256 int create; 2257 2258 map_bh.b_state = 0; 2259 create = 1; 2260 if (block_start >= to) 2261 create = 0; 2262 map_bh.b_size = blocksize; 2263 ret = get_block(inode, block_in_file + block_in_page, 2264 &map_bh, create); 2265 if (ret) 2266 goto failed; 2267 if (!buffer_mapped(&map_bh)) 2268 is_mapped_to_disk = 0; 2269 if (buffer_new(&map_bh)) 2270 unmap_underlying_metadata(map_bh.b_bdev, 2271 map_bh.b_blocknr); 2272 if (PageUptodate(page)) 2273 continue; 2274 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) { 2275 kaddr = kmap_atomic(page, KM_USER0); 2276 if (block_start < from) { 2277 memset(kaddr+block_start, 0, from-block_start); 2278 dirtied_it = 1; 2279 } 2280 if (block_end > to) { 2281 memset(kaddr + to, 0, block_end - to); 2282 dirtied_it = 1; 2283 } 2284 flush_dcache_page(page); 2285 kunmap_atomic(kaddr, KM_USER0); 2286 continue; 2287 } 2288 if (buffer_uptodate(&map_bh)) 2289 continue; /* reiserfs does this */ 2290 if (block_start < from || block_end > to) { 2291 struct buffer_head *bh = alloc_buffer_head(GFP_NOFS); 2292 2293 if (!bh) { 2294 ret = -ENOMEM; 2295 goto failed; 2296 } 2297 bh->b_state = map_bh.b_state; 2298 atomic_set(&bh->b_count, 0); 2299 bh->b_this_page = NULL; 2300 bh->b_page = page; 2301 bh->b_blocknr = map_bh.b_blocknr; 2302 bh->b_size = blocksize; 2303 bh->b_data = (char *)(long)block_start; 2304 bh->b_bdev = map_bh.b_bdev; 2305 bh->b_private = NULL; 2306 read_bh[nr_reads++] = bh; 2307 } 2308 } 2309 2310 if (nr_reads) { 2311 struct buffer_head *bh; 2312 2313 /* 2314 * The page is locked, so these buffers are protected from 2315 * any VM or truncate activity. Hence we don't need to care 2316 * for the buffer_head refcounts. 2317 */ 2318 for (i = 0; i < nr_reads; i++) { 2319 bh = read_bh[i]; 2320 lock_buffer(bh); 2321 bh->b_end_io = end_buffer_read_nobh; 2322 submit_bh(READ, bh); 2323 } 2324 for (i = 0; i < nr_reads; i++) { 2325 bh = read_bh[i]; 2326 wait_on_buffer(bh); 2327 if (!buffer_uptodate(bh)) 2328 ret = -EIO; 2329 free_buffer_head(bh); 2330 read_bh[i] = NULL; 2331 } 2332 if (ret) 2333 goto failed; 2334 } 2335 2336 if (is_mapped_to_disk) 2337 SetPageMappedToDisk(page); 2338 SetPageUptodate(page); 2339 2340 /* 2341 * Setting the page dirty here isn't necessary for the prepare_write 2342 * function - commit_write will do that. But if/when this function is 2343 * used within the pagefault handler to ensure that all mmapped pages 2344 * have backing space in the filesystem, we will need to dirty the page 2345 * if its contents were altered. 2346 */ 2347 if (dirtied_it) 2348 set_page_dirty(page); 2349 2350 return 0; 2351 2352failed: 2353 for (i = 0; i < nr_reads; i++) { 2354 if (read_bh[i]) 2355 free_buffer_head(read_bh[i]); 2356 } 2357 2358 /* 2359 * Error recovery is pretty slack. Clear the page and mark it dirty 2360 * so we'll later zero out any blocks which _were_ allocated. 
2361	 */
2362	kaddr = kmap_atomic(page, KM_USER0);
2363	memset(kaddr, 0, PAGE_CACHE_SIZE);
2364	flush_dcache_page(page);
2365	kunmap_atomic(kaddr, KM_USER0);
2366	SetPageUptodate(page);
2367	set_page_dirty(page);
2368	return ret;
2369 }
2370 EXPORT_SYMBOL(nobh_prepare_write);
2371
2372 int nobh_commit_write(struct file *file, struct page *page,
2373		unsigned from, unsigned to)
2374 {
2375	struct inode *inode = page->mapping->host;
2376	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2377
2378	set_page_dirty(page);
2379	if (pos > inode->i_size) {
2380		i_size_write(inode, pos);
2381		mark_inode_dirty(inode);
2382	}
2383	return 0;
2384 }
2385 EXPORT_SYMBOL(nobh_commit_write);
2386
2387 /*
2388  * nobh_writepage() - based on block_write_full_page() except
2389  * that it tries to operate without attaching bufferheads to
2390  * the page.
2391  */
2392 int nobh_writepage(struct page *page, get_block_t *get_block,
2393			struct writeback_control *wbc)
2394 {
2395	struct inode * const inode = page->mapping->host;
2396	loff_t i_size = i_size_read(inode);
2397	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2398	unsigned offset;
2399	void *kaddr;
2400	int ret;
2401
2402	/* Is the page fully inside i_size? */
2403	if (page->index < end_index)
2404		goto out;
2405
2406	/* Is the page fully outside i_size? (truncate in progress) */
2407	offset = i_size & (PAGE_CACHE_SIZE-1);
2408	if (page->index >= end_index+1 || !offset) {
2409		/*
2410		 * The page may have dirty, unmapped buffers.  For example,
2411		 * they may have been added in ext3_writepage().  Make them
2412		 * freeable here, so the page does not leak.
2413		 */
2414 #if 0
2415		/* Not really sure about this - do we need this? */
2416		if (page->mapping->a_ops->invalidatepage)
2417			page->mapping->a_ops->invalidatepage(page, offset);
2418 #endif
2419		unlock_page(page);
2420		return 0; /* don't care */
2421	}
2422
2423	/*
2424	 * The page straddles i_size.  It must be zeroed out on each and every
2425	 * writepage invocation because it may be mmapped.  "A file is mapped
2426	 * in multiples of the page size.  For a file that is not a multiple of
2427	 * the page size, the remaining memory is zeroed when mapped, and
2428	 * writes to that region are not written out to the file."
2429	 */
2430	kaddr = kmap_atomic(page, KM_USER0);
2431	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2432	flush_dcache_page(page);
2433	kunmap_atomic(kaddr, KM_USER0);
2434 out:
2435	ret = mpage_writepage(page, get_block, wbc);
2436	if (ret == -EAGAIN)
2437		ret = __block_write_full_page(inode, page, get_block, wbc);
2438	return ret;
2439 }
2440 EXPORT_SYMBOL(nobh_writepage);
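#if 0
/*
 * Illustrative sketch only, not part of this file: wiring the nobh
 * helpers into address_space_operations, assuming a hypothetical
 * foofs_get_block().  ->prepare_write() must really be
 * nobh_prepare_write() here, since nobh_commit_write() (and
 * nobh_truncate_page() below) rely on that pairing.
 */
static int foofs_nobh_prepare_write(struct file *file, struct page *page,
				unsigned from, unsigned to)
{
	return nobh_prepare_write(page, from, to, foofs_get_block);
}

static int foofs_nobh_writepage(struct page *page,
				struct writeback_control *wbc)
{
	return nobh_writepage(page, foofs_get_block, wbc);
}

static const struct address_space_operations foofs_nobh_aops = {
	.writepage	= foofs_nobh_writepage,
	.prepare_write	= foofs_nobh_prepare_write,
	.commit_write	= nobh_commit_write,
	/* ->readpage, ->sync_page etc. as in a normal buffer-head aops */
};
#endif

2441
2442 /*
2443  * This function assumes that ->prepare_write() uses nobh_prepare_write().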
2444 */ 2445int nobh_truncate_page(struct address_space *mapping, loff_t from) 2446{ 2447 struct inode *inode = mapping->host; 2448 unsigned blocksize = 1 << inode->i_blkbits; 2449 pgoff_t index = from >> PAGE_CACHE_SHIFT; 2450 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2451 unsigned to; 2452 struct page *page; 2453 const struct address_space_operations *a_ops = mapping->a_ops; 2454 char *kaddr; 2455 int ret = 0; 2456 2457 if ((offset & (blocksize - 1)) == 0) 2458 goto out; 2459 2460 ret = -ENOMEM; 2461 page = grab_cache_page(mapping, index); 2462 if (!page) 2463 goto out; 2464 2465 to = (offset + blocksize) & ~(blocksize - 1); 2466 ret = a_ops->prepare_write(NULL, page, offset, to); 2467 if (ret == 0) { 2468 kaddr = kmap_atomic(page, KM_USER0); 2469 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); 2470 flush_dcache_page(page); 2471 kunmap_atomic(kaddr, KM_USER0); 2472 set_page_dirty(page); 2473 } 2474 unlock_page(page); 2475 page_cache_release(page); 2476out: 2477 return ret; 2478} 2479EXPORT_SYMBOL(nobh_truncate_page); 2480 2481int block_truncate_page(struct address_space *mapping, 2482 loff_t from, get_block_t *get_block) 2483{ 2484 pgoff_t index = from >> PAGE_CACHE_SHIFT; 2485 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2486 unsigned blocksize; 2487 sector_t iblock; 2488 unsigned length, pos; 2489 struct inode *inode = mapping->host; 2490 struct page *page; 2491 struct buffer_head *bh; 2492 void *kaddr; 2493 int err; 2494 2495 blocksize = 1 << inode->i_blkbits; 2496 length = offset & (blocksize - 1); 2497 2498 /* Block boundary? Nothing to do */ 2499 if (!length) 2500 return 0; 2501 2502 length = blocksize - length; 2503 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2504 2505 page = grab_cache_page(mapping, index); 2506 err = -ENOMEM; 2507 if (!page) 2508 goto out; 2509 2510 if (!page_has_buffers(page)) 2511 create_empty_buffers(page, blocksize, 0); 2512 2513 /* Find the buffer that contains "offset" */ 2514 bh = page_buffers(page); 2515 pos = blocksize; 2516 while (offset >= pos) { 2517 bh = bh->b_this_page; 2518 iblock++; 2519 pos += blocksize; 2520 } 2521 2522 err = 0; 2523 if (!buffer_mapped(bh)) { 2524 WARN_ON(bh->b_size != blocksize); 2525 err = get_block(inode, iblock, bh, 0); 2526 if (err) 2527 goto unlock; 2528 /* unmapped? It's a hole - nothing to do */ 2529 if (!buffer_mapped(bh)) 2530 goto unlock; 2531 } 2532 2533 /* Ok, it's mapped. Make sure it's up-to-date */ 2534 if (PageUptodate(page)) 2535 set_buffer_uptodate(bh); 2536 2537 if (!buffer_uptodate(bh) && !buffer_delay(bh)) { 2538 err = -EIO; 2539 ll_rw_block(READ, 1, &bh); 2540 wait_on_buffer(bh); 2541 /* Uhhuh. Read error. Complain and punt. */ 2542 if (!buffer_uptodate(bh)) 2543 goto unlock; 2544 } 2545 2546 kaddr = kmap_atomic(page, KM_USER0); 2547 memset(kaddr + offset, 0, length); 2548 flush_dcache_page(page); 2549 kunmap_atomic(kaddr, KM_USER0); 2550 2551 mark_buffer_dirty(bh); 2552 err = 0; 2553 2554unlock: 2555 unlock_page(page); 2556 page_cache_release(page); 2557out: 2558 return err; 2559} 2560 2561/* 2562 * The generic ->writepage function for buffer-backed address_spaces 2563 */ 2564int block_write_full_page(struct page *page, get_block_t *get_block, 2565 struct writeback_control *wbc) 2566{ 2567 struct inode * const inode = page->mapping->host; 2568 loff_t i_size = i_size_read(inode); 2569 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2570 unsigned offset; 2571 void *kaddr; 2572 2573 /* Is the page fully inside i_size? 
	 */
2574	if (page->index < end_index)
2575		return __block_write_full_page(inode, page, get_block, wbc);
2576
2577	/* Is the page fully outside i_size? (truncate in progress) */
2578	offset = i_size & (PAGE_CACHE_SIZE-1);
2579	if (page->index >= end_index+1 || !offset) {
2580		/*
2581		 * The page may have dirty, unmapped buffers.  For example,
2582		 * they may have been added in ext3_writepage().  Make them
2583		 * freeable here, so the page does not leak.
2584		 */
2585		do_invalidatepage(page, 0);
2586		unlock_page(page);
2587		return 0; /* don't care */
2588	}
2589
2590	/*
2591	 * The page straddles i_size.  It must be zeroed out on each and every
2592	 * writepage invocation because it may be mmapped.  "A file is mapped
2593	 * in multiples of the page size.  For a file that is not a multiple of
2594	 * the page size, the remaining memory is zeroed when mapped, and
2595	 * writes to that region are not written out to the file."
2596	 */
2597	kaddr = kmap_atomic(page, KM_USER0);
2598	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2599	flush_dcache_page(page);
2600	kunmap_atomic(kaddr, KM_USER0);
2601	return __block_write_full_page(inode, page, get_block, wbc);
2602 }
2603
2604 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2605			    get_block_t *get_block)
2606 {
2607	struct buffer_head tmp;
2608	struct inode *inode = mapping->host;
2609	tmp.b_state = 0;
2610	tmp.b_blocknr = 0;
2611	tmp.b_size = 1 << inode->i_blkbits;
2612	get_block(inode, block, &tmp, 0);
2613	return tmp.b_blocknr;
2614 }
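#if 0
/*
 * Illustrative sketch only, not part of this file: a typical block-based
 * filesystem (cf. ext2) wires the generic helpers in this file into its
 * address_space_operations like this, assuming a hypothetical
 * foofs_get_block() that maps file blocks to disk blocks.
 */
static int foofs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, foofs_get_block);
}

static int foofs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, foofs_get_block, wbc);
}

static int foofs_prepare_write(struct file *file, struct page *page,
				unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, foofs_get_block);
}

static sector_t foofs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, foofs_get_block);
}

static const struct address_space_operations foofs_aops = {
	.readpage	= foofs_readpage,
	.writepage	= foofs_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= foofs_prepare_write,
	.commit_write	= generic_commit_write,	/* extends i_size under i_mutex */
	.bmap		= foofs_bmap,
};
#endif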
2615
2616 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2617 {
2618	struct buffer_head *bh = bio->bi_private;
2619
2620	if (bio->bi_size)
2621		return 1;
2622
2623	if (err == -EOPNOTSUPP) {
2624		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2625		set_bit(BH_Eopnotsupp, &bh->b_state);
2626	}
2627
2628	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2629	bio_put(bio);
2630	return 0;
2631 }
2632
2633 int submit_bh(int rw, struct buffer_head * bh)
2634 {
2635	struct bio *bio;
2636	int ret = 0;
2637
2638	BUG_ON(!buffer_locked(bh));
2639	BUG_ON(!buffer_mapped(bh));
2640	BUG_ON(!bh->b_end_io);
2641
2642	if (buffer_ordered(bh) && (rw == WRITE))
2643		rw = WRITE_BARRIER;
2644
2645	/*
2646	 * Only clear out a write error when rewriting, should this
2647	 * include WRITE_SYNC as well?
2648	 */
2649	if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2650		clear_buffer_write_io_error(bh);
2651
2652	/*
2653	 * from here on down, it's all bio -- do the initial mapping,
2654	 * submit_bio -> generic_make_request may further map this bio around
2655	 */
2656	bio = bio_alloc(GFP_NOIO, 1);
2657
2658	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2659	bio->bi_bdev = bh->b_bdev;
2660	bio->bi_io_vec[0].bv_page = bh->b_page;
2661	bio->bi_io_vec[0].bv_len = bh->b_size;
2662	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2663
2664	bio->bi_vcnt = 1;
2665	bio->bi_idx = 0;
2666	bio->bi_size = bh->b_size;
2667
2668	bio->bi_end_io = end_bio_bh_io_sync;
2669	bio->bi_private = bh;
2670
2671	bio_get(bio);
2672	submit_bio(rw, bio);
2673
2674	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2675		ret = -EOPNOTSUPP;
2676
2677	bio_put(bio);
2678	return ret;
2679 }
2680
2681 /**
2682  * ll_rw_block: low-level access to block devices (DEPRECATED)
2683  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2684  * @nr: number of &struct buffer_heads in the array
2685  * @bhs: array of pointers to &struct buffer_head
2686  *
2687  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2688  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2689  * %SWRITE is like %WRITE only we make sure that the *current* data in the
2690  * buffers is sent to disk.  The fourth %READA option is described in the
2691  * documentation for generic_make_request() which ll_rw_block() calls.
2692  *
2693  * This function drops any buffer that it cannot get a lock on (with the
2694  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2695  * clean when doing a write request, and any buffer that appears to be
2696  * up-to-date when doing a read request.  Further it marks as clean buffers
2697  * that are processed for writing (the buffer cache won't assume that they
2698  * are actually clean until the buffer gets unlocked).
2699  *
2700  * ll_rw_block sets b_end_io to a simple completion handler that marks
2701  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2702  * any waiters.
2703  *
2704  * All of the buffers must be for the same device, and their size must be
2705  * a multiple of the current approved size for the device.
2706  */
2707 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2708 {
2709	int i;
2710
2711	for (i = 0; i < nr; i++) {
2712		struct buffer_head *bh = bhs[i];
2713
2714		if (rw == SWRITE)
2715			lock_buffer(bh);
2716		else if (test_set_buffer_locked(bh))
2717			continue;
2718
2719		if (rw == WRITE || rw == SWRITE) {
2720			if (test_clear_buffer_dirty(bh)) {
2721				bh->b_end_io = end_buffer_write_sync;
2722				get_bh(bh);
2723				submit_bh(WRITE, bh);
2724				continue;
2725			}
2726		} else {
2727			if (!buffer_uptodate(bh)) {
2728				bh->b_end_io = end_buffer_read_sync;
2729				get_bh(bh);
2730				submit_bh(rw, bh);
2731				continue;
2732			}
2733		}
2734		unlock_buffer(bh);
2735	}
2736 }
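#if 0
/*
 * Illustrative sketch only, not part of this file: the classic
 * synchronous-read idiom built on ll_rw_block(), for a hypothetical
 * already-mapped buffer.  (New code should prefer submit_bh() directly,
 * as the DEPRECATED tag above suggests.)
 */
static int foofs_read_bh(struct buffer_head *bh)
{
	if (!buffer_uptodate(bh)) {
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			return -EIO;
	}
	return 0;
}
#endif

2737
2738 /*
2739  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2740  * and then start new I/O and then wait upon it.  The caller must have a ref on
2741  * the buffer_head.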
2742 */ 2743int sync_dirty_buffer(struct buffer_head *bh) 2744{ 2745 int ret = 0; 2746 2747 WARN_ON(atomic_read(&bh->b_count) < 1); 2748 lock_buffer(bh); 2749 if (test_clear_buffer_dirty(bh)) { 2750 get_bh(bh); 2751 bh->b_end_io = end_buffer_write_sync; 2752 ret = submit_bh(WRITE, bh); 2753 wait_on_buffer(bh); 2754 if (buffer_eopnotsupp(bh)) { 2755 clear_buffer_eopnotsupp(bh); 2756 ret = -EOPNOTSUPP; 2757 } 2758 if (!ret && !buffer_uptodate(bh)) 2759 ret = -EIO; 2760 } else { 2761 unlock_buffer(bh); 2762 } 2763 return ret; 2764} 2765 2766/* 2767 * try_to_free_buffers() checks if all the buffers on this particular page 2768 * are unused, and releases them if so. 2769 * 2770 * Exclusion against try_to_free_buffers may be obtained by either 2771 * locking the page or by holding its mapping's private_lock. 2772 * 2773 * If the page is dirty but all the buffers are clean then we need to 2774 * be sure to mark the page clean as well. This is because the page 2775 * may be against a block device, and a later reattachment of buffers 2776 * to a dirty page will set *all* buffers dirty. Which would corrupt 2777 * filesystem data on the same device. 2778 * 2779 * The same applies to regular filesystem pages: if all the buffers are 2780 * clean then we set the page clean and proceed. To do that, we require 2781 * total exclusion from __set_page_dirty_buffers(). That is obtained with 2782 * private_lock. 2783 * 2784 * try_to_free_buffers() is non-blocking. 2785 */ 2786static inline int buffer_busy(struct buffer_head *bh) 2787{ 2788 return atomic_read(&bh->b_count) | 2789 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); 2790} 2791 2792static int 2793drop_buffers(struct page *page, struct buffer_head **buffers_to_free) 2794{ 2795 struct buffer_head *head = page_buffers(page); 2796 struct buffer_head *bh; 2797 2798 bh = head; 2799 do { 2800 if (buffer_write_io_error(bh) && page->mapping) 2801 set_bit(AS_EIO, &page->mapping->flags); 2802 if (buffer_busy(bh)) 2803 goto failed; 2804 bh = bh->b_this_page; 2805 } while (bh != head); 2806 2807 do { 2808 struct buffer_head *next = bh->b_this_page; 2809 2810 if (!list_empty(&bh->b_assoc_buffers)) 2811 __remove_assoc_queue(bh); 2812 bh = next; 2813 } while (bh != head); 2814 *buffers_to_free = head; 2815 __clear_page_buffers(page); 2816 return 1; 2817failed: 2818 return 0; 2819} 2820 2821int try_to_free_buffers(struct page *page) 2822{ 2823 struct address_space * const mapping = page->mapping; 2824 struct buffer_head *buffers_to_free = NULL; 2825 int ret = 0; 2826 2827 BUG_ON(!PageLocked(page)); 2828 if (PageWriteback(page)) 2829 return 0; 2830 2831 if (mapping == NULL) { /* can this still happen? */ 2832 ret = drop_buffers(page, &buffers_to_free); 2833 goto out; 2834 } 2835 2836 spin_lock(&mapping->private_lock); 2837 ret = drop_buffers(page, &buffers_to_free); 2838 spin_unlock(&mapping->private_lock); 2839 if (ret) { 2840 /* 2841 * If the filesystem writes its buffers by hand (eg ext3) 2842 * then we can have clean buffers against a dirty page. We 2843 * clean the page here; otherwise later reattachment of buffers 2844 * could encounter a non-uptodate page, which is unresolvable. 2845 * This only applies in the rare case where try_to_free_buffers 2846 * succeeds but the page is not freed. 
2847 */ 2848 clear_page_dirty(page); 2849 } 2850out: 2851 if (buffers_to_free) { 2852 struct buffer_head *bh = buffers_to_free; 2853 2854 do { 2855 struct buffer_head *next = bh->b_this_page; 2856 free_buffer_head(bh); 2857 bh = next; 2858 } while (bh != buffers_to_free); 2859 } 2860 return ret; 2861} 2862EXPORT_SYMBOL(try_to_free_buffers); 2863 2864void block_sync_page(struct page *page) 2865{ 2866 struct address_space *mapping; 2867 2868 smp_mb(); 2869 mapping = page_mapping(page); 2870 if (mapping) 2871 blk_run_backing_dev(mapping->backing_dev_info, page); 2872} 2873 2874/* 2875 * There are no bdflush tunables left. But distributions are 2876 * still running obsolete flush daemons, so we terminate them here. 2877 * 2878 * Use of bdflush() is deprecated and will be removed in a future kernel. 2879 * The `pdflush' kernel threads fully replace bdflush daemons and this call. 2880 */ 2881asmlinkage long sys_bdflush(int func, long data) 2882{ 2883 static int msg_count; 2884 2885 if (!capable(CAP_SYS_ADMIN)) 2886 return -EPERM; 2887 2888 if (msg_count < 5) { 2889 msg_count++; 2890 printk(KERN_INFO 2891 "warning: process `%s' used the obsolete bdflush" 2892 " system call\n", current->comm); 2893 printk(KERN_INFO "Fix your initscripts?\n"); 2894 } 2895 2896 if (func == 1) 2897 do_exit(0); 2898 return 0; 2899} 2900 2901/* 2902 * Buffer-head allocation 2903 */ 2904static kmem_cache_t *bh_cachep; 2905 2906/* 2907 * Once the number of bh's in the machine exceeds this level, we start 2908 * stripping them in writeback. 2909 */ 2910static int max_buffer_heads; 2911 2912int buffer_heads_over_limit; 2913 2914struct bh_accounting { 2915 int nr; /* Number of live bh's */ 2916 int ratelimit; /* Limit cacheline bouncing */ 2917}; 2918 2919static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; 2920 2921static void recalc_bh_state(void) 2922{ 2923 int i; 2924 int tot = 0; 2925 2926 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) 2927 return; 2928 __get_cpu_var(bh_accounting).ratelimit = 0; 2929 for_each_online_cpu(i) 2930 tot += per_cpu(bh_accounting, i).nr; 2931 buffer_heads_over_limit = (tot > max_buffer_heads); 2932} 2933 2934struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 2935{ 2936 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); 2937 if (ret) { 2938 get_cpu_var(bh_accounting).nr++; 2939 recalc_bh_state(); 2940 put_cpu_var(bh_accounting); 2941 } 2942 return ret; 2943} 2944EXPORT_SYMBOL(alloc_buffer_head); 2945 2946void free_buffer_head(struct buffer_head *bh) 2947{ 2948 BUG_ON(!list_empty(&bh->b_assoc_buffers)); 2949 kmem_cache_free(bh_cachep, bh); 2950 get_cpu_var(bh_accounting).nr--; 2951 recalc_bh_state(); 2952 put_cpu_var(bh_accounting); 2953} 2954EXPORT_SYMBOL(free_buffer_head); 2955 2956static void 2957init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags) 2958{ 2959 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 2960 SLAB_CTOR_CONSTRUCTOR) { 2961 struct buffer_head * bh = (struct buffer_head *)data; 2962 2963 memset(bh, 0, sizeof(*bh)); 2964 INIT_LIST_HEAD(&bh->b_assoc_buffers); 2965 } 2966} 2967 2968#ifdef CONFIG_HOTPLUG_CPU 2969static void buffer_exit_cpu(int cpu) 2970{ 2971 int i; 2972 struct bh_lru *b = &per_cpu(bh_lrus, cpu); 2973 2974 for (i = 0; i < BH_LRU_SIZE; i++) { 2975 brelse(b->bhs[i]); 2976 b->bhs[i] = NULL; 2977 } 2978 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; 2979 per_cpu(bh_accounting, cpu).nr = 0; 2980 put_cpu_var(bh_accounting); 2981} 2982 2983static int buffer_cpu_notify(struct 
notifier_block *self, 2984 unsigned long action, void *hcpu) 2985{ 2986 if (action == CPU_DEAD) 2987 buffer_exit_cpu((unsigned long)hcpu); 2988 return NOTIFY_OK; 2989} 2990#endif /* CONFIG_HOTPLUG_CPU */ 2991 2992void __init buffer_init(void) 2993{ 2994 int nrpages; 2995 2996 bh_cachep = kmem_cache_create("buffer_head", 2997 sizeof(struct buffer_head), 0, 2998 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 2999 SLAB_MEM_SPREAD), 3000 init_buffer_head, 3001 NULL); 3002 3003 /* 3004 * Limit the bh occupancy to 10% of ZONE_NORMAL 3005 */ 3006 nrpages = (nr_free_buffer_pages() * 10) / 100; 3007 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); 3008 hotcpu_notifier(buffer_cpu_notify, 0); 3009} 3010 3011EXPORT_SYMBOL(__bforget); 3012EXPORT_SYMBOL(__brelse); 3013EXPORT_SYMBOL(__wait_on_buffer); 3014EXPORT_SYMBOL(block_commit_write); 3015EXPORT_SYMBOL(block_prepare_write); 3016EXPORT_SYMBOL(block_read_full_page); 3017EXPORT_SYMBOL(block_sync_page); 3018EXPORT_SYMBOL(block_truncate_page); 3019EXPORT_SYMBOL(block_write_full_page); 3020EXPORT_SYMBOL(cont_prepare_write); 3021EXPORT_SYMBOL(end_buffer_read_sync); 3022EXPORT_SYMBOL(end_buffer_write_sync); 3023EXPORT_SYMBOL(file_fsync); 3024EXPORT_SYMBOL(fsync_bdev); 3025EXPORT_SYMBOL(generic_block_bmap); 3026EXPORT_SYMBOL(generic_commit_write); 3027EXPORT_SYMBOL(generic_cont_expand); 3028EXPORT_SYMBOL(generic_cont_expand_simple); 3029EXPORT_SYMBOL(init_buffer); 3030EXPORT_SYMBOL(invalidate_bdev); 3031EXPORT_SYMBOL(ll_rw_block); 3032EXPORT_SYMBOL(mark_buffer_dirty); 3033EXPORT_SYMBOL(submit_bh); 3034EXPORT_SYMBOL(sync_dirty_buffer); 3035EXPORT_SYMBOL(unlock_buffer);
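#if 0
/*
 * Illustrative sketch only, not part of this file: how a filesystem might
 * rewrite one metadata block through the API exported above.  sb_bread()
 * and brelse() come from <linux/buffer_head.h>; the block number and
 * layout are hypothetical, and len is assumed to fit in one block.
 */
static int foofs_update_super(struct super_block *sb, const void *data,
				size_t len)
{
	struct buffer_head *bh;
	int err;

	bh = sb_bread(sb, 0);		/* read block 0, the superblock */
	if (!bh)
		return -EIO;
	memcpy(bh->b_data, data, len);
	mark_buffer_dirty(bh);
	err = sync_dirty_buffer(bh);	/* wait: this is an integrity write */
	brelse(bh);
	return err;
}
#endif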