fs/reiserfs/file.c at v2.6.13 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / reiserfs / file.c
at v2.6.13 1569 lines 54 kB view raw
   1/*
   2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
   3 */
   4
   5#include <linux/time.h>
   6#include <linux/reiserfs_fs.h>
   7#include <linux/reiserfs_acl.h>
   8#include <linux/reiserfs_xattr.h>
   9#include <linux/smp_lock.h>
  10#include <asm/uaccess.h>
  11#include <linux/pagemap.h>
  12#include <linux/swap.h>
  13#include <linux/writeback.h>
  14#include <linux/blkdev.h>
  15#include <linux/buffer_head.h>
  16#include <linux/quotaops.h>
  17
  18/*
  19** We pack the tails of files on file close, not at the time they are written.
  20** This implies an unnecessary copy of the tail and an unnecessary indirect item
  21** insertion/balancing, for files that are written in one write.
  22** It avoids unnecessary tail packings (balances) for files that are written in
  23** multiple writes and are small enough to have tails.
  24** 
  25** file_release is called by the VFS layer when the file is closed.  If
  26** this is the last open file descriptor, and the file
  27** small enough to have a tail, and the tail is currently in an
  28** unformatted node, the tail is converted back into a direct item.
  29** 
  30** We use reiserfs_truncate_file to pack the tail, since it already has
  31** all the conditions coded.  
  32*/
  33static int reiserfs_file_release(struct inode *inode, struct file *filp)
  34{
  35
  36	struct reiserfs_transaction_handle th;
  37	int err;
  38	int jbegin_failure = 0;
  39
  40	if (!S_ISREG(inode->i_mode))
  41		BUG();
  42
  43	/* fast out for when nothing needs to be done */
  44	if ((atomic_read(&inode->i_count) > 1 ||
  45	     !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
  46	     !tail_has_to_be_packed(inode)) &&
  47	    REISERFS_I(inode)->i_prealloc_count <= 0) {
  48		return 0;
  49	}
  50
  51	reiserfs_write_lock(inode->i_sb);
  52	down(&inode->i_sem);
  53	/* freeing preallocation only involves relogging blocks that
  54	 * are already in the current transaction.  preallocation gets
  55	 * freed at the end of each transaction, so it is impossible for
  56	 * us to log any additional blocks (including quota blocks)
  57	 */
  58	err = journal_begin(&th, inode->i_sb, 1);
  59	if (err) {
  60		/* uh oh, we can't allow the inode to go away while there
  61		 * is still preallocation blocks pending.  Try to join the
  62		 * aborted transaction
  63		 */
  64		jbegin_failure = err;
  65		err = journal_join_abort(&th, inode->i_sb, 1);
  66
  67		if (err) {
  68			/* hmpf, our choices here aren't good.  We can pin the inode
  69			 * which will disallow unmount from every happening, we can
  70			 * do nothing, which will corrupt random memory on unmount,
  71			 * or we can forcibly remove the file from the preallocation
  72			 * list, which will leak blocks on disk.  Lets pin the inode
  73			 * and let the admin know what is going on.
  74			 */
  75			igrab(inode);
  76			reiserfs_warning(inode->i_sb,
  77					 "pinning inode %lu because the "
  78					 "preallocation can't be freed");
  79			goto out;
  80		}
  81	}
  82	reiserfs_update_inode_transaction(inode);
  83
  84#ifdef REISERFS_PREALLOCATE
  85	reiserfs_discard_prealloc(&th, inode);
  86#endif
  87	err = journal_end(&th, inode->i_sb, 1);
  88
  89	/* copy back the error code from journal_begin */
  90	if (!err)
  91		err = jbegin_failure;
  92
  93	if (!err && atomic_read(&inode->i_count) <= 1 &&
  94	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
  95	    tail_has_to_be_packed(inode)) {
  96		/* if regular file is released by last holder and it has been
  97		   appended (we append by unformatted node only) or its direct
  98		   item(s) had to be converted, then it may have to be
  99		   indirect2direct converted */
 100		err = reiserfs_truncate_file(inode, 0);
 101	}
 102      out:
 103	up(&inode->i_sem);
 104	reiserfs_write_unlock(inode->i_sb);
 105	return err;
 106}
 107
 108static void reiserfs_vfs_truncate_file(struct inode *inode)
 109{
 110	reiserfs_truncate_file(inode, 1);
 111}
 112
 113/* Sync a reiserfs file. */
 114
 115/*
 116 * FIXME: sync_mapping_buffers() never has anything to sync.  Can
 117 * be removed...
 118 */
 119
 120static int reiserfs_sync_file(struct file *p_s_filp,
 121			      struct dentry *p_s_dentry, int datasync)
 122{
 123	struct inode *p_s_inode = p_s_dentry->d_inode;
 124	int n_err;
 125	int barrier_done;
 126
 127	if (!S_ISREG(p_s_inode->i_mode))
 128		BUG();
 129	n_err = sync_mapping_buffers(p_s_inode->i_mapping);
 130	reiserfs_write_lock(p_s_inode->i_sb);
 131	barrier_done = reiserfs_commit_for_inode(p_s_inode);
 132	reiserfs_write_unlock(p_s_inode->i_sb);
 133	if (barrier_done != 1)
 134		blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
 135	if (barrier_done < 0)
 136		return barrier_done;
 137	return (n_err < 0) ? -EIO : 0;
 138}
 139
 140/* I really do not want to play with memory shortage right now, so
 141   to simplify the code, we are not going to write more than this much pages at
 142   a time. This still should considerably improve performance compared to 4k
 143   at a time case. This is 32 pages of 4k size. */
 144#define REISERFS_WRITE_PAGES_AT_A_TIME (128 * 1024) / PAGE_CACHE_SIZE
 145
 146/* Allocates blocks for a file to fulfil write request.
 147   Maps all unmapped but prepared pages from the list.
 148   Updates metadata with newly allocated blocknumbers as needed */
 149static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handle *th, struct inode *inode,	/* Inode we work with */
 150					       loff_t pos,	/* Writing position */
 151					       int num_pages,	/* number of pages write going
 152								   to touch */
 153					       int write_bytes,	/* amount of bytes to write */
 154					       struct page **prepared_pages,	/* array of
 155										   prepared pages
 156										 */
 157					       int blocks_to_allocate	/* Amount of blocks we
 158									   need to allocate to
 159									   fit the data into file
 160									 */
 161    )
 162{
 163	struct cpu_key key;	// cpu key of item that we are going to deal with
 164	struct item_head *ih;	// pointer to item head that we are going to deal with
 165	struct buffer_head *bh;	// Buffer head that contains items that we are going to deal with
 166	__le32 *item;		// pointer to item we are going to deal with
 167	INITIALIZE_PATH(path);	// path to item, that we are going to deal with.
 168	b_blocknr_t *allocated_blocks;	// Pointer to a place where allocated blocknumbers would be stored.
 169	reiserfs_blocknr_hint_t hint;	// hint structure for block allocator.
 170	size_t res;		// return value of various functions that we call.
 171	int curr_block;		// current block used to keep track of unmapped blocks.
 172	int i;			// loop counter
 173	int itempos;		// position in item
 174	unsigned int from = (pos & (PAGE_CACHE_SIZE - 1));	// writing position in
 175	// first page
 176	unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;	/* last modified byte offset in last page */
 177	__u64 hole_size;	// amount of blocks for a file hole, if it needed to be created.
 178	int modifying_this_item = 0;	// Flag for items traversal code to keep track
 179	// of the fact that we already prepared
 180	// current block for journal
 181	int will_prealloc = 0;
 182	RFALSE(!blocks_to_allocate,
 183	       "green-9004: tried to allocate zero blocks?");
 184
 185	/* only preallocate if this is a small write */
 186	if (REISERFS_I(inode)->i_prealloc_count ||
 187	    (!(write_bytes & (inode->i_sb->s_blocksize - 1)) &&
 188	     blocks_to_allocate <
 189	     REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
 190		will_prealloc =
 191		    REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;
 192
 193	allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
 194				   sizeof(b_blocknr_t), GFP_NOFS);
 195
 196	/* First we compose a key to point at the writing position, we want to do
 197	   that outside of any locking region. */
 198	make_cpu_key(&key, inode, pos + 1, TYPE_ANY, 3 /*key length */ );
 199
 200	/* If we came here, it means we absolutely need to open a transaction,
 201	   since we need to allocate some blocks */
 202	reiserfs_write_lock(inode->i_sb);	// Journaling stuff and we need that.
 203	res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));	// Wish I know if this number enough
 204	if (res)
 205		goto error_exit;
 206	reiserfs_update_inode_transaction(inode);
 207
 208	/* Look for the in-tree position of our write, need path for block allocator */
 209	res = search_for_position_by_key(inode->i_sb, &key, &path);
 210	if (res == IO_ERROR) {
 211		res = -EIO;
 212		goto error_exit;
 213	}
 214
 215	/* Allocate blocks */
 216	/* First fill in "hint" structure for block allocator */
 217	hint.th = th;		// transaction handle.
 218	hint.path = &path;	// Path, so that block allocator can determine packing locality or whatever it needs to determine.
 219	hint.inode = inode;	// Inode is needed by block allocator too.
 220	hint.search_start = 0;	// We have no hint on where to search free blocks for block allocator.
 221	hint.key = key.on_disk_key;	// on disk key of file.
 222	hint.block = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);	// Number of disk blocks this file occupies already.
 223	hint.formatted_node = 0;	// We are allocating blocks for unformatted node.
 224	hint.preallocate = will_prealloc;
 225
 226	/* Call block allocator to allocate blocks */
 227	res =
 228	    reiserfs_allocate_blocknrs(&hint, allocated_blocks,
 229				       blocks_to_allocate, blocks_to_allocate);
 230	if (res != CARRY_ON) {
 231		if (res == NO_DISK_SPACE) {
 232			/* We flush the transaction in case of no space. This way some
 233			   blocks might become free */
 234			SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
 235			res = restart_transaction(th, inode, &path);
 236			if (res)
 237				goto error_exit;
 238
 239			/* We might have scheduled, so search again */
 240			res =
 241			    search_for_position_by_key(inode->i_sb, &key,
 242						       &path);
 243			if (res == IO_ERROR) {
 244				res = -EIO;
 245				goto error_exit;
 246			}
 247
 248			/* update changed info for hint structure. */
 249			res =
 250			    reiserfs_allocate_blocknrs(&hint, allocated_blocks,
 251						       blocks_to_allocate,
 252						       blocks_to_allocate);
 253			if (res != CARRY_ON) {
 254				res = -ENOSPC;
 255				pathrelse(&path);
 256				goto error_exit;
 257			}
 258		} else {
 259			res = -ENOSPC;
 260			pathrelse(&path);
 261			goto error_exit;
 262		}
 263	}
 264#ifdef __BIG_ENDIAN
 265	// Too bad, I have not found any way to convert a given region from
 266	// cpu format to little endian format
 267	{
 268		int i;
 269		for (i = 0; i < blocks_to_allocate; i++)
 270			allocated_blocks[i] = cpu_to_le32(allocated_blocks[i]);
 271	}
 272#endif
 273
 274	/* Blocks allocating well might have scheduled and tree might have changed,
 275	   let's search the tree again */
 276	/* find where in the tree our write should go */
 277	res = search_for_position_by_key(inode->i_sb, &key, &path);
 278	if (res == IO_ERROR) {
 279		res = -EIO;
 280		goto error_exit_free_blocks;
 281	}
 282
 283	bh = get_last_bh(&path);	// Get a bufferhead for last element in path.
 284	ih = get_ih(&path);	// Get a pointer to last item head in path.
 285	item = get_item(&path);	// Get a pointer to last item in path
 286
 287	/* Let's see what we have found */
 288	if (res != POSITION_FOUND) {	/* position not found, this means that we
 289					   might need to append file with holes
 290					   first */
 291		// Since we are writing past the file's end, we need to find out if
 292		// there is a hole that needs to be inserted before our writing
 293		// position, and how many blocks it is going to cover (we need to
 294		//  populate pointers to file blocks representing the hole with zeros)
 295
 296		{
 297			int item_offset = 1;
 298			/*
 299			 * if ih is stat data, its offset is 0 and we don't want to
 300			 * add 1 to pos in the hole_size calculation
 301			 */
 302			if (is_statdata_le_ih(ih))
 303				item_offset = 0;
 304			hole_size = (pos + item_offset -
 305				     (le_key_k_offset
 306				      (get_inode_item_key_version(inode),
 307				       &(ih->ih_key)) + op_bytes_number(ih,
 308									inode->
 309									i_sb->
 310									s_blocksize)))
 311			    >> inode->i_sb->s_blocksize_bits;
 312		}
 313
 314		if (hole_size > 0) {
 315			int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize) / UNFM_P_SIZE);	// How much data to insert first time.
 316			/* area filled with zeroes, to supply as list of zero blocknumbers
 317			   We allocate it outside of loop just in case loop would spin for
 318			   several iterations. */
 319			char *zeros = kmalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC);	// We cannot insert more than MAX_ITEM_LEN bytes anyway.
 320			if (!zeros) {
 321				res = -ENOMEM;
 322				goto error_exit_free_blocks;
 323			}
 324			memset(zeros, 0, to_paste * UNFM_P_SIZE);
 325			do {
 326				to_paste =
 327				    min_t(__u64, hole_size,
 328					  MAX_ITEM_LEN(inode->i_sb->
 329						       s_blocksize) /
 330					  UNFM_P_SIZE);
 331				if (is_indirect_le_ih(ih)) {
 332					/* Ok, there is existing indirect item already. Need to append it */
 333					/* Calculate position past inserted item */
 334					make_cpu_key(&key, inode,
 335						     le_key_k_offset
 336						     (get_inode_item_key_version
 337						      (inode),
 338						      &(ih->ih_key)) +
 339						     op_bytes_number(ih,
 340								     inode->
 341								     i_sb->
 342								     s_blocksize),
 343						     TYPE_INDIRECT, 3);
 344					res =
 345					    reiserfs_paste_into_item(th, &path,
 346								     &key,
 347								     inode,
 348								     (char *)
 349								     zeros,
 350								     UNFM_P_SIZE
 351								     *
 352								     to_paste);
 353					if (res) {
 354						kfree(zeros);
 355						goto error_exit_free_blocks;
 356					}
 357				} else if (is_statdata_le_ih(ih)) {
 358					/* No existing item, create it */
 359					/* item head for new item */
 360					struct item_head ins_ih;
 361
 362					/* create a key for our new item */
 363					make_cpu_key(&key, inode, 1,
 364						     TYPE_INDIRECT, 3);
 365
 366					/* Create new item head for our new item */
 367					make_le_item_head(&ins_ih, &key,
 368							  key.version, 1,
 369							  TYPE_INDIRECT,
 370							  to_paste *
 371							  UNFM_P_SIZE,
 372							  0 /* free space */ );
 373
 374					/* Find where such item should live in the tree */
 375					res =
 376					    search_item(inode->i_sb, &key,
 377							&path);
 378					if (res != ITEM_NOT_FOUND) {
 379						/* item should not exist, otherwise we have error */
 380						if (res != -ENOSPC) {
 381							reiserfs_warning(inode->
 382									 i_sb,
 383									 "green-9008: search_by_key (%K) returned %d",
 384									 &key,
 385									 res);
 386						}
 387						res = -EIO;
 388						kfree(zeros);
 389						goto error_exit_free_blocks;
 390					}
 391					res =
 392					    reiserfs_insert_item(th, &path,
 393								 &key, &ins_ih,
 394								 inode,
 395								 (char *)zeros);
 396				} else {
 397					reiserfs_panic(inode->i_sb,
 398						       "green-9011: Unexpected key type %K\n",
 399						       &key);
 400				}
 401				if (res) {
 402					kfree(zeros);
 403					goto error_exit_free_blocks;
 404				}
 405				/* Now we want to check if transaction is too full, and if it is
 406				   we restart it. This will also free the path. */
 407				if (journal_transaction_should_end
 408				    (th, th->t_blocks_allocated)) {
 409					res =
 410					    restart_transaction(th, inode,
 411								&path);
 412					if (res) {
 413						pathrelse(&path);
 414						kfree(zeros);
 415						goto error_exit;
 416					}
 417				}
 418
 419				/* Well, need to recalculate path and stuff */
 420				set_cpu_key_k_offset(&key,
 421						     cpu_key_k_offset(&key) +
 422						     (to_paste << inode->
 423						      i_blkbits));
 424				res =
 425				    search_for_position_by_key(inode->i_sb,
 426							       &key, &path);
 427				if (res == IO_ERROR) {
 428					res = -EIO;
 429					kfree(zeros);
 430					goto error_exit_free_blocks;
 431				}
 432				bh = get_last_bh(&path);
 433				ih = get_ih(&path);
 434				item = get_item(&path);
 435				hole_size -= to_paste;
 436			} while (hole_size);
 437			kfree(zeros);
 438		}
 439	}
 440	// Go through existing indirect items first
 441	// replace all zeroes with blocknumbers from list
 442	// Note that if no corresponding item was found, by previous search,
 443	// it means there are no existing in-tree representation for file area
 444	// we are going to overwrite, so there is nothing to scan through for holes.
 445	for (curr_block = 0, itempos = path.pos_in_item;
 446	     curr_block < blocks_to_allocate && res == POSITION_FOUND;) {
 447	      retry:
 448
 449		if (itempos >= ih_item_len(ih) / UNFM_P_SIZE) {
 450			/* We run out of data in this indirect item, let's look for another
 451			   one. */
 452			/* First if we are already modifying current item, log it */
 453			if (modifying_this_item) {
 454				journal_mark_dirty(th, inode->i_sb, bh);
 455				modifying_this_item = 0;
 456			}
 457			/* Then set the key to look for a new indirect item (offset of old
 458			   item is added to old item length */
 459			set_cpu_key_k_offset(&key,
 460					     le_key_k_offset
 461					     (get_inode_item_key_version(inode),
 462					      &(ih->ih_key)) +
 463					     op_bytes_number(ih,
 464							     inode->i_sb->
 465							     s_blocksize));
 466			/* Search ofor position of new key in the tree. */
 467			res =
 468			    search_for_position_by_key(inode->i_sb, &key,
 469						       &path);
 470			if (res == IO_ERROR) {
 471				res = -EIO;
 472				goto error_exit_free_blocks;
 473			}
 474			bh = get_last_bh(&path);
 475			ih = get_ih(&path);
 476			item = get_item(&path);
 477			itempos = path.pos_in_item;
 478			continue;	// loop to check all kinds of conditions and so on.
 479		}
 480		/* Ok, we have correct position in item now, so let's see if it is
 481		   representing file hole (blocknumber is zero) and fill it if needed */
 482		if (!item[itempos]) {
 483			/* Ok, a hole. Now we need to check if we already prepared this
 484			   block to be journaled */
 485			while (!modifying_this_item) {	// loop until succeed
 486				/* Well, this item is not journaled yet, so we must prepare
 487				   it for journal first, before we can change it */
 488				struct item_head tmp_ih;	// We copy item head of found item,
 489				// here to detect if fs changed under
 490				// us while we were preparing for
 491				// journal.
 492				int fs_gen;	// We store fs generation here to find if someone
 493				// changes fs under our feet
 494
 495				copy_item_head(&tmp_ih, ih);	// Remember itemhead
 496				fs_gen = get_generation(inode->i_sb);	// remember fs generation
 497				reiserfs_prepare_for_journal(inode->i_sb, bh, 1);	// Prepare a buffer within which indirect item is stored for changing.
 498				if (fs_changed(fs_gen, inode->i_sb)
 499				    && item_moved(&tmp_ih, &path)) {
 500					// Sigh, fs was changed under us, we need to look for new
 501					// location of item we are working with
 502
 503					/* unmark prepaerd area as journaled and search for it's
 504					   new position */
 505					reiserfs_restore_prepared_buffer(inode->
 506									 i_sb,
 507									 bh);
 508					res =
 509					    search_for_position_by_key(inode->
 510								       i_sb,
 511								       &key,
 512								       &path);
 513					if (res == IO_ERROR) {
 514						res = -EIO;
 515						goto error_exit_free_blocks;
 516					}
 517					bh = get_last_bh(&path);
 518					ih = get_ih(&path);
 519					item = get_item(&path);
 520					itempos = path.pos_in_item;
 521					goto retry;
 522				}
 523				modifying_this_item = 1;
 524			}
 525			item[itempos] = allocated_blocks[curr_block];	// Assign new block
 526			curr_block++;
 527		}
 528		itempos++;
 529	}
 530
 531	if (modifying_this_item) {	// We need to log last-accessed block, if it
 532		// was modified, but not logged yet.
 533		journal_mark_dirty(th, inode->i_sb, bh);
 534	}
 535
 536	if (curr_block < blocks_to_allocate) {
 537		// Oh, well need to append to indirect item, or to create indirect item
 538		// if there weren't any
 539		if (is_indirect_le_ih(ih)) {
 540			// Existing indirect item - append. First calculate key for append
 541			// position. We do not need to recalculate path as it should
 542			// already point to correct place.
 543			make_cpu_key(&key, inode,
 544				     le_key_k_offset(get_inode_item_key_version
 545						     (inode),
 546						     &(ih->ih_key)) +
 547				     op_bytes_number(ih,
 548						     inode->i_sb->s_blocksize),
 549				     TYPE_INDIRECT, 3);
 550			res =
 551			    reiserfs_paste_into_item(th, &path, &key, inode,
 552						     (char *)(allocated_blocks +
 553							      curr_block),
 554						     UNFM_P_SIZE *
 555						     (blocks_to_allocate -
 556						      curr_block));
 557			if (res) {
 558				goto error_exit_free_blocks;
 559			}
 560		} else if (is_statdata_le_ih(ih)) {
 561			// Last found item was statdata. That means we need to create indirect item.
 562			struct item_head ins_ih;	/* itemhead for new item */
 563
 564			/* create a key for our new item */
 565			make_cpu_key(&key, inode, 1, TYPE_INDIRECT, 3);	// Position one,
 566			// because that's
 567			// where first
 568			// indirect item
 569			// begins
 570			/* Create new item head for our new item */
 571			make_le_item_head(&ins_ih, &key, key.version, 1,
 572					  TYPE_INDIRECT,
 573					  (blocks_to_allocate -
 574					   curr_block) * UNFM_P_SIZE,
 575					  0 /* free space */ );
 576			/* Find where such item should live in the tree */
 577			res = search_item(inode->i_sb, &key, &path);
 578			if (res != ITEM_NOT_FOUND) {
 579				/* Well, if we have found such item already, or some error
 580				   occured, we need to warn user and return error */
 581				if (res != -ENOSPC) {
 582					reiserfs_warning(inode->i_sb,
 583							 "green-9009: search_by_key (%K) "
 584							 "returned %d", &key,
 585							 res);
 586				}
 587				res = -EIO;
 588				goto error_exit_free_blocks;
 589			}
 590			/* Insert item into the tree with the data as its body */
 591			res =
 592			    reiserfs_insert_item(th, &path, &key, &ins_ih,
 593						 inode,
 594						 (char *)(allocated_blocks +
 595							  curr_block));
 596		} else {
 597			reiserfs_panic(inode->i_sb,
 598				       "green-9010: unexpected item type for key %K\n",
 599				       &key);
 600		}
 601	}
 602	// the caller is responsible for closing the transaction
 603	// unless we return an error, they are also responsible for logging
 604	// the inode.
 605	//
 606	pathrelse(&path);
 607	/*
 608	 * cleanup prellocation from previous writes
 609	 * if this is a partial block write
 610	 */
 611	if (write_bytes & (inode->i_sb->s_blocksize - 1))
 612		reiserfs_discard_prealloc(th, inode);
 613	reiserfs_write_unlock(inode->i_sb);
 614
 615	// go through all the pages/buffers and map the buffers to newly allocated
 616	// blocks (so that system knows where to write these pages later).
 617	curr_block = 0;
 618	for (i = 0; i < num_pages; i++) {
 619		struct page *page = prepared_pages[i];	//current page
 620		struct buffer_head *head = page_buffers(page);	// first buffer for a page
 621		int block_start, block_end;	// in-page offsets for buffers.
 622
 623		if (!page_buffers(page))
 624			reiserfs_panic(inode->i_sb,
 625				       "green-9005: No buffers for prepared page???");
 626
 627		/* For each buffer in page */
 628		for (bh = head, block_start = 0; bh != head || !block_start;
 629		     block_start = block_end, bh = bh->b_this_page) {
 630			if (!bh)
 631				reiserfs_panic(inode->i_sb,
 632					       "green-9006: Allocated but absent buffer for a page?");
 633			block_end = block_start + inode->i_sb->s_blocksize;
 634			if (i == 0 && block_end <= from)
 635				/* if this buffer is before requested data to map, skip it */
 636				continue;
 637			if (i == num_pages - 1 && block_start >= to)
 638				/* If this buffer is after requested data to map, abort
 639				   processing of current page */
 640				break;
 641
 642			if (!buffer_mapped(bh)) {	// Ok, unmapped buffer, need to map it
 643				map_bh(bh, inode->i_sb,
 644				       le32_to_cpu(allocated_blocks
 645						   [curr_block]));
 646				curr_block++;
 647				set_buffer_new(bh);
 648			}
 649		}
 650	}
 651
 652	RFALSE(curr_block > blocks_to_allocate,
 653	       "green-9007: Used too many blocks? weird");
 654
 655	kfree(allocated_blocks);
 656	return 0;
 657
 658// Need to deal with transaction here.
 659      error_exit_free_blocks:
 660	pathrelse(&path);
 661	// free blocks
 662	for (i = 0; i < blocks_to_allocate; i++)
 663		reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]),
 664				    1);
 665
 666      error_exit:
 667	if (th->t_trans_id) {
 668		int err;
 669		// update any changes we made to blk count
 670		reiserfs_update_sd(th, inode);
 671		err =
 672		    journal_end(th, inode->i_sb,
 673				JOURNAL_PER_BALANCE_CNT * 3 + 1 +
 674				2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
 675		if (err)
 676			res = err;
 677	}
 678	reiserfs_write_unlock(inode->i_sb);
 679	kfree(allocated_blocks);
 680
 681	return res;
 682}
 683
 684/* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
 685static void reiserfs_unprepare_pages(struct page **prepared_pages,	/* list of locked pages */
 686				     size_t num_pages /* amount of pages */ )
 687{
 688	int i;			// loop counter
 689
 690	for (i = 0; i < num_pages; i++) {
 691		struct page *page = prepared_pages[i];
 692
 693		try_to_free_buffers(page);
 694		unlock_page(page);
 695		page_cache_release(page);
 696	}
 697}
 698
 699/* This function will copy data from userspace to specified pages within
 700   supplied byte range */
 701static int reiserfs_copy_from_user_to_file_region(loff_t pos,	/* In-file position */
 702						  int num_pages,	/* Number of pages affected */
 703						  int write_bytes,	/* Amount of bytes to write */
 704						  struct page **prepared_pages,	/* pointer to 
 705										   array to
 706										   prepared pages
 707										 */
 708						  const char __user * buf	/* Pointer to user-supplied
 709										   data */
 710    )
 711{
 712	long page_fault = 0;	// status of copy_from_user.
 713	int i;			// loop counter.
 714	int offset;		// offset in page
 715
 716	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
 717	     i++, offset = 0) {
 718		size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes);	// How much of bytes to write to this page
 719		struct page *page = prepared_pages[i];	// Current page we process.
 720
 721		fault_in_pages_readable(buf, count);
 722
 723		/* Copy data from userspace to the current page */
 724		kmap(page);
 725		page_fault = __copy_from_user(page_address(page) + offset, buf, count);	// Copy the data.
 726		/* Flush processor's dcache for this page */
 727		flush_dcache_page(page);
 728		kunmap(page);
 729		buf += count;
 730		write_bytes -= count;
 731
 732		if (page_fault)
 733			break;	// Was there a fault? abort.
 734	}
 735
 736	return page_fault ? -EFAULT : 0;
 737}
 738
 739/* taken fs/buffer.c:__block_commit_write */
 740int reiserfs_commit_page(struct inode *inode, struct page *page,
 741			 unsigned from, unsigned to)
 742{
 743	unsigned block_start, block_end;
 744	int partial = 0;
 745	unsigned blocksize;
 746	struct buffer_head *bh, *head;
 747	unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
 748	int new;
 749	int logit = reiserfs_file_data_log(inode);
 750	struct super_block *s = inode->i_sb;
 751	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
 752	struct reiserfs_transaction_handle th;
 753	int ret = 0;
 754
 755	th.t_trans_id = 0;
 756	blocksize = 1 << inode->i_blkbits;
 757
 758	if (logit) {
 759		reiserfs_write_lock(s);
 760		ret = journal_begin(&th, s, bh_per_page + 1);
 761		if (ret)
 762			goto drop_write_lock;
 763		reiserfs_update_inode_transaction(inode);
 764	}
 765	for (bh = head = page_buffers(page), block_start = 0;
 766	     bh != head || !block_start;
 767	     block_start = block_end, bh = bh->b_this_page) {
 768
 769		new = buffer_new(bh);
 770		clear_buffer_new(bh);
 771		block_end = block_start + blocksize;
 772		if (block_end <= from || block_start >= to) {
 773			if (!buffer_uptodate(bh))
 774				partial = 1;
 775		} else {
 776			set_buffer_uptodate(bh);
 777			if (logit) {
 778				reiserfs_prepare_for_journal(s, bh, 1);
 779				journal_mark_dirty(&th, s, bh);
 780			} else if (!buffer_dirty(bh)) {
 781				mark_buffer_dirty(bh);
 782				/* do data=ordered on any page past the end
 783				 * of file and any buffer marked BH_New.
 784				 */
 785				if (reiserfs_data_ordered(inode->i_sb) &&
 786				    (new || page->index >= i_size_index)) {
 787					reiserfs_add_ordered_list(inode, bh);
 788				}
 789			}
 790		}
 791	}
 792	if (logit) {
 793		ret = journal_end(&th, s, bh_per_page + 1);
 794	      drop_write_lock:
 795		reiserfs_write_unlock(s);
 796	}
 797	/*
 798	 * If this is a partial write which happened to make all buffers
 799	 * uptodate then we can optimize away a bogus readpage() for
 800	 * the next read(). Here we 'discover' whether the page went
 801	 * uptodate as a result of this (potentially partial) write.
 802	 */
 803	if (!partial)
 804		SetPageUptodate(page);
 805	return ret;
 806}
 807
 808/* Submit pages for write. This was separated from actual file copying
 809   because we might want to allocate block numbers in-between.
 810   This function assumes that caller will adjust file size to correct value. */
 811static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_handle *th, struct inode *inode, loff_t pos,	/* Writing position offset */
 812						 size_t num_pages,	/* Number of pages to write */
 813						 size_t write_bytes,	/* number of bytes to write */
 814						 struct page **prepared_pages	/* list of pages */
 815    )
 816{
 817	int status;		// return status of block_commit_write.
 818	int retval = 0;		// Return value we are going to return.
 819	int i;			// loop counter
 820	int offset;		// Writing offset in page.
 821	int orig_write_bytes = write_bytes;
 822	int sd_update = 0;
 823
 824	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
 825	     i++, offset = 0) {
 826		int count = min_t(int, PAGE_CACHE_SIZE - offset, write_bytes);	// How much of bytes to write to this page
 827		struct page *page = prepared_pages[i];	// Current page we process.
 828
 829		status =
 830		    reiserfs_commit_page(inode, page, offset, offset + count);
 831		if (status)
 832			retval = status;	// To not overcomplicate matters We are going to
 833		// submit all the pages even if there was error.
 834		// we only remember error status to report it on
 835		// exit.
 836		write_bytes -= count;
 837	}
 838	/* now that we've gotten all the ordered buffers marked dirty,
 839	 * we can safely update i_size and close any running transaction
 840	 */
 841	if (pos + orig_write_bytes > inode->i_size) {
 842		inode->i_size = pos + orig_write_bytes;	// Set new size
 843		/* If the file have grown so much that tail packing is no
 844		 * longer possible, reset "need to pack" flag */
 845		if ((have_large_tails(inode->i_sb) &&
 846		     inode->i_size > i_block_size(inode) * 4) ||
 847		    (have_small_tails(inode->i_sb) &&
 848		     inode->i_size > i_block_size(inode)))
 849			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
 850		else if ((have_large_tails(inode->i_sb) &&
 851			  inode->i_size < i_block_size(inode) * 4) ||
 852			 (have_small_tails(inode->i_sb) &&
 853			  inode->i_size < i_block_size(inode)))
 854			REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
 855
 856		if (th->t_trans_id) {
 857			reiserfs_write_lock(inode->i_sb);
 858			reiserfs_update_sd(th, inode);	// And update on-disk metadata
 859			reiserfs_write_unlock(inode->i_sb);
 860		} else
 861			inode->i_sb->s_op->dirty_inode(inode);
 862
 863		sd_update = 1;
 864	}
 865	if (th->t_trans_id) {
 866		reiserfs_write_lock(inode->i_sb);
 867		if (!sd_update)
 868			reiserfs_update_sd(th, inode);
 869		status = journal_end(th, th->t_super, th->t_blocks_allocated);
 870		if (status)
 871			retval = status;
 872		reiserfs_write_unlock(inode->i_sb);
 873	}
 874	th->t_trans_id = 0;
 875
 876	/* 
 877	 * we have to unlock the pages after updating i_size, otherwise
 878	 * we race with writepage
 879	 */
 880	for (i = 0; i < num_pages; i++) {
 881		struct page *page = prepared_pages[i];
 882		unlock_page(page);
 883		mark_page_accessed(page);
 884		page_cache_release(page);
 885	}
 886	return retval;
 887}
 888
 889/* Look if passed writing region is going to touch file's tail
 890   (if it is present). And if it is, convert the tail to unformatted node */
 891static int reiserfs_check_for_tail_and_convert(struct inode *inode,	/* inode to deal with */
 892					       loff_t pos,	/* Writing position */
 893					       int write_bytes	/* amount of bytes to write */
 894    )
 895{
 896	INITIALIZE_PATH(path);	// needed for search_for_position
 897	struct cpu_key key;	// Key that would represent last touched writing byte.
 898	struct item_head *ih;	// item header of found block;
 899	int res;		// Return value of various functions we call.
 900	int cont_expand_offset;	// We will put offset for generic_cont_expand here
 901	// This can be int just because tails are created
 902	// only for small files.
 903
 904/* this embodies a dependency on a particular tail policy */
 905	if (inode->i_size >= inode->i_sb->s_blocksize * 4) {
 906		/* such a big files do not have tails, so we won't bother ourselves
 907		   to look for tails, simply return */
 908		return 0;
 909	}
 910
 911	reiserfs_write_lock(inode->i_sb);
 912	/* find the item containing the last byte to be written, or if
 913	 * writing past the end of the file then the last item of the
 914	 * file (and then we check its type). */
 915	make_cpu_key(&key, inode, pos + write_bytes + 1, TYPE_ANY,
 916		     3 /*key length */ );
 917	res = search_for_position_by_key(inode->i_sb, &key, &path);
 918	if (res == IO_ERROR) {
 919		reiserfs_write_unlock(inode->i_sb);
 920		return -EIO;
 921	}
 922	ih = get_ih(&path);
 923	res = 0;
 924	if (is_direct_le_ih(ih)) {
 925		/* Ok, closest item is file tail (tails are stored in "direct"
 926		 * items), so we need to unpack it. */
 927		/* To not overcomplicate matters, we just call generic_cont_expand
 928		   which will in turn call other stuff and finally will boil down to
 929		   reiserfs_get_block() that would do necessary conversion. */
 930		cont_expand_offset =
 931		    le_key_k_offset(get_inode_item_key_version(inode),
 932				    &(ih->ih_key));
 933		pathrelse(&path);
 934		res = generic_cont_expand(inode, cont_expand_offset);
 935	} else
 936		pathrelse(&path);
 937
 938	reiserfs_write_unlock(inode->i_sb);
 939	return res;
 940}
 941
/*
 * Lock the pages covering a write region and get their buffers ready.
 *
 * @num_pages pages starting at the page containing @pos are grabbed (and
 * thereby locked) and stored in @prepared_pages; empty buffers are created
 * for pages that have none. Buffers that already have a block behind them
 * are mapped. For the first and the last page, a buffer that is only
 * partially overwritten is either read from disk (if mapped) or has its
 * untouched part zeroed (if not mapped).
 *
 * Returns the number of blocks that still have to be allocated to cover the
 * region (>= 0), or a negative errno:
 *   -EFAULT  @num_pages is zero (nothing is grabbed in that case)
 *   -ENOMEM  a page could not be grabbed
 *   -EIO     one of the issued reads did not bring its buffer up to date
 * On -ENOMEM and -EIO the pages grabbed so far are handed to
 * reiserfs_unprepare_pages() before returning. On success the pages are
 * returned to the caller still locked.
 */
static int reiserfs_prepare_file_region_for_write(struct inode *inode
						  /* Inode of the file */ ,
						  loff_t pos,	/* position in the file */
						  size_t num_pages,	/* number of pages to
									   prepare */
						  size_t write_bytes,	/* Amount of bytes to be
									   overwritten from
									   @pos */
						  struct page **prepared_pages	/* pointer to array
										   where to store
										   prepared pages */
    )
{
	int res = 0;		// Return values of different functions we call.
	unsigned long index = pos >> PAGE_CACHE_SHIFT;	// Index of the first page of the region.
	int from = (pos & (PAGE_CACHE_SIZE - 1));	// Writing offset in first page
	int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
	/* offset just past the last written byte within the last page
	   (ranges from 1 to PAGE_CACHE_SIZE) */
	struct address_space *mapping = inode->i_mapping;	// Pages are mapped here.
	int i;			// Simple counter
	int blocks = 0;		/* Return value (blocks that should be allocated) */
	struct buffer_head *bh, *head;	// Current bufferhead and first bufferhead
	// of a page.
	unsigned block_start, block_end;	// Starting and ending offsets of current
	// buffer in the page.
	struct buffer_head *wait[2], **wait_bh = wait;	// Buffers we issued reads for.
	// Two slots are enough: a read is queued at most once while
	// scanning the first page (the buffer straddling the start of
	// the write area) and at most once while scanning the last page
	// (the buffer straddling its end). Everything in the middle
	// gets overwritten totally.

	struct cpu_key key;	// cpu key of item that we are going to deal with
	struct item_head *ih = NULL;	// pointer to item head that we are going to deal with
	struct buffer_head *itembuf = NULL;	// Buffer head that contains items that we are going to deal with
	INITIALIZE_PATH(path);	// path to item, that we are going to deal with.
	__le32 *item = NULL;	// pointer to item we are going to deal with
	int item_pos = -1;	/* Position in indirect item */

	if (num_pages < 1) {
		reiserfs_warning(inode->i_sb,
				 "green-9001: reiserfs_prepare_file_region_for_write "
				 "called with zero number of pages to process");
		return -EFAULT;
	}

	/* We have 2 loops for pages. In the first loop we grab and lock the
	   pages, so that nobody touches them until we release them. Then we
	   deal with mapping buffers to blocks. */
	for (i = 0; i < num_pages; i++) {
		prepared_pages[i] = grab_cache_page(mapping, index + i);	// locks the page
		if (!prepared_pages[i]) {
			res = -ENOMEM;
			/* only pages [0, i) were grabbed; the label below
			   trims num_pages accordingly */
			goto failed_page_grabbing;
		}
		if (!page_has_buffers(prepared_pages[i]))
			create_empty_buffers(prepared_pages[i],
					     inode->i_sb->s_blocksize, 0);
	}

	/* Count the blocks for the case where every overwritten block is new
	   (already allocated blocks are subtracted later) */
	if (num_pages > 2)
		/* Pages strictly between the first and the last one are
		   overwritten completely, so all of their blocks are counted
		   as needing allocation */
		blocks =
		    (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	/* count blocks needed for first page (possibly partially written):
	   whole blocks from @from to the page end, plus one if @from is not
	   block aligned */
	blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + !!(from & (inode->i_sb->s_blocksize - 1));	/* roundup */

	/* Now account for the last page. If there is more than one page, a
	   full page worth of blocks is added for it; in either case the whole
	   blocks lying past the last written position in the page are
	   subtracted again */
	blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT - inode->i_blkbits)) -
	    ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
	/* Note how we do not roundup here since partial blocks still
	   should be allocated */

	/* If the page-aligned start of the write area lies past the file end,
	   there is no point in mapping blocks, since there are none; we just
	   zero out the untouched parts of the first and last pages of the
	   write area (if needed) */
	if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) {
		if (from != 0) {	/* First page needs to be partially zeroed */
			char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
			memset(kaddr, 0, from);
			kunmap_atomic(kaddr, KM_USER0);
		}
		if (to != PAGE_CACHE_SIZE) {	/* Last page needs to be partially zeroed */
			char *kaddr =
			    kmap_atomic(prepared_pages[num_pages - 1],
					KM_USER0);
			memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
			kunmap_atomic(kaddr, KM_USER0);
		}

		/* Since all blocks are new - use already calculated value */
		return blocks;
	}

	/* Since we write somewhere into the middle of the file, we may be
	   writing over blocks that are already allocated, so map those blocks
	   and subtract their number from the blocks we need to allocate
	   (calculated above) */
	/* Mask write position to start on blocksize, we do it out of the
	   loop for performance reasons.
	   NOTE(review): from here on @pos is the block-aligned value, not the
	   caller's position. */
	pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
	/* Set cpu key to the starting position in a file (on left block
	   boundary). The offset passed is pos + 1; pos was already masked to
	   the block size on the line above, so the mask here is redundant. */
	make_cpu_key(&key, inode,
		     1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)),
		     TYPE_ANY, 3 /*key length */ );

	reiserfs_write_lock(inode->i_sb);	// We need that for at least search_by_key()
	for (i = 0; i < num_pages; i++) {

		head = page_buffers(prepared_pages[i]);
		/* For each buffer in the page. The buffers form a ring, so the
		   walk stops when we are back at head; !block_start lets the
		   very first iteration through. */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {
			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (i == 0 && block_end <= from)
				/* if this buffer is before requested data to map, skip it */
				continue;

			if (i == num_pages - 1 && block_start >= to) {
				/* If this buffer is after requested data to map, abort
				   processing of current page */
				break;
			}

			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
				/* This is optimisation for a case where buffer is mapped
				   and have blocknumber assigned. In case significant amount
				   of such buffers are present, we may avoid some amount
				   of search_by_key calls.
				   Probably it would be possible to move parts of this code
				   out of BKL, but I afraid that would overcomplicate code
				   without any noticeable benefit.
				 */
				item_pos++;
				/* Update the key */
				set_cpu_key_k_offset(&key,
						     cpu_key_k_offset(&key) +
						     inode->i_sb->s_blocksize);
				blocks--;	// Decrease the amount of blocks that need to be
				// allocated
				continue;	// Go to the next buffer
			}

			if (!itembuf ||	/* if first iteration */
			    item_pos >= ih_item_len(ih) / UNFM_P_SIZE) {	/* or if we progressed past the
										   current unformatted_item */
				/* Try to find next item */
				res =
				    search_for_position_by_key(inode->i_sb,
							       &key, &path);
				/* Abort if no more items. This leaves only the
				   buffer loop of the current page; the next page
				   will search again because itembuf is reset. */
				if (res != POSITION_FOUND) {
					/* make sure later loops don't use this item */
					itembuf = NULL;
					item = NULL;
					break;
				}

				/* Update information about current indirect item */
				itembuf = get_last_bh(&path);
				ih = get_ih(&path);
				item = get_item(&path);
				item_pos = path.pos_in_item;

				RFALSE(!is_indirect_le_ih(ih),
				       "green-9003: indirect item expected");
			}

			/* See if there is some block associated with the file
			   at that position, map the buffer to this block */
			if (get_block_num(item, item_pos)) {
				map_bh(bh, inode->i_sb,
				       get_block_num(item, item_pos));
				blocks--;	// Decrease the amount of blocks that need to be
				// allocated
			}
			item_pos++;
			/* Update the key */
			set_cpu_key_k_offset(&key,
					     cpu_key_k_offset(&key) +
					     inode->i_sb->s_blocksize);
		}
	}
	pathrelse(&path);	// Free the path
	reiserfs_write_unlock(inode->i_sb);

	/* Now zero out unmapped buffers for the first and last pages of
	   write area or issue read requests if page is mapped. */
	/* First page, see if it is not uptodate */
	if (!PageUptodate(prepared_pages[0])) {
		head = page_buffers(prepared_pages[0]);

		/* For each buffer in page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {

			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (block_end <= from)
				/* if this buffer is before requested data to map, skip it */
				continue;
			if (block_start < from) {	/* Aha, our partial buffer */
				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
								   issue READ request for it to
								   not lose data */
					ll_rw_block(READ, 1, &bh);
					*wait_bh++ = bh;
				} else {	/* Not mapped, zero the part in front of the write */
					char *kaddr =
					    kmap_atomic(prepared_pages[0],
							KM_USER0);
					memset(kaddr + block_start, 0,
					       from - block_start);
					kunmap_atomic(kaddr, KM_USER0);
					set_buffer_uptodate(bh);
				}
			}
		}
	}

	/* Last page, see if it is not uptodate, or if the last page is past the end of the file.
	   NOTE(review): @pos here is the block-aligned value set above, so
	   pos + write_bytes can be smaller than the real end of the write;
	   confirm this is intended. */
	if (!PageUptodate(prepared_pages[num_pages - 1]) ||
	    ((pos + write_bytes) >> PAGE_CACHE_SHIFT) >
	    (inode->i_size >> PAGE_CACHE_SHIFT)) {
		head = page_buffers(prepared_pages[num_pages - 1]);

		/* for each buffer in page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {

			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (block_start >= to)
				/* this buffer and all following ones are after
				   the requested data, stop scanning */
				break;
			if (block_end > to) {	/* Aha, our partial buffer */
				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
								   issue READ request for it to
								   not lose data */
					ll_rw_block(READ, 1, &bh);
					*wait_bh++ = bh;
				} else {	/* Not mapped, zero the part behind the write */
					char *kaddr =
					    kmap_atomic(prepared_pages
							[num_pages - 1],
							KM_USER0);
					memset(kaddr + to, 0, block_end - to);
					kunmap_atomic(kaddr, KM_USER0);
					set_buffer_uptodate(bh);
				}
			}
		}
	}

	/* Wait for read requests we made to happen, if necessary */
	while (wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh)) {
			res = -EIO;
			goto failed_read;
		}
	}

	return blocks;
      failed_page_grabbing:
	/* only the first i pages were grabbed */
	num_pages = i;
      failed_read:
	reiserfs_unprepare_pages(prepared_pages, num_pages);
	return res;
}
1241
1242/* Write @count bytes at position @ppos in a file indicated by @file
1243   from the buffer @buf.  
1244
1245   generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
1246   something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
1247   written for (ext2/3).  This is for several reasons:
1248
1249   * It has no understanding of any filesystem specific optimizations.
1250
1251   * It enters the filesystem repeatedly for each page that is written.
1252
1253   * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key
1254   * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
1255   * to reiserfs which allows for fewer tree traversals.
1256
1257   * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.
1258
1259   * Asking the block allocation code for blocks one at a time is slightly less efficient.
1260
1261   All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
1262   use it, but we were in a hurry to make code freeze, and so it couldn't be revised then.  This new code should make
1263   things right finally.
1264
1265   Future Features: providing search_by_key with hints.
1266
1267*/
1268static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going to write into */
1269				   const char __user * buf,	/*  pointer to user supplied data
1270								   (in userspace) */
1271				   size_t count,	/* amount of bytes to write */
1272				   loff_t * ppos	/* pointer to position in file that we start writing at. Should be updated to
1273							 * new current position before returning. */
1274				   )
1275{
1276	size_t already_written = 0;	// Number of bytes already written to the file.
1277	loff_t pos;		// Current position in the file.
1278	ssize_t res;		// return value of various functions that we call.
1279	int err = 0;
1280	struct inode *inode = file->f_dentry->d_inode;	// Inode of the file that we are writing to.
1281	/* To simplify coding at this time, we store
1282	   locked pages in array for now */
1283	struct page *prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
1284	struct reiserfs_transaction_handle th;
1285	th.t_trans_id = 0;
1286
1287	if (file->f_flags & O_DIRECT) {	// Direct IO needs treatment
1288		ssize_t result, after_file_end = 0;
1289		if ((*ppos + count >= inode->i_size)
1290		    || (file->f_flags & O_APPEND)) {
1291			/* If we are appending a file, we need to put this savelink in here.
1292			   If we will crash while doing direct io, finish_unfinished will
1293			   cut the garbage from the file end. */
1294			reiserfs_write_lock(inode->i_sb);
1295			err =
1296			    journal_begin(&th, inode->i_sb,
1297					  JOURNAL_PER_BALANCE_CNT);
1298			if (err) {
1299				reiserfs_write_unlock(inode->i_sb);
1300				return err;
1301			}
1302			reiserfs_update_inode_transaction(inode);
1303			add_save_link(&th, inode, 1 /* Truncate */ );
1304			after_file_end = 1;
1305			err =
1306			    journal_end(&th, inode->i_sb,
1307					JOURNAL_PER_BALANCE_CNT);
1308			reiserfs_write_unlock(inode->i_sb);
1309			if (err)
1310				return err;
1311		}
1312		result = generic_file_write(file, buf, count, ppos);
1313
1314		if (after_file_end) {	/* Now update i_size and remove the savelink */
1315			struct reiserfs_transaction_handle th;
1316			reiserfs_write_lock(inode->i_sb);
1317			err = journal_begin(&th, inode->i_sb, 1);
1318			if (err) {
1319				reiserfs_write_unlock(inode->i_sb);
1320				return err;
1321			}
1322			reiserfs_update_inode_transaction(inode);
1323			reiserfs_update_sd(&th, inode);
1324			err = journal_end(&th, inode->i_sb, 1);
1325			if (err) {
1326				reiserfs_write_unlock(inode->i_sb);
1327				return err;
1328			}
1329			err = remove_save_link(inode, 1 /* truncate */ );
1330			reiserfs_write_unlock(inode->i_sb);
1331			if (err)
1332				return err;
1333		}
1334
1335		return result;
1336	}
1337
1338	if (unlikely((ssize_t) count < 0))
1339		return -EINVAL;
1340
1341	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
1342		return -EFAULT;
1343
1344	down(&inode->i_sem);	// locks the entire file for just us
1345
1346	pos = *ppos;
1347
1348	/* Check if we can write to specified region of file, file
1349	   is not overly big and this kind of stuff. Adjust pos and
1350	   count, if needed */
1351	res = generic_write_checks(file, &pos, &count, 0);
1352	if (res)
1353		goto out;
1354
1355	if (count == 0)
1356		goto out;
1357
1358	res = remove_suid(file->f_dentry);
1359	if (res)
1360		goto out;
1361
1362	inode_update_time(inode, 1);	/* Both mtime and ctime */
1363
1364	// Ok, we are done with all the checks.
1365
1366	// Now we should start real work
1367
1368	/* If we are going to write past the file's packed tail or if we are going
1369	   to overwrite part of the tail, we need that tail to be converted into
1370	   unformatted node */
1371	res = reiserfs_check_for_tail_and_convert(inode, pos, count);
1372	if (res)
1373		goto out;
1374
1375	while (count > 0) {
1376		/* This is the main loop in which we running until some error occures
1377		   or until we write all of the data. */
1378		size_t num_pages;	/* amount of pages we are going to write this iteration */
1379		size_t write_bytes;	/* amount of bytes to write during this iteration */
1380		size_t blocks_to_allocate;	/* how much blocks we need to allocate for this iteration */
1381
1382		/*  (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos */
1383		num_pages = !!((pos + count) & (PAGE_CACHE_SIZE - 1)) +	/* round up partial
1384									   pages */
1385		    ((count +
1386		      (pos & (PAGE_CACHE_SIZE - 1))) >> PAGE_CACHE_SHIFT);
1387		/* convert size to amount of
1388		   pages */
1389		reiserfs_write_lock(inode->i_sb);
1390		if (num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
1391		    || num_pages > reiserfs_can_fit_pages(inode->i_sb)) {
1392			/* If we were asked to write more data than we want to or if there
1393			   is not that much space, then we shorten amount of data to write
1394			   for this iteration. */
1395			num_pages =
1396			    min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME,
1397				  reiserfs_can_fit_pages(inode->i_sb));
1398			/* Also we should not forget to set size in bytes accordingly */
1399			write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
1400			    (pos & (PAGE_CACHE_SIZE - 1));
1401			/* If position is not on the
1402			   start of the page, we need
1403			   to substract the offset
1404			   within page */
1405		} else
1406			write_bytes = count;
1407
1408		/* reserve the blocks to be allocated later, so that later on
1409		   we still have the space to write the blocks to */
1410		reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
1411						      num_pages <<
1412						      (PAGE_CACHE_SHIFT -
1413						       inode->i_blkbits));
1414		reiserfs_write_unlock(inode->i_sb);
1415
1416		if (!num_pages) {	/* If we do not have enough space even for a single page... */
1417			if (pos >
1418			    inode->i_size + inode->i_sb->s_blocksize -
1419			    (pos & (inode->i_sb->s_blocksize - 1))) {
1420				res = -ENOSPC;
1421				break;	// In case we are writing past the end of the last file block, break.
1422			}
1423			// Otherwise we are possibly overwriting the file, so
1424			// let's set write size to be equal or less than blocksize.
1425			// This way we get it correctly for file holes.
1426			// But overwriting files on absolutelly full volumes would not
1427			// be very efficient. Well, people are not supposed to fill
1428			// 100% of disk space anyway.
1429			write_bytes =
1430			    min_t(size_t, count,
1431				  inode->i_sb->s_blocksize -
1432				  (pos & (inode->i_sb->s_blocksize - 1)));
1433			num_pages = 1;
1434			// No blocks were claimed before, so do it now.
1435			reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
1436							      1 <<
1437							      (PAGE_CACHE_SHIFT
1438							       -
1439							       inode->
1440							       i_blkbits));
1441		}
1442
1443		/* Prepare for writing into the region, read in all the
1444		   partially overwritten pages, if needed. And lock the pages,
1445		   so that nobody else can access these until we are done.
1446		   We get number of actual blocks needed as a result. */
1447		blocks_to_allocate =
1448		    reiserfs_prepare_file_region_for_write(inode, pos,
1449							   num_pages,
1450							   write_bytes,
1451							   prepared_pages);
1452		if (blocks_to_allocate < 0) {
1453			res = blocks_to_allocate;
1454			reiserfs_release_claimed_blocks(inode->i_sb,
1455							num_pages <<
1456							(PAGE_CACHE_SHIFT -
1457							 inode->i_blkbits));
1458			break;
1459		}
1460
1461		/* First we correct our estimate of how many blocks we need */
1462		reiserfs_release_claimed_blocks(inode->i_sb,
1463						(num_pages <<
1464						 (PAGE_CACHE_SHIFT -
1465						  inode->i_sb->
1466						  s_blocksize_bits)) -
1467						blocks_to_allocate);
1468
1469		if (blocks_to_allocate > 0) {	/*We only allocate blocks if we need to */
1470			/* Fill in all the possible holes and append the file if needed */
1471			res =
1472			    reiserfs_allocate_blocks_for_region(&th, inode, pos,
1473								num_pages,
1474								write_bytes,
1475								prepared_pages,
1476								blocks_to_allocate);
1477		}
1478
1479		/* well, we have allocated the blocks, so it is time to free
1480		   the reservation we made earlier. */
1481		reiserfs_release_claimed_blocks(inode->i_sb,
1482						blocks_to_allocate);
1483		if (res) {
1484			reiserfs_unprepare_pages(prepared_pages, num_pages);
1485			break;
1486		}
1487
1488/* NOTE that allocating blocks and filling blocks can be done in reverse order
1489   and probably we would do that just to get rid of garbage in files after a
1490   crash */
1491
1492		/* Copy data from user-supplied buffer to file's pages */
1493		res =
1494		    reiserfs_copy_from_user_to_file_region(pos, num_pages,
1495							   write_bytes,
1496							   prepared_pages, buf);
1497		if (res) {
1498			reiserfs_unprepare_pages(prepared_pages, num_pages);
1499			break;
1500		}
1501
1502		/* Send the pages to disk and unlock them. */
1503		res =
1504		    reiserfs_submit_file_region_for_write(&th, inode, pos,
1505							  num_pages,
1506							  write_bytes,
1507							  prepared_pages);
1508		if (res)
1509			break;
1510
1511		already_written += write_bytes;
1512		buf += write_bytes;
1513		*ppos = pos += write_bytes;
1514		count -= write_bytes;
1515		balance_dirty_pages_ratelimited(inode->i_mapping);
1516	}
1517
1518	/* this is only true on error */
1519	if (th.t_trans_id) {
1520		reiserfs_write_lock(inode->i_sb);
1521		err = journal_end(&th, th.t_super, th.t_blocks_allocated);
1522		reiserfs_write_unlock(inode->i_sb);
1523		if (err) {
1524			res = err;
1525			goto out;
1526		}
1527	}
1528
1529	if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
1530		res =
1531		    generic_osync_inode(inode, file->f_mapping,
1532					OSYNC_METADATA | OSYNC_DATA);
1533
1534	up(&inode->i_sem);
1535	reiserfs_async_progress_wait(inode->i_sb);
1536	return (already_written != 0) ? already_written : res;
1537
1538      out:
1539	up(&inode->i_sem);	// unlock the file on exit.
1540	return res;
1541}
1542
1543static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf,
1544				  size_t count, loff_t pos)
1545{
1546	return generic_file_aio_write(iocb, buf, count, pos);
1547}
1548
1549struct file_operations reiserfs_file_operations = {
1550	.read = generic_file_read,
1551	.write = reiserfs_file_write,
1552	.ioctl = reiserfs_ioctl,
1553	.mmap = generic_file_mmap,
1554	.release = reiserfs_file_release,
1555	.fsync = reiserfs_sync_file,
1556	.sendfile = generic_file_sendfile,
1557	.aio_read = generic_file_aio_read,
1558	.aio_write = reiserfs_aio_write,
1559};
1560
1561struct inode_operations reiserfs_file_inode_operations = {
1562	.truncate = reiserfs_vfs_truncate_file,
1563	.setattr = reiserfs_setattr,
1564	.setxattr = reiserfs_setxattr,
1565	.getxattr = reiserfs_getxattr,
1566	.listxattr = reiserfs_listxattr,
1567	.removexattr = reiserfs_removexattr,
1568	.permission = reiserfs_permission,
1569};