fs/ocfs2/aops.c at v3.0 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / ocfs2 / aops.c
at v3.0 2048 lines 53 kB view raw
   1/* -*- mode: c; c-basic-offset: 8; -*-
   2 * vim: noexpandtab sw=8 ts=8 sts=0:
   3 *
   4 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
   5 *
   6 * This program is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU General Public
   8 * License as published by the Free Software Foundation; either
   9 * version 2 of the License, or (at your option) any later version.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 * General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public
  17 * License along with this program; if not, write to the
  18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19 * Boston, MA 021110-1307, USA.
  20 */
  21
  22#include <linux/fs.h>
  23#include <linux/slab.h>
  24#include <linux/highmem.h>
  25#include <linux/pagemap.h>
  26#include <asm/byteorder.h>
  27#include <linux/swap.h>
  28#include <linux/pipe_fs_i.h>
  29#include <linux/mpage.h>
  30#include <linux/quotaops.h>
  31
  32#include <cluster/masklog.h>
  33
  34#include "ocfs2.h"
  35
  36#include "alloc.h"
  37#include "aops.h"
  38#include "dlmglue.h"
  39#include "extent_map.h"
  40#include "file.h"
  41#include "inode.h"
  42#include "journal.h"
  43#include "suballoc.h"
  44#include "super.h"
  45#include "symlink.h"
  46#include "refcounttree.h"
  47#include "ocfs2_trace.h"
  48
  49#include "buffer_head_io.h"
  50
  51static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
  52				   struct buffer_head *bh_result, int create)
  53{
  54	int err = -EIO;
  55	int status;
  56	struct ocfs2_dinode *fe = NULL;
  57	struct buffer_head *bh = NULL;
  58	struct buffer_head *buffer_cache_bh = NULL;
  59	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  60	void *kaddr;
  61
  62	trace_ocfs2_symlink_get_block(
  63			(unsigned long long)OCFS2_I(inode)->ip_blkno,
  64			(unsigned long long)iblock, bh_result, create);
  65
  66	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
  67
  68	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
  69		mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
  70		     (unsigned long long)iblock);
  71		goto bail;
  72	}
  73
  74	status = ocfs2_read_inode_block(inode, &bh);
  75	if (status < 0) {
  76		mlog_errno(status);
  77		goto bail;
  78	}
  79	fe = (struct ocfs2_dinode *) bh->b_data;
  80
  81	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
  82						    le32_to_cpu(fe->i_clusters))) {
  83		mlog(ML_ERROR, "block offset is outside the allocated size: "
  84		     "%llu\n", (unsigned long long)iblock);
  85		goto bail;
  86	}
  87
  88	/* We don't use the page cache to create symlink data, so if
  89	 * need be, copy it over from the buffer cache. */
  90	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
  91		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
  92			    iblock;
  93		buffer_cache_bh = sb_getblk(osb->sb, blkno);
  94		if (!buffer_cache_bh) {
  95			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
  96			goto bail;
  97		}
  98
  99		/* we haven't locked out transactions, so a commit
 100		 * could've happened. Since we've got a reference on
 101		 * the bh, even if it commits while we're doing the
 102		 * copy, the data is still good. */
 103		if (buffer_jbd(buffer_cache_bh)
 104		    && ocfs2_inode_is_new(inode)) {
 105			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
 106			if (!kaddr) {
 107				mlog(ML_ERROR, "couldn't kmap!\n");
 108				goto bail;
 109			}
 110			memcpy(kaddr + (bh_result->b_size * iblock),
 111			       buffer_cache_bh->b_data,
 112			       bh_result->b_size);
 113			kunmap_atomic(kaddr, KM_USER0);
 114			set_buffer_uptodate(bh_result);
 115		}
 116		brelse(buffer_cache_bh);
 117	}
 118
 119	map_bh(bh_result, inode->i_sb,
 120	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
 121
 122	err = 0;
 123
 124bail:
 125	brelse(bh);
 126
 127	return err;
 128}
 129
 130int ocfs2_get_block(struct inode *inode, sector_t iblock,
 131		    struct buffer_head *bh_result, int create)
 132{
 133	int err = 0;
 134	unsigned int ext_flags;
 135	u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
 136	u64 p_blkno, count, past_eof;
 137	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 138
 139	trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno,
 140			      (unsigned long long)iblock, bh_result, create);
 141
 142	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
 143		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
 144		     inode, inode->i_ino);
 145
 146	if (S_ISLNK(inode->i_mode)) {
 147		/* this always does I/O for some reason. */
 148		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
 149		goto bail;
 150	}
 151
 152	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
 153					  &ext_flags);
 154	if (err) {
 155		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
 156		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
 157		     (unsigned long long)p_blkno);
 158		goto bail;
 159	}
 160
 161	if (max_blocks < count)
 162		count = max_blocks;
 163
 164	/*
 165	 * ocfs2 never allocates in this function - the only time we
 166	 * need to use BH_New is when we're extending i_size on a file
 167	 * system which doesn't support holes, in which case BH_New
 168	 * allows __block_write_begin() to zero.
 169	 *
 170	 * If we see this on a sparse file system, then a truncate has
 171	 * raced us and removed the cluster. In this case, we clear
 172	 * the buffers dirty and uptodate bits and let the buffer code
 173	 * ignore it as a hole.
 174	 */
 175	if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
 176		clear_buffer_dirty(bh_result);
 177		clear_buffer_uptodate(bh_result);
 178		goto bail;
 179	}
 180
 181	/* Treat the unwritten extent as a hole for zeroing purposes. */
 182	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
 183		map_bh(bh_result, inode->i_sb, p_blkno);
 184
 185	bh_result->b_size = count << inode->i_blkbits;
 186
 187	if (!ocfs2_sparse_alloc(osb)) {
 188		if (p_blkno == 0) {
 189			err = -EIO;
 190			mlog(ML_ERROR,
 191			     "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
 192			     (unsigned long long)iblock,
 193			     (unsigned long long)p_blkno,
 194			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 195			mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
 196			dump_stack();
 197			goto bail;
 198		}
 199	}
 200
 201	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 202
 203	trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
 204				  (unsigned long long)past_eof);
 205	if (create && (iblock >= past_eof))
 206		set_buffer_new(bh_result);
 207
 208bail:
 209	if (err < 0)
 210		err = -EIO;
 211
 212	return err;
 213}
 214
 215int ocfs2_read_inline_data(struct inode *inode, struct page *page,
 216			   struct buffer_head *di_bh)
 217{
 218	void *kaddr;
 219	loff_t size;
 220	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 221
 222	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
 223		ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
 224			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
 225		return -EROFS;
 226	}
 227
 228	size = i_size_read(inode);
 229
 230	if (size > PAGE_CACHE_SIZE ||
 231	    size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
 232		ocfs2_error(inode->i_sb,
 233			    "Inode %llu has with inline data has bad size: %Lu",
 234			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
 235			    (unsigned long long)size);
 236		return -EROFS;
 237	}
 238
 239	kaddr = kmap_atomic(page, KM_USER0);
 240	if (size)
 241		memcpy(kaddr, di->id2.i_data.id_data, size);
 242	/* Clear the remaining part of the page */
 243	memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
 244	flush_dcache_page(page);
 245	kunmap_atomic(kaddr, KM_USER0);
 246
 247	SetPageUptodate(page);
 248
 249	return 0;
 250}
 251
 252static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
 253{
 254	int ret;
 255	struct buffer_head *di_bh = NULL;
 256
 257	BUG_ON(!PageLocked(page));
 258	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
 259
 260	ret = ocfs2_read_inode_block(inode, &di_bh);
 261	if (ret) {
 262		mlog_errno(ret);
 263		goto out;
 264	}
 265
 266	ret = ocfs2_read_inline_data(inode, page, di_bh);
 267out:
 268	unlock_page(page);
 269
 270	brelse(di_bh);
 271	return ret;
 272}
 273
 274static int ocfs2_readpage(struct file *file, struct page *page)
 275{
 276	struct inode *inode = page->mapping->host;
 277	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 278	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
 279	int ret, unlock = 1;
 280
 281	trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
 282			     (page ? page->index : 0));
 283
 284	ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
 285	if (ret != 0) {
 286		if (ret == AOP_TRUNCATED_PAGE)
 287			unlock = 0;
 288		mlog_errno(ret);
 289		goto out;
 290	}
 291
 292	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
 293		ret = AOP_TRUNCATED_PAGE;
 294		goto out_inode_unlock;
 295	}
 296
 297	/*
 298	 * i_size might have just been updated as we grabed the meta lock.  We
 299	 * might now be discovering a truncate that hit on another node.
 300	 * block_read_full_page->get_block freaks out if it is asked to read
 301	 * beyond the end of a file, so we check here.  Callers
 302	 * (generic_file_read, vm_ops->fault) are clever enough to check i_size
 303	 * and notice that the page they just read isn't needed.
 304	 *
 305	 * XXX sys_readahead() seems to get that wrong?
 306	 */
 307	if (start >= i_size_read(inode)) {
 308		zero_user(page, 0, PAGE_SIZE);
 309		SetPageUptodate(page);
 310		ret = 0;
 311		goto out_alloc;
 312	}
 313
 314	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 315		ret = ocfs2_readpage_inline(inode, page);
 316	else
 317		ret = block_read_full_page(page, ocfs2_get_block);
 318	unlock = 0;
 319
 320out_alloc:
 321	up_read(&OCFS2_I(inode)->ip_alloc_sem);
 322out_inode_unlock:
 323	ocfs2_inode_unlock(inode, 0);
 324out:
 325	if (unlock)
 326		unlock_page(page);
 327	return ret;
 328}
 329
 330/*
 331 * This is used only for read-ahead. Failures or difficult to handle
 332 * situations are safe to ignore.
 333 *
 334 * Right now, we don't bother with BH_Boundary - in-inode extent lists
 335 * are quite large (243 extents on 4k blocks), so most inodes don't
 336 * grow out to a tree. If need be, detecting boundary extents could
 337 * trivially be added in a future version of ocfs2_get_block().
 338 */
 339static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
 340			   struct list_head *pages, unsigned nr_pages)
 341{
 342	int ret, err = -EIO;
 343	struct inode *inode = mapping->host;
 344	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 345	loff_t start;
 346	struct page *last;
 347
 348	/*
 349	 * Use the nonblocking flag for the dlm code to avoid page
 350	 * lock inversion, but don't bother with retrying.
 351	 */
 352	ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
 353	if (ret)
 354		return err;
 355
 356	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
 357		ocfs2_inode_unlock(inode, 0);
 358		return err;
 359	}
 360
 361	/*
 362	 * Don't bother with inline-data. There isn't anything
 363	 * to read-ahead in that case anyway...
 364	 */
 365	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 366		goto out_unlock;
 367
 368	/*
 369	 * Check whether a remote node truncated this file - we just
 370	 * drop out in that case as it's not worth handling here.
 371	 */
 372	last = list_entry(pages->prev, struct page, lru);
 373	start = (loff_t)last->index << PAGE_CACHE_SHIFT;
 374	if (start >= i_size_read(inode))
 375		goto out_unlock;
 376
 377	err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block);
 378
 379out_unlock:
 380	up_read(&oi->ip_alloc_sem);
 381	ocfs2_inode_unlock(inode, 0);
 382
 383	return err;
 384}
 385
 386/* Note: Because we don't support holes, our allocation has
 387 * already happened (allocation writes zeros to the file data)
 388 * so we don't have to worry about ordered writes in
 389 * ocfs2_writepage.
 390 *
 391 * ->writepage is called during the process of invalidating the page cache
 392 * during blocked lock processing.  It can't block on any cluster locks
 393 * to during block mapping.  It's relying on the fact that the block
 394 * mapping can't have disappeared under the dirty pages that it is
 395 * being asked to write back.
 396 */
 397static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 398{
 399	trace_ocfs2_writepage(
 400		(unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
 401		page->index);
 402
 403	return block_write_full_page(page, ocfs2_get_block, wbc);
 404}
 405
 406/* Taken from ext3. We don't necessarily need the full blown
 407 * functionality yet, but IMHO it's better to cut and paste the whole
 408 * thing so we can avoid introducing our own bugs (and easily pick up
 409 * their fixes when they happen) --Mark */
 410int walk_page_buffers(	handle_t *handle,
 411			struct buffer_head *head,
 412			unsigned from,
 413			unsigned to,
 414			int *partial,
 415			int (*fn)(	handle_t *handle,
 416					struct buffer_head *bh))
 417{
 418	struct buffer_head *bh;
 419	unsigned block_start, block_end;
 420	unsigned blocksize = head->b_size;
 421	int err, ret = 0;
 422	struct buffer_head *next;
 423
 424	for (	bh = head, block_start = 0;
 425		ret == 0 && (bh != head || !block_start);
 426	    	block_start = block_end, bh = next)
 427	{
 428		next = bh->b_this_page;
 429		block_end = block_start + blocksize;
 430		if (block_end <= from || block_start >= to) {
 431			if (partial && !buffer_uptodate(bh))
 432				*partial = 1;
 433			continue;
 434		}
 435		err = (*fn)(handle, bh);
 436		if (!ret)
 437			ret = err;
 438	}
 439	return ret;
 440}
 441
 442static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 443{
 444	sector_t status;
 445	u64 p_blkno = 0;
 446	int err = 0;
 447	struct inode *inode = mapping->host;
 448
 449	trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
 450			 (unsigned long long)block);
 451
 452	/* We don't need to lock journal system files, since they aren't
 453	 * accessed concurrently from multiple nodes.
 454	 */
 455	if (!INODE_JOURNAL(inode)) {
 456		err = ocfs2_inode_lock(inode, NULL, 0);
 457		if (err) {
 458			if (err != -ENOENT)
 459				mlog_errno(err);
 460			goto bail;
 461		}
 462		down_read(&OCFS2_I(inode)->ip_alloc_sem);
 463	}
 464
 465	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
 466		err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
 467						  NULL);
 468
 469	if (!INODE_JOURNAL(inode)) {
 470		up_read(&OCFS2_I(inode)->ip_alloc_sem);
 471		ocfs2_inode_unlock(inode, 0);
 472	}
 473
 474	if (err) {
 475		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
 476		     (unsigned long long)block);
 477		mlog_errno(err);
 478		goto bail;
 479	}
 480
 481bail:
 482	status = err ? 0 : p_blkno;
 483
 484	return status;
 485}
 486
 487/*
 488 * TODO: Make this into a generic get_blocks function.
 489 *
 490 * From do_direct_io in direct-io.c:
 491 *  "So what we do is to permit the ->get_blocks function to populate
 492 *   bh.b_size with the size of IO which is permitted at this offset and
 493 *   this i_blkbits."
 494 *
 495 * This function is called directly from get_more_blocks in direct-io.c.
 496 *
 497 * called like this: dio->get_blocks(dio->inode, fs_startblk,
 498 * 					fs_count, map_bh, dio->rw == WRITE);
 499 *
 500 * Note that we never bother to allocate blocks here, and thus ignore the
 501 * create argument.
 502 */
 503static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 504				     struct buffer_head *bh_result, int create)
 505{
 506	int ret;
 507	u64 p_blkno, inode_blocks, contig_blocks;
 508	unsigned int ext_flags;
 509	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 510	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
 511
 512	/* This function won't even be called if the request isn't all
 513	 * nicely aligned and of the right size, so there's no need
 514	 * for us to check any of that. */
 515
 516	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 517
 518	/* This figures out the size of the next contiguous block, and
 519	 * our logical offset */
 520	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
 521					  &contig_blocks, &ext_flags);
 522	if (ret) {
 523		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
 524		     (unsigned long long)iblock);
 525		ret = -EIO;
 526		goto bail;
 527	}
 528
 529	/* We should already CoW the refcounted extent in case of create. */
 530	BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
 531
 532	/*
 533	 * get_more_blocks() expects us to describe a hole by clearing
 534	 * the mapped bit on bh_result().
 535	 *
 536	 * Consider an unwritten extent as a hole.
 537	 */
 538	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
 539		map_bh(bh_result, inode->i_sb, p_blkno);
 540	else
 541		clear_buffer_mapped(bh_result);
 542
 543	/* make sure we don't map more than max_blocks blocks here as
 544	   that's all the kernel will handle at this point. */
 545	if (max_blocks < contig_blocks)
 546		contig_blocks = max_blocks;
 547	bh_result->b_size = contig_blocks << blocksize_bits;
 548bail:
 549	return ret;
 550}
 551
 552/*
 553 * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
 554 * particularly interested in the aio/dio case.  Like the core uses
 555 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
 556 * truncation on another.
 557 */
 558static void ocfs2_dio_end_io(struct kiocb *iocb,
 559			     loff_t offset,
 560			     ssize_t bytes,
 561			     void *private,
 562			     int ret,
 563			     bool is_async)
 564{
 565	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 566	int level;
 567
 568	/* this io's submitter should not have unlocked this before we could */
 569	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
 570
 571	if (ocfs2_iocb_is_sem_locked(iocb)) {
 572		up_read(&inode->i_alloc_sem);
 573		ocfs2_iocb_clear_sem_locked(iocb);
 574	}
 575
 576	ocfs2_iocb_clear_rw_locked(iocb);
 577
 578	level = ocfs2_iocb_rw_locked_level(iocb);
 579	ocfs2_rw_unlock(inode, level);
 580
 581	if (is_async)
 582		aio_complete(iocb, ret, 0);
 583}
 584
 585/*
 586 * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
 587 * from ext3.  PageChecked() bits have been removed as OCFS2 does not
 588 * do journalled data.
 589 */
 590static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
 591{
 592	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
 593
 594	jbd2_journal_invalidatepage(journal, page, offset);
 595}
 596
 597static int ocfs2_releasepage(struct page *page, gfp_t wait)
 598{
 599	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
 600
 601	if (!page_has_buffers(page))
 602		return 0;
 603	return jbd2_journal_try_to_free_buffers(journal, page, wait);
 604}
 605
 606static ssize_t ocfs2_direct_IO(int rw,
 607			       struct kiocb *iocb,
 608			       const struct iovec *iov,
 609			       loff_t offset,
 610			       unsigned long nr_segs)
 611{
 612	struct file *file = iocb->ki_filp;
 613	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
 614
 615	/*
 616	 * Fallback to buffered I/O if we see an inode without
 617	 * extents.
 618	 */
 619	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 620		return 0;
 621
 622	/* Fallback to buffered I/O if we are appending. */
 623	if (i_size_read(inode) <= offset)
 624		return 0;
 625
 626	return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
 627				    iov, offset, nr_segs,
 628				    ocfs2_direct_IO_get_blocks,
 629				    ocfs2_dio_end_io, NULL, 0);
 630}
 631
 632static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
 633					    u32 cpos,
 634					    unsigned int *start,
 635					    unsigned int *end)
 636{
 637	unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
 638
 639	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
 640		unsigned int cpp;
 641
 642		cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
 643
 644		cluster_start = cpos % cpp;
 645		cluster_start = cluster_start << osb->s_clustersize_bits;
 646
 647		cluster_end = cluster_start + osb->s_clustersize;
 648	}
 649
 650	BUG_ON(cluster_start > PAGE_SIZE);
 651	BUG_ON(cluster_end > PAGE_SIZE);
 652
 653	if (start)
 654		*start = cluster_start;
 655	if (end)
 656		*end = cluster_end;
 657}
 658
 659/*
 660 * 'from' and 'to' are the region in the page to avoid zeroing.
 661 *
 662 * If pagesize > clustersize, this function will avoid zeroing outside
 663 * of the cluster boundary.
 664 *
 665 * from == to == 0 is code for "zero the entire cluster region"
 666 */
 667static void ocfs2_clear_page_regions(struct page *page,
 668				     struct ocfs2_super *osb, u32 cpos,
 669				     unsigned from, unsigned to)
 670{
 671	void *kaddr;
 672	unsigned int cluster_start, cluster_end;
 673
 674	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
 675
 676	kaddr = kmap_atomic(page, KM_USER0);
 677
 678	if (from || to) {
 679		if (from > cluster_start)
 680			memset(kaddr + cluster_start, 0, from - cluster_start);
 681		if (to < cluster_end)
 682			memset(kaddr + to, 0, cluster_end - to);
 683	} else {
 684		memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
 685	}
 686
 687	kunmap_atomic(kaddr, KM_USER0);
 688}
 689
 690/*
 691 * Nonsparse file systems fully allocate before we get to the write
 692 * code. This prevents ocfs2_write() from tagging the write as an
 693 * allocating one, which means ocfs2_map_page_blocks() might try to
 694 * read-in the blocks at the tail of our file. Avoid reading them by
 695 * testing i_size against each block offset.
 696 */
 697static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
 698				 unsigned int block_start)
 699{
 700	u64 offset = page_offset(page) + block_start;
 701
 702	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
 703		return 1;
 704
 705	if (i_size_read(inode) > offset)
 706		return 1;
 707
 708	return 0;
 709}
 710
 711/*
 712 * Some of this taken from __block_write_begin(). We already have our
 713 * mapping by now though, and the entire write will be allocating or
 714 * it won't, so not much need to use BH_New.
 715 *
 716 * This will also skip zeroing, which is handled externally.
 717 */
 718int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 719			  struct inode *inode, unsigned int from,
 720			  unsigned int to, int new)
 721{
 722	int ret = 0;
 723	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
 724	unsigned int block_end, block_start;
 725	unsigned int bsize = 1 << inode->i_blkbits;
 726
 727	if (!page_has_buffers(page))
 728		create_empty_buffers(page, bsize, 0);
 729
 730	head = page_buffers(page);
 731	for (bh = head, block_start = 0; bh != head || !block_start;
 732	     bh = bh->b_this_page, block_start += bsize) {
 733		block_end = block_start + bsize;
 734
 735		clear_buffer_new(bh);
 736
 737		/*
 738		 * Ignore blocks outside of our i/o range -
 739		 * they may belong to unallocated clusters.
 740		 */
 741		if (block_start >= to || block_end <= from) {
 742			if (PageUptodate(page))
 743				set_buffer_uptodate(bh);
 744			continue;
 745		}
 746
 747		/*
 748		 * For an allocating write with cluster size >= page
 749		 * size, we always write the entire page.
 750		 */
 751		if (new)
 752			set_buffer_new(bh);
 753
 754		if (!buffer_mapped(bh)) {
 755			map_bh(bh, inode->i_sb, *p_blkno);
 756			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
 757		}
 758
 759		if (PageUptodate(page)) {
 760			if (!buffer_uptodate(bh))
 761				set_buffer_uptodate(bh);
 762		} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
 763			   !buffer_new(bh) &&
 764			   ocfs2_should_read_blk(inode, page, block_start) &&
 765			   (block_start < from || block_end > to)) {
 766			ll_rw_block(READ, 1, &bh);
 767			*wait_bh++=bh;
 768		}
 769
 770		*p_blkno = *p_blkno + 1;
 771	}
 772
 773	/*
 774	 * If we issued read requests - let them complete.
 775	 */
 776	while(wait_bh > wait) {
 777		wait_on_buffer(*--wait_bh);
 778		if (!buffer_uptodate(*wait_bh))
 779			ret = -EIO;
 780	}
 781
 782	if (ret == 0 || !new)
 783		return ret;
 784
 785	/*
 786	 * If we get -EIO above, zero out any newly allocated blocks
 787	 * to avoid exposing stale data.
 788	 */
 789	bh = head;
 790	block_start = 0;
 791	do {
 792		block_end = block_start + bsize;
 793		if (block_end <= from)
 794			goto next_bh;
 795		if (block_start >= to)
 796			break;
 797
 798		zero_user(page, block_start, bh->b_size);
 799		set_buffer_uptodate(bh);
 800		mark_buffer_dirty(bh);
 801
 802next_bh:
 803		block_start = block_end;
 804		bh = bh->b_this_page;
 805	} while (bh != head);
 806
 807	return ret;
 808}
 809
 810#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
 811#define OCFS2_MAX_CTXT_PAGES	1
 812#else
 813#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
 814#endif
 815
 816#define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
 817
 818/*
 819 * Describe the state of a single cluster to be written to.
 820 */
 821struct ocfs2_write_cluster_desc {
 822	u32		c_cpos;
 823	u32		c_phys;
 824	/*
 825	 * Give this a unique field because c_phys eventually gets
 826	 * filled.
 827	 */
 828	unsigned	c_new;
 829	unsigned	c_unwritten;
 830	unsigned	c_needs_zero;
 831};
 832
 833struct ocfs2_write_ctxt {
 834	/* Logical cluster position / len of write */
 835	u32				w_cpos;
 836	u32				w_clen;
 837
 838	/* First cluster allocated in a nonsparse extend */
 839	u32				w_first_new_cpos;
 840
 841	struct ocfs2_write_cluster_desc	w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
 842
 843	/*
 844	 * This is true if page_size > cluster_size.
 845	 *
 846	 * It triggers a set of special cases during write which might
 847	 * have to deal with allocating writes to partial pages.
 848	 */
 849	unsigned int			w_large_pages;
 850
 851	/*
 852	 * Pages involved in this write.
 853	 *
 854	 * w_target_page is the page being written to by the user.
 855	 *
 856	 * w_pages is an array of pages which always contains
 857	 * w_target_page, and in the case of an allocating write with
 858	 * page_size < cluster size, it will contain zero'd and mapped
 859	 * pages adjacent to w_target_page which need to be written
 860	 * out in so that future reads from that region will get
 861	 * zero's.
 862	 */
 863	unsigned int			w_num_pages;
 864	struct page			*w_pages[OCFS2_MAX_CTXT_PAGES];
 865	struct page			*w_target_page;
 866
 867	/*
 868	 * ocfs2_write_end() uses this to know what the real range to
 869	 * write in the target should be.
 870	 */
 871	unsigned int			w_target_from;
 872	unsigned int			w_target_to;
 873
 874	/*
 875	 * We could use journal_current_handle() but this is cleaner,
 876	 * IMHO -Mark
 877	 */
 878	handle_t			*w_handle;
 879
 880	struct buffer_head		*w_di_bh;
 881
 882	struct ocfs2_cached_dealloc_ctxt w_dealloc;
 883};
 884
 885void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
 886{
 887	int i;
 888
 889	for(i = 0; i < num_pages; i++) {
 890		if (pages[i]) {
 891			unlock_page(pages[i]);
 892			mark_page_accessed(pages[i]);
 893			page_cache_release(pages[i]);
 894		}
 895	}
 896}
 897
 898static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
 899{
 900	ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
 901
 902	brelse(wc->w_di_bh);
 903	kfree(wc);
 904}
 905
 906static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 907				  struct ocfs2_super *osb, loff_t pos,
 908				  unsigned len, struct buffer_head *di_bh)
 909{
 910	u32 cend;
 911	struct ocfs2_write_ctxt *wc;
 912
 913	wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
 914	if (!wc)
 915		return -ENOMEM;
 916
 917	wc->w_cpos = pos >> osb->s_clustersize_bits;
 918	wc->w_first_new_cpos = UINT_MAX;
 919	cend = (pos + len - 1) >> osb->s_clustersize_bits;
 920	wc->w_clen = cend - wc->w_cpos + 1;
 921	get_bh(di_bh);
 922	wc->w_di_bh = di_bh;
 923
 924	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
 925		wc->w_large_pages = 1;
 926	else
 927		wc->w_large_pages = 0;
 928
 929	ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
 930
 931	*wcp = wc;
 932
 933	return 0;
 934}
 935
 936/*
 937 * If a page has any new buffers, zero them out here, and mark them uptodate
 938 * and dirty so they'll be written out (in order to prevent uninitialised
 939 * block data from leaking). And clear the new bit.
 940 */
 941static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 942{
 943	unsigned int block_start, block_end;
 944	struct buffer_head *head, *bh;
 945
 946	BUG_ON(!PageLocked(page));
 947	if (!page_has_buffers(page))
 948		return;
 949
 950	bh = head = page_buffers(page);
 951	block_start = 0;
 952	do {
 953		block_end = block_start + bh->b_size;
 954
 955		if (buffer_new(bh)) {
 956			if (block_end > from && block_start < to) {
 957				if (!PageUptodate(page)) {
 958					unsigned start, end;
 959
 960					start = max(from, block_start);
 961					end = min(to, block_end);
 962
 963					zero_user_segment(page, start, end);
 964					set_buffer_uptodate(bh);
 965				}
 966
 967				clear_buffer_new(bh);
 968				mark_buffer_dirty(bh);
 969			}
 970		}
 971
 972		block_start = block_end;
 973		bh = bh->b_this_page;
 974	} while (bh != head);
 975}
 976
 977/*
 978 * Only called when we have a failure during allocating write to write
 979 * zero's to the newly allocated region.
 980 */
 981static void ocfs2_write_failure(struct inode *inode,
 982				struct ocfs2_write_ctxt *wc,
 983				loff_t user_pos, unsigned user_len)
 984{
 985	int i;
 986	unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
 987		to = user_pos + user_len;
 988	struct page *tmppage;
 989
 990	ocfs2_zero_new_buffers(wc->w_target_page, from, to);
 991
 992	for(i = 0; i < wc->w_num_pages; i++) {
 993		tmppage = wc->w_pages[i];
 994
 995		if (page_has_buffers(tmppage)) {
 996			if (ocfs2_should_order_data(inode))
 997				ocfs2_jbd2_file_inode(wc->w_handle, inode);
 998
 999			block_commit_write(tmppage, from, to);
1000		}
1001	}
1002}
1003
1004static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
1005					struct ocfs2_write_ctxt *wc,
1006					struct page *page, u32 cpos,
1007					loff_t user_pos, unsigned user_len,
1008					int new)
1009{
1010	int ret;
1011	unsigned int map_from = 0, map_to = 0;
1012	unsigned int cluster_start, cluster_end;
1013	unsigned int user_data_from = 0, user_data_to = 0;
1014
1015	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
1016					&cluster_start, &cluster_end);
1017
1018	/* treat the write as new if the a hole/lseek spanned across
1019	 * the page boundary.
1020	 */
1021	new = new | ((i_size_read(inode) <= page_offset(page)) &&
1022			(page_offset(page) <= user_pos));
1023
1024	if (page == wc->w_target_page) {
1025		map_from = user_pos & (PAGE_CACHE_SIZE - 1);
1026		map_to = map_from + user_len;
1027
1028		if (new)
1029			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
1030						    cluster_start, cluster_end,
1031						    new);
1032		else
1033			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
1034						    map_from, map_to, new);
1035		if (ret) {
1036			mlog_errno(ret);
1037			goto out;
1038		}
1039
1040		user_data_from = map_from;
1041		user_data_to = map_to;
1042		if (new) {
1043			map_from = cluster_start;
1044			map_to = cluster_end;
1045		}
1046	} else {
1047		/*
1048		 * If we haven't allocated the new page yet, we
1049		 * shouldn't be writing it out without copying user
1050		 * data. This is likely a math error from the caller.
1051		 */
1052		BUG_ON(!new);
1053
1054		map_from = cluster_start;
1055		map_to = cluster_end;
1056
1057		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
1058					    cluster_start, cluster_end, new);
1059		if (ret) {
1060			mlog_errno(ret);
1061			goto out;
1062		}
1063	}
1064
1065	/*
1066	 * Parts of newly allocated pages need to be zero'd.
1067	 *
1068	 * Above, we have also rewritten 'to' and 'from' - as far as
1069	 * the rest of the function is concerned, the entire cluster
1070	 * range inside of a page needs to be written.
1071	 *
1072	 * We can skip this if the page is up to date - it's already
1073	 * been zero'd from being read in as a hole.
1074	 */
1075	if (new && !PageUptodate(page))
1076		ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1077					 cpos, user_data_from, user_data_to);
1078
1079	flush_dcache_page(page);
1080
1081out:
1082	return ret;
1083}
1084
1085/*
1086 * This function will only grab one clusters worth of pages.
1087 */
1088static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1089				      struct ocfs2_write_ctxt *wc,
1090				      u32 cpos, loff_t user_pos,
1091				      unsigned user_len, int new,
1092				      struct page *mmap_page)
1093{
1094	int ret = 0, i;
1095	unsigned long start, target_index, end_index, index;
1096	struct inode *inode = mapping->host;
1097	loff_t last_byte;
1098
1099	target_index = user_pos >> PAGE_CACHE_SHIFT;
1100
1101	/*
1102	 * Figure out how many pages we'll be manipulating here. For
1103	 * non allocating write, we just change the one
1104	 * page. Otherwise, we'll need a whole clusters worth.  If we're
1105	 * writing past i_size, we only need enough pages to cover the
1106	 * last page of the write.
1107	 */
1108	if (new) {
1109		wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
1110		start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
1111		/*
1112		 * We need the index *past* the last page we could possibly
1113		 * touch.  This is the page past the end of the write or
1114		 * i_size, whichever is greater.
1115		 */
1116		last_byte = max(user_pos + user_len, i_size_read(inode));
1117		BUG_ON(last_byte < 1);
1118		end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
1119		if ((start + wc->w_num_pages) > end_index)
1120			wc->w_num_pages = end_index - start;
1121	} else {
1122		wc->w_num_pages = 1;
1123		start = target_index;
1124	}
1125
1126	for(i = 0; i < wc->w_num_pages; i++) {
1127		index = start + i;
1128
1129		if (index == target_index && mmap_page) {
1130			/*
1131			 * ocfs2_pagemkwrite() is a little different
1132			 * and wants us to directly use the page
1133			 * passed in.
1134			 */
1135			lock_page(mmap_page);
1136
1137			if (mmap_page->mapping != mapping) {
1138				unlock_page(mmap_page);
1139				/*
1140				 * Sanity check - the locking in
1141				 * ocfs2_pagemkwrite() should ensure
1142				 * that this code doesn't trigger.
1143				 */
1144				ret = -EINVAL;
1145				mlog_errno(ret);
1146				goto out;
1147			}
1148
1149			page_cache_get(mmap_page);
1150			wc->w_pages[i] = mmap_page;
1151		} else {
1152			wc->w_pages[i] = find_or_create_page(mapping, index,
1153							     GFP_NOFS);
1154			if (!wc->w_pages[i]) {
1155				ret = -ENOMEM;
1156				mlog_errno(ret);
1157				goto out;
1158			}
1159		}
1160
1161		if (index == target_index)
1162			wc->w_target_page = wc->w_pages[i];
1163	}
1164out:
1165	return ret;
1166}
1167
1168/*
1169 * Prepare a single cluster for write one cluster into the file.
1170 */
1171static int ocfs2_write_cluster(struct address_space *mapping,
1172			       u32 phys, unsigned int unwritten,
1173			       unsigned int should_zero,
1174			       struct ocfs2_alloc_context *data_ac,
1175			       struct ocfs2_alloc_context *meta_ac,
1176			       struct ocfs2_write_ctxt *wc, u32 cpos,
1177			       loff_t user_pos, unsigned user_len)
1178{
1179	int ret, i, new;
1180	u64 v_blkno, p_blkno;
1181	struct inode *inode = mapping->host;
1182	struct ocfs2_extent_tree et;
1183
1184	new = phys == 0 ? 1 : 0;
1185	if (new) {
1186		u32 tmp_pos;
1187
1188		/*
1189		 * This is safe to call with the page locks - it won't take
1190		 * any additional semaphores or cluster locks.
1191		 */
1192		tmp_pos = cpos;
1193		ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
1194					   &tmp_pos, 1, 0, wc->w_di_bh,
1195					   wc->w_handle, data_ac,
1196					   meta_ac, NULL);
1197		/*
1198		 * This shouldn't happen because we must have already
1199		 * calculated the correct meta data allocation required. The
1200		 * internal tree allocation code should know how to increase
1201		 * transaction credits itself.
1202		 *
1203		 * If need be, we could handle -EAGAIN for a
1204		 * RESTART_TRANS here.
1205		 */
1206		mlog_bug_on_msg(ret == -EAGAIN,
1207				"Inode %llu: EAGAIN return during allocation.\n",
1208				(unsigned long long)OCFS2_I(inode)->ip_blkno);
1209		if (ret < 0) {
1210			mlog_errno(ret);
1211			goto out;
1212		}
1213	} else if (unwritten) {
1214		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1215					      wc->w_di_bh);
1216		ret = ocfs2_mark_extent_written(inode, &et,
1217						wc->w_handle, cpos, 1, phys,
1218						meta_ac, &wc->w_dealloc);
1219		if (ret < 0) {
1220			mlog_errno(ret);
1221			goto out;
1222		}
1223	}
1224
1225	if (should_zero)
1226		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
1227	else
1228		v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
1229
1230	/*
1231	 * The only reason this should fail is due to an inability to
1232	 * find the extent added.
1233	 */
1234	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
1235					  NULL);
1236	if (ret < 0) {
1237		ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, "
1238			    "at logical block %llu",
1239			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
1240			    (unsigned long long)v_blkno);
1241		goto out;
1242	}
1243
1244	BUG_ON(p_blkno == 0);
1245
1246	for(i = 0; i < wc->w_num_pages; i++) {
1247		int tmpret;
1248
1249		tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
1250						      wc->w_pages[i], cpos,
1251						      user_pos, user_len,
1252						      should_zero);
1253		if (tmpret) {
1254			mlog_errno(tmpret);
1255			if (ret == 0)
1256				ret = tmpret;
1257		}
1258	}
1259
1260	/*
1261	 * We only have cleanup to do in case of allocating write.
1262	 */
1263	if (ret && new)
1264		ocfs2_write_failure(inode, wc, user_pos, user_len);
1265
1266out:
1267
1268	return ret;
1269}
1270
1271static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1272				       struct ocfs2_alloc_context *data_ac,
1273				       struct ocfs2_alloc_context *meta_ac,
1274				       struct ocfs2_write_ctxt *wc,
1275				       loff_t pos, unsigned len)
1276{
1277	int ret, i;
1278	loff_t cluster_off;
1279	unsigned int local_len = len;
1280	struct ocfs2_write_cluster_desc *desc;
1281	struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb);
1282
1283	for (i = 0; i < wc->w_clen; i++) {
1284		desc = &wc->w_desc[i];
1285
1286		/*
1287		 * We have to make sure that the total write passed in
1288		 * doesn't extend past a single cluster.
1289		 */
1290		local_len = len;
1291		cluster_off = pos & (osb->s_clustersize - 1);
1292		if ((cluster_off + local_len) > osb->s_clustersize)
1293			local_len = osb->s_clustersize - cluster_off;
1294
1295		ret = ocfs2_write_cluster(mapping, desc->c_phys,
1296					  desc->c_unwritten,
1297					  desc->c_needs_zero,
1298					  data_ac, meta_ac,
1299					  wc, desc->c_cpos, pos, local_len);
1300		if (ret) {
1301			mlog_errno(ret);
1302			goto out;
1303		}
1304
1305		len -= local_len;
1306		pos += local_len;
1307	}
1308
1309	ret = 0;
1310out:
1311	return ret;
1312}
1313
1314/*
1315 * ocfs2_write_end() wants to know which parts of the target page it
1316 * should complete the write on. It's easiest to compute them ahead of
1317 * time when a more complete view of the write is available.
1318 */
1319static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1320					struct ocfs2_write_ctxt *wc,
1321					loff_t pos, unsigned len, int alloc)
1322{
1323	struct ocfs2_write_cluster_desc *desc;
1324
1325	wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
1326	wc->w_target_to = wc->w_target_from + len;
1327
1328	if (alloc == 0)
1329		return;
1330
1331	/*
1332	 * Allocating write - we may have different boundaries based
1333	 * on page size and cluster size.
1334	 *
1335	 * NOTE: We can no longer compute one value from the other as
1336	 * the actual write length and user provided length may be
1337	 * different.
1338	 */
1339
1340	if (wc->w_large_pages) {
1341		/*
1342		 * We only care about the 1st and last cluster within
1343		 * our range and whether they should be zero'd or not. Either
1344		 * value may be extended out to the start/end of a
1345		 * newly allocated cluster.
1346		 */
1347		desc = &wc->w_desc[0];
1348		if (desc->c_needs_zero)
1349			ocfs2_figure_cluster_boundaries(osb,
1350							desc->c_cpos,
1351							&wc->w_target_from,
1352							NULL);
1353
1354		desc = &wc->w_desc[wc->w_clen - 1];
1355		if (desc->c_needs_zero)
1356			ocfs2_figure_cluster_boundaries(osb,
1357							desc->c_cpos,
1358							NULL,
1359							&wc->w_target_to);
1360	} else {
1361		wc->w_target_from = 0;
1362		wc->w_target_to = PAGE_CACHE_SIZE;
1363	}
1364}
1365
1366/*
1367 * Populate each single-cluster write descriptor in the write context
1368 * with information about the i/o to be done.
1369 *
1370 * Returns the number of clusters that will have to be allocated, as
1371 * well as a worst case estimate of the number of extent records that
1372 * would have to be created during a write to an unwritten region.
1373 */
1374static int ocfs2_populate_write_desc(struct inode *inode,
1375				     struct ocfs2_write_ctxt *wc,
1376				     unsigned int *clusters_to_alloc,
1377				     unsigned int *extents_to_split)
1378{
1379	int ret;
1380	struct ocfs2_write_cluster_desc *desc;
1381	unsigned int num_clusters = 0;
1382	unsigned int ext_flags = 0;
1383	u32 phys = 0;
1384	int i;
1385
1386	*clusters_to_alloc = 0;
1387	*extents_to_split = 0;
1388
1389	for (i = 0; i < wc->w_clen; i++) {
1390		desc = &wc->w_desc[i];
1391		desc->c_cpos = wc->w_cpos + i;
1392
1393		if (num_clusters == 0) {
1394			/*
1395			 * Need to look up the next extent record.
1396			 */
1397			ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
1398						 &num_clusters, &ext_flags);
1399			if (ret) {
1400				mlog_errno(ret);
1401				goto out;
1402			}
1403
1404			/* We should already CoW the refcountd extent. */
1405			BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
1406
1407			/*
1408			 * Assume worst case - that we're writing in
1409			 * the middle of the extent.
1410			 *
1411			 * We can assume that the write proceeds from
1412			 * left to right, in which case the extent
1413			 * insert code is smart enough to coalesce the
1414			 * next splits into the previous records created.
1415			 */
1416			if (ext_flags & OCFS2_EXT_UNWRITTEN)
1417				*extents_to_split = *extents_to_split + 2;
1418		} else if (phys) {
1419			/*
1420			 * Only increment phys if it doesn't describe
1421			 * a hole.
1422			 */
1423			phys++;
1424		}
1425
1426		/*
1427		 * If w_first_new_cpos is < UINT_MAX, we have a non-sparse
1428		 * file that got extended.  w_first_new_cpos tells us
1429		 * where the newly allocated clusters are so we can
1430		 * zero them.
1431		 */
1432		if (desc->c_cpos >= wc->w_first_new_cpos) {
1433			BUG_ON(phys == 0);
1434			desc->c_needs_zero = 1;
1435		}
1436
1437		desc->c_phys = phys;
1438		if (phys == 0) {
1439			desc->c_new = 1;
1440			desc->c_needs_zero = 1;
1441			*clusters_to_alloc = *clusters_to_alloc + 1;
1442		}
1443
1444		if (ext_flags & OCFS2_EXT_UNWRITTEN) {
1445			desc->c_unwritten = 1;
1446			desc->c_needs_zero = 1;
1447		}
1448
1449		num_clusters--;
1450	}
1451
1452	ret = 0;
1453out:
1454	return ret;
1455}
1456
1457static int ocfs2_write_begin_inline(struct address_space *mapping,
1458				    struct inode *inode,
1459				    struct ocfs2_write_ctxt *wc)
1460{
1461	int ret;
1462	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1463	struct page *page;
1464	handle_t *handle;
1465	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1466
1467	page = find_or_create_page(mapping, 0, GFP_NOFS);
1468	if (!page) {
1469		ret = -ENOMEM;
1470		mlog_errno(ret);
1471		goto out;
1472	}
1473	/*
1474	 * If we don't set w_num_pages then this page won't get unlocked
1475	 * and freed on cleanup of the write context.
1476	 */
1477	wc->w_pages[0] = wc->w_target_page = page;
1478	wc->w_num_pages = 1;
1479
1480	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1481	if (IS_ERR(handle)) {
1482		ret = PTR_ERR(handle);
1483		mlog_errno(ret);
1484		goto out;
1485	}
1486
1487	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
1488				      OCFS2_JOURNAL_ACCESS_WRITE);
1489	if (ret) {
1490		ocfs2_commit_trans(osb, handle);
1491
1492		mlog_errno(ret);
1493		goto out;
1494	}
1495
1496	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
1497		ocfs2_set_inode_data_inline(inode, di);
1498
1499	if (!PageUptodate(page)) {
1500		ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
1501		if (ret) {
1502			ocfs2_commit_trans(osb, handle);
1503
1504			goto out;
1505		}
1506	}
1507
1508	wc->w_handle = handle;
1509out:
1510	return ret;
1511}
1512
1513int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
1514{
1515	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1516
1517	if (new_size <= le16_to_cpu(di->id2.i_data.id_count))
1518		return 1;
1519	return 0;
1520}
1521
1522static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
1523					  struct inode *inode, loff_t pos,
1524					  unsigned len, struct page *mmap_page,
1525					  struct ocfs2_write_ctxt *wc)
1526{
1527	int ret, written = 0;
1528	loff_t end = pos + len;
1529	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1530	struct ocfs2_dinode *di = NULL;
1531
1532	trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno,
1533					     len, (unsigned long long)pos,
1534					     oi->ip_dyn_features);
1535
1536	/*
1537	 * Handle inodes which already have inline data 1st.
1538	 */
1539	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1540		if (mmap_page == NULL &&
1541		    ocfs2_size_fits_inline_data(wc->w_di_bh, end))
1542			goto do_inline_write;
1543
1544		/*
1545		 * The write won't fit - we have to give this inode an
1546		 * inline extent list now.
1547		 */
1548		ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh);
1549		if (ret)
1550			mlog_errno(ret);
1551		goto out;
1552	}
1553
1554	/*
1555	 * Check whether the inode can accept inline data.
1556	 */
1557	if (oi->ip_clusters != 0 || i_size_read(inode) != 0)
1558		return 0;
1559
1560	/*
1561	 * Check whether the write can fit.
1562	 */
1563	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1564	if (mmap_page ||
1565	    end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di))
1566		return 0;
1567
1568do_inline_write:
1569	ret = ocfs2_write_begin_inline(mapping, inode, wc);
1570	if (ret) {
1571		mlog_errno(ret);
1572		goto out;
1573	}
1574
1575	/*
1576	 * This signals to the caller that the data can be written
1577	 * inline.
1578	 */
1579	written = 1;
1580out:
1581	return written ? written : ret;
1582}
1583
1584/*
1585 * This function only does anything for file systems which can't
1586 * handle sparse files.
1587 *
1588 * What we want to do here is fill in any hole between the current end
1589 * of allocation and the end of our write. That way the rest of the
1590 * write path can treat it as an non-allocating write, which has no
1591 * special case code for sparse/nonsparse files.
1592 */
1593static int ocfs2_expand_nonsparse_inode(struct inode *inode,
1594					struct buffer_head *di_bh,
1595					loff_t pos, unsigned len,
1596					struct ocfs2_write_ctxt *wc)
1597{
1598	int ret;
1599	loff_t newsize = pos + len;
1600
1601	BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
1602
1603	if (newsize <= i_size_read(inode))
1604		return 0;
1605
1606	ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
1607	if (ret)
1608		mlog_errno(ret);
1609
1610	wc->w_first_new_cpos =
1611		ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
1612
1613	return ret;
1614}
1615
1616static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
1617			   loff_t pos)
1618{
1619	int ret = 0;
1620
1621	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
1622	if (pos > i_size_read(inode))
1623		ret = ocfs2_zero_extend(inode, di_bh, pos);
1624
1625	return ret;
1626}
1627
1628/*
1629 * Try to flush truncate logs if we can free enough clusters from it.
1630 * As for return value, "< 0" means error, "0" no space and "1" means
1631 * we have freed enough spaces and let the caller try to allocate again.
1632 */
1633static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
1634					  unsigned int needed)
1635{
1636	tid_t target;
1637	int ret = 0;
1638	unsigned int truncated_clusters;
1639
1640	mutex_lock(&osb->osb_tl_inode->i_mutex);
1641	truncated_clusters = osb->truncated_clusters;
1642	mutex_unlock(&osb->osb_tl_inode->i_mutex);
1643
1644	/*
1645	 * Check whether we can succeed in allocating if we free
1646	 * the truncate log.
1647	 */
1648	if (truncated_clusters < needed)
1649		goto out;
1650
1651	ret = ocfs2_flush_truncate_log(osb);
1652	if (ret) {
1653		mlog_errno(ret);
1654		goto out;
1655	}
1656
1657	if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
1658		jbd2_log_wait_commit(osb->journal->j_journal, target);
1659		ret = 1;
1660	}
1661out:
1662	return ret;
1663}
1664
1665int ocfs2_write_begin_nolock(struct file *filp,
1666			     struct address_space *mapping,
1667			     loff_t pos, unsigned len, unsigned flags,
1668			     struct page **pagep, void **fsdata,
1669			     struct buffer_head *di_bh, struct page *mmap_page)
1670{
1671	int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
1672	unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
1673	struct ocfs2_write_ctxt *wc;
1674	struct inode *inode = mapping->host;
1675	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1676	struct ocfs2_dinode *di;
1677	struct ocfs2_alloc_context *data_ac = NULL;
1678	struct ocfs2_alloc_context *meta_ac = NULL;
1679	handle_t *handle;
1680	struct ocfs2_extent_tree et;
1681	int try_free = 1, ret1;
1682
1683try_again:
1684	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1685	if (ret) {
1686		mlog_errno(ret);
1687		return ret;
1688	}
1689
1690	if (ocfs2_supports_inline_data(osb)) {
1691		ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
1692						     mmap_page, wc);
1693		if (ret == 1) {
1694			ret = 0;
1695			goto success;
1696		}
1697		if (ret < 0) {
1698			mlog_errno(ret);
1699			goto out;
1700		}
1701	}
1702
1703	if (ocfs2_sparse_alloc(osb))
1704		ret = ocfs2_zero_tail(inode, di_bh, pos);
1705	else
1706		ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
1707						   wc);
1708	if (ret) {
1709		mlog_errno(ret);
1710		goto out;
1711	}
1712
1713	ret = ocfs2_check_range_for_refcount(inode, pos, len);
1714	if (ret < 0) {
1715		mlog_errno(ret);
1716		goto out;
1717	} else if (ret == 1) {
1718		clusters_need = wc->w_clen;
1719		ret = ocfs2_refcount_cow(inode, filp, di_bh,
1720					 wc->w_cpos, wc->w_clen, UINT_MAX);
1721		if (ret) {
1722			mlog_errno(ret);
1723			goto out;
1724		}
1725	}
1726
1727	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1728					&extents_to_split);
1729	if (ret) {
1730		mlog_errno(ret);
1731		goto out;
1732	}
1733	clusters_need += clusters_to_alloc;
1734
1735	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1736
1737	trace_ocfs2_write_begin_nolock(
1738			(unsigned long long)OCFS2_I(inode)->ip_blkno,
1739			(long long)i_size_read(inode),
1740			le32_to_cpu(di->i_clusters),
1741			pos, len, flags, mmap_page,
1742			clusters_to_alloc, extents_to_split);
1743
1744	/*
1745	 * We set w_target_from, w_target_to here so that
1746	 * ocfs2_write_end() knows which range in the target page to
1747	 * write out. An allocation requires that we write the entire
1748	 * cluster range.
1749	 */
1750	if (clusters_to_alloc || extents_to_split) {
1751		/*
1752		 * XXX: We are stretching the limits of
1753		 * ocfs2_lock_allocators(). It greatly over-estimates
1754		 * the work to be done.
1755		 */
1756		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1757					      wc->w_di_bh);
1758		ret = ocfs2_lock_allocators(inode, &et,
1759					    clusters_to_alloc, extents_to_split,
1760					    &data_ac, &meta_ac);
1761		if (ret) {
1762			mlog_errno(ret);
1763			goto out;
1764		}
1765
1766		if (data_ac)
1767			data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
1768
1769		credits = ocfs2_calc_extend_credits(inode->i_sb,
1770						    &di->id2.i_list,
1771						    clusters_to_alloc);
1772
1773	}
1774
1775	/*
1776	 * We have to zero sparse allocated clusters, unwritten extent clusters,
1777	 * and non-sparse clusters we just extended.  For non-sparse writes,
1778	 * we know zeros will only be needed in the first and/or last cluster.
1779	 */
1780	if (clusters_to_alloc || extents_to_split ||
1781	    (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
1782			    wc->w_desc[wc->w_clen - 1].c_needs_zero)))
1783		cluster_of_pages = 1;
1784	else
1785		cluster_of_pages = 0;
1786
1787	ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);
1788
1789	handle = ocfs2_start_trans(osb, credits);
1790	if (IS_ERR(handle)) {
1791		ret = PTR_ERR(handle);
1792		mlog_errno(ret);
1793		goto out;
1794	}
1795
1796	wc->w_handle = handle;
1797
1798	if (clusters_to_alloc) {
1799		ret = dquot_alloc_space_nodirty(inode,
1800			ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1801		if (ret)
1802			goto out_commit;
1803	}
1804	/*
1805	 * We don't want this to fail in ocfs2_write_end(), so do it
1806	 * here.
1807	 */
1808	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
1809				      OCFS2_JOURNAL_ACCESS_WRITE);
1810	if (ret) {
1811		mlog_errno(ret);
1812		goto out_quota;
1813	}
1814
1815	/*
1816	 * Fill our page array first. That way we've grabbed enough so
1817	 * that we can zero and flush if we error after adding the
1818	 * extent.
1819	 */
1820	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
1821					 cluster_of_pages, mmap_page);
1822	if (ret) {
1823		mlog_errno(ret);
1824		goto out_quota;
1825	}
1826
1827	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1828					  len);
1829	if (ret) {
1830		mlog_errno(ret);
1831		goto out_quota;
1832	}
1833
1834	if (data_ac)
1835		ocfs2_free_alloc_context(data_ac);
1836	if (meta_ac)
1837		ocfs2_free_alloc_context(meta_ac);
1838
1839success:
1840	*pagep = wc->w_target_page;
1841	*fsdata = wc;
1842	return 0;
1843out_quota:
1844	if (clusters_to_alloc)
1845		dquot_free_space(inode,
1846			  ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1847out_commit:
1848	ocfs2_commit_trans(osb, handle);
1849
1850out:
1851	ocfs2_free_write_ctxt(wc);
1852
1853	if (data_ac)
1854		ocfs2_free_alloc_context(data_ac);
1855	if (meta_ac)
1856		ocfs2_free_alloc_context(meta_ac);
1857
1858	if (ret == -ENOSPC && try_free) {
1859		/*
1860		 * Try to free some truncate log so that we can have enough
1861		 * clusters to allocate.
1862		 */
1863		try_free = 0;
1864
1865		ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
1866		if (ret1 == 1)
1867			goto try_again;
1868
1869		if (ret1 < 0)
1870			mlog_errno(ret1);
1871	}
1872
1873	return ret;
1874}
1875
1876static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
1877			     loff_t pos, unsigned len, unsigned flags,
1878			     struct page **pagep, void **fsdata)
1879{
1880	int ret;
1881	struct buffer_head *di_bh = NULL;
1882	struct inode *inode = mapping->host;
1883
1884	ret = ocfs2_inode_lock(inode, &di_bh, 1);
1885	if (ret) {
1886		mlog_errno(ret);
1887		return ret;
1888	}
1889
1890	/*
1891	 * Take alloc sem here to prevent concurrent lookups. That way
1892	 * the mapping, zeroing and tree manipulation within
1893	 * ocfs2_write() will be safe against ->readpage(). This
1894	 * should also serve to lock out allocation from a shared
1895	 * writeable region.
1896	 */
1897	down_write(&OCFS2_I(inode)->ip_alloc_sem);
1898
1899	ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
1900				       fsdata, di_bh, NULL);
1901	if (ret) {
1902		mlog_errno(ret);
1903		goto out_fail;
1904	}
1905
1906	brelse(di_bh);
1907
1908	return 0;
1909
1910out_fail:
1911	up_write(&OCFS2_I(inode)->ip_alloc_sem);
1912
1913	brelse(di_bh);
1914	ocfs2_inode_unlock(inode, 1);
1915
1916	return ret;
1917}
1918
1919static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
1920				   unsigned len, unsigned *copied,
1921				   struct ocfs2_dinode *di,
1922				   struct ocfs2_write_ctxt *wc)
1923{
1924	void *kaddr;
1925
1926	if (unlikely(*copied < len)) {
1927		if (!PageUptodate(wc->w_target_page)) {
1928			*copied = 0;
1929			return;
1930		}
1931	}
1932
1933	kaddr = kmap_atomic(wc->w_target_page, KM_USER0);
1934	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
1935	kunmap_atomic(kaddr, KM_USER0);
1936
1937	trace_ocfs2_write_end_inline(
1938	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1939	     (unsigned long long)pos, *copied,
1940	     le16_to_cpu(di->id2.i_data.id_count),
1941	     le16_to_cpu(di->i_dyn_features));
1942}
1943
1944int ocfs2_write_end_nolock(struct address_space *mapping,
1945			   loff_t pos, unsigned len, unsigned copied,
1946			   struct page *page, void *fsdata)
1947{
1948	int i;
1949	unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
1950	struct inode *inode = mapping->host;
1951	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1952	struct ocfs2_write_ctxt *wc = fsdata;
1953	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1954	handle_t *handle = wc->w_handle;
1955	struct page *tmppage;
1956
1957	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1958		ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
1959		goto out_write_size;
1960	}
1961
1962	if (unlikely(copied < len)) {
1963		if (!PageUptodate(wc->w_target_page))
1964			copied = 0;
1965
1966		ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
1967				       start+len);
1968	}
1969	flush_dcache_page(wc->w_target_page);
1970
1971	for(i = 0; i < wc->w_num_pages; i++) {
1972		tmppage = wc->w_pages[i];
1973
1974		if (tmppage == wc->w_target_page) {
1975			from = wc->w_target_from;
1976			to = wc->w_target_to;
1977
1978			BUG_ON(from > PAGE_CACHE_SIZE ||
1979			       to > PAGE_CACHE_SIZE ||
1980			       to < from);
1981		} else {
1982			/*
1983			 * Pages adjacent to the target (if any) imply
1984			 * a hole-filling write in which case we want
1985			 * to flush their entire range.
1986			 */
1987			from = 0;
1988			to = PAGE_CACHE_SIZE;
1989		}
1990
1991		if (page_has_buffers(tmppage)) {
1992			if (ocfs2_should_order_data(inode))
1993				ocfs2_jbd2_file_inode(wc->w_handle, inode);
1994			block_commit_write(tmppage, from, to);
1995		}
1996	}
1997
1998out_write_size:
1999	pos += copied;
2000	if (pos > inode->i_size) {
2001		i_size_write(inode, pos);
2002		mark_inode_dirty(inode);
2003	}
2004	inode->i_blocks = ocfs2_inode_sector_count(inode);
2005	di->i_size = cpu_to_le64((u64)i_size_read(inode));
2006	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2007	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
2008	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
2009	ocfs2_journal_dirty(handle, wc->w_di_bh);
2010
2011	ocfs2_commit_trans(osb, handle);
2012
2013	ocfs2_run_deallocs(osb, &wc->w_dealloc);
2014
2015	ocfs2_free_write_ctxt(wc);
2016
2017	return copied;
2018}
2019
2020static int ocfs2_write_end(struct file *file, struct address_space *mapping,
2021			   loff_t pos, unsigned len, unsigned copied,
2022			   struct page *page, void *fsdata)
2023{
2024	int ret;
2025	struct inode *inode = mapping->host;
2026
2027	ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
2028
2029	up_write(&OCFS2_I(inode)->ip_alloc_sem);
2030	ocfs2_inode_unlock(inode, 1);
2031
2032	return ret;
2033}
2034
2035const struct address_space_operations ocfs2_aops = {
2036	.readpage		= ocfs2_readpage,
2037	.readpages		= ocfs2_readpages,
2038	.writepage		= ocfs2_writepage,
2039	.write_begin		= ocfs2_write_begin,
2040	.write_end		= ocfs2_write_end,
2041	.bmap			= ocfs2_bmap,
2042	.direct_IO		= ocfs2_direct_IO,
2043	.invalidatepage		= ocfs2_invalidatepage,
2044	.releasepage		= ocfs2_releasepage,
2045	.migratepage		= buffer_migrate_page,
2046	.is_partially_uptodate	= block_is_partially_uptodate,
2047	.error_remove_page	= generic_error_remove_page,
2048};