/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

/*
 * structure owned by writepages passed to individual writepage calls
 */
struct xfs_writepage_ctx {
	struct xfs_bmbt_irec imap;
	bool imap_valid;
	unsigned int io_type;
	struct xfs_ioend *ioend;
	sector_t last_block;
};

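/*
 * Walk the buffers attached to a page and report whether any of them are in
 * the delalloc or unwritten state.
 */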
void
xfs_count_page_state(
	struct page *page,
	int *delalloc,
	int *unwritten)
{
	struct buffer_head *bh, *head;

	*delalloc = *unwritten = 0;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh))
			(*unwritten) = 1;
		else if (buffer_delay(bh))
			(*delalloc) = 1;
	} while ((bh = bh->b_this_page) != head);
}

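/*
 * Return the block device that backs this inode's data: the realtime device
 * for realtime inodes, the main data device otherwise.
 */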
struct block_device *
xfs_find_bdev_for_inode(
	struct inode *inode)
{
	struct xfs_inode *ip = XFS_I(inode);
	struct xfs_mount *mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}

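/*
 * Same as above, but return the DAX device of the backing target instead of
 * the block device.
 */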
struct dax_device *
xfs_find_daxdev_for_inode(
	struct inode *inode)
{
	struct xfs_inode *ip = XFS_I(inode);
	struct xfs_mount *mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_daxdev;
	else
		return mp->m_ddev_targp->bt_daxdev;
}

/*
 * We're now finished for good with this page. Update the page state via the
 * associated buffer_heads, paying attention to the start and end offsets that
 * we need to process on the page.
 *
 * Note that we open code the action in end_buffer_async_write here so that we
 * only have to iterate over the buffers attached to the page once. This is not
 * only more efficient, but also ensures that we only call end_page_writeback
 * at the end of the iteration, and thus avoids the pitfall of having the page
 * and buffers potentially freed after every call to end_buffer_async_write.
 */
static void
xfs_finish_page_writeback(
	struct inode *inode,
	struct bio_vec *bvec,
	int error)
{
	struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head;
	bool busy = false;
	unsigned int off = 0;
	unsigned long flags;

	ASSERT(bvec->bv_offset < PAGE_SIZE);
	ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
	ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
	ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);

	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
	do {
		if (off >= bvec->bv_offset &&
		    off < bvec->bv_offset + bvec->bv_len) {
			ASSERT(buffer_async_write(bh));
			ASSERT(bh->b_end_io == NULL);

			if (error) {
				mark_buffer_write_io_error(bh);
				clear_buffer_uptodate(bh);
				SetPageError(bvec->bv_page);
			} else {
				set_buffer_uptodate(bh);
			}
			clear_buffer_async_write(bh);
			unlock_buffer(bh);
		} else if (buffer_async_write(bh)) {
			ASSERT(buffer_locked(bh));
			busy = true;
		}
		off += bh->b_size;
	} while ((bh = bh->b_this_page) != head);
	bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
	local_irq_restore(flags);

	if (!busy)
		end_page_writeback(bvec->bv_page);
}

/*
 * We're now finished for good with this ioend structure. Update the page
 * state, release holds on bios, and finally free up memory. Do not use the
 * ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	struct xfs_ioend *ioend,
	int error)
{
	struct inode *inode = ioend->io_inode;
	struct bio *bio = &ioend->io_inline_bio;
	struct bio *last = ioend->io_bio, *next;
	u64 start = bio->bi_iter.bi_sector;
	bool quiet = bio_flagged(bio, BIO_QUIET);

	for (bio = &ioend->io_inline_bio; bio; bio = next) {
		struct bio_vec *bvec;
		int i;

		/*
		 * For the last bio, bi_private points to the ioend, so we
		 * need to explicitly end the iteration here.
		 */
		if (bio == last)
			next = NULL;
		else
			next = bio->bi_private;

		/* walk each page on bio, ending page IO on them */
		bio_for_each_segment_all(bvec, bio, i)
			xfs_finish_page_writeback(inode, bvec, error);

		bio_put(bio);
	}

	if (unlikely(error && !quiet)) {
		xfs_err_ratelimited(XFS_I(inode)->i_mount,
			"writeback error on sector %llu", start);
	}
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}

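/*
 * Allocate a transaction during I/O submission and attach it to the ioend so
 * that the completion side can update the on-disk file size. Freeze
 * protection and the NOFS allocation state are handed over to the completion
 * thread along with it.
 */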
STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend *ioend)
{
	struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans *tp;
	int error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
	if (error)
		return error;

	ioend->io_append_trans = tp;

	/*
	 * We may pass freeze protection with a transaction. So tell lockdep
	 * we released it.
	 */
	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
__xfs_setfilesize(
	struct xfs_inode *ip,
	struct xfs_trans *tp,
	xfs_off_t offset,
	size_t size)
{
	xfs_fsize_t isize;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

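/*
 * Allocate a size update transaction and use it to log the new on-disk inode
 * size.
 */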
int
xfs_setfilesize(
	struct xfs_inode *ip,
	xfs_off_t offset,
	size_t size)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_trans *tp;
	int error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
	if (error)
		return error;

	return __xfs_setfilesize(ip, tp, offset, size);
}

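/*
 * Finish the deferred size update for an ioend using the transaction that
 * was allocated at submission time. The transaction is cancelled rather than
 * committed if the I/O failed.
 */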
STATIC int
xfs_setfilesize_ioend(
	struct xfs_ioend *ioend,
	int error)
{
	struct xfs_inode *ip = XFS_I(ioend->io_inode);
	struct xfs_trans *tp = ioend->io_append_trans;

	/*
	 * The transaction may have been allocated in the I/O submission thread,
	 * thus we need to mark ourselves as being in a transaction manually.
	 * Similarly for freeze protection.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

	/* we abort the update if there was an IO error */
	if (error) {
		xfs_trans_cancel(tp);
		return error;
	}

	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct *work)
{
	struct xfs_ioend *ioend =
		container_of(work, struct xfs_ioend, io_work);
	struct xfs_inode *ip = XFS_I(ioend->io_inode);
	xfs_off_t offset = ioend->io_offset;
	size_t size = ioend->io_size;
	int error;

	/*
	 * Just clean up the in-memory structures if the fs has been shut down.
	 */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		error = -EIO;
		goto done;
	}

	/*
	 * Clean up any COW blocks on an I/O error.
	 */
	error = blk_status_to_errno(ioend->io_bio->bi_status);
	if (unlikely(error)) {
		switch (ioend->io_type) {
		case XFS_IO_COW:
			xfs_reflink_cancel_cow_range(ip, offset, size, true);
			break;
		}

		goto done;
	}

	/*
	 * Success: commit the COW or unwritten blocks if needed.
	 */
	switch (ioend->io_type) {
	case XFS_IO_COW:
		error = xfs_reflink_end_cow(ip, offset, size);
		break;
	case XFS_IO_UNWRITTEN:
		/* writeback should never update isize */
		error = xfs_iomap_write_unwritten(ip, offset, size, false);
		break;
	default:
		ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
		break;
	}

done:
	if (ioend->io_append_trans)
		error = xfs_setfilesize_ioend(ioend, error);
	xfs_destroy_ioend(ioend, error);
}

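/*
 * Bio completion handler. Unwritten and COW I/O, and writes with a pending
 * size update transaction, are deferred to a workqueue for completion;
 * everything else can tear the ioend down right here.
 */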
STATIC void
xfs_end_bio(
	struct bio *bio)
{
	struct xfs_ioend *ioend = bio->bi_private;
	struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;

	if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
	else if (ioend->io_append_trans)
		queue_work(mp->m_data_workqueue, &ioend->io_work);
	else
		xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}

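/*
 * Look up the extent mapping covering @offset for writeback of the given
 * I/O type. Delalloc ranges that still have no real blocks are converted to
 * real allocations via xfs_iomap_write_allocate().
 */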
STATIC int
xfs_map_blocks(
	struct inode *inode,
	loff_t offset,
	struct xfs_bmbt_irec *imap,
	int type)
{
	struct xfs_inode *ip = XFS_I(inode);
	struct xfs_mount *mp = ip->i_mount;
	ssize_t count = i_blocksize(inode);
	xfs_fileoff_t offset_fsb, end_fsb;
	int error = 0;
	int bmapi_flags = XFS_BMAPI_ENTIRE;
	int nimaps = 1;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	ASSERT(type != XFS_IO_COW);
	if (type == XFS_IO_UNWRITTEN)
		bmapi_flags |= XFS_BMAPI_IGSTATE;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	if (offset + count > mp->m_super->s_maxbytes)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
			       imap, &nimaps, bmapi_flags);
	/*
	 * Truncate an overwrite extent if there's a pending CoW
	 * reservation before the end of this extent. This forces us
	 * to come back to writepage to take care of the CoW.
	 */
	if (nimaps && type == XFS_IO_OVERWRITE)
		xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (error)
		return error;

	if (type == XFS_IO_DELALLOC &&
	    (!nimaps || isnullstartblock(imap->br_startblock))) {
		error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset,
						 imap);
		if (!error)
			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
		return error;
	}

#ifdef DEBUG
	if (type == XFS_IO_UNWRITTEN) {
		ASSERT(nimaps);
		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
	}
#endif
	if (nimaps)
		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
	return 0;
}

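/*
 * Return true if the given file offset falls inside the cached extent
 * mapping.
 */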
STATIC bool
xfs_imap_valid(
	struct inode *inode,
	struct xfs_bmbt_irec *imap,
	xfs_off_t offset)
{
	offset >>= inode->i_blkbits;

	return offset >= imap->br_startoff &&
		offset < imap->br_startoff + imap->br_blockcount;
}

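/*
 * Mark a mapped, locked buffer as being under async writeback and clear its
 * dirty bit before it gets added to an ioend.
 */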
STATIC void
xfs_start_buffer_writeback(
	struct buffer_head *bh)
{
	ASSERT(buffer_mapped(bh));
	ASSERT(buffer_locked(bh));
	ASSERT(!buffer_delay(bh));
	ASSERT(!buffer_unwritten(bh));

	bh->b_end_io = NULL;
	set_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_start_page_writeback(
	struct page *page,
	int clear_dirty)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * if the page was not fully cleaned, we need to ensure that the higher
	 * layers come back to it correctly. That means we need to keep the page
	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
	 * write this page in this writeback sweep will be made.
	 */
	if (clear_dirty) {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	} else
		set_page_writeback_keepwrite(page);

	unlock_page(page);
}

static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
{
	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}

/*
 * Submit the bio for an ioend. We are passed an ioend with a bio attached to
 * it, and we submit that bio. The ioend may be used for multiple bio
 * submissions, so we only want to allocate an append transaction for the ioend
 * once. In the case of multiple bio submission, each bio will take an IO
 * reference to the ioend to ensure that the ioend completion is only done once
 * all bios have been submitted and the ioend is really done.
 *
 * If @fail is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the bio and ioend
 * rather than submit it to IO. This typically only happens on a filesystem
 * shutdown.
 */
STATIC int
xfs_submit_ioend(
	struct writeback_control *wbc,
	struct xfs_ioend *ioend,
	int status)
{
	/* Convert CoW extents to regular */
	if (!status && ioend->io_type == XFS_IO_COW) {
		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
				ioend->io_offset, ioend->io_size);
	}

	/* Reserve log space if we might write beyond the on-disk inode size. */
	if (!status &&
	    ioend->io_type != XFS_IO_UNWRITTEN &&
	    xfs_ioend_is_append(ioend) &&
	    !ioend->io_append_trans)
		status = xfs_setfilesize_trans_alloc(ioend);

	ioend->io_bio->bi_private = ioend;
	ioend->io_bio->bi_end_io = xfs_end_bio;
	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);

	/*
	 * If we are failing the IO now, just mark the ioend with an
	 * error and finish it. This will run IO completion immediately
	 * as there is only one reference to the ioend at this point in
	 * time.
	 */
	if (status) {
		ioend->io_bio->bi_status = errno_to_blk_status(status);
		bio_endio(ioend->io_bio);
		return status;
	}

	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
	submit_bio(ioend->io_bio);
	return 0;
}

static void
xfs_init_bio_from_bh(
	struct bio *bio,
	struct buffer_head *bh)
{
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio_set_dev(bio, bh->b_bdev);
}

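/*
 * Allocate a new ioend. The ioend is embedded in a bio allocated from
 * xfs_ioend_bioset, so the bio and the ioend share the same lifetime.
 */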
static struct xfs_ioend *
xfs_alloc_ioend(
	struct inode *inode,
	unsigned int type,
	xfs_off_t offset,
	struct buffer_head *bh)
{
	struct xfs_ioend *ioend;
	struct bio *bio;

	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
	xfs_init_bio_from_bh(bio, bh);

	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_size = 0;
	ioend->io_offset = offset;
	INIT_WORK(&ioend->io_work, xfs_end_io);
	ioend->io_append_trans = NULL;
	ioend->io_bio = bio;
	return ioend;
}

/*
 * Allocate a new bio, and chain the old bio to the new one.
 *
 * Note that we have to perform the chaining in this unintuitive order
 * so that the bi_private linkage is set up in the right direction for the
 * traversal in xfs_destroy_ioend().
 */
static void
xfs_chain_bio(
	struct xfs_ioend *ioend,
	struct writeback_control *wbc,
	struct buffer_head *bh)
{
	struct bio *new;

	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
	xfs_init_bio_from_bh(new, bh);

	bio_chain(ioend->io_bio, new);
	bio_get(ioend->io_bio);	/* for xfs_destroy_ioend */
	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
	submit_bio(ioend->io_bio);
	ioend->io_bio = new;
}

/*
 * Test to see if we've been building up a completion structure for
 * earlier buffers -- if so, we try to append to this ioend if we
 * can, otherwise we finish off any current ioend and start another.
 * Return the ioend we finished off so that the caller can submit it
 * once it has finished processing the dirty page.
 */
STATIC void
xfs_add_to_ioend(
	struct inode *inode,
	struct buffer_head *bh,
	xfs_off_t offset,
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct list_head *iolist)
{
	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
	    bh->b_blocknr != wpc->last_block + 1 ||
	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
		if (wpc->ioend)
			list_add(&wpc->ioend->io_list, iolist);
		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
	}

	/*
	 * If the buffer doesn't fit into the bio we need to allocate a new
	 * one. This shouldn't happen more than once for a given buffer.
	 */
	while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
		xfs_chain_bio(wpc->ioend, wbc, bh);

	wpc->ioend->io_size += bh->b_size;
	wpc->last_block = bh->b_blocknr;
	xfs_start_buffer_writeback(bh);
}

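/*
 * Convert the file offset into a disk block number using the extent mapping
 * and store it in the buffer_head.
 */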
STATIC void
xfs_map_buffer(
	struct inode *inode,
	struct buffer_head *bh,
	struct xfs_bmbt_irec *imap,
	xfs_off_t offset)
{
	sector_t bn;
	struct xfs_mount *m = XFS_I(inode)->i_mount;
	xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
	xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
	      ((offset - iomap_offset) >> inode->i_blkbits);

	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

	bh->b_blocknr = bn;
	set_buffer_mapped(bh);
}

STATIC void
xfs_map_at_offset(
	struct inode *inode,
	struct buffer_head *bh,
	struct xfs_bmbt_irec *imap,
	xfs_off_t offset)
{
	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	xfs_map_buffer(inode, bh, imap, offset);
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
}

/*
 * Test if a given page contains at least one buffer of a given @type.
 * If @check_all_buffers is true, then we walk all the buffers in the page to
 * try to find one of the type passed in. If it is not set, then the caller only
 * needs to check the first buffer on the page for a match.
 */
STATIC bool
xfs_check_page_type(
	struct page *page,
	unsigned int type,
	bool check_all_buffers)
{
	struct buffer_head *bh;
	struct buffer_head *head;

	if (PageWriteback(page))
		return false;
	if (!page->mapping)
		return false;
	if (!page_has_buffers(page))
		return false;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh)) {
			if (type == XFS_IO_UNWRITTEN)
				return true;
		} else if (buffer_delay(bh)) {
			if (type == XFS_IO_DELALLOC)
				return true;
		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
			if (type == XFS_IO_OVERWRITE)
				return true;
		}

		/* If we are only checking the first buffer, we are done now. */
		if (!check_all_buffers)
			break;
	} while ((bh = bh->b_this_page) != head);

	return false;
}

STATIC void
xfs_vm_invalidatepage(
	struct page *page,
	unsigned int offset,
	unsigned int length)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset,
				 length);
	block_invalidatepage(page, offset, length);
}

/*
 * If the page has delalloc buffers on it, we need to punch them out before we
 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 * is done on that same region - the delalloc extent is returned when none is
 * supposed to be there.
 *
 * We prevent this by truncating away the delalloc regions on the page before
 * invalidating it. Because they are delalloc, we can do this without needing a
 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 * truncation without a transaction as there is no space left for block
 * reservation (typically why we see an ENOSPC in writeback).
 *
 * This is not a performance critical path, so for now just do the punching a
 * buffer head at a time.
 */
STATIC void
xfs_aops_discard_page(
	struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct xfs_inode *ip = XFS_I(inode);
	struct buffer_head *bh, *head;
	loff_t offset = page_offset(page);

	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
		goto out_invalidate;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		goto out_invalidate;

	xfs_alert(ip->i_mount,
		"page discard on page %p, inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	bh = head = page_buffers(page);
	do {
		int error;
		xfs_fileoff_t start_fsb;

		if (!buffer_delay(bh))
			goto next_buffer;

		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
		if (error) {
			/* something screwed, just bail */
			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
				xfs_alert(ip->i_mount,
					"page discard unable to remove delalloc mapping.");
			}
			break;
		}
next_buffer:
		offset += i_blocksize(inode);

	} while ((bh = bh->b_this_page) != head);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
	return;
}

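/*
 * Check whether writeback at this offset needs to go to the COW fork. If a
 * COW mapping exists, allocate real blocks for any delayed extent in it and
 * cache the mapping in the writepage context.
 */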
static int
xfs_map_cow(
	struct xfs_writepage_ctx *wpc,
	struct inode *inode,
	loff_t offset,
	unsigned int *new_type)
{
	struct xfs_inode *ip = XFS_I(inode);
	struct xfs_bmbt_irec imap;
	bool is_cow = false;
	int error;

	/*
	 * If we already have a valid COW mapping keep using it.
	 */
	if (wpc->io_type == XFS_IO_COW) {
		wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset);
		if (wpc->imap_valid) {
			*new_type = XFS_IO_COW;
			return 0;
		}
	}

	/*
	 * Else we need to check if there is a COW mapping at this offset.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (!is_cow)
		return 0;

	/*
	 * And if the COW mapping has a delayed extent here we need to
	 * allocate real space for it now.
	 */
	if (isnullstartblock(imap.br_startblock)) {
		error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
						 &imap);
		if (error)
			return error;
	}

	wpc->io_type = *new_type = XFS_IO_COW;
	wpc->imap_valid = true;
	wpc->imap = imap;
	return 0;
}

/*
 * We implement an immediate ioend submission policy here to avoid needing to
 * chain multiple ioends and hence nest mempool allocations which can violate
 * forward progress guarantees we need to provide. The current ioend we are
 * adding buffers to is cached on the writepage context, and if the new buffer
 * does not append to the cached ioend it will create a new ioend and cache that
 * instead.
 *
 * If a new ioend is created and cached, the old ioend is returned and queued
 * locally for submission once the entire page is processed or an error has been
 * detected. While ioends are submitted immediately after they are completed,
 * batching optimisations are provided by higher level block plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on the
 * writepage context that the caller will need to submit.
 */
static int
xfs_writepage_map(
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct inode *inode,
	struct page *page,
	loff_t offset,
	uint64_t end_offset)
{
	LIST_HEAD(submit_list);
	struct xfs_ioend *ioend, *next;
	struct buffer_head *bh, *head;
	ssize_t len = i_blocksize(inode);
	int error = 0;
	int count = 0;
	int uptodate = 1;
	unsigned int new_type;

	bh = head = page_buffers(page);
	offset = page_offset(page);
	do {
		if (offset >= end_offset)
			break;
		if (!buffer_uptodate(bh))
			uptodate = 0;

		/*
		 * set_page_dirty dirties all buffers in a page, independent
		 * of their state. The dirty state however is entirely
		 * meaningless for holes (!mapped && uptodate), so skip
		 * buffers covering holes here.
		 */
		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
			wpc->imap_valid = false;
			continue;
		}

		if (buffer_unwritten(bh))
			new_type = XFS_IO_UNWRITTEN;
		else if (buffer_delay(bh))
			new_type = XFS_IO_DELALLOC;
		else if (buffer_uptodate(bh))
			new_type = XFS_IO_OVERWRITE;
		else {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			/*
			 * This buffer is not uptodate and will not be
			 * written to disk. Ensure that we will put any
			 * subsequent writeable buffers into a new
			 * ioend.
			 */
			wpc->imap_valid = false;
			continue;
		}

		if (xfs_is_reflink_inode(XFS_I(inode))) {
			error = xfs_map_cow(wpc, inode, offset, &new_type);
			if (error)
				goto out;
		}

		if (wpc->io_type != new_type) {
			wpc->io_type = new_type;
			wpc->imap_valid = false;
		}

		if (wpc->imap_valid)
			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
							 offset);
		if (!wpc->imap_valid) {
			error = xfs_map_blocks(inode, offset, &wpc->imap,
					       wpc->io_type);
			if (error)
				goto out;
			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
							 offset);
		}
		if (wpc->imap_valid) {
			lock_buffer(bh);
			if (wpc->io_type != XFS_IO_OVERWRITE)
				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
			xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
			count++;
		}

	} while (offset += len, ((bh = bh->b_this_page) != head));

	if (uptodate && bh == head)
		SetPageUptodate(page);

	ASSERT(wpc->ioend || list_empty(&submit_list));

out:
	/*
	 * On error, we have to fail the ioend here because we have locked
	 * buffers in the ioend. If we don't do this, we'll deadlock
	 * invalidating the page as that tries to lock the buffers on the page.
	 * Also, because we may have set pages under writeback, we have to make
	 * sure we run IO completion to mark the error state of the IO
	 * appropriately, so we can't cancel the ioend directly here. That means
	 * we have to mark this page as under writeback if we included any
	 * buffers from it in the ioend chain so that completion treats it
	 * correctly.
	 *
	 * If we didn't include the page in the ioend, then on error we can
	 * simply discard and unlock it as there are no other users of the page
	 * or its buffers right now. The caller will still need to trigger
	 * submission of outstanding ioends on the writepage context so they are
	 * treated correctly on error.
	 */
	if (count) {
		xfs_start_page_writeback(page, !error);

		/*
		 * Preserve the original error if there was one, otherwise catch
		 * submission errors here and propagate into subsequent ioend
		 * submissions.
		 */
		list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
			int error2;

			list_del_init(&ioend->io_list);
			error2 = xfs_submit_ioend(wbc, ioend, error);
			if (error2 && !error)
				error = error2;
		}
	} else if (error) {
		xfs_aops_discard_page(page);
		ClearPageUptodate(page);
		unlock_page(page);
	} else {
		/*
		 * We can end up here with no error and nothing to write if we
		 * race with a partial page truncate on a sub-page block sized
		 * filesystem. In that case we need to mark the page clean.
		 */
		xfs_start_page_writeback(page, 1);
		end_page_writeback(page);
	}

	mapping_set_error(page->mapping, error);
	return error;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 * For any other dirty buffer heads on the page we should flush them.
 */
STATIC int
xfs_do_writepage(
	struct page *page,
	struct writeback_control *wbc,
	void *data)
{
	struct xfs_writepage_ctx *wpc = data;
	struct inode *inode = page->mapping->host;
	loff_t offset;
	uint64_t end_offset;
	pgoff_t end_index;

	trace_xfs_writepage(inode, page, 0, 0);

	ASSERT(page_has_buffers(page));

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim. We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
		goto redirty;

	/*
	 * Is this page beyond the end of the file?
	 *
	 * The page index is less than the end_index, adjust the end_offset
	 * to the highest offset that this page should represent.
	 * -----------------------------------------------------
	 * |            file mapping               |  <EOF>    |
	 * -----------------------------------------------------
	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
	 * ^--------------------------------^----------|--------
	 * |     desired writeback range    |      see else    |
	 * ---------------------------------^------------------|
	 */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_SHIFT;
	if (page->index < end_index)
		end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
	else {
		/*
		 * Check whether the page to write out is beyond or straddles
		 * i_size or not.
		 * -------------------------------------------------------
		 * |            file mapping                   |  <EOF>  |
		 * -------------------------------------------------------
		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
		 * ^--------------------------------^-----------|---------
		 * |                                |   Straddles       |
		 * ---------------------------------^-----------|--------|
		 */
		unsigned offset_into_page = offset & (PAGE_SIZE - 1);

		/*
		 * Skip the page if it is fully outside i_size, e.g. due to a
		 * truncate operation that is in progress. We must redirty the
		 * page so that reclaim stops reclaiming it. Otherwise
		 * xfs_vm_releasepage() is called on it and gets confused.
		 *
		 * Note that the end_index is unsigned long, it would overflow
		 * if the given offset is greater than 16TB on 32-bit system
		 * and if we do check the page is fully outside i_size or not
		 * via "if (page->index >= end_index + 1)" as "end_index + 1"
		 * will be evaluated to 0. Hence this page will be redirtied
		 * and be written out repeatedly which would result in an
		 * infinite loop, the user program that performs this operation
		 * will hang. Instead, we can verify this situation by checking
		 * if the page to write is totally beyond the i_size or if its
		 * offset is just equal to the EOF.
		 */
		if (page->index > end_index ||
		    (page->index == end_index && offset_into_page == 0))
			goto redirty;

		/*
		 * The page straddles i_size. It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size. For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_SIZE);

		/* Adjust the end_offset to the end of file */
		end_offset = offset;
	}

	return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

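/*
 * ->writepage entry point: write a single dirty page and submit the ioend
 * that was built up while mapping it.
 */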
STATIC int
xfs_vm_writepage(
	struct page *page,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int ret;

	ret = xfs_do_writepage(page, wbc, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

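/*
 * ->writepages entry point: write back a range of dirty pages with a single
 * writepage context so that consecutive pages can share ioends. DAX mappings
 * are written back through the DAX path instead.
 */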
STATIC int
xfs_vm_writepages(
	struct address_space *mapping,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int ret;

	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	if (dax_mapping(mapping))
		return dax_writeback_mapping_range(mapping,
				xfs_find_bdev_for_inode(mapping->host), wbc);

	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

/*
 * Called to move a page into cleanable state - and from there
 * to be released. The page should already be clean. We always
 * have buffer heads in this call.
 *
 * Returns 1 if the page is ok to release, 0 otherwise.
 */
STATIC int
xfs_vm_releasepage(
	struct page *page,
	gfp_t gfp_mask)
{
	int delalloc, unwritten;

	trace_xfs_releasepage(page->mapping->host, page, 0, 0);

	/*
	 * mm accommodates an old ext3 case where clean pages might not have had
	 * the dirty bit cleared. Thus, it can send actual dirty pages to
	 * ->releasepage() via shrink_active_list(). Conversely,
	 * block_invalidatepage() can send pages that are still marked dirty
	 * but otherwise have invalidated buffers.
	 *
	 * We want to release the latter to avoid unnecessary buildup of the
	 * LRU, skip the former and warn if we've left any lingering
	 * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
	 * or unwritten buffers and warn if the page is not dirty. Otherwise
	 * try to release the buffers.
	 */
	xfs_count_page_state(page, &delalloc, &unwritten);

	if (delalloc) {
		WARN_ON_ONCE(!PageDirty(page));
		return 0;
	}
	if (unwritten) {
		WARN_ON_ONCE(!PageDirty(page));
		return 0;
	}

	return try_to_free_buffers(page);
}

/*
 * If this is O_DIRECT or the mpage code calling, tell them how large the
 * mapping is, so that we can avoid repeated get_blocks calls.
 *
 * If the mapping spans EOF, then we have to break the mapping up as the mapping
 * for blocks beyond EOF must be marked new so that sub block regions can be
 * correctly zeroed. We can't do this for mappings within EOF unless the mapping
 * was just allocated or is unwritten, otherwise the callers would overwrite
 * existing data with zeros. Hence we have to split the mapping into a range up
 * to and including EOF, and a second mapping for beyond EOF.
 */
static void
xfs_map_trim_size(
	struct inode *inode,
	sector_t iblock,
	struct buffer_head *bh_result,
	struct xfs_bmbt_irec *imap,
	xfs_off_t offset,
	ssize_t size)
{
	xfs_off_t mapping_size;

	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
	mapping_size <<= inode->i_blkbits;

	ASSERT(mapping_size > 0);
	if (mapping_size > size)
		mapping_size = size;
	if (offset < i_size_read(inode) &&
	    offset + mapping_size >= i_size_read(inode)) {
		/* limit mapping to block that spans EOF */
		mapping_size = roundup_64(i_size_read(inode) - offset,
					  i_blocksize(inode));
	}
	if (mapping_size > LONG_MAX)
		mapping_size = LONG_MAX;

	bh_result->b_size = mapping_size;
}

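/*
 * Read-only get_blocks callback used by the bmap and mpage read paths. It
 * only looks up existing mappings and never allocates blocks, hence the
 * BUG_ON(create).
 */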
static int
xfs_get_blocks(
	struct inode *inode,
	sector_t iblock,
	struct buffer_head *bh_result,
	int create)
{
	struct xfs_inode *ip = XFS_I(inode);
	struct xfs_mount *mp = ip->i_mount;
	xfs_fileoff_t offset_fsb, end_fsb;
	int error = 0;
	int lockmode = 0;
	struct xfs_bmbt_irec imap;
	int nimaps = 1;
	xfs_off_t offset;
	ssize_t size;

	BUG_ON(create);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	offset = (xfs_off_t)iblock << inode->i_blkbits;
	ASSERT(bh_result->b_size >= i_blocksize(inode));
	size = bh_result->b_size;

	if (offset >= i_size_read(inode))
		return 0;

	/*
	 * Direct I/O is usually done on preallocated files, so try getting
	 * a block mapping without an exclusive lock first.
	 */
	lockmode = xfs_ilock_data_map_shared(ip);

	ASSERT(offset <= mp->m_super->s_maxbytes);
	if (offset + size > mp->m_super->s_maxbytes)
		size = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
			       &imap, &nimaps, XFS_BMAPI_ENTIRE);
	if (error)
		goto out_unlock;

	if (nimaps) {
		trace_xfs_get_blocks_found(ip, offset, size,
				imap.br_state == XFS_EXT_UNWRITTEN ?
					XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
		xfs_iunlock(ip, lockmode);
	} else {
		trace_xfs_get_blocks_notfound(ip, offset, size);
		goto out_unlock;
	}

	/* trim mapping down to size requested */
	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);

	/*
	 * For unwritten extents do not report a disk address in the buffered
	 * read case (treat as if we're reading into a hole).
	 */
	if (xfs_bmap_is_real_extent(&imap))
		xfs_map_buffer(inode, bh_result, &imap, offset);

	/*
	 * If this is a realtime file, data may be on a different device
	 * to that pointed to from the buffer_head b_bdev currently.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
	return 0;

out_unlock:
	xfs_iunlock(ip, lockmode);
	return error;
}

STATIC ssize_t
xfs_vm_direct_IO(
	struct kiocb *iocb,
	struct iov_iter *iter)
{
	/*
	 * We just need the method present so that open/fcntl allow direct I/O.
	 */
	return -EINVAL;
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space *mapping,
	sector_t block)
{
	struct inode *inode = (struct inode *)mapping->host;
	struct xfs_inode *ip = XFS_I(inode);

	trace_xfs_vm_bmap(XFS_I(inode));

	/*
	 * The swap code (ab-)uses ->bmap to get a block mapping and then
	 * bypasses the file system for actual I/O. We really can't allow
	 * that on reflink inodes, so we have to skip out here. And yes,
	 * 0 is the magic code for a bmap error.
	 *
	 * Since we don't pass back blockdev info, we can't return bmap
	 * information for rt files either.
	 */
	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
		return 0;

	filemap_write_and_wait(mapping);
	return generic_block_bmap(mapping, block, xfs_get_blocks);
}

STATIC int
xfs_vm_readpage(
	struct file *unused,
	struct page *page)
{
	trace_xfs_vm_readpage(page->mapping->host, 1);
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file *unused,
	struct address_space *mapping,
	struct list_head *pages,
	unsigned nr_pages)
{
	trace_xfs_vm_readpages(mapping->host, nr_pages);
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

/*
 * This is basically a copy of __set_page_dirty_buffers() with one
 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
 * dirty, we'll never be able to clean them because we don't write buffers
 * beyond EOF, and that means we can't invalidate pages that span EOF
 * that have been marked dirty. Further, the dirty state can leak into
 * the file interior if the file is extended, resulting in all sorts of
 * bad things happening as the state does not match the underlying data.
 *
 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
 * this only exist because of bufferheads and how the generic code manages them.
 */
STATIC int
xfs_vm_set_page_dirty(
	struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode = mapping->host;
	loff_t end_offset;
	loff_t offset;
	int newly_dirty;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	end_offset = i_size_read(inode);
	offset = page_offset(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			if (offset < end_offset)
				set_buffer_dirty(bh);
			bh = bh->b_this_page;
			offset += i_blocksize(inode);
		} while (bh != head);
	}
	/*
	 * Lock out page->mem_cgroup migration to keep PageDirty
	 * synchronized with per-memcg dirty page counters.
	 */
	lock_page_memcg(page);
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty) {
		/* sigh - __set_page_dirty() is static, so copy it here, too */
		unsigned long flags;

		spin_lock_irqsave(&mapping->tree_lock, flags);
		if (page->mapping) {	/* Race with truncate? */
			WARN_ON_ONCE(!PageUptodate(page));
			account_page_dirtied(page, mapping);
			radix_tree_tag_set(&mapping->page_tree,
					page_index(page), PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
	}
	unlock_page_memcg(page);
	if (newly_dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	return newly_dirty;
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage = xfs_vm_readpage,
	.readpages = xfs_vm_readpages,
	.writepage = xfs_vm_writepage,
	.writepages = xfs_vm_writepages,
	.set_page_dirty = xfs_vm_set_page_dirty,
	.releasepage = xfs_vm_releasepage,
	.invalidatepage = xfs_vm_invalidatepage,
	.bmap = xfs_vm_bmap,
	.direct_IO = xfs_vm_direct_IO,
	.migratepage = buffer_migrate_page,
	.is_partially_uptodate = block_is_partially_uptodate,
	.error_remove_page = generic_error_remove_page,
};