// SPDX-License-Identifier: GPL-2.0

#include <linux/fsverity.h>
#include <linux/iomap.h>
#include "ctree.h"
#include "delalloc-space.h"
#include "direct-io.h"
#include "extent-tree.h"
#include "file.h"
#include "fs.h"
#include "transaction.h"
#include "volumes.h"
#include "bio.h"
#include "ordered-data.h"

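/*
 * State shared by the iomap begin/end and bio submission callbacks of a single
 * direct I/O request, passed around through iomap_iter::private.
 */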
struct btrfs_dio_data {
	ssize_t submitted;
	struct extent_changeset *data_reserved;
	struct btrfs_ordered_extent *ordered;
	bool data_space_reserved;
	bool nocow_done;
};

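/*
 * Per-bio private data, allocated together with the btrfs_bio from
 * btrfs_dio_bioset for every direct I/O bio.
 */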
struct btrfs_dio_private {
	/* Range of I/O */
	u64 file_offset;
	u32 bytes;

	/* This must be last */
	struct btrfs_bio bbio;
};

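/* Bio set used to allocate direct I/O bios with a btrfs_dio_private in front. */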
static struct bio_set btrfs_dio_bioset;

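/*
 * Lock the file range [lockstart, lockend] for direct I/O, waiting out any
 * ordered extents and (for writes) buffered pages in the range.  Returns 0 on
 * success, -EAGAIN if a NOWAIT request would have to block, or -ENOTBLK if the
 * caller has to fall back to buffered I/O.
 */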
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state,
			      unsigned int iomap_flags)
{
	const bool writing = (iomap_flags & IOMAP_WRITE);
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	/* Direct lock must be taken before the extent lock. */
	if (nowait) {
		if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
			return -EAGAIN;
	} else {
		btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state);
	}

	while (1) {
		if (nowait) {
			if (!btrfs_try_lock_extent(io_tree, lockstart, lockend,
						   cached_state)) {
				ret = -EAGAIN;
				break;
			}
		} else {
			btrfs_lock_extent(io_tree, lockstart, lockend, cached_state);
		}
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there are no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, as we could have raced between the invalidate
		 * in generic_file_direct_write and locking the extent. The
		 * invalidate needs to happen so that reads after a write do
		 * not get stale data.
		 */
		if (!ordered &&
		    (!writing || !filemap_range_has_page(inode->i_mapping,
							 lockstart, lockend)))
			break;

		btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state);

		if (ordered) {
			if (nowait) {
				btrfs_put_ordered_extent(ordered);
				ret = -EAGAIN;
				break;
			}
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we can not wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map, if at this point it has already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range started (we unlock the ranges
			 * in the io tree only when the bios complete and
			 * buffered writes always lock pages before attempting
			 * to lock range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(ordered);
			else
				ret = nowait ? -EAGAIN : -ENOTBLK;
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
			 * call to readahead (a buffered read or a defrag call
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but did not yet
			 * have a corresponding bio submitted (hence it cannot
			 * complete), which makes readahead wait for that
			 * ordered extent to complete while holding a lock on
			 * that page.
			 */
			ret = nowait ? -EAGAIN : -ENOTBLK;
		}

		if (ret)
			break;

		cond_resched();
	}

	if (ret)
		btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
	return ret;
}

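/*
 * Create the extent map (except for NOCOW writes) and the ordered extent for a
 * direct I/O write described by @file_extent at file offset @start, and stash
 * the ordered extent in @dio_data for the submission and completion paths.
 */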
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  const u64 start,
						  const struct btrfs_file_extent *file_extent,
						  const int type)
{
	struct extent_map *em = NULL;
	struct btrfs_ordered_extent *ordered;

	if (type != BTRFS_ORDERED_NOCOW) {
		em = btrfs_create_io_em(inode, start, file_extent, type);
		if (IS_ERR(em))
			goto out;
	}

	ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
					     (1U << type) |
					     (1U << BTRFS_ORDERED_DIRECT));
	if (IS_ERR(ordered)) {
		if (em) {
			btrfs_free_extent_map(em);
			btrfs_drop_extent_map_range(inode, start,
						    start + file_extent->num_bytes - 1, false);
		}
		em = ERR_CAST(ordered);
	} else {
		ASSERT(!dio_data->ordered);
		dio_data->ordered = ordered;
	}
 out:

	return em;
}

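/*
 * COW path for a direct I/O write: reserve a new data extent (waiting for a
 * zone finish on zoned filesystems if the allocator asks us to retry) and
 * create the matching extent map and ordered extent.
 */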
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  u64 start, u64 len)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_file_extent file_extent;
	struct extent_map *em;
	struct btrfs_key ins;
	u64 alloc_hint;
	int ret;

	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
again:
	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
				   0, alloc_hint, &ins, true, true);
	if (ret == -EAGAIN) {
		ASSERT(btrfs_is_zoned(fs_info));
		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
			       TASK_UNINTERRUPTIBLE);
		goto again;
	}
	if (ret)
		return ERR_PTR(ret);

	file_extent.disk_bytenr = ins.objectid;
	file_extent.disk_num_bytes = ins.offset;
	file_extent.num_bytes = ins.offset;
	file_extent.ram_bytes = ins.offset;
	file_extent.offset = 0;
	file_extent.compression = BTRFS_COMPRESS_NONE;
	em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
				     BTRFS_ORDERED_REGULAR);
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	if (IS_ERR(em))
		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);

	return em;
}

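/*
 * Prepare the mapping for a direct I/O write at @start: either reuse an
 * existing extent for a NOCOW/prealloc write or allocate a new one (COW),
 * reserving metadata (and, when needed, data) space along the way.  On success
 * *map and *lenp describe the extent the bio can be built against.
 */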
static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 *lenp,
					 unsigned int iomap_flags)
{
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_file_extent file_extent;
	struct extent_map *em = *map;
	int type;
	u64 block_start;
	struct btrfs_block_group *bg;
	bool can_nocow = false;
	bool space_reserved = false;
	u64 len = *lenp;
	u64 prev_len;
	int ret = 0;

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 * existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 * just use the extent.
	 *
	 */
	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
		if (em->flags & EXTENT_FLAG_PREALLOC)
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = btrfs_extent_map_block_start(em) + (start - em->start);

		if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,
				     false) == 1) {
			bg = btrfs_inc_nocow_writers(fs_info, block_start);
			if (bg)
				can_nocow = true;
		}
	}

	prev_len = len;
	if (can_nocow) {
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      nowait);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			btrfs_free_extent_map(em);
			*map = NULL;
			btrfs_dec_nocow_writers(bg);
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
				ret = -EAGAIN;
			goto out;
		}
		space_reserved = true;

		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
					      &file_extent, type);
		btrfs_dec_nocow_writers(bg);
		if (type == BTRFS_ORDERED_PREALLOC) {
			btrfs_free_extent_map(em);
			*map = em2;
			em = em2;
		}

		if (IS_ERR(em2)) {
			ret = PTR_ERR(em2);
			goto out;
		}

		dio_data->nocow_done = true;
	} else {
		/* Our caller expects us to free the input extent map. */
		btrfs_free_extent_map(em);
		*map = NULL;

		if (nowait) {
			ret = -EAGAIN;
			goto out;
		}

		/*
		 * If we could not allocate data space before locking the file
		 * range and we can't do a NOCOW write, then we have to fail.
		 */
		if (!dio_data->data_space_reserved) {
			ret = -ENOSPC;
			goto out;
		}

		/*
		 * We have to COW and we have already reserved data space before,
		 * so now we reserve only metadata.
		 */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      false);
		if (ret < 0)
			goto out;
		space_reserved = true;

		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		*map = em;
		len = min(len, em->len - (start - em->start));
		if (len < prev_len)
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							prev_len - len, true);
	}

	/*
	 * We have created our ordered extent, so we can now release our reservation
	 * for an outstanding extent.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (start + len > i_size_read(inode))
		i_size_write(inode, start + len);
out:
	if (ret && space_reserved) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
	}
	*lenp = len;
	return ret;
}

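/*
 * iomap_begin callback for direct I/O: lock the file range, look up (or, for
 * writes, create) the extent covering it and translate the result into a
 * struct iomap.  Returns -ENOTBLK or -EAGAIN when the caller has to fall back
 * to buffered I/O or retry without IOMAP_NOWAIT.
 */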
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
		loff_t length, unsigned int flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = iter->private;
	u64 lockstart, lockend;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;
	u64 len = length;
	const u64 data_alloc_len = length;
	u32 unlock_bits = EXTENT_LOCKED;

	/*
	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
	 * we're NOWAIT we may submit a bio for a partial range and return
	 * EIOCBQUEUED, which would result in an errant short read.
	 *
	 * The best way to handle this would be to allow for partial completions
	 * of iocb's, so we could submit the partial bio, return and fault in
	 * the rest of the pages, and then submit the io for the rest of the
	 * range. However we don't have that currently, so simply return
	 * -EAGAIN at this point so that the normal path is used.
	 */
	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
		return -EAGAIN;

	/*
	 * Cap the size of reads to that usually seen in buffered I/O as we need
	 * to allocate a contiguous array for the checksums.
	 */
	if (!write)
		len = min_t(u64, len, fs_info->sectorsize * BIO_MAX_VECS);

	lockstart = start;
	lockend = start + len - 1;

	/*
	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
	 * enough if we've written compressed pages to this area, so we need to
	 * flush the dirty pages again to make absolutely sure that any
	 * outstanding dirty pages are on disk - the first flush only starts
	 * compression on the data, while keeping the pages locked, so by the
	 * time the second flush returns we know bios for the compressed pages
	 * were submitted and finished, and the pages are no longer under
	 * writeback.
	 *
	 * If we have a NOWAIT request and we have any pages in the range that
	 * are locked, likely due to compression still in progress, we don't want
	 * to block on page locks. We also don't want to block on pages marked as
	 * dirty or under writeback (same as for the non-compression case).
	 * iomap_dio_rw() did the same check, but after that and before we got
	 * here, mmap'ed writes may have happened or buffered reads started
	 * (readpage() and readahead(), which lock pages), as we haven't locked
	 * the file range yet.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags)) {
		if (flags & IOMAP_NOWAIT) {
			if (filemap_range_needs_writeback(inode->i_mapping,
							  lockstart, lockend))
				return -EAGAIN;
		} else {
			ret = filemap_fdatawrite_range(inode->i_mapping, start,
						       start + length - 1);
			if (ret)
				return ret;
		}
	}

	memset(dio_data, 0, sizeof(*dio_data));

	/*
	 * We always try to allocate data space and must do it before locking
	 * the file range, to avoid deadlocks with concurrent writes to the same
	 * range if the range has several extents and the writes don't expand the
	 * current i_size (the inode lock is taken in shared mode). If we fail to
	 * allocate data space here we continue and later, after locking the
	 * file range, we fail with ENOSPC only if we figure out we can not do a
	 * NOCOW write.
	 */
	if (write && !(flags & IOMAP_NOWAIT)) {
		ret = btrfs_check_data_free_space(BTRFS_I(inode),
						  &dio_data->data_reserved,
						  start, data_alloc_len, false);
		if (!ret)
			dio_data->data_space_reserved = true;
		else if (!(BTRFS_I(inode)->flags &
			   (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
			goto err;
	}

	/*
	 * If this errors out it's because we couldn't invalidate pagecache for
	 * this range and we need to fallback to buffered IO, or we are doing a
	 * NOWAIT read/write and we need to block.
	 */
	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
	if (ret < 0)
		goto err;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * Ok, for INLINE and COMPRESSED extents we need to fall back to
	 * buffered io. INLINE is special, and we could probably kludge it in
	 * here, but it's still buffered so for safety let's just fall back to
	 * the generic buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fallback to buffered.
	 *
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
	 * to buffered IO. Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
		btrfs_free_extent_map(em);
		/*
		 * If we are in a NOWAIT context, return -EAGAIN in order to
		 * fallback to buffered IO. This is not only because we can
		 * block with buffered IO (no support for NOWAIT semantics at
		 * the moment) but also to avoid returning short reads to user
		 * space - this happens if we were able to read some data from
		 * previous non-compressed extents and then when we fallback to
		 * buffered IO, at btrfs_file_read_iter() by calling
		 * filemap_read(), we fail to fault in pages for the read buffer,
		 * in which case filemap_read() returns a short read (the number
		 * of bytes previously read is > 0, so it does not return -EFAULT).
		 */
		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
		goto unlock_err;
	}

	len = min(len, em->len - (start - em->start));

	/*
	 * If we have a NOWAIT request and the range contains multiple extents
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
	 * caller fallback to a context where it can do a blocking (without
	 * NOWAIT) request. This way we avoid doing partial IO and returning
	 * success to the caller, which is not optimal for writes and for reads
	 * it can result in unexpected behaviour for an application.
	 *
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
	 * iomap_dio_rw(), we can end up returning less data than what the caller
	 * asked for, resulting in an unexpected, and incorrect, short read.
	 * That is, the caller asked to read N bytes and we return less than that,
	 * which is wrong unless we are crossing EOF. This happens if we get a
	 * page fault error when trying to fault in pages for the buffer that is
	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
	 * have previously submitted bios for other extents in the range, in
	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
	 * those bios have completed by the time we get the page fault error,
	 * which we return back to our caller - we should only return EIOCBQUEUED
	 * after we have submitted bios for all the extents in the range.
	 */
	if ((flags & IOMAP_NOWAIT) && len < length) {
		btrfs_free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}

	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, &len, flags);
		if (ret < 0)
			goto unlock_err;
		/* Recalc len in case the new em is smaller than requested */
		len = min(len, em->len - (start - em->start));
		if (dio_data->data_space_reserved) {
			u64 release_offset;
			u64 release_len = 0;

			if (dio_data->nocow_done) {
				release_offset = start;
				release_len = data_alloc_len;
			} else if (len < data_alloc_len) {
				release_offset = start + len;
				release_len = data_alloc_len - len;
			}

			if (release_len > 0)
				btrfs_free_reserved_data_space(BTRFS_I(inode),
							       dio_data->data_reserved,
							       release_offset,
							       release_len);
		}
	}

	/*
	 * Translate extent map information to iomap.
	 * We trim the extents (and move the addr) even though iomap code does
	 * that, since we have locked only the parts we are performing I/O in.
	 */
	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
		iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start);
		iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = start;
	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
	iomap->length = len;
	btrfs_free_extent_map(em);

	/*
	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
	 * writes only hold it for this part. We hold the extent lock until
	 * we're completely done with the extent map to make sure it remains
	 * valid.
	 */
	if (write)
		unlock_bits |= EXTENT_DIO_LOCKED;

	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			       unlock_bits, &cached_state);

	/* We didn't use everything, unlock the dio extent for the remainder. */
	if (!write && (start + len) < lockend)
		btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
					lockend, NULL);

	return 0;

unlock_err:
	/*
	 * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
	 * to update this, be explicit that we expect EXTENT_LOCKED and
	 * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
	 */
	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			       EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
	if (dio_data->data_space_reserved) {
		btrfs_free_reserved_data_space(BTRFS_I(inode),
					       dio_data->data_reserved,
					       start, data_alloc_len);
		extent_changeset_free(dio_data->data_reserved);
	}

	return ret;
}

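/*
 * iomap_end callback: unlock the dio extent range for reads and, for short
 * writes, finish the part of the ordered extent that never got a bio.
 */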
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
		ssize_t written, unsigned int flags, struct iomap *iomap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_dio_data *dio_data = iter->private;
	size_t submitted = dio_data->submitted;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;

	if (!write && (iomap->type == IOMAP_HOLE)) {
		/* If reading from a hole, unlock and return */
		btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
					pos + length - 1, NULL);
		return 0;
	}

	if (submitted < length) {
		pos += submitted;
		length -= submitted;
		if (write)
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    pos, length, false);
		else
			btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
						pos + length - 1, NULL);
		ret = -ENOTBLK;
	}
	if (write) {
		btrfs_put_ordered_extent(dio_data->ordered);
		dio_data->ordered = NULL;
	}

	if (write)
		extent_changeset_free(dio_data->data_reserved);
	return ret;
}

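/*
 * Bio end_io handler for direct I/O: finish the ordered extent for writes,
 * unlock the dio extent range for reads and hand the bio back to iomap.
 */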
static void btrfs_dio_end_io(struct btrfs_bio *bbio)
{
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_inode *inode = bbio->inode;
	struct bio *bio = &bbio->bio;

	if (bio->bi_status) {
		btrfs_warn(inode->root->fs_info,
			   "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
			   btrfs_ino(inode), bio->bi_opf,
			   dip->file_offset, dip->bytes, bio->bi_status);
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		btrfs_finish_ordered_extent(bbio->ordered, NULL,
					    dip->file_offset, dip->bytes,
					    !bio->bi_status);
	} else {
		btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset,
					dip->file_offset + dip->bytes - 1, NULL);
	}

	bbio->bio.bi_private = bbio->private;
	iomap_dio_bio_end_io(bio);
}

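/*
 * For partial writes, split the ordered extent (and, for COW writes, the
 * extent map) so that it exactly matches the bio that is about to be submitted.
 */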
static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
					struct btrfs_ordered_extent *ordered)
{
	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 len = bbio->bio.bi_iter.bi_size;
	struct btrfs_ordered_extent *new;
	int ret;

	/* Must always be called for the beginning of an ordered extent. */
	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
		return -EINVAL;

	/* No need to split if the ordered extent covers the entire bio. */
	if (ordered->disk_num_bytes == len) {
		refcount_inc(&ordered->refs);
		bbio->ordered = ordered;
		return 0;
	}

	/*
	 * Don't split the extent_map for NOCOW extents, as we're writing into
	 * a pre-existing one.
	 */
	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
		ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset,
					     ordered->num_bytes, len,
					     ordered->disk_bytenr);
		if (ret)
			return ret;
	}

	new = btrfs_split_ordered_extent(ordered, len);
	if (IS_ERR(new))
		return PTR_ERR(new);
	bbio->ordered = new;
	return 0;
}

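/*
 * submit_io callback: initialize the btrfs_bio, account the submitted bytes
 * and, for writes, split the ordered extent to match the bio before handing it
 * off to the btrfs bio layer.
 */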
static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
				loff_t file_offset)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_dio_data *dio_data = iter->private;

	btrfs_bio_init(bbio, BTRFS_I(iter->inode), file_offset,
		       btrfs_dio_end_io, bio->bi_private);

	dip->file_offset = file_offset;
	dip->bytes = bio->bi_iter.bi_size;

	dio_data->submitted += bio->bi_iter.bi_size;

	/*
	 * Check if we are doing a partial write. If we are, we need to split
	 * the ordered extent to match the submitted bio. Hang on to the
	 * remaining unfinishable ordered_extent in dio_data so that it can be
	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
	 * remaining pages is blocked on the outstanding ordered extent.
	 */
	if (iter->flags & IOMAP_WRITE) {
		int ret;

		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
		if (ret) {
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    file_offset, dip->bytes,
						    !ret);
			bio->bi_status = errno_to_blk_status(ret);
			iomap_dio_bio_end_io(bio);
			return;
		}
	}

	btrfs_submit_bbio(bbio, 0);
}

static const struct iomap_ops btrfs_dio_iomap_ops = {
	.iomap_begin = btrfs_dio_iomap_begin,
	.iomap_end = btrfs_dio_iomap_end,
};

static const struct iomap_dio_ops btrfs_dio_ops = {
	.submit_io = btrfs_dio_submit_io,
	.bio_set = &btrfs_dio_bioset,
};

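/*
 * Thin wrappers around iomap_dio_rw()/__iomap_dio_rw() using IOMAP_DIO_PARTIAL,
 * so that the callers can fault in missing pages and resume a partially
 * completed request (tracked via @done_before).
 */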
static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
			      size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			    IOMAP_DIO_PARTIAL, &data, done_before);
}

static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
					 size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			      IOMAP_DIO_PARTIAL, &data, done_before);
}

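/*
 * Direct I/O requires the file offset and the memory buffer to be aligned to
 * the filesystem sector size; block size > page size setups additionally have
 * to use buffered I/O (see the comment below).
 */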
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
			       const struct iov_iter *iter, loff_t offset)
{
	const u32 blocksize_mask = fs_info->sectorsize - 1;

	if (offset & blocksize_mask)
		return -EINVAL;

	if (iov_iter_alignment(iter) & blocksize_mask)
		return -EINVAL;

	/*
	 * For bs > ps support, we heavily rely on large folios to make sure no
	 * block will cross large folio boundaries.
	 *
	 * But memory provided by direct IO is only virtually contiguous, not
	 * physically contiguous, and will break btrfs' large folio requirement.
	 *
	 * So for bs > ps support, all direct IOs should fall back to buffered
	 * ones.
	 */
	if (fs_info->sectorsize > PAGE_SIZE)
		return -EINVAL;

	return 0;
}

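/*
 * Handle a direct I/O write: take the inode lock (shared when possible), run
 * the iomap direct I/O path with page faults disabled, and fall back to
 * buffered I/O (flushing and invalidating the page cache afterwards) whenever
 * direct I/O cannot be used or stops making progress.
 */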
ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos;
	ssize_t written = 0;
	ssize_t written_buffered;
	size_t prev_left = 0;
	loff_t endbyte;
	ssize_t ret;
	unsigned int ilock_flags = 0;
	struct iomap_dio *dio;

	if (iocb->ki_flags & IOCB_NOWAIT)
		ilock_flags |= BTRFS_ILOCK_TRY;

	/*
	 * If the write DIO is within EOF, use a shared lock and also only if
	 * security bits will likely not be dropped by file_remove_privs() called
	 * from btrfs_write_check(). Either will need to be rechecked after the
	 * lock was acquired.
	 */
	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
		ilock_flags |= BTRFS_ILOCK_SHARED;

relock:
	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
	if (ret < 0)
		return ret;

	/* Shared lock cannot be used with security bits set. */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	ret = generic_write_checks(iocb, from);
	if (ret <= 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		return ret;
	}

	ret = btrfs_write_check(iocb, ret);
	if (ret < 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto out;
	}

	pos = iocb->ki_pos;
	/*
	 * Re-check since the file size may have changed just before taking the
	 * lock, or pos may have changed because of O_APPEND in
	 * generic_write_checks().
	 */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
	    pos + iov_iter_count(from) > i_size_read(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	if (check_direct_IO(fs_info, from, pos)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto buffered;
	}
	/*
	 * We can't control the folios being passed in, applications can write
	 * to them while a direct IO write is in progress. This means the
	 * content might change after we calculated the data checksum.
	 * Therefore we can end up storing a checksum that doesn't match the
	 * persisted data.
	 *
	 * To be extra safe and avoid false data checksum mismatch, if the
	 * inode requires data checksum, just fallback to buffered IO.
	 * For buffered IO we have full control of page cache and can ensure
	 * no one is modifying the content during writeback.
	 */
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto buffered;
	}

	/*
	 * The iov_iter can be mapped to the same file range we are writing to.
	 * If that's the case, then we will deadlock in the iomap code, because
	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
	 * an ordered extent, and after that it will fault in the pages that the
	 * iov_iter refers to. During the fault in we end up in the readahead
	 * pages code (starting at btrfs_readahead()), which will lock the range,
	 * find that ordered extent and then wait for it to complete (at
	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
	 * obviously the ordered extent can never complete as we didn't submit
	 * yet the respective bio(s). This always happens when the buffer is
	 * memory mapped to the same file range, since the iomap DIO code always
	 * invalidates pages in the target file range (after starting and waiting
	 * for any writeback).
	 *
	 * So here we disable page faults in the iov_iter and then retry if we
	 * got -EFAULT, faulting in the pages before the retry.
	 */
again:
	from->nofault = true;
	dio = btrfs_dio_write(iocb, from, written);
	from->nofault = false;

	if (IS_ERR_OR_NULL(dio)) {
		ret = PTR_ERR_OR_ZERO(dio);
	} else {
		/*
		 * If we have a synchronous write, we must make sure the fsync
		 * triggered by the iomap_dio_complete() call below doesn't
		 * deadlock on the inode lock - we are already holding it and we
		 * can't call it after unlocking because we may need to complete
		 * partial writes due to the input buffer (or parts of it) not
		 * being already faulted in.
		 */
		ASSERT(current->journal_info == NULL);
		current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
		ret = iomap_dio_complete(dio);
		current->journal_info = NULL;
	}

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		written = ret;

	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(from);
		/*
		 * We have more data left to write. Try to fault in as many as
		 * possible of the remainder pages and retry. We do this without
		 * releasing and locking again the inode, to prevent races with
		 * truncate.
		 *
		 * Also, in case the iov refers to pages in the file range of the
		 * file we want to write to (due to a mmap), we could enter an
		 * infinite loop if we retry after faulting the pages in, since
		 * iomap will invalidate any pages in the range early on, before
		 * it tries to fault in the pages of the iov. So we keep track of
		 * how much was left of iov in the previous EFAULT and fallback
		 * to buffered IO in case we haven't made any progress.
		 */
		if (left == prev_left) {
			ret = -ENOTBLK;
		} else {
			fault_in_iov_iter_readable(from, left);
			prev_left = left;
			goto again;
		}
	}

	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);

	/*
	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
	 * we must fallback to buffered IO.
	 */
	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
		goto out;

buffered:
	/*
	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
	 * it must retry the operation in a context where blocking is acceptable,
	 * because even if we end up not blocking during the buffered IO attempt
	 * below, we will block when flushing and waiting for the IO.
	 */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		ret = -EAGAIN;
		goto out;
	}

	pos = iocb->ki_pos;
	written_buffered = btrfs_buffered_write(iocb, from);
	if (written_buffered < 0) {
		ret = written_buffered;
		goto out;
	}
	/*
	 * Ensure all data is persisted. We want the next direct IO read to be
	 * able to read what was just written.
	 */
	endbyte = pos + written_buffered - 1;
	ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
	if (ret)
		goto out;
	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
	if (ret)
		goto out;
	written += written_buffered;
	iocb->ki_pos = pos + written_buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
				 endbyte >> PAGE_SHIFT);
out:
	return ret < 0 ? ret : written;
}

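/*
 * On top of the usual alignment checks, direct reads reject iov_iters in which
 * two segments point at the same base address.
 */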
static int check_direct_read(struct btrfs_fs_info *fs_info,
			     const struct iov_iter *iter, loff_t offset)
{
	int ret;
	int i, seg;

	ret = check_direct_IO(fs_info, iter, offset);
	if (ret < 0)
		return ret;

	if (!iter_is_iovec(iter))
		return 0;

	for (seg = 0; seg < iter->nr_segs; seg++) {
		for (i = seg + 1; i < iter->nr_segs; i++) {
			const struct iovec *iov1 = iter_iov(iter) + seg;
			const struct iovec *iov2 = iter_iov(iter) + i;

			if (iov1->iov_base == iov2->iov_base)
				return -EINVAL;
		}
	}
	return 0;
}

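/*
 * Handle a direct I/O read: run iomap with page faults disabled, fault in the
 * destination buffer and retry when needed, and give up (so the caller can
 * fall back to a buffered read) once no more progress is being made.
 */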
ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t prev_left = 0;
	ssize_t read = 0;
	ssize_t ret;

	if (fsverity_active(inode))
		return 0;

	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
		return 0;

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
	/*
	 * This is similar to what we do for direct IO writes, see the comment
	 * at btrfs_direct_write(), but we also disable page faults in addition
	 * to disabling them only at the iov_iter level. This is because when
	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
	 * which can still trigger page fault ins despite having set ->nofault
	 * to true on our 'to' iov_iter.
	 *
	 * The difference to direct IO writes is that we deadlock when trying
	 * to lock the extent range in the inode's tree during the page reads
	 * triggered by the fault in (while for writes it is due to waiting for
	 * our own ordered extent). This is because for direct IO reads,
	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
	 */
	pagefault_disable();
	to->nofault = true;
	ret = btrfs_dio_read(iocb, to, read);
	to->nofault = false;
	pagefault_enable();

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		read = ret;

	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(to);

		if (left == prev_left) {
			/*
			 * We didn't make any progress since the last attempt,
			 * fallback to a buffered read for the remainder of the
			 * range. This is just to avoid any possibility of looping
			 * for too long.
			 */
			ret = read;
		} else {
			/*
			 * We made some progress since the last retry or this is
			 * the first time we are retrying. Fault in as many pages
			 * as possible and retry.
			 */
			fault_in_iov_iter_writeable(to, left);
			prev_left = left;
			goto again;
		}
	}
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
	return ret < 0 ? ret : read;
}

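/* Set up the bio set used for all btrfs direct I/O bios. */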
int __init btrfs_init_dio(void)
{
	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_dio_private, bbio.bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;

	return 0;
}

void __cold btrfs_destroy_dio(void)
{
	bioset_exit(&btrfs_dio_bioset);
}