fs/btrfs/inode.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / btrfs / inode.c
at master 313 kB view raw
    1// SPDX-License-Identifier: GPL-2.0
    2/*
    3 * Copyright (C) 2007 Oracle.  All rights reserved.
    4 */
    5
    6#include <crypto/hash.h>
    7#include <linux/kernel.h>
    8#include <linux/bio.h>
    9#include <linux/blk-cgroup.h>
   10#include <linux/file.h>
   11#include <linux/fs.h>
   12#include <linux/fs_struct.h>
   13#include <linux/pagemap.h>
   14#include <linux/highmem.h>
   15#include <linux/time.h>
   16#include <linux/init.h>
   17#include <linux/string.h>
   18#include <linux/backing-dev.h>
   19#include <linux/writeback.h>
   20#include <linux/compat.h>
   21#include <linux/xattr.h>
   22#include <linux/posix_acl.h>
   23#include <linux/falloc.h>
   24#include <linux/slab.h>
   25#include <linux/ratelimit.h>
   26#include <linux/btrfs.h>
   27#include <linux/blkdev.h>
   28#include <linux/posix_acl_xattr.h>
   29#include <linux/uio.h>
   30#include <linux/magic.h>
   31#include <linux/iversion.h>
   32#include <linux/swap.h>
   33#include <linux/migrate.h>
   34#include <linux/sched/mm.h>
   35#include <linux/iomap.h>
   36#include <linux/unaligned.h>
   37#include <linux/fsverity.h>
   38#include "misc.h"
   39#include "ctree.h"
   40#include "disk-io.h"
   41#include "transaction.h"
   42#include "btrfs_inode.h"
   43#include "ordered-data.h"
   44#include "xattr.h"
   45#include "tree-log.h"
   46#include "bio.h"
   47#include "compression.h"
   48#include "locking.h"
   49#include "props.h"
   50#include "qgroup.h"
   51#include "delalloc-space.h"
   52#include "block-group.h"
   53#include "space-info.h"
   54#include "zoned.h"
   55#include "subpage.h"
   56#include "inode-item.h"
   57#include "fs.h"
   58#include "accessors.h"
   59#include "extent-tree.h"
   60#include "root-tree.h"
   61#include "defrag.h"
   62#include "dir-item.h"
   63#include "file-item.h"
   64#include "uuid-tree.h"
   65#include "ioctl.h"
   66#include "file.h"
   67#include "acl.h"
   68#include "relocation.h"
   69#include "verity.h"
   70#include "super.h"
   71#include "orphan.h"
   72#include "backref.h"
   73#include "raid-stripe-tree.h"
   74#include "fiemap.h"
   75#include "delayed-inode.h"
   76
   77#define COW_FILE_RANGE_KEEP_LOCKED	(1UL << 0)
   78#define COW_FILE_RANGE_NO_INLINE	(1UL << 1)
   79
   80struct btrfs_iget_args {	/* NOTE(review): users are outside this chunk; presumably the (ino, root) pair identifying an inode for iget lookups - confirm */
   81	u64 ino;
   82	struct btrfs_root *root;
   83};
   84
   85struct btrfs_rename_ctx {	/* NOTE(review): users are outside this chunk; presumably filled in by the rename code - confirm */
   86	/* Output field. Stores the index number of the old directory entry. */
   87	u64 index;
   88};
   89
   90/*
   91 * Used by data_reloc_print_warning_inode() to pass needed info for filename
   92 * resolution and output of error message.
   93 */
   94struct data_reloc_warn {
   95	struct btrfs_path path;	/* scratch path for the inode item search and for init_ipath() */
   96	struct btrfs_fs_info *fs_info;
   97	u64 extent_item_size;	/* set from the found extent key's offset by print_data_reloc_error() */
   98	u64 logical;	/* logical address of the failed read, printed in every warning */
   99	int mirror_num;	/* mirror the failed read came from, printed in every warning */
  100};
  101
  102/*
  103 * For the file_extent_tree, we want to hold the inode lock when we lookup and
  104 * update the disk_i_size, but lockdep will complain because our io_tree we hold
  105 * the tree lock and get the inode lock when setting delalloc. These two things
  106 * are unrelated, so make a class for the file_extent_tree so we don't get the
  107 * two locking patterns mixed up.
  108 */
  109static struct lock_class_key file_extent_tree_class;
  110
  111static const struct inode_operations btrfs_dir_inode_operations;
  112static const struct inode_operations btrfs_symlink_inode_operations;
  113static const struct inode_operations btrfs_special_inode_operations;
  114static const struct inode_operations btrfs_file_inode_operations;
  115static const struct address_space_operations btrfs_aops;
  116static const struct file_operations btrfs_dir_file_operations;
  117
  118static struct kmem_cache *btrfs_inode_cachep;
  119
  120static int btrfs_setsize(struct inode *inode, struct iattr *attr);
  121static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
  122
  123static noinline int run_delalloc_cow(struct btrfs_inode *inode,
  124				     struct folio *locked_folio, u64 start,
  125				     u64 end, struct writeback_control *wbc,
  126				     bool pages_dirty);
  127
  128static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
  129					  u64 root, void *warn_ctx)
  130{
  131	struct data_reloc_warn *warn = warn_ctx;
  132	struct btrfs_fs_info *fs_info = warn->fs_info;
  133	struct extent_buffer *eb;
  134	struct btrfs_inode_item *inode_item;
  135	struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL;
  136	struct btrfs_root *local_root;
  137	struct btrfs_key key;
  138	unsigned int nofs_flag;
  139	u32 nlink;
  140	int ret;
  141
  142	local_root = btrfs_get_fs_root(fs_info, root, true);
  143	if (IS_ERR(local_root)) {
  144		ret = PTR_ERR(local_root);
  145		goto err;
  146	}
  147
  148	/* This makes the path point to (inum INODE_ITEM ioff). */
  149	key.objectid = inum;
  150	key.type = BTRFS_INODE_ITEM_KEY;
  151	key.offset = 0;
  152
  153	ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
  154	if (ret) {
  155		btrfs_put_root(local_root);
  156		btrfs_release_path(&warn->path);
  157		goto err;
  158	}
  159
  160	eb = warn->path.nodes[0];
  161	inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
  162	nlink = btrfs_inode_nlink(eb, inode_item);
  163	btrfs_release_path(&warn->path);
  164
  165	nofs_flag = memalloc_nofs_save();
  166	ipath = init_ipath(4096, local_root, &warn->path);
  167	memalloc_nofs_restore(nofs_flag);
  168	if (IS_ERR(ipath)) {
  169		btrfs_put_root(local_root);
  170		ret = PTR_ERR(ipath);
  171		ipath = NULL;
  172		/*
  173		 * -ENOMEM, not a critical error, just output an generic error
  174		 * without filename.
  175		 */
  176		btrfs_warn(fs_info,
  177"checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
  178			   warn->logical, warn->mirror_num, root, inum, offset);
  179		return ret;
  180	}
  181	ret = paths_from_inode(inum, ipath);
  182	if (ret < 0) {
  183		btrfs_put_root(local_root);
  184		goto err;
  185	}
  186
  187	/*
  188	 * We deliberately ignore the bit ipath might have been too small to
  189	 * hold all of the paths here
  190	 */
  191	for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
  192		btrfs_warn(fs_info,
  193"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
  194			   warn->logical, warn->mirror_num, root, inum, offset,
  195			   fs_info->sectorsize, nlink,
  196			   (char *)(unsigned long)ipath->fspath->val[i]);
  197	}
  198
  199	btrfs_put_root(local_root);
  200	return 0;
  201
  202err:
  203	btrfs_warn(fs_info,
  204"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
  205		   warn->logical, warn->mirror_num, root, inum, offset, ret);
  206
  207	return ret;
  208}
  209
  210/*
  211 * Do extra user-friendly error output (e.g. lookup all the affected files).
  212 *
  213 * Return true if we succeeded doing the backref lookup.
  214 * Return false if such lookup failed, and has to fallback to the old error message.
  215 */
  216static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
  217				   const u8 *csum, const u8 *csum_expected,
  218				   int mirror_num)
  219{
  220	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  221	struct btrfs_path path = { 0 };
  222	struct btrfs_key found_key = { 0 };
  223	struct extent_buffer *eb;
  224	struct btrfs_extent_item *ei;
  225	const u32 csum_size = fs_info->csum_size;
  226	u64 logical;
  227	u64 flags;
  228	u32 item_size;
  229	int ret;
  230
  231	mutex_lock(&fs_info->reloc_mutex);
  232	logical = btrfs_get_reloc_bg_bytenr(fs_info);
  233	mutex_unlock(&fs_info->reloc_mutex);
  234
  235	if (logical == U64_MAX) {
  236		btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
  237		btrfs_warn_rl(fs_info,
  238"csum failed root %lld ino %llu off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d",
  239			btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
  240			BTRFS_CSUM_FMT_VALUE(csum_size, csum),
  241			BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected),
  242			mirror_num);
  243		return;
  244	}
  245
  246	logical += file_off;
  247	btrfs_warn_rl(fs_info,
  248"csum failed root %lld ino %llu off %llu logical %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d",
  249			btrfs_root_id(inode->root),
  250			btrfs_ino(inode), file_off, logical,
  251			BTRFS_CSUM_FMT_VALUE(csum_size, csum),
  252			BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected),
  253			mirror_num);
  254
  255	ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
  256	if (ret < 0) {
  257		btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
  258			     logical, ret);
  259		btrfs_release_path(&path);
  260		return;
  261	}
  262	eb = path.nodes[0];
  263	ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
  264	item_size = btrfs_item_size(eb, path.slots[0]);
  265	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
  266		unsigned long ptr = 0;
  267		u64 ref_root;
  268		u8 ref_level;
  269
  270		while (true) {
  271			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
  272						      item_size, &ref_root,
  273						      &ref_level);
  274			if (ret < 0) {
  275				btrfs_warn_rl(fs_info,
  276				"failed to resolve tree backref for logical %llu: %d",
  277					      logical, ret);
  278				break;
  279			}
  280			if (ret > 0)
  281				break;
  282
  283			btrfs_warn_rl(fs_info,
  284"csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
  285				logical, mirror_num,
  286				(ref_level ? "node" : "leaf"),
  287				ref_level, ref_root);
  288		}
  289		btrfs_release_path(&path);
  290	} else {
  291		struct btrfs_backref_walk_ctx ctx = { 0 };
  292		struct data_reloc_warn reloc_warn = { 0 };
  293
  294		btrfs_release_path(&path);
  295
  296		ctx.bytenr = found_key.objectid;
  297		ctx.extent_item_pos = logical - found_key.objectid;
  298		ctx.fs_info = fs_info;
  299
  300		reloc_warn.logical = logical;
  301		reloc_warn.extent_item_size = found_key.offset;
  302		reloc_warn.mirror_num = mirror_num;
  303		reloc_warn.fs_info = fs_info;
  304
  305		iterate_extent_inodes(&ctx, true,
  306				      data_reloc_print_warning_inode, &reloc_warn);
  307	}
  308}
  309
  310static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
  311		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
  312{
  313	struct btrfs_root *root = inode->root;
  314	const u32 csum_size = root->fs_info->csum_size;
  315
  316	/* For data reloc tree, it's better to do a backref lookup instead. */
  317	if (btrfs_is_data_reloc_root(root))
  318		return print_data_reloc_error(inode, logical_start, csum,
  319					      csum_expected, mirror_num);
  320
  321	/* Output without objectid, which is more meaningful */
  322	if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
  323		btrfs_warn_rl(root->fs_info,
  324"csum failed root %lld ino %lld off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d",
  325			btrfs_root_id(root), btrfs_ino(inode),
  326			logical_start,
  327			BTRFS_CSUM_FMT_VALUE(csum_size, csum),
  328			BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected),
  329			mirror_num);
  330	} else {
  331		btrfs_warn_rl(root->fs_info,
  332"csum failed root %llu ino %llu off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d",
  333			btrfs_root_id(root), btrfs_ino(inode),
  334			logical_start,
  335			BTRFS_CSUM_FMT_VALUE(csum_size, csum),
  336			BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected),
  337			mirror_num);
  338	}
  339}
  340
  341/*
  342 * Lock inode i_rwsem based on arguments passed.
  343 *
  344 * ilock_flags can have the following bit set:
  345 *
  346 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
  347 * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
  348 *		     return -EAGAIN
  349 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
  350 */
  351int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
  352{
  353	if (ilock_flags & BTRFS_ILOCK_SHARED) {
  354		if (ilock_flags & BTRFS_ILOCK_TRY) {
  355			if (!inode_trylock_shared(&inode->vfs_inode))
  356				return -EAGAIN;
  357			else
  358				return 0;
  359		}
  360		inode_lock_shared(&inode->vfs_inode);
  361	} else {
  362		if (ilock_flags & BTRFS_ILOCK_TRY) {
  363			if (!inode_trylock(&inode->vfs_inode))
  364				return -EAGAIN;
  365			else
  366				return 0;
  367		}
  368		inode_lock(&inode->vfs_inode);
  369	}
  370	if (ilock_flags & BTRFS_ILOCK_MMAP)
  371		down_write(&inode->i_mmap_lock);
  372	return 0;
  373}
  374
  375/*
  376 * Unlock inode i_rwsem.
  377 *
  378 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
  379 * to decide whether the lock acquired is shared or exclusive.
  380 */
  381void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
  382{
  383	if (ilock_flags & BTRFS_ILOCK_MMAP)
  384		up_write(&inode->i_mmap_lock);
  385	if (ilock_flags & BTRFS_ILOCK_SHARED)
  386		inode_unlock_shared(&inode->vfs_inode);
  387	else
  388		inode_unlock(&inode->vfs_inode);
  389}
  390
  391/*
  392 * Cleanup all submitted ordered extents in specified range to handle errors
  393 * from the btrfs_run_delalloc_range() callback.
  394 *
  395 * NOTE: caller must ensure that when an error happens, it can not call
  396 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
  397 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
  398 * to be released, which we want to happen only when finishing the ordered
  399 * extent (btrfs_finish_ordered_io()).
  400 */
  401static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
  402						 u64 offset, u64 bytes)
  403{
  404	pgoff_t index = offset >> PAGE_SHIFT;
  405	const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT;
  406	struct folio *folio;
  407
  408	while (index <= end_index) {
  409		folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
  410		if (IS_ERR(folio)) {
  411			index++;
  412			continue;
  413		}
  414
  415		index = folio_next_index(folio);
  416		/*
  417		 * Here we just clear all Ordered bits for every page in the
  418		 * range, then btrfs_mark_ordered_io_finished() will handle
  419		 * the ordered extent accounting for the range.
  420		 */
  421		btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
  422						offset, bytes);
  423		folio_put(folio);
  424	}
  425
  426	return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
  427}
  428
  429static int btrfs_dirty_inode(struct btrfs_inode *inode);
  430
  431static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
  432				     struct btrfs_new_inode_args *args)
  433{
  434	int ret;
  435
  436	if (args->default_acl) {
  437		ret = __btrfs_set_acl(trans, args->inode, args->default_acl,
  438				      ACL_TYPE_DEFAULT);
  439		if (ret)
  440			return ret;
  441	}
  442	if (args->acl) {
  443		ret = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
  444		if (ret)
  445			return ret;
  446	}
  447	if (!args->default_acl && !args->acl)
  448		cache_no_acl(args->inode);
  449	return btrfs_xattr_security_init(trans, args->inode, args->dir,
  450					 &args->dentry->d_name);
  451}
  452
  453/*
  454 * this does all the hard work for inserting an inline extent into
  455 * the btree.  The caller should have done a btrfs_drop_extents so that
  456 * no overlapping inline items exist in the btree
  457 */
  458static int insert_inline_extent(struct btrfs_trans_handle *trans,
  459				struct btrfs_path *path,
  460				struct btrfs_inode *inode, bool extent_inserted,
  461				size_t size, size_t compressed_size,
  462				int compress_type,
  463				struct folio *compressed_folio,
  464				bool update_i_size)
  465{
  466	struct btrfs_root *root = inode->root;
  467	struct extent_buffer *leaf;
  468	const u32 sectorsize = trans->fs_info->sectorsize;
  469	char *kaddr;
  470	unsigned long ptr;
  471	struct btrfs_file_extent_item *ei;
  472	int ret;
  473	size_t cur_size = size;
  474	u64 i_size;
  475
  476	/*
  477	 * The decompressed size must still be no larger than a sector.  Under
  478	 * heavy race, we can have size == 0 passed in, but that shouldn't be a
  479	 * big deal and we can continue the insertion.
  480	 */
  481	ASSERT(size <= sectorsize);
  482
  483	/*
  484	 * The compressed size also needs to be no larger than a page.
  485	 * That's also why we only need one folio as the parameter.
  486	 */
  487	if (compressed_folio) {
  488		ASSERT(compressed_size <= sectorsize);
  489		ASSERT(compressed_size <= PAGE_SIZE);
  490	} else {
  491		ASSERT(compressed_size == 0);
  492	}
  493
  494	if (compressed_size && compressed_folio)
  495		cur_size = compressed_size;
  496
  497	if (!extent_inserted) {
  498		struct btrfs_key key;
  499		size_t datasize;
  500
  501		key.objectid = btrfs_ino(inode);
  502		key.type = BTRFS_EXTENT_DATA_KEY;
  503		key.offset = 0;
  504
  505		datasize = btrfs_file_extent_calc_inline_size(cur_size);
  506		ret = btrfs_insert_empty_item(trans, root, path, &key,
  507					      datasize);
  508		if (ret)
  509			goto fail;
  510	}
  511	leaf = path->nodes[0];
  512	ei = btrfs_item_ptr(leaf, path->slots[0],
  513			    struct btrfs_file_extent_item);
  514	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
  515	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
  516	btrfs_set_file_extent_encryption(leaf, ei, 0);
  517	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
  518	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
  519	ptr = btrfs_file_extent_inline_start(ei);
  520
  521	if (compress_type != BTRFS_COMPRESS_NONE) {
  522		kaddr = kmap_local_folio(compressed_folio, 0);
  523		write_extent_buffer(leaf, kaddr, ptr, compressed_size);
  524		kunmap_local(kaddr);
  525
  526		btrfs_set_file_extent_compression(leaf, ei,
  527						  compress_type);
  528	} else {
  529		struct folio *folio;
  530
  531		folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0);
  532		ASSERT(!IS_ERR(folio));
  533		btrfs_set_file_extent_compression(leaf, ei, 0);
  534		kaddr = kmap_local_folio(folio, 0);
  535		write_extent_buffer(leaf, kaddr, ptr, size);
  536		kunmap_local(kaddr);
  537		folio_put(folio);
  538	}
  539	btrfs_release_path(path);
  540
  541	/*
  542	 * We align size to sectorsize for inline extents just for simplicity
  543	 * sake.
  544	 */
  545	ret = btrfs_inode_set_file_extent_range(inode, 0,
  546					ALIGN(size, root->fs_info->sectorsize));
  547	if (ret)
  548		goto fail;
  549
  550	/*
  551	 * We're an inline extent, so nobody can extend the file past i_size
  552	 * without locking a page we already have locked.
  553	 *
  554	 * We must do any i_size and inode updates before we unlock the pages.
  555	 * Otherwise we could end up racing with unlink.
  556	 */
  557	i_size = i_size_read(&inode->vfs_inode);
  558	if (update_i_size && size > i_size) {
  559		i_size_write(&inode->vfs_inode, size);
  560		i_size = size;
  561	}
  562	inode->disk_i_size = i_size;
  563
  564fail:
  565	return ret;
  566}
  567
  568static bool can_cow_file_range_inline(struct btrfs_inode *inode,
  569				      u64 offset, u64 size,
  570				      size_t compressed_size)
  571{
  572	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  573	u64 data_len = (compressed_size ?: size);
  574
  575	/* Inline extents must start at offset 0. */
  576	if (offset != 0)
  577		return false;
  578
  579	/*
  580	 * Even for bs > ps cases, cow_file_range_inline() can only accept a
  581	 * single folio.
  582	 *
  583	 * This can be problematic and cause access beyond page boundary if a
  584	 * page sized folio is passed into that function.
  585	 * And encoded write is doing exactly that.
  586	 * So here limits the inlined extent size to PAGE_SIZE.
  587	 */
  588	if (size > PAGE_SIZE || compressed_size > PAGE_SIZE)
  589		return false;
  590
  591	/* Inline extents are limited to sectorsize. */
  592	if (size > fs_info->sectorsize)
  593		return false;
  594
  595	/* We do not allow a non-compressed extent to be as large as block size. */
  596	if (data_len >= fs_info->sectorsize)
  597		return false;
  598
  599	/* We cannot exceed the maximum inline data size. */
  600	if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
  601		return false;
  602
  603	/* We cannot exceed the user specified max_inline size. */
  604	if (data_len > fs_info->max_inline)
  605		return false;
  606
  607	/* Inline extents must be the entirety of the file. */
  608	if (size < i_size_read(&inode->vfs_inode))
  609		return false;
  610
  611	/* Encrypted file cannot be inlined. */
  612	if (IS_ENCRYPTED(&inode->vfs_inode))
  613		return false;
  614
  615	return true;
  616}
  617
  618/*
  619 * conditionally insert an inline extent into the file.  This
  620 * does the checks required to make sure the data is small enough
  621 * to fit as an inline extent.
  622 *
  623 * If being used directly, you must have already checked we're allowed to cow
  624 * the range by getting true from can_cow_file_range_inline().
  625 */
  626static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
  627					    u64 size, size_t compressed_size,
  628					    int compress_type,
  629					    struct folio *compressed_folio,
  630					    bool update_i_size)
  631{
  632	struct btrfs_drop_extents_args drop_args = { 0 };
  633	struct btrfs_root *root = inode->root;
  634	struct btrfs_fs_info *fs_info = root->fs_info;
  635	struct btrfs_trans_handle *trans = NULL;
  636	u64 data_len = (compressed_size ?: size);
  637	int ret;
  638	struct btrfs_path *path;
  639
  640	path = btrfs_alloc_path();
  641	if (!path) {
  642		ret = -ENOMEM;
  643		goto out;
  644	}
  645
  646	trans = btrfs_join_transaction(root);
  647	if (IS_ERR(trans)) {
  648		ret = PTR_ERR(trans);
  649		trans = NULL;
  650		goto out;
  651	}
  652	trans->block_rsv = &inode->block_rsv;
  653
  654	drop_args.path = path;
  655	drop_args.start = 0;
  656	drop_args.end = fs_info->sectorsize;
  657	drop_args.drop_cache = true;
  658	drop_args.replace_extent = true;
  659	drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
  660	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
  661	if (unlikely(ret)) {
  662		btrfs_abort_transaction(trans, ret);
  663		goto out;
  664	}
  665
  666	ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
  667				   size, compressed_size, compress_type,
  668				   compressed_folio, update_i_size);
  669	if (unlikely(ret && ret != -ENOSPC)) {
  670		btrfs_abort_transaction(trans, ret);
  671		goto out;
  672	} else if (ret == -ENOSPC) {
  673		ret = 1;
  674		goto out;
  675	}
  676
  677	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
  678	ret = btrfs_update_inode(trans, inode);
  679	if (unlikely(ret && ret != -ENOSPC)) {
  680		btrfs_abort_transaction(trans, ret);
  681		goto out;
  682	} else if (ret == -ENOSPC) {
  683		ret = 1;
  684		goto out;
  685	}
  686
  687	btrfs_set_inode_full_sync(inode);
  688out:
  689	/*
  690	 * Don't forget to free the reserved space, as for inlined extent
  691	 * it won't count as data extent, free them directly here.
  692	 * And at reserve time, it's always aligned to page size, so
  693	 * just free one page here.
  694	 *
  695	 * If we fallback to non-inline (ret == 1) due to -ENOSPC, then we need
  696	 * to keep the data reservation.
  697	 */
  698	if (ret <= 0)
  699		btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
  700	btrfs_free_path(path);
  701	if (trans)
  702		btrfs_end_transaction(trans);
  703	return ret;
  704}
  705
  706static noinline int cow_file_range_inline(struct btrfs_inode *inode,
  707					  struct folio *locked_folio,
  708					  u64 offset, u64 end,
  709					  size_t compressed_size,
  710					  int compress_type,
  711					  struct folio *compressed_folio,
  712					  bool update_i_size)
  713{
  714	struct extent_state *cached = NULL;
  715	unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
  716		EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
  717	u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1);
  718	int ret;
  719
  720	if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
  721		return 1;
  722
  723	btrfs_lock_extent(&inode->io_tree, offset, end, &cached);
  724	ret = __cow_file_range_inline(inode, size, compressed_size,
  725				      compress_type, compressed_folio,
  726				      update_i_size);
  727	if (ret > 0) {
  728		btrfs_unlock_extent(&inode->io_tree, offset, end, &cached);
  729		return ret;
  730	}
  731
  732	/*
  733	 * In the successful case (ret == 0 here), cow_file_range will return 1.
  734	 *
  735	 * Quite a bit further up the callstack in extent_writepage(), ret == 1
  736	 * is treated as a short circuited success and does not unlock the folio,
  737	 * so we must do it here.
  738	 *
  739	 * In the failure case, the locked_folio does get unlocked by
  740	 * btrfs_folio_end_all_writers, which asserts that it is still locked
  741	 * at that point, so we must *not* unlock it here.
  742	 *
  743	 * The other two callsites in compress_file_range do not have a
  744	 * locked_folio, so they are not relevant to this logic.
  745	 */
  746	if (ret == 0)
  747		locked_folio = NULL;
  748
  749	extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached,
  750				     clear_flags, PAGE_UNLOCK |
  751				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
  752	return ret;
  753}
  754
  755struct async_extent {	/* one extent queued on an async_chunk, filled in by add_async_extent() */
  756	u64 start;
  757	u64 ram_size;
  758	u64 compressed_size;
  759	struct folio **folios;
  760	unsigned long nr_folios;	/* NOTE(review): presumably the number of entries in folios - confirm */
  761	int compress_type;
  762	struct list_head list;	/* linked into async_chunk::extents */
  763};
  764
  765struct async_chunk {
  766	struct btrfs_inode *inode;
  767	struct folio *locked_folio;
  768	u64 start;	/* first byte of the range handled by this chunk */
  769	u64 end;	/* last byte of the range, inclusive (users compute end + 1) */
  770	blk_opf_t write_flags;
  771	struct list_head extents;	/* list of struct async_extent, appended by add_async_extent() */
  772	struct cgroup_subsys_state *blkcg_css;
  773	struct btrfs_work work;	/* compress_file_range() gets back to the chunk from this via container_of() */
  774	struct async_cow *async_cow;
  775};
  776
  777struct async_cow {
  778	atomic_t num_chunks;	/* NOTE(review): presumably counts the entries of chunks[] still in use - confirm */
  779	struct async_chunk chunks[];	/* flexible array, sized by whoever allocates the struct */
  780};
  781
  782static noinline int add_async_extent(struct async_chunk *cow,
  783				     u64 start, u64 ram_size,
  784				     u64 compressed_size,
  785				     struct folio **folios,
  786				     unsigned long nr_folios,
  787				     int compress_type)
  788{
  789	struct async_extent *async_extent;
  790
  791	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
  792	if (!async_extent)
  793		return -ENOMEM;
  794	async_extent->start = start;
  795	async_extent->ram_size = ram_size;
  796	async_extent->compressed_size = compressed_size;
  797	async_extent->folios = folios;
  798	async_extent->nr_folios = nr_folios;
  799	async_extent->compress_type = compress_type;
  800	list_add_tail(&async_extent->list, &cow->extents);
  801	return 0;
  802}
  803
  804/*
  805 * Check if the inode needs to be submitted to compression, based on mount
  806 * options, defragmentation, properties or heuristics.
  807 */
  808static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
  809				      u64 end)
  810{
  811	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  812
  813	if (!btrfs_inode_can_compress(inode)) {
  814		DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode));
  815		return 0;
  816	}
  817
  818	/* Defrag ioctl takes precedence over mount options and properties. */
  819	if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
  820		return 0;
  821	if (BTRFS_COMPRESS_NONE < inode->defrag_compress &&
  822	    inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES)
  823		return 1;
  824	/* force compress */
  825	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
  826		return 1;
  827	/* bad compression ratios */
  828	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
  829		return 0;
  830	if (btrfs_test_opt(fs_info, COMPRESS) ||
  831	    inode->flags & BTRFS_INODE_COMPRESS ||
  832	    inode->prop_compress)
  833		return btrfs_compress_heuristic(inode, start, end);
  834	return 0;
  835}
  836
  837static inline void inode_should_defrag(struct btrfs_inode *inode,
  838		u64 start, u64 end, u64 num_bytes, u32 small_write)
  839{
  840	/* If this is a small write inside eof, kick off a defrag */
  841	if (num_bytes < small_write &&
  842	    (start > 0 || end + 1 < inode->disk_i_size))
  843		btrfs_add_inode_defrag(inode, small_write);
  844}
  845
  846static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
  847{
  848	const pgoff_t end_index = end >> PAGE_SHIFT;
  849	struct folio *folio;
  850	int ret = 0;
  851
  852	for (pgoff_t index = start >> PAGE_SHIFT; index <= end_index; index++) {
  853		folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
  854		if (IS_ERR(folio)) {
  855			if (!ret)
  856				ret = PTR_ERR(folio);
  857			continue;
  858		}
  859		btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start,
  860					      end + 1 - start);
  861		folio_put(folio);
  862	}
  863	return ret;
  864}
  865
  866/*
  867 * Work queue call back to started compression on a file and pages.
  868 *
  869 * This is done inside an ordered work queue, and the compression is spread
  870 * across many cpus.  The actual IO submission is step two, and the ordered work
  871 * queue takes care of making sure that happens in the same order things were
  872 * put onto the queue by writepages and friends.
  873 *
  874 * If this code finds it can't get good compression, it puts an entry onto the
  875 * work queue to write the uncompressed bytes.  This makes sure that both
  876 * compressed inodes and uncompressed inodes are written in the same order that
  877 * the flusher thread sent them down.
  878 */
  879static void compress_file_range(struct btrfs_work *work)
  880{
  881	struct async_chunk *async_chunk =
  882		container_of(work, struct async_chunk, work);
  883	struct btrfs_inode *inode = async_chunk->inode;
  884	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  885	struct address_space *mapping = inode->vfs_inode.i_mapping;
  886	const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
  887	const u32 min_folio_size = btrfs_min_folio_size(fs_info);
  888	u64 blocksize = fs_info->sectorsize;
  889	u64 start = async_chunk->start;
  890	u64 end = async_chunk->end;
  891	u64 actual_end;
  892	u64 i_size;
  893	int ret = 0;
  894	struct folio **folios = NULL;
  895	unsigned long nr_folios;
  896	unsigned long total_compressed = 0;
  897	unsigned long total_in = 0;
  898	unsigned int loff;
  899	int i;
  900	int compress_type = fs_info->compress_type;
  901	int compress_level = fs_info->compress_level;
  902
  903	if (unlikely(btrfs_is_shutdown(fs_info)))
  904		goto cleanup_and_bail_uncompressed;
  905
  906	inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
  907
  908	/*
  909	 * We need to call clear_page_dirty_for_io on each page in the range.
  910	 * Otherwise applications with the file mmap'd can wander in and change
  911	 * the page contents while we are compressing them.
  912	 */
  913	ret = extent_range_clear_dirty_for_io(inode, start, end);
  914
  915	/*
  916	 * All the folios should have been locked thus no failure.
  917	 *
  918	 * And even if some folios are missing, btrfs_compress_folios()
  919	 * would handle them correctly, so here just do an ASSERT() check for
  920	 * early logic errors.
  921	 */
  922	ASSERT(ret == 0);
  923
  924	/*
  925	 * We need to save i_size before now because it could change in between
  926	 * us evaluating the size and assigning it.  This is because we lock and
  927	 * unlock the page in truncate and fallocate, and then modify the i_size
  928	 * later on.
  929	 *
  930	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
  931	 * does that for us.
  932	 */
  933	barrier();
  934	i_size = i_size_read(&inode->vfs_inode);
  935	barrier();
  936	actual_end = min_t(u64, i_size, end + 1);
  937again:
  938	folios = NULL;
  939	nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1;
  940	nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift);
  941
  942	/*
  943	 * we don't want to send crud past the end of i_size through
  944	 * compression, that's just a waste of CPU time.  So, if the
  945	 * end of the file is before the start of our current
  946	 * requested range of bytes, we bail out to the uncompressed
  947	 * cleanup code that can deal with all of this.
  948	 *
  949	 * It isn't really the fastest way to fix things, but this is a
  950	 * very uncommon corner.
  951	 */
  952	if (actual_end <= start)
  953		goto cleanup_and_bail_uncompressed;
  954
  955	total_compressed = actual_end - start;
  956
  957	/*
  958	 * Skip compression for a small file range(<=blocksize) that
  959	 * isn't an inline extent, since it doesn't save disk space at all.
  960	 */
  961	if (total_compressed <= blocksize &&
  962	   (start > 0 || end + 1 < inode->disk_i_size))
  963		goto cleanup_and_bail_uncompressed;
  964
  965	total_compressed = min_t(unsigned long, total_compressed,
  966			BTRFS_MAX_UNCOMPRESSED);
  967	total_in = 0;
  968	ret = 0;
  969
  970	/*
  971	 * We do compression for mount -o compress and when the inode has not
  972	 * been flagged as NOCOMPRESS.  This flag can change at any time if we
  973	 * discover bad compression ratios.
  974	 */
  975	if (!inode_need_compress(inode, start, end))
  976		goto cleanup_and_bail_uncompressed;
  977
  978	folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
  979	if (!folios) {
  980		/*
  981		 * Memory allocation failure is not a fatal error, we can fall
  982		 * back to uncompressed code.
  983		 */
  984		goto cleanup_and_bail_uncompressed;
  985	}
  986
  987	if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) {
  988		compress_type = inode->defrag_compress;
  989		compress_level = inode->defrag_compress_level;
  990	} else if (inode->prop_compress) {
  991		compress_type = inode->prop_compress;
  992	}
  993
  994	/* Compression level is applied here. */
  995	ret = btrfs_compress_folios(compress_type, compress_level,
  996				    inode, start, folios, &nr_folios, &total_in,
  997				    &total_compressed);
  998	if (ret)
  999		goto mark_incompressible;
 1000
 1001	/*
 1002	 * Zero the tail end of the last folio, as we might be sending it down
 1003	 * to disk.
 1004	 */
 1005	loff = (total_compressed & (min_folio_size - 1));
 1006	if (loff)
 1007		folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff);
 1008
 1009	/*
 1010	 * Try to create an inline extent.
 1011	 *
 1012	 * If we didn't compress the entire range, try to create an uncompressed
 1013	 * inline extent, else a compressed one.
 1014	 *
 1015	 * Check cow_file_range() for why we don't even try to create inline
 1016	 * extent for the subpage case.
 1017	 */
 1018	if (total_in < actual_end)
 1019		ret = cow_file_range_inline(inode, NULL, start, end, 0,
 1020					    BTRFS_COMPRESS_NONE, NULL, false);
 1021	else
 1022		ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
 1023					    compress_type, folios[0], false);
 1024	if (ret <= 0) {
 1025		if (ret < 0)
 1026			mapping_set_error(mapping, -EIO);
 1027		goto free_pages;
 1028	}
 1029
 1030	/*
 1031	 * We aren't doing an inline extent. Round the compressed size up to a
 1032	 * block size boundary so the allocator does sane things.
 1033	 */
 1034	total_compressed = ALIGN(total_compressed, blocksize);
 1035
 1036	/*
 1037	 * One last check to make sure the compression is really a win, compare
 1038	 * the page count read with the blocks on disk, compression must free at
 1039	 * least one sector.
 1040	 */
 1041	total_in = round_up(total_in, fs_info->sectorsize);
 1042	if (total_compressed + blocksize > total_in)
 1043		goto mark_incompressible;
 1044
 1045	/*
 1046	 * The async work queues will take care of doing actual allocation on
 1047	 * disk for these compressed pages, and will submit the bios.
 1048	 */
 1049	ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
 1050			       nr_folios, compress_type);
 1051	BUG_ON(ret);
 1052	if (start + total_in < end) {
 1053		start += total_in;
 1054		cond_resched();
 1055		goto again;
 1056	}
 1057	return;
 1058
 1059mark_incompressible:
 1060	if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
 1061		inode->flags |= BTRFS_INODE_NOCOMPRESS;
 1062cleanup_and_bail_uncompressed:
 1063	ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
 1064			       BTRFS_COMPRESS_NONE);
 1065	BUG_ON(ret);
 1066free_pages:
 1067	if (folios) {
 1068		for (i = 0; i < nr_folios; i++) {
 1069			WARN_ON(folios[i]->mapping);
 1070			btrfs_free_compr_folio(folios[i]);
 1071		}
 1072		kfree(folios);
 1073	}
 1074}
 1075
 1076static void free_async_extent_pages(struct async_extent *async_extent)
 1077{
 1078	int i;
 1079
 1080	if (!async_extent->folios)
 1081		return;
 1082
 1083	for (i = 0; i < async_extent->nr_folios; i++) {
 1084		WARN_ON(async_extent->folios[i]->mapping);
 1085		btrfs_free_compr_folio(async_extent->folios[i]);
 1086	}
 1087	kfree(async_extent->folios);
 1088	async_extent->nr_folios = 0;
 1089	async_extent->folios = NULL;
 1090}
 1091
 1092static void submit_uncompressed_range(struct btrfs_inode *inode,
 1093				      struct async_extent *async_extent,
 1094				      struct folio *locked_folio)
 1095{
 1096	u64 start = async_extent->start;
 1097	u64 end = async_extent->start + async_extent->ram_size - 1;
 1098	int ret;
 1099	struct writeback_control wbc = {
 1100		.sync_mode		= WB_SYNC_ALL,
 1101		.range_start		= start,
 1102		.range_end		= end,
 1103		.no_cgroup_owner	= 1,
 1104	};
 1105
 1106	wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
 1107	ret = run_delalloc_cow(inode, locked_folio, start, end,
 1108			       &wbc, false);
 1109	wbc_detach_inode(&wbc);
 1110	if (ret < 0) {
 1111		if (locked_folio)
 1112			btrfs_folio_end_lock(inode->root->fs_info, locked_folio,
 1113					     start, async_extent->ram_size);
 1114		btrfs_err_rl(inode->root->fs_info,
 1115			"%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
 1116			     __func__, btrfs_root_id(inode->root),
 1117			     btrfs_ino(inode), start, async_extent->ram_size, ret);
 1118	}
 1119}
 1120
 1121static void submit_one_async_extent(struct async_chunk *async_chunk,
 1122				    struct async_extent *async_extent,
 1123				    u64 *alloc_hint)
 1124{
 1125	struct btrfs_inode *inode = async_chunk->inode;
 1126	struct extent_io_tree *io_tree = &inode->io_tree;
 1127	struct btrfs_root *root = inode->root;
 1128	struct btrfs_fs_info *fs_info = root->fs_info;
 1129	struct btrfs_ordered_extent *ordered;
 1130	struct btrfs_file_extent file_extent;
 1131	struct btrfs_key ins;
 1132	struct folio *locked_folio = NULL;
 1133	struct extent_state *cached = NULL;
 1134	struct extent_map *em;
 1135	int ret = 0;
 1136	bool free_pages = false;
 1137	u64 start = async_extent->start;
 1138	u64 end = async_extent->start + async_extent->ram_size - 1;
 1139
 1140	if (async_chunk->blkcg_css)
 1141		kthread_associate_blkcg(async_chunk->blkcg_css);
 1142
 1143	/*
 1144	 * If async_chunk->locked_folio is in the async_extent range, we need to
 1145	 * handle it.
 1146	 */
 1147	if (async_chunk->locked_folio) {
 1148		u64 locked_folio_start = folio_pos(async_chunk->locked_folio);
 1149		u64 locked_folio_end = locked_folio_start +
 1150			folio_size(async_chunk->locked_folio) - 1;
 1151
 1152		if (!(start >= locked_folio_end || end <= locked_folio_start))
 1153			locked_folio = async_chunk->locked_folio;
 1154	}
 1155
 1156	if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
 1157		ASSERT(!async_extent->folios);
 1158		ASSERT(async_extent->nr_folios == 0);
 1159		submit_uncompressed_range(inode, async_extent, locked_folio);
 1160		free_pages = true;
 1161		goto done;
 1162	}
 1163
 1164	ret = btrfs_reserve_extent(root, async_extent->ram_size,
 1165				   async_extent->compressed_size,
 1166				   async_extent->compressed_size,
 1167				   0, *alloc_hint, &ins, true, true);
 1168	if (ret) {
 1169		/*
 1170		 * We can't reserve contiguous space for the compressed size.
 1171		 * Unlikely, but it's possible that we could have enough
 1172		 * non-contiguous space for the uncompressed size instead.  So
 1173		 * fall back to uncompressed.
 1174		 */
 1175		submit_uncompressed_range(inode, async_extent, locked_folio);
 1176		free_pages = true;
 1177		goto done;
 1178	}
 1179
 1180	btrfs_lock_extent(io_tree, start, end, &cached);
 1181
 1182	/* Here we're doing allocation and writeback of the compressed pages */
 1183	file_extent.disk_bytenr = ins.objectid;
 1184	file_extent.disk_num_bytes = ins.offset;
 1185	file_extent.ram_bytes = async_extent->ram_size;
 1186	file_extent.num_bytes = async_extent->ram_size;
 1187	file_extent.offset = 0;
 1188	file_extent.compression = async_extent->compress_type;
 1189
 1190	em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
 1191	if (IS_ERR(em)) {
 1192		ret = PTR_ERR(em);
 1193		goto out_free_reserve;
 1194	}
 1195	btrfs_free_extent_map(em);
 1196
 1197	ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
 1198					     1U << BTRFS_ORDERED_COMPRESSED);
 1199	if (IS_ERR(ordered)) {
 1200		btrfs_drop_extent_map_range(inode, start, end, false);
 1201		ret = PTR_ERR(ordered);
 1202		goto out_free_reserve;
 1203	}
 1204	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 1205
 1206	/* Clear dirty, set writeback and unlock the pages. */
 1207	extent_clear_unlock_delalloc(inode, start, end,
 1208			NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
 1209			PAGE_UNLOCK | PAGE_START_WRITEBACK);
 1210	btrfs_submit_compressed_write(ordered,
 1211			    async_extent->folios,	/* compressed_folios */
 1212			    async_extent->nr_folios,
 1213			    async_chunk->write_flags, true);
 1214	*alloc_hint = ins.objectid + ins.offset;
 1215done:
 1216	if (async_chunk->blkcg_css)
 1217		kthread_associate_blkcg(NULL);
 1218	if (free_pages)
 1219		free_async_extent_pages(async_extent);
 1220	kfree(async_extent);
 1221	return;
 1222
 1223out_free_reserve:
 1224	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 1225	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
 1226	mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
 1227	extent_clear_unlock_delalloc(inode, start, end,
 1228				     NULL, &cached,
 1229				     EXTENT_LOCKED | EXTENT_DELALLOC |
 1230				     EXTENT_DELALLOC_NEW |
 1231				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
 1232				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
 1233				     PAGE_END_WRITEBACK);
 1234	free_async_extent_pages(async_extent);
 1235	if (async_chunk->blkcg_css)
 1236		kthread_associate_blkcg(NULL);
 1237	btrfs_debug(fs_info,
 1238"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
 1239		    btrfs_root_id(root), btrfs_ino(inode), start,
 1240		    async_extent->ram_size, ret);
 1241	kfree(async_extent);
 1242}
 1243
 1244u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
 1245				     u64 num_bytes)
 1246{
 1247	struct extent_map_tree *em_tree = &inode->extent_tree;
 1248	struct extent_map *em;
 1249	u64 alloc_hint = 0;
 1250
 1251	read_lock(&em_tree->lock);
 1252	em = btrfs_search_extent_mapping(em_tree, start, num_bytes);
 1253	if (em) {
 1254		/*
 1255		 * if block start isn't an actual block number then find the
 1256		 * first block in this inode and use that as a hint.  If that
 1257		 * block is also bogus then just don't worry about it.
 1258		 */
 1259		if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
 1260			btrfs_free_extent_map(em);
 1261			em = btrfs_search_extent_mapping(em_tree, 0, 0);
 1262			if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
 1263				alloc_hint = btrfs_extent_map_block_start(em);
 1264			if (em)
 1265				btrfs_free_extent_map(em);
 1266		} else {
 1267			alloc_hint = btrfs_extent_map_block_start(em);
 1268			btrfs_free_extent_map(em);
 1269		}
 1270	}
 1271	read_unlock(&em_tree->lock);
 1272
 1273	return alloc_hint;
 1274}
 1275
 1276/*
 1277 * when extent_io.c finds a delayed allocation range in the file,
 1278 * the call backs end up in this code.  The basic idea is to
 1279 * allocate extents on disk for the range, and create ordered data structs
 1280 * in ram to track those extents.
 1281 *
 1282 * locked_folio is the folio that writepage had locked already.  We use
 1283 * it to make sure we don't do extra locks or unlocks.
 1284 *
 1285 * When this function fails, it unlocks all folios except @locked_folio.
 1286 *
 1287 * When this function successfully creates an inline extent, it returns 1 and
 1288 * unlocks all folios including locked_folio and starts I/O on them.
 1289 * (In reality inline extents are limited to a single block, so locked_folio is
 1290 * the only folio handled anyway).
 1291 *
 1292 * When this function succeed and creates a normal extent, the folio locking
 1293 * status depends on the passed in flags:
 1294 *
 1295 * - If COW_FILE_RANGE_KEEP_LOCKED flag is set, all folios are kept locked.
 1296 * - Else all folios except for @locked_folio are unlocked.
 1297 *
 1298 * When a failure happens in the second or later iteration of the
 1299 * while-loop, the ordered extents created in previous iterations are cleaned up.
 1300 */
 1301static noinline int cow_file_range(struct btrfs_inode *inode,
 1302				   struct folio *locked_folio, u64 start,
 1303				   u64 end, u64 *done_offset,
 1304				   unsigned long flags)
 1305{
 1306	struct btrfs_root *root = inode->root;
 1307	struct btrfs_fs_info *fs_info = root->fs_info;
 1308	struct extent_state *cached = NULL;
 1309	u64 alloc_hint = 0;
 1310	u64 orig_start = start;
 1311	u64 num_bytes;
 1312	u64 cur_alloc_size = 0;
 1313	u64 min_alloc_size;
 1314	u64 blocksize = fs_info->sectorsize;
 1315	struct btrfs_key ins;
 1316	struct extent_map *em;
 1317	unsigned clear_bits;
 1318	unsigned long page_ops;
 1319	int ret = 0;
 1320
 1321	if (unlikely(btrfs_is_shutdown(fs_info))) {
 1322		ret = -EIO;
 1323		goto out_unlock;
 1324	}
 1325
 1326	if (btrfs_is_free_space_inode(inode)) {
 1327		ret = -EINVAL;
 1328		goto out_unlock;
 1329	}
 1330
 1331	num_bytes = ALIGN(end - start + 1, blocksize);
 1332	num_bytes = max(blocksize,  num_bytes);
 1333	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
 1334
 1335	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
 1336
 1337	if (!(flags & COW_FILE_RANGE_NO_INLINE)) {
 1338		/* lets try to make an inline extent */
 1339		ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
 1340					    BTRFS_COMPRESS_NONE, NULL, false);
 1341		if (ret <= 0) {
 1342			/*
 1343			 * We succeeded, return 1 so the caller knows we're done
 1344			 * with this page and already handled the IO.
 1345			 *
 1346			 * If there was an error then cow_file_range_inline() has
 1347			 * already done the cleanup.
 1348			 */
 1349			if (ret == 0)
 1350				ret = 1;
 1351			goto done;
 1352		}
 1353	}
 1354
 1355	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
 1356
 1357	/*
 1358	 * We're not doing compressed IO, don't unlock the first page (which
 1359	 * the caller expects to stay locked), don't clear any dirty bits and
 1360	 * don't set any writeback bits.
 1361	 *
 1362	 * Do set the Ordered (Private2) bit so we know this page was properly
 1363	 * setup for writepage.
 1364	 */
 1365	page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK);
 1366	page_ops |= PAGE_SET_ORDERED;
 1367
 1368	/*
 1369	 * Relocation relies on the relocated extents to have exactly the same
 1370	 * size as the original extents. Normally writeback for relocation data
 1371	 * extents follows a NOCOW path because relocation preallocates the
 1372	 * extents. However, due to an operation such as scrub turning a block
 1373	 * group to RO mode, it may fallback to COW mode, so we must make sure
 1374	 * an extent allocated during COW has exactly the requested size and can
 1375	 * not be split into smaller extents, otherwise relocation breaks and
 1376	 * fails during the stage where it updates the bytenr of file extent
 1377	 * items.
 1378	 */
 1379	if (btrfs_is_data_reloc_root(root))
 1380		min_alloc_size = num_bytes;
 1381	else
 1382		min_alloc_size = fs_info->sectorsize;
 1383
 1384	while (num_bytes > 0) {
 1385		struct btrfs_ordered_extent *ordered;
 1386		struct btrfs_file_extent file_extent;
 1387
 1388		ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
 1389					   min_alloc_size, 0, alloc_hint,
 1390					   &ins, true, true);
 1391		if (ret == -EAGAIN) {
 1392			/*
 1393			 * btrfs_reserve_extent only returns -EAGAIN for zoned
 1394			 * file systems, which is an indication that there are
 1395			 * no active zones to allocate from at the moment.
 1396			 *
 1397			 * If this is the first loop iteration, wait for at
 1398			 * least one zone to finish before retrying the
 1399			 * allocation.  Otherwise ask the caller to write out
 1400			 * the already allocated blocks before coming back to
 1401			 * us, or return -ENOSPC if it can't handle retries.
 1402			 */
 1403			ASSERT(btrfs_is_zoned(fs_info));
 1404			if (start == orig_start) {
 1405				wait_on_bit_io(&inode->root->fs_info->flags,
 1406					       BTRFS_FS_NEED_ZONE_FINISH,
 1407					       TASK_UNINTERRUPTIBLE);
 1408				continue;
 1409			}
 1410			if (done_offset) {
 1411				/*
 1412				 * Move @end to the end of the processed range,
 1413				 * and exit the loop to unlock the processed extents.
 1414				 */
 1415				end = start - 1;
 1416				ret = 0;
 1417				break;
 1418			}
 1419			ret = -ENOSPC;
 1420		}
 1421		if (ret < 0)
 1422			goto out_unlock;
 1423		cur_alloc_size = ins.offset;
 1424
 1425		file_extent.disk_bytenr = ins.objectid;
 1426		file_extent.disk_num_bytes = ins.offset;
 1427		file_extent.num_bytes = ins.offset;
 1428		file_extent.ram_bytes = ins.offset;
 1429		file_extent.offset = 0;
 1430		file_extent.compression = BTRFS_COMPRESS_NONE;
 1431
 1432		/*
 1433		 * Locked range will be released either during error clean up or
 1434		 * after the whole range is finished.
 1435		 */
 1436		btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
 1437				  &cached);
 1438
 1439		em = btrfs_create_io_em(inode, start, &file_extent,
 1440					BTRFS_ORDERED_REGULAR);
 1441		if (IS_ERR(em)) {
 1442			btrfs_unlock_extent(&inode->io_tree, start,
 1443					    start + cur_alloc_size - 1, &cached);
 1444			ret = PTR_ERR(em);
 1445			goto out_reserve;
 1446		}
 1447		btrfs_free_extent_map(em);
 1448
 1449		ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
 1450						     1U << BTRFS_ORDERED_REGULAR);
 1451		if (IS_ERR(ordered)) {
 1452			btrfs_unlock_extent(&inode->io_tree, start,
 1453					    start + cur_alloc_size - 1, &cached);
 1454			ret = PTR_ERR(ordered);
 1455			goto out_drop_extent_cache;
 1456		}
 1457
 1458		if (btrfs_is_data_reloc_root(root)) {
 1459			ret = btrfs_reloc_clone_csums(ordered);
 1460
 1461			/*
 1462			 * Only drop cache here, and process as normal.
 1463			 *
 1464			 * We must not allow extent_clear_unlock_delalloc()
 1465			 * at out_unlock label to free meta of this ordered
 1466			 * extent, as its meta should be freed by
 1467			 * btrfs_finish_ordered_io().
 1468			 *
 1469			 * So we must continue until @start is increased to
 1470			 * skip current ordered extent.
 1471			 */
 1472			if (ret)
 1473				btrfs_drop_extent_map_range(inode, start,
 1474							    start + cur_alloc_size - 1,
 1475							    false);
 1476		}
 1477		btrfs_put_ordered_extent(ordered);
 1478
 1479		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 1480
 1481		if (num_bytes < cur_alloc_size)
 1482			num_bytes = 0;
 1483		else
 1484			num_bytes -= cur_alloc_size;
 1485		alloc_hint = ins.objectid + ins.offset;
 1486		start += cur_alloc_size;
 1487		cur_alloc_size = 0;
 1488
 1489		/*
 1490		 * btrfs_reloc_clone_csums() error, since start is increased
 1491		 * extent_clear_unlock_delalloc() at out_unlock label won't
 1492		 * free metadata of current ordered extent, we're OK to exit.
 1493		 */
 1494		if (ret)
 1495			goto out_unlock;
 1496	}
 1497	extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
 1498				     EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
 1499done:
 1500	if (done_offset)
 1501		*done_offset = end;
 1502	return ret;
 1503
 1504out_drop_extent_cache:
 1505	btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
 1506out_reserve:
 1507	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 1508	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
 1509out_unlock:
 1510	/*
 1511	 * Now, we have three regions to clean up:
 1512	 *
 1513	 * |-------(1)----|---(2)---|-------------(3)----------|
 1514	 * `- orig_start  `- start  `- start + cur_alloc_size  `- end
 1515	 *
 1516	 * We process each region below.
 1517	 */
 1518
 1519	/*
 1520	 * For the range (1). We have already instantiated the ordered extents
 1521	 * for this region, thus we need to cleanup those ordered extents.
 1522	 * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
 1523	 * are also handled by the ordered extents cleanup.
 1524	 *
 1525	 * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and
 1526	 * finish the writeback of the involved folios, which will be never submitted.
 1527	 */
 1528	if (orig_start < start) {
 1529		clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
 1530		page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
 1531
 1532		if (!locked_folio)
 1533			mapping_set_error(inode->vfs_inode.i_mapping, ret);
 1534
 1535		btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start);
 1536		extent_clear_unlock_delalloc(inode, orig_start, start - 1,
 1537					     locked_folio, NULL, clear_bits, page_ops);
 1538	}
 1539
 1540	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
 1541		     EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
 1542	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
 1543
 1544	/*
 1545	 * For the range (2). If we reserved an extent for our delalloc range
 1546	 * (or a subrange) and failed to create the respective ordered extent,
 1547	 * then it means that when we reserved the extent we decremented the
 1548	 * extent's size from the data space_info's bytes_may_use counter and
 1549	 * incremented the space_info's bytes_reserved counter by the same
 1550	 * amount. We must make sure extent_clear_unlock_delalloc() does not try
 1551	 * to decrement again the data space_info's bytes_may_use counter,
 1552	 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
 1553	 */
 1554	if (cur_alloc_size) {
 1555		extent_clear_unlock_delalloc(inode, start,
 1556					     start + cur_alloc_size - 1,
 1557					     locked_folio, &cached, clear_bits,
 1558					     page_ops);
 1559		btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
 1560	}
 1561
 1562	/*
 1563	 * For the range (3). We never touched the region. In addition to the
 1564	 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
 1565	 * space_info's bytes_may_use counter, reserved in
 1566	 * btrfs_check_data_free_space().
 1567	 */
 1568	if (start + cur_alloc_size < end) {
 1569		clear_bits |= EXTENT_CLEAR_DATA_RESV;
 1570		extent_clear_unlock_delalloc(inode, start + cur_alloc_size,
 1571					     end, locked_folio,
 1572					     &cached, clear_bits, page_ops);
 1573		btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
 1574				       end - start - cur_alloc_size + 1, NULL);
 1575	}
 1576	btrfs_err(fs_info,
 1577"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d",
 1578		  __func__, btrfs_root_id(inode->root),
 1579		  btrfs_ino(inode), orig_start, end + 1 - orig_start,
 1580		  start, cur_alloc_size, ret);
 1581	return ret;
 1582}
 1583
 1584/*
 1585 * Phase two of compressed writeback.  This is the ordered portion of the code,
 1586 * which only gets called in the order the work was queued.  We walk all the
 1587 * async extents created by compress_file_range and send them down to the disk.
 1588 *
 1589 * If called with @do_free == true then it'll try to finish the work and free
 1590 * the work struct eventually.
 1591 */
 1592static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
 1593{
 1594	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
 1595						     work);
 1596	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
 1597	struct async_extent *async_extent;
 1598	unsigned long nr_pages;
 1599	u64 alloc_hint = 0;
 1600
 1601	if (do_free) {
 1602		struct async_cow *async_cow;
 1603
 1604		btrfs_add_delayed_iput(async_chunk->inode);
 1605		if (async_chunk->blkcg_css)
 1606			css_put(async_chunk->blkcg_css);
 1607
 1608		async_cow = async_chunk->async_cow;
 1609		if (atomic_dec_and_test(&async_cow->num_chunks))
 1610			kvfree(async_cow);
 1611		return;
 1612	}
 1613
 1614	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
 1615		PAGE_SHIFT;
 1616
 1617	while (!list_empty(&async_chunk->extents)) {
 1618		async_extent = list_first_entry(&async_chunk->extents,
 1619						struct async_extent, list);
 1620		list_del(&async_extent->list);
 1621		submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
 1622	}
 1623
 1624	/* atomic_sub_return implies a barrier */
 1625	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
 1626	    5 * SZ_1M)
 1627		cond_wake_up_nomb(&fs_info->async_submit_wait);
 1628}
 1629
 1630static bool run_delalloc_compressed(struct btrfs_inode *inode,
 1631				    struct folio *locked_folio, u64 start,
 1632				    u64 end, struct writeback_control *wbc)
 1633{
 1634	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 1635	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
 1636	struct async_cow *ctx;
 1637	struct async_chunk *async_chunk;
 1638	unsigned long nr_pages;
 1639	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
 1640	int i;
 1641	unsigned nofs_flag;
 1642	const blk_opf_t write_flags = wbc_to_write_flags(wbc);
 1643
 1644	nofs_flag = memalloc_nofs_save();
 1645	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
 1646	memalloc_nofs_restore(nofs_flag);
 1647	if (!ctx)
 1648		return false;
 1649
 1650	set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
 1651
 1652	async_chunk = ctx->chunks;
 1653	atomic_set(&ctx->num_chunks, num_chunks);
 1654
 1655	for (i = 0; i < num_chunks; i++) {
 1656		u64 cur_end = min(end, start + SZ_512K - 1);
 1657
 1658		/*
 1659		 * igrab is called higher up in the call chain, take only the
 1660		 * lightweight reference for the callback lifetime
 1661		 */
 1662		ihold(&inode->vfs_inode);
 1663		async_chunk[i].async_cow = ctx;
 1664		async_chunk[i].inode = inode;
 1665		async_chunk[i].start = start;
 1666		async_chunk[i].end = cur_end;
 1667		async_chunk[i].write_flags = write_flags;
 1668		INIT_LIST_HEAD(&async_chunk[i].extents);
 1669
 1670		/*
 1671		 * The locked_folio comes all the way from writepage and its
 1672		 * the original folio we were actually given.  As we spread
 1673		 * this large delalloc region across multiple async_chunk
 1674		 * structs, only the first struct needs a pointer to
 1675		 * locked_folio.
 1676		 *
 1677		 * This way we don't need racey decisions about who is supposed
 1678		 * to unlock it.
 1679		 */
 1680		if (locked_folio) {
 1681			/*
 1682			 * Depending on the compressibility, the pages might or
 1683			 * might not go through async.  We want all of them to
 1684			 * be accounted against wbc once.  Let's do it here
 1685			 * before the paths diverge.  wbc accounting is used
 1686			 * only for foreign writeback detection and doesn't
 1687			 * need full accuracy.  Just account the whole thing
 1688			 * against the first page.
 1689			 */
 1690			wbc_account_cgroup_owner(wbc, locked_folio,
 1691						 cur_end - start);
 1692			async_chunk[i].locked_folio = locked_folio;
 1693			locked_folio = NULL;
 1694		} else {
 1695			async_chunk[i].locked_folio = NULL;
 1696		}
 1697
 1698		if (blkcg_css != blkcg_root_css) {
 1699			css_get(blkcg_css);
 1700			async_chunk[i].blkcg_css = blkcg_css;
 1701			async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
 1702		} else {
 1703			async_chunk[i].blkcg_css = NULL;
 1704		}
 1705
 1706		btrfs_init_work(&async_chunk[i].work, compress_file_range,
 1707				submit_compressed_extents);
 1708
 1709		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
 1710		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
 1711
 1712		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
 1713
 1714		start = cur_end + 1;
 1715	}
 1716	return true;
 1717}
 1718
 1719/*
 1720 * Run the delalloc range from start to end, and write back any dirty pages
 1721 * covered by the range.
 1722 */
 1723static noinline int run_delalloc_cow(struct btrfs_inode *inode,
 1724				     struct folio *locked_folio, u64 start,
 1725				     u64 end, struct writeback_control *wbc,
 1726				     bool pages_dirty)
 1727{
 1728	u64 done_offset = end;
 1729	int ret;
 1730
 1731	while (start <= end) {
 1732		ret = cow_file_range(inode, locked_folio, start, end,
 1733				     &done_offset, COW_FILE_RANGE_KEEP_LOCKED);
 1734		if (ret)
 1735			return ret;
 1736		extent_write_locked_range(&inode->vfs_inode, locked_folio,
 1737					  start, done_offset, wbc, pages_dirty);
 1738		start = done_offset + 1;
 1739	}
 1740
 1741	return 1;
 1742}
 1743
 1744static int fallback_to_cow(struct btrfs_inode *inode,
 1745			   struct folio *locked_folio, const u64 start,
 1746			   const u64 end)
 1747{
 1748	const bool is_space_ino = btrfs_is_free_space_inode(inode);
 1749	const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
 1750	const u64 range_bytes = end + 1 - start;
 1751	struct extent_io_tree *io_tree = &inode->io_tree;
 1752	struct extent_state *cached_state = NULL;
 1753	u64 range_start = start;
 1754	u64 count;
 1755	int ret;
 1756
 1757	/*
 1758	 * If EXTENT_NORESERVE is set it means that when the buffered write was
 1759	 * made we had not enough available data space and therefore we did not
 1760	 * reserve data space for it, since we though we could do NOCOW for the
 1761	 * respective file range (either there is prealloc extent or the inode
 1762	 * has the NOCOW bit set).
 1763	 *
 1764	 * However when we need to fallback to COW mode (because for example the
 1765	 * block group for the corresponding extent was turned to RO mode by a
 1766	 * scrub or relocation) we need to do the following:
 1767	 *
 1768	 * 1) We increment the bytes_may_use counter of the data space info.
 1769	 *    If COW succeeds, it allocates a new data extent and after doing
 1770	 *    that it decrements the space info's bytes_may_use counter and
 1771	 *    increments its bytes_reserved counter by the same amount (we do
 1772	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
 1773	 *    bytes_may_use counter to compensate (when space is reserved at
 1774	 *    buffered write time, the bytes_may_use counter is incremented);
 1775	 *
 1776	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
 1777	 *    that if the COW path fails for any reason, it decrements (through
 1778	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
 1779	 *    data space info, which we incremented in the step above.
 1780	 *
 1781	 * If we need to fallback to cow and the inode corresponds to a free
 1782	 * space cache inode or an inode of the data relocation tree, we must
 1783	 * also increment bytes_may_use of the data space_info for the same
 1784	 * reason. Space caches and relocated data extents always get a prealloc
 1785	 * extent for them, however scrub or balance may have set the block
 1786	 * group that contains that extent to RO mode and therefore force COW
 1787	 * when starting writeback.
 1788	 */
 1789	btrfs_lock_extent(io_tree, start, end, &cached_state);
 1790	count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes,
 1791				       EXTENT_NORESERVE, 0, NULL);
 1792	if (count > 0 || is_space_ino || is_reloc_ino) {
 1793		u64 bytes = count;
 1794		struct btrfs_fs_info *fs_info = inode->root->fs_info;
 1795		struct btrfs_space_info *sinfo = fs_info->data_sinfo;
 1796
 1797		if (is_space_ino || is_reloc_ino)
 1798			bytes = range_bytes;
 1799
 1800		spin_lock(&sinfo->lock);
 1801		btrfs_space_info_update_bytes_may_use(sinfo, bytes);
 1802		spin_unlock(&sinfo->lock);
 1803
 1804		if (count > 0)
 1805			btrfs_clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
 1806					       &cached_state);
 1807	}
 1808	btrfs_unlock_extent(io_tree, start, end, &cached_state);
 1809
 1810	/*
 1811	 * Don't try to create inline extents, as a mix of inline extent that
 1812	 * is written out and unlocked directly and a normal NOCOW extent
 1813	 * doesn't work.
 1814	 *
 1815	 * And here we do not unlock the folio after a successful run.
 1816	 * The folios will be unlocked after everything is finished, or by error handling.
 1817	 *
 1818	 * This is to ensure error handling won't need to clear dirty/ordered flags without
 1819	 * a locked folio, which can race with writeback.
 1820	 */
 1821	ret = cow_file_range(inode, locked_folio, start, end, NULL,
 1822			     COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED);
 1823	ASSERT(ret != 1);
 1824	return ret;
 1825}
 1826
/* Arguments and results for can_nocow_file_extent(). */
struct can_nocow_file_extent_args {
	/* Input fields. */

	/* Start file offset of the range we want to NOCOW. */
	u64 start;
	/* End file offset (inclusive) of the range we want to NOCOW. */
	u64 end;
	/*
	 * Set when called from the writeback path (run_delalloc_nocow()).
	 * When set, and the inode is not a free space inode, a non-zero
	 * root->snapshot_force_cow makes can_nocow_file_extent() refuse NOCOW.
	 */
	bool writeback_path;
	/*
	 * Free the path passed to can_nocow_file_extent() once it's not needed
	 * anymore.
	 */
	bool free_path;

	/*
	 * Output fields. Only valid when can_nocow_file_extent() returns 1
	 * (some members may be partially filled in on other return values).
	 * The expected file extent for the NOCOW write.
	 */
	struct btrfs_file_extent file_extent;
};
 1847
/*
 * Check if we can NOCOW the file extent that the path points to.
 * This function may return with the path released, so the caller should check
 * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
 *
 * @path:  Points (nodes[0]/slots[0]) at the file extent item to check. It is
 *         released before the expensive checks, and freed by this function
 *         when args->free_path is set.
 * @key:   Key of that file extent item; key->offset is used as the file
 *         offset where the extent item starts.
 * @inode: The inode the file extent belongs to.
 * @args:  Input range/flags and, on a return of 1, the resulting file extent.
 *
 * Returns: < 0 on error
 *            0 if we can not NOCOW
 *            1 if we can NOCOW
 */
static int can_nocow_file_extent(struct btrfs_path *path,
				 struct btrfs_key *key,
				 struct btrfs_inode *inode,
				 struct can_nocow_file_extent_args *args)
{
	const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_root *csum_root;
	u64 io_start;
	u64 extent_end;
	u8 extent_type;
	int can_nocow = 0;
	int ret = 0;
	/* Read now, the path may already be freed when the csum lookup needs it. */
	bool nowait = path->nowait;

	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	extent_type = btrfs_file_extent_type(leaf, fi);

	/* Inline extents are never written in place. */
	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
		goto out;

	/* Regular extents can only be overwritten on NODATACOW inodes. */
	if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
	    extent_type == BTRFS_FILE_EXTENT_REG)
		goto out;

	/*
	 * If the extent was created before the generation where the last snapshot
	 * for its subvolume was created, then this implies the extent is shared,
	 * hence we must COW.
	 */
	if (btrfs_file_extent_generation(leaf, fi) <=
	    btrfs_root_last_snapshot(&root->root_item))
		goto out;

	/* An explicit hole, must COW. */
	if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
		goto out;

	/* Compressed/encrypted/encoded extents must be COWed. */
	if (btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		goto out;

	extent_end = btrfs_file_extent_end(path);

	/*
	 * Copy everything we need out of the leaf now, it can not be accessed
	 * anymore once the path is released below.
	 */
	args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
	args->file_extent.offset = btrfs_file_extent_offset(leaf, fi);
	args->file_extent.compression = btrfs_file_extent_compression(leaf, fi);

	/*
	 * The following checks can be expensive, as they need to take other
	 * locks and do btree or rbtree searches, so release the path to avoid
	 * blocking other tasks for too long.
	 */
	btrfs_release_path(path);

	/* Any non-zero result (error or a cross reference found) ends the check. */
	ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset,
				    args->file_extent.disk_bytenr, path);
	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
	if (ret != 0)
		goto out;

	if (args->free_path) {
		/*
		 * We don't need the path anymore, plus through the
		 * btrfs_lookup_csums_list() call below we will end up allocating
		 * another path. So free the path to avoid unnecessary extra
		 * memory usage.
		 */
		btrfs_free_path(path);
		path = NULL;
	}

	/* If there are pending snapshots for this root, we must COW. */
	if (args->writeback_path && !is_freespace_inode &&
	    atomic_read(&root->snapshot_force_cow))
		goto out;

	/*
	 * Trim the file extent to the part that overlaps the requested range
	 * [args->start, args->end] and compute the matching disk start.
	 */
	args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start;
	args->file_extent.offset += args->start - key->offset;
	io_start = args->file_extent.disk_bytenr + args->file_extent.offset;

	/*
	 * Force COW if csums exist in the range. This ensures that csums for a
	 * given extent are either valid or do not exist.
	 */

	csum_root = btrfs_csum_root(root->fs_info, io_start);
	ret = btrfs_lookup_csums_list(csum_root, io_start,
				      io_start + args->file_extent.num_bytes - 1,
				      NULL, nowait);
	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
	if (ret != 0)
		goto out;

	can_nocow = 1;
 out:
	/* Free the path here if it was not already freed (and NULLed) above. */
	if (args->free_path && path)
		btrfs_free_path(path);

	/* Positive results from the helpers above mean "must COW", not an error. */
	return ret < 0 ? ret : can_nocow;
}
 1964
 1965static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
 1966			   struct extent_state **cached,
 1967			   struct can_nocow_file_extent_args *nocow_args,
 1968			   u64 file_pos, bool is_prealloc)
 1969{
 1970	struct btrfs_ordered_extent *ordered;
 1971	const u64 len = nocow_args->file_extent.num_bytes;
 1972	const u64 end = file_pos + len - 1;
 1973	int ret = 0;
 1974
 1975	btrfs_lock_extent(&inode->io_tree, file_pos, end, cached);
 1976
 1977	if (is_prealloc) {
 1978		struct extent_map *em;
 1979
 1980		em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
 1981					BTRFS_ORDERED_PREALLOC);
 1982		if (IS_ERR(em)) {
 1983			ret = PTR_ERR(em);
 1984			goto error;
 1985		}
 1986		btrfs_free_extent_map(em);
 1987	}
 1988
 1989	ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
 1990					     is_prealloc
 1991					     ? (1U << BTRFS_ORDERED_PREALLOC)
 1992					     : (1U << BTRFS_ORDERED_NOCOW));
 1993	if (IS_ERR(ordered)) {
 1994		if (is_prealloc)
 1995			btrfs_drop_extent_map_range(inode, file_pos, end, false);
 1996		ret = PTR_ERR(ordered);
 1997		goto error;
 1998	}
 1999
 2000	if (btrfs_is_data_reloc_root(inode->root))
 2001		/*
 2002		 * Errors are handled later, as we must prevent
 2003		 * extent_clear_unlock_delalloc() in error handler from freeing
 2004		 * metadata of the created ordered extent.
 2005		 */
 2006		ret = btrfs_reloc_clone_csums(ordered);
 2007	btrfs_put_ordered_extent(ordered);
 2008
 2009	if (ret < 0)
 2010		goto error;
 2011	extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
 2012				     EXTENT_LOCKED | EXTENT_DELALLOC |
 2013				     EXTENT_CLEAR_DATA_RESV,
 2014				     PAGE_SET_ORDERED);
 2015	return ret;
 2016
 2017error:
 2018	btrfs_cleanup_ordered_extents(inode, file_pos, len);
 2019	extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
 2020				     EXTENT_LOCKED | EXTENT_DELALLOC |
 2021				     EXTENT_CLEAR_DATA_RESV,
 2022				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
 2023				     PAGE_END_WRITEBACK);
 2024	btrfs_err(inode->root->fs_info,
 2025		  "%s failed, root=%lld inode=%llu start=%llu len=%llu: %d",
 2026		  __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
 2027		  file_pos, len, ret);
 2028	return ret;
 2029}
 2030
/*
 * When nocow writeback calls back.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 *
 * The range [@start, @end] (inclusive) is walked file extent item by file
 * extent item. Consecutive parts that can not be written in place are
 * accumulated (tracked by cow_start) and handed to fallback_to_cow(), parts
 * that can are handed to nocow_one_range().
 *
 * Returns 0 on success, in which case all folios of the range have been
 * unlocked. Otherwise returns the error of the failing step, after cleaning up
 * the parts of the range that the failing helper did not clean up itself.
 */
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
				       struct folio *locked_folio,
				       const u64 start, const u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path = NULL;
	/* Start of the pending range that must be COWed, (u64)-1 if there is none. */
	u64 cow_start = (u64)-1;
	/*
	 * If not 0, represents the inclusive end of the last fallback_to_cow()
	 * range. Only for error handling.
	 *
	 * The same for nocow_end, it's to avoid double cleaning up the range
	 * already cleaned by nocow_one_range().
	 */
	u64 cow_end = 0;
	u64 nocow_end = 0;
	u64 cur_offset = start;
	int ret;
	bool check_prev = true;
	u64 ino = btrfs_ino(inode);
	struct can_nocow_file_extent_args nocow_args = { 0 };
	/* The range that has ordered extent(s). */
	u64 oe_cleanup_start;
	u64 oe_cleanup_len = 0;
	/* The range that is untouched. */
	u64 untouched_start;
	u64 untouched_len = 0;

	/*
	 * Normally on a zoned device we're only doing COW writes, but
	 * relocation on a zoned filesystem serializes I/O so that we're only
	 * writing sequentially, and thus it can end up here as well.
	 */
	ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));

	if (unlikely(btrfs_is_shutdown(fs_info))) {
		ret = -EIO;
		goto error;
	}
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	nocow_args.end = end;
	nocow_args.writeback_path = true;

	while (cur_offset <= end) {
		struct btrfs_block_group *nocow_bg = NULL;
		struct btrfs_key found_key;
		struct btrfs_file_extent_item *fi;
		struct extent_buffer *leaf;
		struct extent_state *cached_state = NULL;
		u64 extent_end;
		int extent_type;

		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
					       cur_offset, 0);
		if (ret < 0)
			goto error;

		/*
		 * If there is no extent for our range when doing the initial
		 * search, then go back to the previous slot as it will be the
		 * one containing the search offset
		 */
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = false;
next_slot:
		/* Go to next leaf if we have exhausted the current one */
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto error;
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* Didn't find anything for our INO */
		if (found_key.objectid > ino)
			break;
		/*
		 * Keep searching until we find an EXTENT_ITEM or there are no
		 * more extents for this inode
		 */
		if (WARN_ON_ONCE(found_key.objectid < ino) ||
		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
			path->slots[0]++;
			goto next_slot;
		}

		/* Found key is not EXTENT_DATA_KEY or starts after req range */
		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		/*
		 * If the found extent starts after requested offset, then
		 * adjust cur_offset to be right before this extent begins.
		 * The gap in between has no file extent item, so it becomes
		 * part of the pending COW range.
		 */
		if (found_key.offset > cur_offset) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = found_key.offset;
			goto next_slot;
		}

		/*
		 * Found extent which begins before our range and potentially
		 * intersect it
		 */
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);
		/* If this is triggered then we have a memory corruption. */
		ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
		if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
			ret = -EUCLEAN;
			goto error;
		}
		extent_end = btrfs_file_extent_end(path);

		/*
		 * If the extent we got ends before our current offset, skip to
		 * the next extent.
		 */
		if (extent_end <= cur_offset) {
			path->slots[0]++;
			goto next_slot;
		}

		nocow_args.start = cur_offset;
		ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
		if (ret < 0)
			goto error;
		if (ret == 0)
			goto must_cow;

		ret = 0;
		/*
		 * If no block group is returned here the range is treated like
		 * one that must be COWed.
		 */
		nocow_bg = btrfs_inc_nocow_writers(fs_info,
				nocow_args.file_extent.disk_bytenr +
				nocow_args.file_extent.offset);
		if (!nocow_bg) {
must_cow:
			/*
			 * If we can't perform NOCOW writeback for the range,
			 * then record the beginning of the range that needs to
			 * be COWed.  It will be written out before the next
			 * NOCOW range if we find one, or when exiting this
			 * loop.
			 */
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			/*
			 * can_nocow_file_extent() may have released the path,
			 * in which case we need a new search from cur_offset.
			 */
			if (!path->nodes[0])
				continue;
			path->slots[0]++;
			goto next_slot;
		}

		/*
		 * COW range from cow_start to found_key.offset - 1. As the key
		 * will contain the beginning of the first extent that can be
		 * NOCOW, following one which needs to be COW'ed
		 */
		if (cow_start != (u64)-1) {
			ret = fallback_to_cow(inode, locked_folio, cow_start,
					      found_key.offset - 1);
			if (ret) {
				/* Remember the failed range for case c) below. */
				cow_end = found_key.offset - 1;
				btrfs_dec_nocow_writers(nocow_bg);
				goto error;
			}
			cow_start = (u64)-1;
		}

		ret = nocow_one_range(inode, locked_folio, &cached_state,
				      &nocow_args, cur_offset,
				      extent_type == BTRFS_FILE_EXTENT_PREALLOC);
		btrfs_dec_nocow_writers(nocow_bg);
		if (ret < 0) {
			/* Remember the failed range for case a) below. */
			nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
			goto error;
		}
		cur_offset = extent_end;
	}
	btrfs_release_path(path);

	/* Whatever is left past the last file extent item found must be COWed. */
	if (cur_offset <= end && cow_start == (u64)-1)
		cow_start = cur_offset;

	if (cow_start != (u64)-1) {
		ret = fallback_to_cow(inode, locked_folio, cow_start, end);
		if (ret) {
			cow_end = end;
			goto error;
		}
		cow_start = (u64)-1;
	}

	/*
	 * Everything is finished without an error, can unlock the folios now.
	 *
	 * No need to touch the io tree range nor set folio ordered flag, as
	 * fallback_to_cow() and nocow_one_range() have already handled them.
	 */
	extent_clear_unlock_delalloc(inode, start, end, locked_folio, NULL, 0, PAGE_UNLOCK);

	btrfs_free_path(path);
	return 0;

error:
	/*
	 * Split [start, end] into the part that already has ordered extents,
	 * the part the failing helper cleaned up by itself (skipped here), and
	 * the part that was not touched at all.
	 */
	if (cow_start == (u64)-1) {
		/*
		 * case a)
		 *    start           cur_offset               end
		 *    |   OE cleanup  |       Untouched        |
		 *
		 * We finished a fallback_to_cow() or nocow_one_range() call,
		 * but failed to check the next range.
		 *
		 * or
		 *    start           cur_offset   nocow_end   end
		 *    |   OE cleanup  |   Skip     | Untouched |
		 *
		 * nocow_one_range() failed, the range [cur_offset, nocow_end] is
		 * already cleaned up.
		 */
		oe_cleanup_start = start;
		oe_cleanup_len = cur_offset - start;
		if (nocow_end)
			untouched_start = nocow_end + 1;
		else
			untouched_start = cur_offset;
		untouched_len = end + 1 - untouched_start;
	} else if (cow_start != (u64)-1 && cow_end == 0) {
		/*
		 * case b)
		 *    start        cow_start    cur_offset   end
		 *    | OE cleanup |        Untouched        |
		 *
		 * We got a range that needs COW, but before we hit the next NOCOW range,
		 * thus [cow_start, cur_offset) doesn't yet have any OE.
		 */
		oe_cleanup_start = start;
		oe_cleanup_len = cow_start - start;
		untouched_start = cow_start;
		untouched_len = end + 1 - untouched_start;
	} else {
		/*
		 * case c)
		 *    start        cow_start    cow_end      end
		 *    | OE cleanup |   Skip     |  Untouched |
		 *
		 * fallback_to_cow() failed, and fallback_to_cow() will do the
		 * cleanup for its range, we shouldn't touch the range
		 * [cow_start, cow_end].
		 */
		ASSERT(cow_start != (u64)-1 && cow_end != 0);
		oe_cleanup_start = start;
		oe_cleanup_len = cow_start - start;
		untouched_start = cow_end + 1;
		untouched_len = end + 1 - untouched_start;
	}

	if (oe_cleanup_len) {
		const u64 oe_cleanup_end = oe_cleanup_start + oe_cleanup_len - 1;
		btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len);
		extent_clear_unlock_delalloc(inode, oe_cleanup_start, oe_cleanup_end,
					     locked_folio, NULL,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     PAGE_UNLOCK | PAGE_START_WRITEBACK |
					     PAGE_END_WRITEBACK);
	}

	if (untouched_len) {
		struct extent_state *cached = NULL;
		const u64 untouched_end = untouched_start + untouched_len - 1;

		/*
		 * We need to lock the extent here because we're clearing DELALLOC and
		 * we're not locked at this point.
		 */
		btrfs_lock_extent(&inode->io_tree, untouched_start, untouched_end, &cached);
		extent_clear_unlock_delalloc(inode, untouched_start, untouched_end,
					     locked_folio, &cached,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DEFRAG |
					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
					     PAGE_START_WRITEBACK |
					     PAGE_END_WRITEBACK);
		btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL);
	}
	btrfs_free_path(path);
	btrfs_err(fs_info,
"%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu oe_cleanup=%llu oe_cleanup_len=%llu untouched_start=%llu untouched_len=%llu: %d",
		  __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
		  start, end + 1 - start, cur_offset, oe_cleanup_start, oe_cleanup_len,
		  untouched_start, untouched_len, ret);
	return ret;
}
 2354
 2355static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
 2356{
 2357	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
 2358		if (inode->defrag_bytes &&
 2359		    btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
 2360			return false;
 2361		return true;
 2362	}
 2363	return false;
 2364}
 2365
 2366/*
 2367 * Function to process delayed allocation (create CoW) for ranges which are
 2368 * being touched for the first time.
 2369 */
 2370int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
 2371			     u64 start, u64 end, struct writeback_control *wbc)
 2372{
 2373	const bool zoned = btrfs_is_zoned(inode->root->fs_info);
 2374	int ret;
 2375
 2376	/*
 2377	 * The range must cover part of the @locked_folio, or a return of 1
 2378	 * can confuse the caller.
 2379	 */
 2380	ASSERT(!(end <= folio_pos(locked_folio) ||
 2381		 start >= folio_next_pos(locked_folio)));
 2382
 2383	if (should_nocow(inode, start, end)) {
 2384		ret = run_delalloc_nocow(inode, locked_folio, start, end);
 2385		return ret;
 2386	}
 2387
 2388	if (btrfs_inode_can_compress(inode) &&
 2389	    inode_need_compress(inode, start, end) &&
 2390	    run_delalloc_compressed(inode, locked_folio, start, end, wbc))
 2391		return 1;
 2392
 2393	if (zoned)
 2394		ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
 2395				       true);
 2396	else
 2397		ret = cow_file_range(inode, locked_folio, start, end, NULL, 0);
 2398	return ret;
 2399}
 2400
 2401void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
 2402				 struct extent_state *orig, u64 split)
 2403{
 2404	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2405	u64 size;
 2406
 2407	lockdep_assert_held(&inode->io_tree.lock);
 2408
 2409	/* not delalloc, ignore it */
 2410	if (!(orig->state & EXTENT_DELALLOC))
 2411		return;
 2412
 2413	size = orig->end - orig->start + 1;
 2414	if (size > fs_info->max_extent_size) {
 2415		u32 num_extents;
 2416		u64 new_size;
 2417
 2418		/*
 2419		 * See the explanation in btrfs_merge_delalloc_extent, the same
 2420		 * applies here, just in reverse.
 2421		 */
 2422		new_size = orig->end - split + 1;
 2423		num_extents = count_max_extents(fs_info, new_size);
 2424		new_size = split - orig->start;
 2425		num_extents += count_max_extents(fs_info, new_size);
 2426		if (count_max_extents(fs_info, size) >= num_extents)
 2427			return;
 2428	}
 2429
 2430	spin_lock(&inode->lock);
 2431	btrfs_mod_outstanding_extents(inode, 1);
 2432	spin_unlock(&inode->lock);
 2433}
 2434
 2435/*
 2436 * Handle merged delayed allocation extents so we can keep track of new extents
 2437 * that are just merged onto old extents, such as when we are doing sequential
 2438 * writes, so we can properly account for the metadata space we'll need.
 2439 */
 2440void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
 2441				 struct extent_state *other)
 2442{
 2443	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2444	u64 new_size, old_size;
 2445	u32 num_extents;
 2446
 2447	lockdep_assert_held(&inode->io_tree.lock);
 2448
 2449	/* not delalloc, ignore it */
 2450	if (!(other->state & EXTENT_DELALLOC))
 2451		return;
 2452
 2453	if (new->start > other->start)
 2454		new_size = new->end - other->start + 1;
 2455	else
 2456		new_size = other->end - new->start + 1;
 2457
 2458	/* we're not bigger than the max, unreserve the space and go */
 2459	if (new_size <= fs_info->max_extent_size) {
 2460		spin_lock(&inode->lock);
 2461		btrfs_mod_outstanding_extents(inode, -1);
 2462		spin_unlock(&inode->lock);
 2463		return;
 2464	}
 2465
 2466	/*
 2467	 * We have to add up either side to figure out how many extents were
 2468	 * accounted for before we merged into one big extent.  If the number of
 2469	 * extents we accounted for is <= the amount we need for the new range
 2470	 * then we can return, otherwise drop.  Think of it like this
 2471	 *
 2472	 * [ 4k][MAX_SIZE]
 2473	 *
 2474	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
 2475	 * need 2 outstanding extents, on one side we have 1 and the other side
 2476	 * we have 1 so they are == and we can return.  But in this case
 2477	 *
 2478	 * [MAX_SIZE+4k][MAX_SIZE+4k]
 2479	 *
 2480	 * Each range on their own accounts for 2 extents, but merged together
 2481	 * they are only 3 extents worth of accounting, so we need to drop in
 2482	 * this case.
 2483	 */
 2484	old_size = other->end - other->start + 1;
 2485	num_extents = count_max_extents(fs_info, old_size);
 2486	old_size = new->end - new->start + 1;
 2487	num_extents += count_max_extents(fs_info, old_size);
 2488	if (count_max_extents(fs_info, new_size) >= num_extents)
 2489		return;
 2490
 2491	spin_lock(&inode->lock);
 2492	btrfs_mod_outstanding_extents(inode, -1);
 2493	spin_unlock(&inode->lock);
 2494}
 2495
 2496static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
 2497{
 2498	struct btrfs_root *root = inode->root;
 2499	struct btrfs_fs_info *fs_info = root->fs_info;
 2500
 2501	spin_lock(&root->delalloc_lock);
 2502	ASSERT(list_empty(&inode->delalloc_inodes));
 2503	list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
 2504	root->nr_delalloc_inodes++;
 2505	if (root->nr_delalloc_inodes == 1) {
 2506		spin_lock(&fs_info->delalloc_root_lock);
 2507		ASSERT(list_empty(&root->delalloc_root));
 2508		list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
 2509		spin_unlock(&fs_info->delalloc_root_lock);
 2510	}
 2511	spin_unlock(&root->delalloc_lock);
 2512}
 2513
 2514void btrfs_del_delalloc_inode(struct btrfs_inode *inode)
 2515{
 2516	struct btrfs_root *root = inode->root;
 2517	struct btrfs_fs_info *fs_info = root->fs_info;
 2518
 2519	lockdep_assert_held(&root->delalloc_lock);
 2520
 2521	/*
 2522	 * We may be called after the inode was already deleted from the list,
 2523	 * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
 2524	 * and then later through btrfs_clear_delalloc_extent() while the inode
 2525	 * still has ->delalloc_bytes > 0.
 2526	 */
 2527	if (!list_empty(&inode->delalloc_inodes)) {
 2528		list_del_init(&inode->delalloc_inodes);
 2529		root->nr_delalloc_inodes--;
 2530		if (!root->nr_delalloc_inodes) {
 2531			ASSERT(list_empty(&root->delalloc_inodes));
 2532			spin_lock(&fs_info->delalloc_root_lock);
 2533			ASSERT(!list_empty(&root->delalloc_root));
 2534			list_del_init(&root->delalloc_root);
 2535			spin_unlock(&fs_info->delalloc_root_lock);
 2536		}
 2537	}
 2538}
 2539
 2540/*
 2541 * Properly track delayed allocation bytes in the inode and to maintain the
 2542 * list of inodes that have pending delalloc work to be done.
 2543 */
 2544void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
 2545			       u32 bits)
 2546{
 2547	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2548
 2549	lockdep_assert_held(&inode->io_tree.lock);
 2550
 2551	if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
 2552		WARN_ON(1);
 2553	/*
 2554	 * set_bit and clear bit hooks normally require _irqsave/restore
 2555	 * but in this case, we are only testing for the DELALLOC
 2556	 * bit, which is only set or cleared with irqs on
 2557	 */
 2558	if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 2559		u64 len = state->end + 1 - state->start;
 2560		u64 prev_delalloc_bytes;
 2561		u32 num_extents = count_max_extents(fs_info, len);
 2562
 2563		spin_lock(&inode->lock);
 2564		btrfs_mod_outstanding_extents(inode, num_extents);
 2565		spin_unlock(&inode->lock);
 2566
 2567		/* For sanity tests */
 2568		if (btrfs_is_testing(fs_info))
 2569			return;
 2570
 2571		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
 2572					 fs_info->delalloc_batch);
 2573		spin_lock(&inode->lock);
 2574		prev_delalloc_bytes = inode->delalloc_bytes;
 2575		inode->delalloc_bytes += len;
 2576		if (bits & EXTENT_DEFRAG)
 2577			inode->defrag_bytes += len;
 2578		spin_unlock(&inode->lock);
 2579
 2580		/*
 2581		 * We don't need to be under the protection of the inode's lock,
 2582		 * because we are called while holding the inode's io_tree lock
 2583		 * and are therefore protected against concurrent calls of this
 2584		 * function and btrfs_clear_delalloc_extent().
 2585		 */
 2586		if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
 2587			btrfs_add_delalloc_inode(inode);
 2588	}
 2589
 2590	if (!(state->state & EXTENT_DELALLOC_NEW) &&
 2591	    (bits & EXTENT_DELALLOC_NEW)) {
 2592		spin_lock(&inode->lock);
 2593		inode->new_delalloc_bytes += state->end + 1 - state->start;
 2594		spin_unlock(&inode->lock);
 2595	}
 2596}
 2597
 2598/*
 2599 * Once a range is no longer delalloc this function ensures that proper
 2600 * accounting happens.
 2601 */
 2602void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
 2603				 struct extent_state *state, u32 bits)
 2604{
 2605	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2606	u64 len = state->end + 1 - state->start;
 2607	u32 num_extents = count_max_extents(fs_info, len);
 2608
 2609	lockdep_assert_held(&inode->io_tree.lock);
 2610
 2611	if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
 2612		spin_lock(&inode->lock);
 2613		inode->defrag_bytes -= len;
 2614		spin_unlock(&inode->lock);
 2615	}
 2616
 2617	/*
 2618	 * set_bit and clear bit hooks normally require _irqsave/restore
 2619	 * but in this case, we are only testing for the DELALLOC
 2620	 * bit, which is only set or cleared with irqs on
 2621	 */
 2622	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 2623		struct btrfs_root *root = inode->root;
 2624		u64 new_delalloc_bytes;
 2625
 2626		spin_lock(&inode->lock);
 2627		btrfs_mod_outstanding_extents(inode, -num_extents);
 2628		spin_unlock(&inode->lock);
 2629
 2630		/*
 2631		 * We don't reserve metadata space for space cache inodes so we
 2632		 * don't need to call delalloc_release_metadata if there is an
 2633		 * error.
 2634		 */
 2635		if (bits & EXTENT_CLEAR_META_RESV &&
 2636		    root != fs_info->tree_root)
 2637			btrfs_delalloc_release_metadata(inode, len, true);
 2638
 2639		/* For sanity tests. */
 2640		if (btrfs_is_testing(fs_info))
 2641			return;
 2642
 2643		if (!btrfs_is_data_reloc_root(root) &&
 2644		    !btrfs_is_free_space_inode(inode) &&
 2645		    !(state->state & EXTENT_NORESERVE) &&
 2646		    (bits & EXTENT_CLEAR_DATA_RESV))
 2647			btrfs_free_reserved_data_space_noquota(inode, len);
 2648
 2649		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
 2650					 fs_info->delalloc_batch);
 2651		spin_lock(&inode->lock);
 2652		inode->delalloc_bytes -= len;
 2653		new_delalloc_bytes = inode->delalloc_bytes;
 2654		spin_unlock(&inode->lock);
 2655
 2656		/*
 2657		 * We don't need to be under the protection of the inode's lock,
 2658		 * because we are called while holding the inode's io_tree lock
 2659		 * and are therefore protected against concurrent calls of this
 2660		 * function and btrfs_set_delalloc_extent().
 2661		 */
 2662		if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) {
 2663			spin_lock(&root->delalloc_lock);
 2664			btrfs_del_delalloc_inode(inode);
 2665			spin_unlock(&root->delalloc_lock);
 2666		}
 2667	}
 2668
 2669	if ((state->state & EXTENT_DELALLOC_NEW) &&
 2670	    (bits & EXTENT_DELALLOC_NEW)) {
 2671		spin_lock(&inode->lock);
 2672		ASSERT(inode->new_delalloc_bytes >= len);
 2673		inode->new_delalloc_bytes -= len;
 2674		if (bits & EXTENT_ADD_INODE_BYTES)
 2675			inode_add_bytes(&inode->vfs_inode, len);
 2676		spin_unlock(&inode->lock);
 2677	}
 2678}
 2679
 2680/*
 2681 * given a list of ordered sums record them in the inode.  This happens
 2682 * at IO completion time based on sums calculated at bio submission time.
 2683 */
 2684static int add_pending_csums(struct btrfs_trans_handle *trans,
 2685			     struct list_head *list)
 2686{
 2687	struct btrfs_ordered_sum *sum;
 2688	struct btrfs_root *csum_root = NULL;
 2689	int ret;
 2690
 2691	list_for_each_entry(sum, list, list) {
 2692		trans->adding_csums = true;
 2693		if (!csum_root)
 2694			csum_root = btrfs_csum_root(trans->fs_info,
 2695						    sum->logical);
 2696		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
 2697		trans->adding_csums = false;
 2698		if (ret)
 2699			return ret;
 2700	}
 2701	return 0;
 2702}
 2703
 2704static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
 2705					 const u64 start,
 2706					 const u64 len,
 2707					 struct extent_state **cached_state)
 2708{
 2709	u64 search_start = start;
 2710	const u64 end = start + len - 1;
 2711
 2712	while (search_start < end) {
 2713		const u64 search_len = end - search_start + 1;
 2714		struct extent_map *em;
 2715		u64 em_len;
 2716		int ret = 0;
 2717
 2718		em = btrfs_get_extent(inode, NULL, search_start, search_len);
 2719		if (IS_ERR(em))
 2720			return PTR_ERR(em);
 2721
 2722		if (em->disk_bytenr != EXTENT_MAP_HOLE)
 2723			goto next;
 2724
 2725		em_len = em->len;
 2726		if (em->start < search_start)
 2727			em_len -= search_start - em->start;
 2728		if (em_len > search_len)
 2729			em_len = search_len;
 2730
 2731		ret = btrfs_set_extent_bit(&inode->io_tree, search_start,
 2732					   search_start + em_len - 1,
 2733					   EXTENT_DELALLOC_NEW, cached_state);
 2734next:
 2735		search_start = btrfs_extent_map_end(em);
 2736		btrfs_free_extent_map(em);
 2737		if (ret)
 2738			return ret;
 2739	}
 2740	return 0;
 2741}
 2742
 2743int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 2744			      unsigned int extra_bits,
 2745			      struct extent_state **cached_state)
 2746{
 2747	WARN_ON(PAGE_ALIGNED(end));
 2748
 2749	if (start >= i_size_read(&inode->vfs_inode) &&
 2750	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
 2751		/*
 2752		 * There can't be any extents following eof in this case so just
 2753		 * set the delalloc new bit for the range directly.
 2754		 */
 2755		extra_bits |= EXTENT_DELALLOC_NEW;
 2756	} else {
 2757		int ret;
 2758
 2759		ret = btrfs_find_new_delalloc_bytes(inode, start,
 2760						    end + 1 - start,
 2761						    cached_state);
 2762		if (ret)
 2763			return ret;
 2764	}
 2765
 2766	return btrfs_set_extent_bit(&inode->io_tree, start, end,
 2767				    EXTENT_DELALLOC | extra_bits, cached_state);
 2768}
 2769
 2770/* see btrfs_writepage_start_hook for details on why this is required */
 2771struct btrfs_writepage_fixup {
 2772	struct folio *folio;
 2773	struct btrfs_inode *inode;
 2774	struct btrfs_work work;
 2775};
 2776
 2777static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 2778{
 2779	struct btrfs_writepage_fixup *fixup =
 2780		container_of(work, struct btrfs_writepage_fixup, work);
 2781	struct btrfs_ordered_extent *ordered;
 2782	struct extent_state *cached_state = NULL;
 2783	struct extent_changeset *data_reserved = NULL;
 2784	struct folio *folio = fixup->folio;
 2785	struct btrfs_inode *inode = fixup->inode;
 2786	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2787	u64 page_start = folio_pos(folio);
 2788	u64 page_end = folio_next_pos(folio) - 1;
 2789	int ret = 0;
 2790	bool free_delalloc_space = true;
 2791
 2792	/*
 2793	 * This is similar to page_mkwrite, we need to reserve the space before
 2794	 * we take the folio lock.
 2795	 */
 2796	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
 2797					   folio_size(folio));
 2798again:
 2799	folio_lock(folio);
 2800
 2801	/*
 2802	 * Before we queued this fixup, we took a reference on the folio.
 2803	 * folio->mapping may go NULL, but it shouldn't be moved to a different
 2804	 * address space.
 2805	 */
 2806	if (!folio->mapping || !folio_test_dirty(folio) ||
 2807	    !folio_test_checked(folio)) {
 2808		/*
 2809		 * Unfortunately this is a little tricky, either
 2810		 *
 2811		 * 1) We got here and our folio had already been dealt with and
 2812		 *    we reserved our space, thus ret == 0, so we need to just
 2813		 *    drop our space reservation and bail.  This can happen the
 2814		 *    first time we come into the fixup worker, or could happen
 2815		 *    while waiting for the ordered extent.
 2816		 * 2) Our folio was already dealt with, but we happened to get an
 2817		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
 2818		 *    this case we obviously don't have anything to release, but
 2819		 *    because the folio was already dealt with we don't want to
 2820		 *    mark the folio with an error, so make sure we're resetting
 2821		 *    ret to 0.  This is why we have this check _before_ the ret
 2822		 *    check, because we do not want to have a surprise ENOSPC
 2823		 *    when the folio was already properly dealt with.
 2824		 */
 2825		if (!ret) {
 2826			btrfs_delalloc_release_extents(inode, folio_size(folio));
 2827			btrfs_delalloc_release_space(inode, data_reserved,
 2828						     page_start, folio_size(folio),
 2829						     true);
 2830		}
 2831		ret = 0;
 2832		goto out_page;
 2833	}
 2834
 2835	/*
 2836	 * We can't mess with the folio state unless it is locked, so now that
 2837	 * it is locked bail if we failed to make our space reservation.
 2838	 */
 2839	if (ret)
 2840		goto out_page;
 2841
 2842	btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
 2843
 2844	/* already ordered? We're done */
 2845	if (folio_test_ordered(folio))
 2846		goto out_reserved;
 2847
 2848	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
 2849	if (ordered) {
 2850		btrfs_unlock_extent(&inode->io_tree, page_start, page_end,
 2851				    &cached_state);
 2852		folio_unlock(folio);
 2853		btrfs_start_ordered_extent(ordered);
 2854		btrfs_put_ordered_extent(ordered);
 2855		goto again;
 2856	}
 2857
 2858	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
 2859					&cached_state);
 2860	if (ret)
 2861		goto out_reserved;
 2862
 2863	/*
 2864	 * Everything went as planned, we're now the owner of a dirty page with
 2865	 * delayed allocation bits set and space reserved for our COW
 2866	 * destination.
 2867	 *
 2868	 * The page was dirty when we started, nothing should have cleaned it.
 2869	 */
 2870	BUG_ON(!folio_test_dirty(folio));
 2871	free_delalloc_space = false;
 2872out_reserved:
 2873	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
 2874	if (free_delalloc_space)
 2875		btrfs_delalloc_release_space(inode, data_reserved, page_start,
 2876					     PAGE_SIZE, true);
 2877	btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
 2878out_page:
 2879	if (ret) {
 2880		/*
 2881		 * We hit ENOSPC or other errors.  Update the mapping and page
 2882		 * to reflect the errors and clean the page.
 2883		 */
 2884		mapping_set_error(folio->mapping, ret);
 2885		btrfs_mark_ordered_io_finished(inode, folio, page_start,
 2886					       folio_size(folio), !ret);
 2887		folio_clear_dirty_for_io(folio);
 2888	}
 2889	btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
 2890	folio_unlock(folio);
 2891	folio_put(folio);
 2892	kfree(fixup);
 2893	extent_changeset_free(data_reserved);
 2894	/*
 2895	 * As a precaution, do a delayed iput in case it would be the last iput
 2896	 * that could need flushing space. Recursing back to fixup worker would
 2897	 * deadlock.
 2898	 */
 2899	btrfs_add_delayed_iput(inode);
 2900}
 2901
 2902/*
 2903 * There are a few paths in the higher layers of the kernel that directly
 2904 * set the folio dirty bit without asking the filesystem if it is a
 2905 * good idea.  This causes problems because we want to make sure COW
 2906 * properly happens and the data=ordered rules are followed.
 2907 *
 2908 * In our case any range that doesn't have the ORDERED bit set
 2909 * hasn't been properly setup for IO.  We kick off an async process
 2910 * to fix it up.  The async helper will wait for ordered extents, set
 2911 * the delalloc bit and make it safe to write the folio.
 2912 */
 2913int btrfs_writepage_cow_fixup(struct folio *folio)
 2914{
 2915	struct inode *inode = folio->mapping->host;
 2916	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 2917	struct btrfs_writepage_fixup *fixup;
 2918
 2919	/* This folio has ordered extent covering it already */
 2920	if (folio_test_ordered(folio))
 2921		return 0;
 2922
 2923	/*
 2924	 * For experimental build, we error out instead of EAGAIN.
 2925	 *
 2926	 * We should not hit such out-of-band dirty folios anymore.
 2927	 */
 2928	if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
 2929		DEBUG_WARN();
 2930		btrfs_err_rl(fs_info,
 2931	"root %lld ino %llu folio %llu is marked dirty without notifying the fs",
 2932			     btrfs_root_id(BTRFS_I(inode)->root),
 2933			     btrfs_ino(BTRFS_I(inode)),
 2934			     folio_pos(folio));
 2935		return -EUCLEAN;
 2936	}
 2937
 2938	/*
 2939	 * folio_checked is set below when we create a fixup worker for this
 2940	 * folio, don't try to create another one if we're already
 2941	 * folio_test_checked.
 2942	 *
 2943	 * The extent_io writepage code will redirty the foio if we send back
 2944	 * EAGAIN.
 2945	 */
 2946	if (folio_test_checked(folio))
 2947		return -EAGAIN;
 2948
 2949	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
 2950	if (!fixup)
 2951		return -EAGAIN;
 2952
 2953	/*
 2954	 * We are already holding a reference to this inode from
 2955	 * write_cache_pages.  We need to hold it because the space reservation
 2956	 * takes place outside of the folio lock, and we can't trust
 2957	 * folio->mapping outside of the folio lock.
 2958	 */
 2959	ihold(inode);
 2960	btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
 2961	folio_get(folio);
 2962	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
 2963	fixup->folio = folio;
 2964	fixup->inode = BTRFS_I(inode);
 2965	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
 2966
 2967	return -EAGAIN;
 2968}
 2969
 2970static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 2971				       struct btrfs_inode *inode, u64 file_pos,
 2972				       struct btrfs_file_extent_item *stack_fi,
 2973				       const bool update_inode_bytes,
 2974				       u64 qgroup_reserved)
 2975{
 2976	struct btrfs_root *root = inode->root;
 2977	const u64 sectorsize = root->fs_info->sectorsize;
 2978	BTRFS_PATH_AUTO_FREE(path);
 2979	struct extent_buffer *leaf;
 2980	struct btrfs_key ins;
 2981	u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
 2982	u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
 2983	u64 offset = btrfs_stack_file_extent_offset(stack_fi);
 2984	u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
 2985	u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
 2986	struct btrfs_drop_extents_args drop_args = { 0 };
 2987	int ret;
 2988
 2989	path = btrfs_alloc_path();
 2990	if (!path)
 2991		return -ENOMEM;
 2992
 2993	/*
 2994	 * we may be replacing one extent in the tree with another.
 2995	 * The new extent is pinned in the extent map, and we don't want
 2996	 * to drop it from the cache until it is completely in the btree.
 2997	 *
 2998	 * So, tell btrfs_drop_extents to leave this extent in the cache.
 2999	 * the caller is expected to unpin it and allow it to be merged
 3000	 * with the others.
 3001	 */
 3002	drop_args.path = path;
 3003	drop_args.start = file_pos;
 3004	drop_args.end = file_pos + num_bytes;
 3005	drop_args.replace_extent = true;
 3006	drop_args.extent_item_size = sizeof(*stack_fi);
 3007	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
 3008	if (ret)
 3009		goto out;
 3010
 3011	if (!drop_args.extent_inserted) {
 3012		ins.objectid = btrfs_ino(inode);
 3013		ins.type = BTRFS_EXTENT_DATA_KEY;
 3014		ins.offset = file_pos;
 3015
 3016		ret = btrfs_insert_empty_item(trans, root, path, &ins,
 3017					      sizeof(*stack_fi));
 3018		if (ret)
 3019			goto out;
 3020	}
 3021	leaf = path->nodes[0];
 3022	btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
 3023	write_extent_buffer(leaf, stack_fi,
 3024			btrfs_item_ptr_offset(leaf, path->slots[0]),
 3025			sizeof(struct btrfs_file_extent_item));
 3026
 3027	btrfs_release_path(path);
 3028
 3029	/*
 3030	 * If we dropped an inline extent here, we know the range where it is
 3031	 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
 3032	 * number of bytes only for that range containing the inline extent.
 3033	 * The remaining of the range will be processed when clearing the
 3034	 * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
 3035	 */
 3036	if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
 3037		u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
 3038
 3039		inline_size = drop_args.bytes_found - inline_size;
 3040		btrfs_update_inode_bytes(inode, sectorsize, inline_size);
 3041		drop_args.bytes_found -= inline_size;
 3042		num_bytes -= sectorsize;
 3043	}
 3044
 3045	if (update_inode_bytes)
 3046		btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
 3047
 3048	ins.objectid = disk_bytenr;
 3049	ins.type = BTRFS_EXTENT_ITEM_KEY;
 3050	ins.offset = disk_num_bytes;
 3051
 3052	ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
 3053	if (ret)
 3054		goto out;
 3055
 3056	ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
 3057					       file_pos - offset,
 3058					       qgroup_reserved, &ins);
 3059out:
 3060	return ret;
 3061}
 3062
 3063static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
 3064					 u64 start, u64 len)
 3065{
 3066	struct btrfs_block_group *cache;
 3067
 3068	cache = btrfs_lookup_block_group(fs_info, start);
 3069	ASSERT(cache);
 3070
 3071	spin_lock(&cache->lock);
 3072	cache->delalloc_bytes -= len;
 3073	spin_unlock(&cache->lock);
 3074
 3075	btrfs_put_block_group(cache);
 3076}
 3077
 3078static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
 3079					     struct btrfs_ordered_extent *oe)
 3080{
 3081	struct btrfs_file_extent_item stack_fi;
 3082	bool update_inode_bytes;
 3083	u64 num_bytes = oe->num_bytes;
 3084	u64 ram_bytes = oe->ram_bytes;
 3085
 3086	memset(&stack_fi, 0, sizeof(stack_fi));
 3087	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
 3088	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
 3089	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
 3090						   oe->disk_num_bytes);
 3091	btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
 3092	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
 3093		num_bytes = oe->truncated_len;
 3094	btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
 3095	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
 3096	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
 3097	/* Encryption and other encoding is reserved and all 0 */
 3098
 3099	/*
 3100	 * For delalloc, when completing an ordered extent we update the inode's
 3101	 * bytes when clearing the range in the inode's io tree, so pass false
 3102	 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
 3103	 * except if the ordered extent was truncated.
 3104	 */
 3105	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
 3106			     test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
 3107			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
 3108
 3109	return insert_reserved_file_extent(trans, oe->inode,
 3110					   oe->file_offset, &stack_fi,
 3111					   update_inode_bytes, oe->qgroup_rsv);
 3112}
 3113
 3114/*
 3115 * As ordered data IO finishes, this gets called so we can finish
 3116 * an ordered extent if the range of bytes in the file it covers are
 3117 * fully written.
 3118 */
 3119int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
 3120{
 3121	struct btrfs_inode *inode = ordered_extent->inode;
 3122	struct btrfs_root *root = inode->root;
 3123	struct btrfs_fs_info *fs_info = root->fs_info;
 3124	struct btrfs_trans_handle *trans = NULL;
 3125	struct extent_io_tree *io_tree = &inode->io_tree;
 3126	struct extent_state *cached_state = NULL;
 3127	u64 start, end;
 3128	int compress_type = 0;
 3129	int ret = 0;
 3130	u64 logical_len = ordered_extent->num_bytes;
 3131	bool freespace_inode;
 3132	bool truncated = false;
 3133	bool clear_reserved_extent = true;
 3134	unsigned int clear_bits = EXTENT_DEFRAG;
 3135
 3136	start = ordered_extent->file_offset;
 3137	end = start + ordered_extent->num_bytes - 1;
 3138
 3139	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
 3140	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
 3141	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
 3142	    !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
 3143		clear_bits |= EXTENT_DELALLOC_NEW;
 3144
 3145	freespace_inode = btrfs_is_free_space_inode(inode);
 3146	if (!freespace_inode)
 3147		btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
 3148
 3149	if (unlikely(test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags))) {
 3150		ret = -EIO;
 3151		goto out;
 3152	}
 3153
 3154	ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
 3155				      ordered_extent->disk_num_bytes);
 3156	if (ret)
 3157		goto out;
 3158
 3159	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
 3160		truncated = true;
 3161		logical_len = ordered_extent->truncated_len;
 3162		/* Truncated the entire extent, don't bother adding */
 3163		if (!logical_len)
 3164			goto out;
 3165	}
 3166
 3167	/*
 3168	 * If it's a COW write we need to lock the extent range as we will be
 3169	 * inserting/replacing file extent items and unpinning an extent map.
 3170	 * This must be taken before joining a transaction, as it's a higher
 3171	 * level lock (like the inode's VFS lock), otherwise we can run into an
 3172	 * ABBA deadlock with other tasks (transactions work like a lock,
 3173	 * depending on their current state).
 3174	 */
 3175	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
 3176		clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED;
 3177		btrfs_lock_extent_bits(io_tree, start, end,
 3178				       EXTENT_LOCKED | EXTENT_FINISHING_ORDERED,
 3179				       &cached_state);
 3180	}
 3181
 3182	if (freespace_inode)
 3183		trans = btrfs_join_transaction_spacecache(root);
 3184	else
 3185		trans = btrfs_join_transaction(root);
 3186	if (IS_ERR(trans)) {
 3187		ret = PTR_ERR(trans);
 3188		trans = NULL;
 3189		goto out;
 3190	}
 3191
 3192	trans->block_rsv = &inode->block_rsv;
 3193
 3194	ret = btrfs_insert_raid_extent(trans, ordered_extent);
 3195	if (unlikely(ret)) {
 3196		btrfs_abort_transaction(trans, ret);
 3197		goto out;
 3198	}
 3199
 3200	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
 3201		/* Logic error */
 3202		ASSERT(list_empty(&ordered_extent->list));
 3203		if (unlikely(!list_empty(&ordered_extent->list))) {
 3204			ret = -EINVAL;
 3205			btrfs_abort_transaction(trans, ret);
 3206			goto out;
 3207		}
 3208
 3209		btrfs_inode_safe_disk_i_size_write(inode, 0);
 3210		ret = btrfs_update_inode_fallback(trans, inode);
 3211		if (unlikely(ret)) {
 3212			/* -ENOMEM or corruption */
 3213			btrfs_abort_transaction(trans, ret);
 3214		}
 3215		goto out;
 3216	}
 3217
 3218	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
 3219		compress_type = ordered_extent->compress_type;
 3220	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
 3221		BUG_ON(compress_type);
 3222		ret = btrfs_mark_extent_written(trans, inode,
 3223						ordered_extent->file_offset,
 3224						ordered_extent->file_offset +
 3225						logical_len);
 3226		btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
 3227						  ordered_extent->disk_num_bytes);
 3228	} else {
 3229		BUG_ON(root == fs_info->tree_root);
 3230		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
 3231		if (!ret) {
 3232			clear_reserved_extent = false;
 3233			btrfs_release_delalloc_bytes(fs_info,
 3234						ordered_extent->disk_bytenr,
 3235						ordered_extent->disk_num_bytes);
 3236		}
 3237	}
 3238	if (unlikely(ret < 0)) {
 3239		btrfs_abort_transaction(trans, ret);
 3240		goto out;
 3241	}
 3242
 3243	ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset,
 3244				       ordered_extent->num_bytes, trans->transid);
 3245	if (unlikely(ret < 0)) {
 3246		btrfs_abort_transaction(trans, ret);
 3247		goto out;
 3248	}
 3249
 3250	ret = add_pending_csums(trans, &ordered_extent->list);
 3251	if (unlikely(ret)) {
 3252		btrfs_abort_transaction(trans, ret);
 3253		goto out;
 3254	}
 3255
 3256	/*
 3257	 * If this is a new delalloc range, clear its new delalloc flag to
 3258	 * update the inode's number of bytes. This needs to be done first
 3259	 * before updating the inode item.
 3260	 */
 3261	if ((clear_bits & EXTENT_DELALLOC_NEW) &&
 3262	    !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
 3263		btrfs_clear_extent_bit(&inode->io_tree, start, end,
 3264				       EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
 3265				       &cached_state);
 3266
 3267	btrfs_inode_safe_disk_i_size_write(inode, 0);
 3268	ret = btrfs_update_inode_fallback(trans, inode);
 3269	if (unlikely(ret)) { /* -ENOMEM or corruption */
 3270		btrfs_abort_transaction(trans, ret);
 3271		goto out;
 3272	}
 3273out:
 3274	btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits,
 3275			       &cached_state);
 3276
 3277	if (trans)
 3278		btrfs_end_transaction(trans);
 3279
 3280	if (ret || truncated) {
 3281		/*
 3282		 * If we failed to finish this ordered extent for any reason we
 3283		 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
 3284		 * extent, and mark the inode with the error if it wasn't
 3285		 * already set.  Any error during writeback would have already
 3286		 * set the mapping error, so we need to set it if we're the ones
 3287		 * marking this ordered extent as failed.
 3288		 */
 3289		if (ret)
 3290			btrfs_mark_ordered_extent_error(ordered_extent);
 3291
 3292		/*
 3293		 * Drop extent maps for the part of the extent we didn't write.
 3294		 *
 3295		 * We have an exception here for the free_space_inode, this is
 3296		 * because when we do btrfs_get_extent() on the free space inode
 3297		 * we will search the commit root.  If this is a new block group
 3298		 * we won't find anything, and we will trip over the assert in
 3299		 * writepage where we do ASSERT(em->block_start !=
 3300		 * EXTENT_MAP_HOLE).
 3301		 *
 3302		 * Theoretically we could also skip this for any NOCOW extent as
 3303		 * we don't mess with the extent map tree in the NOCOW case, but
 3304		 * for now simply skip this if we are the free space inode.
 3305		 */
 3306		if (!btrfs_is_free_space_inode(inode)) {
 3307			u64 unwritten_start = start;
 3308
 3309			if (truncated)
 3310				unwritten_start += logical_len;
 3311
 3312			btrfs_drop_extent_map_range(inode, unwritten_start,
 3313						    end, false);
 3314		}
 3315
 3316		/*
 3317		 * If the ordered extent had an IOERR or something else went
 3318		 * wrong we need to return the space for this ordered extent
 3319		 * back to the allocator.  We only free the extent in the
 3320		 * truncated case if we didn't write out the extent at all.
 3321		 *
 3322		 * If we made it past insert_reserved_file_extent before we
 3323		 * errored out then we don't need to do this as the accounting
 3324		 * has already been done.
 3325		 */
 3326		if ((ret || !logical_len) &&
 3327		    clear_reserved_extent &&
 3328		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
 3329		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
 3330			/*
 3331			 * Discard the range before returning it back to the
 3332			 * free space pool
 3333			 */
 3334			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
 3335				btrfs_discard_extent(fs_info,
 3336						ordered_extent->disk_bytenr,
 3337						ordered_extent->disk_num_bytes,
 3338						NULL);
 3339			btrfs_free_reserved_extent(fs_info,
 3340					ordered_extent->disk_bytenr,
 3341					ordered_extent->disk_num_bytes, true);
 3342			/*
 3343			 * Actually free the qgroup rsv which was released when
 3344			 * the ordered extent was created.
 3345			 */
 3346			btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
 3347						  ordered_extent->qgroup_rsv,
 3348						  BTRFS_QGROUP_RSV_DATA);
 3349		}
 3350	}
 3351
 3352	/*
 3353	 * This needs to be done to make sure anybody waiting knows we are done
 3354	 * updating everything for this ordered extent.
 3355	 */
 3356	btrfs_remove_ordered_extent(inode, ordered_extent);
 3357
 3358	/* once for us */
 3359	btrfs_put_ordered_extent(ordered_extent);
 3360	/* once for the tree */
 3361	btrfs_put_ordered_extent(ordered_extent);
 3362
 3363	return ret;
 3364}
 3365
 3366int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
 3367{
 3368	if (btrfs_is_zoned(ordered->inode->root->fs_info) &&
 3369	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
 3370	    list_empty(&ordered->bioc_list))
 3371		btrfs_finish_ordered_zoned(ordered);
 3372	return btrfs_finish_one_ordered(ordered);
 3373}
 3374
 3375/*
 3376 * Calculate the checksum of an fs block at physical memory address @paddr,
 3377 * and save the result to @dest.
 3378 *
 3379 * The folio containing @paddr must be large enough to contain a full fs block.
 3380 */
 3381void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info,
 3382				      const phys_addr_t paddr, u8 *dest)
 3383{
 3384	struct folio *folio = page_folio(phys_to_page(paddr));
 3385	const u32 blocksize = fs_info->sectorsize;
 3386	const u32 step = min(blocksize, PAGE_SIZE);
 3387	const u32 nr_steps = blocksize / step;
 3388	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
 3389
 3390	/* The full block must be inside the folio. */
 3391	ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio));
 3392
 3393	for (int i = 0; i < nr_steps; i++) {
 3394		u32 pindex = offset_in_folio(folio, paddr + i * step) >> PAGE_SHIFT;
 3395
 3396		/*
 3397		 * For bs <= ps cases, we will only run the loop once, so the offset
 3398		 * inside the page will only added to paddrs[0].
 3399		 *
 3400		 * For bs > ps cases, the block must be page aligned, thus offset
 3401		 * inside the page will always be 0.
 3402		 */
 3403		paddrs[i] = page_to_phys(folio_page(folio, pindex)) + offset_in_page(paddr);
 3404	}
 3405	return btrfs_calculate_block_csum_pages(fs_info, paddrs, dest);
 3406}
 3407
 3408/*
 3409 * Calculate the checksum of a fs block backed by multiple noncontiguous pages
 3410 * at @paddrs[] and save the result to @dest.
 3411 *
 3412 * The folio containing @paddr must be large enough to contain a full fs block.
 3413 */
 3414void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info,
 3415				      const phys_addr_t paddrs[], u8 *dest)
 3416{
 3417	const u32 blocksize = fs_info->sectorsize;
 3418	const u32 step = min(blocksize, PAGE_SIZE);
 3419	const u32 nr_steps = blocksize / step;
 3420	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 3421
 3422	shash->tfm = fs_info->csum_shash;
 3423	crypto_shash_init(shash);
 3424	for (int i = 0; i < nr_steps; i++) {
 3425		const phys_addr_t paddr = paddrs[i];
 3426		void *kaddr;
 3427
 3428		ASSERT(offset_in_page(paddr) + step <= PAGE_SIZE);
 3429		kaddr = kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
 3430		crypto_shash_update(shash, kaddr, step);
 3431		kunmap_local(kaddr);
 3432	}
 3433	crypto_shash_final(shash, dest);
 3434}
 3435
 3436/*
 3437 * Verify the checksum for a single sector without any extra action that depend
 3438 * on the type of I/O.
 3439 *
 3440 * @kaddr must be a properly kmapped address.
 3441 */
 3442int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum,
 3443			   const u8 * const csum_expected)
 3444{
 3445	btrfs_calculate_block_csum_folio(fs_info, paddr, csum);
 3446	if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0))
 3447		return -EIO;
 3448	return 0;
 3449}
 3450
 3451/*
 3452 * Verify the checksum of a single data sector, which can be scattered at
 3453 * different noncontiguous pages.
 3454 *
 3455 * @bbio:	btrfs_io_bio which contains the csum
 3456 * @dev:	device the sector is on
 3457 * @bio_offset:	offset to the beginning of the bio (in bytes)
 3458 * @paddrs:	physical addresses which back the fs block
 3459 *
 3460 * Check if the checksum on a data block is valid.  When a checksum mismatch is
 3461 * detected, report the error and fill the corrupted range with zero.
 3462 *
 3463 * Return %true if the sector is ok or had no checksum to start with, else %false.
 3464 */
 3465bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
 3466			u32 bio_offset, const phys_addr_t paddrs[])
 3467{
 3468	struct btrfs_inode *inode = bbio->inode;
 3469	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 3470	const u32 blocksize = fs_info->sectorsize;
 3471	const u32 step = min(blocksize, PAGE_SIZE);
 3472	const u32 nr_steps = blocksize / step;
 3473	u64 file_offset = bbio->file_offset + bio_offset;
 3474	u64 end = file_offset + blocksize - 1;
 3475	u8 *csum_expected;
 3476	u8 csum[BTRFS_CSUM_SIZE];
 3477
 3478	if (!bbio->csum)
 3479		return true;
 3480
 3481	if (btrfs_is_data_reloc_root(inode->root) &&
 3482	    btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
 3483				 NULL)) {
 3484		/* Skip the range without csum for data reloc inode */
 3485		btrfs_clear_extent_bit(&inode->io_tree, file_offset, end,
 3486				       EXTENT_NODATASUM, NULL);
 3487		return true;
 3488	}
 3489
 3490	csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
 3491				fs_info->csum_size;
 3492	btrfs_calculate_block_csum_pages(fs_info, paddrs, csum);
 3493	if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0))
 3494		goto zeroit;
 3495	return true;
 3496
 3497zeroit:
 3498	btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
 3499				    bbio->mirror_num);
 3500	if (dev)
 3501		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
 3502	for (int i = 0; i < nr_steps; i++)
 3503		memzero_page(phys_to_page(paddrs[i]), offset_in_page(paddrs[i]), step);
 3504	return false;
 3505}
 3506
 3507/*
 3508 * Perform a delayed iput on @inode.
 3509 *
 3510 * @inode: The inode we want to perform iput on
 3511 *
 3512 * This function uses the generic vfs_inode::i_count to track whether we should
 3513 * just decrement it (in case it's > 1) or if this is the last iput then link
 3514 * the inode to the delayed iput machinery. Delayed iputs are processed at
 3515 * transaction commit time/superblock commit/cleaner kthread.
 3516 */
 3517void btrfs_add_delayed_iput(struct btrfs_inode *inode)
 3518{
 3519	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 3520	unsigned long flags;
 3521
 3522	if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
 3523		return;
 3524
 3525	WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state));
 3526	atomic_inc(&fs_info->nr_delayed_iputs);
 3527	/*
 3528	 * Need to be irq safe here because we can be called from either an irq
 3529	 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
 3530	 * context.
 3531	 */
 3532	spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
 3533	ASSERT(list_empty(&inode->delayed_iput));
 3534	list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
 3535	spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
 3536	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
 3537		wake_up_process(fs_info->cleaner_kthread);
 3538}
 3539
 3540static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
 3541				    struct btrfs_inode *inode)
 3542{
 3543	list_del_init(&inode->delayed_iput);
 3544	spin_unlock_irq(&fs_info->delayed_iput_lock);
 3545	iput(&inode->vfs_inode);
 3546	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
 3547		wake_up(&fs_info->delayed_iputs_wait);
 3548	spin_lock_irq(&fs_info->delayed_iput_lock);
 3549}
 3550
 3551static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
 3552				   struct btrfs_inode *inode)
 3553{
 3554	if (!list_empty(&inode->delayed_iput)) {
 3555		spin_lock_irq(&fs_info->delayed_iput_lock);
 3556		if (!list_empty(&inode->delayed_iput))
 3557			run_delayed_iput_locked(fs_info, inode);
 3558		spin_unlock_irq(&fs_info->delayed_iput_lock);
 3559	}
 3560}
 3561
 3562void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
 3563{
 3564	/*
 3565	 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
 3566	 * calls btrfs_add_delayed_iput() and that needs to lock
 3567	 * fs_info->delayed_iput_lock. So we need to disable irqs here to
 3568	 * prevent a deadlock.
 3569	 */
 3570	spin_lock_irq(&fs_info->delayed_iput_lock);
 3571	while (!list_empty(&fs_info->delayed_iputs)) {
 3572		struct btrfs_inode *inode;
 3573
 3574		inode = list_first_entry(&fs_info->delayed_iputs,
 3575				struct btrfs_inode, delayed_iput);
 3576		run_delayed_iput_locked(fs_info, inode);
 3577		if (need_resched()) {
 3578			spin_unlock_irq(&fs_info->delayed_iput_lock);
 3579			cond_resched();
 3580			spin_lock_irq(&fs_info->delayed_iput_lock);
 3581		}
 3582	}
 3583	spin_unlock_irq(&fs_info->delayed_iput_lock);
 3584}
 3585
 3586/*
 3587 * Wait for flushing all delayed iputs
 3588 *
 3589 * @fs_info:  the filesystem
 3590 *
 3591 * This will wait on any delayed iputs that are currently running with KILLABLE
 3592 * set.  Once they are all done running we will return, unless we are killed in
 3593 * which case we return EINTR. This helps in user operations like fallocate etc
 3594 * that might get blocked on the iputs.
 3595 *
 3596 * Return EINTR if we were killed, 0 if nothing's pending
 3597 */
 3598int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
 3599{
 3600	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
 3601			atomic_read(&fs_info->nr_delayed_iputs) == 0);
 3602	if (ret)
 3603		return -EINTR;
 3604	return 0;
 3605}
 3606
 3607/*
 3608 * This creates an orphan entry for the given inode in case something goes wrong
 3609 * in the middle of an unlink.
 3610 */
 3611int btrfs_orphan_add(struct btrfs_trans_handle *trans,
 3612		     struct btrfs_inode *inode)
 3613{
 3614	int ret;
 3615
 3616	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
 3617	if (unlikely(ret && ret != -EEXIST)) {
 3618		btrfs_abort_transaction(trans, ret);
 3619		return ret;
 3620	}
 3621
 3622	return 0;
 3623}
 3624
 3625/*
 3626 * We have done the delete so we can go ahead and remove the orphan item for
 3627 * this particular inode.
 3628 */
 3629static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
 3630			    struct btrfs_inode *inode)
 3631{
 3632	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
 3633}
 3634
/*
 * Clean up any orphans that may be left on the list from the last use of this
 * root.
 *
 * @root: the root whose orphan items are processed
 *
 * Only the first call for a given root does any work: if the
 * BTRFS_ROOT_ORPHAN_CLEANUP bit is already set in root->state, 0 is returned
 * right away.
 *
 * Orphan items are searched starting from the highest possible key offset. The
 * inode number is stored in the offset of each orphan item and is used to look
 * up the inode:
 *  - In the tree root, an orphan that corresponds to a dead root (root refs
 *    of 0) is skipped and left in place.
 *  - If the inode does not exist, or it still has links, the orphan item is
 *    deleted (after dropping verity items when the inode exists).
 *  - Otherwise the inode is released with iput(), which is expected to delete
 *    it.
 *
 * Return 0 on success, a non-zero error value otherwise.
 */
int btrfs_orphan_cleanup(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	BTRFS_PATH_AUTO_FREE(path);
	struct extent_buffer *leaf;
	struct btrfs_key key, found_key;
	struct btrfs_trans_handle *trans;
	u64 last_objectid = 0;
	int ret = 0, nr_unlink = 0;

	/* Somebody already did (or is doing) the cleanup for this root. */
	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
		return 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	path->reada = READA_BACK;

	/* Start at the largest possible orphan key. */
	key.objectid = BTRFS_ORPHAN_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		struct btrfs_inode *inode;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;

		/*
		 * if ret == 0 means we found what we were searching for, which
		 * is weird, but possible, so only screw with path if we didn't
		 * find the key and see if we have stuff that matches
		 */
		if (ret > 0) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		/* pull out the item */
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* make sure the item matches what we want */
		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
			break;
		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		/* release the path since we're done with it */
		btrfs_release_path(path);

		/*
		 * this is where we are basically btrfs_lookup, without the
		 * crossing root thing.  we store the inode number in the
		 * offset of the orphan item.
		 */

		if (found_key.offset == last_objectid) {
			/*
			 * We found the same inode as before. This means we were
			 * not able to remove its items via eviction triggered
			 * by an iput(). A transaction abort may have happened,
			 * due to -ENOSPC for example, so try to grab the error
			 * that led to a transaction abort, if any.
			 */
			btrfs_err(fs_info,
				  "Error removing orphan entry, stopping orphan cleanup");
			ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
			goto out;
		}

		last_objectid = found_key.offset;

		/* Turn the orphan key into the key of the inode item it names. */
		found_key.objectid = found_key.offset;
		found_key.type = BTRFS_INODE_ITEM_KEY;
		found_key.offset = 0;
		inode = btrfs_iget(last_objectid, root);
		if (IS_ERR(inode)) {
			ret = PTR_ERR(inode);
			inode = NULL;
			/* A missing inode is handled below, anything else is fatal. */
			if (ret != -ENOENT)
				goto out;
		}

		if (!inode && root == fs_info->tree_root) {
			struct btrfs_root *dead_root;
			int is_dead_root = 0;

			/*
			 * This is an orphan in the tree root. Currently these
			 * could come from 2 sources:
			 *  a) a root (snapshot/subvolume) deletion in progress
			 *  b) a free space cache inode
			 * We need to distinguish those two, as the orphan item
			 * for a root must not get deleted before the deletion
			 * of the snapshot/subvolume's tree completes.
			 *
			 * btrfs_find_orphan_roots() ran before us, which has
			 * found all deleted roots and loaded them into
			 * fs_info->fs_roots_radix. So here we can find if an
			 * orphan item corresponds to a deleted root by looking
			 * up the root from that radix tree.
			 */

			spin_lock(&fs_info->fs_roots_radix_lock);
			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
							 (unsigned long)found_key.objectid);
			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
				is_dead_root = 1;
			spin_unlock(&fs_info->fs_roots_radix_lock);

			if (is_dead_root) {
				/* prevent this orphan from being found again */
				key.offset = found_key.objectid - 1;
				continue;
			}

		}

		/*
		 * If we have an inode with links, there are a couple of
		 * possibilities:
		 *
		 * 1. We were halfway through creating fsverity metadata for the
		 * file. In that case, the orphan item represents incomplete
		 * fsverity metadata which must be cleaned up with
		 * btrfs_drop_verity_items and deleting the orphan item.
		 *
		 * 2. Old kernels (before v3.12) used to create an
		 * orphan item for truncate indicating that there were possibly
		 * extent items past i_size that needed to be deleted. In v3.12,
		 * truncate was changed to update i_size in sync with the extent
		 * items, but the (useless) orphan item was still created. Since
		 * v4.18, we don't create the orphan item for truncate at all.
		 *
		 * So, this item could mean that we need to do a truncate, but
		 * only if this filesystem was last used on a pre-v3.12 kernel
		 * and was not cleanly unmounted. The odds of that are quite
		 * slim, and it's a pain to do the truncate now, so just delete
		 * the orphan item.
		 *
		 * It's also possible that this orphan item was supposed to be
		 * deleted but wasn't. The inode number may have been reused,
		 * but either way, we can delete the orphan item.
		 */
		if (!inode || inode->vfs_inode.i_nlink) {
			if (inode) {
				ret = btrfs_drop_verity_items(inode);
				iput(&inode->vfs_inode);
				inode = NULL;
				if (ret)
					goto out;
			}
			trans = btrfs_start_transaction(root, 1);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				goto out;
			}
			btrfs_debug(fs_info, "auto deleting %Lu",
				    found_key.objectid);
			ret = btrfs_del_orphan_item(trans, root,
						    found_key.objectid);
			btrfs_end_transaction(trans);
			if (ret)
				goto out;
			continue;
		}

		nr_unlink++;

		/* this will do delete_inode and everything for us */
		iput(&inode->vfs_inode);
	}
	/* release the path since we're done with it */
	btrfs_release_path(path);

	/*
	 * Join and immediately end a transaction when an orphan item was
	 * inserted for this root; a failure to join is ignored here.
	 */
	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
		trans = btrfs_join_transaction(root);
		if (!IS_ERR(trans))
			btrfs_end_transaction(trans);
	}

	if (nr_unlink)
		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);

out:
	if (ret)
		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
	return ret;
}
 3834
 3835/*
 3836 * Look ahead in the leaf for xattrs. If we don't find any then we know there
 3837 * can't be any ACLs.
 3838 *
 3839 * @leaf:       the eb leaf where to search
 3840 * @slot:       the slot the inode is in
 3841 * @objectid:   the objectid of the inode
 3842 *
 3843 * Return true if there is xattr/ACL, false otherwise.
 3844 */
 3845static noinline bool acls_after_inode_item(struct extent_buffer *leaf,
 3846					   int slot, u64 objectid,
 3847					   int *first_xattr_slot)
 3848{
 3849	u32 nritems = btrfs_header_nritems(leaf);
 3850	struct btrfs_key found_key;
 3851	static u64 xattr_access = 0;
 3852	static u64 xattr_default = 0;
 3853	int scanned = 0;
 3854
 3855	if (!xattr_access) {
 3856		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
 3857					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
 3858		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
 3859					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
 3860	}
 3861
 3862	slot++;
 3863	*first_xattr_slot = -1;
 3864	while (slot < nritems) {
 3865		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 3866
 3867		/* We found a different objectid, there must be no ACLs. */
 3868		if (found_key.objectid != objectid)
 3869			return false;
 3870
 3871		/* We found an xattr, assume we've got an ACL. */
 3872		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
 3873			if (*first_xattr_slot == -1)
 3874				*first_xattr_slot = slot;
 3875			if (found_key.offset == xattr_access ||
 3876			    found_key.offset == xattr_default)
 3877				return true;
 3878		}
 3879
 3880		/*
 3881		 * We found a key greater than an xattr key, there can't be any
 3882		 * ACLs later on.
 3883		 */
 3884		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
 3885			return false;
 3886
 3887		slot++;
 3888		scanned++;
 3889
 3890		/*
 3891		 * The item order goes like:
 3892		 * - inode
 3893		 * - inode backrefs
 3894		 * - xattrs
 3895		 * - extents,
 3896		 *
 3897		 * so if there are lots of hard links to an inode there can be
 3898		 * a lot of backrefs.  Don't waste time searching too hard,
 3899		 * this is just an optimization.
 3900		 */
 3901		if (scanned >= 8)
 3902			break;
 3903	}
 3904	/*
 3905	 * We hit the end of the leaf before we found an xattr or something
 3906	 * larger than an xattr.  We have to assume the inode has ACLs.
 3907	 */
 3908	if (*first_xattr_slot == -1)
 3909		*first_xattr_slot = slot;
 3910	return true;
 3911}
 3912
 3913static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
 3914{
 3915	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 3916
 3917	if (WARN_ON_ONCE(inode->file_extent_tree))
 3918		return 0;
 3919	if (btrfs_fs_incompat(fs_info, NO_HOLES))
 3920		return 0;
 3921	if (!S_ISREG(inode->vfs_inode.i_mode))
 3922		return 0;
 3923	if (btrfs_is_free_space_inode(inode))
 3924		return 0;
 3925
 3926	inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
 3927	if (!inode->file_extent_tree)
 3928		return -ENOMEM;
 3929
 3930	btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree,
 3931				  IO_TREE_INODE_FILE_EXTENT);
 3932	/* Lockdep class is set only for the file extent tree. */
 3933	lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
 3934
 3935	return 0;
 3936}
 3937
 3938static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
 3939{
 3940	struct btrfs_root *root = inode->root;
 3941	struct btrfs_inode *existing;
 3942	const u64 ino = btrfs_ino(inode);
 3943	int ret;
 3944
 3945	if (inode_unhashed(&inode->vfs_inode))
 3946		return 0;
 3947
 3948	if (prealloc) {
 3949		ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
 3950		if (ret)
 3951			return ret;
 3952	}
 3953
 3954	existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
 3955
 3956	if (xa_is_err(existing)) {
 3957		ret = xa_err(existing);
 3958		ASSERT(ret != -EINVAL);
 3959		ASSERT(ret != -ENOMEM);
 3960		return ret;
 3961	} else if (existing) {
 3962		WARN_ON(!(inode_state_read_once(&existing->vfs_inode) & (I_WILL_FREE | I_FREEING)));
 3963	}
 3964
 3965	return 0;
 3966}
 3967
/*
 * Read a locked inode from the btree into the in-memory inode and add it to
 * its root list/tree.
 *
 * @inode: the in-memory inode to fill
 * @path:  path used for the btree lookup, must not be NULL; it is released
 *         before returning on the success path
 *
 * On failure clean up the inode (iget_failed() is called on it).
 *
 * Return 0 on success, -ENOENT if the inode item was not found, or another
 * error value on failure.
 */
static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_inode_item *inode_item;
	struct inode *vfs_inode = &inode->vfs_inode;
	struct btrfs_key location;
	unsigned long ptr;
	int maybe_acls;
	u32 rdev;
	int ret;
	bool filled = false;
	int first_xattr_slot;

	/*
	 * If btrfs_fill_inode() succeeds the inode fields (and rdev) are
	 * already set and reading them from the inode item is skipped below.
	 * NOTE(review): presumably this fills from the delayed inode - confirm.
	 */
	ret = btrfs_fill_inode(inode, &rdev);
	if (!ret)
		filled = true;

	ASSERT(path);

	btrfs_get_inode_key(inode, &location);

	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
	if (ret) {
		/*
		 * ret > 0 can come from btrfs_search_slot called by
		 * btrfs_lookup_inode(), this means the inode was not found.
		 */
		if (ret > 0)
			ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];

	if (filled)
		goto cache_index;

	/* Copy the on-disk inode item fields into the in-memory inode. */
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item);
	set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item));
	i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
	i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
	btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));

	inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
			btrfs_timespec_nsec(leaf, &inode_item->atime));

	inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
			btrfs_timespec_nsec(leaf, &inode_item->mtime));

	inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
			btrfs_timespec_nsec(leaf, &inode_item->ctime));

	inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
	inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);

	inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item));
	inode->generation = btrfs_inode_generation(leaf, inode_item);
	inode->last_trans = btrfs_inode_transid(leaf, inode_item);

	inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item));
	vfs_inode->i_generation = inode->generation;
	/* i_rdev is only set later, by init_special_inode() for special files. */
	vfs_inode->i_rdev = 0;
	rdev = btrfs_inode_rdev(leaf, inode_item);

	if (S_ISDIR(vfs_inode->i_mode))
		inode->index_cnt = (u64)-1;

	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
				&inode->flags, &inode->ro_flags);
	btrfs_update_inode_mapping_flags(inode);
	btrfs_set_inode_mapping_order(inode);

cache_index:
	/*
	 * If we were modified in the current generation and evicted from memory
	 * and then re-read we need to do a full sync since we don't have any
	 * idea about which extents were modified before we were evicted from
	 * cache.
	 *
	 * This is required for both inode re-read from disk and delayed inode
	 * in the delayed_nodes xarray.
	 */
	if (inode->last_trans == btrfs_get_fs_generation(fs_info))
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);

	/*
	 * We don't persist the id of the transaction where an unlink operation
	 * against the inode was last made. So here we assume the inode might
	 * have been evicted, and therefore the exact value of last_unlink_trans
	 * lost, and set it to last_trans to avoid metadata inconsistencies
	 * between the inode and its parent if the inode is fsync'ed and the log
	 * replayed. For example, in the scenario:
	 *
	 * touch mydir/foo
	 * ln mydir/foo mydir/bar
	 * sync
	 * unlink mydir/bar
	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
	 * xfs_io -c fsync mydir/foo
	 * <power failure>
	 * mount fs, triggers fsync log replay
	 *
	 * We must make sure that when we fsync our inode foo we also log its
	 * parent inode, otherwise after log replay the parent still has the
	 * dentry with the "bar" name but our inode foo has a link count of 1
	 * and doesn't have an inode ref with the name "bar" anymore.
	 *
	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
	 * but it guarantees correctness at the expense of occasional full
	 * transaction commits on fsync if our inode is a directory, or if our
	 * inode is not a directory, logging its parent unnecessarily.
	 */
	inode->last_unlink_trans = inode->last_trans;

	/*
	 * Same logic as for last_unlink_trans. We don't persist the generation
	 * of the last transaction where this inode was used for a reflink
	 * operation, so after eviction and reloading the inode we must be
	 * pessimistic and assume the last transaction that modified the inode.
	 */
	inode->last_reflink_trans = inode->last_trans;

	/*
	 * Look at the item right after the inode item: for an inode with a
	 * single link, an inode ref/extref found there provides the dir index
	 * to cache.
	 */
	path->slots[0]++;
	if (vfs_inode->i_nlink != 1 ||
	    path->slots[0] >= btrfs_header_nritems(leaf))
		goto cache_acl;

	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
	if (location.objectid != btrfs_ino(inode))
		goto cache_acl;

	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	if (location.type == BTRFS_INODE_REF_KEY) {
		struct btrfs_inode_ref *ref;

		ref = (struct btrfs_inode_ref *)ptr;
		inode->dir_index = btrfs_inode_ref_index(leaf, ref);
	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *extref;

		extref = (struct btrfs_inode_extref *)ptr;
		inode->dir_index = btrfs_inode_extref_index(leaf, extref);
	}
cache_acl:
	/*
	 * try to precache a NULL acl entry for files that don't have
	 * any xattrs or acls
	 */
	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
					   btrfs_ino(inode), &first_xattr_slot);
	if (first_xattr_slot != -1) {
		path->slots[0] = first_xattr_slot;
		ret = btrfs_load_inode_props(inode, path);
		/* A failure to load the props is only logged, not propagated. */
		if (ret)
			btrfs_err(fs_info,
				  "error loading props for ino %llu (root %llu): %d",
				  btrfs_ino(inode), btrfs_root_id(root), ret);
	}

	/*
	 * We don't need the path anymore, so release it to avoid holding a read
	 * lock on a leaf while calling btrfs_init_file_extent_tree(), which can
	 * allocate memory that triggers reclaim (GFP_KERNEL) and cause a locking
	 * dependency.
	 */
	btrfs_release_path(path);

	ret = btrfs_init_file_extent_tree(inode);
	if (ret)
		goto out;
	btrfs_inode_set_file_extent_range(inode, 0,
			  round_up(i_size_read(vfs_inode), fs_info->sectorsize));

	if (!maybe_acls)
		cache_no_acl(vfs_inode);

	/* Install the operation tables matching the file type. */
	switch (vfs_inode->i_mode & S_IFMT) {
	case S_IFREG:
		vfs_inode->i_mapping->a_ops = &btrfs_aops;
		vfs_inode->i_fop = &btrfs_file_operations;
		vfs_inode->i_op = &btrfs_file_inode_operations;
		break;
	case S_IFDIR:
		vfs_inode->i_fop = &btrfs_dir_file_operations;
		vfs_inode->i_op = &btrfs_dir_inode_operations;
		break;
	case S_IFLNK:
		vfs_inode->i_op = &btrfs_symlink_inode_operations;
		inode_nohighmem(vfs_inode);
		vfs_inode->i_mapping->a_ops = &btrfs_aops;
		break;
	default:
		vfs_inode->i_op = &btrfs_special_inode_operations;
		init_special_inode(vfs_inode, vfs_inode->i_mode, rdev);
		break;
	}

	btrfs_sync_inode_flags_to_i_flags(inode);

	ret = btrfs_add_inode_to_root(inode, true);
	if (ret)
		goto out;

	return 0;
out:
	iget_failed(vfs_inode);
	return ret;
}
 4186
 4187/*
 4188 * given a leaf and an inode, copy the inode fields into the leaf
 4189 */
 4190static void fill_inode_item(struct btrfs_trans_handle *trans,
 4191			    struct extent_buffer *leaf,
 4192			    struct btrfs_inode_item *item,
 4193			    struct inode *inode)
 4194{
 4195	u64 flags;
 4196
 4197	btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
 4198	btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
 4199	btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
 4200	btrfs_set_inode_mode(leaf, item, inode->i_mode);
 4201	btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
 4202
 4203	btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
 4204	btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
 4205
 4206	btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
 4207	btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
 4208
 4209	btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
 4210	btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
 4211
 4212	btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
 4213	btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
 4214
 4215	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
 4216	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
 4217	btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
 4218	btrfs_set_inode_transid(leaf, item, trans->transid);
 4219	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
 4220	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
 4221					  BTRFS_I(inode)->ro_flags);
 4222	btrfs_set_inode_flags(leaf, item, flags);
 4223	btrfs_set_inode_block_group(leaf, item, 0);
 4224}
 4225
 4226/*
 4227 * copy everything in the in-memory inode into the btree.
 4228 */
 4229static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
 4230					    struct btrfs_inode *inode)
 4231{
 4232	struct btrfs_inode_item *inode_item;
 4233	BTRFS_PATH_AUTO_FREE(path);
 4234	struct extent_buffer *leaf;
 4235	struct btrfs_key key;
 4236	int ret;
 4237
 4238	path = btrfs_alloc_path();
 4239	if (!path)
 4240		return -ENOMEM;
 4241
 4242	btrfs_get_inode_key(inode, &key);
 4243	ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1);
 4244	if (ret) {
 4245		if (ret > 0)
 4246			ret = -ENOENT;
 4247		return ret;
 4248	}
 4249
 4250	leaf = path->nodes[0];
 4251	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 4252				    struct btrfs_inode_item);
 4253
 4254	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
 4255	btrfs_set_inode_last_trans(trans, inode);
 4256	return 0;
 4257}
 4258
 4259/*
 4260 * copy everything in the in-memory inode into the btree.
 4261 */
 4262int btrfs_update_inode(struct btrfs_trans_handle *trans,
 4263		       struct btrfs_inode *inode)
 4264{
 4265	struct btrfs_root *root = inode->root;
 4266	struct btrfs_fs_info *fs_info = root->fs_info;
 4267	int ret;
 4268
 4269	/*
 4270	 * If the inode is a free space inode, we can deadlock during commit
 4271	 * if we put it into the delayed code.
 4272	 *
 4273	 * The data relocation inode should also be directly updated
 4274	 * without delay
 4275	 */
 4276	if (!btrfs_is_free_space_inode(inode)
 4277	    && !btrfs_is_data_reloc_root(root)
 4278	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
 4279		btrfs_update_root_times(trans, root);
 4280
 4281		ret = btrfs_delayed_update_inode(trans, inode);
 4282		if (!ret)
 4283			btrfs_set_inode_last_trans(trans, inode);
 4284		return ret;
 4285	}
 4286
 4287	return btrfs_update_inode_item(trans, inode);
 4288}
 4289
 4290int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
 4291				struct btrfs_inode *inode)
 4292{
 4293	int ret;
 4294
 4295	ret = btrfs_update_inode(trans, inode);
 4296	if (ret == -ENOSPC)
 4297		return btrfs_update_inode_item(trans, inode);
 4298	return ret;
 4299}
 4300
 4301static void update_time_after_link_or_unlink(struct btrfs_inode *dir)
 4302{
 4303	struct timespec64 now;
 4304
 4305	/*
 4306	 * If we are replaying a log tree, we do not want to update the mtime
 4307	 * and ctime of the parent directory with the current time, since the
 4308	 * log replay procedure is responsible for setting them to their correct
 4309	 * values (the ones it had when the fsync was done).
 4310	 */
 4311	if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags))
 4312		return;
 4313
 4314	now = inode_set_ctime_current(&dir->vfs_inode);
 4315	inode_set_mtime_to_ts(&dir->vfs_inode, now);
 4316}
 4317
/*
 * unlink helper that gets used here in inode.c and in the tree logging
 * recovery code.  It removes a link in a directory with a given name, and
 * also drops the back refs in the inode to the directory
 *
 * @trans:      the transaction handle to use
 * @dir:        the directory holding the name
 * @inode:      the inode the name points to
 * @name:       the name to remove
 * @rename_ctx: optional rename context; when not NULL the dir index of the
 *              removed entry is stored in it and the log tree is not updated
 *
 * The link count of @inode is not changed here.
 *
 * Return 0 on success, -ENOMEM if no path could be allocated, -ENOENT if the
 * dir item was not found, or another error value.
 */
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
				struct btrfs_inode *dir,
				struct btrfs_inode *inode,
				const struct fscrypt_str *name,
				struct btrfs_rename_ctx *rename_ctx)
{
	struct btrfs_root *root = dir->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	int ret = 0;
	struct btrfs_dir_item *di;
	u64 index;
	u64 ino = btrfs_ino(inode);
	u64 dir_ino = btrfs_ino(dir);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Find and delete the dir item for this name. */
	di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
	if (IS_ERR_OR_NULL(di)) {
		btrfs_free_path(path);
		return di ? PTR_ERR(di) : -ENOENT;
	}
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
	/*
	 * Down the call chains below we'll also need to allocate a path, so no
	 * need to hold on to this one for longer than necessary.
	 */
	btrfs_free_path(path);
	if (ret)
		return ret;

	/*
	 * If we don't have dir index, we have to get it by looking up
	 * the inode ref, since we get the inode ref, remove it directly,
	 * it is unnecessary to do delayed deletion.
	 *
	 * But if we have dir index, needn't search inode ref to get it.
	 * Since the inode ref is close to the inode item, it is better
	 * that we delay to delete it, and just do this deletion when
	 * we update the inode item.
	 */
	if (inode->dir_index) {
		ret = btrfs_delayed_delete_inode_ref(inode);
		if (!ret) {
			index = inode->dir_index;
			goto skip_backref;
		}
		/* On failure fall through to the direct inode ref deletion. */
	}

	ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
	if (unlikely(ret)) {
		btrfs_crit(fs_info,
	   "failed to delete reference to %.*s, root %llu inode %llu parent %llu",
			   name->len, name->name, btrfs_root_id(root), ino, dir_ino);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
skip_backref:
	/* Hand the dir index of the removed entry back to the rename code. */
	if (rename_ctx)
		rename_ctx->index = index;

	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	/*
	 * If we are in a rename context, we don't need to update anything in the
	 * log. That will be done later during the rename by btrfs_log_new_name().
	 * Besides that, doing it here would only cause extra unnecessary btree
	 * operations on the log tree, increasing latency for applications.
	 */
	if (!rename_ctx) {
		btrfs_del_inode_ref_in_log(trans, name, inode, dir);
		btrfs_del_dir_entries_in_log(trans, name, dir, index);
	}

	/*
	 * If we have a pending delayed iput we could end up with the final iput
	 * being run in btrfs-cleaner context.  If we have enough of these built
	 * up we can end up burning a lot of time in btrfs-cleaner without any
	 * way to throttle the unlinks.  Since we're currently holding a ref on
	 * the inode we can run the delayed iput here without any issues as the
	 * final iput won't be done until after we drop the ref we're currently
	 * holding.
	 */
	btrfs_run_delayed_iput(fs_info, inode);

	/* The directory size shrinks by twice the length of the removed name. */
	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
	inode_inc_iversion(&inode->vfs_inode);
	inode_set_ctime_current(&inode->vfs_inode);
	inode_inc_iversion(&dir->vfs_inode);
	update_time_after_link_or_unlink(dir);

	return btrfs_update_inode(trans, dir);
}
 4422
 4423int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 4424		       struct btrfs_inode *dir, struct btrfs_inode *inode,
 4425		       const struct fscrypt_str *name)
 4426{
 4427	int ret;
 4428
 4429	ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
 4430	if (!ret) {
 4431		drop_nlink(&inode->vfs_inode);
 4432		ret = btrfs_update_inode(trans, inode);
 4433	}
 4434	return ret;
 4435}
 4436
 4437/*
 4438 * helper to start transaction for unlink and rmdir.
 4439 *
 4440 * unlink and rmdir are special in btrfs, they do not always free space, so
 4441 * if we cannot make our reservations the normal way try and see if there is
 4442 * plenty of slack room in the global reserve to migrate, otherwise we cannot
 4443 * allow the unlink to occur.
 4444 */
 4445static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
 4446{
 4447	struct btrfs_root *root = dir->root;
 4448
 4449	return btrfs_start_transaction_fallback_global_rsv(root,
 4450						   BTRFS_UNLINK_METADATA_UNITS);
 4451}
 4452
 4453static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 4454{
 4455	struct btrfs_trans_handle *trans;
 4456	struct inode *inode = d_inode(dentry);
 4457	int ret;
 4458	struct fscrypt_name fname;
 4459
 4460	ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
 4461	if (ret)
 4462		return ret;
 4463
 4464	/* This needs to handle no-key deletions later on */
 4465
 4466	trans = __unlink_start_trans(BTRFS_I(dir));
 4467	if (IS_ERR(trans)) {
 4468		ret = PTR_ERR(trans);
 4469		goto fscrypt_free;
 4470	}
 4471
 4472	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
 4473				false);
 4474
 4475	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
 4476				 &fname.disk_name);
 4477	if (ret)
 4478		goto end_trans;
 4479
 4480	if (inode->i_nlink == 0) {
 4481		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
 4482		if (ret)
 4483			goto end_trans;
 4484	}
 4485
 4486end_trans:
 4487	btrfs_end_transaction(trans);
 4488	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
 4489fscrypt_free:
 4490	fscrypt_free_filename(&fname);
 4491	return ret;
 4492}
 4493
/*
 * Remove from @dir the directory entry @dentry that refers to a subvolume.
 *
 * The inode behind @dentry must be either a subvolume root directory
 * (inode number BTRFS_FIRST_FREE_OBJECTID) or a placeholder directory
 * (BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); anything else triggers a WARN and
 * -EINVAL.
 *
 * The dir item is deleted, the dir index is found (either by searching the
 * dir index item or as an output of btrfs_del_root_ref()) and queued for
 * delayed deletion, and finally @dir's size, iversion and timestamps are
 * updated.
 *
 * Failures of the path allocation and of the initial dir item lookup are
 * returned without aborting the transaction; every later failure aborts
 * @trans before returning.
 *
 * Returns 0 on success or a negative errno (-ENOENT when the dir item does
 * not exist, -ENOMEM when the path cannot be allocated).
 */
static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
			       struct btrfs_inode *dir, struct dentry *dentry)
{
	struct btrfs_root *root = dir->root;
	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
	/* NOTE(review): presumably releases the path on scope exit - no explicit free below. */
	BTRFS_PATH_AUTO_FREE(path);
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	u64 index;
	int ret;
	u64 objectid;
	u64 dir_ino = btrfs_ino(dir);
	struct fscrypt_name fname;

	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
	if (ret)
		return ret;

	/* This needs to handle no-key deletions later on */

	/* Work out which root id the dir item is expected to point at. */
	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
		objectid = btrfs_root_id(inode->root);
	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
		objectid = inode->ref_root_id;
	} else {
		WARN_ON(1);
		fscrypt_free_filename(&fname);
		return -EINVAL;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
				   &fname.disk_name, -1);
	/* A NULL result is reported as -ENOENT, an error pointer as its errno. */
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	btrfs_dir_item_key_to_cpu(leaf, di, &key);
	/* The entry is expected to be a root item key for @objectid; only warn if not. */
	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	btrfs_release_path(path);

	/*
	 * This is a placeholder inode for a subvolume we didn't have a
	 * reference to at the time of the snapshot creation.  In the meantime
	 * we could have renamed the real subvol link into our snapshot, so
	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
	 * Instead simply lookup the dir_index_item for this entry so we can
	 * remove it.  Otherwise we know we have a ref to the root and we can
	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
	 */
	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
		di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
		if (IS_ERR(di)) {
			ret = PTR_ERR(di);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		/* The dir index is the offset of the key the search stopped at. */
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		index = key.offset;
		btrfs_release_path(path);
	} else {
		/* Here @index is filled in by btrfs_del_root_ref(). */
		ret = btrfs_del_root_ref(trans, objectid,
					 btrfs_root_id(root), dir_ino,
					 &index, &fname.disk_name);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	/* The directory size shrinks by twice the on-disk name length. */
	btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
	inode_inc_iversion(&dir->vfs_inode);
	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
	ret = btrfs_update_inode_fallback(trans, dir);
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	fscrypt_free_filename(&fname);
	return ret;
}
 4595
 4596/*
 4597 * Helper to check if the subvolume references other subvolumes or if it's
 4598 * default.
 4599 */
 4600static noinline int may_destroy_subvol(struct btrfs_root *root)
 4601{
 4602	struct btrfs_fs_info *fs_info = root->fs_info;
 4603	BTRFS_PATH_AUTO_FREE(path);
 4604	struct btrfs_dir_item *di;
 4605	struct btrfs_key key;
 4606	struct fscrypt_str name = FSTR_INIT("default", 7);
 4607	u64 dir_id;
 4608	int ret;
 4609
 4610	path = btrfs_alloc_path();
 4611	if (!path)
 4612		return -ENOMEM;
 4613
 4614	/* Make sure this root isn't set as the default subvol */
 4615	dir_id = btrfs_super_root_dir(fs_info->super_copy);
 4616	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
 4617				   dir_id, &name, 0);
 4618	if (di && !IS_ERR(di)) {
 4619		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
 4620		if (key.objectid == btrfs_root_id(root)) {
 4621			ret = -EPERM;
 4622			btrfs_err(fs_info,
 4623				  "deleting default subvolume %llu is not allowed",
 4624				  key.objectid);
 4625			return ret;
 4626		}
 4627		btrfs_release_path(path);
 4628	}
 4629
 4630	key.objectid = btrfs_root_id(root);
 4631	key.type = BTRFS_ROOT_REF_KEY;
 4632	key.offset = (u64)-1;
 4633
 4634	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
 4635	if (ret < 0)
 4636		return ret;
 4637	if (unlikely(ret == 0)) {
 4638		/*
 4639		 * Key with offset -1 found, there would have to exist a root
 4640		 * with such id, but this is out of valid range.
 4641		 */
 4642		return -EUCLEAN;
 4643	}
 4644
 4645	ret = 0;
 4646	if (path->slots[0] > 0) {
 4647		path->slots[0]--;
 4648		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 4649		if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
 4650			ret = -ENOTEMPTY;
 4651	}
 4652
 4653	return ret;
 4654}
 4655
 4656/* Delete all dentries for inodes belonging to the root */
 4657static void btrfs_prune_dentries(struct btrfs_root *root)
 4658{
 4659	struct btrfs_fs_info *fs_info = root->fs_info;
 4660	struct btrfs_inode *inode;
 4661	u64 min_ino = 0;
 4662
 4663	if (!BTRFS_FS_ERROR(fs_info))
 4664		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
 4665
 4666	inode = btrfs_find_first_inode(root, min_ino);
 4667	while (inode) {
 4668		if (icount_read(&inode->vfs_inode) > 1)
 4669			d_prune_aliases(&inode->vfs_inode);
 4670
 4671		min_ino = btrfs_ino(inode) + 1;
 4672		/*
 4673		 * btrfs_drop_inode() will have it removed from the inode
 4674		 * cache when its usage count hits zero.
 4675		 */
 4676		iput(&inode->vfs_inode);
 4677		cond_resched();
 4678		inode = btrfs_find_first_inode(root, min_ino);
 4679	}
 4680}
 4681
 4682int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
 4683{
 4684	struct btrfs_root *root = dir->root;
 4685	struct btrfs_fs_info *fs_info = root->fs_info;
 4686	struct inode *inode = d_inode(dentry);
 4687	struct btrfs_root *dest = BTRFS_I(inode)->root;
 4688	struct btrfs_trans_handle *trans;
 4689	struct btrfs_block_rsv block_rsv;
 4690	u64 root_flags;
 4691	u64 qgroup_reserved = 0;
 4692	int ret;
 4693
 4694	down_write(&fs_info->subvol_sem);
 4695
 4696	/*
 4697	 * Don't allow to delete a subvolume with send in progress. This is
 4698	 * inside the inode lock so the error handling that has to drop the bit
 4699	 * again is not run concurrently.
 4700	 */
 4701	spin_lock(&dest->root_item_lock);
 4702	if (dest->send_in_progress) {
 4703		spin_unlock(&dest->root_item_lock);
 4704		btrfs_warn(fs_info,
 4705			   "attempt to delete subvolume %llu during send",
 4706			   btrfs_root_id(dest));
 4707		ret = -EPERM;
 4708		goto out_up_write;
 4709	}
 4710	if (atomic_read(&dest->nr_swapfiles)) {
 4711		spin_unlock(&dest->root_item_lock);
 4712		btrfs_warn(fs_info,
 4713			   "attempt to delete subvolume %llu with active swapfile",
 4714			   btrfs_root_id(root));
 4715		ret = -EPERM;
 4716		goto out_up_write;
 4717	}
 4718	root_flags = btrfs_root_flags(&dest->root_item);
 4719	btrfs_set_root_flags(&dest->root_item,
 4720			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
 4721	spin_unlock(&dest->root_item_lock);
 4722
 4723	ret = may_destroy_subvol(dest);
 4724	if (ret)
 4725		goto out_undead;
 4726
 4727	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
 4728	/*
 4729	 * One for dir inode,
 4730	 * two for dir entries,
 4731	 * two for root ref/backref.
 4732	 */
 4733	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
 4734	if (ret)
 4735		goto out_undead;
 4736	qgroup_reserved = block_rsv.qgroup_rsv_reserved;
 4737
 4738	trans = btrfs_start_transaction(root, 0);
 4739	if (IS_ERR(trans)) {
 4740		ret = PTR_ERR(trans);
 4741		goto out_release;
 4742	}
 4743	btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
 4744	qgroup_reserved = 0;
 4745	trans->block_rsv = &block_rsv;
 4746	trans->bytes_reserved = block_rsv.size;
 4747
 4748	btrfs_record_snapshot_destroy(trans, dir);
 4749
 4750	ret = btrfs_unlink_subvol(trans, dir, dentry);
 4751	if (unlikely(ret)) {
 4752		btrfs_abort_transaction(trans, ret);
 4753		goto out_end_trans;
 4754	}
 4755
 4756	ret = btrfs_record_root_in_trans(trans, dest);
 4757	if (unlikely(ret)) {
 4758		btrfs_abort_transaction(trans, ret);
 4759		goto out_end_trans;
 4760	}
 4761
 4762	memset(&dest->root_item.drop_progress, 0,
 4763		sizeof(dest->root_item.drop_progress));
 4764	btrfs_set_root_drop_level(&dest->root_item, 0);
 4765	btrfs_set_root_refs(&dest->root_item, 0);
 4766
 4767	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
 4768		ret = btrfs_insert_orphan_item(trans,
 4769					fs_info->tree_root,
 4770					btrfs_root_id(dest));
 4771		if (unlikely(ret)) {
 4772			btrfs_abort_transaction(trans, ret);
 4773			goto out_end_trans;
 4774		}
 4775	}
 4776
 4777	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
 4778				     BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
 4779	if (unlikely(ret && ret != -ENOENT)) {
 4780		btrfs_abort_transaction(trans, ret);
 4781		goto out_end_trans;
 4782	}
 4783	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
 4784		ret = btrfs_uuid_tree_remove(trans,
 4785					  dest->root_item.received_uuid,
 4786					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
 4787					  btrfs_root_id(dest));
 4788		if (unlikely(ret && ret != -ENOENT)) {
 4789			btrfs_abort_transaction(trans, ret);
 4790			goto out_end_trans;
 4791		}
 4792	}
 4793
 4794	free_anon_bdev(dest->anon_dev);
 4795	dest->anon_dev = 0;
 4796out_end_trans:
 4797	trans->block_rsv = NULL;
 4798	trans->bytes_reserved = 0;
 4799	ret = btrfs_end_transaction(trans);
 4800	inode->i_flags |= S_DEAD;
 4801out_release:
 4802	btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
 4803	if (qgroup_reserved)
 4804		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
 4805out_undead:
 4806	if (ret) {
 4807		spin_lock(&dest->root_item_lock);
 4808		root_flags = btrfs_root_flags(&dest->root_item);
 4809		btrfs_set_root_flags(&dest->root_item,
 4810				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
 4811		spin_unlock(&dest->root_item_lock);
 4812	}
 4813out_up_write:
 4814	up_write(&fs_info->subvol_sem);
 4815	if (!ret) {
 4816		d_invalidate(dentry);
 4817		btrfs_prune_dentries(dest);
 4818		ASSERT(dest->send_in_progress == 0);
 4819	}
 4820
 4821	return ret;
 4822}
 4823
 4824static int btrfs_rmdir(struct inode *vfs_dir, struct dentry *dentry)
 4825{
 4826	struct btrfs_inode *dir = BTRFS_I(vfs_dir);
 4827	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
 4828	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 4829	int ret = 0;
 4830	struct btrfs_trans_handle *trans;
 4831	struct fscrypt_name fname;
 4832
 4833	if (inode->vfs_inode.i_size > BTRFS_EMPTY_DIR_SIZE)
 4834		return -ENOTEMPTY;
 4835	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
 4836		if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
 4837			btrfs_err(fs_info,
 4838			"extent tree v2 doesn't support snapshot deletion yet");
 4839			return -EOPNOTSUPP;
 4840		}
 4841		return btrfs_delete_subvolume(dir, dentry);
 4842	}
 4843
 4844	ret = fscrypt_setup_filename(vfs_dir, &dentry->d_name, 1, &fname);
 4845	if (ret)
 4846		return ret;
 4847
 4848	/* This needs to handle no-key deletions later on */
 4849
 4850	trans = __unlink_start_trans(dir);
 4851	if (IS_ERR(trans)) {
 4852		ret = PTR_ERR(trans);
 4853		goto out_notrans;
 4854	}
 4855
 4856	/*
 4857	 * Propagate the last_unlink_trans value of the deleted dir to its
 4858	 * parent directory. This is to prevent an unrecoverable log tree in the
 4859	 * case we do something like this:
 4860	 * 1) create dir foo
 4861	 * 2) create snapshot under dir foo
 4862	 * 3) delete the snapshot
 4863	 * 4) rmdir foo
 4864	 * 5) mkdir foo
 4865	 * 6) fsync foo or some file inside foo
 4866	 *
 4867	 * This is because we can't unlink other roots when replaying the dir
 4868	 * deletes for directory foo.
 4869	 */
 4870	if (inode->last_unlink_trans >= trans->transid)
 4871		btrfs_record_snapshot_destroy(trans, dir);
 4872
 4873	if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
 4874		ret = btrfs_unlink_subvol(trans, dir, dentry);
 4875		goto out;
 4876	}
 4877
 4878	ret = btrfs_orphan_add(trans, inode);
 4879	if (ret)
 4880		goto out;
 4881
 4882	/* now the directory is empty */
 4883	ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name);
 4884	if (!ret)
 4885		btrfs_i_size_write(inode, 0);
 4886out:
 4887	btrfs_end_transaction(trans);
 4888out_notrans:
 4889	btrfs_btree_balance_dirty(fs_info);
 4890	fscrypt_free_filename(&fname);
 4891
 4892	return ret;
 4893}
 4894
 4895static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize)
 4896{
 4897	ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u",
 4898		blockstart, blocksize);
 4899
 4900	if (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1)
 4901		return true;
 4902	return false;
 4903}
 4904
/*
 * Zero the part of the cached folio covering @start that lies at or after
 * @start, up to the end of that folio.
 *
 * Only a folio returned by filemap_lock_folio() is touched; if there is
 * none the function returns 0 without doing anything. Nothing is reserved
 * and the folio is not marked dirty here.
 *
 * Returns 0 or the result of btrfs_read_folio() on the normal path, and
 * -EIO if the folio is still not uptodate after the read.
 */
static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start)
{
	const pgoff_t index = (start >> PAGE_SHIFT);
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	struct folio *folio;
	u64 zero_start;
	u64 zero_end;
	int ret = 0;

again:
	folio = filemap_lock_folio(mapping, index);
	/* No folio present. */
	if (IS_ERR(folio))
		return 0;

	if (!folio_test_uptodate(folio)) {
		ret = btrfs_read_folio(NULL, folio);
		/* Take the folio lock again after the read. */
		folio_lock(folio);
		/* The folio no longer belongs to this mapping: drop it and retry. */
		if (folio->mapping != mapping) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
		if (unlikely(!folio_test_uptodate(folio))) {
			ret = -EIO;
			goto out_unlock;
		}
	}
	folio_wait_writeback(folio);

	/*
	 * We do not need to lock extents nor wait for OE, as it's already
	 * beyond EOF.
	 */

	/* zero_end is exclusive here: the first byte after the folio. */
	zero_start = max_t(u64, folio_pos(folio), start);
	zero_end = folio_next_pos(folio);
	folio_zero_range(folio, zero_start - folio_pos(folio),
			 zero_end - zero_start);

out_unlock:
	folio_unlock(folio);
	folio_put(folio);
	return ret;
}
 4950
/*
 * Handle the truncation of a fs block.
 *
 * @inode  - inode that we're zeroing
 * @offset - the file offset of the block to truncate
 *           The value must be inside [@start, @end], and the function will do
 *           extra checks if the block that covers @offset needs to be zeroed.
 * @start  - the start file offset of the range we want to zero
 * @end    - the end (inclusive) file offset of the range we want to zero.
 *
 * If the range is not block aligned, read out the folio that covers @offset,
 * and if needed zero blocks that are inside the folio and covered by [@start, @end].
 * If @start or @end + 1 lands inside a block, that block will be marked dirty
 * for writeback.
 *
 * This is utilized by hole punch, zero range, file expansion.
 *
 * Returns 0 on success (including when nothing needs to be done) or a
 * negative errno.
 */
int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	/* Set when data space could not be reserved and the nocow path is used. */
	bool only_release_metadata = false;
	u32 blocksize = fs_info->sectorsize;
	pgoff_t index = (offset >> PAGE_SHIFT);
	struct folio *folio;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	int ret = 0;
	/* Whether @offset falls into the block holding @start / @end. */
	const bool in_head_block = is_inside_block(offset, round_down(start, blocksize),
						   blocksize);
	const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize),
						   blocksize);
	bool need_truncate_head = false;
	bool need_truncate_tail = false;
	u64 zero_start;
	u64 zero_end;
	u64 block_start;
	u64 block_end;

	/* @offset should be inside the range. */
	ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu",
	       offset, start, end);

	/* The range is aligned at both ends. */
	if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) {
		/*
		 * For block size < page size case, we may have polluted blocks
		 * beyond EOF. So we also need to zero them out.
		 */
		if (end == (u64)-1 && blocksize < PAGE_SIZE)
			ret = truncate_block_zero_beyond_eof(inode, start);
		goto out;
	}

	/*
	 * @offset may not be inside the head nor tail block. In that case we
	 * don't need to do anything.
	 */
	if (!in_head_block && !in_tail_block)
		goto out;

	/*
	 * Skip the truncation if the range in the target block is already aligned.
	 * The seemingly complex check will also handle the same block case.
	 */
	if (in_head_block && !IS_ALIGNED(start, blocksize))
		need_truncate_head = true;
	if (in_tail_block && !IS_ALIGNED(end + 1, blocksize))
		need_truncate_tail = true;
	if (!need_truncate_head && !need_truncate_tail)
		goto out;

	/* From here on we work on the single block that covers @offset. */
	block_start = round_down(offset, blocksize);
	block_end = block_start + blocksize - 1;

	/*
	 * Reserve data space for the block; if that fails, fall back to the
	 * nocow check and give up only when that does not succeed either.
	 */
	ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
					  blocksize, false);
	if (ret < 0) {
		size_t write_bytes = blocksize;

		if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
			/* For nocow case, no need to reserve data space. */
			ASSERT(write_bytes == blocksize, "write_bytes=%zu blocksize=%u",
			       write_bytes, blocksize);
			only_release_metadata = true;
		} else {
			goto out;
		}
	}
	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
	if (ret < 0) {
		/* Give back the data space reserved above, if any was reserved. */
		if (!only_release_metadata)
			btrfs_free_reserved_data_space(inode, data_reserved,
						       block_start, blocksize);
		goto out;
	}
again:
	folio = __filemap_get_folio(mapping, index,
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
	if (IS_ERR(folio)) {
		/* No folio to unlock here, so release the reservations inline. */
		if (only_release_metadata)
			btrfs_delalloc_release_metadata(inode, blocksize, true);
		else
			btrfs_delalloc_release_space(inode, data_reserved,
						     block_start, blocksize, true);
		btrfs_delalloc_release_extents(inode, blocksize);
		ret = PTR_ERR(folio);
		goto out;
	}

	if (!folio_test_uptodate(folio)) {
		ret = btrfs_read_folio(NULL, folio);
		folio_lock(folio);
		/* The folio no longer belongs to this mapping: drop it and retry. */
		if (folio->mapping != mapping) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
		if (unlikely(!folio_test_uptodate(folio))) {
			ret = -EIO;
			goto out_unlock;
		}
	}

	/*
	 * We unlock the page after the io is completed and then re-lock it
	 * above.  release_folio() could have come in between that and cleared
	 * folio private, but left the page in the mapping.  Set the page mapped
	 * here to make sure it's properly set for the subpage stuff.
	 */
	ret = set_folio_extent_mapped(folio);
	if (ret < 0)
		goto out_unlock;

	folio_wait_writeback(folio);

	btrfs_lock_extent(io_tree, block_start, block_end, &cached_state);

	/*
	 * An ordered extent is present at block_start: drop the extent lock
	 * and the folio, start the ordered extent and retry from the folio
	 * lookup.
	 */
	ordered = btrfs_lookup_ordered_extent(inode, block_start);
	if (ordered) {
		btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
		folio_unlock(folio);
		folio_put(folio);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	/* Clear the existing delalloc-related bits, then mark the block delalloc again. */
	btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end,
			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			       &cached_state);

	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
					&cached_state);
	if (ret) {
		btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
		goto out_unlock;
	}

	if (end == (u64)-1) {
		/*
		 * We're truncating beyond EOF, the remaining blocks normally are
		 * already holes thus no need to zero again, but it's possible for
		 * fs block size < page size cases to have memory mapped writes
		 * to pollute ranges beyond EOF.
		 *
		 * In that case although such polluted blocks beyond EOF will
		 * not reach disk, it still affects our page caches.
		 */
		zero_start = max_t(u64, folio_pos(folio), start);
		zero_end = min_t(u64, folio_next_pos(folio) - 1, end);
	} else {
		zero_start = max_t(u64, block_start, start);
		zero_end = min_t(u64, block_end, end);
	}
	/* zero_end is inclusive, hence the + 1 in the length. */
	folio_zero_range(folio, zero_start - folio_pos(folio),
			 zero_end - zero_start + 1);

	btrfs_folio_clear_checked(fs_info, folio, block_start,
				  block_end + 1 - block_start);
	btrfs_folio_set_dirty(fs_info, folio, block_start,
			      block_end + 1 - block_start);

	/* On the nocow path (no data space reserved) flag the block EXTENT_NORESERVE. */
	if (only_release_metadata)
		btrfs_set_extent_bit(&inode->io_tree, block_start, block_end,
				     EXTENT_NORESERVE, &cached_state);

	btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);

out_unlock:
	/* On failure release what was reserved for the block. */
	if (ret) {
		if (only_release_metadata)
			btrfs_delalloc_release_metadata(inode, blocksize, true);
		else
			btrfs_delalloc_release_space(inode, data_reserved,
					block_start, blocksize, true);
	}
	btrfs_delalloc_release_extents(inode, blocksize);
	folio_unlock(folio);
	folio_put(folio);
out:
	/* Pairs with the successful btrfs_check_nocow_lock() above. */
	if (only_release_metadata)
		btrfs_check_nocow_unlock(inode);
	extent_changeset_free(data_reserved);
	return ret;
}
 5160
 5161static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
 5162{
 5163	struct btrfs_root *root = inode->root;
 5164	struct btrfs_fs_info *fs_info = root->fs_info;
 5165	struct btrfs_trans_handle *trans;
 5166	struct btrfs_drop_extents_args drop_args = { 0 };
 5167	int ret;
 5168
 5169	/*
 5170	 * If NO_HOLES is enabled, we don't need to do anything.
 5171	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
 5172	 * or btrfs_update_inode() will be called, which guarantee that the next
 5173	 * fsync will know this inode was changed and needs to be logged.
 5174	 */
 5175	if (btrfs_fs_incompat(fs_info, NO_HOLES))
 5176		return 0;
 5177
 5178	/*
 5179	 * 1 - for the one we're dropping
 5180	 * 1 - for the one we're adding
 5181	 * 1 - for updating the inode.
 5182	 */
 5183	trans = btrfs_start_transaction(root, 3);
 5184	if (IS_ERR(trans))
 5185		return PTR_ERR(trans);
 5186
 5187	drop_args.start = offset;
 5188	drop_args.end = offset + len;
 5189	drop_args.drop_cache = true;
 5190
 5191	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
 5192	if (unlikely(ret)) {
 5193		btrfs_abort_transaction(trans, ret);
 5194		btrfs_end_transaction(trans);
 5195		return ret;
 5196	}
 5197
 5198	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
 5199	if (ret) {
 5200		btrfs_abort_transaction(trans, ret);
 5201	} else {
 5202		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
 5203		btrfs_update_inode(trans, inode);
 5204	}
 5205	btrfs_end_transaction(trans);
 5206	return ret;
 5207}
 5208
/*
 * This function puts in dummy file extents for the area we're creating a hole
 * for.  So if we are truncating this file to a larger size we need to insert
 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
 * the range between oldsize and size
 *
 * @inode   - inode being expanded
 * @oldsize - the size before the expansion
 * @size    - the size after the expansion
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct extent_map *em = NULL;
	struct extent_state *cached_state = NULL;
	/* Both ends of the hole range are rounded up to the sector size. */
	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
	u64 block_end = ALIGN(size, fs_info->sectorsize);
	u64 last_byte;
	u64 cur_offset;
	u64 hole_size;
	int ret = 0;

	/*
	 * If our size started in the middle of a block we need to zero out the
	 * rest of the block before we expand the i_size, otherwise we could
	 * expose stale data.
	 */
	ret = btrfs_truncate_block(inode, oldsize, oldsize, -1);
	if (ret)
		return ret;

	/* The new size does not reach past the rounded-up old size: no hole to add. */
	if (size <= hole_start)
		return 0;

	btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
					   &cached_state);
	cur_offset = hole_start;
	/* Walk [hole_start, block_end) one extent map at a time. */
	while (1) {
		em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			em = NULL;
			break;
		}
		last_byte = min(btrfs_extent_map_end(em), block_end);
		last_byte = ALIGN(last_byte, fs_info->sectorsize);
		hole_size = last_byte - cur_offset;

		/* Extent maps flagged PREALLOC get no hole item, only the range marking below. */
		if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
			struct extent_map *hole_em;

			ret = maybe_insert_hole(inode, cur_offset, hole_size);
			if (ret)
				break;

			ret = btrfs_inode_set_file_extent_range(inode,
							cur_offset, hole_size);
			if (ret)
				break;

			hole_em = btrfs_alloc_extent_map();
			if (!hole_em) {
				/*
				 * No memory for a replacement map: drop the
				 * cached maps for this range and flag the
				 * inode for a full sync instead of failing.
				 */
				btrfs_drop_extent_map_range(inode, cur_offset,
						    cur_offset + hole_size - 1,
						    false);
				btrfs_set_inode_full_sync(inode);
				goto next;
			}
			hole_em->start = cur_offset;
			hole_em->len = hole_size;

			hole_em->disk_bytenr = EXTENT_MAP_HOLE;
			hole_em->disk_num_bytes = 0;
			hole_em->ram_bytes = hole_size;
			hole_em->generation = btrfs_get_fs_generation(fs_info);

			/*
			 * NOTE(review): this result is not checked before the
			 * loop continues, so it is only returned when this is
			 * the last iteration - confirm that is intended.
			 */
			ret = btrfs_replace_extent_map_range(inode, hole_em, true);
			btrfs_free_extent_map(hole_em);
		} else {
			ret = btrfs_inode_set_file_extent_range(inode,
							cur_offset, hole_size);
			if (ret)
				break;
		}
next:
		btrfs_free_extent_map(em);
		em = NULL;
		cur_offset = last_byte;
		if (cur_offset >= block_end)
			break;
	}
	/* em is non-NULL here only when the loop was left through a break on error. */
	btrfs_free_extent_map(em);
	btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
	return ret;
}
 5302
/*
 * Change the size of @inode to attr->ia_size.
 *
 * Growing the file goes through btrfs_cont_expand() and an inode update in
 * a new transaction, all under the root's snapshot_lock. Any other case
 * (shrinking, or an unchanged size) goes through truncate_setsize() and
 * btrfs_truncate().
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	loff_t oldsize = i_size_read(inode);
	loff_t newsize = attr->ia_size;
	int mask = attr->ia_valid;
	int ret;

	/*
	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
	 * special case where we need to update the times despite not having
	 * these flags set.  For all other operations the VFS set these flags
	 * explicitly if it wants a timestamp update.
	 */
	if (newsize != oldsize) {
		inode_inc_iversion(inode);
		if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
			inode_set_mtime_to_ts(inode,
					      inode_set_ctime_current(inode));
		}
	}

	if (newsize > oldsize) {
		/*
		 * Don't do an expanding truncate while snapshotting is ongoing.
		 * This is to ensure the snapshot captures a fully consistent
		 * state of this file - if the snapshot captures this expanding
		 * truncation, it must capture all writes that happened before
		 * this truncation.
		 */
		btrfs_drew_write_lock(&root->snapshot_lock);
		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
		if (ret) {
			btrfs_drew_write_unlock(&root->snapshot_lock);
			return ret;
		}

		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			btrfs_drew_write_unlock(&root->snapshot_lock);
			return PTR_ERR(trans);
		}

		/* Publish the new size in memory, then update the inode in the transaction. */
		i_size_write(inode, newsize);
		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
		pagecache_isize_extended(inode, oldsize, newsize);
		ret = btrfs_update_inode(trans, BTRFS_I(inode));
		btrfs_drew_write_unlock(&root->snapshot_lock);
		btrfs_end_transaction(trans);
	} else {
		struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);

		/* On zoned filesystems, first wait for ordered extents past the new size. */
		if (btrfs_is_zoned(fs_info)) {
			ret = btrfs_wait_ordered_range(BTRFS_I(inode),
					ALIGN(newsize, fs_info->sectorsize),
					(u64)-1);
			if (ret)
				return ret;
		}

		/*
		 * We're truncating a file that used to have good data down to
		 * zero. Make sure any new writes to the file get on disk
		 * on close.
		 */
		if (newsize == 0)
			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
				&BTRFS_I(inode)->runtime_flags);

		truncate_setsize(inode, newsize);

		inode_dio_wait(inode);

		/* The second argument is true when the size did not actually change. */
		ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
		if (ret && inode->i_nlink) {
			int ret2;

			/*
			 * Truncate failed, so fix up the in-memory size. We
			 * adjusted disk_i_size down as we removed extents, so
			 * wait for disk_i_size to be stable and then update the
			 * in-memory size to match.
			 */
			ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
			/* If the wait fails too, its error replaces the truncate error. */
			if (ret2)
				return ret2;
			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
		}
	}

	return ret;
}
 5396
 5397static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 5398			 struct iattr *attr)
 5399{
 5400	struct inode *inode = d_inode(dentry);
 5401	struct btrfs_root *root = BTRFS_I(inode)->root;
 5402	int ret;
 5403
 5404	if (btrfs_root_readonly(root))
 5405		return -EROFS;
 5406
 5407	ret = setattr_prepare(idmap, dentry, attr);
 5408	if (ret)
 5409		return ret;
 5410
 5411	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
 5412		ret = btrfs_setsize(inode, attr);
 5413		if (ret)
 5414			return ret;
 5415	}
 5416
 5417	if (attr->ia_valid) {
 5418		setattr_copy(idmap, inode, attr);
 5419		inode_inc_iversion(inode);
 5420		ret = btrfs_dirty_inode(BTRFS_I(inode));
 5421
 5422		if (!ret && attr->ia_valid & ATTR_MODE)
 5423			ret = posix_acl_chmod(idmap, dentry, inode->i_mode);
 5424	}
 5425
 5426	return ret;
 5427}
 5428
/*
 * While truncating the inode pages during eviction, we get the VFS
 * calling btrfs_invalidate_folio() against each folio of the inode. This
 * is slow because the calls to btrfs_invalidate_folio() result in a
 * huge amount of calls to lock_extent() and clear_extent_bit(),
 * which keep merging and splitting extent_state structures over and over,
 * wasting lots of time.
 *
 * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
 * skip all those expensive operations on a per folio basis and do only
 * the ordered io finishing, while we release here the extent_map and
 * extent_state structures, without the excessive merging and splitting.
 *
 * The inode must already be marked I_FREEING (asserted below).
 */
static void evict_inode_truncate_pages(struct inode *inode)
{
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct rb_node *node;

	ASSERT(inode_state_read_once(inode) & I_FREEING);
	truncate_inode_pages_final(&inode->i_data);

	/* Drop every extent map of the inode, over the whole file range. */
	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);

	/*
	 * Keep looping until we have no more ranges in the io tree.
	 * We can have ongoing bios started by readahead that have
	 * their endio callback (extent_io.c:end_bio_extent_readpage)
	 * still in progress (they unlocked the pages in the bio but have not
	 * yet unlocked the ranges in the io tree). Therefore this means some
	 * ranges can still be locked and eviction started because before
	 * submitting those bios, which are executed by a separate task (work
	 * queue kthread), inode references (inode->i_count) were not taken
	 * (which would be dropped in the end io callback of each bio).
	 * Therefore here we effectively end up waiting for those bios and
	 * anyone else holding locked ranges without having bumped the inode's
	 * reference count - if we don't do it, when they access the inode's
	 * io_tree to unlock a range it may be too late, leading to a
	 * use-after-free issue.
	 */
	spin_lock(&io_tree->lock);
	while (!RB_EMPTY_ROOT(&io_tree->state)) {
		struct extent_state *state;
		struct extent_state *cached_state = NULL;
		u64 start;
		u64 end;
		unsigned state_flags;

		/*
		 * Snapshot the first state's range and flags while holding the
		 * tree spinlock; the spinlock is dropped before the extent
		 * range itself is locked below.
		 */
		node = rb_first(&io_tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		start = state->start;
		end = state->end;
		state_flags = state->state;
		spin_unlock(&io_tree->lock);

		btrfs_lock_extent(io_tree, start, end, &cached_state);

		/*
		 * If still has DELALLOC flag, the extent didn't reach disk,
		 * and its reserved space won't be freed by delayed_ref.
		 * So we need to free its reserved space here.
		 * (Refer to comment in btrfs_invalidate_folio, case 2)
		 *
		 * Note, end is the bytenr of last byte, so we need + 1 here.
		 */
		if (state_flags & EXTENT_DELALLOC)
			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
					       end - start + 1, NULL);

		btrfs_clear_extent_bit(io_tree, start, end,
				       EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
				       &cached_state);

		/* Give other tasks a chance to run, then re-check the tree. */
		cond_resched();
		spin_lock(&io_tree->lock);
	}
	spin_unlock(&io_tree->lock);
}
 5506
 5507static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
 5508							struct btrfs_block_rsv *rsv)
 5509{
 5510	struct btrfs_fs_info *fs_info = root->fs_info;
 5511	struct btrfs_trans_handle *trans;
 5512	u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
 5513	int ret;
 5514
 5515	/*
 5516	 * Eviction should be taking place at some place safe because of our
 5517	 * delayed iputs.  However the normal flushing code will run delayed
 5518	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
 5519	 *
 5520	 * We reserve the delayed_refs_extra here again because we can't use
 5521	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
 5522	 * above.  We reserve our extra bit here because we generate a ton of
 5523	 * delayed refs activity by truncating.
 5524	 *
 5525	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
 5526	 * if we fail to make this reservation we can re-try without the
 5527	 * delayed_refs_extra so we can make some forward progress.
 5528	 */
 5529	ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
 5530				     BTRFS_RESERVE_FLUSH_EVICT);
 5531	if (ret) {
 5532		ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
 5533					     BTRFS_RESERVE_FLUSH_EVICT);
 5534		if (ret) {
 5535			btrfs_warn(fs_info,
 5536				   "could not allocate space for delete; will truncate on mount");
 5537			return ERR_PTR(-ENOSPC);
 5538		}
 5539		delayed_refs_extra = 0;
 5540	}
 5541
 5542	trans = btrfs_join_transaction(root);
 5543	if (IS_ERR(trans))
 5544		return trans;
 5545
 5546	if (delayed_refs_extra) {
 5547		trans->block_rsv = &fs_info->trans_block_rsv;
 5548		trans->bytes_reserved = delayed_refs_extra;
 5549		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
 5550					delayed_refs_extra, true);
 5551	}
 5552	return trans;
 5553}
 5554
/*
 * Evict @inode from memory.
 *
 * The page cache, extent maps and io tree ranges are always released.  When
 * the inode qualifies for deletion (see the checks below), all of its items
 * are truncated away in a loop of short transactions and its orphan item is
 * removed.  In every case the function finishes by dropping the delayed node,
 * cleaning up fsverity state and calling clear_inode().
 */
void btrfs_evict_inode(struct inode *inode)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv rsv;
	int ret;

	trace_btrfs_inode_evict(inode);

	/* No root attached: there is nothing btrfs specific to tear down. */
	if (!root) {
		fsverity_cleanup_inode(inode);
		clear_inode(inode);
		return;
	}

	fs_info = inode_to_fs_info(inode);
	evict_inode_truncate_pages(inode);

	/*
	 * The inode still has links: skip the deletion path if its root is
	 * still referenced (and is not the root tree), or if this is a free
	 * space inode.
	 */
	if (inode->i_nlink &&
	    ((btrfs_root_refs(&root->root_item) != 0 &&
	      btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
	     btrfs_is_free_space_inode(BTRFS_I(inode))))
		goto out;

	if (is_bad_inode(inode))
		goto out;

	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		goto out;

	if (inode->i_nlink > 0) {
		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
		       btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
		goto out;
	}

	/*
	 * This makes sure the inode item in tree is uptodate and the space for
	 * the inode update is released.
	 */
	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
	if (ret)
		goto out;

	/*
	 * This drops any pending insert or delete operations we have for this
	 * inode.  We could have a delayed dir index deletion queued up, but
	 * we're removing the inode completely so that'll be taken care of in
	 * the truncate.
	 */
	btrfs_kill_delayed_inode_items(BTRFS_I(inode));

	/* Temporary on-stack reservation sized for one metadata item. */
	btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
	rsv.size = btrfs_calc_metadata_size(fs_info, 1);
	rsv.failfast = true;

	btrfs_i_size_write(BTRFS_I(inode), 0);

	/*
	 * Truncate all items of the inode.  Each iteration refills the
	 * reservation, joins a transaction and truncates as much as it can;
	 * -ENOSPC and -EAGAIN cause another iteration, any other error aborts
	 * the deletion.
	 */
	while (1) {
		struct btrfs_truncate_control control = {
			.inode = BTRFS_I(inode),
			.ino = btrfs_ino(BTRFS_I(inode)),
			.new_size = 0,
			.min_type = 0,
		};

		trans = evict_refill_and_join(root, &rsv);
		if (IS_ERR(trans))
			goto out_release;

		/* Charge the truncation to our temporary reservation. */
		trans->block_rsv = &rsv;

		ret = btrfs_truncate_inode_items(trans, root, &control);
		trans->block_rsv = &fs_info->trans_block_rsv;
		btrfs_end_transaction(trans);
		/*
		 * We have not added new delayed items for our inode after we
		 * have flushed its delayed items, so no need to throttle on
		 * delayed items. However we have modified extent buffers.
		 */
		btrfs_btree_balance_dirty_nodelay(fs_info);
		if (ret && ret != -ENOSPC && ret != -EAGAIN)
			goto out_release;
		else if (!ret)
			break;
	}

	/*
	 * Errors here aren't a big deal, it just means we leave orphan items in
	 * the tree. They will be cleaned up on the next mount. If the inode
	 * number gets reused, cleanup deletes the orphan item without doing
	 * anything, and unlink reuses the existing orphan item.
	 *
	 * If it turns out that we are dropping too many of these, we might want
	 * to add a mechanism for retrying these after a commit.
	 */
	trans = evict_refill_and_join(root, &rsv);
	if (!IS_ERR(trans)) {
		trans->block_rsv = &rsv;
		btrfs_orphan_del(trans, BTRFS_I(inode));
		trans->block_rsv = &fs_info->trans_block_rsv;
		btrfs_end_transaction(trans);
	}

out_release:
	/* Give back whatever is left in the temporary reservation. */
	btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
out:
	/*
	 * If we didn't successfully delete, the orphan item will still be in
	 * the tree and we'll retry on the next mount. Again, we might also want
	 * to retry these periodically in the future.
	 */
	btrfs_remove_delayed_node(BTRFS_I(inode));
	fsverity_cleanup_inode(inode);
	clear_inode(inode);
}
 5672
 5673/*
 5674 * Return the key found in the dir entry in the location pointer, fill @type
 5675 * with BTRFS_FT_*, and return 0.
 5676 *
 5677 * If no dir entries were found, returns -ENOENT.
 5678 * If found a corrupted location in dir entry, returns -EUCLEAN.
 5679 */
 5680static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
 5681			       struct btrfs_key *location, u8 *type)
 5682{
 5683	struct btrfs_dir_item *di;
 5684	BTRFS_PATH_AUTO_FREE(path);
 5685	struct btrfs_root *root = dir->root;
 5686	int ret = 0;
 5687	struct fscrypt_name fname;
 5688
 5689	path = btrfs_alloc_path();
 5690	if (!path)
 5691		return -ENOMEM;
 5692
 5693	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
 5694	if (ret < 0)
 5695		return ret;
 5696	/*
 5697	 * fscrypt_setup_filename() should never return a positive value, but
 5698	 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
 5699	 */
 5700	ASSERT(ret == 0);
 5701
 5702	/* This needs to handle no-key deletions later on */
 5703
 5704	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
 5705				   &fname.disk_name, 0);
 5706	if (IS_ERR_OR_NULL(di)) {
 5707		ret = di ? PTR_ERR(di) : -ENOENT;
 5708		goto out;
 5709	}
 5710
 5711	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
 5712	if (unlikely(location->type != BTRFS_INODE_ITEM_KEY &&
 5713		     location->type != BTRFS_ROOT_ITEM_KEY)) {
 5714		ret = -EUCLEAN;
 5715		btrfs_warn(root->fs_info,
 5716"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location " BTRFS_KEY_FMT ")",
 5717			   __func__, fname.disk_name.name, btrfs_ino(dir),
 5718			   BTRFS_KEY_FMT_VALUE(location));
 5719	}
 5720	if (!ret)
 5721		*type = btrfs_dir_ftype(path->nodes[0], di);
 5722out:
 5723	fscrypt_free_filename(&fname);
 5724	return ret;
 5725}
 5726
 5727/*
 5728 * when we hit a tree root in a directory, the btrfs part of the inode
 5729 * needs to be changed to reflect the root directory of the tree root.  This
 5730 * is kind of like crossing a mount point.
 5731 */
 5732static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
 5733				    struct btrfs_inode *dir,
 5734				    struct dentry *dentry,
 5735				    struct btrfs_key *location,
 5736				    struct btrfs_root **sub_root)
 5737{
 5738	BTRFS_PATH_AUTO_FREE(path);
 5739	struct btrfs_root *new_root;
 5740	struct btrfs_root_ref *ref;
 5741	struct extent_buffer *leaf;
 5742	struct btrfs_key key;
 5743	int ret;
 5744	int err = 0;
 5745	struct fscrypt_name fname;
 5746
 5747	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
 5748	if (ret)
 5749		return ret;
 5750
 5751	path = btrfs_alloc_path();
 5752	if (!path) {
 5753		err = -ENOMEM;
 5754		goto out;
 5755	}
 5756
 5757	err = -ENOENT;
 5758	key.objectid = btrfs_root_id(dir->root);
 5759	key.type = BTRFS_ROOT_REF_KEY;
 5760	key.offset = location->objectid;
 5761
 5762	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
 5763	if (ret) {
 5764		if (ret < 0)
 5765			err = ret;
 5766		goto out;
 5767	}
 5768
 5769	leaf = path->nodes[0];
 5770	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
 5771	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
 5772	    btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
 5773		goto out;
 5774
 5775	ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
 5776				   (unsigned long)(ref + 1), fname.disk_name.len);
 5777	if (ret)
 5778		goto out;
 5779
 5780	btrfs_release_path(path);
 5781
 5782	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
 5783	if (IS_ERR(new_root)) {
 5784		err = PTR_ERR(new_root);
 5785		goto out;
 5786	}
 5787
 5788	*sub_root = new_root;
 5789	location->objectid = btrfs_root_dirid(&new_root->root_item);
 5790	location->type = BTRFS_INODE_ITEM_KEY;
 5791	location->offset = 0;
 5792	err = 0;
 5793out:
 5794	fscrypt_free_filename(&fname);
 5795	return err;
 5796}
 5797
 5798
 5799
 5800static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
 5801{
 5802	struct btrfs_root *root = inode->root;
 5803	struct btrfs_inode *entry;
 5804	bool empty = false;
 5805
 5806	xa_lock(&root->inodes);
 5807	/*
 5808	 * This btrfs_inode is being freed and has already been unhashed at this
 5809	 * point. It's possible that another btrfs_inode has already been
 5810	 * allocated for the same inode and inserted itself into the root, so
 5811	 * don't delete it in that case.
 5812	 *
 5813	 * Note that this shouldn't need to allocate memory, so the gfp flags
 5814	 * don't really matter.
 5815	 */
 5816	entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL,
 5817			     GFP_ATOMIC);
 5818	if (entry == inode)
 5819		empty = xa_empty(&root->inodes);
 5820	xa_unlock(&root->inodes);
 5821
 5822	if (empty && btrfs_root_refs(&root->root_item) == 0) {
 5823		xa_lock(&root->inodes);
 5824		empty = xa_empty(&root->inodes);
 5825		xa_unlock(&root->inodes);
 5826		if (empty)
 5827			btrfs_add_dead_root(root);
 5828	}
 5829}
 5830
 5831
 5832static int btrfs_init_locked_inode(struct inode *inode, void *p)
 5833{
 5834	struct btrfs_iget_args *args = p;
 5835
 5836	btrfs_set_inode_number(BTRFS_I(inode), args->ino);
 5837	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
 5838
 5839	if (args->root && args->root == args->root->fs_info->tree_root &&
 5840	    args->ino != BTRFS_BTREE_INODE_OBJECTID)
 5841		set_bit(BTRFS_INODE_FREE_SPACE_INODE,
 5842			&BTRFS_I(inode)->runtime_flags);
 5843	return 0;
 5844}
 5845
 5846static int btrfs_find_actor(struct inode *inode, void *opaque)
 5847{
 5848	struct btrfs_iget_args *args = opaque;
 5849
 5850	return args->ino == btrfs_ino(BTRFS_I(inode)) &&
 5851		args->root == BTRFS_I(inode)->root;
 5852}
 5853
 5854static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
 5855{
 5856	struct inode *inode;
 5857	struct btrfs_iget_args args;
 5858	unsigned long hashval = btrfs_inode_hash(ino, root);
 5859
 5860	args.ino = ino;
 5861	args.root = root;
 5862
 5863	inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
 5864			     btrfs_init_locked_inode,
 5865			     (void *)&args);
 5866	if (!inode)
 5867		return NULL;
 5868	return BTRFS_I(inode);
 5869}
 5870
 5871/*
 5872 * Get an inode object given its inode number and corresponding root.  Path is
 5873 * preallocated to prevent recursing back to iget through allocator.
 5874 */
 5875struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
 5876				    struct btrfs_path *path)
 5877{
 5878	struct btrfs_inode *inode;
 5879	int ret;
 5880
 5881	inode = btrfs_iget_locked(ino, root);
 5882	if (!inode)
 5883		return ERR_PTR(-ENOMEM);
 5884
 5885	if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW))
 5886		return inode;
 5887
 5888	ret = btrfs_read_locked_inode(inode, path);
 5889	if (ret)
 5890		return ERR_PTR(ret);
 5891
 5892	unlock_new_inode(&inode->vfs_inode);
 5893	return inode;
 5894}
 5895
 5896/*
 5897 * Get an inode object given its inode number and corresponding root.
 5898 */
 5899struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
 5900{
 5901	struct btrfs_inode *inode;
 5902	struct btrfs_path *path;
 5903	int ret;
 5904
 5905	inode = btrfs_iget_locked(ino, root);
 5906	if (!inode)
 5907		return ERR_PTR(-ENOMEM);
 5908
 5909	if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW))
 5910		return inode;
 5911
 5912	path = btrfs_alloc_path();
 5913	if (!path) {
 5914		iget_failed(&inode->vfs_inode);
 5915		return ERR_PTR(-ENOMEM);
 5916	}
 5917
 5918	ret = btrfs_read_locked_inode(inode, path);
 5919	btrfs_free_path(path);
 5920	if (ret)
 5921		return ERR_PTR(ret);
 5922
 5923	if (S_ISDIR(inode->vfs_inode.i_mode))
 5924		inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC;
 5925	unlock_new_inode(&inode->vfs_inode);
 5926	return inode;
 5927}
 5928
 5929static struct btrfs_inode *new_simple_dir(struct inode *dir,
 5930					  struct btrfs_key *key,
 5931					  struct btrfs_root *root)
 5932{
 5933	struct timespec64 ts;
 5934	struct inode *vfs_inode;
 5935	struct btrfs_inode *inode;
 5936
 5937	vfs_inode = new_inode(dir->i_sb);
 5938	if (!vfs_inode)
 5939		return ERR_PTR(-ENOMEM);
 5940
 5941	inode = BTRFS_I(vfs_inode);
 5942	inode->root = btrfs_grab_root(root);
 5943	inode->ref_root_id = key->objectid;
 5944	set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags);
 5945	set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags);
 5946
 5947	btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
 5948	/*
 5949	 * We only need lookup, the rest is read-only and there's no inode
 5950	 * associated with the dentry
 5951	 */
 5952	vfs_inode->i_op = &simple_dir_inode_operations;
 5953	vfs_inode->i_opflags &= ~IOP_XATTR;
 5954	vfs_inode->i_fop = &simple_dir_operations;
 5955	vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
 5956
 5957	ts = inode_set_ctime_current(vfs_inode);
 5958	inode_set_mtime_to_ts(vfs_inode, ts);
 5959	inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir));
 5960	inode->i_otime_sec = ts.tv_sec;
 5961	inode->i_otime_nsec = ts.tv_nsec;
 5962
 5963	vfs_inode->i_uid = dir->i_uid;
 5964	vfs_inode->i_gid = dir->i_gid;
 5965
 5966	return inode;
 5967}
 5968
/*
 * The BTRFS_FT_* file type values must be numerically identical to the
 * generic FT_* values, because this file feeds btrfs types to the generic
 * helpers (fs_umode_to_ftype(), fs_ftype_to_dtype()) without translation.
 */
static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
static_assert(BTRFS_FT_DIR == FT_DIR);
static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
static_assert(BTRFS_FT_FIFO == FT_FIFO);
static_assert(BTRFS_FT_SOCK == FT_SOCK);
static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
 5977
 5978static inline u8 btrfs_inode_type(const struct btrfs_inode *inode)
 5979{
 5980	return fs_umode_to_ftype(inode->vfs_inode.i_mode);
 5981}
 5982
 5983struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 5984{
 5985	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
 5986	struct btrfs_inode *inode;
 5987	struct btrfs_root *root = BTRFS_I(dir)->root;
 5988	struct btrfs_root *sub_root = root;
 5989	struct btrfs_key location = { 0 };
 5990	u8 di_type = 0;
 5991	int ret = 0;
 5992
 5993	if (dentry->d_name.len > BTRFS_NAME_LEN)
 5994		return ERR_PTR(-ENAMETOOLONG);
 5995
 5996	ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
 5997	if (ret < 0)
 5998		return ERR_PTR(ret);
 5999
 6000	if (location.type == BTRFS_INODE_ITEM_KEY) {
 6001		inode = btrfs_iget(location.objectid, root);
 6002		if (IS_ERR(inode))
 6003			return ERR_CAST(inode);
 6004
 6005		/* Do extra check against inode mode with di_type */
 6006		if (unlikely(btrfs_inode_type(inode) != di_type)) {
 6007			btrfs_crit(fs_info,
 6008"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
 6009				  inode->vfs_inode.i_mode, btrfs_inode_type(inode),
 6010				  di_type);
 6011			iput(&inode->vfs_inode);
 6012			return ERR_PTR(-EUCLEAN);
 6013		}
 6014		return &inode->vfs_inode;
 6015	}
 6016
 6017	ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
 6018				       &location, &sub_root);
 6019	if (ret < 0) {
 6020		if (ret != -ENOENT)
 6021			inode = ERR_PTR(ret);
 6022		else
 6023			inode = new_simple_dir(dir, &location, root);
 6024	} else {
 6025		inode = btrfs_iget(location.objectid, sub_root);
 6026		btrfs_put_root(sub_root);
 6027
 6028		if (IS_ERR(inode))
 6029			return ERR_CAST(inode);
 6030
 6031		down_read(&fs_info->cleanup_work_sem);
 6032		if (!sb_rdonly(inode->vfs_inode.i_sb))
 6033			ret = btrfs_orphan_cleanup(sub_root);
 6034		up_read(&fs_info->cleanup_work_sem);
 6035		if (ret) {
 6036			iput(&inode->vfs_inode);
 6037			inode = ERR_PTR(ret);
 6038		}
 6039	}
 6040
 6041	if (IS_ERR(inode))
 6042		return ERR_CAST(inode);
 6043
 6044	return &inode->vfs_inode;
 6045}
 6046
 6047static int btrfs_dentry_delete(const struct dentry *dentry)
 6048{
 6049	struct btrfs_root *root;
 6050	struct inode *inode = d_inode(dentry);
 6051
 6052	if (!inode && !IS_ROOT(dentry))
 6053		inode = d_inode(dentry->d_parent);
 6054
 6055	if (inode) {
 6056		root = BTRFS_I(inode)->root;
 6057		if (btrfs_root_refs(&root->root_item) == 0)
 6058			return 1;
 6059
 6060		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
 6061			return 1;
 6062	}
 6063	return 0;
 6064}
 6065
 6066static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 6067				   unsigned int flags)
 6068{
 6069	struct inode *inode = btrfs_lookup_dentry(dir, dentry);
 6070
 6071	if (inode == ERR_PTR(-ENOENT))
 6072		inode = NULL;
 6073	return d_splice_alias(inode, dentry);
 6074}
 6075
 6076/*
 6077 * Find the highest existing sequence number in a directory and then set the
 6078 * in-memory index_cnt variable to the first free sequence number.
 6079 */
 6080static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
 6081{
 6082	struct btrfs_root *root = inode->root;
 6083	struct btrfs_key key, found_key;
 6084	BTRFS_PATH_AUTO_FREE(path);
 6085	struct extent_buffer *leaf;
 6086	int ret;
 6087
 6088	key.objectid = btrfs_ino(inode);
 6089	key.type = BTRFS_DIR_INDEX_KEY;
 6090	key.offset = (u64)-1;
 6091
 6092	path = btrfs_alloc_path();
 6093	if (!path)
 6094		return -ENOMEM;
 6095
 6096	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 6097	if (ret < 0)
 6098		return ret;
 6099	/* FIXME: we should be able to handle this */
 6100	if (ret == 0)
 6101		return ret;
 6102
 6103	if (path->slots[0] == 0) {
 6104		inode->index_cnt = BTRFS_DIR_START_INDEX;
 6105		return 0;
 6106	}
 6107
 6108	path->slots[0]--;
 6109
 6110	leaf = path->nodes[0];
 6111	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 6112
 6113	if (found_key.objectid != btrfs_ino(inode) ||
 6114	    found_key.type != BTRFS_DIR_INDEX_KEY) {
 6115		inode->index_cnt = BTRFS_DIR_START_INDEX;
 6116		return 0;
 6117	}
 6118
 6119	inode->index_cnt = found_key.offset + 1;
 6120
 6121	return 0;
 6122}
 6123
 6124static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
 6125{
 6126	int ret = 0;
 6127
 6128	btrfs_inode_lock(dir, 0);
 6129	if (dir->index_cnt == (u64)-1) {
 6130		ret = btrfs_inode_delayed_dir_index_count(dir);
 6131		if (ret) {
 6132			ret = btrfs_set_inode_index_count(dir);
 6133			if (ret)
 6134				goto out;
 6135		}
 6136	}
 6137
 6138	/* index_cnt is the index number of next new entry, so decrement it. */
 6139	*index = dir->index_cnt - 1;
 6140out:
 6141	btrfs_inode_unlock(dir, 0);
 6142
 6143	return ret;
 6144}
 6145
 6146/*
 6147 * All this infrastructure exists because dir_emit can fault, and we are holding
 6148 * the tree lock when doing readdir.  For now just allocate a buffer and copy
 6149 * our information into that, and then dir_emit from the buffer.  This is
 6150 * similar to what NFS does, only we don't keep the buffer around in pagecache
 6151 * because I'm afraid I'll mess that up.  Long term we need to make filldir do
 6152 * copy_to_user_inatomic so we don't have to worry about page faulting under the
 6153 * tree lock.
 6154 */
 6155static int btrfs_opendir(struct inode *inode, struct file *file)
 6156{
 6157	struct btrfs_file_private *private;
 6158	u64 last_index;
 6159	int ret;
 6160
 6161	ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
 6162	if (ret)
 6163		return ret;
 6164
 6165	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
 6166	if (!private)
 6167		return -ENOMEM;
 6168	private->last_index = last_index;
 6169	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
 6170	if (!private->filldir_buf) {
 6171		kfree(private);
 6172		return -ENOMEM;
 6173	}
 6174	file->private_data = private;
 6175	return 0;
 6176}
 6177
 6178static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
 6179{
 6180	struct btrfs_file_private *private = file->private_data;
 6181	int ret;
 6182
 6183	ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
 6184				       &private->last_index);
 6185	if (ret)
 6186		return ret;
 6187
 6188	return generic_file_llseek(file, offset, whence);
 6189}
 6190
/*
 * Header of one record staged in the readdir buffer.  The name bytes follow
 * the header directly, and the next record starts right after the name, so
 * records are not naturally aligned and the fields are accessed with
 * get_unaligned()/put_unaligned().
 */
struct dir_entry {
	u64 ino;	/* objectid of the dir item's location key */
	u64 offset;	/* offset of the DIR_INDEX key, used as ctx->pos */
	unsigned type;	/* file type as returned by fs_ftype_to_dtype() */
	int name_len;	/* number of name bytes following this header */
};
 6197
 6198static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
 6199{
 6200	while (entries--) {
 6201		struct dir_entry *entry = addr;
 6202		char *name = (char *)(entry + 1);
 6203
 6204		ctx->pos = get_unaligned(&entry->offset);
 6205		if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
 6206					 get_unaligned(&entry->ino),
 6207					 get_unaligned(&entry->type)))
 6208			return 1;
 6209		addr += sizeof(struct dir_entry) +
 6210			get_unaligned(&entry->name_len);
 6211		ctx->pos++;
 6212	}
 6213	return 0;
 6214}
 6215
/*
 * Emit the entries of the open directory @file through @ctx.
 *
 * DIR_INDEX items are walked starting at ctx->pos and staged as dir_entry
 * records in the file's private buffer; the buffer is handed to
 * btrfs_filldir() only after the tree path has been released.  Delayed
 * insertions are emitted afterwards.
 *
 * Returns 0 on success, also when emission stopped early, or a negative errno
 * if the path allocation or the tree iteration failed.
 */
static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_private *private = file->private_data;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	struct btrfs_key found_key;
	BTRFS_PATH_AUTO_FREE(path);
	void *addr;
	LIST_HEAD(ins_list);
	LIST_HEAD(del_list);
	int ret;
	char *name_ptr;
	int name_len;
	int entries = 0;
	int total_len = 0;
	bool put = false;
	struct btrfs_key location;

	if (!dir_emit_dots(file, ctx))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	addr = private->filldir_buf;
	path->reada = READA_FORWARD;

	/* Collect the delayed insertions and deletions for this directory. */
	put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index,
					      &ins_list, &del_list);

again:
	/* (Re)start the walk at the current position. */
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = ctx->pos;
	key.objectid = btrfs_ino(BTRFS_I(inode));

	btrfs_for_each_slot(root, &key, &found_key, path, ret) {
		struct dir_entry *entry;
		struct extent_buffer *leaf = path->nodes[0];
		u8 ftype;

		/* Stop once we leave this directory's DIR_INDEX items. */
		if (found_key.objectid != key.objectid)
			break;
		if (found_key.type != BTRFS_DIR_INDEX_KEY)
			break;
		if (found_key.offset < ctx->pos)
			continue;
		/* Entries beyond the recorded last index are not reported. */
		if (found_key.offset > private->last_index)
			break;
		/* Skip entries that have a pending delayed deletion. */
		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
			continue;
		di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
		name_len = btrfs_dir_name_len(leaf, di);
		/*
		 * The staging buffer is full: release the path, emit what has
		 * been staged so far and restart the walk from the updated
		 * ctx->pos.
		 */
		if ((total_len + sizeof(struct dir_entry) + name_len) >=
		    PAGE_SIZE) {
			btrfs_release_path(path);
			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
			if (ret)
				goto nopos;
			addr = private->filldir_buf;
			entries = 0;
			total_len = 0;
			goto again;
		}

		/* Stage one record: header followed by the name bytes. */
		ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
		entry = addr;
		name_ptr = (char *)(entry + 1);
		read_extent_buffer(leaf, name_ptr,
				   (unsigned long)(di + 1), name_len);
		put_unaligned(name_len, &entry->name_len);
		put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
		btrfs_dir_item_key_to_cpu(leaf, di, &location);
		put_unaligned(location.objectid, &entry->ino);
		put_unaligned(found_key.offset, &entry->offset);
		entries++;
		addr += sizeof(struct dir_entry) + name_len;
		total_len += sizeof(struct dir_entry) + name_len;
	}
	/* Catch error encountered during iteration */
	if (ret < 0)
		goto err;

	btrfs_release_path(path);

	/* Emit whatever is still staged in the buffer. */
	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
	if (ret)
		goto nopos;

	if (btrfs_readdir_delayed_dir_index(ctx, &ins_list))
		goto nopos;

	/*
	 * Stop new entries from being returned after we return the last
	 * entry.
	 *
	 * New directory entries are assigned a strictly increasing
	 * offset.  This means that new entries created during readdir
	 * are *guaranteed* to be seen in the future by that readdir.
	 * This has broken buggy programs which operate on names as
	 * they're returned by readdir.  Until we reuse freed offsets
	 * we have this hack to stop new entries from being returned
	 * under the assumption that they'll never reach this huge
	 * offset.
	 *
	 * This is being careful not to overflow 32bit loff_t unless the
	 * last entry requires it because doing so has broken 32bit apps
	 * in the past.
	 */
	if (ctx->pos >= INT_MAX)
		ctx->pos = LLONG_MAX;
	else
		ctx->pos = INT_MAX;
nopos:
	/* Stopping early is not an error. */
	ret = 0;
err:
	if (put)
		btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
	return ret;
}
 6338
 6339/*
 6340 * This is somewhat expensive, updating the tree every time the
 6341 * inode changes.  But, it is most likely to find the inode in cache.
 6342 * FIXME, needs more benchmarking...there are no reasons other than performance
 6343 * to keep or drop this code.
 6344 */
 6345static int btrfs_dirty_inode(struct btrfs_inode *inode)
 6346{
 6347	struct btrfs_root *root = inode->root;
 6348	struct btrfs_fs_info *fs_info = root->fs_info;
 6349	struct btrfs_trans_handle *trans;
 6350	int ret;
 6351
 6352	if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
 6353		return 0;
 6354
 6355	trans = btrfs_join_transaction(root);
 6356	if (IS_ERR(trans))
 6357		return PTR_ERR(trans);
 6358
 6359	ret = btrfs_update_inode(trans, inode);
 6360	if (ret == -ENOSPC || ret == -EDQUOT) {
 6361		/* whoops, lets try again with the full transaction */
 6362		btrfs_end_transaction(trans);
 6363		trans = btrfs_start_transaction(root, 1);
 6364		if (IS_ERR(trans))
 6365			return PTR_ERR(trans);
 6366
 6367		ret = btrfs_update_inode(trans, inode);
 6368	}
 6369	btrfs_end_transaction(trans);
 6370	if (inode->delayed_node)
 6371		btrfs_balance_delayed_items(fs_info);
 6372
 6373	return ret;
 6374}
 6375
 6376/*
 6377 * We need our own ->update_time so that we can return error on ENOSPC for
 6378 * updating the inode in the case of file write and mmap writes.
 6379 */
 6380static int btrfs_update_time(struct inode *inode, int flags)
 6381{
 6382	struct btrfs_root *root = BTRFS_I(inode)->root;
 6383	bool dirty;
 6384
 6385	if (btrfs_root_readonly(root))
 6386		return -EROFS;
 6387
 6388	dirty = inode_update_timestamps(inode, flags);
 6389	return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
 6390}
 6391
 6392/*
 6393 * helper to find a free sequence number in a given directory.  This current
 6394 * code is very simple, later versions will do smarter things in the btree
 6395 */
 6396int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
 6397{
 6398	int ret = 0;
 6399
 6400	if (dir->index_cnt == (u64)-1) {
 6401		ret = btrfs_inode_delayed_dir_index_count(dir);
 6402		if (ret) {
 6403			ret = btrfs_set_inode_index_count(dir);
 6404			if (ret)
 6405				return ret;
 6406		}
 6407	}
 6408
 6409	*index = dir->index_cnt;
 6410	dir->index_cnt++;
 6411
 6412	return ret;
 6413}
 6414
 6415static int btrfs_insert_inode_locked(struct inode *inode)
 6416{
 6417	struct btrfs_iget_args args;
 6418
 6419	args.ino = btrfs_ino(BTRFS_I(inode));
 6420	args.root = BTRFS_I(inode)->root;
 6421
 6422	return insert_inode_locked4(inode,
 6423		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
 6424		   btrfs_find_actor, &args);
 6425}
 6426
 6427int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
 6428			    unsigned int *trans_num_items)
 6429{
 6430	struct inode *dir = args->dir;
 6431	struct inode *inode = args->inode;
 6432	int ret;
 6433
 6434	if (!args->orphan) {
 6435		ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
 6436					     &args->fname);
 6437		if (ret)
 6438			return ret;
 6439	}
 6440
 6441	ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
 6442	if (ret) {
 6443		fscrypt_free_filename(&args->fname);
 6444		return ret;
 6445	}
 6446
 6447	/* 1 to add inode item */
 6448	*trans_num_items = 1;
 6449	/* 1 to add compression property */
 6450	if (BTRFS_I(dir)->prop_compress)
 6451		(*trans_num_items)++;
 6452	/* 1 to add default ACL xattr */
 6453	if (args->default_acl)
 6454		(*trans_num_items)++;
 6455	/* 1 to add access ACL xattr */
 6456	if (args->acl)
 6457		(*trans_num_items)++;
 6458#ifdef CONFIG_SECURITY
 6459	/* 1 to add LSM xattr */
 6460	if (dir->i_security)
 6461		(*trans_num_items)++;
 6462#endif
 6463	if (args->orphan) {
 6464		/* 1 to add orphan item */
 6465		(*trans_num_items)++;
 6466	} else {
 6467		/*
 6468		 * 1 to add dir item
 6469		 * 1 to add dir index
 6470		 * 1 to update parent inode item
 6471		 *
 6472		 * No need for 1 unit for the inode ref item because it is
 6473		 * inserted in a batch together with the inode item at
 6474		 * btrfs_create_new_inode().
 6475		 */
 6476		*trans_num_items += 3;
 6477	}
 6478	return 0;
 6479}
 6480
 6481void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
 6482{
 6483	posix_acl_release(args->acl);
 6484	posix_acl_release(args->default_acl);
 6485	fscrypt_free_filename(&args->fname);
 6486}
 6487
 6488/*
 6489 * Inherit flags from the parent inode.
 6490 *
 6491 * Currently only the compression flags and the cow flags are inherited.
 6492 */
 6493static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
 6494{
 6495	unsigned int flags;
 6496
 6497	flags = dir->flags;
 6498
 6499	if (flags & BTRFS_INODE_NOCOMPRESS) {
 6500		inode->flags &= ~BTRFS_INODE_COMPRESS;
 6501		inode->flags |= BTRFS_INODE_NOCOMPRESS;
 6502	} else if (flags & BTRFS_INODE_COMPRESS) {
 6503		inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
 6504		inode->flags |= BTRFS_INODE_COMPRESS;
 6505	}
 6506
 6507	if (flags & BTRFS_INODE_NODATACOW) {
 6508		inode->flags |= BTRFS_INODE_NODATACOW;
 6509		if (S_ISREG(inode->vfs_inode.i_mode))
 6510			inode->flags |= BTRFS_INODE_NODATASUM;
 6511	}
 6512
 6513	btrfs_sync_inode_flags_to_i_flags(inode);
 6514}
 6515
/*
 * Create the btree items for a new inode and finish its in-memory setup.
 *
 * @trans:	transaction handle used for all btree modifications
 * @args:	new inode arguments: parent directory, the new VFS inode, the
 *		filename and the orphan/subvol flags
 *
 * Grabs a free objectid in the inode's root, inserts the inode item (and,
 * unless @args->orphan is set, the inode ref item in the same batch),
 * inherits properties, initializes security xattrs for non-subvolume inodes
 * and finally either adds an orphan item (@args->orphan) or links the inode
 * into the parent directory.
 *
 * Failures before the inode is inserted into the inode hash go to "out".
 * Later failures go to "discard", which calls discard_new_inode() on an extra
 * reference so that the caller keeps owning its own reference.
 *
 * Return: 0 on success, otherwise the error of the step that failed.
 */
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_new_inode_args *args)
{
	struct timespec64 ts;
	struct inode *dir = args->dir;
	struct inode *inode = args->inode;
	const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
	struct btrfs_root *root;
	struct btrfs_inode_item *inode_item;
	struct btrfs_path *path;
	u64 objectid;
	struct btrfs_inode_ref *ref;
	struct btrfs_key key[2];
	u32 sizes[2];
	struct btrfs_item_batch batch;
	unsigned long ptr;
	int ret;
	bool xa_reserved = false;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!args->subvol)
		BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
	root = BTRFS_I(inode)->root;

	ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
	if (ret)
		goto out;

	ret = btrfs_get_free_objectid(root, &objectid);
	if (ret)
		goto out;
	btrfs_set_inode_number(BTRFS_I(inode), objectid);

	/* Reserve the slot now; it is released at "out" on failure. */
	ret = xa_reserve(&root->inodes, objectid, GFP_NOFS);
	if (ret)
		goto out;
	xa_reserved = true;

	if (args->orphan) {
		/*
		 * O_TMPFILE, set link count to 0, so that after this point, we
		 * fill in an inode item with the correct link count.
		 */
		set_nlink(inode, 0);
	} else {
		trace_btrfs_inode_request(dir);

		ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
		if (ret)
			goto out;
	}

	if (S_ISDIR(inode->i_mode))
		BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;

	BTRFS_I(inode)->generation = trans->transid;
	inode->i_generation = BTRFS_I(inode)->generation;

	/*
	 * We don't have any capability xattrs set here yet, shortcut any
	 * queries for the xattrs here.  If we add them later via the inode
	 * security init path or any other path this flag will be cleared.
	 */
	set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);

	/*
	 * Subvolumes don't inherit flags from their parent directory.
	 * Originally this was probably by accident, but we probably can't
	 * change it now without compatibility issues.
	 */
	if (!args->subvol)
		btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));

	btrfs_set_inode_mapping_order(BTRFS_I(inode));
	if (S_ISREG(inode->i_mode)) {
		if (btrfs_test_opt(fs_info, NODATASUM))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
		if (btrfs_test_opt(fs_info, NODATACOW))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
				BTRFS_INODE_NODATASUM;
		btrfs_update_inode_mapping_flags(BTRFS_I(inode));
	}

	ret = btrfs_insert_inode_locked(inode);
	if (ret < 0) {
		/* Give back the directory index taken above. */
		if (!args->orphan)
			BTRFS_I(dir)->index_cnt--;
		goto out;
	}

	/*
	 * We could have gotten an inode number from somebody who was fsynced
	 * and then removed in this same transaction, so let's just set full
	 * sync since it will be a full sync anyway and this will blow away the
	 * old info in the log.
	 */
	btrfs_set_inode_full_sync(BTRFS_I(inode));

	key[0].objectid = objectid;
	key[0].type = BTRFS_INODE_ITEM_KEY;
	key[0].offset = 0;

	sizes[0] = sizeof(struct btrfs_inode_item);

	if (!args->orphan) {
		/*
		 * Start new inodes with an inode_ref. This is slightly more
		 * efficient for small numbers of hard links since they will
		 * be packed into one item. Extended refs will kick in if we
		 * add more hard links than can fit in the ref item.
		 */
		key[1].objectid = objectid;
		key[1].type = BTRFS_INODE_REF_KEY;
		if (args->subvol) {
			key[1].offset = objectid;
			sizes[1] = 2 + sizeof(*ref);
		} else {
			key[1].offset = btrfs_ino(BTRFS_I(dir));
			sizes[1] = name->len + sizeof(*ref);
		}
	}

	batch.keys = &key[0];
	batch.data_sizes = &sizes[0];
	batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
	batch.nr = args->orphan ? 1 : 2;
	ret = btrfs_insert_empty_items(trans, root, path, &batch);
	if (unlikely(ret != 0)) {
		btrfs_abort_transaction(trans, ret);
		goto discard;
	}

	ts = simple_inode_init_ts(inode);
	BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
	BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;

	/*
	 * We're going to fill the inode item now, so at this point the inode
	 * must be fully initialized.
	 */

	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				  struct btrfs_inode_item);
	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
			     sizeof(*inode_item));
	fill_inode_item(trans, path->nodes[0], inode_item, inode);

	if (!args->orphan) {
		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
				     struct btrfs_inode_ref);
		/* The name is stored right after the ref header. */
		ptr = (unsigned long)(ref + 1);
		if (args->subvol) {
			btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
			btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
			write_extent_buffer(path->nodes[0], "..", ptr, 2);
		} else {
			btrfs_set_inode_ref_name_len(path->nodes[0], ref,
						     name->len);
			btrfs_set_inode_ref_index(path->nodes[0], ref,
						  BTRFS_I(inode)->dir_index);
			write_extent_buffer(path->nodes[0], name->name, ptr,
					    name->len);
		}
	}

	/*
	 * We don't need the path anymore, plus inheriting properties, adding
	 * ACLs, security xattrs, orphan item or adding the link, will result in
	 * allocating yet another path. So just free our path.
	 */
	btrfs_free_path(path);
	path = NULL;

	if (args->subvol) {
		struct btrfs_inode *parent;

		/*
		 * Subvolumes inherit properties from their parent subvolume,
		 * not the directory they were created in.
		 */
		parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root);
		if (IS_ERR(parent)) {
			ret = PTR_ERR(parent);
		} else {
			ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
							parent);
			iput(&parent->vfs_inode);
		}
	} else {
		ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
						BTRFS_I(dir));
	}
	/* A failure to inherit props is only logged; ret is overwritten below. */
	if (ret) {
		btrfs_err(fs_info,
			  "error inheriting props for ino %llu (root %llu): %d",
			  btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
	}

	/*
	 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
	 * probably a bug.
	 */
	if (!args->subvol) {
		ret = btrfs_init_inode_security(trans, args);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto discard;
		}
	}

	ret = btrfs_add_inode_to_root(BTRFS_I(inode), false);
	if (WARN_ON(ret)) {
		/* Shouldn't happen, we used xa_reserve() before. */
		btrfs_abort_transaction(trans, ret);
		goto discard;
	}

	trace_btrfs_inode_new(inode);
	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));

	btrfs_update_root_times(trans, root);

	if (args->orphan) {
		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto discard;
		}
	} else {
		ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
				     0, BTRFS_I(inode)->dir_index);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			goto discard;
		}
	}

	return 0;

discard:
	/*
	 * discard_new_inode() calls iput(), but the caller owns the reference
	 * to the inode.
	 */
	ihold(inode);
	discard_new_inode(inode);
out:
	if (xa_reserved)
		xa_release(&root->inodes, objectid);

	btrfs_free_path(path);
	return ret;
}
 6773
 6774/*
 6775 * utility function to add 'inode' into 'parent_inode' with
 6776 * a give name and a given sequence number.
 6777 * if 'add_backref' is true, also insert a backref from the
 6778 * inode to the parent directory.
 6779 */
 6780int btrfs_add_link(struct btrfs_trans_handle *trans,
 6781		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
 6782		   const struct fscrypt_str *name, bool add_backref, u64 index)
 6783{
 6784	int ret = 0;
 6785	struct btrfs_key key;
 6786	struct btrfs_root *root = parent_inode->root;
 6787	u64 ino = btrfs_ino(inode);
 6788	u64 parent_ino = btrfs_ino(parent_inode);
 6789
 6790	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
 6791		memcpy(&key, &inode->root->root_key, sizeof(key));
 6792	} else {
 6793		key.objectid = ino;
 6794		key.type = BTRFS_INODE_ITEM_KEY;
 6795		key.offset = 0;
 6796	}
 6797
 6798	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
 6799		ret = btrfs_add_root_ref(trans, key.objectid,
 6800					 btrfs_root_id(root), parent_ino,
 6801					 index, name);
 6802	} else if (add_backref) {
 6803		ret = btrfs_insert_inode_ref(trans, root, name,
 6804					     ino, parent_ino, index);
 6805	}
 6806
 6807	/* Nothing to clean up yet */
 6808	if (ret)
 6809		return ret;
 6810
 6811	ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
 6812				    btrfs_inode_type(inode), index);
 6813	if (ret == -EEXIST || ret == -EOVERFLOW)
 6814		goto fail_dir_item;
 6815	else if (unlikely(ret)) {
 6816		btrfs_abort_transaction(trans, ret);
 6817		return ret;
 6818	}
 6819
 6820	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
 6821			   name->len * 2);
 6822	inode_inc_iversion(&parent_inode->vfs_inode);
 6823	update_time_after_link_or_unlink(parent_inode);
 6824
 6825	ret = btrfs_update_inode(trans, parent_inode);
 6826	if (ret)
 6827		btrfs_abort_transaction(trans, ret);
 6828	return ret;
 6829
 6830fail_dir_item:
 6831	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
 6832		u64 local_index;
 6833		int ret2;
 6834
 6835		ret2 = btrfs_del_root_ref(trans, key.objectid, btrfs_root_id(root),
 6836					  parent_ino, &local_index, name);
 6837		if (ret2)
 6838			btrfs_abort_transaction(trans, ret2);
 6839	} else if (add_backref) {
 6840		int ret2;
 6841
 6842		ret2 = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, NULL);
 6843		if (ret2)
 6844			btrfs_abort_transaction(trans, ret2);
 6845	}
 6846
 6847	/* Return the original error code */
 6848	return ret;
 6849}
 6850
 6851static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
 6852			       struct inode *inode)
 6853{
 6854	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
 6855	struct btrfs_root *root = BTRFS_I(dir)->root;
 6856	struct btrfs_new_inode_args new_inode_args = {
 6857		.dir = dir,
 6858		.dentry = dentry,
 6859		.inode = inode,
 6860	};
 6861	unsigned int trans_num_items;
 6862	struct btrfs_trans_handle *trans;
 6863	int ret;
 6864
 6865	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
 6866	if (ret)
 6867		goto out_inode;
 6868
 6869	trans = btrfs_start_transaction(root, trans_num_items);
 6870	if (IS_ERR(trans)) {
 6871		ret = PTR_ERR(trans);
 6872		goto out_new_inode_args;
 6873	}
 6874
 6875	ret = btrfs_create_new_inode(trans, &new_inode_args);
 6876	if (!ret) {
 6877		if (S_ISDIR(inode->i_mode))
 6878			inode->i_opflags |= IOP_FASTPERM_MAY_EXEC;
 6879		d_instantiate_new(dentry, inode);
 6880	}
 6881
 6882	btrfs_end_transaction(trans);
 6883	btrfs_btree_balance_dirty(fs_info);
 6884out_new_inode_args:
 6885	btrfs_new_inode_args_destroy(&new_inode_args);
 6886out_inode:
 6887	if (ret)
 6888		iput(inode);
 6889	return ret;
 6890}
 6891
 6892static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 6893		       struct dentry *dentry, umode_t mode, dev_t rdev)
 6894{
 6895	struct inode *inode;
 6896
 6897	inode = new_inode(dir->i_sb);
 6898	if (!inode)
 6899		return -ENOMEM;
 6900	inode_init_owner(idmap, inode, dir, mode);
 6901	inode->i_op = &btrfs_special_inode_operations;
 6902	init_special_inode(inode, inode->i_mode, rdev);
 6903	return btrfs_create_common(dir, dentry, inode);
 6904}
 6905
 6906static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
 6907			struct dentry *dentry, umode_t mode, bool excl)
 6908{
 6909	struct inode *inode;
 6910
 6911	inode = new_inode(dir->i_sb);
 6912	if (!inode)
 6913		return -ENOMEM;
 6914	inode_init_owner(idmap, inode, dir, mode);
 6915	inode->i_fop = &btrfs_file_operations;
 6916	inode->i_op = &btrfs_file_inode_operations;
 6917	inode->i_mapping->a_ops = &btrfs_aops;
 6918	return btrfs_create_common(dir, dentry, inode);
 6919}
 6920
 6921static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 6922		      struct dentry *dentry)
 6923{
 6924	struct btrfs_trans_handle *trans = NULL;
 6925	struct btrfs_root *root = BTRFS_I(dir)->root;
 6926	struct inode *inode = d_inode(old_dentry);
 6927	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 6928	struct fscrypt_name fname;
 6929	u64 index;
 6930	int ret;
 6931
 6932	/* do not allow sys_link's with other subvols of the same device */
 6933	if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
 6934		return -EXDEV;
 6935
 6936	if (inode->i_nlink >= BTRFS_LINK_MAX)
 6937		return -EMLINK;
 6938
 6939	ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
 6940	if (ret)
 6941		goto fail;
 6942
 6943	ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
 6944	if (ret)
 6945		goto fail;
 6946
 6947	/*
 6948	 * 2 items for inode and inode ref
 6949	 * 2 items for dir items
 6950	 * 1 item for parent inode
 6951	 * 1 item for orphan item deletion if O_TMPFILE
 6952	 */
 6953	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
 6954	if (IS_ERR(trans)) {
 6955		ret = PTR_ERR(trans);
 6956		trans = NULL;
 6957		goto fail;
 6958	}
 6959
 6960	/* There are several dir indexes for this inode, clear the cache. */
 6961	BTRFS_I(inode)->dir_index = 0ULL;
 6962	inode_inc_iversion(inode);
 6963	inode_set_ctime_current(inode);
 6964
 6965	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
 6966			     &fname.disk_name, 1, index);
 6967	if (ret)
 6968		goto fail;
 6969
 6970	/* Link added now we update the inode item with the new link count. */
 6971	inc_nlink(inode);
 6972	ret = btrfs_update_inode(trans, BTRFS_I(inode));
 6973	if (unlikely(ret)) {
 6974		btrfs_abort_transaction(trans, ret);
 6975		goto fail;
 6976	}
 6977
 6978	if (inode->i_nlink == 1) {
 6979		/*
 6980		 * If the new hard link count is 1, it's a file created with the
 6981		 * open(2) O_TMPFILE flag.
 6982		 */
 6983		ret = btrfs_orphan_del(trans, BTRFS_I(inode));
 6984		if (unlikely(ret)) {
 6985			btrfs_abort_transaction(trans, ret);
 6986			goto fail;
 6987		}
 6988	}
 6989
 6990	/* Grab reference for the new dentry passed to d_instantiate(). */
 6991	ihold(inode);
 6992	d_instantiate(dentry, inode);
 6993	btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent);
 6994
 6995fail:
 6996	fscrypt_free_filename(&fname);
 6997	if (trans)
 6998		btrfs_end_transaction(trans);
 6999	btrfs_btree_balance_dirty(fs_info);
 7000	return ret;
 7001}
 7002
 7003static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 7004				  struct dentry *dentry, umode_t mode)
 7005{
 7006	struct inode *inode;
 7007
 7008	inode = new_inode(dir->i_sb);
 7009	if (!inode)
 7010		return ERR_PTR(-ENOMEM);
 7011	inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
 7012	inode->i_op = &btrfs_dir_inode_operations;
 7013	inode->i_fop = &btrfs_dir_file_operations;
 7014	return ERR_PTR(btrfs_create_common(dir, dentry, inode));
 7015}
 7016
 7017static noinline int uncompress_inline(struct btrfs_path *path,
 7018				      struct folio *folio,
 7019				      struct btrfs_file_extent_item *item)
 7020{
 7021	int ret;
 7022	struct extent_buffer *leaf = path->nodes[0];
 7023	const u32 blocksize = leaf->fs_info->sectorsize;
 7024	char *tmp;
 7025	size_t max_size;
 7026	unsigned long inline_size;
 7027	unsigned long ptr;
 7028	int compress_type;
 7029
 7030	compress_type = btrfs_file_extent_compression(leaf, item);
 7031	max_size = btrfs_file_extent_ram_bytes(leaf, item);
 7032	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
 7033	tmp = kmalloc(inline_size, GFP_NOFS);
 7034	if (!tmp)
 7035		return -ENOMEM;
 7036	ptr = btrfs_file_extent_inline_start(item);
 7037
 7038	read_extent_buffer(leaf, tmp, ptr, inline_size);
 7039
 7040	max_size = min_t(unsigned long, blocksize, max_size);
 7041	ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
 7042			       max_size);
 7043
 7044	/*
 7045	 * decompression code contains a memset to fill in any space between the end
 7046	 * of the uncompressed data and the end of max_size in case the decompressed
 7047	 * data ends up shorter than ram_bytes.  That doesn't cover the hole between
 7048	 * the end of an inline extent and the beginning of the next block, so we
 7049	 * cover that region here.
 7050	 */
 7051
 7052	if (max_size < blocksize)
 7053		folio_zero_range(folio, max_size, blocksize - max_size);
 7054	kfree(tmp);
 7055	return ret;
 7056}
 7057
 7058static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
 7059{
 7060	const u32 blocksize = path->nodes[0]->fs_info->sectorsize;
 7061	struct btrfs_file_extent_item *fi;
 7062	void *kaddr;
 7063	size_t copy_size;
 7064
 7065	if (!folio || folio_test_uptodate(folio))
 7066		return 0;
 7067
 7068	ASSERT(folio_pos(folio) == 0);
 7069
 7070	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
 7071			    struct btrfs_file_extent_item);
 7072	if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
 7073		return uncompress_inline(path, folio, fi);
 7074
 7075	copy_size = min_t(u64, blocksize,
 7076			  btrfs_file_extent_ram_bytes(path->nodes[0], fi));
 7077	kaddr = kmap_local_folio(folio, 0);
 7078	read_extent_buffer(path->nodes[0], kaddr,
 7079			   btrfs_file_extent_inline_start(fi), copy_size);
 7080	kunmap_local(kaddr);
 7081	if (copy_size < blocksize)
 7082		folio_zero_range(folio, copy_size, blocksize - copy_size);
 7083	return 0;
 7084}
 7085
/*
 * Lookup the first extent overlapping a range in a file.
 *
 * @inode:	file to search in
 * @folio:	folio to read extent data into if the extent is inline
 * @start:	file offset
 * @len:	length of range starting at @start
 *
 * Return the first &struct extent_map which overlaps the given range, reading
 * it from the B-tree and caching it if necessary. Note that there may be more
 * extents which overlap the given range after the returned extent_map.
 *
 * A cached extent map is returned directly, unless it does not cover @start
 * or it is an inline extent and @folio is given; in those cases it is dropped
 * and the extent is looked up in the B-tree again.
 *
 * If @folio is not NULL and the extent is inline, this also reads the extent
 * data directly into the folio.
 *
 * Return: ERR_PTR on error, non-NULL extent_map on success.
 */
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
				    struct folio *folio, u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret = 0;
	u64 extent_start = 0;
	u64 extent_end = 0;
	u64 objectid = btrfs_ino(inode);
	int extent_type = -1;
	struct btrfs_path *path = NULL;
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *item;
	struct extent_buffer *leaf;
	struct btrfs_key found_key;
	struct extent_map *em = NULL;
	struct extent_map_tree *em_tree = &inode->extent_tree;

	read_lock(&em_tree->lock);
	em = btrfs_lookup_extent_mapping(em_tree, start, len);
	read_unlock(&em_tree->lock);

	if (em) {
		/* Only use the cached map if it covers @start and needs no inline read. */
		if (em->start > start || em->start + em->len <= start)
			btrfs_free_extent_map(em);
		else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
			btrfs_free_extent_map(em);
		else
			goto out;
	}
	em = btrfs_alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto out;
	}
	em->start = EXTENT_MAP_HOLE;
	em->disk_bytenr = EXTENT_MAP_HOLE;
	em->len = (u64)-1;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/* Chances are we'll be called again, so go ahead and do readahead */
	path->reada = READA_FORWARD;

	/*
	 * The same explanation in load_free_space_cache applies here as well,
	 * we only read when we're loading the free space cache, and at that
	 * point the commit_root has everything we need.
	 */
	if (btrfs_is_free_space_inode(inode)) {
		path->search_commit_root = true;
		path->skip_locking = true;
	}

	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
	if (ret < 0) {
		goto out;
	} else if (ret > 0) {
		/* No exact match: step back to the previous item, if there is one. */
		if (path->slots[0] == 0)
			goto not_found;
		path->slots[0]--;
		ret = 0;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0],
			      struct btrfs_file_extent_item);
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
	if (found_key.objectid != objectid ||
	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
		/*
		 * If we backup past the first extent we want to move forward
		 * and see if there is an extent in front of us, otherwise we'll
		 * say there is a hole for our whole search range which can
		 * cause problems.
		 */
		extent_end = start;
		goto next;
	}

	extent_type = btrfs_file_extent_type(leaf, item);
	extent_start = found_key.offset;
	extent_end = btrfs_file_extent_end(path);
	if (extent_type == BTRFS_FILE_EXTENT_REG ||
	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
		/* Only regular file could have regular/prealloc extent */
		if (unlikely(!S_ISREG(inode->vfs_inode.i_mode))) {
			ret = -EUCLEAN;
			btrfs_crit(fs_info,
		"regular/prealloc extent found for non-regular inode %llu",
				   btrfs_ino(inode));
			goto out;
		}
		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
						       extent_start);
	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
						      path->slots[0],
						      extent_start);
	}
next:
	if (start >= extent_end) {
		/* The item found ends at or before @start: look at the next one. */
		path->slots[0]++;
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				goto not_found;

			leaf = path->nodes[0];
		}
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != objectid ||
		    found_key.type != BTRFS_EXTENT_DATA_KEY)
			goto not_found;
		if (start + len <= found_key.offset)
			goto not_found;
		if (start > found_key.offset)
			goto next;

		/* New extent overlaps with existing one */
		em->start = start;
		em->len = found_key.offset - start;
		em->disk_bytenr = EXTENT_MAP_HOLE;
		goto insert;
	}

	btrfs_extent_item_to_extent_map(inode, path, item, em);

	if (extent_type == BTRFS_FILE_EXTENT_REG ||
	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
		goto insert;
	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
		/*
		 * Inline extent can only exist at file offset 0. This is
		 * ensured by tree-checker and inline extent creation path.
		 * Thus all members representing file offsets should be zero.
		 */
		ASSERT(extent_start == 0);
		ASSERT(em->start == 0);

		/*
		 * btrfs_extent_item_to_extent_map() should have properly
		 * initialized em members already.
		 *
		 * Other members are not utilized for inline extents.
		 */
		ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
		ASSERT(em->len == fs_info->sectorsize);

		ret = read_inline_extent(path, folio);
		if (ret < 0)
			goto out;
		goto insert;
	}
not_found:
	/* No extent item covers the range: report a hole for [start, start + len). */
	em->start = start;
	em->len = len;
	em->disk_bytenr = EXTENT_MAP_HOLE;
insert:
	ret = 0;
	btrfs_release_path(path);
	if (unlikely(em->start > start || btrfs_extent_map_end(em) <= start)) {
		btrfs_err(fs_info,
			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
			  em->start, em->len, start, len);
		ret = -EIO;
		goto out;
	}

	write_lock(&em_tree->lock);
	ret = btrfs_add_extent_mapping(inode, &em, start, len);
	write_unlock(&em_tree->lock);
out:
	btrfs_free_path(path);

	trace_btrfs_get_extent(root, inode, em);

	if (ret) {
		btrfs_free_extent_map(em);
		return ERR_PTR(ret);
	}
	return em;
}
 7291
 7292static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
 7293{
 7294	struct btrfs_block_group *block_group;
 7295	bool readonly = false;
 7296
 7297	block_group = btrfs_lookup_block_group(fs_info, bytenr);
 7298	if (!block_group || block_group->ro)
 7299		readonly = true;
 7300	if (block_group)
 7301		btrfs_put_block_group(block_group);
 7302	return readonly;
 7303}
 7304
/*
 * Check if we can do nocow write into the range [@offset, @offset + @len)
 *
 * @inode:	Inode the range belongs to
 * @offset:	File offset
 * @len:	The length to write, will be updated to the nocow writeable
 *		range
 * @file_extent: (optional) If not NULL, filled with the file extent found by
 *		the nocow check
 * @nowait:	Copied to the nowait flag of the path used for the lookup
 *
 * Return:
 * >0	and update @len if we can do nocow write
 *  0	if we can't do nocow write (any result other than 1 from
 *	can_nocow_file_extent(), including its errors, ends up here)
 * <0	if error happened (path allocation or file extent lookup failure, or
 *	-EAGAIN if a preallocated extent has delalloc in the checked range)
 *
 * NOTE: This only checks the file extents, caller is responsible to wait for
 *	 any ordered extents.
 */
noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
			      struct btrfs_file_extent *file_extent,
			      bool nowait)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct can_nocow_file_extent_args nocow_args = { 0 };
	BTRFS_PATH_AUTO_FREE(path);
	int ret;
	struct extent_buffer *leaf;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	int found_type;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->nowait = nowait;

	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
				       offset, 0);
	if (ret < 0)
		return ret;

	if (ret == 1) {
		if (path->slots[0] == 0) {
			/* Can't find the item, must COW. */
			return 0;
		}
		/* No exact match: check the previous item instead. */
		path->slots[0]--;
	}
	ret = 0;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != btrfs_ino(inode) ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		/* Not our file or wrong item type, must COW. */
		return 0;
	}

	if (key.offset > offset) {
		/* Wrong offset, must COW. */
		return 0;
	}

	/* The extent ends at or before @offset, must COW. */
	if (btrfs_file_extent_end(path) <= offset)
		return 0;

	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(leaf, fi);

	nocow_args.start = offset;
	nocow_args.end = offset + *len - 1;
	nocow_args.free_path = true;

	ret = can_nocow_file_extent(path, &key, inode, &nocow_args);
	/* can_nocow_file_extent() has freed the path. */
	path = NULL;

	if (ret != 1) {
		/* Treat errors as not being able to NOCOW. */
		return 0;
	}

	if (btrfs_extent_readonly(fs_info,
				  nocow_args.file_extent.disk_bytenr +
				  nocow_args.file_extent.offset))
		return 0;

	/*
	 * For a preallocated extent on an inode without NODATACOW, bail out
	 * with -EAGAIN if any part of the range already has delalloc set.
	 */
	if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 range_end;

		range_end = round_up(offset + nocow_args.file_extent.num_bytes,
				     root->fs_info->sectorsize) - 1;
		ret = btrfs_test_range_bit_exists(io_tree, offset, range_end,
						  EXTENT_DELALLOC);
		if (ret)
			return -EAGAIN;
	}

	if (file_extent)
		memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));

	*len = nocow_args.file_extent.num_bytes;

	return 1;
}
 7412
 7413/* The callers of this must take lock_extent() */
 7414struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
 7415				      const struct btrfs_file_extent *file_extent,
 7416				      int type)
 7417{
 7418	struct extent_map *em;
 7419	int ret;
 7420
 7421	/*
 7422	 * Note the missing NOCOW type.
 7423	 *
 7424	 * For pure NOCOW writes, we should not create an io extent map, but
 7425	 * just reusing the existing one.
 7426	 * Only PREALLOC writes (NOCOW write into preallocated range) can
 7427	 * create an io extent map.
 7428	 */
 7429	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
 7430	       type == BTRFS_ORDERED_COMPRESSED ||
 7431	       type == BTRFS_ORDERED_REGULAR);
 7432
 7433	switch (type) {
 7434	case BTRFS_ORDERED_PREALLOC:
 7435		/* We're only referring part of a larger preallocated extent. */
 7436		ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
 7437		break;
 7438	case BTRFS_ORDERED_REGULAR:
 7439		/* COW results a new extent matching our file extent size. */
 7440		ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
 7441		ASSERT(file_extent->ram_bytes == file_extent->num_bytes);
 7442
 7443		/* Since it's a new extent, we should not have any offset. */
 7444		ASSERT(file_extent->offset == 0);
 7445		break;
 7446	case BTRFS_ORDERED_COMPRESSED:
 7447		/* Must be compressed. */
 7448		ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);
 7449
 7450		/*
 7451		 * Encoded write can make us to refer to part of the
 7452		 * uncompressed extent.
 7453		 */
 7454		ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
 7455		break;
 7456	}
 7457
 7458	em = btrfs_alloc_extent_map();
 7459	if (!em)
 7460		return ERR_PTR(-ENOMEM);
 7461
 7462	em->start = start;
 7463	em->len = file_extent->num_bytes;
 7464	em->disk_bytenr = file_extent->disk_bytenr;
 7465	em->disk_num_bytes = file_extent->disk_num_bytes;
 7466	em->ram_bytes = file_extent->ram_bytes;
 7467	em->generation = -1;
 7468	em->offset = file_extent->offset;
 7469	em->flags |= EXTENT_FLAG_PINNED;
 7470	if (type == BTRFS_ORDERED_COMPRESSED)
 7471		btrfs_extent_map_set_compression(em, file_extent->compression);
 7472
 7473	ret = btrfs_replace_extent_map_range(inode, em, true);
 7474	if (ret) {
 7475		btrfs_free_extent_map(em);
 7476		return ERR_PTR(ret);
 7477	}
 7478
 7479	/* em got 2 refs now, callers needs to do btrfs_free_extent_map once. */
 7480	return em;
 7481}
 7482
 7483/*
 7484 * For release_folio() and invalidate_folio() we have a race window where
 7485 * folio_end_writeback() is called but the subpage spinlock is not yet released.
 7486 * If we continue to release/invalidate the page, we could cause use-after-free
 7487 * for subpage spinlock.  So this function is to spin and wait for subpage
 7488 * spinlock.
 7489 */
 7490static void wait_subpage_spinlock(struct folio *folio)
 7491{
 7492	struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
 7493	struct btrfs_folio_state *bfs;
 7494
 7495	if (!btrfs_is_subpage(fs_info, folio))
 7496		return;
 7497
 7498	ASSERT(folio_test_private(folio) && folio_get_private(folio));
 7499	bfs = folio_get_private(folio);
 7500
 7501	/*
 7502	 * This may look insane as we just acquire the spinlock and release it,
 7503	 * without doing anything.  But we just want to make sure no one is
 7504	 * still holding the subpage spinlock.
 7505	 * And since the page is not dirty nor writeback, and we have page
 7506	 * locked, the only possible way to hold a spinlock is from the endio
 7507	 * function to clear page writeback.
 7508	 *
 7509	 * Here we just acquire the spinlock so that all existing callers
 7510	 * should exit and we're safe to release/invalidate the page.
 7511	 */
 7512	spin_lock_irq(&bfs->lock);
 7513	spin_unlock_irq(&bfs->lock);
 7514}
 7515
 7516static int btrfs_launder_folio(struct folio *folio)
 7517{
 7518	return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
 7519				      folio_size(folio), NULL);
 7520}
 7521
 7522static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
 7523{
 7524	if (try_release_extent_mapping(folio, gfp_flags)) {
 7525		wait_subpage_spinlock(folio);
 7526		clear_folio_extent_mapped(folio);
 7527		return true;
 7528	}
 7529	return false;
 7530}
 7531
 7532static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
 7533{
 7534	if (folio_test_writeback(folio) || folio_test_dirty(folio))
 7535		return false;
 7536	return __btrfs_release_folio(folio, gfp_flags);
 7537}
 7538
 7539#ifdef CONFIG_MIGRATION
 7540static int btrfs_migrate_folio(struct address_space *mapping,
 7541			     struct folio *dst, struct folio *src,
 7542			     enum migrate_mode mode)
 7543{
 7544	int ret = filemap_migrate_folio(mapping, dst, src, mode);
 7545
 7546	if (ret)
 7547		return ret;
 7548
 7549	if (folio_test_ordered(src)) {
 7550		folio_clear_ordered(src);
 7551		folio_set_ordered(dst);
 7552	}
 7553
 7554	return 0;
 7555}
 7556#else
 7557#define btrfs_migrate_folio NULL
 7558#endif
 7559
 7560static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
 7561				 size_t length)
 7562{
 7563	struct btrfs_inode *inode = folio_to_inode(folio);
 7564	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 7565	struct extent_io_tree *tree = &inode->io_tree;
 7566	struct extent_state *cached_state = NULL;
 7567	u64 page_start = folio_pos(folio);
 7568	u64 page_end = page_start + folio_size(folio) - 1;
 7569	u64 cur;
 7570	int inode_evicting = inode_state_read_once(&inode->vfs_inode) & I_FREEING;
 7571
 7572	/*
 7573	 * We have folio locked so no new ordered extent can be created on this
 7574	 * page, nor bio can be submitted for this folio.
 7575	 *
 7576	 * But already submitted bio can still be finished on this folio.
 7577	 * Furthermore, endio function won't skip folio which has Ordered
 7578	 * already cleared, so it's possible for endio and
 7579	 * invalidate_folio to do the same ordered extent accounting twice
 7580	 * on one folio.
 7581	 *
 7582	 * So here we wait for any submitted bios to finish, so that we won't
 7583	 * do double ordered extent accounting on the same folio.
 7584	 */
 7585	folio_wait_writeback(folio);
 7586	wait_subpage_spinlock(folio);
 7587
 7588	/*
 7589	 * For subpage case, we have call sites like
 7590	 * btrfs_punch_hole_lock_range() which passes range not aligned to
 7591	 * sectorsize.
 7592	 * If the range doesn't cover the full folio, we don't need to and
 7593	 * shouldn't clear page extent mapped, as folio->private can still
 7594	 * record subpage dirty bits for other part of the range.
 7595	 *
 7596	 * For cases that invalidate the full folio even the range doesn't
 7597	 * cover the full folio, like invalidating the last folio, we're
 7598	 * still safe to wait for ordered extent to finish.
 7599	 */
 7600	if (!(offset == 0 && length == folio_size(folio))) {
 7601		btrfs_release_folio(folio, GFP_NOFS);
 7602		return;
 7603	}
 7604
 7605	if (!inode_evicting)
 7606		btrfs_lock_extent(tree, page_start, page_end, &cached_state);
 7607
 7608	cur = page_start;
 7609	while (cur < page_end) {
 7610		struct btrfs_ordered_extent *ordered;
 7611		u64 range_end;
 7612		u32 range_len;
 7613		u32 extra_flags = 0;
 7614
 7615		ordered = btrfs_lookup_first_ordered_range(inode, cur,
 7616							   page_end + 1 - cur);
 7617		if (!ordered) {
 7618			range_end = page_end;
 7619			/*
 7620			 * No ordered extent covering this range, we are safe
 7621			 * to delete all extent states in the range.
 7622			 */
 7623			extra_flags = EXTENT_CLEAR_ALL_BITS;
 7624			goto next;
 7625		}
 7626		if (ordered->file_offset > cur) {
 7627			/*
 7628			 * There is a range between [cur, oe->file_offset) not
 7629			 * covered by any ordered extent.
 7630			 * We are safe to delete all extent states, and handle
 7631			 * the ordered extent in the next iteration.
 7632			 */
 7633			range_end = ordered->file_offset - 1;
 7634			extra_flags = EXTENT_CLEAR_ALL_BITS;
 7635			goto next;
 7636		}
 7637
 7638		range_end = min(ordered->file_offset + ordered->num_bytes - 1,
 7639				page_end);
 7640		ASSERT(range_end + 1 - cur < U32_MAX);
 7641		range_len = range_end + 1 - cur;
 7642		if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
 7643			/*
 7644			 * If Ordered is cleared, it means endio has
 7645			 * already been executed for the range.
 7646			 * We can't delete the extent states as
 7647			 * btrfs_finish_ordered_io() may still use some of them.
 7648			 */
 7649			goto next;
 7650		}
 7651		btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
 7652
 7653		/*
 7654		 * IO on this page will never be started, so we need to account
 7655		 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
 7656		 * here, must leave that up for the ordered extent completion.
 7657		 *
 7658		 * This will also unlock the range for incoming
 7659		 * btrfs_finish_ordered_io().
 7660		 */
 7661		if (!inode_evicting)
 7662			btrfs_clear_extent_bit(tree, cur, range_end,
 7663					       EXTENT_DELALLOC |
 7664					       EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
 7665					       EXTENT_DEFRAG, &cached_state);
 7666
 7667		spin_lock(&inode->ordered_tree_lock);
 7668		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
 7669		ordered->truncated_len = min(ordered->truncated_len,
 7670					     cur - ordered->file_offset);
 7671		spin_unlock(&inode->ordered_tree_lock);
 7672
 7673		/*
 7674		 * If the ordered extent has finished, we're safe to delete all
 7675		 * the extent states of the range, otherwise
 7676		 * btrfs_finish_ordered_io() will get executed by endio for
 7677		 * other pages, so we can't delete extent states.
 7678		 */
 7679		if (btrfs_dec_test_ordered_pending(inode, &ordered,
 7680						   cur, range_end + 1 - cur)) {
 7681			btrfs_finish_ordered_io(ordered);
 7682			/*
 7683			 * The ordered extent has finished, now we're again
 7684			 * safe to delete all extent states of the range.
 7685			 */
 7686			extra_flags = EXTENT_CLEAR_ALL_BITS;
 7687		}
 7688next:
 7689		if (ordered)
 7690			btrfs_put_ordered_extent(ordered);
 7691		/*
 7692		 * Qgroup reserved space handler
 7693		 * Sector(s) here will be either:
 7694		 *
 7695		 * 1) Already written to disk or bio already finished
 7696		 *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
 7697		 *    Qgroup will be handled by its qgroup_record then.
 7698		 *    btrfs_qgroup_free_data() call will do nothing here.
 7699		 *
 7700		 * 2) Not written to disk yet
 7701		 *    Then btrfs_qgroup_free_data() call will clear the
 7702		 *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
 7703		 *    reserved data space.
 7704		 *    Since the IO will never happen for this page.
 7705		 */
 7706		btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
 7707		if (!inode_evicting)
 7708			btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
 7709					       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
 7710					       EXTENT_DEFRAG | extra_flags,
 7711					       &cached_state);
 7712		cur = range_end + 1;
 7713	}
 7714	/*
 7715	 * We have iterated through all ordered extents of the page, the page
 7716	 * should not have Ordered anymore, or the above iteration
 7717	 * did something wrong.
 7718	 */
 7719	ASSERT(!folio_test_ordered(folio));
 7720	btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
 7721	if (!inode_evicting)
 7722		__btrfs_release_folio(folio, GFP_NOFS);
 7723	clear_folio_extent_mapped(folio);
 7724}
 7725
 7726static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
 7727{
 7728	struct btrfs_truncate_control control = {
 7729		.inode = inode,
 7730		.ino = btrfs_ino(inode),
 7731		.min_type = BTRFS_EXTENT_DATA_KEY,
 7732		.clear_extent_range = true,
 7733		.new_size = inode->vfs_inode.i_size,
 7734	};
 7735	struct btrfs_root *root = inode->root;
 7736	struct btrfs_fs_info *fs_info = root->fs_info;
 7737	struct btrfs_block_rsv rsv;
 7738	int ret;
 7739	struct btrfs_trans_handle *trans;
 7740	const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
 7741	const u64 lock_start = round_down(inode->vfs_inode.i_size, fs_info->sectorsize);
 7742	const u64 i_size_up = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
 7743
 7744	/* Our inode is locked and the i_size can't be changed concurrently. */
 7745	btrfs_assert_inode_locked(inode);
 7746
 7747	if (!skip_writeback) {
 7748		ret = btrfs_wait_ordered_range(inode, lock_start, (u64)-1);
 7749		if (ret)
 7750			return ret;
 7751	}
 7752
 7753	/*
 7754	 * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
 7755	 * things going on here:
 7756	 *
 7757	 * 1) We need to reserve space to update our inode.
 7758	 *
 7759	 * 2) We need to have something to cache all the space that is going to
 7760	 * be free'd up by the truncate operation, but also have some slack
 7761	 * space reserved in case it uses space during the truncate (thank you
 7762	 * very much snapshotting).
 7763	 *
 7764	 * And we need these to be separate.  The fact is we can use a lot of
 7765	 * space doing the truncate, and we have no earthly idea how much space
 7766	 * we will use, so we need the truncate reservation to be separate so it
 7767	 * doesn't end up using space reserved for updating the inode.  We also
 7768	 * need to be able to stop the transaction and start a new one, which
 7769	 * means we need to be able to update the inode several times, and we
 7770	 * have no idea of knowing how many times that will be, so we can't just
 7771	 * reserve 1 item for the entirety of the operation, so that has to be
 7772	 * done separately as well.
 7773	 *
 7774	 * So that leaves us with
 7775	 *
 7776	 * 1) rsv - for the truncate reservation, which we will steal from the
 7777	 * transaction reservation.
 7778	 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
 7779	 * updating the inode.
 7780	 */
 7781	btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
 7782	rsv.size = min_size;
 7783	rsv.failfast = true;
 7784
 7785	/*
 7786	 * 1 for the truncate slack space
 7787	 * 1 for updating the inode.
 7788	 */
 7789	trans = btrfs_start_transaction(root, 2);
 7790	if (IS_ERR(trans)) {
 7791		ret = PTR_ERR(trans);
 7792		goto out;
 7793	}
 7794
 7795	/* Migrate the slack space for the truncate to our reserve */
 7796	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
 7797				      min_size, false);
 7798	/*
 7799	 * We have reserved 2 metadata units when we started the transaction and
 7800	 * min_size matches 1 unit, so this should never fail, but if it does,
 7801	 * it's not critical we just fail truncation.
 7802	 */
 7803	if (WARN_ON(ret)) {
 7804		btrfs_end_transaction(trans);
 7805		goto out;
 7806	}
 7807
 7808	trans->block_rsv = &rsv;
 7809
 7810	while (1) {
 7811		struct extent_state *cached_state = NULL;
 7812
 7813		btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
 7814		/*
 7815		 * We want to drop from the next block forward in case this new
 7816		 * size is not block aligned since we will be keeping the last
 7817		 * block of the extent just the way it is.
 7818		 */
 7819		btrfs_drop_extent_map_range(inode, i_size_up, (u64)-1, false);
 7820
 7821		ret = btrfs_truncate_inode_items(trans, root, &control);
 7822
 7823		inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
 7824		btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
 7825
 7826		btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
 7827
 7828		trans->block_rsv = &fs_info->trans_block_rsv;
 7829		if (ret != -ENOSPC && ret != -EAGAIN)
 7830			break;
 7831
 7832		ret = btrfs_update_inode(trans, inode);
 7833		if (ret)
 7834			break;
 7835
 7836		btrfs_end_transaction(trans);
 7837		btrfs_btree_balance_dirty(fs_info);
 7838
 7839		trans = btrfs_start_transaction(root, 2);
 7840		if (IS_ERR(trans)) {
 7841			ret = PTR_ERR(trans);
 7842			trans = NULL;
 7843			break;
 7844		}
 7845
 7846		btrfs_block_rsv_release(fs_info, &rsv, -1, NULL);
 7847		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
 7848					      &rsv, min_size, false);
 7849		/*
 7850		 * We have reserved 2 metadata units when we started the
 7851		 * transaction and min_size matches 1 unit, so this should never
 7852		 * fail, but if it does, it's not critical we just fail truncation.
 7853		 */
 7854		if (WARN_ON(ret))
 7855			break;
 7856
 7857		trans->block_rsv = &rsv;
 7858	}
 7859
 7860	/*
 7861	 * We can't call btrfs_truncate_block inside a trans handle as we could
 7862	 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
 7863	 * know we've truncated everything except the last little bit, and can
 7864	 * do btrfs_truncate_block and then update the disk_i_size.
 7865	 */
 7866	if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
 7867		btrfs_end_transaction(trans);
 7868		btrfs_btree_balance_dirty(fs_info);
 7869
 7870		ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size,
 7871					   inode->vfs_inode.i_size, (u64)-1);
 7872		if (ret)
 7873			goto out;
 7874		trans = btrfs_start_transaction(root, 1);
 7875		if (IS_ERR(trans)) {
 7876			ret = PTR_ERR(trans);
 7877			goto out;
 7878		}
 7879		btrfs_inode_safe_disk_i_size_write(inode, 0);
 7880	}
 7881
 7882	if (trans) {
 7883		int ret2;
 7884
 7885		trans->block_rsv = &fs_info->trans_block_rsv;
 7886		ret2 = btrfs_update_inode(trans, inode);
 7887		if (ret2 && !ret)
 7888			ret = ret2;
 7889
 7890		ret2 = btrfs_end_transaction(trans);
 7891		if (ret2 && !ret)
 7892			ret = ret2;
 7893		btrfs_btree_balance_dirty(fs_info);
 7894	}
 7895out:
 7896	btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
 7897	/*
 7898	 * So if we truncate and then write and fsync we normally would just
 7899	 * write the extents that changed, which is a problem if we need to
 7900	 * first truncate that entire inode.  So set this flag so we write out
 7901	 * all of the extents in the inode to the sync log so we're completely
 7902	 * safe.
 7903	 *
 7904	 * If no extents were dropped or trimmed we don't need to force the next
 7905	 * fsync to truncate all the inode's items from the log and re-log them
 7906	 * all. This means the truncate operation did not change the file size,
 7907	 * or changed it to a smaller size but there was only an implicit hole
 7908	 * between the old i_size and the new i_size, and there were no prealloc
 7909	 * extents beyond i_size to drop.
 7910	 */
 7911	if (control.extents_found > 0)
 7912		btrfs_set_inode_full_sync(inode);
 7913
 7914	return ret;
 7915}
 7916
 7917struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
 7918				     struct inode *dir)
 7919{
 7920	struct inode *inode;
 7921
 7922	inode = new_inode(dir->i_sb);
 7923	if (inode) {
 7924		/*
 7925		 * Subvolumes don't inherit the sgid bit or the parent's gid if
 7926		 * the parent's sgid bit is set. This is probably a bug.
 7927		 */
 7928		inode_init_owner(idmap, inode, NULL,
 7929				 S_IFDIR | (~current_umask() & S_IRWXUGO));
 7930		inode->i_op = &btrfs_dir_inode_operations;
 7931		inode->i_fop = &btrfs_dir_file_operations;
 7932	}
 7933	return inode;
 7934}
 7935
 7936struct inode *btrfs_alloc_inode(struct super_block *sb)
 7937{
 7938	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 7939	struct btrfs_inode *ei;
 7940	struct inode *inode;
 7941
 7942	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
 7943	if (!ei)
 7944		return NULL;
 7945
 7946	ei->root = NULL;
 7947	ei->generation = 0;
 7948	ei->last_trans = 0;
 7949	ei->last_sub_trans = 0;
 7950	ei->logged_trans = 0;
 7951	ei->delalloc_bytes = 0;
 7952	/* new_delalloc_bytes and last_dir_index_offset are in a union. */
 7953	ei->new_delalloc_bytes = 0;
 7954	ei->defrag_bytes = 0;
 7955	ei->disk_i_size = 0;
 7956	ei->flags = 0;
 7957	ei->ro_flags = 0;
 7958	/*
 7959	 * ->index_cnt will be properly initialized later when creating a new
 7960	 * inode (btrfs_create_new_inode()) or when reading an existing inode
 7961	 * from disk (btrfs_read_locked_inode()).
 7962	 */
 7963	ei->csum_bytes = 0;
 7964	ei->dir_index = 0;
 7965	ei->last_unlink_trans = 0;
 7966	ei->last_reflink_trans = 0;
 7967	ei->last_log_commit = 0;
 7968
 7969	spin_lock_init(&ei->lock);
 7970	ei->outstanding_extents = 0;
 7971	if (sb->s_magic != BTRFS_TEST_MAGIC)
 7972		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
 7973					      BTRFS_BLOCK_RSV_DELALLOC);
 7974	ei->runtime_flags = 0;
 7975	ei->prop_compress = BTRFS_COMPRESS_NONE;
 7976	ei->defrag_compress = BTRFS_COMPRESS_NONE;
 7977
 7978	ei->delayed_node = NULL;
 7979
 7980	ei->i_otime_sec = 0;
 7981	ei->i_otime_nsec = 0;
 7982
 7983	inode = &ei->vfs_inode;
 7984	btrfs_extent_map_tree_init(&ei->extent_tree);
 7985
 7986	/* This io tree sets the valid inode. */
 7987	btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
 7988	ei->io_tree.inode = ei;
 7989
 7990	ei->file_extent_tree = NULL;
 7991
 7992	mutex_init(&ei->log_mutex);
 7993	spin_lock_init(&ei->ordered_tree_lock);
 7994	ei->ordered_tree = RB_ROOT;
 7995	ei->ordered_tree_last = NULL;
 7996	INIT_LIST_HEAD(&ei->delalloc_inodes);
 7997	INIT_LIST_HEAD(&ei->delayed_iput);
 7998	init_rwsem(&ei->i_mmap_lock);
 7999
 8000	return inode;
 8001}
 8002
 8003#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 8004void btrfs_test_destroy_inode(struct inode *inode)
 8005{
 8006	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
 8007	kfree(BTRFS_I(inode)->file_extent_tree);
 8008	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 8009}
 8010#endif
 8011
 8012void btrfs_free_inode(struct inode *inode)
 8013{
 8014	kfree(BTRFS_I(inode)->file_extent_tree);
 8015	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 8016}
 8017
 8018void btrfs_destroy_inode(struct inode *vfs_inode)
 8019{
 8020	struct btrfs_ordered_extent *ordered;
 8021	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
 8022	struct btrfs_root *root = inode->root;
 8023	bool freespace_inode;
 8024
 8025	WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
 8026	WARN_ON(vfs_inode->i_data.nrpages);
 8027	WARN_ON(inode->block_rsv.reserved);
 8028	WARN_ON(inode->block_rsv.size);
 8029	WARN_ON(inode->outstanding_extents);
 8030	if (!S_ISDIR(vfs_inode->i_mode)) {
 8031		WARN_ON(inode->delalloc_bytes);
 8032		WARN_ON(inode->new_delalloc_bytes);
 8033		WARN_ON(inode->csum_bytes);
 8034	}
 8035	if (!root || !btrfs_is_data_reloc_root(root))
 8036		WARN_ON(inode->defrag_bytes);
 8037
 8038	/*
 8039	 * This can happen where we create an inode, but somebody else also
 8040	 * created the same inode and we need to destroy the one we already
 8041	 * created.
 8042	 */
 8043	if (!root)
 8044		return;
 8045
 8046	/*
 8047	 * If this is a free space inode do not take the ordered extents lockdep
 8048	 * map.
 8049	 */
 8050	freespace_inode = btrfs_is_free_space_inode(inode);
 8051
 8052	while (1) {
 8053		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 8054		if (!ordered)
 8055			break;
 8056		else {
 8057			btrfs_err(root->fs_info,
 8058				  "found ordered extent %llu %llu on inode cleanup",
 8059				  ordered->file_offset, ordered->num_bytes);
 8060
 8061			if (!freespace_inode)
 8062				btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
 8063
 8064			btrfs_remove_ordered_extent(inode, ordered);
 8065			btrfs_put_ordered_extent(ordered);
 8066			btrfs_put_ordered_extent(ordered);
 8067		}
 8068	}
 8069	btrfs_qgroup_check_reserved_leak(inode);
 8070	btrfs_del_inode_from_root(inode);
 8071	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
 8072	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
 8073	btrfs_put_root(inode->root);
 8074}
 8075
 8076int btrfs_drop_inode(struct inode *inode)
 8077{
 8078	struct btrfs_root *root = BTRFS_I(inode)->root;
 8079
 8080	if (root == NULL)
 8081		return 1;
 8082
 8083	/* the snap/subvol tree is on deleting */
 8084	if (btrfs_root_refs(&root->root_item) == 0)
 8085		return 1;
 8086	else
 8087		return inode_generic_drop(inode);
 8088}
 8089
 8090static void init_once(void *foo)
 8091{
 8092	struct btrfs_inode *ei = foo;
 8093
 8094	inode_init_once(&ei->vfs_inode);
 8095#ifdef CONFIG_FS_VERITY
 8096	ei->i_verity_info = NULL;
 8097#endif
 8098}
 8099
 8100void __cold btrfs_destroy_cachep(void)
 8101{
 8102	/*
 8103	 * Make sure all delayed rcu free inodes are flushed before we
 8104	 * destroy cache.
 8105	 */
 8106	rcu_barrier();
 8107	kmem_cache_destroy(btrfs_inode_cachep);
 8108}
 8109
 8110int __init btrfs_init_cachep(void)
 8111{
 8112	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
 8113			sizeof(struct btrfs_inode), 0,
 8114			SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
 8115			init_once);
 8116	if (!btrfs_inode_cachep)
 8117		return -ENOMEM;
 8118
 8119	return 0;
 8120}
 8121
 8122static int btrfs_getattr(struct mnt_idmap *idmap,
 8123			 const struct path *path, struct kstat *stat,
 8124			 u32 request_mask, unsigned int flags)
 8125{
 8126	u64 delalloc_bytes;
 8127	u64 inode_bytes;
 8128	struct inode *inode = d_inode(path->dentry);
 8129	u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize;
 8130	u32 bi_flags = BTRFS_I(inode)->flags;
 8131	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
 8132
 8133	stat->result_mask |= STATX_BTIME;
 8134	stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
 8135	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
 8136	if (bi_flags & BTRFS_INODE_APPEND)
 8137		stat->attributes |= STATX_ATTR_APPEND;
 8138	if (bi_flags & BTRFS_INODE_COMPRESS)
 8139		stat->attributes |= STATX_ATTR_COMPRESSED;
 8140	if (bi_flags & BTRFS_INODE_IMMUTABLE)
 8141		stat->attributes |= STATX_ATTR_IMMUTABLE;
 8142	if (bi_flags & BTRFS_INODE_NODUMP)
 8143		stat->attributes |= STATX_ATTR_NODUMP;
 8144	if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
 8145		stat->attributes |= STATX_ATTR_VERITY;
 8146
 8147	stat->attributes_mask |= (STATX_ATTR_APPEND |
 8148				  STATX_ATTR_COMPRESSED |
 8149				  STATX_ATTR_IMMUTABLE |
 8150				  STATX_ATTR_NODUMP);
 8151
 8152	generic_fillattr(idmap, request_mask, inode, stat);
 8153	stat->dev = BTRFS_I(inode)->root->anon_dev;
 8154
 8155	stat->subvol = btrfs_root_id(BTRFS_I(inode)->root);
 8156	stat->result_mask |= STATX_SUBVOL;
 8157
 8158	spin_lock(&BTRFS_I(inode)->lock);
 8159	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
 8160	inode_bytes = inode_get_bytes(inode);
 8161	spin_unlock(&BTRFS_I(inode)->lock);
 8162	stat->blocks = (ALIGN(inode_bytes, blocksize) +
 8163			ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
 8164	return 0;
 8165}
 8166
 8167static int btrfs_rename_exchange(struct inode *old_dir,
 8168			      struct dentry *old_dentry,
 8169			      struct inode *new_dir,
 8170			      struct dentry *new_dentry)
 8171{
 8172	struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
 8173	struct btrfs_trans_handle *trans;
 8174	unsigned int trans_num_items;
 8175	struct btrfs_root *root = BTRFS_I(old_dir)->root;
 8176	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
 8177	struct inode *new_inode = new_dentry->d_inode;
 8178	struct inode *old_inode = old_dentry->d_inode;
 8179	struct btrfs_rename_ctx old_rename_ctx;
 8180	struct btrfs_rename_ctx new_rename_ctx;
 8181	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
 8182	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
 8183	u64 old_idx = 0;
 8184	u64 new_idx = 0;
 8185	int ret;
 8186	int ret2;
 8187	bool need_abort = false;
 8188	bool logs_pinned = false;
 8189	struct fscrypt_name old_fname, new_fname;
 8190	struct fscrypt_str *old_name, *new_name;
 8191
 8192	/*
 8193	 * For non-subvolumes allow exchange only within one subvolume, in the
 8194	 * same inode namespace. Two subvolumes (represented as directory) can
 8195	 * be exchanged as they're a logical link and have a fixed inode number.
 8196	 */
 8197	if (root != dest &&
 8198	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
 8199	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
 8200		return -EXDEV;
 8201
 8202	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
 8203	if (ret)
 8204		return ret;
 8205
 8206	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
 8207	if (ret) {
 8208		fscrypt_free_filename(&old_fname);
 8209		return ret;
 8210	}
 8211
 8212	old_name = &old_fname.disk_name;
 8213	new_name = &new_fname.disk_name;
 8214
 8215	/* close the race window with snapshot create/destroy ioctl */
 8216	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
 8217	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
 8218		down_read(&fs_info->subvol_sem);
 8219
 8220	/*
 8221	 * For each inode:
 8222	 * 1 to remove old dir item
 8223	 * 1 to remove old dir index
 8224	 * 1 to add new dir item
 8225	 * 1 to add new dir index
 8226	 * 1 to update parent inode
 8227	 *
 8228	 * If the parents are the same, we only need to account for one
 8229	 */
 8230	trans_num_items = (old_dir == new_dir ? 9 : 10);
 8231	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
 8232		/*
 8233		 * 1 to remove old root ref
 8234		 * 1 to remove old root backref
 8235		 * 1 to add new root ref
 8236		 * 1 to add new root backref
 8237		 */
 8238		trans_num_items += 4;
 8239	} else {
 8240		/*
 8241		 * 1 to update inode item
 8242		 * 1 to remove old inode ref
 8243		 * 1 to add new inode ref
 8244		 */
 8245		trans_num_items += 3;
 8246	}
 8247	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
 8248		trans_num_items += 4;
 8249	else
 8250		trans_num_items += 3;
 8251	trans = btrfs_start_transaction(root, trans_num_items);
 8252	if (IS_ERR(trans)) {
 8253		ret = PTR_ERR(trans);
 8254		goto out_notrans;
 8255	}
 8256
 8257	if (dest != root) {
 8258		ret = btrfs_record_root_in_trans(trans, dest);
 8259		if (ret)
 8260			goto out_fail;
 8261	}
 8262
 8263	/*
 8264	 * We need to find a free sequence number both in the source and
 8265	 * in the destination directory for the exchange.
 8266	 */
 8267	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
 8268	if (ret)
 8269		goto out_fail;
 8270	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
 8271	if (ret)
 8272		goto out_fail;
 8273
 8274	BTRFS_I(old_inode)->dir_index = 0ULL;
 8275	BTRFS_I(new_inode)->dir_index = 0ULL;
 8276
 8277	/* Reference for the source. */
 8278	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
 8279		/* force full log commit if subvolume involved. */
 8280		btrfs_set_log_full_commit(trans);
 8281	} else {
 8282		ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
 8283					     btrfs_ino(BTRFS_I(new_dir)),
 8284					     old_idx);
 8285		if (ret)
 8286			goto out_fail;
 8287		need_abort = true;
 8288	}
 8289
 8290	/* And now for the dest. */
 8291	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
 8292		/* force full log commit if subvolume involved. */
 8293		btrfs_set_log_full_commit(trans);
 8294	} else {
 8295		ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
 8296					     btrfs_ino(BTRFS_I(old_dir)),
 8297					     new_idx);
 8298		if (ret) {
 8299			if (unlikely(need_abort))
 8300				btrfs_abort_transaction(trans, ret);
 8301			goto out_fail;
 8302		}
 8303	}
 8304
 8305	/* Update inode version and ctime/mtime. */
 8306	inode_inc_iversion(old_dir);
 8307	inode_inc_iversion(new_dir);
 8308	inode_inc_iversion(old_inode);
 8309	inode_inc_iversion(new_inode);
 8310	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
 8311
 8312	if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
 8313	    new_ino != BTRFS_FIRST_FREE_OBJECTID) {
 8314		/*
 8315		 * If we are renaming in the same directory (and it's not for
 8316		 * root entries) pin the log early to prevent any concurrent
 8317		 * task from logging the directory after we removed the old
 8318		 * entries and before we add the new entries, otherwise that
 8319		 * task can sync a log without any entry for the inodes we are
 8320		 * renaming and therefore replaying that log, if a power failure
 8321		 * happens after syncing the log, would result in deleting the
 8322		 * inodes.
 8323		 *
 8324		 * If the rename affects two different directories, we want to
 8325		 * make sure that there's no log commit that contains
 8326		 * updates for only one of the directories but not for the
 8327		 * other.
 8328		 *
 8329		 * If we are renaming an entry for a root, we don't care about
 8330		 * log updates since we called btrfs_set_log_full_commit().
 8331		 */
 8332		btrfs_pin_log_trans(root);
 8333		btrfs_pin_log_trans(dest);
 8334		logs_pinned = true;
 8335	}
 8336
 8337	if (old_dentry->d_parent != new_dentry->d_parent) {
 8338		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
 8339					BTRFS_I(old_inode), true);
 8340		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
 8341					BTRFS_I(new_inode), true);
 8342	}
 8343
 8344	/* src is a subvolume */
 8345	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
 8346		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
 8347		if (unlikely(ret)) {
 8348			btrfs_abort_transaction(trans, ret);
 8349			goto out_fail;
 8350		}
 8351	} else { /* src is an inode */
 8352		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
 8353					   BTRFS_I(old_dentry->d_inode),
 8354					   old_name, &old_rename_ctx);
 8355		if (unlikely(ret)) {
 8356			btrfs_abort_transaction(trans, ret);
 8357			goto out_fail;
 8358		}
 8359		ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
 8360		if (unlikely(ret)) {
 8361			btrfs_abort_transaction(trans, ret);
 8362			goto out_fail;
 8363		}
 8364	}
 8365
 8366	/* dest is a subvolume */
 8367	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
 8368		ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
 8369		if (unlikely(ret)) {
 8370			btrfs_abort_transaction(trans, ret);
 8371			goto out_fail;
 8372		}
 8373	} else { /* dest is an inode */
 8374		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
 8375					   BTRFS_I(new_dentry->d_inode),
 8376					   new_name, &new_rename_ctx);
 8377		if (unlikely(ret)) {
 8378			btrfs_abort_transaction(trans, ret);
 8379			goto out_fail;
 8380		}
 8381		ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
 8382		if (unlikely(ret)) {
 8383			btrfs_abort_transaction(trans, ret);
 8384			goto out_fail;
 8385		}
 8386	}
 8387
 8388	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
 8389			     new_name, 0, old_idx);
 8390	if (unlikely(ret)) {
 8391		btrfs_abort_transaction(trans, ret);
 8392		goto out_fail;
 8393	}
 8394
 8395	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
 8396			     old_name, 0, new_idx);
 8397	if (unlikely(ret)) {
 8398		btrfs_abort_transaction(trans, ret);
 8399		goto out_fail;
 8400	}
 8401
 8402	if (old_inode->i_nlink == 1)
 8403		BTRFS_I(old_inode)->dir_index = old_idx;
 8404	if (new_inode->i_nlink == 1)
 8405		BTRFS_I(new_inode)->dir_index = new_idx;
 8406
 8407	/*
 8408	 * Do the log updates for all inodes.
 8409	 *
 8410	 * If either entry is for a root we don't need to update the logs since
 8411	 * we've called btrfs_set_log_full_commit() before.
 8412	 */
 8413	if (logs_pinned) {
 8414		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
 8415				   old_rename_ctx.index, new_dentry->d_parent);
 8416		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
 8417				   new_rename_ctx.index, old_dentry->d_parent);
 8418	}
 8419
 8420out_fail:
 8421	if (logs_pinned) {
 8422		btrfs_end_log_trans(root);
 8423		btrfs_end_log_trans(dest);
 8424	}
 8425	ret2 = btrfs_end_transaction(trans);
 8426	ret = ret ? ret : ret2;
 8427out_notrans:
 8428	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
 8429	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
 8430		up_read(&fs_info->subvol_sem);
 8431
 8432	fscrypt_free_filename(&new_fname);
 8433	fscrypt_free_filename(&old_fname);
 8434	return ret;
 8435}
 8436
 8437static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
 8438					struct inode *dir)
 8439{
 8440	struct inode *inode;
 8441
 8442	inode = new_inode(dir->i_sb);
 8443	if (inode) {
 8444		inode_init_owner(idmap, inode, dir,
 8445				 S_IFCHR | WHITEOUT_MODE);
 8446		inode->i_op = &btrfs_special_inode_operations;
 8447		init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
 8448	}
 8449	return inode;
 8450}
 8451
 8452static int btrfs_rename(struct mnt_idmap *idmap,
 8453			struct inode *old_dir, struct dentry *old_dentry,
 8454			struct inode *new_dir, struct dentry *new_dentry,
 8455			unsigned int flags)
 8456{
 8457	struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
 8458	struct btrfs_new_inode_args whiteout_args = {
 8459		.dir = old_dir,
 8460		.dentry = old_dentry,
 8461	};
 8462	struct btrfs_trans_handle *trans;
 8463	unsigned int trans_num_items;
 8464	struct btrfs_root *root = BTRFS_I(old_dir)->root;
 8465	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
 8466	struct inode *new_inode = d_inode(new_dentry);
 8467	struct inode *old_inode = d_inode(old_dentry);
 8468	struct btrfs_rename_ctx rename_ctx;
 8469	u64 index = 0;
 8470	int ret;
 8471	int ret2;
 8472	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
 8473	struct fscrypt_name old_fname, new_fname;
 8474	bool logs_pinned = false;
 8475
 8476	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
 8477		return -EPERM;
 8478
 8479	/* we only allow rename subvolume link between subvolumes */
 8480	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
 8481		return -EXDEV;
 8482
 8483	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
 8484	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
 8485		return -ENOTEMPTY;
 8486
 8487	if (S_ISDIR(old_inode->i_mode) && new_inode &&
 8488	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 8489		return -ENOTEMPTY;
 8490
 8491	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
 8492	if (ret)
 8493		return ret;
 8494
 8495	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
 8496	if (ret) {
 8497		fscrypt_free_filename(&old_fname);
 8498		return ret;
 8499	}
 8500
 8501	/* check for collisions, even if the  name isn't there */
 8502	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
 8503	if (ret) {
 8504		if (ret == -EEXIST) {
 8505			/* we shouldn't get
 8506			 * eexist without a new_inode */
 8507			if (WARN_ON(!new_inode)) {
 8508				goto out_fscrypt_names;
 8509			}
 8510		} else {
 8511			/* maybe -EOVERFLOW */
 8512			goto out_fscrypt_names;
 8513		}
 8514	}
 8515	ret = 0;
 8516
 8517	/*
 8518	 * we're using rename to replace one file with another.  Start IO on it
 8519	 * now so  we don't add too much work to the end of the transaction
 8520	 */
 8521	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
 8522		filemap_flush(old_inode->i_mapping);
 8523
 8524	if (flags & RENAME_WHITEOUT) {
 8525		whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
 8526		if (!whiteout_args.inode) {
 8527			ret = -ENOMEM;
 8528			goto out_fscrypt_names;
 8529		}
 8530		ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
 8531		if (ret)
 8532			goto out_whiteout_inode;
 8533	} else {
 8534		/* 1 to update the old parent inode. */
 8535		trans_num_items = 1;
 8536	}
 8537
 8538	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
 8539		/* Close the race window with snapshot create/destroy ioctl */
 8540		down_read(&fs_info->subvol_sem);
 8541		/*
 8542		 * 1 to remove old root ref
 8543		 * 1 to remove old root backref
 8544		 * 1 to add new root ref
 8545		 * 1 to add new root backref
 8546		 */
 8547		trans_num_items += 4;
 8548	} else {
 8549		/*
 8550		 * 1 to update inode
 8551		 * 1 to remove old inode ref
 8552		 * 1 to add new inode ref
 8553		 */
 8554		trans_num_items += 3;
 8555	}
 8556	/*
 8557	 * 1 to remove old dir item
 8558	 * 1 to remove old dir index
 8559	 * 1 to add new dir item
 8560	 * 1 to add new dir index
 8561	 */
 8562	trans_num_items += 4;
 8563	/* 1 to update new parent inode if it's not the same as the old parent */
 8564	if (new_dir != old_dir)
 8565		trans_num_items++;
 8566	if (new_inode) {
 8567		/*
 8568		 * 1 to update inode
 8569		 * 1 to remove inode ref
 8570		 * 1 to remove dir item
 8571		 * 1 to remove dir index
 8572		 * 1 to possibly add orphan item
 8573		 */
 8574		trans_num_items += 5;
 8575	}
 8576	trans = btrfs_start_transaction(root, trans_num_items);
 8577	if (IS_ERR(trans)) {
 8578		ret = PTR_ERR(trans);
 8579		goto out_notrans;
 8580	}
 8581
 8582	if (dest != root) {
 8583		ret = btrfs_record_root_in_trans(trans, dest);
 8584		if (ret)
 8585			goto out_fail;
 8586	}
 8587
 8588	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
 8589	if (ret)
 8590		goto out_fail;
 8591
 8592	BTRFS_I(old_inode)->dir_index = 0ULL;
 8593	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
 8594		/* force full log commit if subvolume involved. */
 8595		btrfs_set_log_full_commit(trans);
 8596	} else {
 8597		ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
 8598					     old_ino, btrfs_ino(BTRFS_I(new_dir)),
 8599					     index);
 8600		if (ret)
 8601			goto out_fail;
 8602	}
 8603
 8604	inode_inc_iversion(old_dir);
 8605	inode_inc_iversion(new_dir);
 8606	inode_inc_iversion(old_inode);
 8607	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
 8608
 8609	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
 8610		/*
 8611		 * If we are renaming in the same directory (and it's not a
 8612		 * root entry) pin the log to prevent any concurrent task from
 8613		 * logging the directory after we removed the old entry and
 8614		 * before we add the new entry, otherwise that task can sync
 8615		 * a log without any entry for the inode we are renaming and
 8616		 * therefore replaying that log, if a power failure happens
 8617		 * after syncing the log, would result in deleting the inode.
 8618		 *
 8619		 * If the rename affects two different directories, we want to
 8620		 * make sure the that there's no log commit that contains
 8621		 * updates for only one of the directories but not for the
 8622		 * other.
 8623		 *
 8624		 * If we are renaming an entry for a root, we don't care about
 8625		 * log updates since we called btrfs_set_log_full_commit().
 8626		 */
 8627		btrfs_pin_log_trans(root);
 8628		btrfs_pin_log_trans(dest);
 8629		logs_pinned = true;
 8630	}
 8631
 8632	if (old_dentry->d_parent != new_dentry->d_parent)
 8633		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
 8634					BTRFS_I(old_inode), true);
 8635
 8636	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
 8637		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
 8638		if (unlikely(ret)) {
 8639			btrfs_abort_transaction(trans, ret);
 8640			goto out_fail;
 8641		}
 8642	} else {
 8643		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
 8644					   BTRFS_I(d_inode(old_dentry)),
 8645					   &old_fname.disk_name, &rename_ctx);
 8646		if (unlikely(ret)) {
 8647			btrfs_abort_transaction(trans, ret);
 8648			goto out_fail;
 8649		}
 8650		ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
 8651		if (unlikely(ret)) {
 8652			btrfs_abort_transaction(trans, ret);
 8653			goto out_fail;
 8654		}
 8655	}
 8656
 8657	if (new_inode) {
 8658		inode_inc_iversion(new_inode);
 8659		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
 8660			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
 8661			ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
 8662			if (unlikely(ret)) {
 8663				btrfs_abort_transaction(trans, ret);
 8664				goto out_fail;
 8665			}
 8666			BUG_ON(new_inode->i_nlink == 0);
 8667		} else {
 8668			ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
 8669						 BTRFS_I(d_inode(new_dentry)),
 8670						 &new_fname.disk_name);
 8671			if (unlikely(ret)) {
 8672				btrfs_abort_transaction(trans, ret);
 8673				goto out_fail;
 8674			}
 8675		}
 8676		if (new_inode->i_nlink == 0) {
 8677			ret = btrfs_orphan_add(trans,
 8678					BTRFS_I(d_inode(new_dentry)));
 8679			if (unlikely(ret)) {
 8680				btrfs_abort_transaction(trans, ret);
 8681				goto out_fail;
 8682			}
 8683		}
 8684	}
 8685
 8686	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
 8687			     &new_fname.disk_name, 0, index);
 8688	if (unlikely(ret)) {
 8689		btrfs_abort_transaction(trans, ret);
 8690		goto out_fail;
 8691	}
 8692
 8693	if (old_inode->i_nlink == 1)
 8694		BTRFS_I(old_inode)->dir_index = index;
 8695
 8696	if (logs_pinned)
 8697		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
 8698				   rename_ctx.index, new_dentry->d_parent);
 8699
 8700	if (flags & RENAME_WHITEOUT) {
 8701		ret = btrfs_create_new_inode(trans, &whiteout_args);
 8702		if (unlikely(ret)) {
 8703			btrfs_abort_transaction(trans, ret);
 8704			goto out_fail;
 8705		} else {
 8706			unlock_new_inode(whiteout_args.inode);
 8707			iput(whiteout_args.inode);
 8708			whiteout_args.inode = NULL;
 8709		}
 8710	}
 8711out_fail:
 8712	if (logs_pinned) {
 8713		btrfs_end_log_trans(root);
 8714		btrfs_end_log_trans(dest);
 8715	}
 8716	ret2 = btrfs_end_transaction(trans);
 8717	ret = ret ? ret : ret2;
 8718out_notrans:
 8719	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
 8720		up_read(&fs_info->subvol_sem);
 8721	if (flags & RENAME_WHITEOUT)
 8722		btrfs_new_inode_args_destroy(&whiteout_args);
 8723out_whiteout_inode:
 8724	if (flags & RENAME_WHITEOUT)
 8725		iput(whiteout_args.inode);
 8726out_fscrypt_names:
 8727	fscrypt_free_filename(&old_fname);
 8728	fscrypt_free_filename(&new_fname);
 8729	return ret;
 8730}
 8731
 8732static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
 8733			 struct dentry *old_dentry, struct inode *new_dir,
 8734			 struct dentry *new_dentry, unsigned int flags)
 8735{
 8736	int ret;
 8737
 8738	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 8739		return -EINVAL;
 8740
 8741	if (flags & RENAME_EXCHANGE)
 8742		ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
 8743					    new_dentry);
 8744	else
 8745		ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
 8746				   new_dentry, flags);
 8747
 8748	btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
 8749
 8750	return ret;
 8751}
 8752
 8753struct btrfs_delalloc_work {
 8754	struct inode *inode;
 8755	struct completion completion;
 8756	struct list_head list;
 8757	struct btrfs_work work;
 8758};
 8759
 8760static void btrfs_run_delalloc_work(struct btrfs_work *work)
 8761{
 8762	struct btrfs_delalloc_work *delalloc_work;
 8763	struct inode *inode;
 8764
 8765	delalloc_work = container_of(work, struct btrfs_delalloc_work,
 8766				     work);
 8767	inode = delalloc_work->inode;
 8768	filemap_flush(inode->i_mapping);
 8769	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
 8770				&BTRFS_I(inode)->runtime_flags))
 8771		filemap_flush(inode->i_mapping);
 8772
 8773	iput(inode);
 8774	complete(&delalloc_work->completion);
 8775}
 8776
 8777static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
 8778{
 8779	struct btrfs_delalloc_work *work;
 8780
 8781	work = kmalloc(sizeof(*work), GFP_NOFS);
 8782	if (!work)
 8783		return NULL;
 8784
 8785	init_completion(&work->completion);
 8786	INIT_LIST_HEAD(&work->list);
 8787	work->inode = inode;
 8788	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);
 8789
 8790	return work;
 8791}
 8792
 8793/*
 8794 * some fairly slow code that needs optimization. This walks the list
 8795 * of all the inodes with pending delalloc and forces them to disk.
 8796 */
 8797static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write,
 8798				 bool snapshot, bool in_reclaim_context)
 8799{
 8800	struct btrfs_delalloc_work *work, *next;
 8801	LIST_HEAD(works);
 8802	LIST_HEAD(splice);
 8803	int ret = 0;
 8804
 8805	mutex_lock(&root->delalloc_mutex);
 8806	spin_lock(&root->delalloc_lock);
 8807	list_splice_init(&root->delalloc_inodes, &splice);
 8808	while (!list_empty(&splice)) {
 8809		struct btrfs_inode *inode;
 8810		struct inode *tmp_inode;
 8811
 8812		inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes);
 8813
 8814		list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
 8815
 8816		if (in_reclaim_context &&
 8817		    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags))
 8818			continue;
 8819
 8820		tmp_inode = igrab(&inode->vfs_inode);
 8821		if (!tmp_inode) {
 8822			cond_resched_lock(&root->delalloc_lock);
 8823			continue;
 8824		}
 8825		spin_unlock(&root->delalloc_lock);
 8826
 8827		if (snapshot)
 8828			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags);
 8829		if (nr_to_write == NULL) {
 8830			work = btrfs_alloc_delalloc_work(tmp_inode);
 8831			if (!work) {
 8832				iput(tmp_inode);
 8833				ret = -ENOMEM;
 8834				goto out;
 8835			}
 8836			list_add_tail(&work->list, &works);
 8837			btrfs_queue_work(root->fs_info->flush_workers,
 8838					 &work->work);
 8839		} else {
 8840			ret = filemap_flush_nr(tmp_inode->i_mapping,
 8841					nr_to_write);
 8842			btrfs_add_delayed_iput(inode);
 8843
 8844			if (ret || *nr_to_write <= 0)
 8845				goto out;
 8846		}
 8847		cond_resched();
 8848		spin_lock(&root->delalloc_lock);
 8849	}
 8850	spin_unlock(&root->delalloc_lock);
 8851
 8852out:
 8853	list_for_each_entry_safe(work, next, &works, list) {
 8854		list_del_init(&work->list);
 8855		wait_for_completion(&work->completion);
 8856		kfree(work);
 8857	}
 8858
 8859	if (!list_empty(&splice)) {
 8860		spin_lock(&root->delalloc_lock);
 8861		list_splice_tail(&splice, &root->delalloc_inodes);
 8862		spin_unlock(&root->delalloc_lock);
 8863	}
 8864	mutex_unlock(&root->delalloc_mutex);
 8865	return ret;
 8866}
 8867
 8868int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
 8869{
 8870	struct btrfs_fs_info *fs_info = root->fs_info;
 8871
 8872	if (BTRFS_FS_ERROR(fs_info))
 8873		return -EROFS;
 8874	return start_delalloc_inodes(root, NULL, true, in_reclaim_context);
 8875}
 8876
 8877int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
 8878			       bool in_reclaim_context)
 8879{
 8880	long *nr_to_write = nr == LONG_MAX ? NULL : &nr;
 8881	struct btrfs_root *root;
 8882	LIST_HEAD(splice);
 8883	int ret;
 8884
 8885	if (BTRFS_FS_ERROR(fs_info))
 8886		return -EROFS;
 8887
 8888	mutex_lock(&fs_info->delalloc_root_mutex);
 8889	spin_lock(&fs_info->delalloc_root_lock);
 8890	list_splice_init(&fs_info->delalloc_roots, &splice);
 8891	while (!list_empty(&splice)) {
 8892		root = list_first_entry(&splice, struct btrfs_root,
 8893					delalloc_root);
 8894		root = btrfs_grab_root(root);
 8895		BUG_ON(!root);
 8896		list_move_tail(&root->delalloc_root,
 8897			       &fs_info->delalloc_roots);
 8898		spin_unlock(&fs_info->delalloc_root_lock);
 8899
 8900		ret = start_delalloc_inodes(root, nr_to_write, false,
 8901				in_reclaim_context);
 8902		btrfs_put_root(root);
 8903		if (ret < 0 || nr <= 0)
 8904			goto out;
 8905		spin_lock(&fs_info->delalloc_root_lock);
 8906	}
 8907	spin_unlock(&fs_info->delalloc_root_lock);
 8908
 8909	ret = 0;
 8910out:
 8911	if (!list_empty(&splice)) {
 8912		spin_lock(&fs_info->delalloc_root_lock);
 8913		list_splice_tail(&splice, &fs_info->delalloc_roots);
 8914		spin_unlock(&fs_info->delalloc_root_lock);
 8915	}
 8916	mutex_unlock(&fs_info->delalloc_root_mutex);
 8917	return ret;
 8918}
 8919
 8920static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 8921			 struct dentry *dentry, const char *symname)
 8922{
 8923	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
 8924	struct btrfs_trans_handle *trans;
 8925	struct btrfs_root *root = BTRFS_I(dir)->root;
 8926	struct btrfs_path *path;
 8927	struct btrfs_key key;
 8928	struct inode *inode;
 8929	struct btrfs_new_inode_args new_inode_args = {
 8930		.dir = dir,
 8931		.dentry = dentry,
 8932	};
 8933	unsigned int trans_num_items;
 8934	int ret;
 8935	int name_len;
 8936	int datasize;
 8937	unsigned long ptr;
 8938	struct btrfs_file_extent_item *ei;
 8939	struct extent_buffer *leaf;
 8940
 8941	name_len = strlen(symname);
 8942	/*
 8943	 * Symlinks utilize uncompressed inline extent data, which should not
 8944	 * reach block size.
 8945	 */
 8946	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
 8947	    name_len >= fs_info->sectorsize)
 8948		return -ENAMETOOLONG;
 8949
 8950	inode = new_inode(dir->i_sb);
 8951	if (!inode)
 8952		return -ENOMEM;
 8953	inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
 8954	inode->i_op = &btrfs_symlink_inode_operations;
 8955	inode_nohighmem(inode);
 8956	inode->i_mapping->a_ops = &btrfs_aops;
 8957	btrfs_i_size_write(BTRFS_I(inode), name_len);
 8958	inode_set_bytes(inode, name_len);
 8959
 8960	new_inode_args.inode = inode;
 8961	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
 8962	if (ret)
 8963		goto out_inode;
 8964	/* 1 additional item for the inline extent */
 8965	trans_num_items++;
 8966
 8967	trans = btrfs_start_transaction(root, trans_num_items);
 8968	if (IS_ERR(trans)) {
 8969		ret = PTR_ERR(trans);
 8970		goto out_new_inode_args;
 8971	}
 8972
 8973	ret = btrfs_create_new_inode(trans, &new_inode_args);
 8974	if (ret)
 8975		goto out;
 8976
 8977	path = btrfs_alloc_path();
 8978	if (unlikely(!path)) {
 8979		ret = -ENOMEM;
 8980		btrfs_abort_transaction(trans, ret);
 8981		discard_new_inode(inode);
 8982		inode = NULL;
 8983		goto out;
 8984	}
 8985	key.objectid = btrfs_ino(BTRFS_I(inode));
 8986	key.type = BTRFS_EXTENT_DATA_KEY;
 8987	key.offset = 0;
 8988	datasize = btrfs_file_extent_calc_inline_size(name_len);
 8989	ret = btrfs_insert_empty_item(trans, root, path, &key, datasize);
 8990	if (unlikely(ret)) {
 8991		btrfs_abort_transaction(trans, ret);
 8992		btrfs_free_path(path);
 8993		discard_new_inode(inode);
 8994		inode = NULL;
 8995		goto out;
 8996	}
 8997	leaf = path->nodes[0];
 8998	ei = btrfs_item_ptr(leaf, path->slots[0],
 8999			    struct btrfs_file_extent_item);
 9000	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
 9001	btrfs_set_file_extent_type(leaf, ei,
 9002				   BTRFS_FILE_EXTENT_INLINE);
 9003	btrfs_set_file_extent_encryption(leaf, ei, 0);
 9004	btrfs_set_file_extent_compression(leaf, ei, 0);
 9005	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
 9006	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
 9007
 9008	ptr = btrfs_file_extent_inline_start(ei);
 9009	write_extent_buffer(leaf, symname, ptr, name_len);
 9010	btrfs_free_path(path);
 9011
 9012	d_instantiate_new(dentry, inode);
 9013	ret = 0;
 9014out:
 9015	btrfs_end_transaction(trans);
 9016	btrfs_btree_balance_dirty(fs_info);
 9017out_new_inode_args:
 9018	btrfs_new_inode_args_destroy(&new_inode_args);
 9019out_inode:
 9020	if (ret)
 9021		iput(inode);
 9022	return ret;
 9023}
 9024
 9025static struct btrfs_trans_handle *insert_prealloc_file_extent(
 9026				       struct btrfs_trans_handle *trans_in,
 9027				       struct btrfs_inode *inode,
 9028				       struct btrfs_key *ins,
 9029				       u64 file_offset)
 9030{
 9031	struct btrfs_file_extent_item stack_fi;
 9032	struct btrfs_replace_extent_info extent_info;
 9033	struct btrfs_trans_handle *trans = trans_in;
 9034	struct btrfs_path *path;
 9035	u64 start = ins->objectid;
 9036	u64 len = ins->offset;
 9037	u64 qgroup_released = 0;
 9038	int ret;
 9039
 9040	memset(&stack_fi, 0, sizeof(stack_fi));
 9041
 9042	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
 9043	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
 9044	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
 9045	btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
 9046	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
 9047	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
 9048	/* Encryption and other encoding is reserved and all 0 */
 9049
 9050	ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
 9051	if (ret < 0)
 9052		return ERR_PTR(ret);
 9053
 9054	if (trans) {
 9055		ret = insert_reserved_file_extent(trans, inode,
 9056						  file_offset, &stack_fi,
 9057						  true, qgroup_released);
 9058		if (ret)
 9059			goto free_qgroup;
 9060		return trans;
 9061	}
 9062
 9063	extent_info.disk_offset = start;
 9064	extent_info.disk_len = len;
 9065	extent_info.data_offset = 0;
 9066	extent_info.data_len = len;
 9067	extent_info.file_offset = file_offset;
 9068	extent_info.extent_buf = (char *)&stack_fi;
 9069	extent_info.is_new_extent = true;
 9070	extent_info.update_times = true;
 9071	extent_info.qgroup_reserved = qgroup_released;
 9072	extent_info.insertions = 0;
 9073
 9074	path = btrfs_alloc_path();
 9075	if (!path) {
 9076		ret = -ENOMEM;
 9077		goto free_qgroup;
 9078	}
 9079
 9080	ret = btrfs_replace_file_extents(inode, path, file_offset,
 9081				     file_offset + len - 1, &extent_info,
 9082				     &trans);
 9083	btrfs_free_path(path);
 9084	if (ret)
 9085		goto free_qgroup;
 9086	return trans;
 9087
 9088free_qgroup:
 9089	/*
 9090	 * We have released qgroup data range at the beginning of the function,
 9091	 * and normally qgroup_released bytes will be freed when committing
 9092	 * transaction.
 9093	 * But if we error out early, we have to free what we have released
 9094	 * or we leak qgroup data reservation.
 9095	 */
 9096	btrfs_qgroup_free_refroot(inode->root->fs_info,
 9097			btrfs_root_id(inode->root), qgroup_released,
 9098			BTRFS_QGROUP_RSV_DATA);
 9099	return ERR_PTR(ret);
 9100}
 9101
 9102static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 9103				       u64 start, u64 num_bytes, u64 min_size,
 9104				       loff_t actual_len, u64 *alloc_hint,
 9105				       struct btrfs_trans_handle *trans)
 9106{
 9107	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 9108	struct extent_map *em;
 9109	struct btrfs_root *root = BTRFS_I(inode)->root;
 9110	struct btrfs_key ins;
 9111	u64 cur_offset = start;
 9112	u64 clear_offset = start;
 9113	u64 i_size;
 9114	u64 cur_bytes;
 9115	u64 last_alloc = (u64)-1;
 9116	int ret = 0;
 9117	bool own_trans = true;
 9118	u64 end = start + num_bytes - 1;
 9119
 9120	if (trans)
 9121		own_trans = false;
 9122	while (num_bytes > 0) {
 9123		cur_bytes = min_t(u64, num_bytes, SZ_256M);
 9124		cur_bytes = max(cur_bytes, min_size);
 9125		/*
 9126		 * If we are severely fragmented we could end up with really
 9127		 * small allocations, so if the allocator is returning small
 9128		 * chunks lets make its job easier by only searching for those
 9129		 * sized chunks.
 9130		 */
 9131		cur_bytes = min(cur_bytes, last_alloc);
 9132		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
 9133				min_size, 0, *alloc_hint, &ins, true, false);
 9134		if (ret)
 9135			break;
 9136
 9137		/*
 9138		 * We've reserved this space, and thus converted it from
 9139		 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
 9140		 * from here on out we will only need to clear our reservation
 9141		 * for the remaining unreserved area, so advance our
 9142		 * clear_offset by our extent size.
 9143		 */
 9144		clear_offset += ins.offset;
 9145
 9146		last_alloc = ins.offset;
 9147		trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
 9148						    &ins, cur_offset);
 9149		/*
 9150		 * Now that we inserted the prealloc extent we can finally
 9151		 * decrement the number of reservations in the block group.
 9152		 * If we did it before, we could race with relocation and have
 9153		 * relocation miss the reserved extent, making it fail later.
 9154		 */
 9155		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 9156		if (IS_ERR(trans)) {
 9157			ret = PTR_ERR(trans);
 9158			btrfs_free_reserved_extent(fs_info, ins.objectid,
 9159						   ins.offset, false);
 9160			break;
 9161		}
 9162
 9163		em = btrfs_alloc_extent_map();
 9164		if (!em) {
 9165			btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
 9166					    cur_offset + ins.offset - 1, false);
 9167			btrfs_set_inode_full_sync(BTRFS_I(inode));
 9168			goto next;
 9169		}
 9170
 9171		em->start = cur_offset;
 9172		em->len = ins.offset;
 9173		em->disk_bytenr = ins.objectid;
 9174		em->offset = 0;
 9175		em->disk_num_bytes = ins.offset;
 9176		em->ram_bytes = ins.offset;
 9177		em->flags |= EXTENT_FLAG_PREALLOC;
 9178		em->generation = trans->transid;
 9179
 9180		ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
 9181		btrfs_free_extent_map(em);
 9182next:
 9183		num_bytes -= ins.offset;
 9184		cur_offset += ins.offset;
 9185		*alloc_hint = ins.objectid + ins.offset;
 9186
 9187		inode_inc_iversion(inode);
 9188		inode_set_ctime_current(inode);
 9189		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
 9190		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 9191		    (actual_len > inode->i_size) &&
 9192		    (cur_offset > inode->i_size)) {
 9193			if (cur_offset > actual_len)
 9194				i_size = actual_len;
 9195			else
 9196				i_size = cur_offset;
 9197			i_size_write(inode, i_size);
 9198			btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
 9199		}
 9200
 9201		ret = btrfs_update_inode(trans, BTRFS_I(inode));
 9202
 9203		if (unlikely(ret)) {
 9204			btrfs_abort_transaction(trans, ret);
 9205			if (own_trans)
 9206				btrfs_end_transaction(trans);
 9207			break;
 9208		}
 9209
 9210		if (own_trans) {
 9211			btrfs_end_transaction(trans);
 9212			trans = NULL;
 9213		}
 9214	}
 9215	if (clear_offset < end)
 9216		btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
 9217			end - clear_offset + 1);
 9218	return ret;
 9219}
 9220
 9221int btrfs_prealloc_file_range(struct inode *inode, int mode,
 9222			      u64 start, u64 num_bytes, u64 min_size,
 9223			      loff_t actual_len, u64 *alloc_hint)
 9224{
 9225	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
 9226					   min_size, actual_len, alloc_hint,
 9227					   NULL);
 9228}
 9229
 9230int btrfs_prealloc_file_range_trans(struct inode *inode,
 9231				    struct btrfs_trans_handle *trans, int mode,
 9232				    u64 start, u64 num_bytes, u64 min_size,
 9233				    loff_t actual_len, u64 *alloc_hint)
 9234{
 9235	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
 9236					   min_size, actual_len, alloc_hint, trans);
 9237}
 9238
 9239/*
 9240 * NOTE: in case you are adding MAY_EXEC check for directories:
 9241 * we are marking them with IOP_FASTPERM_MAY_EXEC, allowing path lookup to
 9242 * elide calls here.
 9243 */
 9244static int btrfs_permission(struct mnt_idmap *idmap,
 9245			    struct inode *inode, int mask)
 9246{
 9247	struct btrfs_root *root = BTRFS_I(inode)->root;
 9248	umode_t mode = inode->i_mode;
 9249
 9250	if (mask & MAY_WRITE &&
 9251	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
 9252		if (btrfs_root_readonly(root))
 9253			return -EROFS;
 9254		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
 9255			return -EACCES;
 9256	}
 9257	return generic_permission(idmap, inode, mask);
 9258}
 9259
 9260static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
 9261			 struct file *file, umode_t mode)
 9262{
 9263	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
 9264	struct btrfs_trans_handle *trans;
 9265	struct btrfs_root *root = BTRFS_I(dir)->root;
 9266	struct inode *inode;
 9267	struct btrfs_new_inode_args new_inode_args = {
 9268		.dir = dir,
 9269		.dentry = file->f_path.dentry,
 9270		.orphan = true,
 9271	};
 9272	unsigned int trans_num_items;
 9273	int ret;
 9274
 9275	inode = new_inode(dir->i_sb);
 9276	if (!inode)
 9277		return -ENOMEM;
 9278	inode_init_owner(idmap, inode, dir, mode);
 9279	inode->i_fop = &btrfs_file_operations;
 9280	inode->i_op = &btrfs_file_inode_operations;
 9281	inode->i_mapping->a_ops = &btrfs_aops;
 9282
 9283	new_inode_args.inode = inode;
 9284	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
 9285	if (ret)
 9286		goto out_inode;
 9287
 9288	trans = btrfs_start_transaction(root, trans_num_items);
 9289	if (IS_ERR(trans)) {
 9290		ret = PTR_ERR(trans);
 9291		goto out_new_inode_args;
 9292	}
 9293
 9294	ret = btrfs_create_new_inode(trans, &new_inode_args);
 9295
 9296	/*
 9297	 * We set number of links to 0 in btrfs_create_new_inode(), and here we
 9298	 * set it to 1 because d_tmpfile() will issue a warning if the count is
 9299	 * 0, through:
 9300	 *
 9301	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
 9302	 */
 9303	set_nlink(inode, 1);
 9304
 9305	if (!ret) {
 9306		d_tmpfile(file, inode);
 9307		unlock_new_inode(inode);
 9308		mark_inode_dirty(inode);
 9309	}
 9310
 9311	btrfs_end_transaction(trans);
 9312	btrfs_btree_balance_dirty(fs_info);
 9313out_new_inode_args:
 9314	btrfs_new_inode_args_destroy(&new_inode_args);
 9315out_inode:
 9316	if (ret)
 9317		iput(inode);
 9318	return finish_open_simple(file, ret);
 9319}
 9320
 9321int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
 9322					     int compress_type)
 9323{
 9324	switch (compress_type) {
 9325	case BTRFS_COMPRESS_NONE:
 9326		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
 9327	case BTRFS_COMPRESS_ZLIB:
 9328		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
 9329	case BTRFS_COMPRESS_LZO:
 9330		/*
 9331		 * The LZO format depends on the sector size. 64K is the maximum
 9332		 * sector size that we support.
 9333		 */
 9334		if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
 9335			return -EINVAL;
 9336		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
 9337		       (fs_info->sectorsize_bits - 12);
 9338	case BTRFS_COMPRESS_ZSTD:
 9339		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
 9340	default:
 9341		return -EUCLEAN;
 9342	}
 9343}
 9344
 9345static ssize_t btrfs_encoded_read_inline(
 9346				struct kiocb *iocb,
 9347				struct iov_iter *iter, u64 start,
 9348				u64 lockend,
 9349				struct extent_state **cached_state,
 9350				u64 extent_start, size_t count,
 9351				struct btrfs_ioctl_encoded_io_args *encoded,
 9352				bool *unlocked)
 9353{
 9354	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
 9355	struct btrfs_root *root = inode->root;
 9356	struct btrfs_fs_info *fs_info = root->fs_info;
 9357	struct extent_io_tree *io_tree = &inode->io_tree;
 9358	BTRFS_PATH_AUTO_FREE(path);
 9359	struct extent_buffer *leaf;
 9360	struct btrfs_file_extent_item *item;
 9361	u64 ram_bytes;
 9362	unsigned long ptr;
 9363	void *tmp;
 9364	ssize_t ret;
 9365	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
 9366
 9367	path = btrfs_alloc_path();
 9368	if (!path)
 9369		return -ENOMEM;
 9370
 9371	path->nowait = nowait;
 9372
 9373	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
 9374				       extent_start, 0);
 9375	if (ret) {
 9376		if (unlikely(ret > 0)) {
 9377			/* The extent item disappeared? */
 9378			return -EIO;
 9379		}
 9380		return ret;
 9381	}
 9382	leaf = path->nodes[0];
 9383	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
 9384
 9385	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
 9386	ptr = btrfs_file_extent_inline_start(item);
 9387
 9388	encoded->len = min_t(u64, extent_start + ram_bytes,
 9389			     inode->vfs_inode.i_size) - iocb->ki_pos;
 9390	ret = btrfs_encoded_io_compression_from_extent(fs_info,
 9391				 btrfs_file_extent_compression(leaf, item));
 9392	if (ret < 0)
 9393		return ret;
 9394	encoded->compression = ret;
 9395	if (encoded->compression) {
 9396		size_t inline_size;
 9397
 9398		inline_size = btrfs_file_extent_inline_item_len(leaf,
 9399								path->slots[0]);
 9400		if (inline_size > count)
 9401			return -ENOBUFS;
 9402
 9403		count = inline_size;
 9404		encoded->unencoded_len = ram_bytes;
 9405		encoded->unencoded_offset = iocb->ki_pos - extent_start;
 9406	} else {
 9407		count = min_t(u64, count, encoded->len);
 9408		encoded->len = count;
 9409		encoded->unencoded_len = count;
 9410		ptr += iocb->ki_pos - extent_start;
 9411	}
 9412
 9413	tmp = kmalloc(count, GFP_NOFS);
 9414	if (!tmp)
 9415		return -ENOMEM;
 9416
 9417	read_extent_buffer(leaf, tmp, ptr, count);
 9418	btrfs_release_path(path);
 9419	btrfs_unlock_extent(io_tree, start, lockend, cached_state);
 9420	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 9421	*unlocked = true;
 9422
 9423	ret = copy_to_iter(tmp, count, iter);
 9424	if (ret != count)
 9425		ret = -EFAULT;
 9426	kfree(tmp);
 9427
 9428	return ret;
 9429}
 9430
/*
 * State shared between btrfs_encoded_read_regular_fill_pages() and the bio
 * completion handler btrfs_encoded_read_endio().
 */
struct btrfs_encoded_read_private {
	/* Completed by the endio handler on the last put when uring_ctx is NULL. */
	struct completion *sync_reads;
	/* Non-NULL selects the io_uring path; handed to btrfs_uring_read_extent_endio(). */
	void *uring_ctx;
	/* Starts at 1 for the submitter, plus one for every bio submitted. */
	refcount_t pending_refs;
	/* Initialized to 0; overwritten with the status of any failed bio. */
	blk_status_t status;
};
 9437
/*
 * Completion handler for the bios submitted by
 * btrfs_encoded_read_regular_fill_pages().
 *
 * Records the status of a failed bio in the shared private data and drops one
 * pending reference. If that was the last reference, the read is finished
 * here: the io_uring path reports the result through
 * btrfs_uring_read_extent_endio() and frees the private data, the synchronous
 * path completes the waiter's completion. The bio is released in all cases.
 */
static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
{
	struct btrfs_encoded_read_private *priv = bbio->private;

	if (bbio->bio.bi_status) {
		/*
		 * The memory barrier implied by the refcount_dec_and_test() here
		 * pairs with the memory barrier implied by the refcount_dec_and_test()
		 * in btrfs_encoded_read_regular_fill_pages() to ensure that
		 * this write is observed before the load of status in
		 * btrfs_encoded_read_regular_fill_pages().
		 */
		WRITE_ONCE(priv->status, bbio->bio.bi_status);
	}
	if (refcount_dec_and_test(&priv->pending_refs)) {
		int err = blk_status_to_errno(READ_ONCE(priv->status));

		if (priv->uring_ctx) {
			btrfs_uring_read_extent_endio(priv->uring_ctx, err);
			/* priv was kmalloc'ed by the submitter for the io_uring case. */
			kfree(priv);
		} else {
			/* priv is the submitter's on-stack data; just wake the waiter. */
			complete(priv->sync_reads);
		}
	}
	bio_put(&bbio->bio);
}
 9464
/*
 * Read @disk_io_size bytes starting at @disk_bytenr into @pages.
 *
 * @inode:        inode the read bios are allocated for
 * @disk_bytenr:  starting byte address of the read; converted to a sector
 *                number for each bio
 * @disk_io_size: number of bytes to read; consumed at most PAGE_SIZE per page
 * @pages:        destination pages, indexed from 0, one per PAGE_SIZE chunk
 * @uring_ctx:    NULL for a synchronous read; otherwise an opaque context
 *                passed to btrfs_uring_read_extent_endio() on completion
 *
 * One or more bios are built and submitted; btrfs_encoded_read_endio() runs
 * for each of them. The private data holds one reference for this function
 * plus one per submitted bio.
 *
 * Return: for synchronous reads, 0 or a negative errno after all bios have
 * completed. For io_uring reads, -ENOMEM if the private data cannot be
 * allocated, -EIOCBQUEUED if bios are still in flight when this function
 * drops its reference, otherwise the final status (in which case the uring
 * completion has already been invoked from here).
 */
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
					  u64 disk_bytenr, u64 disk_io_size,
					  struct page **pages, void *uring_ctx)
{
	struct btrfs_encoded_read_private *priv, sync_priv;
	struct completion sync_reads;
	unsigned long i = 0;
	struct btrfs_bio *bbio;
	int ret;

	/*
	 * Fast path for synchronous reads which completes in this call, io_uring
	 * needs longer time span.
	 */
	if (uring_ctx) {
		priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
		if (!priv)
			return -ENOMEM;
	} else {
		priv = &sync_priv;
		init_completion(&sync_reads);
		priv->sync_reads = &sync_reads;
	}

	/* This reference belongs to the submitter and is dropped at the end. */
	refcount_set(&priv->pending_refs, 1);
	priv->status = 0;
	priv->uring_ctx = uring_ctx;

	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode, 0,
			       btrfs_encoded_read_endio, priv);
	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;

	do {
		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);

		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
			/*
			 * The page did not fit: submit what we have, start a
			 * new bio at the current disk position and retry the
			 * same page (i is not advanced).
			 */
			refcount_inc(&priv->pending_refs);
			btrfs_submit_bbio(bbio, 0);

			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode, 0,
					       btrfs_encoded_read_endio, priv);
			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
			continue;
		}

		i++;
		disk_bytenr += bytes;
		disk_io_size -= bytes;
	} while (disk_io_size);

	/* Submit the last (possibly only) bio. */
	refcount_inc(&priv->pending_refs);
	btrfs_submit_bbio(bbio, 0);

	if (uring_ctx) {
		/*
		 * Drop the submitter's reference. If every bio already
		 * completed, the endio handler did not see the count reach
		 * zero, so finish the request here.
		 */
		if (refcount_dec_and_test(&priv->pending_refs)) {
			ret = blk_status_to_errno(READ_ONCE(priv->status));
			btrfs_uring_read_extent_endio(uring_ctx, ret);
			kfree(priv);
			return ret;
		}

		return -EIOCBQUEUED;
	} else {
		/* Only wait if some bio is still outstanding. */
		if (!refcount_dec_and_test(&priv->pending_refs))
			wait_for_completion_io(&sync_reads);
		/* See btrfs_encoded_read_endio() for ordering. */
		return blk_status_to_errno(READ_ONCE(priv->status));
	}
}
 9534
 9535ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
 9536				   u64 start, u64 lockend,
 9537				   struct extent_state **cached_state,
 9538				   u64 disk_bytenr, u64 disk_io_size,
 9539				   size_t count, bool compressed, bool *unlocked)
 9540{
 9541	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
 9542	struct extent_io_tree *io_tree = &inode->io_tree;
 9543	struct page **pages;
 9544	unsigned long nr_pages, i;
 9545	u64 cur;
 9546	size_t page_offset;
 9547	ssize_t ret;
 9548
 9549	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
 9550	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
 9551	if (!pages)
 9552		return -ENOMEM;
 9553	ret = btrfs_alloc_page_array(nr_pages, pages, false);
 9554	if (ret) {
 9555		ret = -ENOMEM;
 9556		goto out;
 9557		}
 9558
 9559	ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
 9560						    disk_io_size, pages, NULL);
 9561	if (ret)
 9562		goto out;
 9563
 9564	btrfs_unlock_extent(io_tree, start, lockend, cached_state);
 9565	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 9566	*unlocked = true;
 9567
 9568	if (compressed) {
 9569		i = 0;
 9570		page_offset = 0;
 9571	} else {
 9572		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
 9573		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
 9574	}
 9575	cur = 0;
 9576	while (cur < count) {
 9577		size_t bytes = min_t(size_t, count - cur,
 9578				     PAGE_SIZE - page_offset);
 9579
 9580		if (copy_page_to_iter(pages[i], page_offset, bytes,
 9581				      iter) != bytes) {
 9582			ret = -EFAULT;
 9583			goto out;
 9584		}
 9585		i++;
 9586		cur += bytes;
 9587		page_offset = 0;
 9588	}
 9589	ret = count;
 9590out:
 9591	for (i = 0; i < nr_pages; i++) {
 9592		if (pages[i])
 9593			__free_page(pages[i]);
 9594	}
 9595	kfree(pages);
 9596	return ret;
 9597}
 9598
/*
 * Prepare an encoded read at iocb->ki_pos and complete it here when no disk
 * I/O is needed.
 *
 * @iocb:         kiocb giving the file, position and IOCB_NOWAIT flag
 * @iter:         destination for the data
 * @encoded:      filled in with len, unencoded_len and, where applicable,
 *                unencoded_offset and compression
 * @cached_state: extent state cache used when locking/unlocking the range
 * @disk_bytenr:  out: EXTENT_MAP_HOLE for holes and prealloc extents,
 *                otherwise the disk address to read from
 * @disk_io_size: out: number of bytes to read from disk; only written for
 *                compressed and regular extents
 *
 * The inode is locked shared and the extent range starting at the sector
 * containing ki_pos, BTRFS_MAX_UNCOMPRESSED bytes long, is locked once no
 * ordered extents remain in it. With IOCB_NOWAIT nothing is waited for and
 * -EAGAIN is returned instead.
 *
 * Return: 0 if ki_pos is at or beyond i_size; the number of bytes produced
 * for inline extents (from btrfs_encoded_read_inline()) and for holes;
 * -EIOCBQUEUED if the caller has to read from disk, in which case the inode
 * lock and the extent range are still held; otherwise a negative errno with
 * all locks released.
 */
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
			   struct btrfs_ioctl_encoded_io_args *encoded,
			   struct extent_state **cached_state,
			   u64 *disk_bytenr, u64 *disk_io_size)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_io_tree *io_tree = &inode->io_tree;
	ssize_t ret;
	size_t count = iov_iter_count(iter);
	u64 start, lockend;
	struct extent_map *em;
	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
	/* Set once the extent range and inode lock have been dropped. */
	bool unlocked = false;

	file_accessed(iocb->ki_filp);

	ret = btrfs_inode_lock(inode,
			       BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0));
	if (ret)
		return ret;

	/* Nothing to read at or past EOF. */
	if (iocb->ki_pos >= inode->vfs_inode.i_size) {
		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
		return 0;
	}
	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
	/*
	 * We don't know how long the extent containing iocb->ki_pos is, but if
	 * it's compressed we know that it won't be longer than this.
	 */
	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;

	if (nowait) {
		struct btrfs_ordered_extent *ordered;

		/* Anything that would require waiting turns into -EAGAIN. */
		if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping,
						  start, lockend)) {
			ret = -EAGAIN;
			goto out_unlock_inode;
		}

		if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) {
			ret = -EAGAIN;
			goto out_unlock_inode;
		}

		ordered = btrfs_lookup_ordered_range(inode, start,
						     lockend - start + 1);
		if (ordered) {
			btrfs_put_ordered_extent(ordered);
			btrfs_unlock_extent(io_tree, start, lockend, cached_state);
			ret = -EAGAIN;
			goto out_unlock_inode;
		}
	} else {
		/*
		 * Wait for ordered extents, lock the range, then re-check:
		 * a new ordered extent may have appeared before the range
		 * was locked, in which case unlock and start over.
		 */
		for (;;) {
			struct btrfs_ordered_extent *ordered;

			ret = btrfs_wait_ordered_range(inode, start,
						       lockend - start + 1);
			if (ret)
				goto out_unlock_inode;

			btrfs_lock_extent(io_tree, start, lockend, cached_state);
			ordered = btrfs_lookup_ordered_range(inode, start,
							     lockend - start + 1);
			if (!ordered)
				break;
			btrfs_put_ordered_extent(ordered);
			btrfs_unlock_extent(io_tree, start, lockend, cached_state);
			cond_resched();
		}
	}

	em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_unlock_extent;
	}

	if (em->disk_bytenr == EXTENT_MAP_INLINE) {
		u64 extent_start = em->start;

		/*
		 * For inline extents we get everything we need out of the
		 * extent item.
		 */
		btrfs_free_extent_map(em);
		em = NULL;
		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
						cached_state, extent_start,
						count, encoded, &unlocked);
		goto out_unlock_extent;
	}

	/*
	 * We only want to return up to EOF even if the extent extends beyond
	 * that.
	 */
	encoded->len = min_t(u64, btrfs_extent_map_end(em),
			     inode->vfs_inode.i_size) - iocb->ki_pos;
	if (em->disk_bytenr == EXTENT_MAP_HOLE ||
	    (em->flags & EXTENT_FLAG_PREALLOC)) {
		/* Holes and prealloc extents are both served as zeroes below. */
		*disk_bytenr = EXTENT_MAP_HOLE;
		count = min_t(u64, count, encoded->len);
		encoded->len = count;
		encoded->unencoded_len = count;
	} else if (btrfs_extent_map_is_compressed(em)) {
		*disk_bytenr = em->disk_bytenr;
		/*
		 * Bail if the buffer isn't large enough to return the whole
		 * compressed extent.
		 */
		if (em->disk_num_bytes > count) {
			ret = -ENOBUFS;
			goto out_em;
		}
		*disk_io_size = em->disk_num_bytes;
		count = em->disk_num_bytes;
		encoded->unencoded_len = em->ram_bytes;
		encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
		ret = btrfs_encoded_io_compression_from_extent(fs_info,
					       btrfs_extent_map_compression(em));
		if (ret < 0)
			goto out_em;
		encoded->compression = ret;
	} else {
		/* Uncompressed extent: read from the sector containing ki_pos. */
		*disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start);
		if (encoded->len > count)
			encoded->len = count;
		/*
		 * Don't read beyond what we locked. This also limits the page
		 * allocations that we'll do.
		 */
		*disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
		count = start + *disk_io_size - iocb->ki_pos;
		encoded->len = count;
		encoded->unencoded_len = count;
		*disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
	}
	btrfs_free_extent_map(em);
	em = NULL;

	if (*disk_bytenr == EXTENT_MAP_HOLE) {
		/* No disk read needed: drop the locks and zero-fill the iter. */
		btrfs_unlock_extent(io_tree, start, lockend, cached_state);
		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
		unlocked = true;
		ret = iov_iter_zero(count, iter);
		if (ret != count)
			ret = -EFAULT;
	} else {
		/* The caller performs the disk read with the locks still held. */
		ret = -EIOCBQUEUED;
		goto out_unlock_extent;
	}

out_em:
	/* em is NULL here unless we jumped in from the compressed branch. */
	btrfs_free_extent_map(em);
out_unlock_extent:
	/* Leave inode and extent locked if we need to do a read. */
	if (!unlocked && ret != -EIOCBQUEUED)
		btrfs_unlock_extent(io_tree, start, lockend, cached_state);
out_unlock_inode:
	if (!unlocked && ret != -EIOCBQUEUED)
		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	return ret;
}
 9766
/*
 * Write one already-compressed ("encoded") extent at iocb->ki_pos.
 *
 * @iocb:    kiocb giving the file and the (sector-aligned) file offset
 * @from:    the compressed data; its byte count is the on-disk data size
 * @encoded: description of the extent: compression/encryption type, logical
 *           length (len), decompressed length (unencoded_len) and offset of
 *           the logical data within the decompressed data (unencoded_offset)
 *
 * The arguments are validated first; anything unsupported or inconsistent is
 * rejected with -EINVAL before any state is touched. The data is then copied
 * into freshly allocated folios (zero-padded to a full page), the target
 * range is flushed, invalidated and locked, space is reserved, and the data is
 * either stored as an inline extent or submitted as a compressed ordered
 * extent.
 *
 * Return: the number of compressed bytes taken from @from on success, in which
 * case iocb->ki_pos has been advanced by encoded->len; a negative errno
 * otherwise.
 */
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
			       const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct extent_changeset *data_reserved = NULL;
	struct extent_state *cached_state = NULL;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_file_extent file_extent;
	int compression;
	size_t orig_count;
	u64 start, end;
	u64 num_bytes, ram_bytes, disk_num_bytes;
	unsigned long nr_folios, i;
	struct folio **folios;
	struct btrfs_key ins;
	bool extent_reserved = false;
	struct extent_map *em;
	ssize_t ret;

	/* Map the encoded-I/O compression value to the on-disk type. */
	switch (encoded->compression) {
	case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
		compression = BTRFS_COMPRESS_ZLIB;
		break;
	case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
		compression = BTRFS_COMPRESS_ZSTD;
		break;
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
		/* The sector size must match for LZO. */
		if (encoded->compression -
		    BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
		    fs_info->sectorsize_bits)
			return -EINVAL;
		compression = BTRFS_COMPRESS_LZO;
		break;
	default:
		return -EINVAL;
	}
	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
		return -EINVAL;

	/*
	 * Compressed extents should always have checksums, so error out if we
	 * have a NOCOW file or inode was created while mounted with NODATASUM.
	 */
	if (inode->flags & BTRFS_INODE_NODATASUM)
		return -EINVAL;

	orig_count = iov_iter_count(from);

	/* The extent size must be sane. */
	if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
	    orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
		return -EINVAL;

	/*
	 * The compressed data must be smaller than the decompressed data.
	 *
	 * It's of course possible for data to compress to larger or the same
	 * size, but the buffered I/O path falls back to no compression for such
	 * data, and we don't want to break any assumptions by creating these
	 * extents.
	 *
	 * Note that this is less strict than the current check we have that the
	 * compressed data must be at least one sector smaller than the
	 * decompressed data. We only want to enforce the weaker requirement
	 * from old kernels that it is at least one byte smaller.
	 */
	if (orig_count >= encoded->unencoded_len)
		return -EINVAL;

	/* The extent must start on a sector boundary. */
	start = iocb->ki_pos;
	if (!IS_ALIGNED(start, fs_info->sectorsize))
		return -EINVAL;

	/*
	 * The extent must end on a sector boundary. However, we allow a write
	 * which ends at or extends i_size to have an unaligned length; we round
	 * up the extent size and set i_size to the unaligned end.
	 */
	if (start + encoded->len < inode->vfs_inode.i_size &&
	    !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
		return -EINVAL;

	/* Finally, the offset in the unencoded data must be sector-aligned. */
	if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
		return -EINVAL;

	/* Sector-aligned logical and decompressed sizes of the new extent. */
	num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
	ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
	end = start + num_bytes - 1;

	/*
	 * If the extent cannot be inline, the compressed data on disk must be
	 * sector-aligned. For convenience, we extend it with zeroes if it
	 * isn't.
	 */
	disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
	nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
	folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
	if (!folios)
		return -ENOMEM;
	/* Copy the caller's data into order-0 folios, zeroing any tail. */
	for (i = 0; i < nr_folios; i++) {
		size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
		char *kaddr;

		folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
		if (!folios[i]) {
			ret = -ENOMEM;
			goto out_folios;
		}
		kaddr = kmap_local_folio(folios[i], 0);
		if (copy_from_iter(kaddr, bytes, from) != bytes) {
			kunmap_local(kaddr);
			ret = -EFAULT;
			goto out_folios;
		}
		if (bytes < PAGE_SIZE)
			memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
		kunmap_local(kaddr);
	}

	/*
	 * Wait for ordered extents and drop cached pages in the range, then
	 * lock it. If an ordered extent or a page showed up before the lock
	 * was taken, unlock and retry.
	 */
	for (;;) {
		ret = btrfs_wait_ordered_range(inode, start, num_bytes);
		if (ret)
			goto out_folios;
		ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
						    start >> PAGE_SHIFT,
						    end >> PAGE_SHIFT);
		if (ret)
			goto out_folios;
		btrfs_lock_extent(io_tree, start, end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
		if (!ordered &&
		    !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
			break;
		if (ordered)
			btrfs_put_ordered_extent(ordered);
		btrfs_unlock_extent(io_tree, start, end, &cached_state);
		cond_resched();
	}

	/*
	 * We don't use the higher-level delalloc space functions because our
	 * num_bytes and disk_num_bytes are different.
	 */
	ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
	if (ret)
		goto out_unlock;
	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
	if (ret)
		goto out_free_data_space;
	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
					      false);
	if (ret)
		goto out_qgroup_free_data;

	/* Try an inline extent first. */
	if (encoded->unencoded_len == encoded->len &&
	    encoded->unencoded_offset == 0 &&
	    can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
		ret = __cow_file_range_inline(inode, encoded->len,
					      orig_count, compression, folios[0],
					      true);
		/*
		 * ret == 0: the inline extent was created, report success.
		 * ret < 0: error. Either way unwind the reservations below.
		 * ret > 0: fall through to a regular extent.
		 */
		if (ret <= 0) {
			if (ret == 0)
				ret = orig_count;
			goto out_delalloc_release;
		}
	}

	ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
				   disk_num_bytes, 0, 0, &ins, true, true);
	if (ret)
		goto out_delalloc_release;
	extent_reserved = true;

	/* Describe the new extent for the extent map and ordered extent. */
	file_extent.disk_bytenr = ins.objectid;
	file_extent.disk_num_bytes = ins.offset;
	file_extent.num_bytes = num_bytes;
	file_extent.ram_bytes = ram_bytes;
	file_extent.offset = encoded->unencoded_offset;
	file_extent.compression = compression;
	em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_free_reserved;
	}
	btrfs_free_extent_map(em);

	ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
				       (1U << BTRFS_ORDERED_ENCODED) |
				       (1U << BTRFS_ORDERED_COMPRESSED));
	if (IS_ERR(ordered)) {
		btrfs_drop_extent_map_range(inode, start, end, false);
		ret = PTR_ERR(ordered);
		goto out_free_reserved;
	}
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);

	/* A write ending past EOF extends i_size to the unaligned end. */
	if (start + encoded->len > inode->vfs_inode.i_size)
		i_size_write(&inode->vfs_inode, start + encoded->len);

	btrfs_unlock_extent(io_tree, start, end, &cached_state);

	btrfs_delalloc_release_extents(inode, num_bytes);

	/*
	 * The folios are handed to the write submission; note that the
	 * success path jumps past out_folios and does not free them here.
	 */
	btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
	ret = orig_count;
	goto out;

out_free_reserved:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_delalloc_release:
	btrfs_delalloc_release_extents(inode, num_bytes);
	btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
out_qgroup_free_data:
	/* Keep the qgroup reservation when an inline extent was written. */
	if (ret < 0)
		btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
out_free_data_space:
	/*
	 * If btrfs_reserve_extent() succeeded, then we already decremented
	 * bytes_may_use.
	 */
	if (!extent_reserved)
		btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes);
out_unlock:
	btrfs_unlock_extent(io_tree, start, end, &cached_state);
out_folios:
	for (i = 0; i < nr_folios; i++) {
		if (folios[i])
			folio_put(folios[i]);
	}
	kvfree(folios);
out:
	if (ret >= 0)
		iocb->ki_pos += encoded->len;
	return ret;
}
10014
10015#ifdef CONFIG_SWAP
10016/*
10017 * Add an entry indicating a block group or device which is pinned by a
10018 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
10019 * negative errno on failure.
10020 */
10021static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
10022				  bool is_block_group)
10023{
10024	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10025	struct btrfs_swapfile_pin *sp, *entry;
10026	struct rb_node **p;
10027	struct rb_node *parent = NULL;
10028
10029	sp = kmalloc(sizeof(*sp), GFP_NOFS);
10030	if (!sp)
10031		return -ENOMEM;
10032	sp->ptr = ptr;
10033	sp->inode = inode;
10034	sp->is_block_group = is_block_group;
10035	sp->bg_extent_count = 1;
10036
10037	spin_lock(&fs_info->swapfile_pins_lock);
10038	p = &fs_info->swapfile_pins.rb_node;
10039	while (*p) {
10040		parent = *p;
10041		entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
10042		if (sp->ptr < entry->ptr ||
10043		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
10044			p = &(*p)->rb_left;
10045		} else if (sp->ptr > entry->ptr ||
10046			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
10047			p = &(*p)->rb_right;
10048		} else {
10049			if (is_block_group)
10050				entry->bg_extent_count++;
10051			spin_unlock(&fs_info->swapfile_pins_lock);
10052			kfree(sp);
10053			return 1;
10054		}
10055	}
10056	rb_link_node(&sp->node, parent, p);
10057	rb_insert_color(&sp->node, &fs_info->swapfile_pins);
10058	spin_unlock(&fs_info->swapfile_pins_lock);
10059	return 0;
10060}
10061
10062/* Free all of the entries pinned by this swapfile. */
10063static void btrfs_free_swapfile_pins(struct inode *inode)
10064{
10065	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10066	struct btrfs_swapfile_pin *sp;
10067	struct rb_node *node, *next;
10068
10069	spin_lock(&fs_info->swapfile_pins_lock);
10070	node = rb_first(&fs_info->swapfile_pins);
10071	while (node) {
10072		next = rb_next(node);
10073		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10074		if (sp->inode == inode) {
10075			rb_erase(&sp->node, &fs_info->swapfile_pins);
10076			if (sp->is_block_group) {
10077				btrfs_dec_block_group_swap_extents(sp->ptr,
10078							   sp->bg_extent_count);
10079				btrfs_put_block_group(sp->ptr);
10080			}
10081			kfree(sp);
10082		}
10083		node = next;
10084	}
10085	spin_unlock(&fs_info->swapfile_pins_lock);
10086}
10087
/*
 * Accumulator used while turning a swapfile's extents into swap extents; see
 * btrfs_add_swap_extent().
 */
struct btrfs_swap_info {
	/* NOTE(review): presumably the file offset of the current run; 0 makes the first page be skipped in lowest_ppage. */
	u64 start;
	/* Byte address where the current run begins; rounded up to a page when added. */
	u64 block_start;
	/* Length in bytes of the current run. */
	u64 block_len;
	/* Lowest page number reported so far across all added runs. */
	u64 lowest_ppage;
	/* Highest page number covered so far across all added runs. */
	u64 highest_ppage;
	/* Pages added so far; also the swap page offset of the next extent. */
	unsigned long nr_pages;
	/* Sum of the values returned by add_swap_extent(). */
	int nr_extents;
};
10097
10098static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10099				 struct btrfs_swap_info *bsi)
10100{
10101	unsigned long nr_pages;
10102	unsigned long max_pages;
10103	u64 first_ppage, first_ppage_reported, next_ppage;
10104	int ret;
10105
10106	/*
10107	 * Our swapfile may have had its size extended after the swap header was
10108	 * written. In that case activating the swapfile should not go beyond
10109	 * the max size set in the swap header.
10110	 */
10111	if (bsi->nr_pages >= sis->max)
10112		return 0;
10113
10114	max_pages = sis->max - bsi->nr_pages;
10115	first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
10116	next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
10117
10118	if (first_ppage >= next_ppage)
10119		return 0;
10120	nr_pages = next_ppage - first_ppage;
10121	nr_pages = min(nr_pages, max_pages);
10122
10123	first_ppage_reported = first_ppage;
10124	if (bsi->start == 0)
10125		first_ppage_reported++;
10126	if (bsi->lowest_ppage > first_ppage_reported)
10127		bsi->lowest_ppage = first_ppage_reported;
10128	if (bsi->highest_ppage < (next_ppage - 1))
10129		bsi->highest_ppage = next_ppage - 1;
10130
10131	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
10132	if (ret < 0)
10133		return ret;
10134	bsi->nr_extents += ret;
10135	bsi->nr_pages += nr_pages;
10136	return 0;
10137}
10138
10139static void btrfs_swap_deactivate(struct file *file)
10140{
10141	struct inode *inode = file_inode(file);
10142
10143	btrfs_free_swapfile_pins(inode);
10144	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
10145}
10146
10147static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10148			       sector_t *span)
10149{
10150	struct inode *inode = file_inode(file);
10151	struct btrfs_root *root = BTRFS_I(inode)->root;
10152	struct btrfs_fs_info *fs_info = root->fs_info;
10153	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
10154	struct extent_state *cached_state = NULL;
10155	struct btrfs_chunk_map *map = NULL;
10156	struct btrfs_device *device = NULL;
10157	struct btrfs_swap_info bsi = {
10158		.lowest_ppage = (sector_t)-1ULL,
10159	};
10160	struct btrfs_backref_share_check_ctx *backref_ctx = NULL;
10161	struct btrfs_path *path = NULL;
10162	int ret = 0;
10163	u64 isize;
10164	u64 prev_extent_end = 0;
10165
10166	/*
10167	 * Acquire the inode's mmap lock to prevent races with memory mapped
10168	 * writes, as they could happen after we flush delalloc below and before
10169	 * we lock the extent range further below. The inode was already locked
10170	 * up in the call chain.
10171	 */
10172	btrfs_assert_inode_locked(BTRFS_I(inode));
10173	down_write(&BTRFS_I(inode)->i_mmap_lock);
10174
10175	/*
10176	 * If the swap file was just created, make sure delalloc is done. If the
10177	 * file changes again after this, the user is doing something stupid and
10178	 * we don't really care.
10179	 */
10180	ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
10181	if (ret)
10182		goto out_unlock_mmap;
10183
10184	/*
10185	 * The inode is locked, so these flags won't change after we check them.
10186	 */
10187	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
10188		btrfs_warn(fs_info, "swapfile must not be compressed");
10189		ret = -EINVAL;
10190		goto out_unlock_mmap;
10191	}
10192	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
10193		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
10194		ret = -EINVAL;
10195		goto out_unlock_mmap;
10196	}
10197	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
10198		btrfs_warn(fs_info, "swapfile must not be checksummed");
10199		ret = -EINVAL;
10200		goto out_unlock_mmap;
10201	}
10202
10203	path = btrfs_alloc_path();
10204	backref_ctx = btrfs_alloc_backref_share_check_ctx();
10205	if (!path || !backref_ctx) {
10206		ret = -ENOMEM;
10207		goto out_unlock_mmap;
10208	}
10209
10210	/*
10211	 * Balance or device remove/replace/resize can move stuff around from
10212	 * under us. The exclop protection makes sure they aren't running/won't
10213	 * run concurrently while we are mapping the swap extents, and
10214	 * fs_info->swapfile_pins prevents them from running while the swap
10215	 * file is active and moving the extents. Note that this also prevents
10216	 * a concurrent device add which isn't actually necessary, but it's not
10217	 * really worth the trouble to allow it.
10218	 */
10219	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
10220		btrfs_warn(fs_info,
10221	   "cannot activate swapfile while exclusive operation is running");
10222		ret = -EBUSY;
10223		goto out_unlock_mmap;
10224	}
10225
10226	/*
10227	 * Prevent snapshot creation while we are activating the swap file.
10228	 * We do not want to race with snapshot creation. If snapshot creation
10229	 * already started before we bumped nr_swapfiles from 0 to 1 and
10230	 * completes before the first write into the swap file after it is
	 * activated, then that write would fall back to COW.
10232	 */
10233	if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
10234		btrfs_exclop_finish(fs_info);
10235		btrfs_warn(fs_info,
10236	   "cannot activate swapfile because snapshot creation is in progress");
10237		ret = -EINVAL;
10238		goto out_unlock_mmap;
10239	}
10240	/*
10241	 * Snapshots can create extents which require COW even if NODATACOW is
10242	 * set. We use this counter to prevent snapshots. We must increment it
10243	 * before walking the extents because we don't want a concurrent
10244	 * snapshot to run after we've already checked the extents.
10245	 *
10246	 * It is possible that subvolume is marked for deletion but still not
10247	 * removed yet. To prevent this race, we check the root status before
10248	 * activating the swapfile.
10249	 */
10250	spin_lock(&root->root_item_lock);
10251	if (btrfs_root_dead(root)) {
10252		spin_unlock(&root->root_item_lock);
10253
10254		btrfs_drew_write_unlock(&root->snapshot_lock);
10255		btrfs_exclop_finish(fs_info);
10256		btrfs_warn(fs_info,
10257		"cannot activate swapfile because subvolume %llu is being deleted",
10258			btrfs_root_id(root));
10259		ret = -EPERM;
10260		goto out_unlock_mmap;
10261	}
10262	atomic_inc(&root->nr_swapfiles);
10263	spin_unlock(&root->root_item_lock);
10264
10265	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10266
10267	btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state);
10268	while (prev_extent_end < isize) {
10269		struct btrfs_key key;
10270		struct extent_buffer *leaf;
10271		struct btrfs_file_extent_item *ei;
10272		struct btrfs_block_group *bg;
10273		u64 logical_block_start;
10274		u64 physical_block_start;
10275		u64 extent_gen;
10276		u64 disk_bytenr;
10277		u64 len;
10278
10279		key.objectid = btrfs_ino(BTRFS_I(inode));
10280		key.type = BTRFS_EXTENT_DATA_KEY;
10281		key.offset = prev_extent_end;
10282
10283		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
10284		if (ret < 0)
10285			goto out;
10286
10287		/*
10288		 * If key not found it means we have an implicit hole (NO_HOLES
10289		 * is enabled).
10290		 */
10291		if (ret > 0) {
10292			btrfs_warn(fs_info, "swapfile must not have holes");
10293			ret = -EINVAL;
10294			goto out;
10295		}
10296
10297		leaf = path->nodes[0];
10298		ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
10299
10300		if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
10301			/*
10302			 * It's unlikely we'll ever actually find ourselves
10303			 * here, as a file small enough to fit inline won't be
10304			 * big enough to store more than the swap header, but in
10305			 * case something changes in the future, let's catch it
10306			 * here rather than later.
10307			 */
10308			btrfs_warn(fs_info, "swapfile must not be inline");
10309			ret = -EINVAL;
10310			goto out;
10311		}
10312
10313		if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
10314			btrfs_warn(fs_info, "swapfile must not be compressed");
10315			ret = -EINVAL;
10316			goto out;
10317		}
10318
10319		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
10320		if (disk_bytenr == 0) {
10321			btrfs_warn(fs_info, "swapfile must not have holes");
10322			ret = -EINVAL;
10323			goto out;
10324		}
10325
10326		logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei);
10327		extent_gen = btrfs_file_extent_generation(leaf, ei);
10328		prev_extent_end = btrfs_file_extent_end(path);
10329
10330		if (prev_extent_end > isize)
10331			len = isize - key.offset;
10332		else
10333			len = btrfs_file_extent_num_bytes(leaf, ei);
10334
10335		backref_ctx->curr_leaf_bytenr = leaf->start;
10336
10337		/*
10338		 * Don't need the path anymore, release to avoid deadlocks when
10339		 * calling btrfs_is_data_extent_shared() because when joining a
10340		 * transaction it can block waiting for the current one's commit
10341		 * which in turn may be trying to lock the same leaf to flush
10342		 * delayed items for example.
10343		 */
10344		btrfs_release_path(path);
10345
10346		ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
10347						  extent_gen, backref_ctx);
10348		if (ret < 0) {
10349			goto out;
10350		} else if (ret > 0) {
10351			btrfs_warn(fs_info,
10352				   "swapfile must not be copy-on-write");
10353			ret = -EINVAL;
10354			goto out;
10355		}
10356
10357		map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10358		if (IS_ERR(map)) {
10359			ret = PTR_ERR(map);
10360			goto out;
10361		}
10362
10363		if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10364			btrfs_warn(fs_info,
10365				   "swapfile must have single data profile");
10366			ret = -EINVAL;
10367			goto out;
10368		}
10369
10370		if (device == NULL) {
10371			device = map->stripes[0].dev;
10372			ret = btrfs_add_swapfile_pin(inode, device, false);
10373			if (ret == 1)
10374				ret = 0;
10375			else if (ret)
10376				goto out;
10377		} else if (device != map->stripes[0].dev) {
10378			btrfs_warn(fs_info, "swapfile must be on one device");
10379			ret = -EINVAL;
10380			goto out;
10381		}
10382
10383		physical_block_start = (map->stripes[0].physical +
10384					(logical_block_start - map->start));
10385		btrfs_free_chunk_map(map);
10386		map = NULL;
10387
10388		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10389		if (!bg) {
10390			btrfs_warn(fs_info,
10391			   "could not find block group containing swapfile");
10392			ret = -EINVAL;
10393			goto out;
10394		}
10395
10396		if (!btrfs_inc_block_group_swap_extents(bg)) {
10397			btrfs_warn(fs_info,
10398			   "block group for swapfile at %llu is read-only%s",
10399			   bg->start,
10400			   atomic_read(&fs_info->scrubs_running) ?
10401				       " (scrub running)" : "");
10402			btrfs_put_block_group(bg);
10403			ret = -EINVAL;
10404			goto out;
10405		}
10406
10407		ret = btrfs_add_swapfile_pin(inode, bg, true);
10408		if (ret) {
10409			btrfs_put_block_group(bg);
10410			if (ret == 1)
10411				ret = 0;
10412			else
10413				goto out;
10414		}
10415
10416		if (bsi.block_len &&
10417		    bsi.block_start + bsi.block_len == physical_block_start) {
10418			bsi.block_len += len;
10419		} else {
10420			if (bsi.block_len) {
10421				ret = btrfs_add_swap_extent(sis, &bsi);
10422				if (ret)
10423					goto out;
10424			}
10425			bsi.start = key.offset;
10426			bsi.block_start = physical_block_start;
10427			bsi.block_len = len;
10428		}
10429
10430		if (fatal_signal_pending(current)) {
10431			ret = -EINTR;
10432			goto out;
10433		}
10434
10435		cond_resched();
10436	}
10437
10438	if (bsi.block_len)
10439		ret = btrfs_add_swap_extent(sis, &bsi);
10440
10441out:
10442	if (!IS_ERR_OR_NULL(map))
10443		btrfs_free_chunk_map(map);
10444
10445	btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state);
10446
10447	if (ret)
10448		btrfs_swap_deactivate(file);
10449
10450	btrfs_drew_write_unlock(&root->snapshot_lock);
10451
10452	btrfs_exclop_finish(fs_info);
10453
10454out_unlock_mmap:
10455	up_write(&BTRFS_I(inode)->i_mmap_lock);
10456	btrfs_free_backref_share_ctx(backref_ctx);
10457	btrfs_free_path(path);
10458	if (ret)
10459		return ret;
10460
10461	if (device)
10462		sis->bdev = device->bdev;
10463	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10464	sis->max = bsi.nr_pages;
10465	sis->pages = bsi.nr_pages - 1;
10466	return bsi.nr_extents;
10467}
10468#else
/*
 * Variant of the swap deactivation hook that sits in the #else branch of the
 * preprocessor conditional guarding the swapfile code above.
 *
 * @file:  The swap file; unused.
 *
 * The btrfs_swap_activate() of this branch always fails, so there is never
 * anything to undo here and the body is intentionally empty.
 */
static void btrfs_swap_deactivate(struct file *file)
{
}
10472
10473static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10474			       sector_t *span)
10475{
10476	return -EOPNOTSUPP;
10477}
10478#endif
10479
10480/*
10481 * Update the number of bytes used in the VFS' inode. When we replace extents in
10482 * a range (clone, dedupe, fallocate's zero range), we must update the number of
10483 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10484 * always get a correct value.
10485 */
10486void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10487			      const u64 add_bytes,
10488			      const u64 del_bytes)
10489{
10490	if (add_bytes == del_bytes)
10491		return;
10492
10493	spin_lock(&inode->lock);
10494	if (del_bytes > 0)
10495		inode_sub_bytes(&inode->vfs_inode, del_bytes);
10496	if (add_bytes > 0)
10497		inode_add_bytes(&inode->vfs_inode, add_bytes);
10498	spin_unlock(&inode->lock);
10499}
10500
10501/*
10502 * Verify that there are no ordered extents for a given file range.
10503 *
10504 * @inode:   The target inode.
10505 * @start:   Start offset of the file range, should be sector size aligned.
10506 * @end:     End offset (inclusive) of the file range, its value +1 should be
10507 *           sector size aligned.
10508 *
10509 * This should typically be used for cases where we locked an inode's VFS lock in
10510 * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode,
10511 * we have flushed all delalloc in the range, we have waited for all ordered
10512 * extents in the range to complete and finally we have locked the file range in
10513 * the inode's io_tree.
10514 */
10515void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
10516{
10517	struct btrfs_root *root = inode->root;
10518	struct btrfs_ordered_extent *ordered;
10519
10520	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
10521		return;
10522
10523	ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
10524	if (ordered) {
10525		btrfs_err(root->fs_info,
10526"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
10527			  start, end, btrfs_ino(inode), btrfs_root_id(root),
10528			  ordered->file_offset,
10529			  ordered->file_offset + ordered->num_bytes - 1);
10530		btrfs_put_ordered_extent(ordered);
10531	}
10532
10533	ASSERT(ordered == NULL);
10534}
10535
10536/*
10537 * Find the first inode with a minimum number.
10538 *
10539 * @root:	The root to search for.
10540 * @min_ino:	The minimum inode number.
10541 *
10542 * Find the first inode in the @root with a number >= @min_ino and return it.
10543 * Returns NULL if no such inode found.
10544 */
10545struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
10546{
10547	struct btrfs_inode *inode;
10548	unsigned long from = min_ino;
10549
10550	xa_lock(&root->inodes);
10551	while (true) {
10552		inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
10553		if (!inode)
10554			break;
10555		if (igrab(&inode->vfs_inode))
10556			break;
10557
10558		from = btrfs_ino(inode) + 1;
10559		cond_resched_lock(&root->inodes.xa_lock);
10560	}
10561	xa_unlock(&root->inodes);
10562
10563	return inode;
10564}
10565
10566static const struct inode_operations btrfs_dir_inode_operations = {
10567	.getattr	= btrfs_getattr,
10568	.lookup		= btrfs_lookup,
10569	.create		= btrfs_create,
10570	.unlink		= btrfs_unlink,
10571	.link		= btrfs_link,
10572	.mkdir		= btrfs_mkdir,
10573	.rmdir		= btrfs_rmdir,
10574	.rename		= btrfs_rename2,
10575	.symlink	= btrfs_symlink,
10576	.setattr	= btrfs_setattr,
10577	.mknod		= btrfs_mknod,
10578	.listxattr	= btrfs_listxattr,
10579	.permission	= btrfs_permission,
10580	.get_inode_acl	= btrfs_get_acl,
10581	.set_acl	= btrfs_set_acl,
10582	.update_time	= btrfs_update_time,
10583	.tmpfile        = btrfs_tmpfile,
10584	.fileattr_get	= btrfs_fileattr_get,
10585	.fileattr_set	= btrfs_fileattr_set,
10586};
10587
10588static const struct file_operations btrfs_dir_file_operations = {
10589	.llseek		= btrfs_dir_llseek,
10590	.read		= generic_read_dir,
10591	.iterate_shared	= btrfs_real_readdir,
10592	.open		= btrfs_opendir,
10593	.unlocked_ioctl	= btrfs_ioctl,
10594#ifdef CONFIG_COMPAT
10595	.compat_ioctl	= btrfs_compat_ioctl,
10596#endif
10597	.release        = btrfs_release_file,
10598	.fsync		= btrfs_sync_file,
10599};
10600
10601/*
10602 * btrfs doesn't support the bmap operation because swapfiles
10603 * use bmap to make a mapping of extents in the file.  They assume
10604 * these extents won't change over the life of the file and they
10605 * use the bmap result to do IO directly to the drive.
10606 *
10607 * the btrfs bmap call would return logical addresses that aren't
10608 * suitable for IO and they also will change frequently as COW
10609 * operations happen.  So, swapfile + btrfs == corruption.
10610 *
10611 * For now we're avoiding this by dropping bmap.
10612 */
10613static const struct address_space_operations btrfs_aops = {
10614	.read_folio	= btrfs_read_folio,
10615	.writepages	= btrfs_writepages,
10616	.readahead	= btrfs_readahead,
10617	.invalidate_folio = btrfs_invalidate_folio,
10618	.launder_folio	= btrfs_launder_folio,
10619	.release_folio	= btrfs_release_folio,
10620	.migrate_folio	= btrfs_migrate_folio,
10621	.dirty_folio	= filemap_dirty_folio,
10622	.error_remove_folio = generic_error_remove_folio,
10623	.swap_activate	= btrfs_swap_activate,
10624	.swap_deactivate = btrfs_swap_deactivate,
10625};
10626
10627static const struct inode_operations btrfs_file_inode_operations = {
10628	.getattr	= btrfs_getattr,
10629	.setattr	= btrfs_setattr,
10630	.listxattr      = btrfs_listxattr,
10631	.permission	= btrfs_permission,
10632	.fiemap		= btrfs_fiemap,
10633	.get_inode_acl	= btrfs_get_acl,
10634	.set_acl	= btrfs_set_acl,
10635	.update_time	= btrfs_update_time,
10636	.fileattr_get	= btrfs_fileattr_get,
10637	.fileattr_set	= btrfs_fileattr_set,
10638};
10639static const struct inode_operations btrfs_special_inode_operations = {
10640	.getattr	= btrfs_getattr,
10641	.setattr	= btrfs_setattr,
10642	.permission	= btrfs_permission,
10643	.listxattr	= btrfs_listxattr,
10644	.get_inode_acl	= btrfs_get_acl,
10645	.set_acl	= btrfs_set_acl,
10646	.update_time	= btrfs_update_time,
10647};
10648static const struct inode_operations btrfs_symlink_inode_operations = {
10649	.get_link	= page_get_link,
10650	.getattr	= btrfs_getattr,
10651	.setattr	= btrfs_setattr,
10652	.permission	= btrfs_permission,
10653	.listxattr	= btrfs_listxattr,
10654	.update_time	= btrfs_update_time,
10655};
10656
10657const struct dentry_operations btrfs_dentry_operations = {
10658	.d_delete	= btrfs_dentry_delete,
10659};