fs/btrfs/inode.c at v6.11-rc5

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / btrfs / inode.c
at v6.11-rc5 10191 lines 296 kB view raw
wrap content
    1// SPDX-License-Identifier: GPL-2.0
    2/*
    3 * Copyright (C) 2007 Oracle.  All rights reserved.
    4 */
    5
    6#include <crypto/hash.h>
    7#include <linux/kernel.h>
    8#include <linux/bio.h>
    9#include <linux/blk-cgroup.h>
   10#include <linux/file.h>
   11#include <linux/fs.h>
   12#include <linux/pagemap.h>
   13#include <linux/highmem.h>
   14#include <linux/time.h>
   15#include <linux/init.h>
   16#include <linux/string.h>
   17#include <linux/backing-dev.h>
   18#include <linux/writeback.h>
   19#include <linux/compat.h>
   20#include <linux/xattr.h>
   21#include <linux/posix_acl.h>
   22#include <linux/falloc.h>
   23#include <linux/slab.h>
   24#include <linux/ratelimit.h>
   25#include <linux/btrfs.h>
   26#include <linux/blkdev.h>
   27#include <linux/posix_acl_xattr.h>
   28#include <linux/uio.h>
   29#include <linux/magic.h>
   30#include <linux/iversion.h>
   31#include <linux/swap.h>
   32#include <linux/migrate.h>
   33#include <linux/sched/mm.h>
   34#include <linux/iomap.h>
   35#include <asm/unaligned.h>
   36#include <linux/fsverity.h>
   37#include "misc.h"
   38#include "ctree.h"
   39#include "disk-io.h"
   40#include "transaction.h"
   41#include "btrfs_inode.h"
   42#include "ordered-data.h"
   43#include "xattr.h"
   44#include "tree-log.h"
   45#include "bio.h"
   46#include "compression.h"
   47#include "locking.h"
   48#include "props.h"
   49#include "qgroup.h"
   50#include "delalloc-space.h"
   51#include "block-group.h"
   52#include "space-info.h"
   53#include "zoned.h"
   54#include "subpage.h"
   55#include "inode-item.h"
   56#include "fs.h"
   57#include "accessors.h"
   58#include "extent-tree.h"
   59#include "root-tree.h"
   60#include "defrag.h"
   61#include "dir-item.h"
   62#include "file-item.h"
   63#include "uuid-tree.h"
   64#include "ioctl.h"
   65#include "file.h"
   66#include "acl.h"
   67#include "relocation.h"
   68#include "verity.h"
   69#include "super.h"
   70#include "orphan.h"
   71#include "backref.h"
   72#include "raid-stripe-tree.h"
   73#include "fiemap.h"
   74
/*
 * An inode number paired with the root it belongs to.
 * NOTE(review): presumably the lookup key for the iget helpers; the users are
 * outside this part of the file - confirm.
 */
struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};
   79
/*
 * Small context structure with a single output field.
 * NOTE(review): presumably filled in by the rename code; the users are outside
 * this part of the file - confirm.
 */
struct btrfs_rename_ctx {
	/* Output field. Stores the index number of the old directory entry. */
	u64 index;
};
   84
/*
 * Used by data_reloc_print_warning_inode() to pass needed info for filename
 * resolution and output of error message.
 */
struct data_reloc_warn {
	/* Path used for the inode item search and for the inode path resolution. */
	struct btrfs_path path;
	struct btrfs_fs_info *fs_info;
	/* Set by print_data_reloc_error() from the offset of the extent item key. */
	u64 extent_item_size;
	/* Logical address printed in the checksum error messages. */
	u64 logical;
	/* Mirror number printed in the checksum error messages. */
	int mirror_num;
};
   96
/*
 * For the file_extent_tree, we want to hold the inode lock when we look up and
 * update the disk_i_size, but lockdep will complain because for our io_tree we
 * hold the tree lock and then take the inode lock when setting delalloc. These
 * two things are unrelated, so make a separate lock class for the
 * file_extent_tree so that the two locking patterns do not get mixed up.
 */
static struct lock_class_key file_extent_tree_class;

/* Forward declarations of the operation tables; not defined in this part of the file. */
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;

/* Slab cache pointer; NOTE(review): presumably backs struct btrfs_inode allocations - confirm. */
static struct kmem_cache *btrfs_inode_cachep;

/* Forward declarations of functions that are used before their definition. */
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);

static noinline int run_delalloc_cow(struct btrfs_inode *inode,
				     struct page *locked_page, u64 start,
				     u64 end, struct writeback_control *wbc,
				     bool pages_dirty);
  122
  123static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
  124					  u64 root, void *warn_ctx)
  125{
  126	struct data_reloc_warn *warn = warn_ctx;
  127	struct btrfs_fs_info *fs_info = warn->fs_info;
  128	struct extent_buffer *eb;
  129	struct btrfs_inode_item *inode_item;
  130	struct inode_fs_paths *ipath = NULL;
  131	struct btrfs_root *local_root;
  132	struct btrfs_key key;
  133	unsigned int nofs_flag;
  134	u32 nlink;
  135	int ret;
  136
  137	local_root = btrfs_get_fs_root(fs_info, root, true);
  138	if (IS_ERR(local_root)) {
  139		ret = PTR_ERR(local_root);
  140		goto err;
  141	}
  142
  143	/* This makes the path point to (inum INODE_ITEM ioff). */
  144	key.objectid = inum;
  145	key.type = BTRFS_INODE_ITEM_KEY;
  146	key.offset = 0;
  147
  148	ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
  149	if (ret) {
  150		btrfs_put_root(local_root);
  151		btrfs_release_path(&warn->path);
  152		goto err;
  153	}
  154
  155	eb = warn->path.nodes[0];
  156	inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
  157	nlink = btrfs_inode_nlink(eb, inode_item);
  158	btrfs_release_path(&warn->path);
  159
  160	nofs_flag = memalloc_nofs_save();
  161	ipath = init_ipath(4096, local_root, &warn->path);
  162	memalloc_nofs_restore(nofs_flag);
  163	if (IS_ERR(ipath)) {
  164		btrfs_put_root(local_root);
  165		ret = PTR_ERR(ipath);
  166		ipath = NULL;
  167		/*
  168		 * -ENOMEM, not a critical error, just output an generic error
  169		 * without filename.
  170		 */
  171		btrfs_warn(fs_info,
  172"checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
  173			   warn->logical, warn->mirror_num, root, inum, offset);
  174		return ret;
  175	}
  176	ret = paths_from_inode(inum, ipath);
  177	if (ret < 0)
  178		goto err;
  179
  180	/*
  181	 * We deliberately ignore the bit ipath might have been too small to
  182	 * hold all of the paths here
  183	 */
  184	for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
  185		btrfs_warn(fs_info,
  186"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
  187			   warn->logical, warn->mirror_num, root, inum, offset,
  188			   fs_info->sectorsize, nlink,
  189			   (char *)(unsigned long)ipath->fspath->val[i]);
  190	}
  191
  192	btrfs_put_root(local_root);
  193	free_ipath(ipath);
  194	return 0;
  195
  196err:
  197	btrfs_warn(fs_info,
  198"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
  199		   warn->logical, warn->mirror_num, root, inum, offset, ret);
  200
  201	free_ipath(ipath);
  202	return ret;
  203}
  204
  205/*
  206 * Do extra user-friendly error output (e.g. lookup all the affected files).
  207 *
  208 * Return true if we succeeded doing the backref lookup.
  209 * Return false if such lookup failed, and has to fallback to the old error message.
  210 */
  211static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
  212				   const u8 *csum, const u8 *csum_expected,
  213				   int mirror_num)
  214{
  215	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  216	struct btrfs_path path = { 0 };
  217	struct btrfs_key found_key = { 0 };
  218	struct extent_buffer *eb;
  219	struct btrfs_extent_item *ei;
  220	const u32 csum_size = fs_info->csum_size;
  221	u64 logical;
  222	u64 flags;
  223	u32 item_size;
  224	int ret;
  225
  226	mutex_lock(&fs_info->reloc_mutex);
  227	logical = btrfs_get_reloc_bg_bytenr(fs_info);
  228	mutex_unlock(&fs_info->reloc_mutex);
  229
  230	if (logical == U64_MAX) {
  231		btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
  232		btrfs_warn_rl(fs_info,
  233"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
  234			btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
  235			CSUM_FMT_VALUE(csum_size, csum),
  236			CSUM_FMT_VALUE(csum_size, csum_expected),
  237			mirror_num);
  238		return;
  239	}
  240
  241	logical += file_off;
  242	btrfs_warn_rl(fs_info,
  243"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
  244			btrfs_root_id(inode->root),
  245			btrfs_ino(inode), file_off, logical,
  246			CSUM_FMT_VALUE(csum_size, csum),
  247			CSUM_FMT_VALUE(csum_size, csum_expected),
  248			mirror_num);
  249
  250	ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
  251	if (ret < 0) {
  252		btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
  253			     logical, ret);
  254		return;
  255	}
  256	eb = path.nodes[0];
  257	ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
  258	item_size = btrfs_item_size(eb, path.slots[0]);
  259	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
  260		unsigned long ptr = 0;
  261		u64 ref_root;
  262		u8 ref_level;
  263
  264		while (true) {
  265			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
  266						      item_size, &ref_root,
  267						      &ref_level);
  268			if (ret < 0) {
  269				btrfs_warn_rl(fs_info,
  270				"failed to resolve tree backref for logical %llu: %d",
  271					      logical, ret);
  272				break;
  273			}
  274			if (ret > 0)
  275				break;
  276
  277			btrfs_warn_rl(fs_info,
  278"csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
  279				logical, mirror_num,
  280				(ref_level ? "node" : "leaf"),
  281				ref_level, ref_root);
  282		}
  283		btrfs_release_path(&path);
  284	} else {
  285		struct btrfs_backref_walk_ctx ctx = { 0 };
  286		struct data_reloc_warn reloc_warn = { 0 };
  287
  288		btrfs_release_path(&path);
  289
  290		ctx.bytenr = found_key.objectid;
  291		ctx.extent_item_pos = logical - found_key.objectid;
  292		ctx.fs_info = fs_info;
  293
  294		reloc_warn.logical = logical;
  295		reloc_warn.extent_item_size = found_key.offset;
  296		reloc_warn.mirror_num = mirror_num;
  297		reloc_warn.fs_info = fs_info;
  298
  299		iterate_extent_inodes(&ctx, true,
  300				      data_reloc_print_warning_inode, &reloc_warn);
  301	}
  302}
  303
  304static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
  305		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
  306{
  307	struct btrfs_root *root = inode->root;
  308	const u32 csum_size = root->fs_info->csum_size;
  309
  310	/* For data reloc tree, it's better to do a backref lookup instead. */
  311	if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
  312		return print_data_reloc_error(inode, logical_start, csum,
  313					      csum_expected, mirror_num);
  314
  315	/* Output without objectid, which is more meaningful */
  316	if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
  317		btrfs_warn_rl(root->fs_info,
  318"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
  319			btrfs_root_id(root), btrfs_ino(inode),
  320			logical_start,
  321			CSUM_FMT_VALUE(csum_size, csum),
  322			CSUM_FMT_VALUE(csum_size, csum_expected),
  323			mirror_num);
  324	} else {
  325		btrfs_warn_rl(root->fs_info,
  326"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
  327			btrfs_root_id(root), btrfs_ino(inode),
  328			logical_start,
  329			CSUM_FMT_VALUE(csum_size, csum),
  330			CSUM_FMT_VALUE(csum_size, csum_expected),
  331			mirror_num);
  332	}
  333}
  334
  335/*
  336 * Lock inode i_rwsem based on arguments passed.
  337 *
  338 * ilock_flags can have the following bit set:
  339 *
  340 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
  341 * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
  342 *		     return -EAGAIN
  343 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
  344 */
  345int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
  346{
  347	if (ilock_flags & BTRFS_ILOCK_SHARED) {
  348		if (ilock_flags & BTRFS_ILOCK_TRY) {
  349			if (!inode_trylock_shared(&inode->vfs_inode))
  350				return -EAGAIN;
  351			else
  352				return 0;
  353		}
  354		inode_lock_shared(&inode->vfs_inode);
  355	} else {
  356		if (ilock_flags & BTRFS_ILOCK_TRY) {
  357			if (!inode_trylock(&inode->vfs_inode))
  358				return -EAGAIN;
  359			else
  360				return 0;
  361		}
  362		inode_lock(&inode->vfs_inode);
  363	}
  364	if (ilock_flags & BTRFS_ILOCK_MMAP)
  365		down_write(&inode->i_mmap_lock);
  366	return 0;
  367}
  368
  369/*
  370 * Unock inode i_rwsem.
  371 *
  372 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
  373 * to decide whether the lock acquired is shared or exclusive.
  374 */
  375void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
  376{
  377	if (ilock_flags & BTRFS_ILOCK_MMAP)
  378		up_write(&inode->i_mmap_lock);
  379	if (ilock_flags & BTRFS_ILOCK_SHARED)
  380		inode_unlock_shared(&inode->vfs_inode);
  381	else
  382		inode_unlock(&inode->vfs_inode);
  383}
  384
  385/*
  386 * Cleanup all submitted ordered extents in specified range to handle errors
  387 * from the btrfs_run_delalloc_range() callback.
  388 *
  389 * NOTE: caller must ensure that when an error happens, it can not call
  390 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
  391 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
  392 * to be released, which we want to happen only when finishing the ordered
  393 * extent (btrfs_finish_ordered_io()).
  394 */
  395static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
  396						 struct page *locked_page,
  397						 u64 offset, u64 bytes)
  398{
  399	unsigned long index = offset >> PAGE_SHIFT;
  400	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
  401	u64 page_start = 0, page_end = 0;
  402	struct page *page;
  403
  404	if (locked_page) {
  405		page_start = page_offset(locked_page);
  406		page_end = page_start + PAGE_SIZE - 1;
  407	}
  408
  409	while (index <= end_index) {
  410		/*
  411		 * For locked page, we will call btrfs_mark_ordered_io_finished
  412		 * through btrfs_mark_ordered_io_finished() on it
  413		 * in run_delalloc_range() for the error handling, which will
  414		 * clear page Ordered and run the ordered extent accounting.
  415		 *
  416		 * Here we can't just clear the Ordered bit, or
  417		 * btrfs_mark_ordered_io_finished() would skip the accounting
  418		 * for the page range, and the ordered extent will never finish.
  419		 */
  420		if (locked_page && index == (page_start >> PAGE_SHIFT)) {
  421			index++;
  422			continue;
  423		}
  424		page = find_get_page(inode->vfs_inode.i_mapping, index);
  425		index++;
  426		if (!page)
  427			continue;
  428
  429		/*
  430		 * Here we just clear all Ordered bits for every page in the
  431		 * range, then btrfs_mark_ordered_io_finished() will handle
  432		 * the ordered extent accounting for the range.
  433		 */
  434		btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
  435						page_folio(page), offset, bytes);
  436		put_page(page);
  437	}
  438
  439	if (locked_page) {
  440		/* The locked page covers the full range, nothing needs to be done */
  441		if (bytes + offset <= page_start + PAGE_SIZE)
  442			return;
  443		/*
  444		 * In case this page belongs to the delalloc range being
  445		 * instantiated then skip it, since the first page of a range is
  446		 * going to be properly cleaned up by the caller of
  447		 * run_delalloc_range
  448		 */
  449		if (page_start >= offset && page_end <= (offset + bytes - 1)) {
  450			bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
  451			offset = page_offset(locked_page) + PAGE_SIZE;
  452		}
  453	}
  454
  455	return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
  456}
  457
/* Forward declaration; the definition is not in this part of the file. */
static int btrfs_dirty_inode(struct btrfs_inode *inode);
  459
  460static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
  461				     struct btrfs_new_inode_args *args)
  462{
  463	int err;
  464
  465	if (args->default_acl) {
  466		err = __btrfs_set_acl(trans, args->inode, args->default_acl,
  467				      ACL_TYPE_DEFAULT);
  468		if (err)
  469			return err;
  470	}
  471	if (args->acl) {
  472		err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
  473		if (err)
  474			return err;
  475	}
  476	if (!args->default_acl && !args->acl)
  477		cache_no_acl(args->inode);
  478	return btrfs_xattr_security_init(trans, args->inode, args->dir,
  479					 &args->dentry->d_name);
  480}
  481
  482/*
  483 * this does all the hard work for inserting an inline extent into
  484 * the btree.  The caller should have done a btrfs_drop_extents so that
  485 * no overlapping inline items exist in the btree
  486 */
  487static int insert_inline_extent(struct btrfs_trans_handle *trans,
  488				struct btrfs_path *path,
  489				struct btrfs_inode *inode, bool extent_inserted,
  490				size_t size, size_t compressed_size,
  491				int compress_type,
  492				struct folio *compressed_folio,
  493				bool update_i_size)
  494{
  495	struct btrfs_root *root = inode->root;
  496	struct extent_buffer *leaf;
  497	struct page *page = NULL;
  498	const u32 sectorsize = trans->fs_info->sectorsize;
  499	char *kaddr;
  500	unsigned long ptr;
  501	struct btrfs_file_extent_item *ei;
  502	int ret;
  503	size_t cur_size = size;
  504	u64 i_size;
  505
  506	/*
  507	 * The decompressed size must still be no larger than a sector.  Under
  508	 * heavy race, we can have size == 0 passed in, but that shouldn't be a
  509	 * big deal and we can continue the insertion.
  510	 */
  511	ASSERT(size <= sectorsize);
  512
  513	/*
  514	 * The compressed size also needs to be no larger than a sector.
  515	 * That's also why we only need one page as the parameter.
  516	 */
  517	if (compressed_folio)
  518		ASSERT(compressed_size <= sectorsize);
  519	else
  520		ASSERT(compressed_size == 0);
  521
  522	if (compressed_size && compressed_folio)
  523		cur_size = compressed_size;
  524
  525	if (!extent_inserted) {
  526		struct btrfs_key key;
  527		size_t datasize;
  528
  529		key.objectid = btrfs_ino(inode);
  530		key.offset = 0;
  531		key.type = BTRFS_EXTENT_DATA_KEY;
  532
  533		datasize = btrfs_file_extent_calc_inline_size(cur_size);
  534		ret = btrfs_insert_empty_item(trans, root, path, &key,
  535					      datasize);
  536		if (ret)
  537			goto fail;
  538	}
  539	leaf = path->nodes[0];
  540	ei = btrfs_item_ptr(leaf, path->slots[0],
  541			    struct btrfs_file_extent_item);
  542	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
  543	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
  544	btrfs_set_file_extent_encryption(leaf, ei, 0);
  545	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
  546	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
  547	ptr = btrfs_file_extent_inline_start(ei);
  548
  549	if (compress_type != BTRFS_COMPRESS_NONE) {
  550		kaddr = kmap_local_folio(compressed_folio, 0);
  551		write_extent_buffer(leaf, kaddr, ptr, compressed_size);
  552		kunmap_local(kaddr);
  553
  554		btrfs_set_file_extent_compression(leaf, ei,
  555						  compress_type);
  556	} else {
  557		page = find_get_page(inode->vfs_inode.i_mapping, 0);
  558		btrfs_set_file_extent_compression(leaf, ei, 0);
  559		kaddr = kmap_local_page(page);
  560		write_extent_buffer(leaf, kaddr, ptr, size);
  561		kunmap_local(kaddr);
  562		put_page(page);
  563	}
  564	btrfs_mark_buffer_dirty(trans, leaf);
  565	btrfs_release_path(path);
  566
  567	/*
  568	 * We align size to sectorsize for inline extents just for simplicity
  569	 * sake.
  570	 */
  571	ret = btrfs_inode_set_file_extent_range(inode, 0,
  572					ALIGN(size, root->fs_info->sectorsize));
  573	if (ret)
  574		goto fail;
  575
  576	/*
  577	 * We're an inline extent, so nobody can extend the file past i_size
  578	 * without locking a page we already have locked.
  579	 *
  580	 * We must do any i_size and inode updates before we unlock the pages.
  581	 * Otherwise we could end up racing with unlink.
  582	 */
  583	i_size = i_size_read(&inode->vfs_inode);
  584	if (update_i_size && size > i_size) {
  585		i_size_write(&inode->vfs_inode, size);
  586		i_size = size;
  587	}
  588	inode->disk_i_size = i_size;
  589
  590fail:
  591	return ret;
  592}
  593
  594static bool can_cow_file_range_inline(struct btrfs_inode *inode,
  595				      u64 offset, u64 size,
  596				      size_t compressed_size)
  597{
  598	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  599	u64 data_len = (compressed_size ?: size);
  600
  601	/* Inline extents must start at offset 0. */
  602	if (offset != 0)
  603		return false;
  604
  605	/*
  606	 * Due to the page size limit, for subpage we can only trigger the
  607	 * writeback for the dirty sectors of page, that means data writeback
  608	 * is doing more writeback than what we want.
  609	 *
  610	 * This is especially unexpected for some call sites like fallocate,
  611	 * where we only increase i_size after everything is done.
  612	 * This means we can trigger inline extent even if we didn't want to.
  613	 * So here we skip inline extent creation completely.
  614	 */
  615	if (fs_info->sectorsize != PAGE_SIZE)
  616		return false;
  617
  618	/* Inline extents are limited to sectorsize. */
  619	if (size > fs_info->sectorsize)
  620		return false;
  621
  622	/* We cannot exceed the maximum inline data size. */
  623	if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
  624		return false;
  625
  626	/* We cannot exceed the user specified max_inline size. */
  627	if (data_len > fs_info->max_inline)
  628		return false;
  629
  630	/* Inline extents must be the entirety of the file. */
  631	if (size < i_size_read(&inode->vfs_inode))
  632		return false;
  633
  634	return true;
  635}
  636
  637/*
  638 * conditionally insert an inline extent into the file.  This
  639 * does the checks required to make sure the data is small enough
  640 * to fit as an inline extent.
  641 *
  642 * If being used directly, you must have already checked we're allowed to cow
  643 * the range by getting true from can_cow_file_range_inline().
  644 */
  645static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
  646					    u64 size, size_t compressed_size,
  647					    int compress_type,
  648					    struct folio *compressed_folio,
  649					    bool update_i_size)
  650{
  651	struct btrfs_drop_extents_args drop_args = { 0 };
  652	struct btrfs_root *root = inode->root;
  653	struct btrfs_fs_info *fs_info = root->fs_info;
  654	struct btrfs_trans_handle *trans;
  655	u64 data_len = (compressed_size ?: size);
  656	int ret;
  657	struct btrfs_path *path;
  658
  659	path = btrfs_alloc_path();
  660	if (!path)
  661		return -ENOMEM;
  662
  663	trans = btrfs_join_transaction(root);
  664	if (IS_ERR(trans)) {
  665		btrfs_free_path(path);
  666		return PTR_ERR(trans);
  667	}
  668	trans->block_rsv = &inode->block_rsv;
  669
  670	drop_args.path = path;
  671	drop_args.start = 0;
  672	drop_args.end = fs_info->sectorsize;
  673	drop_args.drop_cache = true;
  674	drop_args.replace_extent = true;
  675	drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
  676	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
  677	if (ret) {
  678		btrfs_abort_transaction(trans, ret);
  679		goto out;
  680	}
  681
  682	ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
  683				   size, compressed_size, compress_type,
  684				   compressed_folio, update_i_size);
  685	if (ret && ret != -ENOSPC) {
  686		btrfs_abort_transaction(trans, ret);
  687		goto out;
  688	} else if (ret == -ENOSPC) {
  689		ret = 1;
  690		goto out;
  691	}
  692
  693	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
  694	ret = btrfs_update_inode(trans, inode);
  695	if (ret && ret != -ENOSPC) {
  696		btrfs_abort_transaction(trans, ret);
  697		goto out;
  698	} else if (ret == -ENOSPC) {
  699		ret = 1;
  700		goto out;
  701	}
  702
  703	btrfs_set_inode_full_sync(inode);
  704out:
  705	/*
  706	 * Don't forget to free the reserved space, as for inlined extent
  707	 * it won't count as data extent, free them directly here.
  708	 * And at reserve time, it's always aligned to page size, so
  709	 * just free one page here.
  710	 */
  711	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
  712	btrfs_free_path(path);
  713	btrfs_end_transaction(trans);
  714	return ret;
  715}
  716
  717static noinline int cow_file_range_inline(struct btrfs_inode *inode,
  718					  struct page *locked_page,
  719					  u64 offset, u64 end,
  720					  size_t compressed_size,
  721					  int compress_type,
  722					  struct folio *compressed_folio,
  723					  bool update_i_size)
  724{
  725	struct extent_state *cached = NULL;
  726	unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
  727		EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
  728	u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1);
  729	int ret;
  730
  731	if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
  732		return 1;
  733
  734	lock_extent(&inode->io_tree, offset, end, &cached);
  735	ret = __cow_file_range_inline(inode, offset, size, compressed_size,
  736				      compress_type, compressed_folio,
  737				      update_i_size);
  738	if (ret > 0) {
  739		unlock_extent(&inode->io_tree, offset, end, &cached);
  740		return ret;
  741	}
  742
  743	if (ret == 0)
  744		locked_page = NULL;
  745
  746	extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached,
  747				     clear_flags,
  748				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
  749				     PAGE_END_WRITEBACK);
  750	return ret;
  751}
  752
/*
 * One extent queued on the extents list of an async_chunk, allocated and
 * filled in by add_async_extent().
 */
struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	/* Array of folios with @nr_folios entries. */
	struct folio **folios;
	unsigned long nr_folios;
	int compress_type;
	/* Links the extent into async_chunk::extents. */
	struct list_head list;
};
  762
/*
 * Describes one range of an inode that is handled by a work item;
 * compress_file_range() gets it back from the embedded @work.
 */
struct async_chunk {
	struct btrfs_inode *inode;
	struct page *locked_page;
	/* Range of the chunk, used as [start, end] by compress_file_range(). */
	u64 start;
	u64 end;
	blk_opf_t write_flags;
	/* List of struct async_extent, appended to by add_async_extent(). */
	struct list_head extents;
	struct cgroup_subsys_state *blkcg_css;
	struct btrfs_work work;
	/* Pointer to a struct async_cow; presumably the one whose chunks[] holds this chunk - confirm. */
	struct async_cow *async_cow;
};
  774
/* Container of async chunks, stored in a trailing flexible array. */
struct async_cow {
	/* NOTE(review): presumably counts the chunks still in use - confirm at the users. */
	atomic_t num_chunks;
	struct async_chunk chunks[];
};
  779
  780static noinline int add_async_extent(struct async_chunk *cow,
  781				     u64 start, u64 ram_size,
  782				     u64 compressed_size,
  783				     struct folio **folios,
  784				     unsigned long nr_folios,
  785				     int compress_type)
  786{
  787	struct async_extent *async_extent;
  788
  789	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
  790	if (!async_extent)
  791		return -ENOMEM;
  792	async_extent->start = start;
  793	async_extent->ram_size = ram_size;
  794	async_extent->compressed_size = compressed_size;
  795	async_extent->folios = folios;
  796	async_extent->nr_folios = nr_folios;
  797	async_extent->compress_type = compress_type;
  798	list_add_tail(&async_extent->list, &cow->extents);
  799	return 0;
  800}
  801
  802/*
  803 * Check if the inode needs to be submitted to compression, based on mount
  804 * options, defragmentation, properties or heuristics.
  805 */
  806static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
  807				      u64 end)
  808{
  809	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  810
  811	if (!btrfs_inode_can_compress(inode)) {
  812		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
  813			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
  814			btrfs_ino(inode));
  815		return 0;
  816	}
  817	/*
  818	 * Special check for subpage.
  819	 *
  820	 * We lock the full page then run each delalloc range in the page, thus
  821	 * for the following case, we will hit some subpage specific corner case:
  822	 *
  823	 * 0		32K		64K
  824	 * |	|///////|	|///////|
  825	 *		\- A		\- B
  826	 *
  827	 * In above case, both range A and range B will try to unlock the full
  828	 * page [0, 64K), causing the one finished later will have page
  829	 * unlocked already, triggering various page lock requirement BUG_ON()s.
  830	 *
  831	 * So here we add an artificial limit that subpage compression can only
  832	 * if the range is fully page aligned.
  833	 *
  834	 * In theory we only need to ensure the first page is fully covered, but
  835	 * the tailing partial page will be locked until the full compression
  836	 * finishes, delaying the write of other range.
  837	 *
  838	 * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
  839	 * first to prevent any submitted async extent to unlock the full page.
  840	 * By this, we can ensure for subpage case that only the last async_cow
  841	 * will unlock the full page.
  842	 */
  843	if (fs_info->sectorsize < PAGE_SIZE) {
  844		if (!PAGE_ALIGNED(start) ||
  845		    !PAGE_ALIGNED(end + 1))
  846			return 0;
  847	}
  848
  849	/* force compress */
  850	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
  851		return 1;
  852	/* defrag ioctl */
  853	if (inode->defrag_compress)
  854		return 1;
  855	/* bad compression ratios */
  856	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
  857		return 0;
  858	if (btrfs_test_opt(fs_info, COMPRESS) ||
  859	    inode->flags & BTRFS_INODE_COMPRESS ||
  860	    inode->prop_compress)
  861		return btrfs_compress_heuristic(inode, start, end);
  862	return 0;
  863}
  864
  865static inline void inode_should_defrag(struct btrfs_inode *inode,
  866		u64 start, u64 end, u64 num_bytes, u32 small_write)
  867{
  868	/* If this is a small write inside eof, kick off a defrag */
  869	if (num_bytes < small_write &&
  870	    (start > 0 || end + 1 < inode->disk_i_size))
  871		btrfs_add_inode_defrag(NULL, inode, small_write);
  872}
  873
  874static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
  875{
  876	unsigned long end_index = end >> PAGE_SHIFT;
  877	struct page *page;
  878	int ret = 0;
  879
  880	for (unsigned long index = start >> PAGE_SHIFT;
  881	     index <= end_index; index++) {
  882		page = find_get_page(inode->i_mapping, index);
  883		if (unlikely(!page)) {
  884			if (!ret)
  885				ret = -ENOENT;
  886			continue;
  887		}
  888		clear_page_dirty_for_io(page);
  889		put_page(page);
  890	}
  891	return ret;
  892}
  893
  894/*
 * Work queue callback to start compression on a file and pages.
  896 *
  897 * This is done inside an ordered work queue, and the compression is spread
  898 * across many cpus.  The actual IO submission is step two, and the ordered work
  899 * queue takes care of making sure that happens in the same order things were
  900 * put onto the queue by writepages and friends.
  901 *
  902 * If this code finds it can't get good compression, it puts an entry onto the
  903 * work queue to write the uncompressed bytes.  This makes sure that both
  904 * compressed inodes and uncompressed inodes are written in the same order that
  905 * the flusher thread sent them down.
  906 */
  907static void compress_file_range(struct btrfs_work *work)
  908{
  909	struct async_chunk *async_chunk =
  910		container_of(work, struct async_chunk, work);
  911	struct btrfs_inode *inode = async_chunk->inode;
  912	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  913	struct address_space *mapping = inode->vfs_inode.i_mapping;
  914	u64 blocksize = fs_info->sectorsize;
  915	u64 start = async_chunk->start;
  916	u64 end = async_chunk->end;
  917	u64 actual_end;
  918	u64 i_size;
  919	int ret = 0;
  920	struct folio **folios;
  921	unsigned long nr_folios;
  922	unsigned long total_compressed = 0;
  923	unsigned long total_in = 0;
  924	unsigned int poff;
  925	int i;
  926	int compress_type = fs_info->compress_type;
  927
  928	inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
  929
  930	/*
  931	 * We need to call clear_page_dirty_for_io on each page in the range.
  932	 * Otherwise applications with the file mmap'd can wander in and change
  933	 * the page contents while we are compressing them.
  934	 */
  935	ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
  936
  937	/*
  938	 * All the folios should have been locked thus no failure.
  939	 *
  940	 * And even if some folios are missing, btrfs_compress_folios()
  941	 * would handle them correctly, so here just do an ASSERT() check for
  942	 * early logic errors.
  943	 */
  944	ASSERT(ret == 0);
  945
  946	/*
  947	 * We need to save i_size before now because it could change in between
  948	 * us evaluating the size and assigning it.  This is because we lock and
  949	 * unlock the page in truncate and fallocate, and then modify the i_size
  950	 * later on.
  951	 *
  952	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
  953	 * does that for us.
  954	 */
  955	barrier();
  956	i_size = i_size_read(&inode->vfs_inode);
  957	barrier();
  958	actual_end = min_t(u64, i_size, end + 1);
  959again:
  960	folios = NULL;
  961	nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
  962	nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
  963
  964	/*
  965	 * we don't want to send crud past the end of i_size through
  966	 * compression, that's just a waste of CPU time.  So, if the
  967	 * end of the file is before the start of our current
  968	 * requested range of bytes, we bail out to the uncompressed
  969	 * cleanup code that can deal with all of this.
  970	 *
  971	 * It isn't really the fastest way to fix things, but this is a
  972	 * very uncommon corner.
  973	 */
  974	if (actual_end <= start)
  975		goto cleanup_and_bail_uncompressed;
  976
  977	total_compressed = actual_end - start;
  978
  979	/*
  980	 * Skip compression for a small file range(<=blocksize) that
  981	 * isn't an inline extent, since it doesn't save disk space at all.
  982	 */
  983	if (total_compressed <= blocksize &&
  984	   (start > 0 || end + 1 < inode->disk_i_size))
  985		goto cleanup_and_bail_uncompressed;
  986
  987	/*
  988	 * For subpage case, we require full page alignment for the sector
  989	 * aligned range.
  990	 * Thus we must also check against @actual_end, not just @end.
  991	 */
  992	if (blocksize < PAGE_SIZE) {
  993		if (!PAGE_ALIGNED(start) ||
  994		    !PAGE_ALIGNED(round_up(actual_end, blocksize)))
  995			goto cleanup_and_bail_uncompressed;
  996	}
  997
  998	total_compressed = min_t(unsigned long, total_compressed,
  999			BTRFS_MAX_UNCOMPRESSED);
 1000	total_in = 0;
 1001	ret = 0;
 1002
 1003	/*
 1004	 * We do compression for mount -o compress and when the inode has not
 1005	 * been flagged as NOCOMPRESS.  This flag can change at any time if we
 1006	 * discover bad compression ratios.
 1007	 */
 1008	if (!inode_need_compress(inode, start, end))
 1009		goto cleanup_and_bail_uncompressed;
 1010
 1011	folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
 1012	if (!folios) {
 1013		/*
 1014		 * Memory allocation failure is not a fatal error, we can fall
 1015		 * back to uncompressed code.
 1016		 */
 1017		goto cleanup_and_bail_uncompressed;
 1018	}
 1019
 1020	if (inode->defrag_compress)
 1021		compress_type = inode->defrag_compress;
 1022	else if (inode->prop_compress)
 1023		compress_type = inode->prop_compress;
 1024
 1025	/* Compression level is applied here. */
 1026	ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4),
 1027				    mapping, start, folios, &nr_folios, &total_in,
 1028				    &total_compressed);
 1029	if (ret)
 1030		goto mark_incompressible;
 1031
 1032	/*
 1033	 * Zero the tail end of the last page, as we might be sending it down
 1034	 * to disk.
 1035	 */
 1036	poff = offset_in_page(total_compressed);
 1037	if (poff)
 1038		folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
 1039
 1040	/*
 1041	 * Try to create an inline extent.
 1042	 *
 1043	 * If we didn't compress the entire range, try to create an uncompressed
 1044	 * inline extent, else a compressed one.
 1045	 *
 1046	 * Check cow_file_range() for why we don't even try to create inline
 1047	 * extent for the subpage case.
 1048	 */
 1049	if (total_in < actual_end)
 1050		ret = cow_file_range_inline(inode, NULL, start, end, 0,
 1051					    BTRFS_COMPRESS_NONE, NULL, false);
 1052	else
 1053		ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
 1054					    compress_type, folios[0], false);
 1055	if (ret <= 0) {
 1056		if (ret < 0)
 1057			mapping_set_error(mapping, -EIO);
 1058		goto free_pages;
 1059	}
 1060
 1061	/*
 1062	 * We aren't doing an inline extent. Round the compressed size up to a
 1063	 * block size boundary so the allocator does sane things.
 1064	 */
 1065	total_compressed = ALIGN(total_compressed, blocksize);
 1066
 1067	/*
 1068	 * One last check to make sure the compression is really a win, compare
 1069	 * the page count read with the blocks on disk, compression must free at
 1070	 * least one sector.
 1071	 */
 1072	total_in = round_up(total_in, fs_info->sectorsize);
 1073	if (total_compressed + blocksize > total_in)
 1074		goto mark_incompressible;
 1075
 1076	/*
 1077	 * The async work queues will take care of doing actual allocation on
 1078	 * disk for these compressed pages, and will submit the bios.
 1079	 */
 1080	ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
 1081			       nr_folios, compress_type);
 1082	BUG_ON(ret);
 1083	if (start + total_in < end) {
 1084		start += total_in;
 1085		cond_resched();
 1086		goto again;
 1087	}
 1088	return;
 1089
 1090mark_incompressible:
 1091	if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
 1092		inode->flags |= BTRFS_INODE_NOCOMPRESS;
 1093cleanup_and_bail_uncompressed:
 1094	ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
 1095			       BTRFS_COMPRESS_NONE);
 1096	BUG_ON(ret);
 1097free_pages:
 1098	if (folios) {
 1099		for (i = 0; i < nr_folios; i++) {
 1100			WARN_ON(folios[i]->mapping);
 1101			btrfs_free_compr_folio(folios[i]);
 1102		}
 1103		kfree(folios);
 1104	}
 1105}
 1106
 1107static void free_async_extent_pages(struct async_extent *async_extent)
 1108{
 1109	int i;
 1110
 1111	if (!async_extent->folios)
 1112		return;
 1113
 1114	for (i = 0; i < async_extent->nr_folios; i++) {
 1115		WARN_ON(async_extent->folios[i]->mapping);
 1116		btrfs_free_compr_folio(async_extent->folios[i]);
 1117	}
 1118	kfree(async_extent->folios);
 1119	async_extent->nr_folios = 0;
 1120	async_extent->folios = NULL;
 1121}
 1122
 1123static void submit_uncompressed_range(struct btrfs_inode *inode,
 1124				      struct async_extent *async_extent,
 1125				      struct page *locked_page)
 1126{
 1127	u64 start = async_extent->start;
 1128	u64 end = async_extent->start + async_extent->ram_size - 1;
 1129	int ret;
 1130	struct writeback_control wbc = {
 1131		.sync_mode		= WB_SYNC_ALL,
 1132		.range_start		= start,
 1133		.range_end		= end,
 1134		.no_cgroup_owner	= 1,
 1135	};
 1136
 1137	wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
 1138	ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
 1139	wbc_detach_inode(&wbc);
 1140	if (ret < 0) {
 1141		btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
 1142		if (locked_page) {
 1143			const u64 page_start = page_offset(locked_page);
 1144
 1145			set_page_writeback(locked_page);
 1146			end_page_writeback(locked_page);
 1147			btrfs_mark_ordered_io_finished(inode, locked_page,
 1148						       page_start, PAGE_SIZE,
 1149						       !ret);
 1150			mapping_set_error(locked_page->mapping, ret);
 1151			unlock_page(locked_page);
 1152		}
 1153	}
 1154}
 1155
 1156static void submit_one_async_extent(struct async_chunk *async_chunk,
 1157				    struct async_extent *async_extent,
 1158				    u64 *alloc_hint)
 1159{
 1160	struct btrfs_inode *inode = async_chunk->inode;
 1161	struct extent_io_tree *io_tree = &inode->io_tree;
 1162	struct btrfs_root *root = inode->root;
 1163	struct btrfs_fs_info *fs_info = root->fs_info;
 1164	struct btrfs_ordered_extent *ordered;
 1165	struct btrfs_file_extent file_extent;
 1166	struct btrfs_key ins;
 1167	struct page *locked_page = NULL;
 1168	struct extent_state *cached = NULL;
 1169	struct extent_map *em;
 1170	int ret = 0;
 1171	u64 start = async_extent->start;
 1172	u64 end = async_extent->start + async_extent->ram_size - 1;
 1173
 1174	if (async_chunk->blkcg_css)
 1175		kthread_associate_blkcg(async_chunk->blkcg_css);
 1176
 1177	/*
 1178	 * If async_chunk->locked_page is in the async_extent range, we need to
 1179	 * handle it.
 1180	 */
 1181	if (async_chunk->locked_page) {
 1182		u64 locked_page_start = page_offset(async_chunk->locked_page);
 1183		u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
 1184
 1185		if (!(start >= locked_page_end || end <= locked_page_start))
 1186			locked_page = async_chunk->locked_page;
 1187	}
 1188
 1189	if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
 1190		submit_uncompressed_range(inode, async_extent, locked_page);
 1191		goto done;
 1192	}
 1193
 1194	ret = btrfs_reserve_extent(root, async_extent->ram_size,
 1195				   async_extent->compressed_size,
 1196				   async_extent->compressed_size,
 1197				   0, *alloc_hint, &ins, 1, 1);
 1198	if (ret) {
 1199		/*
 1200		 * We can't reserve contiguous space for the compressed size.
 1201		 * Unlikely, but it's possible that we could have enough
 1202		 * non-contiguous space for the uncompressed size instead.  So
 1203		 * fall back to uncompressed.
 1204		 */
 1205		submit_uncompressed_range(inode, async_extent, locked_page);
 1206		goto done;
 1207	}
 1208
 1209	lock_extent(io_tree, start, end, &cached);
 1210
 1211	/* Here we're doing allocation and writeback of the compressed pages */
 1212	file_extent.disk_bytenr = ins.objectid;
 1213	file_extent.disk_num_bytes = ins.offset;
 1214	file_extent.ram_bytes = async_extent->ram_size;
 1215	file_extent.num_bytes = async_extent->ram_size;
 1216	file_extent.offset = 0;
 1217	file_extent.compression = async_extent->compress_type;
 1218
 1219	em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
 1220	if (IS_ERR(em)) {
 1221		ret = PTR_ERR(em);
 1222		goto out_free_reserve;
 1223	}
 1224	free_extent_map(em);
 1225
 1226	ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
 1227					     1 << BTRFS_ORDERED_COMPRESSED);
 1228	if (IS_ERR(ordered)) {
 1229		btrfs_drop_extent_map_range(inode, start, end, false);
 1230		ret = PTR_ERR(ordered);
 1231		goto out_free_reserve;
 1232	}
 1233	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 1234
 1235	/* Clear dirty, set writeback and unlock the pages. */
 1236	extent_clear_unlock_delalloc(inode, start, end,
 1237			NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
 1238			PAGE_UNLOCK | PAGE_START_WRITEBACK);
 1239	btrfs_submit_compressed_write(ordered,
 1240			    async_extent->folios,	/* compressed_folios */
 1241			    async_extent->nr_folios,
 1242			    async_chunk->write_flags, true);
 1243	*alloc_hint = ins.objectid + ins.offset;
 1244done:
 1245	if (async_chunk->blkcg_css)
 1246		kthread_associate_blkcg(NULL);
 1247	kfree(async_extent);
 1248	return;
 1249
 1250out_free_reserve:
 1251	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 1252	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
 1253	mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
 1254	extent_clear_unlock_delalloc(inode, start, end,
 1255				     NULL, &cached,
 1256				     EXTENT_LOCKED | EXTENT_DELALLOC |
 1257				     EXTENT_DELALLOC_NEW |
 1258				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
 1259				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
 1260				     PAGE_END_WRITEBACK);
 1261	free_async_extent_pages(async_extent);
 1262	if (async_chunk->blkcg_css)
 1263		kthread_associate_blkcg(NULL);
 1264	btrfs_debug(fs_info,
 1265"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
 1266		    btrfs_root_id(root), btrfs_ino(inode), start,
 1267		    async_extent->ram_size, ret);
 1268	kfree(async_extent);
 1269}
 1270
 1271u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
 1272				     u64 num_bytes)
 1273{
 1274	struct extent_map_tree *em_tree = &inode->extent_tree;
 1275	struct extent_map *em;
 1276	u64 alloc_hint = 0;
 1277
 1278	read_lock(&em_tree->lock);
 1279	em = search_extent_mapping(em_tree, start, num_bytes);
 1280	if (em) {
 1281		/*
 1282		 * if block start isn't an actual block number then find the
 1283		 * first block in this inode and use that as a hint.  If that
 1284		 * block is also bogus then just don't worry about it.
 1285		 */
 1286		if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
 1287			free_extent_map(em);
 1288			em = search_extent_mapping(em_tree, 0, 0);
 1289			if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
 1290				alloc_hint = extent_map_block_start(em);
 1291			if (em)
 1292				free_extent_map(em);
 1293		} else {
 1294			alloc_hint = extent_map_block_start(em);
 1295			free_extent_map(em);
 1296		}
 1297	}
 1298	read_unlock(&em_tree->lock);
 1299
 1300	return alloc_hint;
 1301}
 1302
 1303/*
 1304 * when extent_io.c finds a delayed allocation range in the file,
 1305 * the call backs end up in this code.  The basic idea is to
 1306 * allocate extents on disk for the range, and create ordered data structs
 1307 * in ram to track those extents.
 1308 *
 1309 * locked_page is the page that writepage had locked already.  We use
 1310 * it to make sure we don't do extra locks or unlocks.
 1311 *
 1312 * When this function fails, it unlocks all pages except @locked_page.
 1313 *
 1314 * When this function successfully creates an inline extent, it returns 1 and
 1315 * unlocks all pages including locked_page and starts I/O on them.
 1316 * (In reality inline extents are limited to a single page, so locked_page is
 1317 * the only page handled anyway).
 1318 *
 1319 * When this function succeed and creates a normal extent, the page locking
 1320 * status depends on the passed in flags:
 1321 *
 1322 * - If @keep_locked is set, all pages are kept locked.
 1323 * - Else all pages except for @locked_page are unlocked.
 1324 *
 1325 * When a failure happens in the second or later iteration of the
 1326 * while-loop, the ordered extents created in previous iterations are kept
 1327 * intact. So, the caller must clean them up by calling
 1328 * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
 1329 * example.
 1330 */
 1331static noinline int cow_file_range(struct btrfs_inode *inode,
 1332				   struct page *locked_page, u64 start, u64 end,
 1333				   u64 *done_offset,
 1334				   bool keep_locked, bool no_inline)
 1335{
 1336	struct btrfs_root *root = inode->root;
 1337	struct btrfs_fs_info *fs_info = root->fs_info;
 1338	struct extent_state *cached = NULL;
 1339	u64 alloc_hint = 0;
 1340	u64 orig_start = start;
 1341	u64 num_bytes;
 1342	unsigned long ram_size;
 1343	u64 cur_alloc_size = 0;
 1344	u64 min_alloc_size;
 1345	u64 blocksize = fs_info->sectorsize;
 1346	struct btrfs_key ins;
 1347	struct extent_map *em;
 1348	unsigned clear_bits;
 1349	unsigned long page_ops;
 1350	bool extent_reserved = false;
 1351	int ret = 0;
 1352
 1353	if (btrfs_is_free_space_inode(inode)) {
 1354		ret = -EINVAL;
 1355		goto out_unlock;
 1356	}
 1357
 1358	num_bytes = ALIGN(end - start + 1, blocksize);
 1359	num_bytes = max(blocksize,  num_bytes);
 1360	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
 1361
 1362	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
 1363
 1364	if (!no_inline) {
 1365		/* lets try to make an inline extent */
 1366		ret = cow_file_range_inline(inode, locked_page, start, end, 0,
 1367					    BTRFS_COMPRESS_NONE, NULL, false);
 1368		if (ret <= 0) {
 1369			/*
 1370			 * We succeeded, return 1 so the caller knows we're done
 1371			 * with this page and already handled the IO.
 1372			 *
 1373			 * If there was an error then cow_file_range_inline() has
 1374			 * already done the cleanup.
 1375			 */
 1376			if (ret == 0)
 1377				ret = 1;
 1378			goto done;
 1379		}
 1380	}
 1381
 1382	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
 1383
 1384	/*
 1385	 * Relocation relies on the relocated extents to have exactly the same
 1386	 * size as the original extents. Normally writeback for relocation data
 1387	 * extents follows a NOCOW path because relocation preallocates the
 1388	 * extents. However, due to an operation such as scrub turning a block
 1389	 * group to RO mode, it may fallback to COW mode, so we must make sure
 1390	 * an extent allocated during COW has exactly the requested size and can
 1391	 * not be split into smaller extents, otherwise relocation breaks and
 1392	 * fails during the stage where it updates the bytenr of file extent
 1393	 * items.
 1394	 */
 1395	if (btrfs_is_data_reloc_root(root))
 1396		min_alloc_size = num_bytes;
 1397	else
 1398		min_alloc_size = fs_info->sectorsize;
 1399
 1400	while (num_bytes > 0) {
 1401		struct btrfs_ordered_extent *ordered;
 1402		struct btrfs_file_extent file_extent;
 1403
 1404		cur_alloc_size = num_bytes;
 1405		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
 1406					   min_alloc_size, 0, alloc_hint,
 1407					   &ins, 1, 1);
 1408		if (ret == -EAGAIN) {
 1409			/*
 1410			 * btrfs_reserve_extent only returns -EAGAIN for zoned
 1411			 * file systems, which is an indication that there are
 1412			 * no active zones to allocate from at the moment.
 1413			 *
 1414			 * If this is the first loop iteration, wait for at
 1415			 * least one zone to finish before retrying the
 1416			 * allocation.  Otherwise ask the caller to write out
 1417			 * the already allocated blocks before coming back to
 1418			 * us, or return -ENOSPC if it can't handle retries.
 1419			 */
 1420			ASSERT(btrfs_is_zoned(fs_info));
 1421			if (start == orig_start) {
 1422				wait_on_bit_io(&inode->root->fs_info->flags,
 1423					       BTRFS_FS_NEED_ZONE_FINISH,
 1424					       TASK_UNINTERRUPTIBLE);
 1425				continue;
 1426			}
 1427			if (done_offset) {
 1428				*done_offset = start - 1;
 1429				return 0;
 1430			}
 1431			ret = -ENOSPC;
 1432		}
 1433		if (ret < 0)
 1434			goto out_unlock;
 1435		cur_alloc_size = ins.offset;
 1436		extent_reserved = true;
 1437
 1438		ram_size = ins.offset;
 1439		file_extent.disk_bytenr = ins.objectid;
 1440		file_extent.disk_num_bytes = ins.offset;
 1441		file_extent.num_bytes = ins.offset;
 1442		file_extent.ram_bytes = ins.offset;
 1443		file_extent.offset = 0;
 1444		file_extent.compression = BTRFS_COMPRESS_NONE;
 1445
 1446		lock_extent(&inode->io_tree, start, start + ram_size - 1,
 1447			    &cached);
 1448
 1449		em = btrfs_create_io_em(inode, start, &file_extent,
 1450					BTRFS_ORDERED_REGULAR);
 1451		if (IS_ERR(em)) {
 1452			unlock_extent(&inode->io_tree, start,
 1453				      start + ram_size - 1, &cached);
 1454			ret = PTR_ERR(em);
 1455			goto out_reserve;
 1456		}
 1457		free_extent_map(em);
 1458
 1459		ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
 1460						     1 << BTRFS_ORDERED_REGULAR);
 1461		if (IS_ERR(ordered)) {
 1462			unlock_extent(&inode->io_tree, start,
 1463				      start + ram_size - 1, &cached);
 1464			ret = PTR_ERR(ordered);
 1465			goto out_drop_extent_cache;
 1466		}
 1467
 1468		if (btrfs_is_data_reloc_root(root)) {
 1469			ret = btrfs_reloc_clone_csums(ordered);
 1470
 1471			/*
 1472			 * Only drop cache here, and process as normal.
 1473			 *
 1474			 * We must not allow extent_clear_unlock_delalloc()
 1475			 * at out_unlock label to free meta of this ordered
 1476			 * extent, as its meta should be freed by
 1477			 * btrfs_finish_ordered_io().
 1478			 *
 1479			 * So we must continue until @start is increased to
 1480			 * skip current ordered extent.
 1481			 */
 1482			if (ret)
 1483				btrfs_drop_extent_map_range(inode, start,
 1484							    start + ram_size - 1,
 1485							    false);
 1486		}
 1487		btrfs_put_ordered_extent(ordered);
 1488
 1489		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 1490
 1491		/*
 1492		 * We're not doing compressed IO, don't unlock the first page
 1493		 * (which the caller expects to stay locked), don't clear any
 1494		 * dirty bits and don't set any writeback bits
 1495		 *
 1496		 * Do set the Ordered (Private2) bit so we know this page was
 1497		 * properly setup for writepage.
 1498		 */
 1499		page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
 1500		page_ops |= PAGE_SET_ORDERED;
 1501
 1502		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
 1503					     locked_page, &cached,
 1504					     EXTENT_LOCKED | EXTENT_DELALLOC,
 1505					     page_ops);
 1506		if (num_bytes < cur_alloc_size)
 1507			num_bytes = 0;
 1508		else
 1509			num_bytes -= cur_alloc_size;
 1510		alloc_hint = ins.objectid + ins.offset;
 1511		start += cur_alloc_size;
 1512		extent_reserved = false;
 1513
 1514		/*
 1515		 * btrfs_reloc_clone_csums() error, since start is increased
 1516		 * extent_clear_unlock_delalloc() at out_unlock label won't
 1517		 * free metadata of current ordered extent, we're OK to exit.
 1518		 */
 1519		if (ret)
 1520			goto out_unlock;
 1521	}
 1522done:
 1523	if (done_offset)
 1524		*done_offset = end;
 1525	return ret;
 1526
 1527out_drop_extent_cache:
 1528	btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
 1529out_reserve:
 1530	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 1531	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
 1532out_unlock:
 1533	/*
 1534	 * Now, we have three regions to clean up:
 1535	 *
 1536	 * |-------(1)----|---(2)---|-------------(3)----------|
 1537	 * `- orig_start  `- start  `- start + cur_alloc_size  `- end
 1538	 *
 1539	 * We process each region below.
 1540	 */
 1541
 1542	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
 1543		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
 1544	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
 1545
 1546	/*
 1547	 * For the range (1). We have already instantiated the ordered extents
 1548	 * for this region. They are cleaned up by
 1549	 * btrfs_cleanup_ordered_extents() in e.g,
 1550	 * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
 1551	 * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
 1552	 * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
 1553	 * function.
 1554	 *
 1555	 * However, in case of @keep_locked, we still need to unlock the pages
 1556	 * (except @locked_page) to ensure all the pages are unlocked.
 1557	 */
 1558	if (keep_locked && orig_start < start) {
 1559		if (!locked_page)
 1560			mapping_set_error(inode->vfs_inode.i_mapping, ret);
 1561		extent_clear_unlock_delalloc(inode, orig_start, start - 1,
 1562					     locked_page, NULL, 0, page_ops);
 1563	}
 1564
 1565	/*
 1566	 * At this point we're unlocked, we want to make sure we're only
 1567	 * clearing these flags under the extent lock, so lock the rest of the
 1568	 * range and clear everything up.
 1569	 */
 1570	lock_extent(&inode->io_tree, start, end, NULL);
 1571
 1572	/*
 1573	 * For the range (2). If we reserved an extent for our delalloc range
 1574	 * (or a subrange) and failed to create the respective ordered extent,
 1575	 * then it means that when we reserved the extent we decremented the
 1576	 * extent's size from the data space_info's bytes_may_use counter and
 1577	 * incremented the space_info's bytes_reserved counter by the same
 1578	 * amount. We must make sure extent_clear_unlock_delalloc() does not try
 1579	 * to decrement again the data space_info's bytes_may_use counter,
 1580	 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
 1581	 */
 1582	if (extent_reserved) {
 1583		extent_clear_unlock_delalloc(inode, start,
 1584					     start + cur_alloc_size - 1,
 1585					     locked_page, &cached,
 1586					     clear_bits,
 1587					     page_ops);
 1588		btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
 1589		start += cur_alloc_size;
 1590	}
 1591
 1592	/*
 1593	 * For the range (3). We never touched the region. In addition to the
 1594	 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
 1595	 * space_info's bytes_may_use counter, reserved in
 1596	 * btrfs_check_data_free_space().
 1597	 */
 1598	if (start < end) {
 1599		clear_bits |= EXTENT_CLEAR_DATA_RESV;
 1600		extent_clear_unlock_delalloc(inode, start, end, locked_page,
 1601					     &cached, clear_bits, page_ops);
 1602		btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
 1603	}
 1604	return ret;
 1605}
 1606
 1607/*
 1608 * Phase two of compressed writeback.  This is the ordered portion of the code,
 1609 * which only gets called in the order the work was queued.  We walk all the
 1610 * async extents created by compress_file_range and send them down to the disk.
 1611 *
 1612 * If called with @do_free == true then it'll try to finish the work and free
 1613 * the work struct eventually.
 1614 */
 1615static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
 1616{
 1617	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
 1618						     work);
 1619	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
 1620	struct async_extent *async_extent;
 1621	unsigned long nr_pages;
 1622	u64 alloc_hint = 0;
 1623
 1624	if (do_free) {
 1625		struct async_cow *async_cow;
 1626
 1627		btrfs_add_delayed_iput(async_chunk->inode);
 1628		if (async_chunk->blkcg_css)
 1629			css_put(async_chunk->blkcg_css);
 1630
 1631		async_cow = async_chunk->async_cow;
 1632		if (atomic_dec_and_test(&async_cow->num_chunks))
 1633			kvfree(async_cow);
 1634		return;
 1635	}
 1636
 1637	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
 1638		PAGE_SHIFT;
 1639
 1640	while (!list_empty(&async_chunk->extents)) {
 1641		async_extent = list_entry(async_chunk->extents.next,
 1642					  struct async_extent, list);
 1643		list_del(&async_extent->list);
 1644		submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
 1645	}
 1646
 1647	/* atomic_sub_return implies a barrier */
 1648	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
 1649	    5 * SZ_1M)
 1650		cond_wake_up_nomb(&fs_info->async_submit_wait);
 1651}
 1652
 1653static bool run_delalloc_compressed(struct btrfs_inode *inode,
 1654				    struct page *locked_page, u64 start,
 1655				    u64 end, struct writeback_control *wbc)
 1656{
 1657	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 1658	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
 1659	struct async_cow *ctx;
 1660	struct async_chunk *async_chunk;
 1661	unsigned long nr_pages;
 1662	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
 1663	int i;
 1664	unsigned nofs_flag;
 1665	const blk_opf_t write_flags = wbc_to_write_flags(wbc);
 1666
 1667	nofs_flag = memalloc_nofs_save();
 1668	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
 1669	memalloc_nofs_restore(nofs_flag);
 1670	if (!ctx)
 1671		return false;
 1672
 1673	set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
 1674
 1675	async_chunk = ctx->chunks;
 1676	atomic_set(&ctx->num_chunks, num_chunks);
 1677
 1678	for (i = 0; i < num_chunks; i++) {
 1679		u64 cur_end = min(end, start + SZ_512K - 1);
 1680
 1681		/*
 1682		 * igrab is called higher up in the call chain, take only the
 1683		 * lightweight reference for the callback lifetime
 1684		 */
 1685		ihold(&inode->vfs_inode);
 1686		async_chunk[i].async_cow = ctx;
 1687		async_chunk[i].inode = inode;
 1688		async_chunk[i].start = start;
 1689		async_chunk[i].end = cur_end;
 1690		async_chunk[i].write_flags = write_flags;
 1691		INIT_LIST_HEAD(&async_chunk[i].extents);
 1692
 1693		/*
 1694		 * The locked_page comes all the way from writepage and its
 1695		 * the original page we were actually given.  As we spread
 1696		 * this large delalloc region across multiple async_chunk
 1697		 * structs, only the first struct needs a pointer to locked_page
 1698		 *
 1699		 * This way we don't need racey decisions about who is supposed
 1700		 * to unlock it.
 1701		 */
 1702		if (locked_page) {
 1703			/*
 1704			 * Depending on the compressibility, the pages might or
 1705			 * might not go through async.  We want all of them to
 1706			 * be accounted against wbc once.  Let's do it here
 1707			 * before the paths diverge.  wbc accounting is used
 1708			 * only for foreign writeback detection and doesn't
 1709			 * need full accuracy.  Just account the whole thing
 1710			 * against the first page.
 1711			 */
 1712			wbc_account_cgroup_owner(wbc, locked_page,
 1713						 cur_end - start);
 1714			async_chunk[i].locked_page = locked_page;
 1715			locked_page = NULL;
 1716		} else {
 1717			async_chunk[i].locked_page = NULL;
 1718		}
 1719
 1720		if (blkcg_css != blkcg_root_css) {
 1721			css_get(blkcg_css);
 1722			async_chunk[i].blkcg_css = blkcg_css;
 1723			async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
 1724		} else {
 1725			async_chunk[i].blkcg_css = NULL;
 1726		}
 1727
 1728		btrfs_init_work(&async_chunk[i].work, compress_file_range,
 1729				submit_compressed_extents);
 1730
 1731		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
 1732		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
 1733
 1734		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
 1735
 1736		start = cur_end + 1;
 1737	}
 1738	return true;
 1739}
 1740
 1741/*
 1742 * Run the delalloc range from start to end, and write back any dirty pages
 1743 * covered by the range.
 1744 */
 1745static noinline int run_delalloc_cow(struct btrfs_inode *inode,
 1746				     struct page *locked_page, u64 start,
 1747				     u64 end, struct writeback_control *wbc,
 1748				     bool pages_dirty)
 1749{
 1750	u64 done_offset = end;
 1751	int ret;
 1752
 1753	while (start <= end) {
 1754		ret = cow_file_range(inode, locked_page, start, end, &done_offset,
 1755				     true, false);
 1756		if (ret)
 1757			return ret;
 1758		extent_write_locked_range(&inode->vfs_inode, locked_page, start,
 1759					  done_offset, wbc, pages_dirty);
 1760		start = done_offset + 1;
 1761	}
 1762
 1763	return 1;
 1764}
 1765
 1766static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
 1767			   const u64 start, const u64 end)
 1768{
 1769	const bool is_space_ino = btrfs_is_free_space_inode(inode);
 1770	const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
 1771	const u64 range_bytes = end + 1 - start;
 1772	struct extent_io_tree *io_tree = &inode->io_tree;
 1773	struct extent_state *cached_state = NULL;
 1774	u64 range_start = start;
 1775	u64 count;
 1776	int ret;
 1777
 1778	/*
 1779	 * If EXTENT_NORESERVE is set it means that when the buffered write was
 1780	 * made we had not enough available data space and therefore we did not
 1781	 * reserve data space for it, since we though we could do NOCOW for the
 1782	 * respective file range (either there is prealloc extent or the inode
 1783	 * has the NOCOW bit set).
 1784	 *
 1785	 * However when we need to fallback to COW mode (because for example the
 1786	 * block group for the corresponding extent was turned to RO mode by a
 1787	 * scrub or relocation) we need to do the following:
 1788	 *
 1789	 * 1) We increment the bytes_may_use counter of the data space info.
 1790	 *    If COW succeeds, it allocates a new data extent and after doing
 1791	 *    that it decrements the space info's bytes_may_use counter and
 1792	 *    increments its bytes_reserved counter by the same amount (we do
 1793	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
 1794	 *    bytes_may_use counter to compensate (when space is reserved at
 1795	 *    buffered write time, the bytes_may_use counter is incremented);
 1796	 *
 1797	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
 1798	 *    that if the COW path fails for any reason, it decrements (through
 1799	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
 1800	 *    data space info, which we incremented in the step above.
 1801	 *
 1802	 * If we need to fallback to cow and the inode corresponds to a free
 1803	 * space cache inode or an inode of the data relocation tree, we must
 1804	 * also increment bytes_may_use of the data space_info for the same
 1805	 * reason. Space caches and relocated data extents always get a prealloc
 1806	 * extent for them, however scrub or balance may have set the block
 1807	 * group that contains that extent to RO mode and therefore force COW
 1808	 * when starting writeback.
 1809	 */
 1810	lock_extent(io_tree, start, end, &cached_state);
 1811	count = count_range_bits(io_tree, &range_start, end, range_bytes,
 1812				 EXTENT_NORESERVE, 0, NULL);
 1813	if (count > 0 || is_space_ino || is_reloc_ino) {
 1814		u64 bytes = count;
 1815		struct btrfs_fs_info *fs_info = inode->root->fs_info;
 1816		struct btrfs_space_info *sinfo = fs_info->data_sinfo;
 1817
 1818		if (is_space_ino || is_reloc_ino)
 1819			bytes = range_bytes;
 1820
 1821		spin_lock(&sinfo->lock);
 1822		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
 1823		spin_unlock(&sinfo->lock);
 1824
 1825		if (count > 0)
 1826			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
 1827					 NULL);
 1828	}
 1829	unlock_extent(io_tree, start, end, &cached_state);
 1830
 1831	/*
 1832	 * Don't try to create inline extents, as a mix of inline extent that
 1833	 * is written out and unlocked directly and a normal NOCOW extent
 1834	 * doesn't work.
 1835	 */
 1836	ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
 1837	ASSERT(ret != 1);
 1838	return ret;
 1839}
 1840
 1841struct can_nocow_file_extent_args {
 1842	/* Input fields. */
 1843
 1844	/* Start file offset of the range we want to NOCOW. */
 1845	u64 start;
 1846	/* End file offset (inclusive) of the range we want to NOCOW. */
 1847	u64 end;
 1848	bool writeback_path;
 1849	bool strict;
 1850	/*
 1851	 * Free the path passed to can_nocow_file_extent() once it's not needed
 1852	 * anymore.
 1853	 */
 1854	bool free_path;
 1855
 1856	/*
 1857	 * Output fields. Only set when can_nocow_file_extent() returns 1.
 1858	 * The expected file extent for the NOCOW write.
 1859	 */
 1860	struct btrfs_file_extent file_extent;
 1861};
 1862
 1863/*
 1864 * Check if we can NOCOW the file extent that the path points to.
 1865 * This function may return with the path released, so the caller should check
 1866 * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
 1867 *
 1868 * Returns: < 0 on error
 1869 *            0 if we can not NOCOW
 1870 *            1 if we can NOCOW
 1871 */
 1872static int can_nocow_file_extent(struct btrfs_path *path,
 1873				 struct btrfs_key *key,
 1874				 struct btrfs_inode *inode,
 1875				 struct can_nocow_file_extent_args *args)
 1876{
 1877	const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
 1878	struct extent_buffer *leaf = path->nodes[0];
 1879	struct btrfs_root *root = inode->root;
 1880	struct btrfs_file_extent_item *fi;
 1881	struct btrfs_root *csum_root;
 1882	u64 io_start;
 1883	u64 extent_end;
 1884	u8 extent_type;
 1885	int can_nocow = 0;
 1886	int ret = 0;
 1887	bool nowait = path->nowait;
 1888
 1889	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
 1890	extent_type = btrfs_file_extent_type(leaf, fi);
 1891
 1892	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
 1893		goto out;
 1894
 1895	if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
 1896	    extent_type == BTRFS_FILE_EXTENT_REG)
 1897		goto out;
 1898
 1899	/*
 1900	 * If the extent was created before the generation where the last snapshot
 1901	 * for its subvolume was created, then this implies the extent is shared,
 1902	 * hence we must COW.
 1903	 */
 1904	if (!args->strict &&
 1905	    btrfs_file_extent_generation(leaf, fi) <=
 1906	    btrfs_root_last_snapshot(&root->root_item))
 1907		goto out;
 1908
 1909	/* An explicit hole, must COW. */
 1910	if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
 1911		goto out;
 1912
 1913	/* Compressed/encrypted/encoded extents must be COWed. */
 1914	if (btrfs_file_extent_compression(leaf, fi) ||
 1915	    btrfs_file_extent_encryption(leaf, fi) ||
 1916	    btrfs_file_extent_other_encoding(leaf, fi))
 1917		goto out;
 1918
 1919	extent_end = btrfs_file_extent_end(path);
 1920
 1921	args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 1922	args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 1923	args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
 1924	args->file_extent.offset = btrfs_file_extent_offset(leaf, fi);
 1925	args->file_extent.compression = btrfs_file_extent_compression(leaf, fi);
 1926
 1927	/*
 1928	 * The following checks can be expensive, as they need to take other
 1929	 * locks and do btree or rbtree searches, so release the path to avoid
 1930	 * blocking other tasks for too long.
 1931	 */
 1932	btrfs_release_path(path);
 1933
 1934	ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
 1935				    key->offset - args->file_extent.offset,
 1936				    args->file_extent.disk_bytenr, args->strict, path);
 1937	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
 1938	if (ret != 0)
 1939		goto out;
 1940
 1941	if (args->free_path) {
 1942		/*
 1943		 * We don't need the path anymore, plus through the
 1944		 * btrfs_lookup_csums_list() call below we will end up allocating
 1945		 * another path. So free the path to avoid unnecessary extra
 1946		 * memory usage.
 1947		 */
 1948		btrfs_free_path(path);
 1949		path = NULL;
 1950	}
 1951
 1952	/* If there are pending snapshots for this root, we must COW. */
 1953	if (args->writeback_path && !is_freespace_inode &&
 1954	    atomic_read(&root->snapshot_force_cow))
 1955		goto out;
 1956
 1957	args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start;
 1958	args->file_extent.offset += args->start - key->offset;
 1959	io_start = args->file_extent.disk_bytenr + args->file_extent.offset;
 1960
 1961	/*
 1962	 * Force COW if csums exist in the range. This ensures that csums for a
 1963	 * given extent are either valid or do not exist.
 1964	 */
 1965
 1966	csum_root = btrfs_csum_root(root->fs_info, io_start);
 1967	ret = btrfs_lookup_csums_list(csum_root, io_start,
 1968				      io_start + args->file_extent.num_bytes - 1,
 1969				      NULL, nowait);
 1970	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
 1971	if (ret != 0)
 1972		goto out;
 1973
 1974	can_nocow = 1;
 1975 out:
 1976	if (args->free_path && path)
 1977		btrfs_free_path(path);
 1978
 1979	return ret < 0 ? ret : can_nocow;
 1980}
 1981
/*
 * Writeback callback for ranges that may be written NOCOW.  This checks for
 * snapshots or COW copies of the extents that exist in the file, and COWs the
 * file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk.
 *
 * The range [start, end] (end is inclusive) is walked one file extent item at
 * a time.  Parts accepted by can_nocow_file_extent() get an ordered extent on
 * top of the existing extent; all other parts are collected in cow_start and
 * passed to fallback_to_cow().
 *
 * Returns 0 on success, otherwise the error of the step that failed.
 */
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
				       struct page *locked_page,
				       const u64 start, const u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	/* Start of the pending range that must be COWed, (u64)-1 if none. */
	u64 cow_start = (u64)-1;
	u64 cur_offset = start;
	int ret;
	/* Only the very first search may step back to the previous slot. */
	bool check_prev = true;
	u64 ino = btrfs_ino(inode);
	struct can_nocow_file_extent_args nocow_args = { 0 };

	/*
	 * Normally on a zoned device we're only doing COW writes, but
	 * relocation on a zoned filesystem serializes I/O so that it only
	 * writes sequentially, and it can end up here as well.
	 */
	ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	nocow_args.end = end;
	nocow_args.writeback_path = true;

	while (cur_offset <= end) {
		struct btrfs_block_group *nocow_bg = NULL;
		struct btrfs_ordered_extent *ordered;
		struct btrfs_key found_key;
		struct btrfs_file_extent_item *fi;
		struct extent_buffer *leaf;
		struct extent_state *cached_state = NULL;
		u64 extent_end;
		u64 nocow_end;
		int extent_type;
		bool is_prealloc;

		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
					       cur_offset, 0);
		if (ret < 0)
			goto error;

		/*
		 * If there is no extent for our range when doing the initial
		 * search, then go back to the previous slot as it will be the
		 * one containing the search offset
		 */
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = false;
next_slot:
		/* Go to next leaf if we have exhausted the current one */
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto error;
			/* No more leaves: the rest of the range is handled after the loop. */
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* Didn't find anything for our INO */
		if (found_key.objectid > ino)
			break;
		/*
		 * Keep searching until we find an EXTENT_DATA item or there
		 * are no more extents for this inode
		 */
		if (WARN_ON_ONCE(found_key.objectid < ino) ||
		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
			path->slots[0]++;
			goto next_slot;
		}

		/* Found key is not EXTENT_DATA_KEY or starts after req range */
		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		/*
		 * If the found extent starts after requested offset, then
		 * adjust extent_end to be right before this extent begins.
		 * The gap [cur_offset, found_key.offset) has no extent item and
		 * is queued for COW.  extent_type is not read on the must_cow
		 * path; presumably it is set only to keep it initialized.
		 */
		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto must_cow;
		}

		/*
		 * Found extent which begins before our range and potentially
		 * intersect it
		 */
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);
		/* If this is triggered then we have a memory corruption. */
		ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
		if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
			ret = -EUCLEAN;
			goto error;
		}
		extent_end = btrfs_file_extent_end(path);

		/*
		 * If the extent we got ends before our current offset, skip to
		 * the next extent.
		 */
		if (extent_end <= cur_offset) {
			path->slots[0]++;
			goto next_slot;
		}

		/*
		 * Note that can_nocow_file_extent() may return with the path
		 * released, which is why the must_cow code below checks
		 * path->nodes[0] before touching the slot.
		 */
		nocow_args.start = cur_offset;
		ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
		if (ret < 0)
			goto error;
		if (ret == 0)
			goto must_cow;

		/*
		 * ret was 1 (NOCOW possible).  Reset it so that the "if (ret)"
		 * check at the end of this iteration only sees a failure of
		 * btrfs_reloc_clone_csums().
		 */
		ret = 0;
		nocow_bg = btrfs_inc_nocow_writers(fs_info,
				nocow_args.file_extent.disk_bytenr +
				nocow_args.file_extent.offset);
		if (!nocow_bg) {
must_cow:
			/*
			 * If we can't perform NOCOW writeback for the range,
			 * then record the beginning of the range that needs to
			 * be COWed.  It will be written out before the next
			 * NOCOW range if we find one, or when exiting this
			 * loop.
			 */
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			/* Path was released: search again from the new cur_offset. */
			if (!path->nodes[0])
				continue;
			path->slots[0]++;
			goto next_slot;
		}

		/*
		 * COW range from cow_start to found_key.offset - 1. As the key
		 * will contain the beginning of the first extent that can be
		 * NOCOW, following one which needs to be COW'ed
		 */
		if (cow_start != (u64)-1) {
			ret = fallback_to_cow(inode, locked_page,
					      cow_start, found_key.offset - 1);
			/*
			 * Reset before checking ret, so that on failure the
			 * error path does not rewind cur_offset to the range
			 * that was just passed to fallback_to_cow().
			 */
			cow_start = (u64)-1;
			if (ret) {
				btrfs_dec_nocow_writers(nocow_bg);
				goto error;
			}
		}

		nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
		lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);

		is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
		if (is_prealloc) {
			struct extent_map *em;

			em = btrfs_create_io_em(inode, cur_offset,
						&nocow_args.file_extent,
						BTRFS_ORDERED_PREALLOC);
			if (IS_ERR(em)) {
				unlock_extent(&inode->io_tree, cur_offset,
					      nocow_end, &cached_state);
				btrfs_dec_nocow_writers(nocow_bg);
				ret = PTR_ERR(em);
				goto error;
			}
			/* Only the creation matters here, drop the returned reference. */
			free_extent_map(em);
		}

		ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
				&nocow_args.file_extent,
				is_prealloc
				? (1 << BTRFS_ORDERED_PREALLOC)
				: (1 << BTRFS_ORDERED_NOCOW));
		btrfs_dec_nocow_writers(nocow_bg);
		if (IS_ERR(ordered)) {
			/* Undo the extent map created above for the prealloc case. */
			if (is_prealloc) {
				btrfs_drop_extent_map_range(inode, cur_offset,
							    nocow_end, false);
			}
			unlock_extent(&inode->io_tree, cur_offset,
				      nocow_end, &cached_state);
			ret = PTR_ERR(ordered);
			goto error;
		}

		if (btrfs_is_data_reloc_root(root))
			/*
			 * Error handled later, as we must prevent
			 * extent_clear_unlock_delalloc() in error handler
			 * from freeing metadata of created ordered extent.
			 */
			ret = btrfs_reloc_clone_csums(ordered);
		btrfs_put_ordered_extent(ordered);

		extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
					     locked_page, &cached_state,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_CLEAR_DATA_RESV,
					     PAGE_UNLOCK | PAGE_SET_ORDERED);

		cur_offset = extent_end;

		/*
		 * btrfs_reloc_clone_csums() error, now we're OK to call error
		 * handler, as metadata for created ordered extent will only
		 * be freed by btrfs_finish_ordered_io().
		 */
		if (ret)
			goto error;
	}
	btrfs_release_path(path);

	/*
	 * The loop stopped before reaching the end of the range and nothing is
	 * queued for COW yet: the uncovered tail has to be COWed.
	 */
	if (cur_offset <= end && cow_start == (u64)-1)
		cow_start = cur_offset;

	if (cow_start != (u64)-1) {
		/*
		 * With cur_offset == end and cow_start reset, the error path
		 * below does no range cleanup if fallback_to_cow() fails
		 * (presumably because the COW path cleans up after itself).
		 */
		cur_offset = end;
		ret = fallback_to_cow(inode, locked_page, cow_start, end);
		cow_start = (u64)-1;
		if (ret)
			goto error;
	}

	btrfs_free_path(path);
	return 0;

error:
	/*
	 * If an error happened while a COW region is outstanding, cur_offset
	 * needs to be reset to cow_start to ensure the COW region is unlocked
	 * as well.
	 */
	if (cow_start != (u64)-1)
		cur_offset = cow_start;

	/*
	 * We need to lock the extent here because we're clearing DELALLOC and
	 * we're not locked at this point.
	 */
	if (cur_offset < end) {
		struct extent_state *cached = NULL;

		lock_extent(&inode->io_tree, cur_offset, end, &cached);
		extent_clear_unlock_delalloc(inode, cur_offset, end,
					     locked_page, &cached,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DEFRAG |
					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
					     PAGE_START_WRITEBACK |
					     PAGE_END_WRITEBACK);
		btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
	}
	/* path is NULL here when btrfs_alloc_path() failed. */
	btrfs_free_path(path);
	return ret;
}
 2269
 2270static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
 2271{
 2272	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
 2273		if (inode->defrag_bytes &&
 2274		    test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
 2275			return false;
 2276		return true;
 2277	}
 2278	return false;
 2279}
 2280
 2281/*
 2282 * Function to process delayed allocation (create CoW) for ranges which are
 2283 * being touched for the first time.
 2284 */
 2285int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
 2286			     u64 start, u64 end, struct writeback_control *wbc)
 2287{
 2288	const bool zoned = btrfs_is_zoned(inode->root->fs_info);
 2289	int ret;
 2290
 2291	/*
 2292	 * The range must cover part of the @locked_page, or a return of 1
 2293	 * can confuse the caller.
 2294	 */
 2295	ASSERT(!(end <= page_offset(locked_page) ||
 2296		 start >= page_offset(locked_page) + PAGE_SIZE));
 2297
 2298	if (should_nocow(inode, start, end)) {
 2299		ret = run_delalloc_nocow(inode, locked_page, start, end);
 2300		goto out;
 2301	}
 2302
 2303	if (btrfs_inode_can_compress(inode) &&
 2304	    inode_need_compress(inode, start, end) &&
 2305	    run_delalloc_compressed(inode, locked_page, start, end, wbc))
 2306		return 1;
 2307
 2308	if (zoned)
 2309		ret = run_delalloc_cow(inode, locked_page, start, end, wbc,
 2310				       true);
 2311	else
 2312		ret = cow_file_range(inode, locked_page, start, end, NULL,
 2313				     false, false);
 2314
 2315out:
 2316	if (ret < 0)
 2317		btrfs_cleanup_ordered_extents(inode, locked_page, start,
 2318					      end - start + 1);
 2319	return ret;
 2320}
 2321
 2322void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
 2323				 struct extent_state *orig, u64 split)
 2324{
 2325	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2326	u64 size;
 2327
 2328	lockdep_assert_held(&inode->io_tree.lock);
 2329
 2330	/* not delalloc, ignore it */
 2331	if (!(orig->state & EXTENT_DELALLOC))
 2332		return;
 2333
 2334	size = orig->end - orig->start + 1;
 2335	if (size > fs_info->max_extent_size) {
 2336		u32 num_extents;
 2337		u64 new_size;
 2338
 2339		/*
 2340		 * See the explanation in btrfs_merge_delalloc_extent, the same
 2341		 * applies here, just in reverse.
 2342		 */
 2343		new_size = orig->end - split + 1;
 2344		num_extents = count_max_extents(fs_info, new_size);
 2345		new_size = split - orig->start;
 2346		num_extents += count_max_extents(fs_info, new_size);
 2347		if (count_max_extents(fs_info, size) >= num_extents)
 2348			return;
 2349	}
 2350
 2351	spin_lock(&inode->lock);
 2352	btrfs_mod_outstanding_extents(inode, 1);
 2353	spin_unlock(&inode->lock);
 2354}
 2355
 2356/*
 2357 * Handle merged delayed allocation extents so we can keep track of new extents
 2358 * that are just merged onto old extents, such as when we are doing sequential
 2359 * writes, so we can properly account for the metadata space we'll need.
 2360 */
 2361void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
 2362				 struct extent_state *other)
 2363{
 2364	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2365	u64 new_size, old_size;
 2366	u32 num_extents;
 2367
 2368	lockdep_assert_held(&inode->io_tree.lock);
 2369
 2370	/* not delalloc, ignore it */
 2371	if (!(other->state & EXTENT_DELALLOC))
 2372		return;
 2373
 2374	if (new->start > other->start)
 2375		new_size = new->end - other->start + 1;
 2376	else
 2377		new_size = other->end - new->start + 1;
 2378
 2379	/* we're not bigger than the max, unreserve the space and go */
 2380	if (new_size <= fs_info->max_extent_size) {
 2381		spin_lock(&inode->lock);
 2382		btrfs_mod_outstanding_extents(inode, -1);
 2383		spin_unlock(&inode->lock);
 2384		return;
 2385	}
 2386
 2387	/*
 2388	 * We have to add up either side to figure out how many extents were
 2389	 * accounted for before we merged into one big extent.  If the number of
 2390	 * extents we accounted for is <= the amount we need for the new range
 2391	 * then we can return, otherwise drop.  Think of it like this
 2392	 *
 2393	 * [ 4k][MAX_SIZE]
 2394	 *
 2395	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
 2396	 * need 2 outstanding extents, on one side we have 1 and the other side
 2397	 * we have 1 so they are == and we can return.  But in this case
 2398	 *
 2399	 * [MAX_SIZE+4k][MAX_SIZE+4k]
 2400	 *
 2401	 * Each range on their own accounts for 2 extents, but merged together
 2402	 * they are only 3 extents worth of accounting, so we need to drop in
 2403	 * this case.
 2404	 */
 2405	old_size = other->end - other->start + 1;
 2406	num_extents = count_max_extents(fs_info, old_size);
 2407	old_size = new->end - new->start + 1;
 2408	num_extents += count_max_extents(fs_info, old_size);
 2409	if (count_max_extents(fs_info, new_size) >= num_extents)
 2410		return;
 2411
 2412	spin_lock(&inode->lock);
 2413	btrfs_mod_outstanding_extents(inode, -1);
 2414	spin_unlock(&inode->lock);
 2415}
 2416
 2417static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
 2418{
 2419	struct btrfs_root *root = inode->root;
 2420	struct btrfs_fs_info *fs_info = root->fs_info;
 2421
 2422	spin_lock(&root->delalloc_lock);
 2423	ASSERT(list_empty(&inode->delalloc_inodes));
 2424	list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
 2425	root->nr_delalloc_inodes++;
 2426	if (root->nr_delalloc_inodes == 1) {
 2427		spin_lock(&fs_info->delalloc_root_lock);
 2428		ASSERT(list_empty(&root->delalloc_root));
 2429		list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
 2430		spin_unlock(&fs_info->delalloc_root_lock);
 2431	}
 2432	spin_unlock(&root->delalloc_lock);
 2433}
 2434
 2435void btrfs_del_delalloc_inode(struct btrfs_inode *inode)
 2436{
 2437	struct btrfs_root *root = inode->root;
 2438	struct btrfs_fs_info *fs_info = root->fs_info;
 2439
 2440	lockdep_assert_held(&root->delalloc_lock);
 2441
 2442	/*
 2443	 * We may be called after the inode was already deleted from the list,
 2444	 * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
 2445	 * and then later through btrfs_clear_delalloc_extent() while the inode
 2446	 * still has ->delalloc_bytes > 0.
 2447	 */
 2448	if (!list_empty(&inode->delalloc_inodes)) {
 2449		list_del_init(&inode->delalloc_inodes);
 2450		root->nr_delalloc_inodes--;
 2451		if (!root->nr_delalloc_inodes) {
 2452			ASSERT(list_empty(&root->delalloc_inodes));
 2453			spin_lock(&fs_info->delalloc_root_lock);
 2454			ASSERT(!list_empty(&root->delalloc_root));
 2455			list_del_init(&root->delalloc_root);
 2456			spin_unlock(&fs_info->delalloc_root_lock);
 2457		}
 2458	}
 2459}
 2460
 2461/*
 2462 * Properly track delayed allocation bytes in the inode and to maintain the
 2463 * list of inodes that have pending delalloc work to be done.
 2464 */
 2465void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
 2466			       u32 bits)
 2467{
 2468	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2469
 2470	lockdep_assert_held(&inode->io_tree.lock);
 2471
 2472	if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
 2473		WARN_ON(1);
 2474	/*
 2475	 * set_bit and clear bit hooks normally require _irqsave/restore
 2476	 * but in this case, we are only testing for the DELALLOC
 2477	 * bit, which is only set or cleared with irqs on
 2478	 */
 2479	if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 2480		u64 len = state->end + 1 - state->start;
 2481		u64 prev_delalloc_bytes;
 2482		u32 num_extents = count_max_extents(fs_info, len);
 2483
 2484		spin_lock(&inode->lock);
 2485		btrfs_mod_outstanding_extents(inode, num_extents);
 2486		spin_unlock(&inode->lock);
 2487
 2488		/* For sanity tests */
 2489		if (btrfs_is_testing(fs_info))
 2490			return;
 2491
 2492		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
 2493					 fs_info->delalloc_batch);
 2494		spin_lock(&inode->lock);
 2495		prev_delalloc_bytes = inode->delalloc_bytes;
 2496		inode->delalloc_bytes += len;
 2497		if (bits & EXTENT_DEFRAG)
 2498			inode->defrag_bytes += len;
 2499		spin_unlock(&inode->lock);
 2500
 2501		/*
 2502		 * We don't need to be under the protection of the inode's lock,
 2503		 * because we are called while holding the inode's io_tree lock
 2504		 * and are therefore protected against concurrent calls of this
 2505		 * function and btrfs_clear_delalloc_extent().
 2506		 */
 2507		if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
 2508			btrfs_add_delalloc_inode(inode);
 2509	}
 2510
 2511	if (!(state->state & EXTENT_DELALLOC_NEW) &&
 2512	    (bits & EXTENT_DELALLOC_NEW)) {
 2513		spin_lock(&inode->lock);
 2514		inode->new_delalloc_bytes += state->end + 1 - state->start;
 2515		spin_unlock(&inode->lock);
 2516	}
 2517}
 2518
 2519/*
 2520 * Once a range is no longer delalloc this function ensures that proper
 2521 * accounting happens.
 2522 */
 2523void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
 2524				 struct extent_state *state, u32 bits)
 2525{
 2526	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2527	u64 len = state->end + 1 - state->start;
 2528	u32 num_extents = count_max_extents(fs_info, len);
 2529
 2530	lockdep_assert_held(&inode->io_tree.lock);
 2531
 2532	if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
 2533		spin_lock(&inode->lock);
 2534		inode->defrag_bytes -= len;
 2535		spin_unlock(&inode->lock);
 2536	}
 2537
 2538	/*
 2539	 * set_bit and clear bit hooks normally require _irqsave/restore
 2540	 * but in this case, we are only testing for the DELALLOC
 2541	 * bit, which is only set or cleared with irqs on
 2542	 */
 2543	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 2544		struct btrfs_root *root = inode->root;
 2545		u64 new_delalloc_bytes;
 2546
 2547		spin_lock(&inode->lock);
 2548		btrfs_mod_outstanding_extents(inode, -num_extents);
 2549		spin_unlock(&inode->lock);
 2550
 2551		/*
 2552		 * We don't reserve metadata space for space cache inodes so we
 2553		 * don't need to call delalloc_release_metadata if there is an
 2554		 * error.
 2555		 */
 2556		if (bits & EXTENT_CLEAR_META_RESV &&
 2557		    root != fs_info->tree_root)
 2558			btrfs_delalloc_release_metadata(inode, len, true);
 2559
 2560		/* For sanity tests. */
 2561		if (btrfs_is_testing(fs_info))
 2562			return;
 2563
 2564		if (!btrfs_is_data_reloc_root(root) &&
 2565		    !btrfs_is_free_space_inode(inode) &&
 2566		    !(state->state & EXTENT_NORESERVE) &&
 2567		    (bits & EXTENT_CLEAR_DATA_RESV))
 2568			btrfs_free_reserved_data_space_noquota(fs_info, len);
 2569
 2570		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
 2571					 fs_info->delalloc_batch);
 2572		spin_lock(&inode->lock);
 2573		inode->delalloc_bytes -= len;
 2574		new_delalloc_bytes = inode->delalloc_bytes;
 2575		spin_unlock(&inode->lock);
 2576
 2577		/*
 2578		 * We don't need to be under the protection of the inode's lock,
 2579		 * because we are called while holding the inode's io_tree lock
 2580		 * and are therefore protected against concurrent calls of this
 2581		 * function and btrfs_set_delalloc_extent().
 2582		 */
 2583		if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) {
 2584			spin_lock(&root->delalloc_lock);
 2585			btrfs_del_delalloc_inode(inode);
 2586			spin_unlock(&root->delalloc_lock);
 2587		}
 2588	}
 2589
 2590	if ((state->state & EXTENT_DELALLOC_NEW) &&
 2591	    (bits & EXTENT_DELALLOC_NEW)) {
 2592		spin_lock(&inode->lock);
 2593		ASSERT(inode->new_delalloc_bytes >= len);
 2594		inode->new_delalloc_bytes -= len;
 2595		if (bits & EXTENT_ADD_INODE_BYTES)
 2596			inode_add_bytes(&inode->vfs_inode, len);
 2597		spin_unlock(&inode->lock);
 2598	}
 2599}
 2600
 2601/*
 2602 * given a list of ordered sums record them in the inode.  This happens
 2603 * at IO completion time based on sums calculated at bio submission time.
 2604 */
 2605static int add_pending_csums(struct btrfs_trans_handle *trans,
 2606			     struct list_head *list)
 2607{
 2608	struct btrfs_ordered_sum *sum;
 2609	struct btrfs_root *csum_root = NULL;
 2610	int ret;
 2611
 2612	list_for_each_entry(sum, list, list) {
 2613		trans->adding_csums = true;
 2614		if (!csum_root)
 2615			csum_root = btrfs_csum_root(trans->fs_info,
 2616						    sum->logical);
 2617		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
 2618		trans->adding_csums = false;
 2619		if (ret)
 2620			return ret;
 2621	}
 2622	return 0;
 2623}
 2624
 2625static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
 2626					 const u64 start,
 2627					 const u64 len,
 2628					 struct extent_state **cached_state)
 2629{
 2630	u64 search_start = start;
 2631	const u64 end = start + len - 1;
 2632
 2633	while (search_start < end) {
 2634		const u64 search_len = end - search_start + 1;
 2635		struct extent_map *em;
 2636		u64 em_len;
 2637		int ret = 0;
 2638
 2639		em = btrfs_get_extent(inode, NULL, search_start, search_len);
 2640		if (IS_ERR(em))
 2641			return PTR_ERR(em);
 2642
 2643		if (em->disk_bytenr != EXTENT_MAP_HOLE)
 2644			goto next;
 2645
 2646		em_len = em->len;
 2647		if (em->start < search_start)
 2648			em_len -= search_start - em->start;
 2649		if (em_len > search_len)
 2650			em_len = search_len;
 2651
 2652		ret = set_extent_bit(&inode->io_tree, search_start,
 2653				     search_start + em_len - 1,
 2654				     EXTENT_DELALLOC_NEW, cached_state);
 2655next:
 2656		search_start = extent_map_end(em);
 2657		free_extent_map(em);
 2658		if (ret)
 2659			return ret;
 2660	}
 2661	return 0;
 2662}
 2663
 2664int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 2665			      unsigned int extra_bits,
 2666			      struct extent_state **cached_state)
 2667{
 2668	WARN_ON(PAGE_ALIGNED(end));
 2669
 2670	if (start >= i_size_read(&inode->vfs_inode) &&
 2671	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
 2672		/*
 2673		 * There can't be any extents following eof in this case so just
 2674		 * set the delalloc new bit for the range directly.
 2675		 */
 2676		extra_bits |= EXTENT_DELALLOC_NEW;
 2677	} else {
 2678		int ret;
 2679
 2680		ret = btrfs_find_new_delalloc_bytes(inode, start,
 2681						    end + 1 - start,
 2682						    cached_state);
 2683		if (ret)
 2684			return ret;
 2685	}
 2686
 2687	return set_extent_bit(&inode->io_tree, start, end,
 2688			      EXTENT_DELALLOC | extra_bits, cached_state);
 2689}
 2690
 2691/* see btrfs_writepage_start_hook for details on why this is required */
 2692struct btrfs_writepage_fixup {
 2693	struct page *page;
 2694	struct btrfs_inode *inode;
 2695	struct btrfs_work work;
 2696};
 2697
/*
 * Async worker that fixes up one page queued by btrfs_writepage_cow_fixup().
 *
 * It reserves delalloc space for the page, waits for any ordered extent that
 * overlaps it, and then marks the page range as delalloc.  On failure the
 * error is recorded on the mapping and the page is cleaned.  In every case
 * the Checked state is cleared, the page reference and the fixup structure
 * are released, and the inode reference is dropped through a delayed iput.
 */
static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
	struct btrfs_writepage_fixup *fixup =
		container_of(work, struct btrfs_writepage_fixup, work);
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	struct page *page = fixup->page;
	struct btrfs_inode *inode = fixup->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 page_start = page_offset(page);
	u64 page_end = page_offset(page) + PAGE_SIZE - 1;
	int ret = 0;
	/* Cleared only once the delalloc bits were set successfully */
	bool free_delalloc_space = true;

	/*
	 * This is similar to page_mkwrite, we need to reserve the space before
	 * we take the page lock.
	 */
	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
					   PAGE_SIZE);
again:
	lock_page(page);

	/*
	 * Before we queued this fixup, we took a reference on the page.
	 * page->mapping may go NULL, but it shouldn't be moved to a different
	 * address space.
	 */
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		/*
		 * Unfortunately this is a little tricky, either
		 *
		 * 1) We got here and our page had already been dealt with and
		 *    we reserved our space, thus ret == 0, so we need to just
		 *    drop our space reservation and bail.  This can happen the
		 *    first time we come into the fixup worker, or could happen
		 *    while waiting for the ordered extent.
		 * 2) Our page was already dealt with, but we happened to get an
		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
		 *    this case we obviously don't have anything to release, but
		 *    because the page was already dealt with we don't want to
		 *    mark the page with an error, so make sure we're resetting
		 *    ret to 0.  This is why we have this check _before_ the ret
		 *    check, because we do not want to have a surprise ENOSPC
		 *    when the page was already properly dealt with.
		 */
		if (!ret) {
			btrfs_delalloc_release_extents(inode, PAGE_SIZE);
			btrfs_delalloc_release_space(inode, data_reserved,
						     page_start, PAGE_SIZE,
						     true);
		}
		ret = 0;
		goto out_page;
	}

	/*
	 * We can't mess with the page state unless it is locked, so now that
	 * it is locked bail if we failed to make our space reservation.
	 */
	if (ret)
		goto out_page;

	lock_extent(&inode->io_tree, page_start, page_end, &cached_state);

	/* already ordered? We're done */
	if (PageOrdered(page))
		goto out_reserved;

	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
	if (ordered) {
		/*
		 * An ordered extent overlaps the page.  Drop both the extent
		 * lock and the page lock before starting it, then retry from
		 * the top so the page state is checked again under the lock.
		 */
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		unlock_page(page);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
					&cached_state);
	if (ret)
		goto out_reserved;

	/*
	 * Everything went as planned, we're now the owner of a dirty page with
	 * delayed allocation bits set and space reserved for our COW
	 * destination.
	 *
	 * The page was dirty when we started, nothing should have cleaned it.
	 */
	BUG_ON(!PageDirty(page));
	free_delalloc_space = false;
out_reserved:
	/*
	 * Reached with the extent range locked.  The data space is only given
	 * back when the delalloc bits were not set (already ordered, or
	 * btrfs_set_extent_delalloc() failed).
	 */
	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
	if (free_delalloc_space)
		btrfs_delalloc_release_space(inode, data_reserved, page_start,
					     PAGE_SIZE, true);
	unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
out_page:
	if (ret) {
		/*
		 * We hit ENOSPC or other errors.  Update the mapping and page
		 * to reflect the errors and clean the page.
		 *
		 * ret is non-zero in this branch, so the last argument below
		 * (!ret) is always false.
		 * NOTE(review): presumably that is the "uptodate" flag - confirm
		 * against btrfs_mark_ordered_io_finished().
		 */
		mapping_set_error(page->mapping, ret);
		btrfs_mark_ordered_io_finished(inode, page, page_start,
					       PAGE_SIZE, !ret);
		clear_page_dirty_for_io(page);
	}
	btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
	unlock_page(page);
	/* Drop the page reference that was taken before the fixup was queued */
	put_page(page);
	kfree(fixup);
	extent_changeset_free(data_reserved);
	/*
	 * As a precaution, do a delayed iput in case it would be the last iput
	 * that could need flushing space. Recursing back to fixup worker would
	 * deadlock.
	 */
	btrfs_add_delayed_iput(inode);
}
 2821
 2822/*
 2823 * There are a few paths in the higher layers of the kernel that directly
 2824 * set the page dirty bit without asking the filesystem if it is a
 2825 * good idea.  This causes problems because we want to make sure COW
 2826 * properly happens and the data=ordered rules are followed.
 2827 *
 2828 * In our case any range that doesn't have the ORDERED bit set
 2829 * hasn't been properly setup for IO.  We kick off an async process
 2830 * to fix it up.  The async helper will wait for ordered extents, set
 2831 * the delalloc bit and make it safe to write the page.
 2832 */
 2833int btrfs_writepage_cow_fixup(struct page *page)
 2834{
 2835	struct inode *inode = page->mapping->host;
 2836	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 2837	struct btrfs_writepage_fixup *fixup;
 2838
 2839	/* This page has ordered extent covering it already */
 2840	if (PageOrdered(page))
 2841		return 0;
 2842
 2843	/*
 2844	 * PageChecked is set below when we create a fixup worker for this page,
 2845	 * don't try to create another one if we're already PageChecked()
 2846	 *
 2847	 * The extent_io writepage code will redirty the page if we send back
 2848	 * EAGAIN.
 2849	 */
 2850	if (PageChecked(page))
 2851		return -EAGAIN;
 2852
 2853	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
 2854	if (!fixup)
 2855		return -EAGAIN;
 2856
 2857	/*
 2858	 * We are already holding a reference to this inode from
 2859	 * write_cache_pages.  We need to hold it because the space reservation
 2860	 * takes place outside of the page lock, and we can't trust
 2861	 * page->mapping outside of the page lock.
 2862	 */
 2863	ihold(inode);
 2864	btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);
 2865	get_page(page);
 2866	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
 2867	fixup->page = page;
 2868	fixup->inode = BTRFS_I(inode);
 2869	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
 2870
 2871	return -EAGAIN;
 2872}
 2873
/*
 * Insert the file extent item described by @stack_fi at @file_pos, dropping
 * whatever file extents currently overlap the range, and add the reference
 * for the reserved data extent.
 *
 * @trans:              transaction handle
 * @inode:              inode the file extent item belongs to
 * @file_pos:           file offset used as the key offset of the new item
 * @stack_fi:           on-stack file extent item copied into the leaf; its
 *                      generation field is set to the transaction id here
 * @update_inode_bytes: if true, call btrfs_update_inode_bytes() with the
 *                      extent's num_bytes and the bytes found by
 *                      btrfs_drop_extents()
 * @qgroup_reserved:    passed through to btrfs_alloc_reserved_file_extent()
 *
 * Return 0 on success, -ENOMEM if the path can't be allocated, or the error
 * returned by one of the helpers called here.
 */
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *inode, u64 file_pos,
				       struct btrfs_file_extent_item *stack_fi,
				       const bool update_inode_bytes,
				       u64 qgroup_reserved)
{
	struct btrfs_root *root = inode->root;
	const u64 sectorsize = root->fs_info->sectorsize;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key ins;
	u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
	u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
	u64 offset = btrfs_stack_file_extent_offset(stack_fi);
	u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
	u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
	struct btrfs_drop_extents_args drop_args = { 0 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * we may be replacing one extent in the tree with another.
	 * The new extent is pinned in the extent map, and we don't want
	 * to drop it from the cache until it is completely in the btree.
	 *
	 * So, tell btrfs_drop_extents to leave this extent in the cache.
	 * the caller is expected to unpin it and allow it to be merged
	 * with the others.
	 */
	drop_args.path = path;
	drop_args.start = file_pos;
	drop_args.end = file_pos + num_bytes;
	drop_args.replace_extent = true;
	drop_args.extent_item_size = sizeof(*stack_fi);
	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
	if (ret)
		goto out;

	/*
	 * btrfs_drop_extents() reports through extent_inserted whether it
	 * already made room for the new item; if not, insert an empty item
	 * for it ourselves.
	 */
	if (!drop_args.extent_inserted) {
		ins.objectid = btrfs_ino(inode);
		ins.offset = file_pos;
		ins.type = BTRFS_EXTENT_DATA_KEY;

		ret = btrfs_insert_empty_item(trans, root, path, &ins,
					      sizeof(*stack_fi));
		if (ret)
			goto out;
	}
	/* Copy the prepared item into the slot the path points at */
	leaf = path->nodes[0];
	btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
	write_extent_buffer(leaf, stack_fi,
			btrfs_item_ptr_offset(leaf, path->slots[0]),
			sizeof(struct btrfs_file_extent_item));

	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	/*
	 * If we dropped an inline extent here, we know the range where it is
	 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
	 * number of bytes only for that range containing the inline extent.
	 * The remainder of the range will be processed when clearing the
	 * EXTENT_DELALLOC_NEW bit through the ordered extent completion.
	 */
	if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
		u64 inline_size = round_down(drop_args.bytes_found, sectorsize);

		inline_size = drop_args.bytes_found - inline_size;
		btrfs_update_inode_bytes(inode, sectorsize, inline_size);
		drop_args.bytes_found -= inline_size;
		num_bytes -= sectorsize;
	}

	if (update_inode_bytes)
		btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);

	/* From here on 'ins' is reused as the key of the data extent item */
	ins.objectid = disk_bytenr;
	ins.offset = disk_num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
	if (ret)
		goto out;

	ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
					       file_pos - offset,
					       qgroup_reserved, &ins);
out:
	btrfs_free_path(path);

	return ret;
}
 2969
 2970static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
 2971					 u64 start, u64 len)
 2972{
 2973	struct btrfs_block_group *cache;
 2974
 2975	cache = btrfs_lookup_block_group(fs_info, start);
 2976	ASSERT(cache);
 2977
 2978	spin_lock(&cache->lock);
 2979	cache->delalloc_bytes -= len;
 2980	spin_unlock(&cache->lock);
 2981
 2982	btrfs_put_block_group(cache);
 2983}
 2984
 2985static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
 2986					     struct btrfs_ordered_extent *oe)
 2987{
 2988	struct btrfs_file_extent_item stack_fi;
 2989	bool update_inode_bytes;
 2990	u64 num_bytes = oe->num_bytes;
 2991	u64 ram_bytes = oe->ram_bytes;
 2992
 2993	memset(&stack_fi, 0, sizeof(stack_fi));
 2994	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
 2995	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
 2996	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
 2997						   oe->disk_num_bytes);
 2998	btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
 2999	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
 3000		num_bytes = oe->truncated_len;
 3001	btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
 3002	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
 3003	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
 3004	/* Encryption and other encoding is reserved and all 0 */
 3005
 3006	/*
 3007	 * For delalloc, when completing an ordered extent we update the inode's
 3008	 * bytes when clearing the range in the inode's io tree, so pass false
 3009	 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
 3010	 * except if the ordered extent was truncated.
 3011	 */
 3012	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
 3013			     test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
 3014			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
 3015
 3016	return insert_reserved_file_extent(trans, oe->inode,
 3017					   oe->file_offset, &stack_fi,
 3018					   update_inode_bytes, oe->qgroup_rsv);
 3019}
 3020
/*
 * As ordered data IO finishes, this gets called so we can finish
 * an ordered extent if the range of bytes in the file it covers are
 * fully written.
 *
 * On every path, success or failure, the ordered extent is removed from the
 * inode with btrfs_remove_ordered_extent() and two references on it are
 * dropped (ours and the tree's).
 *
 * Return 0 on success, -EIO if BTRFS_ORDERED_IOERR was already set on the
 * ordered extent, or the negative errno of the step that failed.
 */
int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
{
	struct btrfs_inode *inode = ordered_extent->inode;
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans = NULL;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct extent_state *cached_state = NULL;
	u64 start, end;
	int compress_type = 0;
	int ret = 0;
	u64 logical_len = ordered_extent->num_bytes;
	bool freespace_inode;
	bool truncated = false;
	/* Set to false once the file extent item was inserted successfully */
	bool clear_reserved_extent = true;
	/* Bits cleared from the io tree range at the 'out' label */
	unsigned int clear_bits = EXTENT_DEFRAG;

	start = ordered_extent->file_offset;
	end = start + ordered_extent->num_bytes - 1;

	/*
	 * EXTENT_DELALLOC_NEW is only cleared when the ordered extent is none
	 * of NOCOW, PREALLOC, DIRECT or ENCODED.
	 */
	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
	    !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
		clear_bits |= EXTENT_DELALLOC_NEW;

	freespace_inode = btrfs_is_free_space_inode(inode);
	if (!freespace_inode)
		btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);

	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
		ret = -EIO;
		goto out;
	}

	if (btrfs_is_zoned(fs_info))
		btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
					ordered_extent->disk_num_bytes);

	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
		truncated = true;
		logical_len = ordered_extent->truncated_len;
		/* Truncated the entire extent, don't bother adding */
		if (!logical_len)
			goto out;
	}

	/*
	 * NOCOW: no file extent item is inserted here, only the inode item
	 * is updated.  The extent range is not locked on this path.
	 */
	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */

		btrfs_inode_safe_disk_i_size_write(inode, 0);
		if (freespace_inode)
			trans = btrfs_join_transaction_spacecache(root);
		else
			trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}
		trans->block_rsv = &inode->block_rsv;
		ret = btrfs_update_inode_fallback(trans, inode);
		if (ret) /* -ENOMEM or corruption */
			btrfs_abort_transaction(trans, ret);
		goto out;
	}

	/*
	 * The range gets locked from here on.  Adding EXTENT_LOCKED to
	 * clear_bits makes the clear_extent_bit() call at 'out' clear the
	 * lock bit as well; the earlier 'goto out' paths never took the lock
	 * and therefore do not have that bit in clear_bits.
	 */
	clear_bits |= EXTENT_LOCKED;
	lock_extent(io_tree, start, end, &cached_state);

	if (freespace_inode)
		trans = btrfs_join_transaction_spacecache(root);
	else
		trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}

	trans->block_rsv = &inode->block_rsv;

	ret = btrfs_insert_raid_extent(trans, ordered_extent);
	if (ret)
		goto out;

	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
		compress_type = ordered_extent->compress_type;
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
		BUG_ON(compress_type);
		ret = btrfs_mark_extent_written(trans, inode,
						ordered_extent->file_offset,
						ordered_extent->file_offset +
						logical_len);
		btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
						  ordered_extent->disk_num_bytes);
	} else {
		BUG_ON(root == fs_info->tree_root);
		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
		if (!ret) {
			/*
			 * The file extent item is in the tree, so the error
			 * handling below must not free the reserved extent
			 * anymore.
			 */
			clear_reserved_extent = false;
			btrfs_release_delalloc_bytes(fs_info,
						ordered_extent->disk_bytenr,
						ordered_extent->disk_num_bytes);
		}
	}
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = unpin_extent_cache(inode, ordered_extent->file_offset,
				 ordered_extent->num_bytes, trans->transid);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = add_pending_csums(trans, &ordered_extent->list);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	/*
	 * If this is a new delalloc range, clear its new delalloc flag to
	 * update the inode's number of bytes. This needs to be done first
	 * before updating the inode item.
	 */
	if ((clear_bits & EXTENT_DELALLOC_NEW) &&
	    !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
		clear_extent_bit(&inode->io_tree, start, end,
				 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
				 &cached_state);

	btrfs_inode_safe_disk_i_size_write(inode, 0);
	ret = btrfs_update_inode_fallback(trans, inode);
	if (ret) { /* -ENOMEM or corruption */
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
out:
	/* Common exit: clear the accumulated bits for the whole range */
	clear_extent_bit(&inode->io_tree, start, end, clear_bits,
			 &cached_state);

	if (trans)
		btrfs_end_transaction(trans);

	if (ret || truncated) {
		/* Start of the part of the range that was not written */
		u64 unwritten_start = start;

		/*
		 * If we failed to finish this ordered extent for any reason we
		 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
		 * extent, and mark the inode with the error if it wasn't
		 * already set.  Any error during writeback would have already
		 * set the mapping error, so we need to set it if we're the ones
		 * marking this ordered extent as failed.
		 */
		if (ret)
			btrfs_mark_ordered_extent_error(ordered_extent);

		if (truncated)
			unwritten_start += logical_len;
		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);

		/*
		 * Drop extent maps for the part of the extent we didn't write.
		 *
		 * We have an exception here for the free_space_inode, this is
		 * because when we do btrfs_get_extent() on the free space inode
		 * we will search the commit root.  If this is a new block group
		 * we won't find anything, and we will trip over the assert in
		 * writepage where we do ASSERT(em->block_start !=
		 * EXTENT_MAP_HOLE).
		 *
		 * Theoretically we could also skip this for any NOCOW extent as
		 * we don't mess with the extent map tree in the NOCOW case, but
		 * for now simply skip this if we are the free space inode.
		 */
		if (!btrfs_is_free_space_inode(inode))
			btrfs_drop_extent_map_range(inode, unwritten_start,
						    end, false);

		/*
		 * If the ordered extent had an IOERR or something else went
		 * wrong we need to return the space for this ordered extent
		 * back to the allocator.  We only free the extent in the
		 * truncated case if we didn't write out the extent at all.
		 *
		 * If we made it past insert_reserved_file_extent before we
		 * errored out then we don't need to do this as the accounting
		 * has already been done.
		 */
		if ((ret || !logical_len) &&
		    clear_reserved_extent &&
		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
			/*
			 * Discard the range before returning it back to the
			 * free space pool
			 */
			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
				btrfs_discard_extent(fs_info,
						ordered_extent->disk_bytenr,
						ordered_extent->disk_num_bytes,
						NULL);
			btrfs_free_reserved_extent(fs_info,
					ordered_extent->disk_bytenr,
					ordered_extent->disk_num_bytes, 1);
			/*
			 * Actually free the qgroup rsv which was released when
			 * the ordered extent was created.
			 */
			btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
						  ordered_extent->qgroup_rsv,
						  BTRFS_QGROUP_RSV_DATA);
		}
	}

	/*
	 * This needs to be done to make sure anybody waiting knows we are done
	 * updating everything for this ordered extent.
	 */
	btrfs_remove_ordered_extent(inode, ordered_extent);

	/* once for us */
	btrfs_put_ordered_extent(ordered_extent);
	/* once for the tree */
	btrfs_put_ordered_extent(ordered_extent);

	return ret;
}
 3259
 3260int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
 3261{
 3262	if (btrfs_is_zoned(ordered->inode->root->fs_info) &&
 3263	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
 3264	    list_empty(&ordered->bioc_list))
 3265		btrfs_finish_ordered_zoned(ordered);
 3266	return btrfs_finish_one_ordered(ordered);
 3267}
 3268
 3269/*
 3270 * Verify the checksum for a single sector without any extra action that depend
 3271 * on the type of I/O.
 3272 */
 3273int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
 3274			    u32 pgoff, u8 *csum, const u8 * const csum_expected)
 3275{
 3276	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 3277	char *kaddr;
 3278
 3279	ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
 3280
 3281	shash->tfm = fs_info->csum_shash;
 3282
 3283	kaddr = kmap_local_page(page) + pgoff;
 3284	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
 3285	kunmap_local(kaddr);
 3286
 3287	if (memcmp(csum, csum_expected, fs_info->csum_size))
 3288		return -EIO;
 3289	return 0;
 3290}
 3291
 3292/*
 3293 * Verify the checksum of a single data sector.
 3294 *
 3295 * @bbio:	btrfs_io_bio which contains the csum
 3296 * @dev:	device the sector is on
 3297 * @bio_offset:	offset to the beginning of the bio (in bytes)
 3298 * @bv:		bio_vec to check
 3299 *
 3300 * Check if the checksum on a data block is valid.  When a checksum mismatch is
 3301 * detected, report the error and fill the corrupted range with zero.
 3302 *
 3303 * Return %true if the sector is ok or had no checksum to start with, else %false.
 3304 */
 3305bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
 3306			u32 bio_offset, struct bio_vec *bv)
 3307{
 3308	struct btrfs_inode *inode = bbio->inode;
 3309	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 3310	u64 file_offset = bbio->file_offset + bio_offset;
 3311	u64 end = file_offset + bv->bv_len - 1;
 3312	u8 *csum_expected;
 3313	u8 csum[BTRFS_CSUM_SIZE];
 3314
 3315	ASSERT(bv->bv_len == fs_info->sectorsize);
 3316
 3317	if (!bbio->csum)
 3318		return true;
 3319
 3320	if (btrfs_is_data_reloc_root(inode->root) &&
 3321	    test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
 3322			   NULL)) {
 3323		/* Skip the range without csum for data reloc inode */
 3324		clear_extent_bits(&inode->io_tree, file_offset, end,
 3325				  EXTENT_NODATASUM);
 3326		return true;
 3327	}
 3328
 3329	csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
 3330				fs_info->csum_size;
 3331	if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
 3332				    csum_expected))
 3333		goto zeroit;
 3334	return true;
 3335
 3336zeroit:
 3337	btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
 3338				    bbio->mirror_num);
 3339	if (dev)
 3340		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
 3341	memzero_bvec(bv);
 3342	return false;
 3343}
 3344
 3345/*
 3346 * Perform a delayed iput on @inode.
 3347 *
 3348 * @inode: The inode we want to perform iput on
 3349 *
 3350 * This function uses the generic vfs_inode::i_count to track whether we should
 3351 * just decrement it (in case it's > 1) or if this is the last iput then link
 3352 * the inode to the delayed iput machinery. Delayed iputs are processed at
 3353 * transaction commit time/superblock commit/cleaner kthread.
 3354 */
 3355void btrfs_add_delayed_iput(struct btrfs_inode *inode)
 3356{
 3357	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 3358	unsigned long flags;
 3359
 3360	if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
 3361		return;
 3362
 3363	atomic_inc(&fs_info->nr_delayed_iputs);
 3364	/*
 3365	 * Need to be irq safe here because we can be called from either an irq
 3366	 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
 3367	 * context.
 3368	 */
 3369	spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
 3370	ASSERT(list_empty(&inode->delayed_iput));
 3371	list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
 3372	spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
 3373	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
 3374		wake_up_process(fs_info->cleaner_kthread);
 3375}
 3376
 3377static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
 3378				    struct btrfs_inode *inode)
 3379{
 3380	list_del_init(&inode->delayed_iput);
 3381	spin_unlock_irq(&fs_info->delayed_iput_lock);
 3382	iput(&inode->vfs_inode);
 3383	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
 3384		wake_up(&fs_info->delayed_iputs_wait);
 3385	spin_lock_irq(&fs_info->delayed_iput_lock);
 3386}
 3387
 3388static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
 3389				   struct btrfs_inode *inode)
 3390{
 3391	if (!list_empty(&inode->delayed_iput)) {
 3392		spin_lock_irq(&fs_info->delayed_iput_lock);
 3393		if (!list_empty(&inode->delayed_iput))
 3394			run_delayed_iput_locked(fs_info, inode);
 3395		spin_unlock_irq(&fs_info->delayed_iput_lock);
 3396	}
 3397}
 3398
 3399void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
 3400{
 3401	/*
 3402	 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
 3403	 * calls btrfs_add_delayed_iput() and that needs to lock
 3404	 * fs_info->delayed_iput_lock. So we need to disable irqs here to
 3405	 * prevent a deadlock.
 3406	 */
 3407	spin_lock_irq(&fs_info->delayed_iput_lock);
 3408	while (!list_empty(&fs_info->delayed_iputs)) {
 3409		struct btrfs_inode *inode;
 3410
 3411		inode = list_first_entry(&fs_info->delayed_iputs,
 3412				struct btrfs_inode, delayed_iput);
 3413		run_delayed_iput_locked(fs_info, inode);
 3414		if (need_resched()) {
 3415			spin_unlock_irq(&fs_info->delayed_iput_lock);
 3416			cond_resched();
 3417			spin_lock_irq(&fs_info->delayed_iput_lock);
 3418		}
 3419	}
 3420	spin_unlock_irq(&fs_info->delayed_iput_lock);
 3421}
 3422
 3423/*
 3424 * Wait for flushing all delayed iputs
 3425 *
 3426 * @fs_info:  the filesystem
 3427 *
 3428 * This will wait on any delayed iputs that are currently running with KILLABLE
 3429 * set.  Once they are all done running we will return, unless we are killed in
 3430 * which case we return EINTR. This helps in user operations like fallocate etc
 3431 * that might get blocked on the iputs.
 3432 *
 3433 * Return EINTR if we were killed, 0 if nothing's pending
 3434 */
 3435int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
 3436{
 3437	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
 3438			atomic_read(&fs_info->nr_delayed_iputs) == 0);
 3439	if (ret)
 3440		return -EINTR;
 3441	return 0;
 3442}
 3443
 3444/*
 3445 * This creates an orphan entry for the given inode in case something goes wrong
 3446 * in the middle of an unlink.
 3447 */
 3448int btrfs_orphan_add(struct btrfs_trans_handle *trans,
 3449		     struct btrfs_inode *inode)
 3450{
 3451	int ret;
 3452
 3453	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
 3454	if (ret && ret != -EEXIST) {
 3455		btrfs_abort_transaction(trans, ret);
 3456		return ret;
 3457	}
 3458
 3459	return 0;
 3460}
 3461
 3462/*
 3463 * We have done the delete so we can go ahead and remove the orphan item for
 3464 * this particular inode.
 3465 */
 3466static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
 3467			    struct btrfs_inode *inode)
 3468{
 3469	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
 3470}
 3471
 3472/*
 3473 * this cleans up any orphans that may be left on the list from the last use
 3474 * of this root.
 3475 */
 3476int btrfs_orphan_cleanup(struct btrfs_root *root)
 3477{
 3478	struct btrfs_fs_info *fs_info = root->fs_info;
 3479	struct btrfs_path *path;
 3480	struct extent_buffer *leaf;
 3481	struct btrfs_key key, found_key;
 3482	struct btrfs_trans_handle *trans;
 3483	struct inode *inode;
 3484	u64 last_objectid = 0;
 3485	int ret = 0, nr_unlink = 0;
 3486
 3487	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
 3488		return 0;
 3489
 3490	path = btrfs_alloc_path();
 3491	if (!path) {
 3492		ret = -ENOMEM;
 3493		goto out;
 3494	}
 3495	path->reada = READA_BACK;
 3496
 3497	key.objectid = BTRFS_ORPHAN_OBJECTID;
 3498	key.type = BTRFS_ORPHAN_ITEM_KEY;
 3499	key.offset = (u64)-1;
 3500
 3501	while (1) {
 3502		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 3503		if (ret < 0)
 3504			goto out;
 3505
 3506		/*
 3507		 * if ret == 0 means we found what we were searching for, which
 3508		 * is weird, but possible, so only screw with path if we didn't
 3509		 * find the key and see if we have stuff that matches
 3510		 */
 3511		if (ret > 0) {
 3512			ret = 0;
 3513			if (path->slots[0] == 0)
 3514				break;
 3515			path->slots[0]--;
 3516		}
 3517
 3518		/* pull out the item */
 3519		leaf = path->nodes[0];
 3520		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 3521
 3522		/* make sure the item matches what we want */
 3523		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
 3524			break;
 3525		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
 3526			break;
 3527
 3528		/* release the path since we're done with it */
 3529		btrfs_release_path(path);
 3530
 3531		/*
 3532		 * this is where we are basically btrfs_lookup, without the
 3533		 * crossing root thing.  we store the inode number in the
 3534		 * offset of the orphan item.
 3535		 */
 3536
 3537		if (found_key.offset == last_objectid) {
 3538			/*
 3539			 * We found the same inode as before. This means we were
 3540			 * not able to remove its items via eviction triggered
 3541			 * by an iput(). A transaction abort may have happened,
 3542			 * due to -ENOSPC for example, so try to grab the error
 3543			 * that lead to a transaction abort, if any.
 3544			 */
 3545			btrfs_err(fs_info,
 3546				  "Error removing orphan entry, stopping orphan cleanup");
 3547			ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
 3548			goto out;
 3549		}
 3550
 3551		last_objectid = found_key.offset;
 3552
 3553		found_key.objectid = found_key.offset;
 3554		found_key.type = BTRFS_INODE_ITEM_KEY;
 3555		found_key.offset = 0;
 3556		inode = btrfs_iget(last_objectid, root);
 3557		if (IS_ERR(inode)) {
 3558			ret = PTR_ERR(inode);
 3559			inode = NULL;
 3560			if (ret != -ENOENT)
 3561				goto out;
 3562		}
 3563
 3564		if (!inode && root == fs_info->tree_root) {
 3565			struct btrfs_root *dead_root;
 3566			int is_dead_root = 0;
 3567
 3568			/*
 3569			 * This is an orphan in the tree root. Currently these
 3570			 * could come from 2 sources:
 3571			 *  a) a root (snapshot/subvolume) deletion in progress
 3572			 *  b) a free space cache inode
 3573			 * We need to distinguish those two, as the orphan item
 3574			 * for a root must not get deleted before the deletion
 3575			 * of the snapshot/subvolume's tree completes.
 3576			 *
 3577			 * btrfs_find_orphan_roots() ran before us, which has
 3578			 * found all deleted roots and loaded them into
 3579			 * fs_info->fs_roots_radix. So here we can find if an
 3580			 * orphan item corresponds to a deleted root by looking
 3581			 * up the root from that radix tree.
 3582			 */
 3583
 3584			spin_lock(&fs_info->fs_roots_radix_lock);
 3585			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
 3586							 (unsigned long)found_key.objectid);
 3587			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
 3588				is_dead_root = 1;
 3589			spin_unlock(&fs_info->fs_roots_radix_lock);
 3590
 3591			if (is_dead_root) {
 3592				/* prevent this orphan from being found again */
 3593				key.offset = found_key.objectid - 1;
 3594				continue;
 3595			}
 3596
 3597		}
 3598
 3599		/*
 3600		 * If we have an inode with links, there are a couple of
 3601		 * possibilities:
 3602		 *
 3603		 * 1. We were halfway through creating fsverity metadata for the
 3604		 * file. In that case, the orphan item represents incomplete
 3605		 * fsverity metadata which must be cleaned up with
 3606		 * btrfs_drop_verity_items and deleting the orphan item.
 3607
 3608		 * 2. Old kernels (before v3.12) used to create an
 3609		 * orphan item for truncate indicating that there were possibly
 3610		 * extent items past i_size that needed to be deleted. In v3.12,
 3611		 * truncate was changed to update i_size in sync with the extent
 3612		 * items, but the (useless) orphan item was still created. Since
 3613		 * v4.18, we don't create the orphan item for truncate at all.
 3614		 *
 3615		 * So, this item could mean that we need to do a truncate, but
 3616		 * only if this filesystem was last used on a pre-v3.12 kernel
 3617		 * and was not cleanly unmounted. The odds of that are quite
 3618		 * slim, and it's a pain to do the truncate now, so just delete
 3619		 * the orphan item.
 3620		 *
 3621		 * It's also possible that this orphan item was supposed to be
 3622		 * deleted but wasn't. The inode number may have been reused,
 3623		 * but either way, we can delete the orphan item.
 3624		 */
 3625		if (!inode || inode->i_nlink) {
 3626			if (inode) {
 3627				ret = btrfs_drop_verity_items(BTRFS_I(inode));
 3628				iput(inode);
 3629				inode = NULL;
 3630				if (ret)
 3631					goto out;
 3632			}
 3633			trans = btrfs_start_transaction(root, 1);
 3634			if (IS_ERR(trans)) {
 3635				ret = PTR_ERR(trans);
 3636				goto out;
 3637			}
 3638			btrfs_debug(fs_info, "auto deleting %Lu",
 3639				    found_key.objectid);
 3640			ret = btrfs_del_orphan_item(trans, root,
 3641						    found_key.objectid);
 3642			btrfs_end_transaction(trans);
 3643			if (ret)
 3644				goto out;
 3645			continue;
 3646		}
 3647
 3648		nr_unlink++;
 3649
 3650		/* this will do delete_inode and everything for us */
 3651		iput(inode);
 3652	}
 3653	/* release the path since we're done with it */
 3654	btrfs_release_path(path);
 3655
 3656	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
 3657		trans = btrfs_join_transaction(root);
 3658		if (!IS_ERR(trans))
 3659			btrfs_end_transaction(trans);
 3660	}
 3661
 3662	if (nr_unlink)
 3663		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
 3664
 3665out:
 3666	if (ret)
 3667		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
 3668	btrfs_free_path(path);
 3669	return ret;
 3670}
 3671
 3672/*
 3673 * very simple check to peek ahead in the leaf looking for xattrs.  If we
 3674 * don't find any xattrs, we know there can't be any acls.
 3675 *
 3676 * slot is the slot the inode is in, objectid is the objectid of the inode
 3677 */
 3678static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 3679					  int slot, u64 objectid,
 3680					  int *first_xattr_slot)
 3681{
 3682	u32 nritems = btrfs_header_nritems(leaf);
 3683	struct btrfs_key found_key;
 3684	static u64 xattr_access = 0;
 3685	static u64 xattr_default = 0;
 3686	int scanned = 0;
 3687
 3688	if (!xattr_access) {
 3689		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
 3690					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
 3691		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
 3692					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
 3693	}
 3694
 3695	slot++;
 3696	*first_xattr_slot = -1;
 3697	while (slot < nritems) {
 3698		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 3699
 3700		/* we found a different objectid, there must not be acls */
 3701		if (found_key.objectid != objectid)
 3702			return 0;
 3703
 3704		/* we found an xattr, assume we've got an acl */
 3705		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
 3706			if (*first_xattr_slot == -1)
 3707				*first_xattr_slot = slot;
 3708			if (found_key.offset == xattr_access ||
 3709			    found_key.offset == xattr_default)
 3710				return 1;
 3711		}
 3712
 3713		/*
 3714		 * we found a key greater than an xattr key, there can't
 3715		 * be any acls later on
 3716		 */
 3717		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
 3718			return 0;
 3719
 3720		slot++;
 3721		scanned++;
 3722
 3723		/*
 3724		 * it goes inode, inode backrefs, xattrs, extents,
 3725		 * so if there are a ton of hard links to an inode there can
 3726		 * be a lot of backrefs.  Don't waste time searching too hard,
 3727		 * this is just an optimization
 3728		 */
 3729		if (scanned >= 8)
 3730			break;
 3731	}
 3732	/* we hit the end of the leaf before we found an xattr or
 3733	 * something larger than an xattr.  We have to assume the inode
 3734	 * has acls
 3735	 */
 3736	if (*first_xattr_slot == -1)
 3737		*first_xattr_slot = slot;
 3738	return 1;
 3739}
 3740
 3741static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
 3742{
 3743	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 3744
 3745	if (WARN_ON_ONCE(inode->file_extent_tree))
 3746		return 0;
 3747	if (btrfs_fs_incompat(fs_info, NO_HOLES))
 3748		return 0;
 3749	if (!S_ISREG(inode->vfs_inode.i_mode))
 3750		return 0;
 3751	if (btrfs_is_free_space_inode(inode))
 3752		return 0;
 3753
 3754	inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
 3755	if (!inode->file_extent_tree)
 3756		return -ENOMEM;
 3757
 3758	extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT);
 3759	/* Lockdep class is set only for the file extent tree. */
 3760	lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
 3761
 3762	return 0;
 3763}
 3764
/*
 * Read an inode from the btree into the in-memory inode.
 *
 * @inode:   in-memory inode to populate (the name suggests the caller holds
 *           it locked - NOTE(review): confirm against callers)
 * @in_path: optional path to use for the lookup; when NULL a path is
 *           allocated here and freed before returning
 *
 * Besides copying the inode item fields, this caches the directory index from
 * the first inode ref/extref (only when the link count is 1), loads inode
 * properties from the xattr items, pre-caches "no ACL" when no ACL can exist,
 * and installs the inode/file/address-space operations matching the file type.
 *
 * Returns 0 on success.  Returns the error from btrfs_init_file_extent_tree(),
 * -ENOMEM if a path cannot be allocated, or whatever non-zero value
 * btrfs_lookup_inode() returned.  A failure to load inode properties is only
 * logged, it is not returned.
 */
static int btrfs_read_locked_inode(struct inode *inode,
				   struct btrfs_path *in_path)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_path *path = in_path;
	struct extent_buffer *leaf;
	struct btrfs_inode_item *inode_item;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key location;
	unsigned long ptr;
	int maybe_acls;
	u32 rdev;
	int ret;
	bool filled = false;
	int first_xattr_slot;

	ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
	if (ret)
		return ret;

	/*
	 * If btrfs_fill_inode() succeeds the in-memory fields (and rdev) are
	 * already set, presumably from the delayed inode, so copying them from
	 * the on-disk item below is skipped.
	 */
	ret = btrfs_fill_inode(inode, &rdev);
	if (!ret)
		filled = true;

	if (!path) {
		path = btrfs_alloc_path();
		if (!path)
			return -ENOMEM;
	}

	btrfs_get_inode_key(BTRFS_I(inode), &location);

	/* The leaf is still needed below even when the fields are filled. */
	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
	if (ret) {
		/* Only free the path if it was allocated here. */
		if (path != in_path)
			btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];

	if (filled)
		goto cache_index;

	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
	/* Mark [0, i_size rounded up to the sector size) in the file extent range. */
	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
			round_up(i_size_read(inode), fs_info->sectorsize));

	inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
			btrfs_timespec_nsec(leaf, &inode_item->atime));

	inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
			btrfs_timespec_nsec(leaf, &inode_item->mtime));

	inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
			btrfs_timespec_nsec(leaf, &inode_item->ctime));

	BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
	BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);

	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);

	inode_set_iversion_queried(inode,
				   btrfs_inode_sequence(leaf, inode_item));
	inode->i_generation = BTRFS_I(inode)->generation;
	/* i_rdev is only set later, by init_special_inode() for special files. */
	inode->i_rdev = 0;
	rdev = btrfs_inode_rdev(leaf, inode_item);

	if (S_ISDIR(inode->i_mode))
		BTRFS_I(inode)->index_cnt = (u64)-1;

	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);

cache_index:
	/*
	 * If we were modified in the current generation and evicted from memory
	 * and then re-read we need to do a full sync since we don't have any
	 * idea about which extents were modified before we were evicted from
	 * cache.
	 *
	 * This is required for both inode re-read from disk and delayed inode
	 * in the delayed_nodes xarray.
	 */
	if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			&BTRFS_I(inode)->runtime_flags);

	/*
	 * We don't persist the id of the transaction where an unlink operation
	 * against the inode was last made. So here we assume the inode might
	 * have been evicted, and therefore the exact value of last_unlink_trans
	 * lost, and set it to last_trans to avoid metadata inconsistencies
	 * between the inode and its parent if the inode is fsync'ed and the log
	 * replayed. For example, in the scenario:
	 *
	 * touch mydir/foo
	 * ln mydir/foo mydir/bar
	 * sync
	 * unlink mydir/bar
	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
	 * xfs_io -c fsync mydir/foo
	 * <power failure>
	 * mount fs, triggers fsync log replay
	 *
	 * We must make sure that when we fsync our inode foo we also log its
	 * parent inode, otherwise after log replay the parent still has the
	 * dentry with the "bar" name but our inode foo has a link count of 1
	 * and doesn't have an inode ref with the name "bar" anymore.
	 *
	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
	 * but it guarantees correctness at the expense of occasional full
	 * transaction commits on fsync if our inode is a directory, or if our
	 * inode is not a directory, logging its parent unnecessarily.
	 */
	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;

	/*
	 * Same logic as for last_unlink_trans. We don't persist the generation
	 * of the last transaction where this inode was used for a reflink
	 * operation, so after eviction and reloading the inode we must be
	 * pessimistic and assume the last transaction that modified the inode.
	 */
	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;

	/*
	 * Look at the item following the inode item.  The directory index is
	 * only cached when the inode has exactly one link and the next item in
	 * this leaf belongs to the same inode.
	 */
	path->slots[0]++;
	if (inode->i_nlink != 1 ||
	    path->slots[0] >= btrfs_header_nritems(leaf))
		goto cache_acl;

	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
		goto cache_acl;

	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	if (location.type == BTRFS_INODE_REF_KEY) {
		struct btrfs_inode_ref *ref;

		ref = (struct btrfs_inode_ref *)ptr;
		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *extref;

		extref = (struct btrfs_inode_extref *)ptr;
		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
								     extref);
	}
cache_acl:
	/*
	 * try to precache a NULL acl entry for files that don't have
	 * any xattrs or acls
	 */
	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
	if (first_xattr_slot != -1) {
		/* Point the path at the reported slot and load properties from there. */
		path->slots[0] = first_xattr_slot;
		ret = btrfs_load_inode_props(inode, path);
		/* Failure here is logged only; the function still returns 0. */
		if (ret)
			btrfs_err(fs_info,
				  "error loading props for ino %llu (root %llu): %d",
				  btrfs_ino(BTRFS_I(inode)),
				  btrfs_root_id(root), ret);
	}
	if (path != in_path)
		btrfs_free_path(path);

	if (!maybe_acls)
		cache_no_acl(inode);

	/* Install the operation tables that match the file type. */
	switch (inode->i_mode & S_IFMT) {
	case S_IFREG:
		inode->i_mapping->a_ops = &btrfs_aops;
		inode->i_fop = &btrfs_file_operations;
		inode->i_op = &btrfs_file_inode_operations;
		break;
	case S_IFDIR:
		inode->i_fop = &btrfs_dir_file_operations;
		inode->i_op = &btrfs_dir_inode_operations;
		break;
	case S_IFLNK:
		inode->i_op = &btrfs_symlink_inode_operations;
		inode_nohighmem(inode);
		inode->i_mapping->a_ops = &btrfs_aops;
		break;
	default:
		inode->i_op = &btrfs_special_inode_operations;
		init_special_inode(inode, inode->i_mode, rdev);
		break;
	}

	btrfs_sync_inode_flags_to_i_flags(inode);
	return 0;
}
 3969
 3970/*
 3971 * given a leaf and an inode, copy the inode fields into the leaf
 3972 */
 3973static void fill_inode_item(struct btrfs_trans_handle *trans,
 3974			    struct extent_buffer *leaf,
 3975			    struct btrfs_inode_item *item,
 3976			    struct inode *inode)
 3977{
 3978	struct btrfs_map_token token;
 3979	u64 flags;
 3980
 3981	btrfs_init_map_token(&token, leaf);
 3982
 3983	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
 3984	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
 3985	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
 3986	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
 3987	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
 3988
 3989	btrfs_set_token_timespec_sec(&token, &item->atime,
 3990				     inode_get_atime_sec(inode));
 3991	btrfs_set_token_timespec_nsec(&token, &item->atime,
 3992				      inode_get_atime_nsec(inode));
 3993
 3994	btrfs_set_token_timespec_sec(&token, &item->mtime,
 3995				     inode_get_mtime_sec(inode));
 3996	btrfs_set_token_timespec_nsec(&token, &item->mtime,
 3997				      inode_get_mtime_nsec(inode));
 3998
 3999	btrfs_set_token_timespec_sec(&token, &item->ctime,
 4000				     inode_get_ctime_sec(inode));
 4001	btrfs_set_token_timespec_nsec(&token, &item->ctime,
 4002				      inode_get_ctime_nsec(inode));
 4003
 4004	btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
 4005	btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
 4006
 4007	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
 4008	btrfs_set_token_inode_generation(&token, item,
 4009					 BTRFS_I(inode)->generation);
 4010	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
 4011	btrfs_set_token_inode_transid(&token, item, trans->transid);
 4012	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
 4013	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
 4014					  BTRFS_I(inode)->ro_flags);
 4015	btrfs_set_token_inode_flags(&token, item, flags);
 4016	btrfs_set_token_inode_block_group(&token, item, 0);
 4017}
 4018
 4019/*
 4020 * copy everything in the in-memory inode into the btree.
 4021 */
 4022static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
 4023					    struct btrfs_inode *inode)
 4024{
 4025	struct btrfs_inode_item *inode_item;
 4026	struct btrfs_path *path;
 4027	struct extent_buffer *leaf;
 4028	struct btrfs_key key;
 4029	int ret;
 4030
 4031	path = btrfs_alloc_path();
 4032	if (!path)
 4033		return -ENOMEM;
 4034
 4035	btrfs_get_inode_key(inode, &key);
 4036	ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1);
 4037	if (ret) {
 4038		if (ret > 0)
 4039			ret = -ENOENT;
 4040		goto failed;
 4041	}
 4042
 4043	leaf = path->nodes[0];
 4044	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 4045				    struct btrfs_inode_item);
 4046
 4047	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
 4048	btrfs_mark_buffer_dirty(trans, leaf);
 4049	btrfs_set_inode_last_trans(trans, inode);
 4050	ret = 0;
 4051failed:
 4052	btrfs_free_path(path);
 4053	return ret;
 4054}
 4055
 4056/*
 4057 * copy everything in the in-memory inode into the btree.
 4058 */
 4059int btrfs_update_inode(struct btrfs_trans_handle *trans,
 4060		       struct btrfs_inode *inode)
 4061{
 4062	struct btrfs_root *root = inode->root;
 4063	struct btrfs_fs_info *fs_info = root->fs_info;
 4064	int ret;
 4065
 4066	/*
 4067	 * If the inode is a free space inode, we can deadlock during commit
 4068	 * if we put it into the delayed code.
 4069	 *
 4070	 * The data relocation inode should also be directly updated
 4071	 * without delay
 4072	 */
 4073	if (!btrfs_is_free_space_inode(inode)
 4074	    && !btrfs_is_data_reloc_root(root)
 4075	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
 4076		btrfs_update_root_times(trans, root);
 4077
 4078		ret = btrfs_delayed_update_inode(trans, inode);
 4079		if (!ret)
 4080			btrfs_set_inode_last_trans(trans, inode);
 4081		return ret;
 4082	}
 4083
 4084	return btrfs_update_inode_item(trans, inode);
 4085}
 4086
 4087int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
 4088				struct btrfs_inode *inode)
 4089{
 4090	int ret;
 4091
 4092	ret = btrfs_update_inode(trans, inode);
 4093	if (ret == -ENOSPC)
 4094		return btrfs_update_inode_item(trans, inode);
 4095	return ret;
 4096}
 4097
/*
 * unlink helper that gets used here in inode.c and in the tree logging
 * recovery code.  It remove a link in a directory with a given name, and
 * also drops the back refs in the inode to the directory
 *
 * @trans:      transaction handle to operate in
 * @dir:        directory that contains the entry
 * @inode:      inode the entry points to
 * @name:       name of the entry to remove
 * @rename_ctx: NULL for a plain unlink; when set, the directory index of the
 *              removed entry is stored in rename_ctx->index and the log tree
 *              updates are skipped
 *
 * The link count of @inode is not changed here.  On success the directory's
 * i_size is reduced by twice the name length, both inodes get their iversion
 * and timestamps bumped and the directory inode is updated.
 *
 * Returns 0 on success or a negative error: -ENOMEM if no path could be
 * allocated, -ENOENT if the dir item was not found, or the error of a failed
 * step.  The transaction is aborted if deleting the inode ref or the delayed
 * dir index fails.
 */
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
				struct btrfs_inode *dir,
				struct btrfs_inode *inode,
				const struct fscrypt_str *name,
				struct btrfs_rename_ctx *rename_ctx)
{
	struct btrfs_root *root = dir->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	int ret = 0;
	struct btrfs_dir_item *di;
	u64 index;
	u64 ino = btrfs_ino(inode);
	u64 dir_ino = btrfs_ino(dir);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/* Find and delete the dir item for this name. */
	di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
		goto err;
	}
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
	if (ret)
		goto err;
	btrfs_release_path(path);

	/*
	 * If we don't have dir index, we have to get it by looking up
	 * the inode ref, since we get the inode ref, remove it directly,
	 * it is unnecessary to do delayed deletion.
	 *
	 * But if we have dir index, needn't search inode ref to get it.
	 * Since the inode ref is close to the inode item, it is better
	 * that we delay to delete it, and just do this deletion when
	 * we update the inode item.
	 */
	if (inode->dir_index) {
		ret = btrfs_delayed_delete_inode_ref(inode);
		if (!ret) {
			index = inode->dir_index;
			goto skip_backref;
		}
		/* Delayed deletion failed: fall back to deleting the ref now. */
	}

	ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
	if (ret) {
		btrfs_info(fs_info,
			"failed to delete reference to %.*s, inode %llu parent %llu",
			name->len, name->name, ino, dir_ino);
		btrfs_abort_transaction(trans, ret);
		goto err;
	}
skip_backref:
	if (rename_ctx)
		rename_ctx->index = index;

	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto err;
	}

	/*
	 * If we are in a rename context, we don't need to update anything in the
	 * log. That will be done later during the rename by btrfs_log_new_name().
	 * Besides that, doing it here would only cause extra unnecessary btree
	 * operations on the log tree, increasing latency for applications.
	 */
	if (!rename_ctx) {
		btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
		btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
	}

	/*
	 * If we have a pending delayed iput we could end up with the final iput
	 * being run in btrfs-cleaner context.  If we have enough of these built
	 * up we can end up burning a lot of time in btrfs-cleaner without any
	 * way to throttle the unlinks.  Since we're currently holding a ref on
	 * the inode we can run the delayed iput here without any issues as the
	 * final iput won't be done until after we drop the ref we're currently
	 * holding.
	 */
	btrfs_run_delayed_iput(fs_info, inode);
err:
	/* Reached on success too: the path is freed before checking ret. */
	btrfs_free_path(path);
	if (ret)
		goto out;

	/* Success: account for the removed entry in the directory inode. */
	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
	inode_inc_iversion(&inode->vfs_inode);
	inode_set_ctime_current(&inode->vfs_inode);
	inode_inc_iversion(&dir->vfs_inode);
 	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
	ret = btrfs_update_inode(trans, dir);
out:
	return ret;
}
 4205
 4206int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 4207		       struct btrfs_inode *dir, struct btrfs_inode *inode,
 4208		       const struct fscrypt_str *name)
 4209{
 4210	int ret;
 4211
 4212	ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
 4213	if (!ret) {
 4214		drop_nlink(&inode->vfs_inode);
 4215		ret = btrfs_update_inode(trans, inode);
 4216	}
 4217	return ret;
 4218}
 4219
 4220/*
 4221 * helper to start transaction for unlink and rmdir.
 4222 *
 4223 * unlink and rmdir are special in btrfs, they do not always free space, so
 4224 * if we cannot make our reservations the normal way try and see if there is
 4225 * plenty of slack room in the global reserve to migrate, otherwise we cannot
 4226 * allow the unlink to occur.
 4227 */
 4228static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
 4229{
 4230	struct btrfs_root *root = dir->root;
 4231
 4232	return btrfs_start_transaction_fallback_global_rsv(root,
 4233						   BTRFS_UNLINK_METADATA_UNITS);
 4234}
 4235
 4236static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 4237{
 4238	struct btrfs_trans_handle *trans;
 4239	struct inode *inode = d_inode(dentry);
 4240	int ret;
 4241	struct fscrypt_name fname;
 4242
 4243	ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
 4244	if (ret)
 4245		return ret;
 4246
 4247	/* This needs to handle no-key deletions later on */
 4248
 4249	trans = __unlink_start_trans(BTRFS_I(dir));
 4250	if (IS_ERR(trans)) {
 4251		ret = PTR_ERR(trans);
 4252		goto fscrypt_free;
 4253	}
 4254
 4255	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
 4256				false);
 4257
 4258	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
 4259				 &fname.disk_name);
 4260	if (ret)
 4261		goto end_trans;
 4262
 4263	if (inode->i_nlink == 0) {
 4264		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
 4265		if (ret)
 4266			goto end_trans;
 4267	}
 4268
 4269end_trans:
 4270	btrfs_end_transaction(trans);
 4271	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
 4272fscrypt_free:
 4273	fscrypt_free_filename(&fname);
 4274	return ret;
 4275}
 4276
/*
 * Remove the directory entry @dentry in @dir that refers to a subvolume.
 *
 * @trans:  transaction handle to operate in
 * @dir:    directory that contains the entry
 * @dentry: dentry of the subvolume entry
 *
 * The dir item is deleted, then either the root ref is deleted or, for a
 * placeholder inode, the dir index is looked up.  After that the delayed dir
 * index is deleted and the directory inode is updated (size reduced by twice
 * the on-disk name length, iversion and timestamps bumped).
 *
 * Returns 0 on success or a negative error: -EINVAL if the inode is neither a
 * subvolume root directory nor a placeholder, -ENOMEM if no path could be
 * allocated, -ENOENT if an expected item is missing.  The transaction is
 * aborted on every failure from the dir item deletion onwards; failures of the
 * initial dir item lookup do not abort it.
 */
static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
			       struct btrfs_inode *dir, struct dentry *dentry)
{
	struct btrfs_root *root = dir->root;
	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	u64 index;
	int ret;
	u64 objectid;
	u64 dir_ino = btrfs_ino(dir);
	struct fscrypt_name fname;

	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
	if (ret)
		return ret;

	/* This needs to handle no-key deletions later on */

	/* Work out the id of the root the entry refers to. */
	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
		objectid = btrfs_root_id(inode->root);
	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
		objectid = inode->ref_root_id;
	} else {
		WARN_ON(1);
		fscrypt_free_filename(&fname);
		return -EINVAL;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		/* btrfs_free_path() at "out" is reached with a NULL path here. */
		goto out;
	}

	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
				   &fname.disk_name, -1);
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	btrfs_dir_item_key_to_cpu(leaf, di, &key);
	/* The dir item is expected to point at the root item of that root. */
	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	btrfs_release_path(path);

	/*
	 * This is a placeholder inode for a subvolume we didn't have a
	 * reference to at the time of the snapshot creation.  In the meantime
	 * we could have renamed the real subvol link into our snapshot, so
	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
	 * Instead simply lookup the dir_index_item for this entry so we can
	 * remove it.  Otherwise we know we have a ref to the root and we can
	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
	 */
	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
		di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
		if (IS_ERR_OR_NULL(di)) {
			if (!di)
				ret = -ENOENT;
			else
				ret = PTR_ERR(di);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		/* The dir index is the offset of the key of the item found. */
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		index = key.offset;
		btrfs_release_path(path);
	} else {
		/* btrfs_del_root_ref() also hands back the dir index. */
		ret = btrfs_del_root_ref(trans, objectid,
					 btrfs_root_id(root), dir_ino,
					 &index, &fname.disk_name);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	/* Account for the removed entry in the directory inode. */
	btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
	inode_inc_iversion(&dir->vfs_inode);
	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
	ret = btrfs_update_inode_fallback(trans, dir);
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	fscrypt_free_filename(&fname);
	return ret;
}
 4382
 4383/*
 4384 * Helper to check if the subvolume references other subvolumes or if it's
 4385 * default.
 4386 */
 4387static noinline int may_destroy_subvol(struct btrfs_root *root)
 4388{
 4389	struct btrfs_fs_info *fs_info = root->fs_info;
 4390	struct btrfs_path *path;
 4391	struct btrfs_dir_item *di;
 4392	struct btrfs_key key;
 4393	struct fscrypt_str name = FSTR_INIT("default", 7);
 4394	u64 dir_id;
 4395	int ret;
 4396
 4397	path = btrfs_alloc_path();
 4398	if (!path)
 4399		return -ENOMEM;
 4400
 4401	/* Make sure this root isn't set as the default subvol */
 4402	dir_id = btrfs_super_root_dir(fs_info->super_copy);
 4403	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
 4404				   dir_id, &name, 0);
 4405	if (di && !IS_ERR(di)) {
 4406		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
 4407		if (key.objectid == btrfs_root_id(root)) {
 4408			ret = -EPERM;
 4409			btrfs_err(fs_info,
 4410				  "deleting default subvolume %llu is not allowed",
 4411				  key.objectid);
 4412			goto out;
 4413		}
 4414		btrfs_release_path(path);
 4415	}
 4416
 4417	key.objectid = btrfs_root_id(root);
 4418	key.type = BTRFS_ROOT_REF_KEY;
 4419	key.offset = (u64)-1;
 4420
 4421	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
 4422	if (ret < 0)
 4423		goto out;
 4424	if (ret == 0) {
 4425		/*
 4426		 * Key with offset -1 found, there would have to exist a root
 4427		 * with such id, but this is out of valid range.
 4428		 */
 4429		ret = -EUCLEAN;
 4430		goto out;
 4431	}
 4432
 4433	ret = 0;
 4434	if (path->slots[0] > 0) {
 4435		path->slots[0]--;
 4436		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 4437		if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
 4438			ret = -ENOTEMPTY;
 4439	}
 4440out:
 4441	btrfs_free_path(path);
 4442	return ret;
 4443}
 4444
 4445/* Delete all dentries for inodes belonging to the root */
 4446static void btrfs_prune_dentries(struct btrfs_root *root)
 4447{
 4448	struct btrfs_fs_info *fs_info = root->fs_info;
 4449	struct btrfs_inode *inode;
 4450	u64 min_ino = 0;
 4451
 4452	if (!BTRFS_FS_ERROR(fs_info))
 4453		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
 4454
 4455	inode = btrfs_find_first_inode(root, min_ino);
 4456	while (inode) {
 4457		if (atomic_read(&inode->vfs_inode.i_count) > 1)
 4458			d_prune_aliases(&inode->vfs_inode);
 4459
 4460		min_ino = btrfs_ino(inode) + 1;
 4461		/*
 4462		 * btrfs_drop_inode() will have it removed from the inode
 4463		 * cache when its usage count hits zero.
 4464		 */
 4465		iput(&inode->vfs_inode);
 4466		cond_resched();
 4467		inode = btrfs_find_first_inode(root, min_ino);
 4468	}
 4469}
 4470
 4471int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
 4472{
 4473	struct btrfs_root *root = dir->root;
 4474	struct btrfs_fs_info *fs_info = root->fs_info;
 4475	struct inode *inode = d_inode(dentry);
 4476	struct btrfs_root *dest = BTRFS_I(inode)->root;
 4477	struct btrfs_trans_handle *trans;
 4478	struct btrfs_block_rsv block_rsv;
 4479	u64 root_flags;
 4480	u64 qgroup_reserved = 0;
 4481	int ret;
 4482
 4483	down_write(&fs_info->subvol_sem);
 4484
 4485	/*
 4486	 * Don't allow to delete a subvolume with send in progress. This is
 4487	 * inside the inode lock so the error handling that has to drop the bit
 4488	 * again is not run concurrently.
 4489	 */
 4490	spin_lock(&dest->root_item_lock);
 4491	if (dest->send_in_progress) {
 4492		spin_unlock(&dest->root_item_lock);
 4493		btrfs_warn(fs_info,
 4494			   "attempt to delete subvolume %llu during send",
 4495			   btrfs_root_id(dest));
 4496		ret = -EPERM;
 4497		goto out_up_write;
 4498	}
 4499	if (atomic_read(&dest->nr_swapfiles)) {
 4500		spin_unlock(&dest->root_item_lock);
 4501		btrfs_warn(fs_info,
 4502			   "attempt to delete subvolume %llu with active swapfile",
 4503			   btrfs_root_id(root));
 4504		ret = -EPERM;
 4505		goto out_up_write;
 4506	}
 4507	root_flags = btrfs_root_flags(&dest->root_item);
 4508	btrfs_set_root_flags(&dest->root_item,
 4509			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
 4510	spin_unlock(&dest->root_item_lock);
 4511
 4512	ret = may_destroy_subvol(dest);
 4513	if (ret)
 4514		goto out_undead;
 4515
 4516	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
 4517	/*
 4518	 * One for dir inode,
 4519	 * two for dir entries,
 4520	 * two for root ref/backref.
 4521	 */
 4522	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
 4523	if (ret)
 4524		goto out_undead;
 4525	qgroup_reserved = block_rsv.qgroup_rsv_reserved;
 4526
 4527	trans = btrfs_start_transaction(root, 0);
 4528	if (IS_ERR(trans)) {
 4529		ret = PTR_ERR(trans);
 4530		goto out_release;
 4531	}
 4532	btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
 4533	qgroup_reserved = 0;
 4534	trans->block_rsv = &block_rsv;
 4535	trans->bytes_reserved = block_rsv.size;
 4536
 4537	btrfs_record_snapshot_destroy(trans, dir);
 4538
 4539	ret = btrfs_unlink_subvol(trans, dir, dentry);
 4540	if (ret) {
 4541		btrfs_abort_transaction(trans, ret);
 4542		goto out_end_trans;
 4543	}
 4544
 4545	ret = btrfs_record_root_in_trans(trans, dest);
 4546	if (ret) {
 4547		btrfs_abort_transaction(trans, ret);
 4548		goto out_end_trans;
 4549	}
 4550
 4551	memset(&dest->root_item.drop_progress, 0,
 4552		sizeof(dest->root_item.drop_progress));
 4553	btrfs_set_root_drop_level(&dest->root_item, 0);
 4554	btrfs_set_root_refs(&dest->root_item, 0);
 4555
 4556	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
 4557		ret = btrfs_insert_orphan_item(trans,
 4558					fs_info->tree_root,
 4559					btrfs_root_id(dest));
 4560		if (ret) {
 4561			btrfs_abort_transaction(trans, ret);
 4562			goto out_end_trans;
 4563		}
 4564	}
 4565
 4566	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
 4567				     BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
 4568	if (ret && ret != -ENOENT) {
 4569		btrfs_abort_transaction(trans, ret);
 4570		goto out_end_trans;
 4571	}
 4572	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
 4573		ret = btrfs_uuid_tree_remove(trans,
 4574					  dest->root_item.received_uuid,
 4575					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
 4576					  btrfs_root_id(dest));
 4577		if (ret && ret != -ENOENT) {
 4578			btrfs_abort_transaction(trans, ret);
 4579			goto out_end_trans;
 4580		}
 4581	}
 4582
 4583	free_anon_bdev(dest->anon_dev);
 4584	dest->anon_dev = 0;
 4585out_end_trans:
 4586	trans->block_rsv = NULL;
 4587	trans->bytes_reserved = 0;
 4588	ret = btrfs_end_transaction(trans);
 4589	inode->i_flags |= S_DEAD;
 4590out_release:
 4591	btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
 4592	if (qgroup_reserved)
 4593		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
 4594out_undead:
 4595	if (ret) {
 4596		spin_lock(&dest->root_item_lock);
 4597		root_flags = btrfs_root_flags(&dest->root_item);
 4598		btrfs_set_root_flags(&dest->root_item,
 4599				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
 4600		spin_unlock(&dest->root_item_lock);
 4601	}
 4602out_up_write:
 4603	up_write(&fs_info->subvol_sem);
 4604	if (!ret) {
 4605		d_invalidate(dentry);
 4606		btrfs_prune_dentries(dest);
 4607		ASSERT(dest->send_in_progress == 0);
 4608	}
 4609
 4610	return ret;
 4611}
 4612
 4613static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 4614{
 4615	struct inode *inode = d_inode(dentry);
 4616	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 4617	int ret = 0;
 4618	struct btrfs_trans_handle *trans;
 4619	u64 last_unlink_trans;
 4620	struct fscrypt_name fname;
 4621
 4622	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 4623		return -ENOTEMPTY;
 4624	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
 4625		if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
 4626			btrfs_err(fs_info,
 4627			"extent tree v2 doesn't support snapshot deletion yet");
 4628			return -EOPNOTSUPP;
 4629		}
 4630		return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
 4631	}
 4632
 4633	ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
 4634	if (ret)
 4635		return ret;
 4636
 4637	/* This needs to handle no-key deletions later on */
 4638
 4639	trans = __unlink_start_trans(BTRFS_I(dir));
 4640	if (IS_ERR(trans)) {
 4641		ret = PTR_ERR(trans);
 4642		goto out_notrans;
 4643	}
 4644
 4645	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
 4646		ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
 4647		goto out;
 4648	}
 4649
 4650	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
 4651	if (ret)
 4652		goto out;
 4653
 4654	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
 4655
 4656	/* now the directory is empty */
 4657	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
 4658				 &fname.disk_name);
 4659	if (!ret) {
 4660		btrfs_i_size_write(BTRFS_I(inode), 0);
 4661		/*
 4662		 * Propagate the last_unlink_trans value of the deleted dir to
 4663		 * its parent directory. This is to prevent an unrecoverable
 4664		 * log tree in the case we do something like this:
 4665		 * 1) create dir foo
 4666		 * 2) create snapshot under dir foo
 4667		 * 3) delete the snapshot
 4668		 * 4) rmdir foo
 4669		 * 5) mkdir foo
 4670		 * 6) fsync foo or some file inside foo
 4671		 */
 4672		if (last_unlink_trans >= trans->transid)
 4673			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
 4674	}
 4675out:
 4676	btrfs_end_transaction(trans);
 4677out_notrans:
 4678	btrfs_btree_balance_dirty(fs_info);
 4679	fscrypt_free_filename(&fname);
 4680
 4681	return ret;
 4682}
 4683
 4684/*
 4685 * Read, zero a chunk and write a block.
 4686 *
 4687 * @inode - inode that we're zeroing
 4688 * @from - the offset to start zeroing
 4689 * @len - the length to zero, 0 to zero the entire range respective to the
 4690 *	offset
 4691 * @front - zero up to the offset instead of from the offset on
 4692 *
 4693 * This will find the block for the "from" offset and cow the block and zero the
 4694 * part we want to zero.  This is used with truncate and hole punching.
 4695 */
 4696int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
 4697			 int front)
 4698{
 4699	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 4700	struct address_space *mapping = inode->vfs_inode.i_mapping;
 4701	struct extent_io_tree *io_tree = &inode->io_tree;
 4702	struct btrfs_ordered_extent *ordered;
 4703	struct extent_state *cached_state = NULL;
 4704	struct extent_changeset *data_reserved = NULL;
 4705	bool only_release_metadata = false;
 4706	u32 blocksize = fs_info->sectorsize;
 4707	pgoff_t index = from >> PAGE_SHIFT;
 4708	unsigned offset = from & (blocksize - 1);
 4709	struct folio *folio;
 4710	gfp_t mask = btrfs_alloc_write_mask(mapping);
 4711	size_t write_bytes = blocksize;
 4712	int ret = 0;
 4713	u64 block_start;
 4714	u64 block_end;
 4715
 4716	if (IS_ALIGNED(offset, blocksize) &&
 4717	    (!len || IS_ALIGNED(len, blocksize)))
 4718		goto out;
 4719
 4720	block_start = round_down(from, blocksize);
 4721	block_end = block_start + blocksize - 1;
 4722
 4723	ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
 4724					  blocksize, false);
 4725	if (ret < 0) {
 4726		if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
 4727			/* For nocow case, no need to reserve data space */
 4728			only_release_metadata = true;
 4729		} else {
 4730			goto out;
 4731		}
 4732	}
 4733	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
 4734	if (ret < 0) {
 4735		if (!only_release_metadata)
 4736			btrfs_free_reserved_data_space(inode, data_reserved,
 4737						       block_start, blocksize);
 4738		goto out;
 4739	}
 4740again:
 4741	folio = __filemap_get_folio(mapping, index,
 4742				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
 4743	if (IS_ERR(folio)) {
 4744		btrfs_delalloc_release_space(inode, data_reserved, block_start,
 4745					     blocksize, true);
 4746		btrfs_delalloc_release_extents(inode, blocksize);
 4747		ret = -ENOMEM;
 4748		goto out;
 4749	}
 4750
 4751	if (!folio_test_uptodate(folio)) {
 4752		ret = btrfs_read_folio(NULL, folio);
 4753		folio_lock(folio);
 4754		if (folio->mapping != mapping) {
 4755			folio_unlock(folio);
 4756			folio_put(folio);
 4757			goto again;
 4758		}
 4759		if (!folio_test_uptodate(folio)) {
 4760			ret = -EIO;
 4761			goto out_unlock;
 4762		}
 4763	}
 4764
 4765	/*
 4766	 * We unlock the page after the io is completed and then re-lock it
 4767	 * above.  release_folio() could have come in between that and cleared
 4768	 * folio private, but left the page in the mapping.  Set the page mapped
 4769	 * here to make sure it's properly set for the subpage stuff.
 4770	 */
 4771	ret = set_folio_extent_mapped(folio);
 4772	if (ret < 0)
 4773		goto out_unlock;
 4774
 4775	folio_wait_writeback(folio);
 4776
 4777	lock_extent(io_tree, block_start, block_end, &cached_state);
 4778
 4779	ordered = btrfs_lookup_ordered_extent(inode, block_start);
 4780	if (ordered) {
 4781		unlock_extent(io_tree, block_start, block_end, &cached_state);
 4782		folio_unlock(folio);
 4783		folio_put(folio);
 4784		btrfs_start_ordered_extent(ordered);
 4785		btrfs_put_ordered_extent(ordered);
 4786		goto again;
 4787	}
 4788
 4789	clear_extent_bit(&inode->io_tree, block_start, block_end,
 4790			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 4791			 &cached_state);
 4792
 4793	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
 4794					&cached_state);
 4795	if (ret) {
 4796		unlock_extent(io_tree, block_start, block_end, &cached_state);
 4797		goto out_unlock;
 4798	}
 4799
 4800	if (offset != blocksize) {
 4801		if (!len)
 4802			len = blocksize - offset;
 4803		if (front)
 4804			folio_zero_range(folio, block_start - folio_pos(folio),
 4805					 offset);
 4806		else
 4807			folio_zero_range(folio,
 4808					 (block_start - folio_pos(folio)) + offset,
 4809					 len);
 4810	}
 4811	btrfs_folio_clear_checked(fs_info, folio, block_start,
 4812				  block_end + 1 - block_start);
 4813	btrfs_folio_set_dirty(fs_info, folio, block_start,
 4814			      block_end + 1 - block_start);
 4815	unlock_extent(io_tree, block_start, block_end, &cached_state);
 4816
 4817	if (only_release_metadata)
 4818		set_extent_bit(&inode->io_tree, block_start, block_end,
 4819			       EXTENT_NORESERVE, NULL);
 4820
 4821out_unlock:
 4822	if (ret) {
 4823		if (only_release_metadata)
 4824			btrfs_delalloc_release_metadata(inode, blocksize, true);
 4825		else
 4826			btrfs_delalloc_release_space(inode, data_reserved,
 4827					block_start, blocksize, true);
 4828	}
 4829	btrfs_delalloc_release_extents(inode, blocksize);
 4830	folio_unlock(folio);
 4831	folio_put(folio);
 4832out:
 4833	if (only_release_metadata)
 4834		btrfs_check_nocow_unlock(inode);
 4835	extent_changeset_free(data_reserved);
 4836	return ret;
 4837}
 4838
 4839static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
 4840{
 4841	struct btrfs_root *root = inode->root;
 4842	struct btrfs_fs_info *fs_info = root->fs_info;
 4843	struct btrfs_trans_handle *trans;
 4844	struct btrfs_drop_extents_args drop_args = { 0 };
 4845	int ret;
 4846
 4847	/*
 4848	 * If NO_HOLES is enabled, we don't need to do anything.
 4849	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
 4850	 * or btrfs_update_inode() will be called, which guarantee that the next
 4851	 * fsync will know this inode was changed and needs to be logged.
 4852	 */
 4853	if (btrfs_fs_incompat(fs_info, NO_HOLES))
 4854		return 0;
 4855
 4856	/*
 4857	 * 1 - for the one we're dropping
 4858	 * 1 - for the one we're adding
 4859	 * 1 - for updating the inode.
 4860	 */
 4861	trans = btrfs_start_transaction(root, 3);
 4862	if (IS_ERR(trans))
 4863		return PTR_ERR(trans);
 4864
 4865	drop_args.start = offset;
 4866	drop_args.end = offset + len;
 4867	drop_args.drop_cache = true;
 4868
 4869	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
 4870	if (ret) {
 4871		btrfs_abort_transaction(trans, ret);
 4872		btrfs_end_transaction(trans);
 4873		return ret;
 4874	}
 4875
 4876	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
 4877	if (ret) {
 4878		btrfs_abort_transaction(trans, ret);
 4879	} else {
 4880		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
 4881		btrfs_update_inode(trans, inode);
 4882	}
 4883	btrfs_end_transaction(trans);
 4884	return ret;
 4885}
 4886
 4887/*
 4888 * This function puts in dummy file extents for the area we're creating a hole
 4889 * for.  So if we are truncating this file to a larger size we need to insert
 4890 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
 4891 * the range between oldsize and size
 4892 */
 4893int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
 4894{
 4895	struct btrfs_root *root = inode->root;
 4896	struct btrfs_fs_info *fs_info = root->fs_info;
 4897	struct extent_io_tree *io_tree = &inode->io_tree;
 4898	struct extent_map *em = NULL;
 4899	struct extent_state *cached_state = NULL;
 4900	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
 4901	u64 block_end = ALIGN(size, fs_info->sectorsize);
 4902	u64 last_byte;
 4903	u64 cur_offset;
 4904	u64 hole_size;
 4905	int ret = 0;
 4906
 4907	/*
 4908	 * If our size started in the middle of a block we need to zero out the
 4909	 * rest of the block before we expand the i_size, otherwise we could
 4910	 * expose stale data.
 4911	 */
 4912	ret = btrfs_truncate_block(inode, oldsize, 0, 0);
 4913	if (ret)
 4914		return ret;
 4915
 4916	if (size <= hole_start)
 4917		return 0;
 4918
 4919	btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
 4920					   &cached_state);
 4921	cur_offset = hole_start;
 4922	while (1) {
 4923		em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
 4924		if (IS_ERR(em)) {
 4925			ret = PTR_ERR(em);
 4926			em = NULL;
 4927			break;
 4928		}
 4929		last_byte = min(extent_map_end(em), block_end);
 4930		last_byte = ALIGN(last_byte, fs_info->sectorsize);
 4931		hole_size = last_byte - cur_offset;
 4932
 4933		if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
 4934			struct extent_map *hole_em;
 4935
 4936			ret = maybe_insert_hole(inode, cur_offset, hole_size);
 4937			if (ret)
 4938				break;
 4939
 4940			ret = btrfs_inode_set_file_extent_range(inode,
 4941							cur_offset, hole_size);
 4942			if (ret)
 4943				break;
 4944
 4945			hole_em = alloc_extent_map();
 4946			if (!hole_em) {
 4947				btrfs_drop_extent_map_range(inode, cur_offset,
 4948						    cur_offset + hole_size - 1,
 4949						    false);
 4950				btrfs_set_inode_full_sync(inode);
 4951				goto next;
 4952			}
 4953			hole_em->start = cur_offset;
 4954			hole_em->len = hole_size;
 4955
 4956			hole_em->disk_bytenr = EXTENT_MAP_HOLE;
 4957			hole_em->disk_num_bytes = 0;
 4958			hole_em->ram_bytes = hole_size;
 4959			hole_em->generation = btrfs_get_fs_generation(fs_info);
 4960
 4961			ret = btrfs_replace_extent_map_range(inode, hole_em, true);
 4962			free_extent_map(hole_em);
 4963		} else {
 4964			ret = btrfs_inode_set_file_extent_range(inode,
 4965							cur_offset, hole_size);
 4966			if (ret)
 4967				break;
 4968		}
 4969next:
 4970		free_extent_map(em);
 4971		em = NULL;
 4972		cur_offset = last_byte;
 4973		if (cur_offset >= block_end)
 4974			break;
 4975	}
 4976	free_extent_map(em);
 4977	unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
 4978	return ret;
 4979}
 4980
/*
 * Change the size of @inode to attr->ia_size.
 *
 * A larger size goes through btrfs_cont_expand() followed by an inode update,
 * both done while holding the root's snapshot_lock.  A smaller or unchanged
 * size goes through truncate_setsize() and btrfs_truncate().
 *
 * Returns 0 on success or the error of the step that failed.
 */
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	loff_t oldsize = i_size_read(inode);
	loff_t newsize = attr->ia_size;
	int mask = attr->ia_valid;
	int ret;

	/*
	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
	 * special case where we need to update the times despite not having
	 * these flags set.  For all other operations the VFS set these flags
	 * explicitly if it wants a timestamp update.
	 */
	if (newsize != oldsize) {
		inode_inc_iversion(inode);
		if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
			inode_set_mtime_to_ts(inode,
					      inode_set_ctime_current(inode));
		}
	}

	if (newsize > oldsize) {
		/*
		 * Don't do an expanding truncate while snapshotting is ongoing.
		 * This is to ensure the snapshot captures a fully consistent
		 * state of this file - if the snapshot captures this expanding
		 * truncation, it must capture all writes that happened before
		 * this truncation.
		 */
		btrfs_drew_write_lock(&root->snapshot_lock);
		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
		if (ret) {
			btrfs_drew_write_unlock(&root->snapshot_lock);
			return ret;
		}

		/* One unit reserved; the inode update below runs in it. */
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			btrfs_drew_write_unlock(&root->snapshot_lock);
			return PTR_ERR(trans);
		}

		i_size_write(inode, newsize);
		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
		pagecache_isize_extended(inode, oldsize, newsize);
		ret = btrfs_update_inode(trans, BTRFS_I(inode));
		btrfs_drew_write_unlock(&root->snapshot_lock);
		btrfs_end_transaction(trans);
	} else {
		struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);

		/*
		 * On zoned filesystems wait for ordered extents from the new
		 * (sector aligned) size onwards before truncating.
		 */
		if (btrfs_is_zoned(fs_info)) {
			ret = btrfs_wait_ordered_range(BTRFS_I(inode),
					ALIGN(newsize, fs_info->sectorsize),
					(u64)-1);
			if (ret)
				return ret;
		}

		/*
		 * We're truncating a file that used to have good data down to
		 * zero. Make sure any new writes to the file get on disk
		 * on close.
		 */
		if (newsize == 0)
			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
				&BTRFS_I(inode)->runtime_flags);

		truncate_setsize(inode, newsize);

		inode_dio_wait(inode);

		ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
		if (ret && inode->i_nlink) {
			int err;

			/*
			 * Truncate failed, so fix up the in-memory size. We
			 * adjusted disk_i_size down as we removed extents, so
			 * wait for disk_i_size to be stable and then update the
			 * in-memory size to match.
			 */
			err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
			if (err)
				return err;
			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
		}
	}

	return ret;
}
 5074
 5075static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 5076			 struct iattr *attr)
 5077{
 5078	struct inode *inode = d_inode(dentry);
 5079	struct btrfs_root *root = BTRFS_I(inode)->root;
 5080	int err;
 5081
 5082	if (btrfs_root_readonly(root))
 5083		return -EROFS;
 5084
 5085	err = setattr_prepare(idmap, dentry, attr);
 5086	if (err)
 5087		return err;
 5088
 5089	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
 5090		err = btrfs_setsize(inode, attr);
 5091		if (err)
 5092			return err;
 5093	}
 5094
 5095	if (attr->ia_valid) {
 5096		setattr_copy(idmap, inode, attr);
 5097		inode_inc_iversion(inode);
 5098		err = btrfs_dirty_inode(BTRFS_I(inode));
 5099
 5100		if (!err && attr->ia_valid & ATTR_MODE)
 5101			err = posix_acl_chmod(idmap, dentry, inode->i_mode);
 5102	}
 5103
 5104	return err;
 5105}
 5106
 5107/*
 5108 * While truncating the inode pages during eviction, we get the VFS
 5109 * calling btrfs_invalidate_folio() against each folio of the inode. This
 5110 * is slow because the calls to btrfs_invalidate_folio() result in a
 5111 * huge amount of calls to lock_extent() and clear_extent_bit(),
 5112 * which keep merging and splitting extent_state structures over and over,
 5113 * wasting lots of time.
 5114 *
 5115 * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
 5116 * skip all those expensive operations on a per folio basis and do only
 5117 * the ordered io finishing, while we release here the extent_map and
 5118 * extent_state structures, without the excessive merging and splitting.
 5119 */
 5120static void evict_inode_truncate_pages(struct inode *inode)
 5121{
 5122	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 5123	struct rb_node *node;
 5124
 5125	ASSERT(inode->i_state & I_FREEING);
 5126	truncate_inode_pages_final(&inode->i_data);
 5127
 5128	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
 5129
 5130	/*
 5131	 * Keep looping until we have no more ranges in the io tree.
 5132	 * We can have ongoing bios started by readahead that have
 5133	 * their endio callback (extent_io.c:end_bio_extent_readpage)
 5134	 * still in progress (unlocked the pages in the bio but did not yet
 5135	 * unlocked the ranges in the io tree). Therefore this means some
 5136	 * ranges can still be locked and eviction started because before
 5137	 * submitting those bios, which are executed by a separate task (work
 5138	 * queue kthread), inode references (inode->i_count) were not taken
 5139	 * (which would be dropped in the end io callback of each bio).
 5140	 * Therefore here we effectively end up waiting for those bios and
 5141	 * anyone else holding locked ranges without having bumped the inode's
 5142	 * reference count - if we don't do it, when they access the inode's
 5143	 * io_tree to unlock a range it may be too late, leading to an
 5144	 * use-after-free issue.
 5145	 */
 5146	spin_lock(&io_tree->lock);
 5147	while (!RB_EMPTY_ROOT(&io_tree->state)) {
 5148		struct extent_state *state;
 5149		struct extent_state *cached_state = NULL;
 5150		u64 start;
 5151		u64 end;
 5152		unsigned state_flags;
 5153
 5154		node = rb_first(&io_tree->state);
 5155		state = rb_entry(node, struct extent_state, rb_node);
 5156		start = state->start;
 5157		end = state->end;
 5158		state_flags = state->state;
 5159		spin_unlock(&io_tree->lock);
 5160
 5161		lock_extent(io_tree, start, end, &cached_state);
 5162
 5163		/*
 5164		 * If still has DELALLOC flag, the extent didn't reach disk,
 5165		 * and its reserved space won't be freed by delayed_ref.
 5166		 * So we need to free its reserved space here.
 5167		 * (Refer to comment in btrfs_invalidate_folio, case 2)
 5168		 *
 5169		 * Note, end is the bytenr of last byte, so we need + 1 here.
 5170		 */
 5171		if (state_flags & EXTENT_DELALLOC)
 5172			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
 5173					       end - start + 1, NULL);
 5174
 5175		clear_extent_bit(io_tree, start, end,
 5176				 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
 5177				 &cached_state);
 5178
 5179		cond_resched();
 5180		spin_lock(&io_tree->lock);
 5181	}
 5182	spin_unlock(&io_tree->lock);
 5183}
 5184
 5185static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
 5186							struct btrfs_block_rsv *rsv)
 5187{
 5188	struct btrfs_fs_info *fs_info = root->fs_info;
 5189	struct btrfs_trans_handle *trans;
 5190	u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
 5191	int ret;
 5192
 5193	/*
 5194	 * Eviction should be taking place at some place safe because of our
 5195	 * delayed iputs.  However the normal flushing code will run delayed
 5196	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
 5197	 *
 5198	 * We reserve the delayed_refs_extra here again because we can't use
 5199	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
 5200	 * above.  We reserve our extra bit here because we generate a ton of
 5201	 * delayed refs activity by truncating.
 5202	 *
 5203	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
 5204	 * if we fail to make this reservation we can re-try without the
 5205	 * delayed_refs_extra so we can make some forward progress.
 5206	 */
 5207	ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
 5208				     BTRFS_RESERVE_FLUSH_EVICT);
 5209	if (ret) {
 5210		ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
 5211					     BTRFS_RESERVE_FLUSH_EVICT);
 5212		if (ret) {
 5213			btrfs_warn(fs_info,
 5214				   "could not allocate space for delete; will truncate on mount");
 5215			return ERR_PTR(-ENOSPC);
 5216		}
 5217		delayed_refs_extra = 0;
 5218	}
 5219
 5220	trans = btrfs_join_transaction(root);
 5221	if (IS_ERR(trans))
 5222		return trans;
 5223
 5224	if (delayed_refs_extra) {
 5225		trans->block_rsv = &fs_info->trans_block_rsv;
 5226		trans->bytes_reserved = delayed_refs_extra;
 5227		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
 5228					delayed_refs_extra, true);
 5229	}
 5230	return trans;
 5231}
 5232
/*
 * Evict @inode: release its pages, extent maps and io tree state and, when
 * the inode has no links left, delete its items from the tree as well.
 *
 * An inode without a root is only cleaned up in memory.  Item deletion is
 * skipped for bad inodes, during log recovery, when i_nlink is non-zero, and
 * whenever one of the preparation steps below fails.
 */
void btrfs_evict_inode(struct inode *inode)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *rsv = NULL;
	int ret;

	trace_btrfs_inode_evict(inode);

	if (!root) {
		fsverity_cleanup_inode(inode);
		clear_inode(inode);
		return;
	}

	fs_info = inode_to_fs_info(inode);
	evict_inode_truncate_pages(inode);

	/*
	 * Inode still linked and either living in a root that still has
	 * references (other than the root tree) or a free space inode:
	 * nothing to delete.
	 */
	if (inode->i_nlink &&
	    ((btrfs_root_refs(&root->root_item) != 0 &&
	      btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
	     btrfs_is_free_space_inode(BTRFS_I(inode))))
		goto out;

	if (is_bad_inode(inode))
		goto out;

	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		goto out;

	if (inode->i_nlink > 0) {
		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
		       btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
		goto out;
	}

	/*
	 * This makes sure the inode item in tree is uptodate and the space for
	 * the inode update is released.
	 */
	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
	if (ret)
		goto out;

	/*
	 * This drops any pending insert or delete operations we have for this
	 * inode.  We could have a delayed dir index deletion queued up, but
	 * we're removing the inode completely so that'll be taken care of in
	 * the truncate.
	 */
	btrfs_kill_delayed_inode_items(BTRFS_I(inode));

	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		goto out;
	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
	rsv->failfast = true;

	btrfs_i_size_write(BTRFS_I(inode), 0);

	/*
	 * Truncate all items of the inode.  Each pass refills the reservation
	 * and joins a transaction; -ENOSPC and -EAGAIN lead to another pass,
	 * any other error gives up.
	 */
	while (1) {
		struct btrfs_truncate_control control = {
			.inode = BTRFS_I(inode),
			.ino = btrfs_ino(BTRFS_I(inode)),
			.new_size = 0,
			.min_type = 0,
		};

		trans = evict_refill_and_join(root, rsv);
		if (IS_ERR(trans))
			goto out;

		trans->block_rsv = rsv;

		ret = btrfs_truncate_inode_items(trans, root, &control);
		trans->block_rsv = &fs_info->trans_block_rsv;
		btrfs_end_transaction(trans);
		/*
		 * We have not added new delayed items for our inode after we
		 * have flushed its delayed items, so no need to throttle on
		 * delayed items. However we have modified extent buffers.
		 */
		btrfs_btree_balance_dirty_nodelay(fs_info);
		if (ret && ret != -ENOSPC && ret != -EAGAIN)
			goto out;
		else if (!ret)
			break;
	}

	/*
	 * Errors here aren't a big deal, it just means we leave orphan items in
	 * the tree. They will be cleaned up on the next mount. If the inode
	 * number gets reused, cleanup deletes the orphan item without doing
	 * anything, and unlink reuses the existing orphan item.
	 *
	 * If it turns out that we are dropping too many of these, we might want
	 * to add a mechanism for retrying these after a commit.
	 */
	trans = evict_refill_and_join(root, rsv);
	if (!IS_ERR(trans)) {
		trans->block_rsv = rsv;
		btrfs_orphan_del(trans, BTRFS_I(inode));
		trans->block_rsv = &fs_info->trans_block_rsv;
		btrfs_end_transaction(trans);
	}

out:
	/* rsv is still NULL when we get here before it was allocated. */
	btrfs_free_block_rsv(fs_info, rsv);
	/*
	 * If we didn't successfully delete, the orphan item will still be in
	 * the tree and we'll retry on the next mount. Again, we might also want
	 * to retry these periodically in the future.
	 */
	btrfs_remove_delayed_node(BTRFS_I(inode));
	fsverity_cleanup_inode(inode);
	clear_inode(inode);
}
 5351
 5352/*
 5353 * Return the key found in the dir entry in the location pointer, fill @type
 5354 * with BTRFS_FT_*, and return 0.
 5355 *
 5356 * If no dir entries were found, returns -ENOENT.
 5357 * If found a corrupted location in dir entry, returns -EUCLEAN.
 5358 */
 5359static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
 5360			       struct btrfs_key *location, u8 *type)
 5361{
 5362	struct btrfs_dir_item *di;
 5363	struct btrfs_path *path;
 5364	struct btrfs_root *root = dir->root;
 5365	int ret = 0;
 5366	struct fscrypt_name fname;
 5367
 5368	path = btrfs_alloc_path();
 5369	if (!path)
 5370		return -ENOMEM;
 5371
 5372	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
 5373	if (ret < 0)
 5374		goto out;
 5375	/*
 5376	 * fscrypt_setup_filename() should never return a positive value, but
 5377	 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
 5378	 */
 5379	ASSERT(ret == 0);
 5380
 5381	/* This needs to handle no-key deletions later on */
 5382
 5383	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
 5384				   &fname.disk_name, 0);
 5385	if (IS_ERR_OR_NULL(di)) {
 5386		ret = di ? PTR_ERR(di) : -ENOENT;
 5387		goto out;
 5388	}
 5389
 5390	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
 5391	if (location->type != BTRFS_INODE_ITEM_KEY &&
 5392	    location->type != BTRFS_ROOT_ITEM_KEY) {
 5393		ret = -EUCLEAN;
 5394		btrfs_warn(root->fs_info,
 5395"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
 5396			   __func__, fname.disk_name.name, btrfs_ino(dir),
 5397			   location->objectid, location->type, location->offset);
 5398	}
 5399	if (!ret)
 5400		*type = btrfs_dir_ftype(path->nodes[0], di);
 5401out:
 5402	fscrypt_free_filename(&fname);
 5403	btrfs_free_path(path);
 5404	return ret;
 5405}
 5406
 5407/*
 5408 * when we hit a tree root in a directory, the btrfs part of the inode
 5409 * needs to be changed to reflect the root directory of the tree root.  This
 5410 * is kind of like crossing a mount point.
 5411 */
 5412static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
 5413				    struct btrfs_inode *dir,
 5414				    struct dentry *dentry,
 5415				    struct btrfs_key *location,
 5416				    struct btrfs_root **sub_root)
 5417{
 5418	struct btrfs_path *path;
 5419	struct btrfs_root *new_root;
 5420	struct btrfs_root_ref *ref;
 5421	struct extent_buffer *leaf;
 5422	struct btrfs_key key;
 5423	int ret;
 5424	int err = 0;
 5425	struct fscrypt_name fname;
 5426
 5427	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
 5428	if (ret)
 5429		return ret;
 5430
 5431	path = btrfs_alloc_path();
 5432	if (!path) {
 5433		err = -ENOMEM;
 5434		goto out;
 5435	}
 5436
 5437	err = -ENOENT;
 5438	key.objectid = btrfs_root_id(dir->root);
 5439	key.type = BTRFS_ROOT_REF_KEY;
 5440	key.offset = location->objectid;
 5441
 5442	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
 5443	if (ret) {
 5444		if (ret < 0)
 5445			err = ret;
 5446		goto out;
 5447	}
 5448
 5449	leaf = path->nodes[0];
 5450	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
 5451	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
 5452	    btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
 5453		goto out;
 5454
 5455	ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
 5456				   (unsigned long)(ref + 1), fname.disk_name.len);
 5457	if (ret)
 5458		goto out;
 5459
 5460	btrfs_release_path(path);
 5461
 5462	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
 5463	if (IS_ERR(new_root)) {
 5464		err = PTR_ERR(new_root);
 5465		goto out;
 5466	}
 5467
 5468	*sub_root = new_root;
 5469	location->objectid = btrfs_root_dirid(&new_root->root_item);
 5470	location->type = BTRFS_INODE_ITEM_KEY;
 5471	location->offset = 0;
 5472	err = 0;
 5473out:
 5474	btrfs_free_path(path);
 5475	fscrypt_free_filename(&fname);
 5476	return err;
 5477}
 5478
 5479static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
 5480{
 5481	struct btrfs_root *root = inode->root;
 5482	struct btrfs_inode *existing;
 5483	const u64 ino = btrfs_ino(inode);
 5484	int ret;
 5485
 5486	if (inode_unhashed(&inode->vfs_inode))
 5487		return 0;
 5488
 5489	if (prealloc) {
 5490		ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
 5491		if (ret)
 5492			return ret;
 5493	}
 5494
 5495	existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
 5496
 5497	if (xa_is_err(existing)) {
 5498		ret = xa_err(existing);
 5499		ASSERT(ret != -EINVAL);
 5500		ASSERT(ret != -ENOMEM);
 5501		return ret;
 5502	} else if (existing) {
 5503		WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
 5504	}
 5505
 5506	return 0;
 5507}
 5508
 5509static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
 5510{
 5511	struct btrfs_root *root = inode->root;
 5512	struct btrfs_inode *entry;
 5513	bool empty = false;
 5514
 5515	xa_lock(&root->inodes);
 5516	entry = __xa_erase(&root->inodes, btrfs_ino(inode));
 5517	if (entry == inode)
 5518		empty = xa_empty(&root->inodes);
 5519	xa_unlock(&root->inodes);
 5520
 5521	if (empty && btrfs_root_refs(&root->root_item) == 0) {
 5522		xa_lock(&root->inodes);
 5523		empty = xa_empty(&root->inodes);
 5524		xa_unlock(&root->inodes);
 5525		if (empty)
 5526			btrfs_add_dead_root(root);
 5527	}
 5528}
 5529
 5530
 5531static int btrfs_init_locked_inode(struct inode *inode, void *p)
 5532{
 5533	struct btrfs_iget_args *args = p;
 5534
 5535	btrfs_set_inode_number(BTRFS_I(inode), args->ino);
 5536	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
 5537
 5538	if (args->root && args->root == args->root->fs_info->tree_root &&
 5539	    args->ino != BTRFS_BTREE_INODE_OBJECTID)
 5540		set_bit(BTRFS_INODE_FREE_SPACE_INODE,
 5541			&BTRFS_I(inode)->runtime_flags);
 5542	return 0;
 5543}
 5544
 5545static int btrfs_find_actor(struct inode *inode, void *opaque)
 5546{
 5547	struct btrfs_iget_args *args = opaque;
 5548
 5549	return args->ino == btrfs_ino(BTRFS_I(inode)) &&
 5550		args->root == BTRFS_I(inode)->root;
 5551}
 5552
 5553static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
 5554{
 5555	struct inode *inode;
 5556	struct btrfs_iget_args args;
 5557	unsigned long hashval = btrfs_inode_hash(ino, root);
 5558
 5559	args.ino = ino;
 5560	args.root = root;
 5561
 5562	inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
 5563			     btrfs_init_locked_inode,
 5564			     (void *)&args);
 5565	return inode;
 5566}
 5567
 5568/*
 5569 * Get an inode object given its inode number and corresponding root.
 5570 * Path can be preallocated to prevent recursing back to iget through
 5571 * allocator. NULL is also valid but may require an additional allocation
 5572 * later.
 5573 */
 5574struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
 5575			      struct btrfs_path *path)
 5576{
 5577	struct inode *inode;
 5578	int ret;
 5579
 5580	inode = btrfs_iget_locked(ino, root);
 5581	if (!inode)
 5582		return ERR_PTR(-ENOMEM);
 5583
 5584	if (!(inode->i_state & I_NEW))
 5585		return inode;
 5586
 5587	ret = btrfs_read_locked_inode(inode, path);
 5588	/*
 5589	 * ret > 0 can come from btrfs_search_slot called by
 5590	 * btrfs_read_locked_inode(), this means the inode item was not found.
 5591	 */
 5592	if (ret > 0)
 5593		ret = -ENOENT;
 5594	if (ret < 0)
 5595		goto error;
 5596
 5597	ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
 5598	if (ret < 0)
 5599		goto error;
 5600
 5601	unlock_new_inode(inode);
 5602
 5603	return inode;
 5604error:
 5605	iget_failed(inode);
 5606	return ERR_PTR(ret);
 5607}
 5608
 5609struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
 5610{
 5611	return btrfs_iget_path(ino, root, NULL);
 5612}
 5613
 5614static struct inode *new_simple_dir(struct inode *dir,
 5615				    struct btrfs_key *key,
 5616				    struct btrfs_root *root)
 5617{
 5618	struct timespec64 ts;
 5619	struct inode *inode = new_inode(dir->i_sb);
 5620
 5621	if (!inode)
 5622		return ERR_PTR(-ENOMEM);
 5623
 5624	BTRFS_I(inode)->root = btrfs_grab_root(root);
 5625	BTRFS_I(inode)->ref_root_id = key->objectid;
 5626	set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags);
 5627	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
 5628
 5629	btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
 5630	/*
 5631	 * We only need lookup, the rest is read-only and there's no inode
 5632	 * associated with the dentry
 5633	 */
 5634	inode->i_op = &simple_dir_inode_operations;
 5635	inode->i_opflags &= ~IOP_XATTR;
 5636	inode->i_fop = &simple_dir_operations;
 5637	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
 5638
 5639	ts = inode_set_ctime_current(inode);
 5640	inode_set_mtime_to_ts(inode, ts);
 5641	inode_set_atime_to_ts(inode, inode_get_atime(dir));
 5642	BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
 5643	BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
 5644
 5645	inode->i_uid = dir->i_uid;
 5646	inode->i_gid = dir->i_gid;
 5647
 5648	return inode;
 5649}
 5650
/*
 * The btrfs directory entry file types must have the same values as the
 * generic FT_* file types: btrfs_inode_type() returns the result of
 * fs_umode_to_ftype() directly, and the readdir code feeds btrfs file types
 * to fs_ftype_to_dtype().
 */
static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
static_assert(BTRFS_FT_DIR == FT_DIR);
static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
static_assert(BTRFS_FT_FIFO == FT_FIFO);
static_assert(BTRFS_FT_SOCK == FT_SOCK);
static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
 5659
 5660static inline u8 btrfs_inode_type(struct inode *inode)
 5661{
 5662	return fs_umode_to_ftype(inode->i_mode);
 5663}
 5664
 5665struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 5666{
 5667	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
 5668	struct inode *inode;
 5669	struct btrfs_root *root = BTRFS_I(dir)->root;
 5670	struct btrfs_root *sub_root = root;
 5671	struct btrfs_key location = { 0 };
 5672	u8 di_type = 0;
 5673	int ret = 0;
 5674
 5675	if (dentry->d_name.len > BTRFS_NAME_LEN)
 5676		return ERR_PTR(-ENAMETOOLONG);
 5677
 5678	ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
 5679	if (ret < 0)
 5680		return ERR_PTR(ret);
 5681
 5682	if (location.type == BTRFS_INODE_ITEM_KEY) {
 5683		inode = btrfs_iget(location.objectid, root);
 5684		if (IS_ERR(inode))
 5685			return inode;
 5686
 5687		/* Do extra check against inode mode with di_type */
 5688		if (btrfs_inode_type(inode) != di_type) {
 5689			btrfs_crit(fs_info,
 5690"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
 5691				  inode->i_mode, btrfs_inode_type(inode),
 5692				  di_type);
 5693			iput(inode);
 5694			return ERR_PTR(-EUCLEAN);
 5695		}
 5696		return inode;
 5697	}
 5698
 5699	ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
 5700				       &location, &sub_root);
 5701	if (ret < 0) {
 5702		if (ret != -ENOENT)
 5703			inode = ERR_PTR(ret);
 5704		else
 5705			inode = new_simple_dir(dir, &location, root);
 5706	} else {
 5707		inode = btrfs_iget(location.objectid, sub_root);
 5708		btrfs_put_root(sub_root);
 5709
 5710		if (IS_ERR(inode))
 5711			return inode;
 5712
 5713		down_read(&fs_info->cleanup_work_sem);
 5714		if (!sb_rdonly(inode->i_sb))
 5715			ret = btrfs_orphan_cleanup(sub_root);
 5716		up_read(&fs_info->cleanup_work_sem);
 5717		if (ret) {
 5718			iput(inode);
 5719			inode = ERR_PTR(ret);
 5720		}
 5721	}
 5722
 5723	return inode;
 5724}
 5725
 5726static int btrfs_dentry_delete(const struct dentry *dentry)
 5727{
 5728	struct btrfs_root *root;
 5729	struct inode *inode = d_inode(dentry);
 5730
 5731	if (!inode && !IS_ROOT(dentry))
 5732		inode = d_inode(dentry->d_parent);
 5733
 5734	if (inode) {
 5735		root = BTRFS_I(inode)->root;
 5736		if (btrfs_root_refs(&root->root_item) == 0)
 5737			return 1;
 5738
 5739		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
 5740			return 1;
 5741	}
 5742	return 0;
 5743}
 5744
 5745static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 5746				   unsigned int flags)
 5747{
 5748	struct inode *inode = btrfs_lookup_dentry(dir, dentry);
 5749
 5750	if (inode == ERR_PTR(-ENOENT))
 5751		inode = NULL;
 5752	return d_splice_alias(inode, dentry);
 5753}
 5754
 5755/*
 5756 * Find the highest existing sequence number in a directory and then set the
 5757 * in-memory index_cnt variable to the first free sequence number.
 5758 */
 5759static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
 5760{
 5761	struct btrfs_root *root = inode->root;
 5762	struct btrfs_key key, found_key;
 5763	struct btrfs_path *path;
 5764	struct extent_buffer *leaf;
 5765	int ret;
 5766
 5767	key.objectid = btrfs_ino(inode);
 5768	key.type = BTRFS_DIR_INDEX_KEY;
 5769	key.offset = (u64)-1;
 5770
 5771	path = btrfs_alloc_path();
 5772	if (!path)
 5773		return -ENOMEM;
 5774
 5775	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 5776	if (ret < 0)
 5777		goto out;
 5778	/* FIXME: we should be able to handle this */
 5779	if (ret == 0)
 5780		goto out;
 5781	ret = 0;
 5782
 5783	if (path->slots[0] == 0) {
 5784		inode->index_cnt = BTRFS_DIR_START_INDEX;
 5785		goto out;
 5786	}
 5787
 5788	path->slots[0]--;
 5789
 5790	leaf = path->nodes[0];
 5791	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 5792
 5793	if (found_key.objectid != btrfs_ino(inode) ||
 5794	    found_key.type != BTRFS_DIR_INDEX_KEY) {
 5795		inode->index_cnt = BTRFS_DIR_START_INDEX;
 5796		goto out;
 5797	}
 5798
 5799	inode->index_cnt = found_key.offset + 1;
 5800out:
 5801	btrfs_free_path(path);
 5802	return ret;
 5803}
 5804
 5805static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
 5806{
 5807	int ret = 0;
 5808
 5809	btrfs_inode_lock(dir, 0);
 5810	if (dir->index_cnt == (u64)-1) {
 5811		ret = btrfs_inode_delayed_dir_index_count(dir);
 5812		if (ret) {
 5813			ret = btrfs_set_inode_index_count(dir);
 5814			if (ret)
 5815				goto out;
 5816		}
 5817	}
 5818
 5819	/* index_cnt is the index number of next new entry, so decrement it. */
 5820	*index = dir->index_cnt - 1;
 5821out:
 5822	btrfs_inode_unlock(dir, 0);
 5823
 5824	return ret;
 5825}
 5826
 5827/*
 5828 * All this infrastructure exists because dir_emit can fault, and we are holding
 5829 * the tree lock when doing readdir.  For now just allocate a buffer and copy
 5830 * our information into that, and then dir_emit from the buffer.  This is
 5831 * similar to what NFS does, only we don't keep the buffer around in pagecache
 5832 * because I'm afraid I'll mess that up.  Long term we need to make filldir do
 5833 * copy_to_user_inatomic so we don't have to worry about page faulting under the
 5834 * tree lock.
 5835 */
 5836static int btrfs_opendir(struct inode *inode, struct file *file)
 5837{
 5838	struct btrfs_file_private *private;
 5839	u64 last_index;
 5840	int ret;
 5841
 5842	ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
 5843	if (ret)
 5844		return ret;
 5845
 5846	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
 5847	if (!private)
 5848		return -ENOMEM;
 5849	private->last_index = last_index;
 5850	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
 5851	if (!private->filldir_buf) {
 5852		kfree(private);
 5853		return -ENOMEM;
 5854	}
 5855	file->private_data = private;
 5856	return 0;
 5857}
 5858
 5859static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
 5860{
 5861	struct btrfs_file_private *private = file->private_data;
 5862	int ret;
 5863
 5864	ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
 5865				       &private->last_index);
 5866	if (ret)
 5867		return ret;
 5868
 5869	return generic_file_llseek(file, offset, whence);
 5870}
 5871
/*
 * Header of one directory entry staged in the readdir buffer.  The name bytes
 * follow the header directly, and the next header starts right after the
 * name, so headers are accessed with get_unaligned()/put_unaligned().
 */
struct dir_entry {
	/* Objectid taken from the dir item's location key. */
	u64 ino;
	/* Offset of the dir index key; becomes ctx->pos when emitted. */
	u64 offset;
	/* Entry type as produced by fs_ftype_to_dtype(). */
	unsigned type;
	/* Length in bytes of the name that follows this header. */
	int name_len;
};
 5878
 5879static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
 5880{
 5881	while (entries--) {
 5882		struct dir_entry *entry = addr;
 5883		char *name = (char *)(entry + 1);
 5884
 5885		ctx->pos = get_unaligned(&entry->offset);
 5886		if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
 5887					 get_unaligned(&entry->ino),
 5888					 get_unaligned(&entry->type)))
 5889			return 1;
 5890		addr += sizeof(struct dir_entry) +
 5891			get_unaligned(&entry->name_len);
 5892		ctx->pos++;
 5893	}
 5894	return 0;
 5895}
 5896
/*
 * Readdir implementation.
 *
 * Dir index items are walked starting at ctx->pos and staged in the per-file
 * buffer (private->filldir_buf) as struct dir_entry headers followed by the
 * name.  The buffer is handed to btrfs_filldir() only after the path has been
 * released.  Entries from the delayed insertion list are emitted afterwards,
 * and entries on the delayed deletion list are skipped.
 *
 * Returns 0 on success, including when the caller's buffer filled up, or a
 * negative errno if allocating the path or iterating the tree failed.
 */
static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_private *private = file->private_data;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;
	void *addr;
	LIST_HEAD(ins_list);
	LIST_HEAD(del_list);
	int ret;
	char *name_ptr;
	int name_len;
	int entries = 0;
	int total_len = 0;
	bool put = false;
	struct btrfs_key location;

	if (!dir_emit_dots(file, ctx))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	addr = private->filldir_buf;
	path->reada = READA_FORWARD;

	/* 'put' records whether the delayed items must be released on exit. */
	put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index,
					      &ins_list, &del_list);

again:
	/* (Re)start the walk from the current position. */
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = ctx->pos;
	key.objectid = btrfs_ino(BTRFS_I(inode));

	btrfs_for_each_slot(root, &key, &found_key, path, ret) {
		struct dir_entry *entry;
		struct extent_buffer *leaf = path->nodes[0];
		u8 ftype;

		/* Stop once we leave this directory's dir index items. */
		if (found_key.objectid != key.objectid)
			break;
		if (found_key.type != BTRFS_DIR_INDEX_KEY)
			break;
		if (found_key.offset < ctx->pos)
			continue;
		/* Ignore entries beyond the last index recorded for this file. */
		if (found_key.offset > private->last_index)
			break;
		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
			continue;
		di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
		name_len = btrfs_dir_name_len(leaf, di);
		if ((total_len + sizeof(struct dir_entry) + name_len) >=
		    PAGE_SIZE) {
			/*
			 * The staging buffer is full: release the path, flush
			 * what has been staged and search again from the
			 * position btrfs_filldir() left in ctx->pos.
			 */
			btrfs_release_path(path);
			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
			if (ret)
				goto nopos;
			addr = private->filldir_buf;
			entries = 0;
			total_len = 0;
			goto again;
		}

		/* Stage this entry: header first, name bytes right after it. */
		ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
		entry = addr;
		name_ptr = (char *)(entry + 1);
		read_extent_buffer(leaf, name_ptr,
				   (unsigned long)(di + 1), name_len);
		put_unaligned(name_len, &entry->name_len);
		put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
		btrfs_dir_item_key_to_cpu(leaf, di, &location);
		put_unaligned(location.objectid, &entry->ino);
		put_unaligned(found_key.offset, &entry->offset);
		entries++;
		addr += sizeof(struct dir_entry) + name_len;
		total_len += sizeof(struct dir_entry) + name_len;
	}
	/* Catch error encountered during iteration */
	if (ret < 0)
		goto err;

	btrfs_release_path(path);

	/* Flush whatever is still staged. */
	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
	if (ret)
		goto nopos;

	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
	if (ret)
		goto nopos;

	/*
	 * Stop new entries from being returned after we return the last
	 * entry.
	 *
	 * New directory entries are assigned a strictly increasing
	 * offset.  This means that new entries created during readdir
	 * are *guaranteed* to be seen in the future by that readdir.
	 * This has broken buggy programs which operate on names as
	 * they're returned by readdir.  Until we re-use freed offsets
	 * we have this hack to stop new entries from being returned
	 * under the assumption that they'll never reach this huge
	 * offset.
	 *
	 * This is being careful not to overflow 32bit loff_t unless the
	 * last entry requires it because doing so has broken 32bit apps
	 * in the past.
	 */
	if (ctx->pos >= INT_MAX)
		ctx->pos = LLONG_MAX;
	else
		ctx->pos = INT_MAX;
nopos:
	/* A non-zero ret on the paths reaching here is not an error. */
	ret = 0;
err:
	if (put)
		btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
	btrfs_free_path(path);
	return ret;
}
 6021
 6022/*
 6023 * This is somewhat expensive, updating the tree every time the
 6024 * inode changes.  But, it is most likely to find the inode in cache.
 6025 * FIXME, needs more benchmarking...there are no reasons other than performance
 6026 * to keep or drop this code.
 6027 */
 6028static int btrfs_dirty_inode(struct btrfs_inode *inode)
 6029{
 6030	struct btrfs_root *root = inode->root;
 6031	struct btrfs_fs_info *fs_info = root->fs_info;
 6032	struct btrfs_trans_handle *trans;
 6033	int ret;
 6034
 6035	if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
 6036		return 0;
 6037
 6038	trans = btrfs_join_transaction(root);
 6039	if (IS_ERR(trans))
 6040		return PTR_ERR(trans);
 6041
 6042	ret = btrfs_update_inode(trans, inode);
 6043	if (ret == -ENOSPC || ret == -EDQUOT) {
 6044		/* whoops, lets try again with the full transaction */
 6045		btrfs_end_transaction(trans);
 6046		trans = btrfs_start_transaction(root, 1);
 6047		if (IS_ERR(trans))
 6048			return PTR_ERR(trans);
 6049
 6050		ret = btrfs_update_inode(trans, inode);
 6051	}
 6052	btrfs_end_transaction(trans);
 6053	if (inode->delayed_node)
 6054		btrfs_balance_delayed_items(fs_info);
 6055
 6056	return ret;
 6057}
 6058
 6059/*
 6060 * This is a copy of file_update_time.  We need this so we can return error on
 6061 * ENOSPC for updating the inode in the case of file write and mmap writes.
 6062 */
 6063static int btrfs_update_time(struct inode *inode, int flags)
 6064{
 6065	struct btrfs_root *root = BTRFS_I(inode)->root;
 6066	bool dirty;
 6067
 6068	if (btrfs_root_readonly(root))
 6069		return -EROFS;
 6070
 6071	dirty = inode_update_timestamps(inode, flags);
 6072	return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
 6073}
 6074
 6075/*
 6076 * helper to find a free sequence number in a given directory.  This current
 6077 * code is very simple, later versions will do smarter things in the btree
 6078 */
 6079int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
 6080{
 6081	int ret = 0;
 6082
 6083	if (dir->index_cnt == (u64)-1) {
 6084		ret = btrfs_inode_delayed_dir_index_count(dir);
 6085		if (ret) {
 6086			ret = btrfs_set_inode_index_count(dir);
 6087			if (ret)
 6088				return ret;
 6089		}
 6090	}
 6091
 6092	*index = dir->index_cnt;
 6093	dir->index_cnt++;
 6094
 6095	return ret;
 6096}
 6097
 6098static int btrfs_insert_inode_locked(struct inode *inode)
 6099{
 6100	struct btrfs_iget_args args;
 6101
 6102	args.ino = btrfs_ino(BTRFS_I(inode));
 6103	args.root = BTRFS_I(inode)->root;
 6104
 6105	return insert_inode_locked4(inode,
 6106		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
 6107		   btrfs_find_actor, &args);
 6108}
 6109
 6110int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
 6111			    unsigned int *trans_num_items)
 6112{
 6113	struct inode *dir = args->dir;
 6114	struct inode *inode = args->inode;
 6115	int ret;
 6116
 6117	if (!args->orphan) {
 6118		ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
 6119					     &args->fname);
 6120		if (ret)
 6121			return ret;
 6122	}
 6123
 6124	ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
 6125	if (ret) {
 6126		fscrypt_free_filename(&args->fname);
 6127		return ret;
 6128	}
 6129
 6130	/* 1 to add inode item */
 6131	*trans_num_items = 1;
 6132	/* 1 to add compression property */
 6133	if (BTRFS_I(dir)->prop_compress)
 6134		(*trans_num_items)++;
 6135	/* 1 to add default ACL xattr */
 6136	if (args->default_acl)
 6137		(*trans_num_items)++;
 6138	/* 1 to add access ACL xattr */
 6139	if (args->acl)
 6140		(*trans_num_items)++;
 6141#ifdef CONFIG_SECURITY
 6142	/* 1 to add LSM xattr */
 6143	if (dir->i_security)
 6144		(*trans_num_items)++;
 6145#endif
 6146	if (args->orphan) {
 6147		/* 1 to add orphan item */
 6148		(*trans_num_items)++;
 6149	} else {
 6150		/*
 6151		 * 1 to add dir item
 6152		 * 1 to add dir index
 6153		 * 1 to update parent inode item
 6154		 *
 6155		 * No need for 1 unit for the inode ref item because it is
 6156		 * inserted in a batch together with the inode item at
 6157		 * btrfs_create_new_inode().
 6158		 */
 6159		*trans_num_items += 3;
 6160	}
 6161	return 0;
 6162}
 6163
 6164void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
 6165{
 6166	posix_acl_release(args->acl);
 6167	posix_acl_release(args->default_acl);
 6168	fscrypt_free_filename(&args->fname);
 6169}
 6170
 6171/*
 6172 * Inherit flags from the parent inode.
 6173 *
 6174 * Currently only the compression flags and the cow flags are inherited.
 6175 */
 6176static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
 6177{
 6178	unsigned int flags;
 6179
 6180	flags = dir->flags;
 6181
 6182	if (flags & BTRFS_INODE_NOCOMPRESS) {
 6183		inode->flags &= ~BTRFS_INODE_COMPRESS;
 6184		inode->flags |= BTRFS_INODE_NOCOMPRESS;
 6185	} else if (flags & BTRFS_INODE_COMPRESS) {
 6186		inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
 6187		inode->flags |= BTRFS_INODE_COMPRESS;
 6188	}
 6189
 6190	if (flags & BTRFS_INODE_NODATACOW) {
 6191		inode->flags |= BTRFS_INODE_NODATACOW;
 6192		if (S_ISREG(inode->vfs_inode.i_mode))
 6193			inode->flags |= BTRFS_INODE_NODATASUM;
 6194	}
 6195
 6196	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
 6197}
 6198
/*
 * Create the on-disk and in-memory state of the new inode described by @args
 * inside transaction @trans.
 *
 * In order: pick a free objectid, reserve the slot in the root's inode xarray,
 * assign a dir index (unless orphan), insert the inode into the inode hash,
 * insert the inode item (plus an inode ref unless orphan) as one batch,
 * inherit properties, initialize security (not for subvolumes), publish the
 * inode in the root's xarray and finally add either the orphan item or the
 * directory link.
 *
 * Returns 0 on success or a negative errno.  Failures after the inode was
 * inserted into the inode hash go through the 'discard' label, which takes an
 * extra reference before discard_new_inode() so the caller still owns its
 * reference to the inode.
 */
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_new_inode_args *args)
{
	struct timespec64 ts;
	struct inode *dir = args->dir;
	struct inode *inode = args->inode;
	const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
	struct btrfs_root *root;
	struct btrfs_inode_item *inode_item;
	struct btrfs_path *path;
	u64 objectid;
	struct btrfs_inode_ref *ref;
	struct btrfs_key key[2];
	u32 sizes[2];
	struct btrfs_item_batch batch;
	unsigned long ptr;
	int ret;
	bool xa_reserved = false;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* For subvolumes the root is not taken from the parent directory. */
	if (!args->subvol)
		BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
	root = BTRFS_I(inode)->root;

	ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
	if (ret)
		goto out;

	ret = btrfs_get_free_objectid(root, &objectid);
	if (ret)
		goto out;
	btrfs_set_inode_number(BTRFS_I(inode), objectid);

	/*
	 * Reserve the xarray slot up front so that the later
	 * btrfs_add_inode_to_root() call is not expected to fail.
	 */
	ret = xa_reserve(&root->inodes, objectid, GFP_NOFS);
	if (ret)
		goto out;
	xa_reserved = true;

	if (args->orphan) {
		/*
		 * O_TMPFILE, set link count to 0, so that after this point, we
		 * fill in an inode item with the correct link count.
		 */
		set_nlink(inode, 0);
	} else {
		trace_btrfs_inode_request(dir);

		ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
		if (ret)
			goto out;
	}

	if (S_ISDIR(inode->i_mode))
		BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;

	BTRFS_I(inode)->generation = trans->transid;
	inode->i_generation = BTRFS_I(inode)->generation;

	/*
	 * We don't have any capability xattrs set here yet, shortcut any
	 * queries for the xattrs here.  If we add them later via the inode
	 * security init path or any other path this flag will be cleared.
	 */
	set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);

	/*
	 * Subvolumes don't inherit flags from their parent directory.
	 * Originally this was probably by accident, but we probably can't
	 * change it now without compatibility issues.
	 */
	if (!args->subvol)
		btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));

	if (S_ISREG(inode->i_mode)) {
		if (btrfs_test_opt(fs_info, NODATASUM))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
		if (btrfs_test_opt(fs_info, NODATACOW))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
				BTRFS_INODE_NODATASUM;
	}

	ret = btrfs_insert_inode_locked(inode);
	if (ret < 0) {
		/* Give back the dir index taken by btrfs_set_inode_index(). */
		if (!args->orphan)
			BTRFS_I(dir)->index_cnt--;
		goto out;
	}

	/*
	 * We could have gotten an inode number from somebody who was fsynced
	 * and then removed in this same transaction, so let's just set full
	 * sync since it will be a full sync anyway and this will blow away the
	 * old info in the log.
	 */
	btrfs_set_inode_full_sync(BTRFS_I(inode));

	key[0].objectid = objectid;
	key[0].type = BTRFS_INODE_ITEM_KEY;
	key[0].offset = 0;

	sizes[0] = sizeof(struct btrfs_inode_item);

	if (!args->orphan) {
		/*
		 * Start new inodes with an inode_ref. This is slightly more
		 * efficient for small numbers of hard links since they will
		 * be packed into one item. Extended refs will kick in if we
		 * add more hard links than can fit in the ref item.
		 */
		key[1].objectid = objectid;
		key[1].type = BTRFS_INODE_REF_KEY;
		if (args->subvol) {
			/* The ref of a subvolume is keyed on itself, named "..". */
			key[1].offset = objectid;
			sizes[1] = 2 + sizeof(*ref);
		} else {
			key[1].offset = btrfs_ino(BTRFS_I(dir));
			sizes[1] = name->len + sizeof(*ref);
		}
	}

	/* One batch: the inode item, plus the inode ref unless orphan. */
	batch.keys = &key[0];
	batch.data_sizes = &sizes[0];
	batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
	batch.nr = args->orphan ? 1 : 2;
	ret = btrfs_insert_empty_items(trans, root, path, &batch);
	if (ret != 0) {
		btrfs_abort_transaction(trans, ret);
		goto discard;
	}

	ts = simple_inode_init_ts(inode);
	BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
	BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;

	/*
	 * We're going to fill the inode item now, so at this point the inode
	 * must be fully initialized.
	 */

	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				  struct btrfs_inode_item);
	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
			     sizeof(*inode_item));
	fill_inode_item(trans, path->nodes[0], inode_item, inode);

	if (!args->orphan) {
		/* The inode ref sits in the slot right after the inode item. */
		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
				     struct btrfs_inode_ref);
		ptr = (unsigned long)(ref + 1);
		if (args->subvol) {
			btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
			btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
			write_extent_buffer(path->nodes[0], "..", ptr, 2);
		} else {
			btrfs_set_inode_ref_name_len(path->nodes[0], ref,
						     name->len);
			btrfs_set_inode_ref_index(path->nodes[0], ref,
						  BTRFS_I(inode)->dir_index);
			write_extent_buffer(path->nodes[0], name->name, ptr,
					    name->len);
		}
	}

	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
	/*
	 * We don't need the path anymore, plus inheriting properties, adding
	 * ACLs, security xattrs, orphan item or adding the link, will result in
	 * allocating yet another path. So just free our path.
	 */
	btrfs_free_path(path);
	path = NULL;

	if (args->subvol) {
		struct inode *parent;

		/*
		 * Subvolumes inherit properties from their parent subvolume,
		 * not the directory they were created in.
		 */
		parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root);
		if (IS_ERR(parent)) {
			ret = PTR_ERR(parent);
		} else {
			ret = btrfs_inode_inherit_props(trans, inode, parent);
			iput(parent);
		}
	} else {
		ret = btrfs_inode_inherit_props(trans, inode, dir);
	}
	/*
	 * A failure to inherit properties is only logged: 'ret' is overwritten
	 * by the calls below and creation carries on.
	 */
	if (ret) {
		btrfs_err(fs_info,
			  "error inheriting props for ino %llu (root %llu): %d",
			  btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
	}

	/*
	 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
	 * probably a bug.
	 */
	if (!args->subvol) {
		ret = btrfs_init_inode_security(trans, args);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto discard;
		}
	}

	ret = btrfs_add_inode_to_root(BTRFS_I(inode), false);
	if (WARN_ON(ret)) {
		/* Shouldn't happen, we used xa_reserve() before. */
		btrfs_abort_transaction(trans, ret);
		goto discard;
	}

	trace_btrfs_inode_new(inode);
	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));

	btrfs_update_root_times(trans, root);

	if (args->orphan) {
		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	} else {
		ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
				     0, BTRFS_I(inode)->dir_index);
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto discard;
	}

	return 0;

discard:
	/*
	 * discard_new_inode() calls iput(), but the caller owns the reference
	 * to the inode.
	 */
	ihold(inode);
	discard_new_inode(inode);
out:
	/*
	 * NOTE(review): presumably xa_release() leaves the slot alone if it no
	 * longer holds the reservation - confirm against the XArray API.
	 */
	if (xa_reserved)
		xa_release(&root->inodes, objectid);

	btrfs_free_path(path);
	return ret;
}
 6449
 6450/*
 6451 * utility function to add 'inode' into 'parent_inode' with
 6452 * a give name and a given sequence number.
 6453 * if 'add_backref' is true, also insert a backref from the
 6454 * inode to the parent directory.
 6455 */
 6456int btrfs_add_link(struct btrfs_trans_handle *trans,
 6457		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
 6458		   const struct fscrypt_str *name, int add_backref, u64 index)
 6459{
 6460	int ret = 0;
 6461	struct btrfs_key key;
 6462	struct btrfs_root *root = parent_inode->root;
 6463	u64 ino = btrfs_ino(inode);
 6464	u64 parent_ino = btrfs_ino(parent_inode);
 6465
 6466	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
 6467		memcpy(&key, &inode->root->root_key, sizeof(key));
 6468	} else {
 6469		key.objectid = ino;
 6470		key.type = BTRFS_INODE_ITEM_KEY;
 6471		key.offset = 0;
 6472	}
 6473
 6474	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
 6475		ret = btrfs_add_root_ref(trans, key.objectid,
 6476					 btrfs_root_id(root), parent_ino,
 6477					 index, name);
 6478	} else if (add_backref) {
 6479		ret = btrfs_insert_inode_ref(trans, root, name,
 6480					     ino, parent_ino, index);
 6481	}
 6482
 6483	/* Nothing to clean up yet */
 6484	if (ret)
 6485		return ret;
 6486
 6487	ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
 6488				    btrfs_inode_type(&inode->vfs_inode), index);
 6489	if (ret == -EEXIST || ret == -EOVERFLOW)
 6490		goto fail_dir_item;
 6491	else if (ret) {
 6492		btrfs_abort_transaction(trans, ret);
 6493		return ret;
 6494	}
 6495
 6496	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
 6497			   name->len * 2);
 6498	inode_inc_iversion(&parent_inode->vfs_inode);
 6499	/*
 6500	 * If we are replaying a log tree, we do not want to update the mtime
 6501	 * and ctime of the parent directory with the current time, since the
 6502	 * log replay procedure is responsible for setting them to their correct
 6503	 * values (the ones it had when the fsync was done).
 6504	 */
 6505	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
 6506		inode_set_mtime_to_ts(&parent_inode->vfs_inode,
 6507				      inode_set_ctime_current(&parent_inode->vfs_inode));
 6508
 6509	ret = btrfs_update_inode(trans, parent_inode);
 6510	if (ret)
 6511		btrfs_abort_transaction(trans, ret);
 6512	return ret;
 6513
 6514fail_dir_item:
 6515	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
 6516		u64 local_index;
 6517		int err;
 6518		err = btrfs_del_root_ref(trans, key.objectid,
 6519					 btrfs_root_id(root), parent_ino,
 6520					 &local_index, name);
 6521		if (err)
 6522			btrfs_abort_transaction(trans, err);
 6523	} else if (add_backref) {
 6524		u64 local_index;
 6525		int err;
 6526
 6527		err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
 6528					  &local_index);
 6529		if (err)
 6530			btrfs_abort_transaction(trans, err);
 6531	}
 6532
 6533	/* Return the original error code */
 6534	return ret;
 6535}
 6536
 6537static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
 6538			       struct inode *inode)
 6539{
 6540	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
 6541	struct btrfs_root *root = BTRFS_I(dir)->root;
 6542	struct btrfs_new_inode_args new_inode_args = {
 6543		.dir = dir,
 6544		.dentry = dentry,
 6545		.inode = inode,
 6546	};
 6547	unsigned int trans_num_items;
 6548	struct btrfs_trans_handle *trans;
 6549	int err;
 6550
 6551	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
 6552	if (err)
 6553		goto out_inode;
 6554
 6555	trans = btrfs_start_transaction(root, trans_num_items);
 6556	if (IS_ERR(trans)) {
 6557		err = PTR_ERR(trans);
 6558		goto out_new_inode_args;
 6559	}
 6560
 6561	err = btrfs_create_new_inode(trans, &new_inode_args);
 6562	if (!err)
 6563		d_instantiate_new(dentry, inode);
 6564
 6565	btrfs_end_transaction(trans);
 6566	btrfs_btree_balance_dirty(fs_info);
 6567out_new_inode_args:
 6568	btrfs_new_inode_args_destroy(&new_inode_args);
 6569out_inode:
 6570	if (err)
 6571		iput(inode);
 6572	return err;
 6573}
 6574
 6575static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 6576		       struct dentry *dentry, umode_t mode, dev_t rdev)
 6577{
 6578	struct inode *inode;
 6579
 6580	inode = new_inode(dir->i_sb);
 6581	if (!inode)
 6582		return -ENOMEM;
 6583	inode_init_owner(idmap, inode, dir, mode);
 6584	inode->i_op = &btrfs_special_inode_operations;
 6585	init_special_inode(inode, inode->i_mode, rdev);
 6586	return btrfs_create_common(dir, dentry, inode);
 6587}
 6588
 6589static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
 6590			struct dentry *dentry, umode_t mode, bool excl)
 6591{
 6592	struct inode *inode;
 6593
 6594	inode = new_inode(dir->i_sb);
 6595	if (!inode)
 6596		return -ENOMEM;
 6597	inode_init_owner(idmap, inode, dir, mode);
 6598	inode->i_fop = &btrfs_file_operations;
 6599	inode->i_op = &btrfs_file_inode_operations;
 6600	inode->i_mapping->a_ops = &btrfs_aops;
 6601	return btrfs_create_common(dir, dentry, inode);
 6602}
 6603
 6604static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 6605		      struct dentry *dentry)
 6606{
 6607	struct btrfs_trans_handle *trans = NULL;
 6608	struct btrfs_root *root = BTRFS_I(dir)->root;
 6609	struct inode *inode = d_inode(old_dentry);
 6610	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 6611	struct fscrypt_name fname;
 6612	u64 index;
 6613	int err;
 6614	int drop_inode = 0;
 6615
 6616	/* do not allow sys_link's with other subvols of the same device */
 6617	if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
 6618		return -EXDEV;
 6619
 6620	if (inode->i_nlink >= BTRFS_LINK_MAX)
 6621		return -EMLINK;
 6622
 6623	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
 6624	if (err)
 6625		goto fail;
 6626
 6627	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
 6628	if (err)
 6629		goto fail;
 6630
 6631	/*
 6632	 * 2 items for inode and inode ref
 6633	 * 2 items for dir items
 6634	 * 1 item for parent inode
 6635	 * 1 item for orphan item deletion if O_TMPFILE
 6636	 */
 6637	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
 6638	if (IS_ERR(trans)) {
 6639		err = PTR_ERR(trans);
 6640		trans = NULL;
 6641		goto fail;
 6642	}
 6643
 6644	/* There are several dir indexes for this inode, clear the cache. */
 6645	BTRFS_I(inode)->dir_index = 0ULL;
 6646	inc_nlink(inode);
 6647	inode_inc_iversion(inode);
 6648	inode_set_ctime_current(inode);
 6649	ihold(inode);
 6650	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
 6651
 6652	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
 6653			     &fname.disk_name, 1, index);
 6654
 6655	if (err) {
 6656		drop_inode = 1;
 6657	} else {
 6658		struct dentry *parent = dentry->d_parent;
 6659
 6660		err = btrfs_update_inode(trans, BTRFS_I(inode));
 6661		if (err)
 6662			goto fail;
 6663		if (inode->i_nlink == 1) {
 6664			/*
 6665			 * If new hard link count is 1, it's a file created
 6666			 * with open(2) O_TMPFILE flag.
 6667			 */
 6668			err = btrfs_orphan_del(trans, BTRFS_I(inode));
 6669			if (err)
 6670				goto fail;
 6671		}
 6672		d_instantiate(dentry, inode);
 6673		btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
 6674	}
 6675
 6676fail:
 6677	fscrypt_free_filename(&fname);
 6678	if (trans)
 6679		btrfs_end_transaction(trans);
 6680	if (drop_inode) {
 6681		inode_dec_link_count(inode);
 6682		iput(inode);
 6683	}
 6684	btrfs_btree_balance_dirty(fs_info);
 6685	return err;
 6686}
 6687
 6688static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 6689		       struct dentry *dentry, umode_t mode)
 6690{
 6691	struct inode *inode;
 6692
 6693	inode = new_inode(dir->i_sb);
 6694	if (!inode)
 6695		return -ENOMEM;
 6696	inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
 6697	inode->i_op = &btrfs_dir_inode_operations;
 6698	inode->i_fop = &btrfs_dir_file_operations;
 6699	return btrfs_create_common(dir, dentry, inode);
 6700}
 6701
 6702static noinline int uncompress_inline(struct btrfs_path *path,
 6703				      struct page *page,
 6704				      struct btrfs_file_extent_item *item)
 6705{
 6706	int ret;
 6707	struct extent_buffer *leaf = path->nodes[0];
 6708	char *tmp;
 6709	size_t max_size;
 6710	unsigned long inline_size;
 6711	unsigned long ptr;
 6712	int compress_type;
 6713
 6714	compress_type = btrfs_file_extent_compression(leaf, item);
 6715	max_size = btrfs_file_extent_ram_bytes(leaf, item);
 6716	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
 6717	tmp = kmalloc(inline_size, GFP_NOFS);
 6718	if (!tmp)
 6719		return -ENOMEM;
 6720	ptr = btrfs_file_extent_inline_start(item);
 6721
 6722	read_extent_buffer(leaf, tmp, ptr, inline_size);
 6723
 6724	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
 6725	ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
 6726
 6727	/*
 6728	 * decompression code contains a memset to fill in any space between the end
 6729	 * of the uncompressed data and the end of max_size in case the decompressed
 6730	 * data ends up shorter than ram_bytes.  That doesn't cover the hole between
 6731	 * the end of an inline extent and the beginning of the next block, so we
 6732	 * cover that region here.
 6733	 */
 6734
 6735	if (max_size < PAGE_SIZE)
 6736		memzero_page(page, max_size, PAGE_SIZE - max_size);
 6737	kfree(tmp);
 6738	return ret;
 6739}
 6740
 6741static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
 6742			      struct page *page)
 6743{
 6744	struct btrfs_file_extent_item *fi;
 6745	void *kaddr;
 6746	size_t copy_size;
 6747
 6748	if (!page || PageUptodate(page))
 6749		return 0;
 6750
 6751	ASSERT(page_offset(page) == 0);
 6752
 6753	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
 6754			    struct btrfs_file_extent_item);
 6755	if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
 6756		return uncompress_inline(path, page, fi);
 6757
 6758	copy_size = min_t(u64, PAGE_SIZE,
 6759			  btrfs_file_extent_ram_bytes(path->nodes[0], fi));
 6760	kaddr = kmap_local_page(page);
 6761	read_extent_buffer(path->nodes[0], kaddr,
 6762			   btrfs_file_extent_inline_start(fi), copy_size);
 6763	kunmap_local(kaddr);
 6764	if (copy_size < PAGE_SIZE)
 6765		memzero_page(page, copy_size, PAGE_SIZE - copy_size);
 6766	return 0;
 6767}
 6768
 6769/*
 6770 * Lookup the first extent overlapping a range in a file.
 6771 *
 6772 * @inode:	file to search in
 6773 * @page:	page to read extent data into if the extent is inline
 6774 * @start:	file offset
 6775 * @len:	length of range starting at @start
 6776 *
 6777 * Return the first &struct extent_map which overlaps the given range, reading
 6778 * it from the B-tree and caching it if necessary. Note that there may be more
 6779 * extents which overlap the given range after the returned extent_map.
 6780 *
 6781 * If @page is not NULL and the extent is inline, this also reads the extent
 6782 * data directly into the page and marks the extent up to date in the io_tree.
 6783 *
 6784 * Return: ERR_PTR on error, non-NULL extent_map on success.
 6785 */
 6786struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
 6787				    struct page *page, u64 start, u64 len)
 6788{
 6789	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 6790	int ret = 0;
 6791	u64 extent_start = 0;
 6792	u64 extent_end = 0;
 6793	u64 objectid = btrfs_ino(inode);
 6794	int extent_type = -1;
 6795	struct btrfs_path *path = NULL;
 6796	struct btrfs_root *root = inode->root;
 6797	struct btrfs_file_extent_item *item;
 6798	struct extent_buffer *leaf;
 6799	struct btrfs_key found_key;
 6800	struct extent_map *em = NULL;
 6801	struct extent_map_tree *em_tree = &inode->extent_tree;
 6802
 6803	read_lock(&em_tree->lock);
 6804	em = lookup_extent_mapping(em_tree, start, len);
 6805	read_unlock(&em_tree->lock);
 6806
 6807	if (em) {
 6808		if (em->start > start || em->start + em->len <= start)
 6809			free_extent_map(em);
 6810		else if (em->disk_bytenr == EXTENT_MAP_INLINE && page)
 6811			free_extent_map(em);
 6812		else
 6813			goto out;
 6814	}
 6815	em = alloc_extent_map();
 6816	if (!em) {
 6817		ret = -ENOMEM;
 6818		goto out;
 6819	}
 6820	em->start = EXTENT_MAP_HOLE;
 6821	em->disk_bytenr = EXTENT_MAP_HOLE;
 6822	em->len = (u64)-1;
 6823
 6824	path = btrfs_alloc_path();
 6825	if (!path) {
 6826		ret = -ENOMEM;
 6827		goto out;
 6828	}
 6829
 6830	/* Chances are we'll be called again, so go ahead and do readahead */
 6831	path->reada = READA_FORWARD;
 6832
 6833	/*
 6834	 * The same explanation in load_free_space_cache applies here as well,
 6835	 * we only read when we're loading the free space cache, and at that
 6836	 * point the commit_root has everything we need.
 6837	 */
 6838	if (btrfs_is_free_space_inode(inode)) {
 6839		path->search_commit_root = 1;
 6840		path->skip_locking = 1;
 6841	}
 6842
 6843	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
 6844	if (ret < 0) {
 6845		goto out;
 6846	} else if (ret > 0) {
 6847		if (path->slots[0] == 0)
 6848			goto not_found;
 6849		path->slots[0]--;
 6850		ret = 0;
 6851	}
 6852
 6853	leaf = path->nodes[0];
 6854	item = btrfs_item_ptr(leaf, path->slots[0],
 6855			      struct btrfs_file_extent_item);
 6856	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 6857	if (found_key.objectid != objectid ||
 6858	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
 6859		/*
 6860		 * If we backup past the first extent we want to move forward
 6861		 * and see if there is an extent in front of us, otherwise we'll
 6862		 * say there is a hole for our whole search range which can
 6863		 * cause problems.
 6864		 */
 6865		extent_end = start;
 6866		goto next;
 6867	}
 6868
 6869	extent_type = btrfs_file_extent_type(leaf, item);
 6870	extent_start = found_key.offset;
 6871	extent_end = btrfs_file_extent_end(path);
 6872	if (extent_type == BTRFS_FILE_EXTENT_REG ||
 6873	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 6874		/* Only regular file could have regular/prealloc extent */
 6875		if (!S_ISREG(inode->vfs_inode.i_mode)) {
 6876			ret = -EUCLEAN;
 6877			btrfs_crit(fs_info,
 6878		"regular/prealloc extent found for non-regular inode %llu",
 6879				   btrfs_ino(inode));
 6880			goto out;
 6881		}
 6882		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
 6883						       extent_start);
 6884	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 6885		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
 6886						      path->slots[0],
 6887						      extent_start);
 6888	}
 6889next:
 6890	if (start >= extent_end) {
 6891		path->slots[0]++;
 6892		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 6893			ret = btrfs_next_leaf(root, path);
 6894			if (ret < 0)
 6895				goto out;
 6896			else if (ret > 0)
 6897				goto not_found;
 6898
 6899			leaf = path->nodes[0];
 6900		}
 6901		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 6902		if (found_key.objectid != objectid ||
 6903		    found_key.type != BTRFS_EXTENT_DATA_KEY)
 6904			goto not_found;
 6905		if (start + len <= found_key.offset)
 6906			goto not_found;
 6907		if (start > found_key.offset)
 6908			goto next;
 6909
 6910		/* New extent overlaps with existing one */
 6911		em->start = start;
 6912		em->len = found_key.offset - start;
 6913		em->disk_bytenr = EXTENT_MAP_HOLE;
 6914		goto insert;
 6915	}
 6916
 6917	btrfs_extent_item_to_extent_map(inode, path, item, em);
 6918
 6919	if (extent_type == BTRFS_FILE_EXTENT_REG ||
 6920	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 6921		goto insert;
 6922	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 6923		/*
 6924		 * Inline extent can only exist at file offset 0. This is
 6925		 * ensured by tree-checker and inline extent creation path.
 6926		 * Thus all members representing file offsets should be zero.
 6927		 */
 6928		ASSERT(extent_start == 0);
 6929		ASSERT(em->start == 0);
 6930
 6931		/*
 6932		 * btrfs_extent_item_to_extent_map() should have properly
 6933		 * initialized em members already.
 6934		 *
 6935		 * Other members are not utilized for inline extents.
 6936		 */
 6937		ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
 6938		ASSERT(em->len == fs_info->sectorsize);
 6939
 6940		ret = read_inline_extent(inode, path, page);
 6941		if (ret < 0)
 6942			goto out;
 6943		goto insert;
 6944	}
 6945not_found:
 6946	em->start = start;
 6947	em->len = len;
 6948	em->disk_bytenr = EXTENT_MAP_HOLE;
 6949insert:
 6950	ret = 0;
 6951	btrfs_release_path(path);
 6952	if (em->start > start || extent_map_end(em) <= start) {
 6953		btrfs_err(fs_info,
 6954			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
 6955			  em->start, em->len, start, len);
 6956		ret = -EIO;
 6957		goto out;
 6958	}
 6959
 6960	write_lock(&em_tree->lock);
 6961	ret = btrfs_add_extent_mapping(inode, &em, start, len);
 6962	write_unlock(&em_tree->lock);
 6963out:
 6964	btrfs_free_path(path);
 6965
 6966	trace_btrfs_get_extent(root, inode, em);
 6967
 6968	if (ret) {
 6969		free_extent_map(em);
 6970		return ERR_PTR(ret);
 6971	}
 6972	return em;
 6973}
 6974
 6975static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
 6976{
 6977	struct btrfs_block_group *block_group;
 6978	bool readonly = false;
 6979
 6980	block_group = btrfs_lookup_block_group(fs_info, bytenr);
 6981	if (!block_group || block_group->ro)
 6982		readonly = true;
 6983	if (block_group)
 6984		btrfs_put_block_group(block_group);
 6985	return readonly;
 6986}
 6987
 6988/*
 6989 * Check if we can do nocow write into the range [@offset, @offset + @len)
 6990 *
 6991 * @offset:	File offset
 6992 * @len:	The length to write, will be updated to the nocow writeable
 6993 *		range
 6994 * @orig_start:	(optional) Return the original file offset of the file extent
 6995 * @orig_len:	(optional) Return the original on-disk length of the file extent
 6996 * @ram_bytes:	(optional) Return the ram_bytes of the file extent
 6997 * @strict:	if true, omit optimizations that might force us into unnecessary
 6998 *		cow. e.g., don't trust generation number.
 6999 *
 7000 * Return:
 7001 * >0	and update @len if we can do nocow write
 7002 *  0	if we can't do nocow write
 7003 * <0	if error happened
 7004 *
 7005 * NOTE: This only checks the file extents, caller is responsible to wait for
 7006 *	 any ordered extents.
 7007 */
 7008noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 7009			      struct btrfs_file_extent *file_extent,
 7010			      bool nowait, bool strict)
 7011{
 7012	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 7013	struct can_nocow_file_extent_args nocow_args = { 0 };
 7014	struct btrfs_path *path;
 7015	int ret;
 7016	struct extent_buffer *leaf;
 7017	struct btrfs_root *root = BTRFS_I(inode)->root;
 7018	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 7019	struct btrfs_file_extent_item *fi;
 7020	struct btrfs_key key;
 7021	int found_type;
 7022
 7023	path = btrfs_alloc_path();
 7024	if (!path)
 7025		return -ENOMEM;
 7026	path->nowait = nowait;
 7027
 7028	ret = btrfs_lookup_file_extent(NULL, root, path,
 7029			btrfs_ino(BTRFS_I(inode)), offset, 0);
 7030	if (ret < 0)
 7031		goto out;
 7032
 7033	if (ret == 1) {
 7034		if (path->slots[0] == 0) {
 7035			/* can't find the item, must cow */
 7036			ret = 0;
 7037			goto out;
 7038		}
 7039		path->slots[0]--;
 7040	}
 7041	ret = 0;
 7042	leaf = path->nodes[0];
 7043	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 7044	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
 7045	    key.type != BTRFS_EXTENT_DATA_KEY) {
 7046		/* not our file or wrong item type, must cow */
 7047		goto out;
 7048	}
 7049
 7050	if (key.offset > offset) {
 7051		/* Wrong offset, must cow */
 7052		goto out;
 7053	}
 7054
 7055	if (btrfs_file_extent_end(path) <= offset)
 7056		goto out;
 7057
 7058	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
 7059	found_type = btrfs_file_extent_type(leaf, fi);
 7060
 7061	nocow_args.start = offset;
 7062	nocow_args.end = offset + *len - 1;
 7063	nocow_args.strict = strict;
 7064	nocow_args.free_path = true;
 7065
 7066	ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
 7067	/* can_nocow_file_extent() has freed the path. */
 7068	path = NULL;
 7069
 7070	if (ret != 1) {
 7071		/* Treat errors as not being able to NOCOW. */
 7072		ret = 0;
 7073		goto out;
 7074	}
 7075
 7076	ret = 0;
 7077	if (btrfs_extent_readonly(fs_info,
 7078				  nocow_args.file_extent.disk_bytenr +
 7079				  nocow_args.file_extent.offset))
 7080		goto out;
 7081
 7082	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
 7083	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 7084		u64 range_end;
 7085
 7086		range_end = round_up(offset + nocow_args.file_extent.num_bytes,
 7087				     root->fs_info->sectorsize) - 1;
 7088		ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
 7089		if (ret) {
 7090			ret = -EAGAIN;
 7091			goto out;
 7092		}
 7093	}
 7094
 7095	if (file_extent)
 7096		memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
 7097
 7098	*len = nocow_args.file_extent.num_bytes;
 7099	ret = 1;
 7100out:
 7101	btrfs_free_path(path);
 7102	return ret;
 7103}
 7104
 7105/* The callers of this must take lock_extent() */
 7106struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
 7107				      const struct btrfs_file_extent *file_extent,
 7108				      int type)
 7109{
 7110	struct extent_map *em;
 7111	int ret;
 7112
 7113	/*
 7114	 * Note the missing NOCOW type.
 7115	 *
 7116	 * For pure NOCOW writes, we should not create an io extent map, but
 7117	 * just reusing the existing one.
 7118	 * Only PREALLOC writes (NOCOW write into preallocated range) can
 7119	 * create an io extent map.
 7120	 */
 7121	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
 7122	       type == BTRFS_ORDERED_COMPRESSED ||
 7123	       type == BTRFS_ORDERED_REGULAR);
 7124
 7125	switch (type) {
 7126	case BTRFS_ORDERED_PREALLOC:
 7127		/* We're only referring part of a larger preallocated extent. */
 7128		ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
 7129		break;
 7130	case BTRFS_ORDERED_REGULAR:
 7131		/* COW results a new extent matching our file extent size. */
 7132		ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
 7133		ASSERT(file_extent->ram_bytes == file_extent->num_bytes);
 7134
 7135		/* Since it's a new extent, we should not have any offset. */
 7136		ASSERT(file_extent->offset == 0);
 7137		break;
 7138	case BTRFS_ORDERED_COMPRESSED:
 7139		/* Must be compressed. */
 7140		ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);
 7141
 7142		/*
 7143		 * Encoded write can make us to refer to part of the
 7144		 * uncompressed extent.
 7145		 */
 7146		ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
 7147		break;
 7148	}
 7149
 7150	em = alloc_extent_map();
 7151	if (!em)
 7152		return ERR_PTR(-ENOMEM);
 7153
 7154	em->start = start;
 7155	em->len = file_extent->num_bytes;
 7156	em->disk_bytenr = file_extent->disk_bytenr;
 7157	em->disk_num_bytes = file_extent->disk_num_bytes;
 7158	em->ram_bytes = file_extent->ram_bytes;
 7159	em->generation = -1;
 7160	em->offset = file_extent->offset;
 7161	em->flags |= EXTENT_FLAG_PINNED;
 7162	if (type == BTRFS_ORDERED_COMPRESSED)
 7163		extent_map_set_compression(em, file_extent->compression);
 7164
 7165	ret = btrfs_replace_extent_map_range(inode, em, true);
 7166	if (ret) {
 7167		free_extent_map(em);
 7168		return ERR_PTR(ret);
 7169	}
 7170
 7171	/* em got 2 refs now, callers needs to do free_extent_map once. */
 7172	return em;
 7173}
 7174
 7175/*
 7176 * For release_folio() and invalidate_folio() we have a race window where
 7177 * folio_end_writeback() is called but the subpage spinlock is not yet released.
 7178 * If we continue to release/invalidate the page, we could cause use-after-free
 7179 * for subpage spinlock.  So this function is to spin and wait for subpage
 7180 * spinlock.
 7181 */
 7182static void wait_subpage_spinlock(struct page *page)
 7183{
 7184	struct btrfs_fs_info *fs_info = page_to_fs_info(page);
 7185	struct folio *folio = page_folio(page);
 7186	struct btrfs_subpage *subpage;
 7187
 7188	if (!btrfs_is_subpage(fs_info, page->mapping))
 7189		return;
 7190
 7191	ASSERT(folio_test_private(folio) && folio_get_private(folio));
 7192	subpage = folio_get_private(folio);
 7193
 7194	/*
 7195	 * This may look insane as we just acquire the spinlock and release it,
 7196	 * without doing anything.  But we just want to make sure no one is
 7197	 * still holding the subpage spinlock.
 7198	 * And since the page is not dirty nor writeback, and we have page
 7199	 * locked, the only possible way to hold a spinlock is from the endio
 7200	 * function to clear page writeback.
 7201	 *
 7202	 * Here we just acquire the spinlock so that all existing callers
 7203	 * should exit and we're safe to release/invalidate the page.
 7204	 */
 7205	spin_lock_irq(&subpage->lock);
 7206	spin_unlock_irq(&subpage->lock);
 7207}
 7208
 7209static int btrfs_launder_folio(struct folio *folio)
 7210{
 7211	return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
 7212				      PAGE_SIZE, NULL);
 7213}
 7214
 7215static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
 7216{
 7217	if (try_release_extent_mapping(&folio->page, gfp_flags)) {
 7218		wait_subpage_spinlock(&folio->page);
 7219		clear_page_extent_mapped(&folio->page);
 7220		return true;
 7221	}
 7222	return false;
 7223}
 7224
 7225static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
 7226{
 7227	if (folio_test_writeback(folio) || folio_test_dirty(folio))
 7228		return false;
 7229	return __btrfs_release_folio(folio, gfp_flags);
 7230}
 7231
 7232#ifdef CONFIG_MIGRATION
 7233static int btrfs_migrate_folio(struct address_space *mapping,
 7234			     struct folio *dst, struct folio *src,
 7235			     enum migrate_mode mode)
 7236{
 7237	int ret = filemap_migrate_folio(mapping, dst, src, mode);
 7238
 7239	if (ret != MIGRATEPAGE_SUCCESS)
 7240		return ret;
 7241
 7242	if (folio_test_ordered(src)) {
 7243		folio_clear_ordered(src);
 7244		folio_set_ordered(dst);
 7245	}
 7246
 7247	return MIGRATEPAGE_SUCCESS;
 7248}
 7249#else
 7250#define btrfs_migrate_folio NULL
 7251#endif
 7252
 7253static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
 7254				 size_t length)
 7255{
 7256	struct btrfs_inode *inode = folio_to_inode(folio);
 7257	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 7258	struct extent_io_tree *tree = &inode->io_tree;
 7259	struct extent_state *cached_state = NULL;
 7260	u64 page_start = folio_pos(folio);
 7261	u64 page_end = page_start + folio_size(folio) - 1;
 7262	u64 cur;
 7263	int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
 7264
 7265	/*
 7266	 * We have folio locked so no new ordered extent can be created on this
 7267	 * page, nor bio can be submitted for this folio.
 7268	 *
 7269	 * But already submitted bio can still be finished on this folio.
 7270	 * Furthermore, endio function won't skip folio which has Ordered
 7271	 * (Private2) already cleared, so it's possible for endio and
 7272	 * invalidate_folio to do the same ordered extent accounting twice
 7273	 * on one folio.
 7274	 *
 7275	 * So here we wait for any submitted bios to finish, so that we won't
 7276	 * do double ordered extent accounting on the same folio.
 7277	 */
 7278	folio_wait_writeback(folio);
 7279	wait_subpage_spinlock(&folio->page);
 7280
 7281	/*
 7282	 * For subpage case, we have call sites like
 7283	 * btrfs_punch_hole_lock_range() which passes range not aligned to
 7284	 * sectorsize.
 7285	 * If the range doesn't cover the full folio, we don't need to and
 7286	 * shouldn't clear page extent mapped, as folio->private can still
 7287	 * record subpage dirty bits for other part of the range.
 7288	 *
 7289	 * For cases that invalidate the full folio even the range doesn't
 7290	 * cover the full folio, like invalidating the last folio, we're
 7291	 * still safe to wait for ordered extent to finish.
 7292	 */
 7293	if (!(offset == 0 && length == folio_size(folio))) {
 7294		btrfs_release_folio(folio, GFP_NOFS);
 7295		return;
 7296	}
 7297
 7298	if (!inode_evicting)
 7299		lock_extent(tree, page_start, page_end, &cached_state);
 7300
 7301	cur = page_start;
 7302	while (cur < page_end) {
 7303		struct btrfs_ordered_extent *ordered;
 7304		u64 range_end;
 7305		u32 range_len;
 7306		u32 extra_flags = 0;
 7307
 7308		ordered = btrfs_lookup_first_ordered_range(inode, cur,
 7309							   page_end + 1 - cur);
 7310		if (!ordered) {
 7311			range_end = page_end;
 7312			/*
 7313			 * No ordered extent covering this range, we are safe
 7314			 * to delete all extent states in the range.
 7315			 */
 7316			extra_flags = EXTENT_CLEAR_ALL_BITS;
 7317			goto next;
 7318		}
 7319		if (ordered->file_offset > cur) {
 7320			/*
 7321			 * There is a range between [cur, oe->file_offset) not
 7322			 * covered by any ordered extent.
 7323			 * We are safe to delete all extent states, and handle
 7324			 * the ordered extent in the next iteration.
 7325			 */
 7326			range_end = ordered->file_offset - 1;
 7327			extra_flags = EXTENT_CLEAR_ALL_BITS;
 7328			goto next;
 7329		}
 7330
 7331		range_end = min(ordered->file_offset + ordered->num_bytes - 1,
 7332				page_end);
 7333		ASSERT(range_end + 1 - cur < U32_MAX);
 7334		range_len = range_end + 1 - cur;
 7335		if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
 7336			/*
 7337			 * If Ordered (Private2) is cleared, it means endio has
 7338			 * already been executed for the range.
 7339			 * We can't delete the extent states as
 7340			 * btrfs_finish_ordered_io() may still use some of them.
 7341			 */
 7342			goto next;
 7343		}
 7344		btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
 7345
 7346		/*
 7347		 * IO on this page will never be started, so we need to account
 7348		 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
 7349		 * here, must leave that up for the ordered extent completion.
 7350		 *
 7351		 * This will also unlock the range for incoming
 7352		 * btrfs_finish_ordered_io().
 7353		 */
 7354		if (!inode_evicting)
 7355			clear_extent_bit(tree, cur, range_end,
 7356					 EXTENT_DELALLOC |
 7357					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
 7358					 EXTENT_DEFRAG, &cached_state);
 7359
 7360		spin_lock_irq(&inode->ordered_tree_lock);
 7361		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
 7362		ordered->truncated_len = min(ordered->truncated_len,
 7363					     cur - ordered->file_offset);
 7364		spin_unlock_irq(&inode->ordered_tree_lock);
 7365
 7366		/*
 7367		 * If the ordered extent has finished, we're safe to delete all
 7368		 * the extent states of the range, otherwise
 7369		 * btrfs_finish_ordered_io() will get executed by endio for
 7370		 * other pages, so we can't delete extent states.
 7371		 */
 7372		if (btrfs_dec_test_ordered_pending(inode, &ordered,
 7373						   cur, range_end + 1 - cur)) {
 7374			btrfs_finish_ordered_io(ordered);
 7375			/*
 7376			 * The ordered extent has finished, now we're again
 7377			 * safe to delete all extent states of the range.
 7378			 */
 7379			extra_flags = EXTENT_CLEAR_ALL_BITS;
 7380		}
 7381next:
 7382		if (ordered)
 7383			btrfs_put_ordered_extent(ordered);
 7384		/*
 7385		 * Qgroup reserved space handler
 7386		 * Sector(s) here will be either:
 7387		 *
 7388		 * 1) Already written to disk or bio already finished
 7389		 *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
 7390		 *    Qgroup will be handled by its qgroup_record then.
 7391		 *    btrfs_qgroup_free_data() call will do nothing here.
 7392		 *
 7393		 * 2) Not written to disk yet
 7394		 *    Then btrfs_qgroup_free_data() call will clear the
 7395		 *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
 7396		 *    reserved data space.
 7397		 *    Since the IO will never happen for this page.
 7398		 */
 7399		btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
 7400		if (!inode_evicting) {
 7401			clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
 7402				 EXTENT_DELALLOC | EXTENT_UPTODATE |
 7403				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
 7404				 extra_flags, &cached_state);
 7405		}
 7406		cur = range_end + 1;
 7407	}
 7408	/*
 7409	 * We have iterated through all ordered extents of the page, the page
 7410	 * should not have Ordered (Private2) anymore, or the above iteration
 7411	 * did something wrong.
 7412	 */
 7413	ASSERT(!folio_test_ordered(folio));
 7414	btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
 7415	if (!inode_evicting)
 7416		__btrfs_release_folio(folio, GFP_NOFS);
 7417	clear_page_extent_mapped(&folio->page);
 7418}
 7419
/*
 * Remove the file extent items of @inode that lie beyond its current i_size.
 *
 * @inode:          inode being truncated; the target size is read from
 *                  inode->vfs_inode.i_size on every loop iteration
 * @skip_writeback: when false, first wait for the ordered extents covering
 *                  the range from the sector-aligned i_size to (u64)-1
 *
 * btrfs_truncate_inode_items() is run in a loop; whenever it returns -ENOSPC
 * or -EAGAIN the inode is updated, the transaction is ended and a new one is
 * started before retrying.  A result of BTRFS_NEED_TRUNCATE_BLOCK is handled
 * by calling btrfs_truncate_block() outside of any transaction handle.
 *
 * Returns 0 on success, -ENOMEM if the temporary block reserve cannot be
 * allocated, or the first error reported by any of the steps above.
 */
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
{
	struct btrfs_truncate_control control = {
		.inode = inode,
		.ino = btrfs_ino(inode),
		.min_type = BTRFS_EXTENT_DATA_KEY,
		.clear_extent_range = true,
	};
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *rsv;
	int ret;
	struct btrfs_trans_handle *trans;
	u64 mask = fs_info->sectorsize - 1;
	const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);

	if (!skip_writeback) {
		ret = btrfs_wait_ordered_range(inode,
					       inode->vfs_inode.i_size & (~mask),
					       (u64)-1);
		if (ret)
			return ret;
	}

	/*
	 * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
	 * things going on here:
	 *
	 * 1) We need to reserve space to update our inode.
	 *
	 * 2) We need to have something to cache all the space that is going to
	 * be free'd up by the truncate operation, but also have some slack
	 * space reserved in case it uses space during the truncate (thank you
	 * very much snapshotting).
	 *
	 * And we need these to be separate.  The fact is we can use a lot of
	 * space doing the truncate, and we have no earthly idea how much space
	 * we will use, so we need the truncate reservation to be separate so it
	 * doesn't end up using space reserved for updating the inode.  We also
	 * need to be able to stop the transaction and start a new one, which
	 * means we need to be able to update the inode several times, and we
	 * have no idea of knowing how many times that will be, so we can't just
	 * reserve 1 item for the entirety of the operation, so that has to be
	 * done separately as well.
	 *
	 * So that leaves us with
	 *
	 * 1) rsv - for the truncate reservation, which we will steal from the
	 * transaction reservation.
	 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
	 * updating the inode.
	 */
	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		return -ENOMEM;
	rsv->size = min_size;
	rsv->failfast = true;

	/*
	 * 1 for the truncate slack space
	 * 1 for updating the inode.
	 */
	trans = btrfs_start_transaction(root, 2);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	/* Migrate the slack space for the truncate to our reserve */
	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
				      min_size, false);
	/*
	 * We have reserved 2 metadata units when we started the transaction and
	 * min_size matches 1 unit, so this should never fail, but if it does,
	 * it's not critical we just fail truncation.
	 */
	if (WARN_ON(ret)) {
		/* End the handle here: the 'out' label does not do it. */
		btrfs_end_transaction(trans);
		goto out;
	}

	trans->block_rsv = rsv;

	while (1) {
		struct extent_state *cached_state = NULL;
		const u64 new_size = inode->vfs_inode.i_size;
		const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);

		control.new_size = new_size;
		lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
		/*
		 * We want to drop from the next block forward in case this new
		 * size is not block aligned since we will be keeping the last
		 * block of the extent just the way it is.
		 */
		btrfs_drop_extent_map_range(inode,
					    ALIGN(new_size, fs_info->sectorsize),
					    (u64)-1, false);

		ret = btrfs_truncate_inode_items(trans, root, &control);

		/*
		 * Apply the accounting reported by this pass regardless of
		 * whether it succeeded.
		 */
		inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
		btrfs_inode_safe_disk_i_size_write(inode, control.last_size);

		unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);

		/*
		 * Switch back to the transaction reserve so the inode update
		 * (here or after the loop) does not consume the truncate
		 * reserve.
		 */
		trans->block_rsv = &fs_info->trans_block_rsv;
		/* Only -ENOSPC and -EAGAIN lead to a retry in a new transaction. */
		if (ret != -ENOSPC && ret != -EAGAIN)
			break;

		ret = btrfs_update_inode(trans, inode);
		if (ret)
			break;

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);

		trans = btrfs_start_transaction(root, 2);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			/* No handle left; makes the 'if (trans)' tail a no-op. */
			trans = NULL;
			break;
		}

		/*
		 * Refill the truncate reserve for the next pass.
		 * NOTE(review): the -1 presumably releases everything still
		 * held in rsv - confirm against btrfs_block_rsv_release().
		 */
		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
					      rsv, min_size, false);
		/*
		 * We have reserved 2 metadata units when we started the
		 * transaction and min_size matches 1 unit, so this should never
		 * fail, but if it does, it's not critical we just fail truncation.
		 */
		if (WARN_ON(ret))
			break;

		trans->block_rsv = rsv;
	}

	/*
	 * We can't call btrfs_truncate_block inside a trans handle as we could
	 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
	 * know we've truncated everything except the last little bit, and can
	 * do btrfs_truncate_block and then update the disk_i_size.
	 */
	if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);

		ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			goto out;
		}
		btrfs_inode_safe_disk_i_size_write(inode, 0);
	}

	if (trans) {
		int ret2;

		/* Keep the first error; ret2 only matters if nothing failed yet. */
		trans->block_rsv = &fs_info->trans_block_rsv;
		ret2 = btrfs_update_inode(trans, inode);
		if (ret2 && !ret)
			ret = ret2;

		ret2 = btrfs_end_transaction(trans);
		if (ret2 && !ret)
			ret = ret2;
		btrfs_btree_balance_dirty(fs_info);
	}
	/* Every 'goto out' above is taken with no transaction handle open. */
out:
	btrfs_free_block_rsv(fs_info, rsv);
	/*
	 * So if we truncate and then write and fsync we normally would just
	 * write the extents that changed, which is a problem if we need to
	 * first truncate that entire inode.  So set this flag so we write out
	 * all of the extents in the inode to the sync log so we're completely
	 * safe.
	 *
	 * If no extents were dropped or trimmed we don't need to force the next
	 * fsync to truncate all the inode's items from the log and re-log them
	 * all. This means the truncate operation did not change the file size,
	 * or changed it to a smaller size but there was only an implicit hole
	 * between the old i_size and the new i_size, and there were no prealloc
	 * extents beyond i_size to drop.
	 */
	if (control.extents_found > 0)
		btrfs_set_inode_full_sync(inode);

	return ret;
}
 7613
 7614struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
 7615				     struct inode *dir)
 7616{
 7617	struct inode *inode;
 7618
 7619	inode = new_inode(dir->i_sb);
 7620	if (inode) {
 7621		/*
 7622		 * Subvolumes don't inherit the sgid bit or the parent's gid if
 7623		 * the parent's sgid bit is set. This is probably a bug.
 7624		 */
 7625		inode_init_owner(idmap, inode, NULL,
 7626				 S_IFDIR | (~current_umask() & S_IRWXUGO));
 7627		inode->i_op = &btrfs_dir_inode_operations;
 7628		inode->i_fop = &btrfs_dir_file_operations;
 7629	}
 7630	return inode;
 7631}
 7632
 7633struct inode *btrfs_alloc_inode(struct super_block *sb)
 7634{
 7635	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 7636	struct btrfs_inode *ei;
 7637	struct inode *inode;
 7638
 7639	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
 7640	if (!ei)
 7641		return NULL;
 7642
 7643	ei->root = NULL;
 7644	ei->generation = 0;
 7645	ei->last_trans = 0;
 7646	ei->last_sub_trans = 0;
 7647	ei->logged_trans = 0;
 7648	ei->delalloc_bytes = 0;
 7649	ei->new_delalloc_bytes = 0;
 7650	ei->defrag_bytes = 0;
 7651	ei->disk_i_size = 0;
 7652	ei->flags = 0;
 7653	ei->ro_flags = 0;
 7654	/*
 7655	 * ->index_cnt will be properly initialized later when creating a new
 7656	 * inode (btrfs_create_new_inode()) or when reading an existing inode
 7657	 * from disk (btrfs_read_locked_inode()).
 7658	 */
 7659	ei->csum_bytes = 0;
 7660	ei->dir_index = 0;
 7661	ei->last_unlink_trans = 0;
 7662	ei->last_reflink_trans = 0;
 7663	ei->last_log_commit = 0;
 7664
 7665	spin_lock_init(&ei->lock);
 7666	ei->outstanding_extents = 0;
 7667	if (sb->s_magic != BTRFS_TEST_MAGIC)
 7668		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
 7669					      BTRFS_BLOCK_RSV_DELALLOC);
 7670	ei->runtime_flags = 0;
 7671	ei->prop_compress = BTRFS_COMPRESS_NONE;
 7672	ei->defrag_compress = BTRFS_COMPRESS_NONE;
 7673
 7674	ei->delayed_node = NULL;
 7675
 7676	ei->i_otime_sec = 0;
 7677	ei->i_otime_nsec = 0;
 7678
 7679	inode = &ei->vfs_inode;
 7680	extent_map_tree_init(&ei->extent_tree);
 7681
 7682	/* This io tree sets the valid inode. */
 7683	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
 7684	ei->io_tree.inode = ei;
 7685
 7686	ei->file_extent_tree = NULL;
 7687
 7688	mutex_init(&ei->log_mutex);
 7689	spin_lock_init(&ei->ordered_tree_lock);
 7690	ei->ordered_tree = RB_ROOT;
 7691	ei->ordered_tree_last = NULL;
 7692	INIT_LIST_HEAD(&ei->delalloc_inodes);
 7693	INIT_LIST_HEAD(&ei->delayed_iput);
 7694	init_rwsem(&ei->i_mmap_lock);
 7695
 7696	return inode;
 7697}
 7698
 7699#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 7700void btrfs_test_destroy_inode(struct inode *inode)
 7701{
 7702	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
 7703	kfree(BTRFS_I(inode)->file_extent_tree);
 7704	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 7705}
 7706#endif
 7707
 7708void btrfs_free_inode(struct inode *inode)
 7709{
 7710	kfree(BTRFS_I(inode)->file_extent_tree);
 7711	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 7712}
 7713
 7714void btrfs_destroy_inode(struct inode *vfs_inode)
 7715{
 7716	struct btrfs_ordered_extent *ordered;
 7717	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
 7718	struct btrfs_root *root = inode->root;
 7719	bool freespace_inode;
 7720
 7721	WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
 7722	WARN_ON(vfs_inode->i_data.nrpages);
 7723	WARN_ON(inode->block_rsv.reserved);
 7724	WARN_ON(inode->block_rsv.size);
 7725	WARN_ON(inode->outstanding_extents);
 7726	if (!S_ISDIR(vfs_inode->i_mode)) {
 7727		WARN_ON(inode->delalloc_bytes);
 7728		WARN_ON(inode->new_delalloc_bytes);
 7729		WARN_ON(inode->csum_bytes);
 7730	}
 7731	if (!root || !btrfs_is_data_reloc_root(root))
 7732		WARN_ON(inode->defrag_bytes);
 7733
 7734	/*
 7735	 * This can happen where we create an inode, but somebody else also
 7736	 * created the same inode and we need to destroy the one we already
 7737	 * created.
 7738	 */
 7739	if (!root)
 7740		return;
 7741
 7742	/*
 7743	 * If this is a free space inode do not take the ordered extents lockdep
 7744	 * map.
 7745	 */
 7746	freespace_inode = btrfs_is_free_space_inode(inode);
 7747
 7748	while (1) {
 7749		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 7750		if (!ordered)
 7751			break;
 7752		else {
 7753			btrfs_err(root->fs_info,
 7754				  "found ordered extent %llu %llu on inode cleanup",
 7755				  ordered->file_offset, ordered->num_bytes);
 7756
 7757			if (!freespace_inode)
 7758				btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
 7759
 7760			btrfs_remove_ordered_extent(inode, ordered);
 7761			btrfs_put_ordered_extent(ordered);
 7762			btrfs_put_ordered_extent(ordered);
 7763		}
 7764	}
 7765	btrfs_qgroup_check_reserved_leak(inode);
 7766	btrfs_del_inode_from_root(inode);
 7767	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
 7768	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
 7769	btrfs_put_root(inode->root);
 7770}
 7771
 7772int btrfs_drop_inode(struct inode *inode)
 7773{
 7774	struct btrfs_root *root = BTRFS_I(inode)->root;
 7775
 7776	if (root == NULL)
 7777		return 1;
 7778
 7779	/* the snap/subvol tree is on deleting */
 7780	if (btrfs_root_refs(&root->root_item) == 0)
 7781		return 1;
 7782	else
 7783		return generic_drop_inode(inode);
 7784}
 7785
 7786static void init_once(void *foo)
 7787{
 7788	struct btrfs_inode *ei = foo;
 7789
 7790	inode_init_once(&ei->vfs_inode);
 7791}
 7792
 7793void __cold btrfs_destroy_cachep(void)
 7794{
 7795	/*
 7796	 * Make sure all delayed rcu free inodes are flushed before we
 7797	 * destroy cache.
 7798	 */
 7799	rcu_barrier();
 7800	kmem_cache_destroy(btrfs_inode_cachep);
 7801}
 7802
 7803int __init btrfs_init_cachep(void)
 7804{
 7805	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
 7806			sizeof(struct btrfs_inode), 0,
 7807			SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
 7808			init_once);
 7809	if (!btrfs_inode_cachep)
 7810		return -ENOMEM;
 7811
 7812	return 0;
 7813}
 7814
 7815static int btrfs_getattr(struct mnt_idmap *idmap,
 7816			 const struct path *path, struct kstat *stat,
 7817			 u32 request_mask, unsigned int flags)
 7818{
 7819	u64 delalloc_bytes;
 7820	u64 inode_bytes;
 7821	struct inode *inode = d_inode(path->dentry);
 7822	u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize;
 7823	u32 bi_flags = BTRFS_I(inode)->flags;
 7824	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
 7825
 7826	stat->result_mask |= STATX_BTIME;
 7827	stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
 7828	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
 7829	if (bi_flags & BTRFS_INODE_APPEND)
 7830		stat->attributes |= STATX_ATTR_APPEND;
 7831	if (bi_flags & BTRFS_INODE_COMPRESS)
 7832		stat->attributes |= STATX_ATTR_COMPRESSED;
 7833	if (bi_flags & BTRFS_INODE_IMMUTABLE)
 7834		stat->attributes |= STATX_ATTR_IMMUTABLE;
 7835	if (bi_flags & BTRFS_INODE_NODUMP)
 7836		stat->attributes |= STATX_ATTR_NODUMP;
 7837	if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
 7838		stat->attributes |= STATX_ATTR_VERITY;
 7839
 7840	stat->attributes_mask |= (STATX_ATTR_APPEND |
 7841				  STATX_ATTR_COMPRESSED |
 7842				  STATX_ATTR_IMMUTABLE |
 7843				  STATX_ATTR_NODUMP);
 7844
 7845	generic_fillattr(idmap, request_mask, inode, stat);
 7846	stat->dev = BTRFS_I(inode)->root->anon_dev;
 7847
 7848	stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
 7849	stat->result_mask |= STATX_SUBVOL;
 7850
 7851	spin_lock(&BTRFS_I(inode)->lock);
 7852	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
 7853	inode_bytes = inode_get_bytes(inode);
 7854	spin_unlock(&BTRFS_I(inode)->lock);
 7855	stat->blocks = (ALIGN(inode_bytes, blocksize) +
 7856			ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
 7857	return 0;
 7858}
 7859
/*
 * Exchange two existing directory entries inside a single transaction.
 *
 * @old_dir:    directory containing @old_dentry
 * @old_dentry: first entry of the exchange
 * @new_dir:    directory containing @new_dentry
 * @new_dentry: second entry of the exchange
 *
 * Each side is either a regular inode or a subvolume root (inode number
 * BTRFS_FIRST_FREE_OBJECTID).  When the two directories belong to different
 * roots, both sides must be subvolume roots, otherwise -EXDEV is returned.
 *
 * Returns 0 on success or a negative errno.  Once the first inode ref has
 * been inserted, later failures abort the transaction.
 */
static int btrfs_rename_exchange(struct inode *old_dir,
			      struct dentry *old_dentry,
			      struct inode *new_dir,
			      struct dentry *new_dentry)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
	struct btrfs_trans_handle *trans;
	unsigned int trans_num_items;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = new_dentry->d_inode;
	struct inode *old_inode = old_dentry->d_inode;
	struct btrfs_rename_ctx old_rename_ctx;
	struct btrfs_rename_ctx new_rename_ctx;
	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
	u64 old_idx = 0;
	u64 new_idx = 0;
	int ret;
	int ret2;
	bool need_abort = false;
	struct fscrypt_name old_fname, new_fname;
	struct fscrypt_str *old_name, *new_name;

	/*
	 * For non-subvolumes allow exchange only within one subvolume, in the
	 * same inode namespace. Two subvolumes (represented as directory) can
	 * be exchanged as they're a logical link and have a fixed inode number.
	 */
	if (root != dest &&
	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
		return -EXDEV;

	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
	if (ret)
		return ret;

	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
	if (ret) {
		fscrypt_free_filename(&old_fname);
		return ret;
	}

	old_name = &old_fname.disk_name;
	new_name = &new_fname.disk_name;

	/* close the race window with snapshot create/destroy ioctl */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
		down_read(&fs_info->subvol_sem);

	/*
	 * For each inode:
	 * 1 to remove old dir item
	 * 1 to remove old dir index
	 * 1 to add new dir item
	 * 1 to add new dir index
	 * 1 to update parent inode
	 *
	 * If the parents are the same, we only need to account for one
	 */
	trans_num_items = (old_dir == new_dir ? 9 : 10);
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/*
		 * 1 to remove old root ref
		 * 1 to remove old root backref
		 * 1 to add new root ref
		 * 1 to add new root backref
		 */
		trans_num_items += 4;
	} else {
		/*
		 * 1 to update inode item
		 * 1 to remove old inode ref
		 * 1 to add new inode ref
		 */
		trans_num_items += 3;
	}
	/* Same accounting for the other side of the exchange. */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
		trans_num_items += 4;
	else
		trans_num_items += 3;
	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root) {
		ret = btrfs_record_root_in_trans(trans, dest);
		if (ret)
			goto out_fail;
	}

	/*
	 * We need to find a free sequence number both in the source and
	 * in the destination directory for the exchange.
	 */
	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
	if (ret)
		goto out_fail;
	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	BTRFS_I(new_inode)->dir_index = 0ULL;

	/* Reference for the source. */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* force full log commit if subvolume involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
					     btrfs_ino(BTRFS_I(new_dir)),
					     old_idx);
		if (ret)
			goto out_fail;
		/* An inode ref now exists; later failures must abort. */
		need_abort = true;
	}

	/* And now for the dest. */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* force full log commit if subvolume involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
					     btrfs_ino(BTRFS_I(old_dir)),
					     new_idx);
		if (ret) {
			if (need_abort)
				btrfs_abort_transaction(trans, ret);
			goto out_fail;
		}
	}

	/* Update inode version and ctime/mtime. */
	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	inode_inc_iversion(new_inode);
	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);

	if (old_dentry->d_parent != new_dentry->d_parent) {
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
					BTRFS_I(old_inode), true);
		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
					BTRFS_I(new_inode), true);
	}

	/* src is a subvolume */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
	} else { /* src is an inode */
		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
					   BTRFS_I(old_dentry->d_inode),
					   old_name, &old_rename_ctx);
		if (!ret)
			ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	/* dest is a subvolume */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
	} else { /* dest is an inode */
		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
					   BTRFS_I(new_dentry->d_inode),
					   new_name, &new_rename_ctx);
		if (!ret)
			ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	/* Link each inode under the other entry's name and directory. */
	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
			     new_name, 0, old_idx);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
			     old_name, 0, new_idx);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	/* dir_index is only recorded for inodes with a single link. */
	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = old_idx;
	if (new_inode->i_nlink == 1)
		BTRFS_I(new_inode)->dir_index = new_idx;

	/*
	 * Now pin the logs of the roots. We do it to ensure that no other task
	 * can sync the logs while we are in progress with the rename, because
	 * that could result in an inconsistency in case any of the inodes that
	 * are part of this rename operation were logged before.
	 */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_pin_log_trans(root);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_pin_log_trans(dest);

	/* Do the log updates for all inodes. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
				   old_rename_ctx.index, new_dentry->d_parent);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
				   new_rename_ctx.index, old_dentry->d_parent);

	/* Now unpin the logs. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_end_log_trans(root);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_end_log_trans(dest);
out_fail:
	/* Always end the handle; an earlier error takes precedence. */
	ret2 = btrfs_end_transaction(trans);
	ret = ret ? ret : ret2;
out_notrans:
	/* Same condition under which subvol_sem was taken above. */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&fs_info->subvol_sem);

	fscrypt_free_filename(&new_fname);
	fscrypt_free_filename(&old_fname);
	return ret;
}
 8096
 8097static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
 8098					struct inode *dir)
 8099{
 8100	struct inode *inode;
 8101
 8102	inode = new_inode(dir->i_sb);
 8103	if (inode) {
 8104		inode_init_owner(idmap, inode, dir,
 8105				 S_IFCHR | WHITEOUT_MODE);
 8106		inode->i_op = &btrfs_special_inode_operations;
 8107		init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
 8108	}
 8109	return inode;
 8110}
 8111
/*
 * Rename @old_dentry in @old_dir to @new_dentry in @new_dir, replacing the
 * target inode if one exists.
 *
 * @idmap:      idmap used when creating a whiteout inode
 * @old_dir:    directory containing the source entry
 * @old_dentry: source entry
 * @new_dir:    directory that receives the entry
 * @new_dentry: target entry; its inode may be NULL (no existing target)
 * @flags:      rename flags; only RENAME_WHITEOUT is examined here
 *
 * Returns 0 on success or a negative errno:
 *   -EPERM     @new_dir is the empty subvolume placeholder directory
 *   -EXDEV     the directories are in different roots and the source is not
 *              a subvolume root
 *   -ENOTEMPTY the source is the empty subvolume placeholder, the target is
 *              a subvolume root, or a directory is renamed over a non-empty
 *              directory
 *   -ENOMEM    the whiteout inode could not be allocated
 * or any error reported by the helpers called below.
 */
static int btrfs_rename(struct mnt_idmap *idmap,
			struct inode *old_dir, struct dentry *old_dentry,
			struct inode *new_dir, struct dentry *new_dentry,
			unsigned int flags)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
	struct btrfs_new_inode_args whiteout_args = {
		.dir = old_dir,
		.dentry = old_dentry,
	};
	struct btrfs_trans_handle *trans;
	unsigned int trans_num_items;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = d_inode(new_dentry);
	struct inode *old_inode = d_inode(old_dentry);
	struct btrfs_rename_ctx rename_ctx;
	u64 index = 0;
	int ret;
	int ret2;
	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
	struct fscrypt_name old_fname, new_fname;

	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
		return -EPERM;

	/* we only allow rename subvolume link between subvolumes */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
		return -EXDEV;

	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
		return -ENOTEMPTY;

	if (S_ISDIR(old_inode->i_mode) && new_inode &&
	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
		return -ENOTEMPTY;

	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
	if (ret)
		return ret;

	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
	if (ret) {
		fscrypt_free_filename(&old_fname);
		return ret;
	}

	/* check for collisions, even if the  name isn't there */
	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
	if (ret) {
		if (ret == -EEXIST) {
			/* we shouldn't get
			 * eexist without a new_inode */
			if (WARN_ON(!new_inode)) {
				goto out_fscrypt_names;
			}
		} else {
			/* maybe -EOVERFLOW */
			goto out_fscrypt_names;
		}
	}
	/* -EEXIST with an existing target is expected: we are replacing it. */
	ret = 0;

	/*
	 * we're using rename to replace one file with another.  Start IO on it
	 * now so  we don't add too much work to the end of the transaction
	 */
	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
		filemap_flush(old_inode->i_mapping);

	if (flags & RENAME_WHITEOUT) {
		whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
		if (!whiteout_args.inode) {
			ret = -ENOMEM;
			goto out_fscrypt_names;
		}
		/* Also sets the initial trans_num_items for the whiteout. */
		ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
		if (ret)
			goto out_whiteout_inode;
	} else {
		/* 1 to update the old parent inode. */
		trans_num_items = 1;
	}

	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* Close the race window with snapshot create/destroy ioctl */
		down_read(&fs_info->subvol_sem);
		/*
		 * 1 to remove old root ref
		 * 1 to remove old root backref
		 * 1 to add new root ref
		 * 1 to add new root backref
		 */
		trans_num_items += 4;
	} else {
		/*
		 * 1 to update inode
		 * 1 to remove old inode ref
		 * 1 to add new inode ref
		 */
		trans_num_items += 3;
	}
	/*
	 * 1 to remove old dir item
	 * 1 to remove old dir index
	 * 1 to add new dir item
	 * 1 to add new dir index
	 */
	trans_num_items += 4;
	/* 1 to update new parent inode if it's not the same as the old parent */
	if (new_dir != old_dir)
		trans_num_items++;
	if (new_inode) {
		/*
		 * 1 to update inode
		 * 1 to remove inode ref
		 * 1 to remove dir item
		 * 1 to remove dir index
		 * 1 to possibly add orphan item
		 */
		trans_num_items += 5;
	}
	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root) {
		ret = btrfs_record_root_in_trans(trans, dest);
		if (ret)
			goto out_fail;
	}

	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		/* force full log commit if subvolume involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
					     old_ino, btrfs_ino(BTRFS_I(new_dir)),
					     index);
		if (ret)
			goto out_fail;
	}

	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);

	if (old_dentry->d_parent != new_dentry->d_parent)
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
					BTRFS_I(old_inode), true);

	/* Remove the source entry from the old directory. */
	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
	} else {
		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
					   BTRFS_I(d_inode(old_dentry)),
					   &old_fname.disk_name, &rename_ctx);
		if (!ret)
			ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	/* Remove the entry being replaced, if there is one. */
	if (new_inode) {
		inode_inc_iversion(new_inode);
		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
			ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
			BUG_ON(new_inode->i_nlink == 0);
		} else {
			ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
						 BTRFS_I(d_inode(new_dentry)),
						 &new_fname.disk_name);
		}
		/* The replaced inode lost its last link: add an orphan item. */
		if (!ret && new_inode->i_nlink == 0)
			ret = btrfs_orphan_add(trans,
					BTRFS_I(d_inode(new_dentry)));
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_fail;
		}
	}

	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
			     &new_fname.disk_name, 0, index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	/* dir_index is only recorded for inodes with a single link. */
	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = index;

	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
				   rename_ctx.index, new_dentry->d_parent);

	if (flags & RENAME_WHITEOUT) {
		ret = btrfs_create_new_inode(trans, &whiteout_args);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_fail;
		} else {
			unlock_new_inode(whiteout_args.inode);
			iput(whiteout_args.inode);
			/* Already released; the iput() at the end sees NULL. */
			whiteout_args.inode = NULL;
		}
	}
	/*
	 * Unwind in reverse order of setup.  The labels fall through, so each
	 * one also performs the cleanup of the labels below it.
	 */
out_fail:
	ret2 = btrfs_end_transaction(trans);
	ret = ret ? ret : ret2;
out_notrans:
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&fs_info->subvol_sem);
	if (flags & RENAME_WHITEOUT)
		btrfs_new_inode_args_destroy(&whiteout_args);
out_whiteout_inode:
	if (flags & RENAME_WHITEOUT)
		iput(whiteout_args.inode);
out_fscrypt_names:
	fscrypt_free_filename(&old_fname);
	fscrypt_free_filename(&new_fname);
	return ret;
}
 8347
 8348static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
 8349			 struct dentry *old_dentry, struct inode *new_dir,
 8350			 struct dentry *new_dentry, unsigned int flags)
 8351{
 8352	int ret;
 8353
 8354	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 8355		return -EINVAL;
 8356
 8357	if (flags & RENAME_EXCHANGE)
 8358		ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
 8359					    new_dentry);
 8360	else
 8361		ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
 8362				   new_dentry, flags);
 8363
 8364	btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
 8365
 8366	return ret;
 8367}
 8368
/* One queued request to flush the delalloc pages of a single inode. */
struct btrfs_delalloc_work {
	/* Inode to flush; the worker calls iput() on it when it is done. */
	struct inode *inode;
	/* Completed by the worker, waited on by start_delalloc_inodes(). */
	struct completion completion;
	/* Links this request into the submitter's local list of works. */
	struct list_head list;
	/* Work item queued on the flush workers. */
	struct btrfs_work work;
};
 8375
 8376static void btrfs_run_delalloc_work(struct btrfs_work *work)
 8377{
 8378	struct btrfs_delalloc_work *delalloc_work;
 8379	struct inode *inode;
 8380
 8381	delalloc_work = container_of(work, struct btrfs_delalloc_work,
 8382				     work);
 8383	inode = delalloc_work->inode;
 8384	filemap_flush(inode->i_mapping);
 8385	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
 8386				&BTRFS_I(inode)->runtime_flags))
 8387		filemap_flush(inode->i_mapping);
 8388
 8389	iput(inode);
 8390	complete(&delalloc_work->completion);
 8391}
 8392
 8393static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
 8394{
 8395	struct btrfs_delalloc_work *work;
 8396
 8397	work = kmalloc(sizeof(*work), GFP_NOFS);
 8398	if (!work)
 8399		return NULL;
 8400
 8401	init_completion(&work->completion);
 8402	INIT_LIST_HEAD(&work->list);
 8403	work->inode = inode;
 8404	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);
 8405
 8406	return work;
 8407}
 8408
/*
 * some fairly slow code that needs optimization. This walks the list
 * of all the inodes with pending delalloc and forces them to disk.
 *
 * @root:               root whose delalloc inode list is walked
 * @wbc:                writeback control; nr_to_write == LONG_MAX selects a
 *                      full flush where every inode is handed to the flush
 *                      workers, otherwise inodes are written inline until
 *                      wbc->nr_to_write drops to zero or below
 * @snapshot:           set BTRFS_INODE_SNAPSHOT_FLUSH on every visited inode
 * @in_reclaim_context: skip inodes flagged BTRFS_INODE_NO_DELALLOC_FLUSH
 *
 * Returns 0 on success, -ENOMEM if a work item cannot be allocated, or the
 * error returned by filemap_fdatawrite_wbc().  All queued work items are
 * waited for before returning.
 */
static int start_delalloc_inodes(struct btrfs_root *root,
				 struct writeback_control *wbc, bool snapshot,
				 bool in_reclaim_context)
{
	struct btrfs_inode *binode;
	struct inode *inode;
	struct btrfs_delalloc_work *work, *next;
	LIST_HEAD(works);
	LIST_HEAD(splice);
	int ret = 0;
	bool full_flush = wbc->nr_to_write == LONG_MAX;

	mutex_lock(&root->delalloc_mutex);
	spin_lock(&root->delalloc_lock);
	/* Move the whole list onto a private list; visited entries go back. */
	list_splice_init(&root->delalloc_inodes, &splice);
	while (!list_empty(&splice)) {
		binode = list_entry(splice.next, struct btrfs_inode,
				    delalloc_inodes);

		list_move_tail(&binode->delalloc_inodes,
			       &root->delalloc_inodes);

		if (in_reclaim_context &&
		    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
			continue;

		/* Take a reference; skip inodes for which igrab() fails. */
		inode = igrab(&binode->vfs_inode);
		if (!inode) {
			cond_resched_lock(&root->delalloc_lock);
			continue;
		}
		/* Dropped for the flush below, retaken at the end of the loop. */
		spin_unlock(&root->delalloc_lock);

		if (snapshot)
			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
				&binode->runtime_flags);
		if (full_flush) {
			work = btrfs_alloc_delalloc_work(inode);
			if (!work) {
				iput(inode);
				ret = -ENOMEM;
				goto out;
			}
			/* The worker drops the inode reference when it is done. */
			list_add_tail(&work->list, &works);
			btrfs_queue_work(root->fs_info->flush_workers,
					 &work->work);
		} else {
			ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
			btrfs_add_delayed_iput(BTRFS_I(inode));
			if (ret || wbc->nr_to_write <= 0)
				goto out;
		}
		cond_resched();
		spin_lock(&root->delalloc_lock);
	}
	spin_unlock(&root->delalloc_lock);

	/* Every 'goto out' above is taken with delalloc_lock not held. */
out:
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		wait_for_completion(&work->completion);
		kfree(work);
	}

	/* After an early exit, give unvisited inodes back to the root. */
	if (!list_empty(&splice)) {
		spin_lock(&root->delalloc_lock);
		list_splice_tail(&splice, &root->delalloc_inodes);
		spin_unlock(&root->delalloc_lock);
	}
	mutex_unlock(&root->delalloc_mutex);
	return ret;
}
 8485
 8486int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
 8487{
 8488	struct writeback_control wbc = {
 8489		.nr_to_write = LONG_MAX,
 8490		.sync_mode = WB_SYNC_NONE,
 8491		.range_start = 0,
 8492		.range_end = LLONG_MAX,
 8493	};
 8494	struct btrfs_fs_info *fs_info = root->fs_info;
 8495
 8496	if (BTRFS_FS_ERROR(fs_info))
 8497		return -EROFS;
 8498
 8499	return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
 8500}
 8501
/*
 * Start delalloc writeback on the roots listed in fs_info->delalloc_roots.
 *
 * @fs_info:            filesystem to flush
 * @nr:                 writeback budget handed to the writeback control as
 *                      nr_to_write; LONG_MAX means a full flush of every root
 * @in_reclaim_context: passed through to start_delalloc_inodes()
 *
 * Returns 0 on success, -EROFS if the filesystem is in an error state, or a
 * negative error from start_delalloc_inodes().
 */
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
			       bool in_reclaim_context)
{
	struct writeback_control wbc = {
		.nr_to_write = nr,
		.sync_mode = WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};
	struct btrfs_root *root;
	LIST_HEAD(splice);
	int ret;

	if (BTRFS_FS_ERROR(fs_info))
		return -EROFS;

	mutex_lock(&fs_info->delalloc_root_mutex);
	spin_lock(&fs_info->delalloc_root_lock);
	/* Work on a private copy of the list; roots are moved back one by one. */
	list_splice_init(&fs_info->delalloc_roots, &splice);
	while (!list_empty(&splice)) {
		/*
		 * Reset nr_to_write here so we know that we're doing a full
		 * flush.
		 */
		if (nr == LONG_MAX)
			wbc.nr_to_write = LONG_MAX;

		root = list_first_entry(&splice, struct btrfs_root,
					delalloc_root);
		/* Hold a reference on the root while the spinlock is dropped. */
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		list_move_tail(&root->delalloc_root,
			       &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);

		ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
		btrfs_put_root(root);
		/* Stop on error or once the nr_to_write budget is used up. */
		if (ret < 0 || wbc.nr_to_write <= 0)
			goto out;
		spin_lock(&fs_info->delalloc_root_lock);
	}
	spin_unlock(&fs_info->delalloc_root_lock);

	ret = 0;
out:
	/* On early exit, return the roots we did not get to. */
	if (!list_empty(&splice)) {
		spin_lock(&fs_info->delalloc_root_lock);
		list_splice_tail(&splice, &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);
	}
	mutex_unlock(&fs_info->delalloc_root_mutex);
	return ret;
}
 8555
/*
 * Create a symbolic link named by @dentry in directory @dir that points to
 * @symname.  The link target is stored as an inline file extent of the new
 * inode.
 *
 * Returns 0 on success, -ENAMETOOLONG if the target does not fit the maximum
 * inline data size, -ENOMEM on allocation failure, or another negative errno
 * from the helpers called here.
 */
static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
			 struct dentry *dentry, const char *symname)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct inode *inode;
	struct btrfs_new_inode_args new_inode_args = {
		.dir = dir,
		.dentry = dentry,
	};
	unsigned int trans_num_items;
	int err;
	int name_len;
	int datasize;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	struct extent_buffer *leaf;

	/* The target must fit into an inline extent. */
	name_len = strlen(symname);
	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
		return -ENAMETOOLONG;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;
	inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
	inode->i_op = &btrfs_symlink_inode_operations;
	inode_nohighmem(inode);
	inode->i_mapping->a_ops = &btrfs_aops;
	/* Size and byte count of the link are the length of the target. */
	btrfs_i_size_write(BTRFS_I(inode), name_len);
	inode_set_bytes(inode, name_len);

	new_inode_args.inode = inode;
	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
	if (err)
		goto out_inode;
	/* 1 additional item for the inline extent */
	trans_num_items++;

	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_new_inode_args;
	}

	err = btrfs_create_new_inode(trans, &new_inode_args);
	if (err)
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		btrfs_abort_transaction(trans, err);
		/*
		 * The inode is disposed of here; clear the pointer so the
		 * iput() at the end is not called on it again.
		 */
		discard_new_inode(inode);
		inode = NULL;
		goto out;
	}
	/* Insert an empty item at file offset 0 to hold the inline extent. */
	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.offset = 0;
	key.type = BTRFS_EXTENT_DATA_KEY;
	datasize = btrfs_file_extent_calc_inline_size(name_len);
	err = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (err) {
		btrfs_abort_transaction(trans, err);
		btrfs_free_path(path);
		discard_new_inode(inode);
		inode = NULL;
		goto out;
	}
	/* Fill in the file extent item header: uncompressed, unencrypted inline. */
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei,
				   BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_compression(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);

	/* Copy the link target into the inline data area of the item. */
	ptr = btrfs_file_extent_inline_start(ei);
	write_extent_buffer(leaf, symname, ptr, name_len);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_free_path(path);

	d_instantiate_new(dentry, inode);
	err = 0;
out:
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
	btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
	if (err)
		iput(inode);
	return err;
}
 8657
/*
 * Insert a preallocated file extent item for @inode at @file_offset that
 * refers to the reserved disk extent described by @ins (objectid is the disk
 * start, offset is the length).
 *
 * If @trans_in is not NULL the item is inserted in that transaction.
 * Otherwise the insertion goes through btrfs_replace_file_extents(), which is
 * handed a pointer to the local transaction handle and is expected to set it
 * (NOTE(review): confirm against btrfs_replace_file_extents()).
 *
 * Returns the transaction handle to use on success, or an ERR_PTR() on
 * failure.
 */
static struct btrfs_trans_handle *insert_prealloc_file_extent(
				       struct btrfs_trans_handle *trans_in,
				       struct btrfs_inode *inode,
				       struct btrfs_key *ins,
				       u64 file_offset)
{
	struct btrfs_file_extent_item stack_fi;
	struct btrfs_replace_extent_info extent_info;
	struct btrfs_trans_handle *trans = trans_in;
	struct btrfs_path *path;
	u64 start = ins->objectid;
	u64 len = ins->offset;
	u64 qgroup_released = 0;
	int ret;

	/* Build the file extent item on the stack. */
	memset(&stack_fi, 0, sizeof(stack_fi));

	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
	btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
	/* Encryption and other encoding is reserved and all 0 */

	ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
	if (ret < 0)
		return ERR_PTR(ret);

	/* Caller supplied a transaction: insert the item directly in it. */
	if (trans) {
		ret = insert_reserved_file_extent(trans, inode,
						  file_offset, &stack_fi,
						  true, qgroup_released);
		if (ret)
			goto free_qgroup;
		return trans;
	}

	/* No transaction yet: describe the new extent and replace the range. */
	extent_info.disk_offset = start;
	extent_info.disk_len = len;
	extent_info.data_offset = 0;
	extent_info.data_len = len;
	extent_info.file_offset = file_offset;
	extent_info.extent_buf = (char *)&stack_fi;
	extent_info.is_new_extent = true;
	extent_info.update_times = true;
	extent_info.qgroup_reserved = qgroup_released;
	extent_info.insertions = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto free_qgroup;
	}

	ret = btrfs_replace_file_extents(inode, path, file_offset,
				     file_offset + len - 1, &extent_info,
				     &trans);
	btrfs_free_path(path);
	if (ret)
		goto free_qgroup;
	return trans;

free_qgroup:
	/*
	 * We have released qgroup data range at the beginning of the function,
	 * and normally qgroup_released bytes will be freed when committing
	 * transaction.
	 * But if we error out early, we have to free what we have released
	 * or we leak qgroup data reservation.
	 */
	btrfs_qgroup_free_refroot(inode->root->fs_info,
			btrfs_root_id(inode->root), qgroup_released,
			BTRFS_QGROUP_RSV_DATA);
	return ERR_PTR(ret);
}
 8734
/*
 * Preallocate disk space for the file range [@start, @start + @num_bytes) of
 * @inode, one reserved extent at a time.
 *
 * @mode:       fallocate mode flags; FALLOC_FL_KEEP_SIZE prevents i_size
 *              from being updated
 * @min_size:   minimum extent size passed to the extent allocator
 * @actual_len: upper bound for the i_size written when i_size is extended
 * @alloc_hint: in/out allocation hint, set to the end of the last allocated
 *              extent
 * @trans:      optional transaction; if NULL, a handle is obtained per
 *              extent through insert_prealloc_file_extent() and ended here
 *
 * Returns 0 on success or a negative errno.  Whatever part of the range was
 * not reserved as an extent has its data space reservation freed before
 * returning.
 */
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
				       u64 start, u64 num_bytes, u64 min_size,
				       loff_t actual_len, u64 *alloc_hint,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key ins;
	u64 cur_offset = start;
	u64 clear_offset = start;
	u64 i_size;
	u64 cur_bytes;
	u64 last_alloc = (u64)-1;
	int ret = 0;
	bool own_trans = true;
	u64 end = start + num_bytes - 1;

	/* A caller-supplied transaction is never ended by this function. */
	if (trans)
		own_trans = false;
	while (num_bytes > 0) {
		/* Ask for at most 256M at a time, but never less than min_size. */
		cur_bytes = min_t(u64, num_bytes, SZ_256M);
		cur_bytes = max(cur_bytes, min_size);
		/*
		 * If we are severely fragmented we could end up with really
		 * small allocations, so if the allocator is returning small
		 * chunks lets make its job easier by only searching for those
		 * sized chunks.
		 */
		cur_bytes = min(cur_bytes, last_alloc);
		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
				min_size, 0, *alloc_hint, &ins, 1, 0);
		if (ret)
			break;

		/*
		 * We've reserved this space, and thus converted it from
		 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
		 * from here on out we will only need to clear our reservation
		 * for the remaining unreserved area, so advance our
		 * clear_offset by our extent size.
		 */
		clear_offset += ins.offset;

		last_alloc = ins.offset;
		trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
						    &ins, cur_offset);
		/*
		 * Now that we inserted the prealloc extent we can finally
		 * decrement the number of reservations in the block group.
		 * If we did it before, we could race with relocation and have
		 * relocation miss the reserved extent, making it fail later.
		 */
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_free_reserved_extent(fs_info, ins.objectid,
						   ins.offset, 0);
			break;
		}

		em = alloc_extent_map();
		if (!em) {
			/*
			 * No extent map for the new extent: drop whatever is
			 * cached for the range and flag the inode as needing
			 * a full sync.
			 */
			btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
					    cur_offset + ins.offset - 1, false);
			btrfs_set_inode_full_sync(BTRFS_I(inode));
			goto next;
		}

		/* Describe the new prealloc extent in the extent map tree. */
		em->start = cur_offset;
		em->len = ins.offset;
		em->disk_bytenr = ins.objectid;
		em->offset = 0;
		em->disk_num_bytes = ins.offset;
		em->ram_bytes = ins.offset;
		em->flags |= EXTENT_FLAG_PREALLOC;
		em->generation = trans->transid;

		/* Note: this return value is overwritten by btrfs_update_inode(). */
		ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
		free_extent_map(em);
next:
		/* Advance past the extent that was just allocated. */
		num_bytes -= ins.offset;
		cur_offset += ins.offset;
		*alloc_hint = ins.objectid + ins.offset;

		inode_inc_iversion(inode);
		inode_set_ctime_current(inode);
		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
		/*
		 * Extend i_size up to the allocated offset, capped at
		 * actual_len, unless the caller asked to keep the size.
		 */
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    (actual_len > inode->i_size) &&
		    (cur_offset > inode->i_size)) {
			if (cur_offset > actual_len)
				i_size = actual_len;
			else
				i_size = cur_offset;
			i_size_write(inode, i_size);
			btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
		}

		ret = btrfs_update_inode(trans, BTRFS_I(inode));

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (own_trans)
				btrfs_end_transaction(trans);
			break;
		}

		/* Our own handle is per extent; the next iteration gets a new one. */
		if (own_trans) {
			btrfs_end_transaction(trans);
			trans = NULL;
		}
	}
	/* Release the data space reservation for the part we did not allocate. */
	if (clear_offset < end)
		btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
			end - clear_offset + 1);
	return ret;
}
 8853
 8854int btrfs_prealloc_file_range(struct inode *inode, int mode,
 8855			      u64 start, u64 num_bytes, u64 min_size,
 8856			      loff_t actual_len, u64 *alloc_hint)
 8857{
 8858	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
 8859					   min_size, actual_len, alloc_hint,
 8860					   NULL);
 8861}
 8862
 8863int btrfs_prealloc_file_range_trans(struct inode *inode,
 8864				    struct btrfs_trans_handle *trans, int mode,
 8865				    u64 start, u64 num_bytes, u64 min_size,
 8866				    loff_t actual_len, u64 *alloc_hint)
 8867{
 8868	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
 8869					   min_size, actual_len, alloc_hint, trans);
 8870}
 8871
 8872static int btrfs_permission(struct mnt_idmap *idmap,
 8873			    struct inode *inode, int mask)
 8874{
 8875	struct btrfs_root *root = BTRFS_I(inode)->root;
 8876	umode_t mode = inode->i_mode;
 8877
 8878	if (mask & MAY_WRITE &&
 8879	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
 8880		if (btrfs_root_readonly(root))
 8881			return -EROFS;
 8882		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
 8883			return -EACCES;
 8884	}
 8885	return generic_permission(idmap, inode, mask);
 8886}
 8887
/*
 * Create an unnamed temporary regular file in @dir and attach it to @file.
 * The inode is created through the new-inode arguments with .orphan set.
 *
 * Returns the result of finish_open_simple(), which is passed 0 or the
 * negative errno of whichever step failed.
 */
static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
			 struct file *file, umode_t mode)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode;
	struct btrfs_new_inode_args new_inode_args = {
		.dir = dir,
		.dentry = file->f_path.dentry,
		.orphan = true,
	};
	unsigned int trans_num_items;
	int ret;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;
	inode_init_owner(idmap, inode, dir, mode);
	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;
	inode->i_mapping->a_ops = &btrfs_aops;

	new_inode_args.inode = inode;
	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
	if (ret)
		goto out_inode;

	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_new_inode_args;
	}

	ret = btrfs_create_new_inode(trans, &new_inode_args);

	/*
	 * We set number of links to 0 in btrfs_create_new_inode(), and here we
	 * set it to 1 because d_tmpfile() will issue a warning if the count is
	 * 0, through:
	 *
	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
	 */
	set_nlink(inode, 1);

	/* Only attach the inode to the file if it was created successfully. */
	if (!ret) {
		d_tmpfile(file, inode);
		unlock_new_inode(inode);
		mark_inode_dirty(inode);
	}

	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
	btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
	if (ret)
		iput(inode);
	return finish_open_simple(file, ret);
}
 8948
 8949void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
 8950{
 8951	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 8952	unsigned long index = start >> PAGE_SHIFT;
 8953	unsigned long end_index = end >> PAGE_SHIFT;
 8954	struct page *page;
 8955	u32 len;
 8956
 8957	ASSERT(end + 1 - start <= U32_MAX);
 8958	len = end + 1 - start;
 8959	while (index <= end_index) {
 8960		page = find_get_page(inode->vfs_inode.i_mapping, index);
 8961		ASSERT(page); /* Pages should be in the extent_io_tree */
 8962
 8963		/* This is for data, which doesn't yet support larger folio. */
 8964		ASSERT(folio_order(page_folio(page)) == 0);
 8965		btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
 8966		put_page(page);
 8967		index++;
 8968	}
 8969}
 8970
 8971int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
 8972					     int compress_type)
 8973{
 8974	switch (compress_type) {
 8975	case BTRFS_COMPRESS_NONE:
 8976		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
 8977	case BTRFS_COMPRESS_ZLIB:
 8978		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
 8979	case BTRFS_COMPRESS_LZO:
 8980		/*
 8981		 * The LZO format depends on the sector size. 64K is the maximum
 8982		 * sector size that we support.
 8983		 */
 8984		if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
 8985			return -EINVAL;
 8986		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
 8987		       (fs_info->sectorsize_bits - 12);
 8988	case BTRFS_COMPRESS_ZSTD:
 8989		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
 8990	default:
 8991		return -EUCLEAN;
 8992	}
 8993}
 8994
/*
 * Encoded read of an inline extent.
 *
 * Looks up the file extent item at @extent_start, fills in @encoded from it
 * and copies the inline data to @iter.  For a compressed extent the whole
 * inline item is returned as is; for an uncompressed one the data starting
 * at iocb->ki_pos is returned, limited by @count and by i_size.
 *
 * Once the data has been copied out of the leaf, the extent range
 * [@start, @lockend] and the inode lock are released and *@unlocked is set
 * to true; on the earlier error paths they are left untouched.
 *
 * Returns the number of bytes copied to @iter, -ENOBUFS if a compressed
 * inline extent does not fit in @count bytes, or another negative errno.
 */
static ssize_t btrfs_encoded_read_inline(
				struct kiocb *iocb,
				struct iov_iter *iter, u64 start,
				u64 lockend,
				struct extent_state **cached_state,
				u64 extent_start, size_t count,
				struct btrfs_ioctl_encoded_io_args *encoded,
				bool *unlocked)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *item;
	u64 ram_bytes;
	unsigned long ptr;
	void *tmp;
	ssize_t ret;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
				       extent_start, 0);
	if (ret) {
		if (ret > 0) {
			/* The extent item disappeared? */
			ret = -EIO;
		}
		goto out;
	}
	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);

	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
	ptr = btrfs_file_extent_inline_start(item);

	/* Logical length: from ki_pos to the end of the extent or EOF. */
	encoded->len = min_t(u64, extent_start + ram_bytes,
			     inode->vfs_inode.i_size) - iocb->ki_pos;
	ret = btrfs_encoded_io_compression_from_extent(fs_info,
				 btrfs_file_extent_compression(leaf, item));
	if (ret < 0)
		goto out;
	encoded->compression = ret;
	if (encoded->compression) {
		size_t inline_size;

		/* Compressed: the whole inline item must fit in the buffer. */
		inline_size = btrfs_file_extent_inline_item_len(leaf,
								path->slots[0]);
		if (inline_size > count) {
			ret = -ENOBUFS;
			goto out;
		}
		count = inline_size;
		encoded->unencoded_len = ram_bytes;
		encoded->unencoded_offset = iocb->ki_pos - extent_start;
	} else {
		/* Uncompressed: return plain data starting at ki_pos. */
		count = min_t(u64, count, encoded->len);
		encoded->len = count;
		encoded->unencoded_len = count;
		ptr += iocb->ki_pos - extent_start;
	}

	/* Copy the data out of the leaf so the locks can be dropped first. */
	tmp = kmalloc(count, GFP_NOFS);
	if (!tmp) {
		ret = -ENOMEM;
		goto out;
	}
	read_extent_buffer(leaf, tmp, ptr, count);
	btrfs_release_path(path);
	unlock_extent(io_tree, start, lockend, cached_state);
	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	*unlocked = true;

	ret = copy_to_iter(tmp, count, iter);
	if (ret != count)
		ret = -EFAULT;
	kfree(tmp);
out:
	btrfs_free_path(path);
	return ret;
}
 9081
 9082struct btrfs_encoded_read_private {
 9083	wait_queue_head_t wait;
 9084	atomic_t pending;
 9085	blk_status_t status;
 9086};
 9087
 9088static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
 9089{
 9090	struct btrfs_encoded_read_private *priv = bbio->private;
 9091
 9092	if (bbio->bio.bi_status) {
 9093		/*
 9094		 * The memory barrier implied by the atomic_dec_return() here
 9095		 * pairs with the memory barrier implied by the
 9096		 * atomic_dec_return() or io_wait_event() in
 9097		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
 9098		 * write is observed before the load of status in
 9099		 * btrfs_encoded_read_regular_fill_pages().
 9100		 */
 9101		WRITE_ONCE(priv->status, bbio->bio.bi_status);
 9102	}
 9103	if (!atomic_dec_return(&priv->pending))
 9104		wake_up(&priv->wait);
 9105	bio_put(&bbio->bio);
 9106}
 9107
 9108int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 9109					  u64 file_offset, u64 disk_bytenr,
 9110					  u64 disk_io_size, struct page **pages)
 9111{
 9112	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 9113	struct btrfs_encoded_read_private priv = {
 9114		.pending = ATOMIC_INIT(1),
 9115	};
 9116	unsigned long i = 0;
 9117	struct btrfs_bio *bbio;
 9118
 9119	init_waitqueue_head(&priv.wait);
 9120
 9121	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
 9122			       btrfs_encoded_read_endio, &priv);
 9123	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
 9124	bbio->inode = inode;
 9125
 9126	do {
 9127		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
 9128
 9129		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
 9130			atomic_inc(&priv.pending);
 9131			btrfs_submit_bio(bbio, 0);
 9132
 9133			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
 9134					       btrfs_encoded_read_endio, &priv);
 9135			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
 9136			bbio->inode = inode;
 9137			continue;
 9138		}
 9139
 9140		i++;
 9141		disk_bytenr += bytes;
 9142		disk_io_size -= bytes;
 9143	} while (disk_io_size);
 9144
 9145	atomic_inc(&priv.pending);
 9146	btrfs_submit_bio(bbio, 0);
 9147
 9148	if (atomic_dec_return(&priv.pending))
 9149		io_wait_event(priv.wait, !atomic_read(&priv.pending));
 9150	/* See btrfs_encoded_read_endio() for ordering. */
 9151	return blk_status_to_errno(READ_ONCE(priv.status));
 9152}
 9153
 9154static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
 9155					  struct iov_iter *iter,
 9156					  u64 start, u64 lockend,
 9157					  struct extent_state **cached_state,
 9158					  u64 disk_bytenr, u64 disk_io_size,
 9159					  size_t count, bool compressed,
 9160					  bool *unlocked)
 9161{
 9162	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
 9163	struct extent_io_tree *io_tree = &inode->io_tree;
 9164	struct page **pages;
 9165	unsigned long nr_pages, i;
 9166	u64 cur;
 9167	size_t page_offset;
 9168	ssize_t ret;
 9169
 9170	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
 9171	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
 9172	if (!pages)
 9173		return -ENOMEM;
 9174	ret = btrfs_alloc_page_array(nr_pages, pages, false);
 9175	if (ret) {
 9176		ret = -ENOMEM;
 9177		goto out;
 9178		}
 9179
 9180	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
 9181						    disk_io_size, pages);
 9182	if (ret)
 9183		goto out;
 9184
 9185	unlock_extent(io_tree, start, lockend, cached_state);
 9186	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 9187	*unlocked = true;
 9188
 9189	if (compressed) {
 9190		i = 0;
 9191		page_offset = 0;
 9192	} else {
 9193		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
 9194		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
 9195	}
 9196	cur = 0;
 9197	while (cur < count) {
 9198		size_t bytes = min_t(size_t, count - cur,
 9199				     PAGE_SIZE - page_offset);
 9200
 9201		if (copy_page_to_iter(pages[i], page_offset, bytes,
 9202				      iter) != bytes) {
 9203			ret = -EFAULT;
 9204			goto out;
 9205		}
 9206		i++;
 9207		cur += bytes;
 9208		page_offset = 0;
 9209	}
 9210	ret = count;
 9211out:
 9212	for (i = 0; i < nr_pages; i++) {
 9213		if (pages[i])
 9214			__free_page(pages[i]);
 9215	}
 9216	kfree(pages);
 9217	return ret;
 9218}
 9219
/*
 * Read the extent containing iocb->ki_pos in its encoded (on-disk) form.
 *
 * @iocb:    kiocb of the read; ki_pos is advanced by encoded->len on success
 * @iter:    destination for the data
 * @encoded: filled in with the length, unencoded length/offset and
 *           compression of the data that is returned
 *
 * Holes and preallocated extents are returned as zeroes, inline extents are
 * handled by btrfs_encoded_read_inline() and everything else by
 * btrfs_encoded_read_regular().
 *
 * Returns the number of bytes written to @iter, 0 at or past EOF, -ENOBUFS
 * if a compressed extent does not fit in @iter, or another negative errno.
 */
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
			   struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_io_tree *io_tree = &inode->io_tree;
	ssize_t ret;
	size_t count = iov_iter_count(iter);
	u64 start, lockend, disk_bytenr, disk_io_size;
	struct extent_state *cached_state = NULL;
	struct extent_map *em;
	/* Set once the extent range and the inode lock have been released. */
	bool unlocked = false;

	file_accessed(iocb->ki_filp);

	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);

	/* Nothing to read at or past EOF. */
	if (iocb->ki_pos >= inode->vfs_inode.i_size) {
		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
		return 0;
	}
	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
	/*
	 * We don't know how long the extent containing iocb->ki_pos is, but if
	 * it's compressed we know that it won't be longer than this.
	 */
	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;

	/* Lock the range, retrying until no ordered extent is left in it. */
	for (;;) {
		struct btrfs_ordered_extent *ordered;

		ret = btrfs_wait_ordered_range(inode, start,
					       lockend - start + 1);
		if (ret)
			goto out_unlock_inode;
		lock_extent(io_tree, start, lockend, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, start,
						     lockend - start + 1);
		if (!ordered)
			break;
		btrfs_put_ordered_extent(ordered);
		unlock_extent(io_tree, start, lockend, &cached_state);
		cond_resched();
	}

	em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_unlock_extent;
	}

	if (em->disk_bytenr == EXTENT_MAP_INLINE) {
		u64 extent_start = em->start;

		/*
		 * For inline extents we get everything we need out of the
		 * extent item.
		 */
		free_extent_map(em);
		em = NULL;
		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
						&cached_state, extent_start,
						count, encoded, &unlocked);
		goto out;
	}

	/*
	 * We only want to return up to EOF even if the extent extends beyond
	 * that.
	 */
	encoded->len = min_t(u64, extent_map_end(em),
			     inode->vfs_inode.i_size) - iocb->ki_pos;
	if (em->disk_bytenr == EXTENT_MAP_HOLE ||
	    (em->flags & EXTENT_FLAG_PREALLOC)) {
		/* Holes and prealloc extents are returned as zeroes below. */
		disk_bytenr = EXTENT_MAP_HOLE;
		count = min_t(u64, count, encoded->len);
		encoded->len = count;
		encoded->unencoded_len = count;
	} else if (extent_map_is_compressed(em)) {
		disk_bytenr = em->disk_bytenr;
		/*
		 * Bail if the buffer isn't large enough to return the whole
		 * compressed extent.
		 */
		if (em->disk_num_bytes > count) {
			ret = -ENOBUFS;
			goto out_em;
		}
		disk_io_size = em->disk_num_bytes;
		count = em->disk_num_bytes;
		encoded->unencoded_len = em->ram_bytes;
		encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
		ret = btrfs_encoded_io_compression_from_extent(fs_info,
							       extent_map_compression(em));
		if (ret < 0)
			goto out_em;
		encoded->compression = ret;
	} else {
		/* Uncompressed extent: read from the sector containing ki_pos. */
		disk_bytenr = extent_map_block_start(em) + (start - em->start);
		if (encoded->len > count)
			encoded->len = count;
		/*
		 * Don't read beyond what we locked. This also limits the page
		 * allocations that we'll do.
		 */
		disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
		count = start + disk_io_size - iocb->ki_pos;
		encoded->len = count;
		encoded->unencoded_len = count;
		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
	}
	free_extent_map(em);
	em = NULL;

	if (disk_bytenr == EXTENT_MAP_HOLE) {
		/* No disk I/O needed: drop the locks and hand out zeroes. */
		unlock_extent(io_tree, start, lockend, &cached_state);
		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
		unlocked = true;
		ret = iov_iter_zero(count, iter);
		if (ret != count)
			ret = -EFAULT;
	} else {
		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
						 &cached_state, disk_bytenr,
						 disk_io_size, count,
						 encoded->compression,
						 &unlocked);
	}

out:
	/* Advance the file position by the logical length that was returned. */
	if (ret >= 0)
		iocb->ki_pos += encoded->len;
out_em:
	free_extent_map(em);
out_unlock_extent:
	if (!unlocked)
		unlock_extent(io_tree, start, lockend, &cached_state);
out_unlock_inode:
	if (!unlocked)
		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	return ret;
}
 9362
 9363ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 9364			       const struct btrfs_ioctl_encoded_io_args *encoded)
 9365{
 9366	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
 9367	struct btrfs_root *root = inode->root;
 9368	struct btrfs_fs_info *fs_info = root->fs_info;
 9369	struct extent_io_tree *io_tree = &inode->io_tree;
 9370	struct extent_changeset *data_reserved = NULL;
 9371	struct extent_state *cached_state = NULL;
 9372	struct btrfs_ordered_extent *ordered;
 9373	struct btrfs_file_extent file_extent;
 9374	int compression;
 9375	size_t orig_count;
 9376	u64 start, end;
 9377	u64 num_bytes, ram_bytes, disk_num_bytes;
 9378	unsigned long nr_folios, i;
 9379	struct folio **folios;
 9380	struct btrfs_key ins;
 9381	bool extent_reserved = false;
 9382	struct extent_map *em;
 9383	ssize_t ret;
 9384
 9385	switch (encoded->compression) {
 9386	case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
 9387		compression = BTRFS_COMPRESS_ZLIB;
 9388		break;
 9389	case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
 9390		compression = BTRFS_COMPRESS_ZSTD;
 9391		break;
 9392	case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
 9393	case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
 9394	case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
 9395	case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
 9396	case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
 9397		/* The sector size must match for LZO. */
 9398		if (encoded->compression -
 9399		    BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
 9400		    fs_info->sectorsize_bits)
 9401			return -EINVAL;
 9402		compression = BTRFS_COMPRESS_LZO;
 9403		break;
 9404	default:
 9405		return -EINVAL;
 9406	}
 9407	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
 9408		return -EINVAL;
 9409
 9410	/*
 9411	 * Compressed extents should always have checksums, so error out if we
 9412	 * have a NOCOW file or inode was created while mounted with NODATASUM.
 9413	 */
 9414	if (inode->flags & BTRFS_INODE_NODATASUM)
 9415		return -EINVAL;
 9416
 9417	orig_count = iov_iter_count(from);
 9418
 9419	/* The extent size must be sane. */
 9420	if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
 9421	    orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
 9422		return -EINVAL;
 9423
 9424	/*
 9425	 * The compressed data must be smaller than the decompressed data.
 9426	 *
 9427	 * It's of course possible for data to compress to larger or the same
 9428	 * size, but the buffered I/O path falls back to no compression for such
 9429	 * data, and we don't want to break any assumptions by creating these
 9430	 * extents.
 9431	 *
 9432	 * Note that this is less strict than the current check we have that the
 9433	 * compressed data must be at least one sector smaller than the
 9434	 * decompressed data. We only want to enforce the weaker requirement
 9435	 * from old kernels that it is at least one byte smaller.
 9436	 */
 9437	if (orig_count >= encoded->unencoded_len)
 9438		return -EINVAL;
 9439
 9440	/* The extent must start on a sector boundary. */
 9441	start = iocb->ki_pos;
 9442	if (!IS_ALIGNED(start, fs_info->sectorsize))
 9443		return -EINVAL;
 9444
 9445	/*
 9446	 * The extent must end on a sector boundary. However, we allow a write
 9447	 * which ends at or extends i_size to have an unaligned length; we round
 9448	 * up the extent size and set i_size to the unaligned end.
 9449	 */
 9450	if (start + encoded->len < inode->vfs_inode.i_size &&
 9451	    !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
 9452		return -EINVAL;
 9453
 9454	/* Finally, the offset in the unencoded data must be sector-aligned. */
 9455	if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
 9456		return -EINVAL;
 9457
 9458	num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
 9459	ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
 9460	end = start + num_bytes - 1;
 9461
 9462	/*
 9463	 * If the extent cannot be inline, the compressed data on disk must be
 9464	 * sector-aligned. For convenience, we extend it with zeroes if it
 9465	 * isn't.
 9466	 */
 9467	disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
 9468	nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
 9469	folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
 9470	if (!folios)
 9471		return -ENOMEM;
 9472	for (i = 0; i < nr_folios; i++) {
 9473		size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
 9474		char *kaddr;
 9475
 9476		folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
 9477		if (!folios[i]) {
 9478			ret = -ENOMEM;
 9479			goto out_folios;
 9480		}
 9481		kaddr = kmap_local_folio(folios[i], 0);
 9482		if (copy_from_iter(kaddr, bytes, from) != bytes) {
 9483			kunmap_local(kaddr);
 9484			ret = -EFAULT;
 9485			goto out_folios;
 9486		}
 9487		if (bytes < PAGE_SIZE)
 9488			memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
 9489		kunmap_local(kaddr);
 9490	}
 9491
 9492	for (;;) {
 9493		struct btrfs_ordered_extent *ordered;
 9494
 9495		ret = btrfs_wait_ordered_range(inode, start, num_bytes);
 9496		if (ret)
 9497			goto out_folios;
 9498		ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
 9499						    start >> PAGE_SHIFT,
 9500						    end >> PAGE_SHIFT);
 9501		if (ret)
 9502			goto out_folios;
 9503		lock_extent(io_tree, start, end, &cached_state);
 9504		ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
 9505		if (!ordered &&
 9506		    !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
 9507			break;
 9508		if (ordered)
 9509			btrfs_put_ordered_extent(ordered);
 9510		unlock_extent(io_tree, start, end, &cached_state);
 9511		cond_resched();
 9512	}
 9513
 9514	/*
 9515	 * We don't use the higher-level delalloc space functions because our
 9516	 * num_bytes and disk_num_bytes are different.
 9517	 */
 9518	ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
 9519	if (ret)
 9520		goto out_unlock;
 9521	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
 9522	if (ret)
 9523		goto out_free_data_space;
 9524	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
 9525					      false);
 9526	if (ret)
 9527		goto out_qgroup_free_data;
 9528
 9529	/* Try an inline extent first. */
 9530	if (encoded->unencoded_len == encoded->len &&
 9531	    encoded->unencoded_offset == 0 &&
 9532	    can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
 9533		ret = __cow_file_range_inline(inode, start, encoded->len,
 9534					      orig_count, compression, folios[0],
 9535					      true);
 9536		if (ret <= 0) {
 9537			if (ret == 0)
 9538				ret = orig_count;
 9539			goto out_delalloc_release;
 9540		}
 9541	}
 9542
 9543	ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
 9544				   disk_num_bytes, 0, 0, &ins, 1, 1);
 9545	if (ret)
 9546		goto out_delalloc_release;
 9547	extent_reserved = true;
 9548
 9549	file_extent.disk_bytenr = ins.objectid;
 9550	file_extent.disk_num_bytes = ins.offset;
 9551	file_extent.num_bytes = num_bytes;
 9552	file_extent.ram_bytes = ram_bytes;
 9553	file_extent.offset = encoded->unencoded_offset;
 9554	file_extent.compression = compression;
 9555	em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
 9556	if (IS_ERR(em)) {
 9557		ret = PTR_ERR(em);
 9558		goto out_free_reserved;
 9559	}
 9560	free_extent_map(em);
 9561
 9562	ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
 9563				       (1 << BTRFS_ORDERED_ENCODED) |
 9564				       (1 << BTRFS_ORDERED_COMPRESSED));
 9565	if (IS_ERR(ordered)) {
 9566		btrfs_drop_extent_map_range(inode, start, end, false);
 9567		ret = PTR_ERR(ordered);
 9568		goto out_free_reserved;
 9569	}
 9570	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 9571
 9572	if (start + encoded->len > inode->vfs_inode.i_size)
 9573		i_size_write(&inode->vfs_inode, start + encoded->len);
 9574
 9575	unlock_extent(io_tree, start, end, &cached_state);
 9576
 9577	btrfs_delalloc_release_extents(inode, num_bytes);
 9578
 9579	btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
 9580	ret = orig_count;
 9581	goto out;
 9582
 9583out_free_reserved:
 9584	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 9585	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
 9586out_delalloc_release:
 9587	btrfs_delalloc_release_extents(inode, num_bytes);
 9588	btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
 9589out_qgroup_free_data:
 9590	if (ret < 0)
 9591		btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
 9592out_free_data_space:
 9593	/*
 9594	 * If btrfs_reserve_extent() succeeded, then we already decremented
 9595	 * bytes_may_use.
 9596	 */
 9597	if (!extent_reserved)
 9598		btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
 9599out_unlock:
 9600	unlock_extent(io_tree, start, end, &cached_state);
 9601out_folios:
 9602	for (i = 0; i < nr_folios; i++) {
 9603		if (folios[i])
 9604			folio_put(folios[i]);
 9605	}
 9606	kvfree(folios);
 9607out:
 9608	if (ret >= 0)
 9609		iocb->ki_pos += encoded->len;
 9610	return ret;
 9611}
 9612
 9613#ifdef CONFIG_SWAP
 9614/*
 9615 * Add an entry indicating a block group or device which is pinned by a
 9616 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
 9617 * negative errno on failure.
 9618 */
 9619static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
 9620				  bool is_block_group)
 9621{
 9622	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 9623	struct btrfs_swapfile_pin *sp, *entry;
 9624	struct rb_node **p;
 9625	struct rb_node *parent = NULL;
 9626
 9627	sp = kmalloc(sizeof(*sp), GFP_NOFS);
 9628	if (!sp)
 9629		return -ENOMEM;
 9630	sp->ptr = ptr;
 9631	sp->inode = inode;
 9632	sp->is_block_group = is_block_group;
 9633	sp->bg_extent_count = 1;
 9634
 9635	spin_lock(&fs_info->swapfile_pins_lock);
 9636	p = &fs_info->swapfile_pins.rb_node;
 9637	while (*p) {
 9638		parent = *p;
 9639		entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
 9640		if (sp->ptr < entry->ptr ||
 9641		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
 9642			p = &(*p)->rb_left;
 9643		} else if (sp->ptr > entry->ptr ||
 9644			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
 9645			p = &(*p)->rb_right;
 9646		} else {
 9647			if (is_block_group)
 9648				entry->bg_extent_count++;
 9649			spin_unlock(&fs_info->swapfile_pins_lock);
 9650			kfree(sp);
 9651			return 1;
 9652		}
 9653	}
 9654	rb_link_node(&sp->node, parent, p);
 9655	rb_insert_color(&sp->node, &fs_info->swapfile_pins);
 9656	spin_unlock(&fs_info->swapfile_pins_lock);
 9657	return 0;
 9658}
 9659
 9660/* Free all of the entries pinned by this swapfile. */
 9661static void btrfs_free_swapfile_pins(struct inode *inode)
 9662{
 9663	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 9664	struct btrfs_swapfile_pin *sp;
 9665	struct rb_node *node, *next;
 9666
 9667	spin_lock(&fs_info->swapfile_pins_lock);
 9668	node = rb_first(&fs_info->swapfile_pins);
 9669	while (node) {
 9670		next = rb_next(node);
 9671		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
 9672		if (sp->inode == inode) {
 9673			rb_erase(&sp->node, &fs_info->swapfile_pins);
 9674			if (sp->is_block_group) {
 9675				btrfs_dec_block_group_swap_extents(sp->ptr,
 9676							   sp->bg_extent_count);
 9677				btrfs_put_block_group(sp->ptr);
 9678			}
 9679			kfree(sp);
 9680		}
 9681		node = next;
 9682	}
 9683	spin_unlock(&fs_info->swapfile_pins_lock);
 9684}
 9685
 9686struct btrfs_swap_info {
 9687	u64 start;
 9688	u64 block_start;
 9689	u64 block_len;
 9690	u64 lowest_ppage;
 9691	u64 highest_ppage;
 9692	unsigned long nr_pages;
 9693	int nr_extents;
 9694};
 9695
 9696static int btrfs_add_swap_extent(struct swap_info_struct *sis,
 9697				 struct btrfs_swap_info *bsi)
 9698{
 9699	unsigned long nr_pages;
 9700	unsigned long max_pages;
 9701	u64 first_ppage, first_ppage_reported, next_ppage;
 9702	int ret;
 9703
 9704	/*
 9705	 * Our swapfile may have had its size extended after the swap header was
 9706	 * written. In that case activating the swapfile should not go beyond
 9707	 * the max size set in the swap header.
 9708	 */
 9709	if (bsi->nr_pages >= sis->max)
 9710		return 0;
 9711
 9712	max_pages = sis->max - bsi->nr_pages;
 9713	first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
 9714	next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
 9715
 9716	if (first_ppage >= next_ppage)
 9717		return 0;
 9718	nr_pages = next_ppage - first_ppage;
 9719	nr_pages = min(nr_pages, max_pages);
 9720
 9721	first_ppage_reported = first_ppage;
 9722	if (bsi->start == 0)
 9723		first_ppage_reported++;
 9724	if (bsi->lowest_ppage > first_ppage_reported)
 9725		bsi->lowest_ppage = first_ppage_reported;
 9726	if (bsi->highest_ppage < (next_ppage - 1))
 9727		bsi->highest_ppage = next_ppage - 1;
 9728
 9729	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
 9730	if (ret < 0)
 9731		return ret;
 9732	bsi->nr_extents += ret;
 9733	bsi->nr_pages += nr_pages;
 9734	return 0;
 9735}
 9736
 9737static void btrfs_swap_deactivate(struct file *file)
 9738{
 9739	struct inode *inode = file_inode(file);
 9740
 9741	btrfs_free_swapfile_pins(inode);
 9742	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
 9743}
 9744
 9745static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 9746			       sector_t *span)
 9747{
 9748	struct inode *inode = file_inode(file);
 9749	struct btrfs_root *root = BTRFS_I(inode)->root;
 9750	struct btrfs_fs_info *fs_info = root->fs_info;
 9751	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 9752	struct extent_state *cached_state = NULL;
 9753	struct extent_map *em = NULL;
 9754	struct btrfs_chunk_map *map = NULL;
 9755	struct btrfs_device *device = NULL;
 9756	struct btrfs_swap_info bsi = {
 9757		.lowest_ppage = (sector_t)-1ULL,
 9758	};
 9759	int ret = 0;
 9760	u64 isize;
 9761	u64 start;
 9762
 9763	/*
 9764	 * If the swap file was just created, make sure delalloc is done. If the
 9765	 * file changes again after this, the user is doing something stupid and
 9766	 * we don't really care.
 9767	 */
 9768	ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
 9769	if (ret)
 9770		return ret;
 9771
 9772	/*
 9773	 * The inode is locked, so these flags won't change after we check them.
 9774	 */
 9775	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
 9776		btrfs_warn(fs_info, "swapfile must not be compressed");
 9777		return -EINVAL;
 9778	}
 9779	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
 9780		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
 9781		return -EINVAL;
 9782	}
 9783	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
 9784		btrfs_warn(fs_info, "swapfile must not be checksummed");
 9785		return -EINVAL;
 9786	}
 9787
 9788	/*
 9789	 * Balance or device remove/replace/resize can move stuff around from
 9790	 * under us. The exclop protection makes sure they aren't running/won't
 9791	 * run concurrently while we are mapping the swap extents, and
 9792	 * fs_info->swapfile_pins prevents them from running while the swap
 9793	 * file is active and moving the extents. Note that this also prevents
 9794	 * a concurrent device add which isn't actually necessary, but it's not
 9795	 * really worth the trouble to allow it.
 9796	 */
 9797	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
 9798		btrfs_warn(fs_info,
 9799	   "cannot activate swapfile while exclusive operation is running");
 9800		return -EBUSY;
 9801	}
 9802
 9803	/*
 9804	 * Prevent snapshot creation while we are activating the swap file.
 9805	 * We do not want to race with snapshot creation. If snapshot creation
 9806	 * already started before we bumped nr_swapfiles from 0 to 1 and
 9807	 * completes before the first write into the swap file after it is
 9808	 * activated, than that write would fallback to COW.
 9809	 */
 9810	if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
 9811		btrfs_exclop_finish(fs_info);
 9812		btrfs_warn(fs_info,
 9813	   "cannot activate swapfile because snapshot creation is in progress");
 9814		return -EINVAL;
 9815	}
 9816	/*
 9817	 * Snapshots can create extents which require COW even if NODATACOW is
 9818	 * set. We use this counter to prevent snapshots. We must increment it
 9819	 * before walking the extents because we don't want a concurrent
 9820	 * snapshot to run after we've already checked the extents.
 9821	 *
 9822	 * It is possible that subvolume is marked for deletion but still not
 9823	 * removed yet. To prevent this race, we check the root status before
 9824	 * activating the swapfile.
 9825	 */
 9826	spin_lock(&root->root_item_lock);
 9827	if (btrfs_root_dead(root)) {
 9828		spin_unlock(&root->root_item_lock);
 9829
 9830		btrfs_exclop_finish(fs_info);
 9831		btrfs_warn(fs_info,
 9832		"cannot activate swapfile because subvolume %llu is being deleted",
 9833			btrfs_root_id(root));
 9834		return -EPERM;
 9835	}
 9836	atomic_inc(&root->nr_swapfiles);
 9837	spin_unlock(&root->root_item_lock);
 9838
 9839	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
 9840
 9841	lock_extent(io_tree, 0, isize - 1, &cached_state);
 9842	start = 0;
 9843	while (start < isize) {
 9844		u64 logical_block_start, physical_block_start;
 9845		struct btrfs_block_group *bg;
 9846		u64 len = isize - start;
 9847
 9848		em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
 9849		if (IS_ERR(em)) {
 9850			ret = PTR_ERR(em);
 9851			goto out;
 9852		}
 9853
 9854		if (em->disk_bytenr == EXTENT_MAP_HOLE) {
 9855			btrfs_warn(fs_info, "swapfile must not have holes");
 9856			ret = -EINVAL;
 9857			goto out;
 9858		}
 9859		if (em->disk_bytenr == EXTENT_MAP_INLINE) {
 9860			/*
 9861			 * It's unlikely we'll ever actually find ourselves
 9862			 * here, as a file small enough to fit inline won't be
 9863			 * big enough to store more than the swap header, but in
 9864			 * case something changes in the future, let's catch it
 9865			 * here rather than later.
 9866			 */
 9867			btrfs_warn(fs_info, "swapfile must not be inline");
 9868			ret = -EINVAL;
 9869			goto out;
 9870		}
 9871		if (extent_map_is_compressed(em)) {
 9872			btrfs_warn(fs_info, "swapfile must not be compressed");
 9873			ret = -EINVAL;
 9874			goto out;
 9875		}
 9876
 9877		logical_block_start = extent_map_block_start(em) + (start - em->start);
 9878		len = min(len, em->len - (start - em->start));
 9879		free_extent_map(em);
 9880		em = NULL;
 9881
 9882		ret = can_nocow_extent(inode, start, &len, NULL, false, true);
 9883		if (ret < 0) {
 9884			goto out;
 9885		} else if (ret) {
 9886			ret = 0;
 9887		} else {
 9888			btrfs_warn(fs_info,
 9889				   "swapfile must not be copy-on-write");
 9890			ret = -EINVAL;
 9891			goto out;
 9892		}
 9893
 9894		map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
 9895		if (IS_ERR(map)) {
 9896			ret = PTR_ERR(map);
 9897			goto out;
 9898		}
 9899
 9900		if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
 9901			btrfs_warn(fs_info,
 9902				   "swapfile must have single data profile");
 9903			ret = -EINVAL;
 9904			goto out;
 9905		}
 9906
 9907		if (device == NULL) {
 9908			device = map->stripes[0].dev;
 9909			ret = btrfs_add_swapfile_pin(inode, device, false);
 9910			if (ret == 1)
 9911				ret = 0;
 9912			else if (ret)
 9913				goto out;
 9914		} else if (device != map->stripes[0].dev) {
 9915			btrfs_warn(fs_info, "swapfile must be on one device");
 9916			ret = -EINVAL;
 9917			goto out;
 9918		}
 9919
 9920		physical_block_start = (map->stripes[0].physical +
 9921					(logical_block_start - map->start));
 9922		len = min(len, map->chunk_len - (logical_block_start - map->start));
 9923		btrfs_free_chunk_map(map);
 9924		map = NULL;
 9925
 9926		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
 9927		if (!bg) {
 9928			btrfs_warn(fs_info,
 9929			   "could not find block group containing swapfile");
 9930			ret = -EINVAL;
 9931			goto out;
 9932		}
 9933
 9934		if (!btrfs_inc_block_group_swap_extents(bg)) {
 9935			btrfs_warn(fs_info,
 9936			   "block group for swapfile at %llu is read-only%s",
 9937			   bg->start,
 9938			   atomic_read(&fs_info->scrubs_running) ?
 9939				       " (scrub running)" : "");
 9940			btrfs_put_block_group(bg);
 9941			ret = -EINVAL;
 9942			goto out;
 9943		}
 9944
 9945		ret = btrfs_add_swapfile_pin(inode, bg, true);
 9946		if (ret) {
 9947			btrfs_put_block_group(bg);
 9948			if (ret == 1)
 9949				ret = 0;
 9950			else
 9951				goto out;
 9952		}
 9953
 9954		if (bsi.block_len &&
 9955		    bsi.block_start + bsi.block_len == physical_block_start) {
 9956			bsi.block_len += len;
 9957		} else {
 9958			if (bsi.block_len) {
 9959				ret = btrfs_add_swap_extent(sis, &bsi);
 9960				if (ret)
 9961					goto out;
 9962			}
 9963			bsi.start = start;
 9964			bsi.block_start = physical_block_start;
 9965			bsi.block_len = len;
 9966		}
 9967
 9968		start += len;
 9969	}
 9970
 9971	if (bsi.block_len)
 9972		ret = btrfs_add_swap_extent(sis, &bsi);
 9973
 9974out:
 9975	if (!IS_ERR_OR_NULL(em))
 9976		free_extent_map(em);
 9977	if (!IS_ERR_OR_NULL(map))
 9978		btrfs_free_chunk_map(map);
 9979
 9980	unlock_extent(io_tree, 0, isize - 1, &cached_state);
 9981
 9982	if (ret)
 9983		btrfs_swap_deactivate(file);
 9984
 9985	btrfs_drew_write_unlock(&root->snapshot_lock);
 9986
 9987	btrfs_exclop_finish(fs_info);
 9988
 9989	if (ret)
 9990		return ret;
 9991
 9992	if (device)
 9993		sis->bdev = device->bdev;
 9994	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
 9995	sis->max = bsi.nr_pages;
 9996	sis->pages = bsi.nr_pages - 1;
 9997	sis->highest_bit = bsi.nr_pages - 1;
 9998	return bsi.nr_extents;
 9999}
10000#else
/* Without CONFIG_SWAP there is nothing to release; intentionally empty. */
static void btrfs_swap_deactivate(struct file *file)
{
	/* No-op. */
}
10004
10005static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10006			       sector_t *span)
10007{
10008	return -EOPNOTSUPP;
10009}
10010#endif
10011
10012/*
10013 * Update the number of bytes used in the VFS' inode. When we replace extents in
10014 * a range (clone, dedupe, fallocate's zero range), we must update the number of
10015 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10016 * always get a correct value.
10017 */
10018void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10019			      const u64 add_bytes,
10020			      const u64 del_bytes)
10021{
10022	if (add_bytes == del_bytes)
10023		return;
10024
10025	spin_lock(&inode->lock);
10026	if (del_bytes > 0)
10027		inode_sub_bytes(&inode->vfs_inode, del_bytes);
10028	if (add_bytes > 0)
10029		inode_add_bytes(&inode->vfs_inode, add_bytes);
10030	spin_unlock(&inode->lock);
10031}
10032
10033/*
10034 * Verify that there are no ordered extents for a given file range.
10035 *
10036 * @inode:   The target inode.
10037 * @start:   Start offset of the file range, should be sector size aligned.
10038 * @end:     End offset (inclusive) of the file range, its value +1 should be
10039 *           sector size aligned.
10040 *
10041 * This should typically be used for cases where we locked an inode's VFS lock in
10042 * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode,
10043 * we have flushed all delalloc in the range, we have waited for all ordered
10044 * extents in the range to complete and finally we have locked the file range in
10045 * the inode's io_tree.
10046 */
10047void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
10048{
10049	struct btrfs_root *root = inode->root;
10050	struct btrfs_ordered_extent *ordered;
10051
10052	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
10053		return;
10054
10055	ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
10056	if (ordered) {
10057		btrfs_err(root->fs_info,
10058"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
10059			  start, end, btrfs_ino(inode), btrfs_root_id(root),
10060			  ordered->file_offset,
10061			  ordered->file_offset + ordered->num_bytes - 1);
10062		btrfs_put_ordered_extent(ordered);
10063	}
10064
10065	ASSERT(ordered == NULL);
10066}
10067
10068/*
10069 * Find the first inode with a minimum number.
10070 *
10071 * @root:	The root to search for.
10072 * @min_ino:	The minimum inode number.
10073 *
10074 * Find the first inode in the @root with a number >= @min_ino and return it.
10075 * Returns NULL if no such inode found.
10076 */
10077struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
10078{
10079	struct btrfs_inode *inode;
10080	unsigned long from = min_ino;
10081
10082	xa_lock(&root->inodes);
10083	while (true) {
10084		inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
10085		if (!inode)
10086			break;
10087		if (igrab(&inode->vfs_inode))
10088			break;
10089
10090		from = btrfs_ino(inode) + 1;
10091		cond_resched_lock(&root->inodes.xa_lock);
10092	}
10093	xa_unlock(&root->inodes);
10094
10095	return inode;
10096}
10097
10098static const struct inode_operations btrfs_dir_inode_operations = {
10099	.getattr	= btrfs_getattr,
10100	.lookup		= btrfs_lookup,
10101	.create		= btrfs_create,
10102	.unlink		= btrfs_unlink,
10103	.link		= btrfs_link,
10104	.mkdir		= btrfs_mkdir,
10105	.rmdir		= btrfs_rmdir,
10106	.rename		= btrfs_rename2,
10107	.symlink	= btrfs_symlink,
10108	.setattr	= btrfs_setattr,
10109	.mknod		= btrfs_mknod,
10110	.listxattr	= btrfs_listxattr,
10111	.permission	= btrfs_permission,
10112	.get_inode_acl	= btrfs_get_acl,
10113	.set_acl	= btrfs_set_acl,
10114	.update_time	= btrfs_update_time,
10115	.tmpfile        = btrfs_tmpfile,
10116	.fileattr_get	= btrfs_fileattr_get,
10117	.fileattr_set	= btrfs_fileattr_set,
10118};
10119
10120static const struct file_operations btrfs_dir_file_operations = {
10121	.llseek		= btrfs_dir_llseek,
10122	.read		= generic_read_dir,
10123	.iterate_shared	= btrfs_real_readdir,
10124	.open		= btrfs_opendir,
10125	.unlocked_ioctl	= btrfs_ioctl,
10126#ifdef CONFIG_COMPAT
10127	.compat_ioctl	= btrfs_compat_ioctl,
10128#endif
10129	.release        = btrfs_release_file,
10130	.fsync		= btrfs_sync_file,
10131};
10132
10133/*
10134 * btrfs doesn't support the bmap operation because swapfiles
10135 * use bmap to make a mapping of extents in the file.  They assume
10136 * these extents won't change over the life of the file and they
10137 * use the bmap result to do IO directly to the drive.
10138 *
10139 * the btrfs bmap call would return logical addresses that aren't
10140 * suitable for IO and they also will change frequently as COW
10141 * operations happen.  So, swapfile + btrfs == corruption.
10142 *
10143 * For now we're avoiding this by dropping bmap.
10144 */
10145static const struct address_space_operations btrfs_aops = {
10146	.read_folio	= btrfs_read_folio,
10147	.writepages	= btrfs_writepages,
10148	.readahead	= btrfs_readahead,
10149	.invalidate_folio = btrfs_invalidate_folio,
10150	.launder_folio	= btrfs_launder_folio,
10151	.release_folio	= btrfs_release_folio,
10152	.migrate_folio	= btrfs_migrate_folio,
10153	.dirty_folio	= filemap_dirty_folio,
10154	.error_remove_folio = generic_error_remove_folio,
10155	.swap_activate	= btrfs_swap_activate,
10156	.swap_deactivate = btrfs_swap_deactivate,
10157};
10158
10159static const struct inode_operations btrfs_file_inode_operations = {
10160	.getattr	= btrfs_getattr,
10161	.setattr	= btrfs_setattr,
10162	.listxattr      = btrfs_listxattr,
10163	.permission	= btrfs_permission,
10164	.fiemap		= btrfs_fiemap,
10165	.get_inode_acl	= btrfs_get_acl,
10166	.set_acl	= btrfs_set_acl,
10167	.update_time	= btrfs_update_time,
10168	.fileattr_get	= btrfs_fileattr_get,
10169	.fileattr_set	= btrfs_fileattr_set,
10170};
10171static const struct inode_operations btrfs_special_inode_operations = {
10172	.getattr	= btrfs_getattr,
10173	.setattr	= btrfs_setattr,
10174	.permission	= btrfs_permission,
10175	.listxattr	= btrfs_listxattr,
10176	.get_inode_acl	= btrfs_get_acl,
10177	.set_acl	= btrfs_set_acl,
10178	.update_time	= btrfs_update_time,
10179};
10180static const struct inode_operations btrfs_symlink_inode_operations = {
10181	.get_link	= page_get_link,
10182	.getattr	= btrfs_getattr,
10183	.setattr	= btrfs_setattr,
10184	.permission	= btrfs_permission,
10185	.listxattr	= btrfs_listxattr,
10186	.update_time	= btrfs_update_time,
10187};
10188
10189const struct dentry_operations btrfs_dentry_operations = {
10190	.d_delete	= btrfs_dentry_delete,
10191};
Configure Feed

Configure Feed