fs/btrfs/inode.c at v6.7 · tjh.dev/kernel

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / btrfs / inode.c
at v6.7 10970 lines 320 kB view raw
wrap content
    1// SPDX-License-Identifier: GPL-2.0
    2/*
    3 * Copyright (C) 2007 Oracle.  All rights reserved.
    4 */
    5
    6#include <crypto/hash.h>
    7#include <linux/kernel.h>
    8#include <linux/bio.h>
    9#include <linux/blk-cgroup.h>
   10#include <linux/file.h>
   11#include <linux/fs.h>
   12#include <linux/pagemap.h>
   13#include <linux/highmem.h>
   14#include <linux/time.h>
   15#include <linux/init.h>
   16#include <linux/string.h>
   17#include <linux/backing-dev.h>
   18#include <linux/writeback.h>
   19#include <linux/compat.h>
   20#include <linux/xattr.h>
   21#include <linux/posix_acl.h>
   22#include <linux/falloc.h>
   23#include <linux/slab.h>
   24#include <linux/ratelimit.h>
   25#include <linux/btrfs.h>
   26#include <linux/blkdev.h>
   27#include <linux/posix_acl_xattr.h>
   28#include <linux/uio.h>
   29#include <linux/magic.h>
   30#include <linux/iversion.h>
   31#include <linux/swap.h>
   32#include <linux/migrate.h>
   33#include <linux/sched/mm.h>
   34#include <linux/iomap.h>
   35#include <asm/unaligned.h>
   36#include <linux/fsverity.h>
   37#include "misc.h"
   38#include "ctree.h"
   39#include "disk-io.h"
   40#include "transaction.h"
   41#include "btrfs_inode.h"
   42#include "print-tree.h"
   43#include "ordered-data.h"
   44#include "xattr.h"
   45#include "tree-log.h"
   46#include "bio.h"
   47#include "compression.h"
   48#include "locking.h"
   49#include "free-space-cache.h"
   50#include "props.h"
   51#include "qgroup.h"
   52#include "delalloc-space.h"
   53#include "block-group.h"
   54#include "space-info.h"
   55#include "zoned.h"
   56#include "subpage.h"
   57#include "inode-item.h"
   58#include "fs.h"
   59#include "accessors.h"
   60#include "extent-tree.h"
   61#include "root-tree.h"
   62#include "defrag.h"
   63#include "dir-item.h"
   64#include "file-item.h"
   65#include "uuid-tree.h"
   66#include "ioctl.h"
   67#include "file.h"
   68#include "acl.h"
   69#include "relocation.h"
   70#include "verity.h"
   71#include "super.h"
   72#include "orphan.h"
   73#include "backref.h"
   74#include "raid-stripe-tree.h"
   75
/*
 * Identifies an inode by number within a given root.
 *
 * NOTE(review): the users of this struct are outside the visible part of the
 * file; presumably it is the argument bundle for the inode lookup (iget)
 * callbacks - confirm at the use sites.
 */
struct btrfs_iget_args {
	/* Inode number. */
	u64 ino;
	/* Root the inode number is relative to. */
	struct btrfs_root *root;
};
   80
/*
 * State carried across one direct I/O operation.
 *
 * NOTE(review): none of the users are visible in this part of the file; the
 * per-field notes below are read off the field names and types and should be
 * confirmed against the direct I/O code.
 */
struct btrfs_dio_data {
	/* Presumably the number of bytes submitted so far - confirm. */
	ssize_t submitted;
	/* Changeset tracking reserved data space for the operation. */
	struct extent_changeset *data_reserved;
	/* Ordered extent associated with the operation, if any. */
	struct btrfs_ordered_extent *ordered;
	/* Presumably set once data space has been reserved - confirm. */
	bool data_space_reserved;
	/* Presumably set once a NOCOW write has been set up - confirm. */
	bool nocow_done;
};
   88
/*
 * Direct I/O context that embeds a btrfs_bio together with the file range it
 * covers.
 */
struct btrfs_dio_private {
	/* Range of I/O */
	u64 file_offset;
	u32 bytes;

	/*
	 * This must be last.
	 * NOTE(review): presumably because the structure is allocated as the
	 * front padding of a bio from btrfs_dio_bioset - confirm at the bioset
	 * initialization site.
	 */
	struct btrfs_bio bbio;
};
   97
/*
 * Bio set used by the direct I/O code.
 * NOTE(review): initialization and users are outside the visible part of the
 * file; presumably it backs allocation of struct btrfs_dio_private - confirm.
 */
static struct bio_set btrfs_dio_bioset;
   99
/* Context passed through the rename code to hand results back to the caller. */
struct btrfs_rename_ctx {
	/* Output field. Stores the index number of the old directory entry. */
	u64 index;
};
  104
/*
 * Used by data_reloc_print_warning_inode() to pass needed info for filename
 * resolution and output of error message.
 */
struct data_reloc_warn {
	/*
	 * Scratch path: used for the inode item lookup and then handed to
	 * init_ipath() for path resolution.
	 */
	struct btrfs_path path;
	/* Filesystem the warnings are emitted for. */
	struct btrfs_fs_info *fs_info;
	/* Set from the offset of the extent item key by print_data_reloc_error(). */
	u64 extent_item_size;
	/* Logical address of the checksum failure, as printed in the warnings. */
	u64 logical;
	/* Mirror number of the failed read, as printed in the warnings. */
	int mirror_num;
};
  116
/*
 * Forward declarations of the operation tables. They are static, so their
 * definitions live elsewhere in this translation unit.
 */
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;

/*
 * Slab cache.
 * NOTE(review): presumably for struct btrfs_inode objects - creation and use
 * are outside the visible part of the file.
 */
static struct kmem_cache *btrfs_inode_cachep;

/* Forward declarations of static helpers used before their definitions. */
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);

static noinline int run_delalloc_cow(struct btrfs_inode *inode,
				     struct page *locked_page, u64 start,
				     u64 end, struct writeback_control *wbc,
				     bool pages_dirty);
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
				       u64 len, u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);
  138
  139static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
  140					  u64 root, void *warn_ctx)
  141{
  142	struct data_reloc_warn *warn = warn_ctx;
  143	struct btrfs_fs_info *fs_info = warn->fs_info;
  144	struct extent_buffer *eb;
  145	struct btrfs_inode_item *inode_item;
  146	struct inode_fs_paths *ipath = NULL;
  147	struct btrfs_root *local_root;
  148	struct btrfs_key key;
  149	unsigned int nofs_flag;
  150	u32 nlink;
  151	int ret;
  152
  153	local_root = btrfs_get_fs_root(fs_info, root, true);
  154	if (IS_ERR(local_root)) {
  155		ret = PTR_ERR(local_root);
  156		goto err;
  157	}
  158
  159	/* This makes the path point to (inum INODE_ITEM ioff). */
  160	key.objectid = inum;
  161	key.type = BTRFS_INODE_ITEM_KEY;
  162	key.offset = 0;
  163
  164	ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
  165	if (ret) {
  166		btrfs_put_root(local_root);
  167		btrfs_release_path(&warn->path);
  168		goto err;
  169	}
  170
  171	eb = warn->path.nodes[0];
  172	inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
  173	nlink = btrfs_inode_nlink(eb, inode_item);
  174	btrfs_release_path(&warn->path);
  175
  176	nofs_flag = memalloc_nofs_save();
  177	ipath = init_ipath(4096, local_root, &warn->path);
  178	memalloc_nofs_restore(nofs_flag);
  179	if (IS_ERR(ipath)) {
  180		btrfs_put_root(local_root);
  181		ret = PTR_ERR(ipath);
  182		ipath = NULL;
  183		/*
  184		 * -ENOMEM, not a critical error, just output an generic error
  185		 * without filename.
  186		 */
  187		btrfs_warn(fs_info,
  188"checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
  189			   warn->logical, warn->mirror_num, root, inum, offset);
  190		return ret;
  191	}
  192	ret = paths_from_inode(inum, ipath);
  193	if (ret < 0)
  194		goto err;
  195
  196	/*
  197	 * We deliberately ignore the bit ipath might have been too small to
  198	 * hold all of the paths here
  199	 */
  200	for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
  201		btrfs_warn(fs_info,
  202"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
  203			   warn->logical, warn->mirror_num, root, inum, offset,
  204			   fs_info->sectorsize, nlink,
  205			   (char *)(unsigned long)ipath->fspath->val[i]);
  206	}
  207
  208	btrfs_put_root(local_root);
  209	free_ipath(ipath);
  210	return 0;
  211
  212err:
  213	btrfs_warn(fs_info,
  214"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
  215		   warn->logical, warn->mirror_num, root, inum, offset, ret);
  216
  217	free_ipath(ipath);
  218	return ret;
  219}
  220
  221/*
  222 * Do extra user-friendly error output (e.g. lookup all the affected files).
  223 *
  224 * Return true if we succeeded doing the backref lookup.
  225 * Return false if such lookup failed, and has to fallback to the old error message.
  226 */
  227static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
  228				   const u8 *csum, const u8 *csum_expected,
  229				   int mirror_num)
  230{
  231	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  232	struct btrfs_path path = { 0 };
  233	struct btrfs_key found_key = { 0 };
  234	struct extent_buffer *eb;
  235	struct btrfs_extent_item *ei;
  236	const u32 csum_size = fs_info->csum_size;
  237	u64 logical;
  238	u64 flags;
  239	u32 item_size;
  240	int ret;
  241
  242	mutex_lock(&fs_info->reloc_mutex);
  243	logical = btrfs_get_reloc_bg_bytenr(fs_info);
  244	mutex_unlock(&fs_info->reloc_mutex);
  245
  246	if (logical == U64_MAX) {
  247		btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
  248		btrfs_warn_rl(fs_info,
  249"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
  250			inode->root->root_key.objectid, btrfs_ino(inode), file_off,
  251			CSUM_FMT_VALUE(csum_size, csum),
  252			CSUM_FMT_VALUE(csum_size, csum_expected),
  253			mirror_num);
  254		return;
  255	}
  256
  257	logical += file_off;
  258	btrfs_warn_rl(fs_info,
  259"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
  260			inode->root->root_key.objectid,
  261			btrfs_ino(inode), file_off, logical,
  262			CSUM_FMT_VALUE(csum_size, csum),
  263			CSUM_FMT_VALUE(csum_size, csum_expected),
  264			mirror_num);
  265
  266	ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
  267	if (ret < 0) {
  268		btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
  269			     logical, ret);
  270		return;
  271	}
  272	eb = path.nodes[0];
  273	ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
  274	item_size = btrfs_item_size(eb, path.slots[0]);
  275	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
  276		unsigned long ptr = 0;
  277		u64 ref_root;
  278		u8 ref_level;
  279
  280		while (true) {
  281			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
  282						      item_size, &ref_root,
  283						      &ref_level);
  284			if (ret < 0) {
  285				btrfs_warn_rl(fs_info,
  286				"failed to resolve tree backref for logical %llu: %d",
  287					      logical, ret);
  288				break;
  289			}
  290			if (ret > 0)
  291				break;
  292
  293			btrfs_warn_rl(fs_info,
  294"csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
  295				logical, mirror_num,
  296				(ref_level ? "node" : "leaf"),
  297				ref_level, ref_root);
  298		}
  299		btrfs_release_path(&path);
  300	} else {
  301		struct btrfs_backref_walk_ctx ctx = { 0 };
  302		struct data_reloc_warn reloc_warn = { 0 };
  303
  304		btrfs_release_path(&path);
  305
  306		ctx.bytenr = found_key.objectid;
  307		ctx.extent_item_pos = logical - found_key.objectid;
  308		ctx.fs_info = fs_info;
  309
  310		reloc_warn.logical = logical;
  311		reloc_warn.extent_item_size = found_key.offset;
  312		reloc_warn.mirror_num = mirror_num;
  313		reloc_warn.fs_info = fs_info;
  314
  315		iterate_extent_inodes(&ctx, true,
  316				      data_reloc_print_warning_inode, &reloc_warn);
  317	}
  318}
  319
  320static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
  321		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
  322{
  323	struct btrfs_root *root = inode->root;
  324	const u32 csum_size = root->fs_info->csum_size;
  325
  326	/* For data reloc tree, it's better to do a backref lookup instead. */
  327	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
  328		return print_data_reloc_error(inode, logical_start, csum,
  329					      csum_expected, mirror_num);
  330
  331	/* Output without objectid, which is more meaningful */
  332	if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
  333		btrfs_warn_rl(root->fs_info,
  334"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
  335			root->root_key.objectid, btrfs_ino(inode),
  336			logical_start,
  337			CSUM_FMT_VALUE(csum_size, csum),
  338			CSUM_FMT_VALUE(csum_size, csum_expected),
  339			mirror_num);
  340	} else {
  341		btrfs_warn_rl(root->fs_info,
  342"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
  343			root->root_key.objectid, btrfs_ino(inode),
  344			logical_start,
  345			CSUM_FMT_VALUE(csum_size, csum),
  346			CSUM_FMT_VALUE(csum_size, csum_expected),
  347			mirror_num);
  348	}
  349}
  350
  351/*
  352 * Lock inode i_rwsem based on arguments passed.
  353 *
  354 * ilock_flags can have the following bit set:
  355 *
  356 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
  357 * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
  358 *		     return -EAGAIN
  359 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
  360 */
  361int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
  362{
  363	if (ilock_flags & BTRFS_ILOCK_SHARED) {
  364		if (ilock_flags & BTRFS_ILOCK_TRY) {
  365			if (!inode_trylock_shared(&inode->vfs_inode))
  366				return -EAGAIN;
  367			else
  368				return 0;
  369		}
  370		inode_lock_shared(&inode->vfs_inode);
  371	} else {
  372		if (ilock_flags & BTRFS_ILOCK_TRY) {
  373			if (!inode_trylock(&inode->vfs_inode))
  374				return -EAGAIN;
  375			else
  376				return 0;
  377		}
  378		inode_lock(&inode->vfs_inode);
  379	}
  380	if (ilock_flags & BTRFS_ILOCK_MMAP)
  381		down_write(&inode->i_mmap_lock);
  382	return 0;
  383}
  384
  385/*
  386 * Unock inode i_rwsem.
  387 *
  388 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
  389 * to decide whether the lock acquired is shared or exclusive.
  390 */
  391void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
  392{
  393	if (ilock_flags & BTRFS_ILOCK_MMAP)
  394		up_write(&inode->i_mmap_lock);
  395	if (ilock_flags & BTRFS_ILOCK_SHARED)
  396		inode_unlock_shared(&inode->vfs_inode);
  397	else
  398		inode_unlock(&inode->vfs_inode);
  399}
  400
  401/*
  402 * Cleanup all submitted ordered extents in specified range to handle errors
  403 * from the btrfs_run_delalloc_range() callback.
  404 *
  405 * NOTE: caller must ensure that when an error happens, it can not call
  406 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
  407 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
  408 * to be released, which we want to happen only when finishing the ordered
  409 * extent (btrfs_finish_ordered_io()).
  410 */
  411static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
  412						 struct page *locked_page,
  413						 u64 offset, u64 bytes)
  414{
  415	unsigned long index = offset >> PAGE_SHIFT;
  416	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
  417	u64 page_start = 0, page_end = 0;
  418	struct page *page;
  419
  420	if (locked_page) {
  421		page_start = page_offset(locked_page);
  422		page_end = page_start + PAGE_SIZE - 1;
  423	}
  424
  425	while (index <= end_index) {
  426		/*
  427		 * For locked page, we will call btrfs_mark_ordered_io_finished
  428		 * through btrfs_mark_ordered_io_finished() on it
  429		 * in run_delalloc_range() for the error handling, which will
  430		 * clear page Ordered and run the ordered extent accounting.
  431		 *
  432		 * Here we can't just clear the Ordered bit, or
  433		 * btrfs_mark_ordered_io_finished() would skip the accounting
  434		 * for the page range, and the ordered extent will never finish.
  435		 */
  436		if (locked_page && index == (page_start >> PAGE_SHIFT)) {
  437			index++;
  438			continue;
  439		}
  440		page = find_get_page(inode->vfs_inode.i_mapping, index);
  441		index++;
  442		if (!page)
  443			continue;
  444
  445		/*
  446		 * Here we just clear all Ordered bits for every page in the
  447		 * range, then btrfs_mark_ordered_io_finished() will handle
  448		 * the ordered extent accounting for the range.
  449		 */
  450		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
  451					       offset, bytes);
  452		put_page(page);
  453	}
  454
  455	if (locked_page) {
  456		/* The locked page covers the full range, nothing needs to be done */
  457		if (bytes + offset <= page_start + PAGE_SIZE)
  458			return;
  459		/*
  460		 * In case this page belongs to the delalloc range being
  461		 * instantiated then skip it, since the first page of a range is
  462		 * going to be properly cleaned up by the caller of
  463		 * run_delalloc_range
  464		 */
  465		if (page_start >= offset && page_end <= (offset + bytes - 1)) {
  466			bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
  467			offset = page_offset(locked_page) + PAGE_SIZE;
  468		}
  469	}
  470
  471	return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
  472}
  473
/* Forward declaration; the definition is outside the visible part of the file. */
static int btrfs_dirty_inode(struct btrfs_inode *inode);
  475
  476static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
  477				     struct btrfs_new_inode_args *args)
  478{
  479	int err;
  480
  481	if (args->default_acl) {
  482		err = __btrfs_set_acl(trans, args->inode, args->default_acl,
  483				      ACL_TYPE_DEFAULT);
  484		if (err)
  485			return err;
  486	}
  487	if (args->acl) {
  488		err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
  489		if (err)
  490			return err;
  491	}
  492	if (!args->default_acl && !args->acl)
  493		cache_no_acl(args->inode);
  494	return btrfs_xattr_security_init(trans, args->inode, args->dir,
  495					 &args->dentry->d_name);
  496}
  497
  498/*
  499 * this does all the hard work for inserting an inline extent into
  500 * the btree.  The caller should have done a btrfs_drop_extents so that
  501 * no overlapping inline items exist in the btree
  502 */
  503static int insert_inline_extent(struct btrfs_trans_handle *trans,
  504				struct btrfs_path *path,
  505				struct btrfs_inode *inode, bool extent_inserted,
  506				size_t size, size_t compressed_size,
  507				int compress_type,
  508				struct page **compressed_pages,
  509				bool update_i_size)
  510{
  511	struct btrfs_root *root = inode->root;
  512	struct extent_buffer *leaf;
  513	struct page *page = NULL;
  514	char *kaddr;
  515	unsigned long ptr;
  516	struct btrfs_file_extent_item *ei;
  517	int ret;
  518	size_t cur_size = size;
  519	u64 i_size;
  520
  521	ASSERT((compressed_size > 0 && compressed_pages) ||
  522	       (compressed_size == 0 && !compressed_pages));
  523
  524	if (compressed_size && compressed_pages)
  525		cur_size = compressed_size;
  526
  527	if (!extent_inserted) {
  528		struct btrfs_key key;
  529		size_t datasize;
  530
  531		key.objectid = btrfs_ino(inode);
  532		key.offset = 0;
  533		key.type = BTRFS_EXTENT_DATA_KEY;
  534
  535		datasize = btrfs_file_extent_calc_inline_size(cur_size);
  536		ret = btrfs_insert_empty_item(trans, root, path, &key,
  537					      datasize);
  538		if (ret)
  539			goto fail;
  540	}
  541	leaf = path->nodes[0];
  542	ei = btrfs_item_ptr(leaf, path->slots[0],
  543			    struct btrfs_file_extent_item);
  544	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
  545	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
  546	btrfs_set_file_extent_encryption(leaf, ei, 0);
  547	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
  548	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
  549	ptr = btrfs_file_extent_inline_start(ei);
  550
  551	if (compress_type != BTRFS_COMPRESS_NONE) {
  552		struct page *cpage;
  553		int i = 0;
  554		while (compressed_size > 0) {
  555			cpage = compressed_pages[i];
  556			cur_size = min_t(unsigned long, compressed_size,
  557				       PAGE_SIZE);
  558
  559			kaddr = kmap_local_page(cpage);
  560			write_extent_buffer(leaf, kaddr, ptr, cur_size);
  561			kunmap_local(kaddr);
  562
  563			i++;
  564			ptr += cur_size;
  565			compressed_size -= cur_size;
  566		}
  567		btrfs_set_file_extent_compression(leaf, ei,
  568						  compress_type);
  569	} else {
  570		page = find_get_page(inode->vfs_inode.i_mapping, 0);
  571		btrfs_set_file_extent_compression(leaf, ei, 0);
  572		kaddr = kmap_local_page(page);
  573		write_extent_buffer(leaf, kaddr, ptr, size);
  574		kunmap_local(kaddr);
  575		put_page(page);
  576	}
  577	btrfs_mark_buffer_dirty(trans, leaf);
  578	btrfs_release_path(path);
  579
  580	/*
  581	 * We align size to sectorsize for inline extents just for simplicity
  582	 * sake.
  583	 */
  584	ret = btrfs_inode_set_file_extent_range(inode, 0,
  585					ALIGN(size, root->fs_info->sectorsize));
  586	if (ret)
  587		goto fail;
  588
  589	/*
  590	 * We're an inline extent, so nobody can extend the file past i_size
  591	 * without locking a page we already have locked.
  592	 *
  593	 * We must do any i_size and inode updates before we unlock the pages.
  594	 * Otherwise we could end up racing with unlink.
  595	 */
  596	i_size = i_size_read(&inode->vfs_inode);
  597	if (update_i_size && size > i_size) {
  598		i_size_write(&inode->vfs_inode, size);
  599		i_size = size;
  600	}
  601	inode->disk_i_size = i_size;
  602
  603fail:
  604	return ret;
  605}
  606
  607
  608/*
  609 * conditionally insert an inline extent into the file.  This
  610 * does the checks required to make sure the data is small enough
  611 * to fit as an inline extent.
  612 */
  613static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
  614					  size_t compressed_size,
  615					  int compress_type,
  616					  struct page **compressed_pages,
  617					  bool update_i_size)
  618{
  619	struct btrfs_drop_extents_args drop_args = { 0 };
  620	struct btrfs_root *root = inode->root;
  621	struct btrfs_fs_info *fs_info = root->fs_info;
  622	struct btrfs_trans_handle *trans;
  623	u64 data_len = (compressed_size ?: size);
  624	int ret;
  625	struct btrfs_path *path;
  626
  627	/*
  628	 * We can create an inline extent if it ends at or beyond the current
  629	 * i_size, is no larger than a sector (decompressed), and the (possibly
  630	 * compressed) data fits in a leaf and the configured maximum inline
  631	 * size.
  632	 */
  633	if (size < i_size_read(&inode->vfs_inode) ||
  634	    size > fs_info->sectorsize ||
  635	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
  636	    data_len > fs_info->max_inline)
  637		return 1;
  638
  639	path = btrfs_alloc_path();
  640	if (!path)
  641		return -ENOMEM;
  642
  643	trans = btrfs_join_transaction(root);
  644	if (IS_ERR(trans)) {
  645		btrfs_free_path(path);
  646		return PTR_ERR(trans);
  647	}
  648	trans->block_rsv = &inode->block_rsv;
  649
  650	drop_args.path = path;
  651	drop_args.start = 0;
  652	drop_args.end = fs_info->sectorsize;
  653	drop_args.drop_cache = true;
  654	drop_args.replace_extent = true;
  655	drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
  656	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
  657	if (ret) {
  658		btrfs_abort_transaction(trans, ret);
  659		goto out;
  660	}
  661
  662	ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
  663				   size, compressed_size, compress_type,
  664				   compressed_pages, update_i_size);
  665	if (ret && ret != -ENOSPC) {
  666		btrfs_abort_transaction(trans, ret);
  667		goto out;
  668	} else if (ret == -ENOSPC) {
  669		ret = 1;
  670		goto out;
  671	}
  672
  673	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
  674	ret = btrfs_update_inode(trans, inode);
  675	if (ret && ret != -ENOSPC) {
  676		btrfs_abort_transaction(trans, ret);
  677		goto out;
  678	} else if (ret == -ENOSPC) {
  679		ret = 1;
  680		goto out;
  681	}
  682
  683	btrfs_set_inode_full_sync(inode);
  684out:
  685	/*
  686	 * Don't forget to free the reserved space, as for inlined extent
  687	 * it won't count as data extent, free them directly here.
  688	 * And at reserve time, it's always aligned to page size, so
  689	 * just free one page here.
  690	 */
  691	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
  692	btrfs_free_path(path);
  693	btrfs_end_transaction(trans);
  694	return ret;
  695}
  696
/*
 * One extent queued on an async_chunk by add_async_extent().  All fields
 * except @list are copied verbatim from the arguments of that function.
 */
struct async_extent {
	/* Start offset of the extent. */
	u64 start;
	/* Uncompressed size. */
	u64 ram_size;
	/* Compressed size. */
	u64 compressed_size;
	/* Page array passed to add_async_extent(). */
	struct page **pages;
	/* Number of entries in @pages. */
	unsigned long nr_pages;
	/* Compression type of the data. */
	int compress_type;
	/* Link in async_chunk::extents. */
	struct list_head list;
};
  706
/*
 * One unit of work for compress_file_range(), which recovers it from the
 * embedded @work member and compresses the range [@start, @end] of @inode.
 */
struct async_chunk {
	/* Inode the range belongs to. */
	struct btrfs_inode *inode;
	/* NOTE(review): presumably the page locked by the writeback caller - confirm. */
	struct page *locked_page;
	/* First byte of the range. */
	u64 start;
	/* Last byte of the range; users compute the length as end - start + 1. */
	u64 end;
	/* Write flags for the resulting I/O. */
	blk_opf_t write_flags;
	/* List of struct async_extent, filled by add_async_extent(). */
	struct list_head extents;
	/* Block cgroup state associated with the chunk. */
	struct cgroup_subsys_state *blkcg_css;
	/* Work item; compress_file_range() uses container_of() on it. */
	struct btrfs_work work;
	/* Back pointer to the owning async_cow. */
	struct async_cow *async_cow;
};
  718
/* Container for a group of async chunks. */
struct async_cow {
	/* NOTE(review): presumably the number of chunks still in flight - confirm. */
	atomic_t num_chunks;
	/* Flexible array of chunks, sized at allocation time. */
	struct async_chunk chunks[];
};
  723
  724static noinline int add_async_extent(struct async_chunk *cow,
  725				     u64 start, u64 ram_size,
  726				     u64 compressed_size,
  727				     struct page **pages,
  728				     unsigned long nr_pages,
  729				     int compress_type)
  730{
  731	struct async_extent *async_extent;
  732
  733	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
  734	BUG_ON(!async_extent); /* -ENOMEM */
  735	async_extent->start = start;
  736	async_extent->ram_size = ram_size;
  737	async_extent->compressed_size = compressed_size;
  738	async_extent->pages = pages;
  739	async_extent->nr_pages = nr_pages;
  740	async_extent->compress_type = compress_type;
  741	list_add_tail(&async_extent->list, &cow->extents);
  742	return 0;
  743}
  744
  745/*
  746 * Check if the inode needs to be submitted to compression, based on mount
  747 * options, defragmentation, properties or heuristics.
  748 */
  749static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
  750				      u64 end)
  751{
  752	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  753
  754	if (!btrfs_inode_can_compress(inode)) {
  755		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
  756			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
  757			btrfs_ino(inode));
  758		return 0;
  759	}
  760	/*
  761	 * Special check for subpage.
  762	 *
  763	 * We lock the full page then run each delalloc range in the page, thus
  764	 * for the following case, we will hit some subpage specific corner case:
  765	 *
  766	 * 0		32K		64K
  767	 * |	|///////|	|///////|
  768	 *		\- A		\- B
  769	 *
  770	 * In above case, both range A and range B will try to unlock the full
  771	 * page [0, 64K), causing the one finished later will have page
  772	 * unlocked already, triggering various page lock requirement BUG_ON()s.
  773	 *
  774	 * So here we add an artificial limit that subpage compression can only
  775	 * if the range is fully page aligned.
  776	 *
  777	 * In theory we only need to ensure the first page is fully covered, but
  778	 * the tailing partial page will be locked until the full compression
  779	 * finishes, delaying the write of other range.
  780	 *
  781	 * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
  782	 * first to prevent any submitted async extent to unlock the full page.
  783	 * By this, we can ensure for subpage case that only the last async_cow
  784	 * will unlock the full page.
  785	 */
  786	if (fs_info->sectorsize < PAGE_SIZE) {
  787		if (!PAGE_ALIGNED(start) ||
  788		    !PAGE_ALIGNED(end + 1))
  789			return 0;
  790	}
  791
  792	/* force compress */
  793	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
  794		return 1;
  795	/* defrag ioctl */
  796	if (inode->defrag_compress)
  797		return 1;
  798	/* bad compression ratios */
  799	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
  800		return 0;
  801	if (btrfs_test_opt(fs_info, COMPRESS) ||
  802	    inode->flags & BTRFS_INODE_COMPRESS ||
  803	    inode->prop_compress)
  804		return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
  805	return 0;
  806}
  807
  808static inline void inode_should_defrag(struct btrfs_inode *inode,
  809		u64 start, u64 end, u64 num_bytes, u32 small_write)
  810{
  811	/* If this is a small write inside eof, kick off a defrag */
  812	if (num_bytes < small_write &&
  813	    (start > 0 || end + 1 < inode->disk_i_size))
  814		btrfs_add_inode_defrag(NULL, inode, small_write);
  815}
  816
  817/*
  818 * Work queue call back to started compression on a file and pages.
  819 *
  820 * This is done inside an ordered work queue, and the compression is spread
  821 * across many cpus.  The actual IO submission is step two, and the ordered work
  822 * queue takes care of making sure that happens in the same order things were
  823 * put onto the queue by writepages and friends.
  824 *
  825 * If this code finds it can't get good compression, it puts an entry onto the
  826 * work queue to write the uncompressed bytes.  This makes sure that both
  827 * compressed inodes and uncompressed inodes are written in the same order that
  828 * the flusher thread sent them down.
  829 */
  830static void compress_file_range(struct btrfs_work *work)
  831{
  832	struct async_chunk *async_chunk =
  833		container_of(work, struct async_chunk, work);
  834	struct btrfs_inode *inode = async_chunk->inode;
  835	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  836	struct address_space *mapping = inode->vfs_inode.i_mapping;
  837	u64 blocksize = fs_info->sectorsize;
  838	u64 start = async_chunk->start;
  839	u64 end = async_chunk->end;
  840	u64 actual_end;
  841	u64 i_size;
  842	int ret = 0;
  843	struct page **pages;
  844	unsigned long nr_pages;
  845	unsigned long total_compressed = 0;
  846	unsigned long total_in = 0;
  847	unsigned int poff;
  848	int i;
  849	int compress_type = fs_info->compress_type;
  850
  851	inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
  852
  853	/*
  854	 * We need to call clear_page_dirty_for_io on each page in the range.
  855	 * Otherwise applications with the file mmap'd can wander in and change
  856	 * the page contents while we are compressing them.
  857	 */
  858	extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
  859
  860	/*
  861	 * We need to save i_size before now because it could change in between
  862	 * us evaluating the size and assigning it.  This is because we lock and
  863	 * unlock the page in truncate and fallocate, and then modify the i_size
  864	 * later on.
  865	 *
  866	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
  867	 * does that for us.
  868	 */
  869	barrier();
  870	i_size = i_size_read(&inode->vfs_inode);
  871	barrier();
  872	actual_end = min_t(u64, i_size, end + 1);
  873again:
  874	pages = NULL;
  875	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
  876	nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
  877
  878	/*
  879	 * we don't want to send crud past the end of i_size through
  880	 * compression, that's just a waste of CPU time.  So, if the
  881	 * end of the file is before the start of our current
  882	 * requested range of bytes, we bail out to the uncompressed
  883	 * cleanup code that can deal with all of this.
  884	 *
  885	 * It isn't really the fastest way to fix things, but this is a
  886	 * very uncommon corner.
  887	 */
  888	if (actual_end <= start)
  889		goto cleanup_and_bail_uncompressed;
  890
  891	total_compressed = actual_end - start;
  892
  893	/*
  894	 * Skip compression for a small file range(<=blocksize) that
  895	 * isn't an inline extent, since it doesn't save disk space at all.
  896	 */
  897	if (total_compressed <= blocksize &&
  898	   (start > 0 || end + 1 < inode->disk_i_size))
  899		goto cleanup_and_bail_uncompressed;
  900
  901	/*
  902	 * For subpage case, we require full page alignment for the sector
  903	 * aligned range.
  904	 * Thus we must also check against @actual_end, not just @end.
  905	 */
  906	if (blocksize < PAGE_SIZE) {
  907		if (!PAGE_ALIGNED(start) ||
  908		    !PAGE_ALIGNED(round_up(actual_end, blocksize)))
  909			goto cleanup_and_bail_uncompressed;
  910	}
  911
  912	total_compressed = min_t(unsigned long, total_compressed,
  913			BTRFS_MAX_UNCOMPRESSED);
  914	total_in = 0;
  915	ret = 0;
  916
  917	/*
  918	 * We do compression for mount -o compress and when the inode has not
  919	 * been flagged as NOCOMPRESS.  This flag can change at any time if we
  920	 * discover bad compression ratios.
  921	 */
  922	if (!inode_need_compress(inode, start, end))
  923		goto cleanup_and_bail_uncompressed;
  924
  925	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
  926	if (!pages) {
  927		/*
  928		 * Memory allocation failure is not a fatal error, we can fall
  929		 * back to uncompressed code.
  930		 */
  931		goto cleanup_and_bail_uncompressed;
  932	}
  933
  934	if (inode->defrag_compress)
  935		compress_type = inode->defrag_compress;
  936	else if (inode->prop_compress)
  937		compress_type = inode->prop_compress;
  938
  939	/* Compression level is applied here. */
  940	ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4),
  941				   mapping, start, pages, &nr_pages, &total_in,
  942				   &total_compressed);
  943	if (ret)
  944		goto mark_incompressible;
  945
  946	/*
  947	 * Zero the tail end of the last page, as we might be sending it down
  948	 * to disk.
  949	 */
  950	poff = offset_in_page(total_compressed);
  951	if (poff)
  952		memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff);
  953
  954	/*
  955	 * Try to create an inline extent.
  956	 *
  957	 * If we didn't compress the entire range, try to create an uncompressed
  958	 * inline extent, else a compressed one.
  959	 *
  960	 * Check cow_file_range() for why we don't even try to create inline
  961	 * extent for the subpage case.
  962	 */
  963	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
  964		if (total_in < actual_end) {
  965			ret = cow_file_range_inline(inode, actual_end, 0,
  966						    BTRFS_COMPRESS_NONE, NULL,
  967						    false);
  968		} else {
  969			ret = cow_file_range_inline(inode, actual_end,
  970						    total_compressed,
  971						    compress_type, pages,
  972						    false);
  973		}
  974		if (ret <= 0) {
  975			unsigned long clear_flags = EXTENT_DELALLOC |
  976				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
  977				EXTENT_DO_ACCOUNTING;
  978
  979			if (ret < 0)
  980				mapping_set_error(mapping, -EIO);
  981
  982			/*
  983			 * inline extent creation worked or returned error,
  984			 * we don't need to create any more async work items.
  985			 * Unlock and free up our temp pages.
  986			 *
  987			 * We use DO_ACCOUNTING here because we need the
  988			 * delalloc_release_metadata to be done _after_ we drop
  989			 * our outstanding extent for clearing delalloc for this
  990			 * range.
  991			 */
  992			extent_clear_unlock_delalloc(inode, start, end,
  993						     NULL,
  994						     clear_flags,
  995						     PAGE_UNLOCK |
  996						     PAGE_START_WRITEBACK |
  997						     PAGE_END_WRITEBACK);
  998			goto free_pages;
  999		}
 1000	}
 1001
 1002	/*
 1003	 * We aren't doing an inline extent. Round the compressed size up to a
 1004	 * block size boundary so the allocator does sane things.
 1005	 */
 1006	total_compressed = ALIGN(total_compressed, blocksize);
 1007
 1008	/*
 1009	 * One last check to make sure the compression is really a win, compare
 1010	 * the page count read with the blocks on disk, compression must free at
 1011	 * least one sector.
 1012	 */
 1013	total_in = round_up(total_in, fs_info->sectorsize);
 1014	if (total_compressed + blocksize > total_in)
 1015		goto mark_incompressible;
 1016
 1017	/*
 1018	 * The async work queues will take care of doing actual allocation on
 1019	 * disk for these compressed pages, and will submit the bios.
 1020	 */
 1021	add_async_extent(async_chunk, start, total_in, total_compressed, pages,
 1022			 nr_pages, compress_type);
 1023	if (start + total_in < end) {
 1024		start += total_in;
 1025		cond_resched();
 1026		goto again;
 1027	}
 1028	return;
 1029
 1030mark_incompressible:
 1031	if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
 1032		inode->flags |= BTRFS_INODE_NOCOMPRESS;
 1033cleanup_and_bail_uncompressed:
 1034	add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
 1035			 BTRFS_COMPRESS_NONE);
 1036free_pages:
 1037	if (pages) {
 1038		for (i = 0; i < nr_pages; i++) {
 1039			WARN_ON(pages[i]->mapping);
 1040			put_page(pages[i]);
 1041		}
 1042		kfree(pages);
 1043	}
 1044}
 1045
 1046static void free_async_extent_pages(struct async_extent *async_extent)
 1047{
 1048	int i;
 1049
 1050	if (!async_extent->pages)
 1051		return;
 1052
 1053	for (i = 0; i < async_extent->nr_pages; i++) {
 1054		WARN_ON(async_extent->pages[i]->mapping);
 1055		put_page(async_extent->pages[i]);
 1056	}
 1057	kfree(async_extent->pages);
 1058	async_extent->nr_pages = 0;
 1059	async_extent->pages = NULL;
 1060}
 1061
 1062static void submit_uncompressed_range(struct btrfs_inode *inode,
 1063				      struct async_extent *async_extent,
 1064				      struct page *locked_page)
 1065{
 1066	u64 start = async_extent->start;
 1067	u64 end = async_extent->start + async_extent->ram_size - 1;
 1068	int ret;
 1069	struct writeback_control wbc = {
 1070		.sync_mode		= WB_SYNC_ALL,
 1071		.range_start		= start,
 1072		.range_end		= end,
 1073		.no_cgroup_owner	= 1,
 1074	};
 1075
 1076	wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
 1077	ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
 1078	wbc_detach_inode(&wbc);
 1079	if (ret < 0) {
 1080		btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
 1081		if (locked_page) {
 1082			const u64 page_start = page_offset(locked_page);
 1083
 1084			set_page_writeback(locked_page);
 1085			end_page_writeback(locked_page);
 1086			btrfs_mark_ordered_io_finished(inode, locked_page,
 1087						       page_start, PAGE_SIZE,
 1088						       !ret);
 1089			mapping_set_error(locked_page->mapping, ret);
 1090			unlock_page(locked_page);
 1091		}
 1092	}
 1093}
 1094
/*
 * Submit a single async extent that was queued on @async_chunk.
 *
 * An extent whose compress_type is BTRFS_COMPRESS_NONE is handed to
 * submit_uncompressed_range().  Otherwise an on-disk extent is reserved, an
 * extent map and an ordered extent are created for it, and the pages attached
 * to @async_extent are submitted as a compressed write.
 *
 * @alloc_hint is passed to btrfs_reserve_extent() and, after a successful
 * compressed submission, updated to point right behind the reserved extent.
 *
 * @async_extent is freed on every path out of this function.
 */
static void submit_one_async_extent(struct async_chunk *async_chunk,
				    struct async_extent *async_extent,
				    u64 *alloc_hint)
{
	struct btrfs_inode *inode = async_chunk->inode;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_key ins;
	struct page *locked_page = NULL;
	struct extent_map *em;
	int ret = 0;
	u64 start = async_extent->start;
	u64 end = async_extent->start + async_extent->ram_size - 1;

	/* Undone with kthread_associate_blkcg(NULL) on both exit paths. */
	if (async_chunk->blkcg_css)
		kthread_associate_blkcg(async_chunk->blkcg_css);

	/*
	 * If async_chunk->locked_page is in the async_extent range, we need to
	 * handle it.
	 */
	if (async_chunk->locked_page) {
		u64 locked_page_start = page_offset(async_chunk->locked_page);
		u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;

		/* Only pass the page on when it overlaps [start, end]. */
		if (!(start >= locked_page_end || end <= locked_page_start))
			locked_page = async_chunk->locked_page;
	}
	lock_extent(io_tree, start, end, NULL);

	if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
		submit_uncompressed_range(inode, async_extent, locked_page);
		goto done;
	}

	ret = btrfs_reserve_extent(root, async_extent->ram_size,
				   async_extent->compressed_size,
				   async_extent->compressed_size,
				   0, *alloc_hint, &ins, 1, 1);
	if (ret) {
		/*
		 * Here we used to try again by going back to non-compressed
		 * path for ENOSPC.  But we can't reserve space even for
		 * compressed size, how could it work for uncompressed size
		 * which requires larger size?  So here we directly go error
		 * path.
		 */
		goto out_free;
	}

	/* Here we're doing allocation and writeback of the compressed pages */
	em = create_io_em(inode, start,
			  async_extent->ram_size,	/* len */
			  start,			/* orig_start */
			  ins.objectid,			/* block_start */
			  ins.offset,			/* block_len */
			  ins.offset,			/* orig_block_len */
			  async_extent->ram_size,	/* ram_bytes */
			  async_extent->compress_type,
			  BTRFS_ORDERED_COMPRESSED);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_free_reserve;
	}
	free_extent_map(em);

	ordered = btrfs_alloc_ordered_extent(inode, start,	/* file_offset */
				       async_extent->ram_size,	/* num_bytes */
				       async_extent->ram_size,	/* ram_bytes */
				       ins.objectid,		/* disk_bytenr */
				       ins.offset,		/* disk_num_bytes */
				       0,			/* offset */
				       1 << BTRFS_ORDERED_COMPRESSED,
				       async_extent->compress_type);
	if (IS_ERR(ordered)) {
		/* Undo the extent map created by create_io_em() above. */
		btrfs_drop_extent_map_range(inode, start, end, false);
		ret = PTR_ERR(ordered);
		goto out_free_reserve;
	}
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);

	/* Clear dirty, set writeback and unlock the pages. */
	extent_clear_unlock_delalloc(inode, start, end,
			NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
			PAGE_UNLOCK | PAGE_START_WRITEBACK);
	btrfs_submit_compressed_write(ordered,
			    async_extent->pages,	/* compressed_pages */
			    async_extent->nr_pages,
			    async_chunk->write_flags, true);
	/* Hint the next allocation to start right behind this extent. */
	*alloc_hint = ins.objectid + ins.offset;
done:
	if (async_chunk->blkcg_css)
		kthread_associate_blkcg(NULL);
	kfree(async_extent);
	return;

out_free_reserve:
	/* Give back the extent obtained from btrfs_reserve_extent(). */
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
	/*
	 * The mapping always gets -EIO; the actual error code in @ret is only
	 * reported through the debug message below.
	 */
	mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
	extent_clear_unlock_delalloc(inode, start, end,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
				     PAGE_END_WRITEBACK);
	free_async_extent_pages(async_extent);
	if (async_chunk->blkcg_css)
		kthread_associate_blkcg(NULL);
	btrfs_debug(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
		    root->root_key.objectid, btrfs_ino(inode), start,
		    async_extent->ram_size, ret);
	kfree(async_extent);
}
 1213
 1214static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
 1215				      u64 num_bytes)
 1216{
 1217	struct extent_map_tree *em_tree = &inode->extent_tree;
 1218	struct extent_map *em;
 1219	u64 alloc_hint = 0;
 1220
 1221	read_lock(&em_tree->lock);
 1222	em = search_extent_mapping(em_tree, start, num_bytes);
 1223	if (em) {
 1224		/*
 1225		 * if block start isn't an actual block number then find the
 1226		 * first block in this inode and use that as a hint.  If that
 1227		 * block is also bogus then just don't worry about it.
 1228		 */
 1229		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
 1230			free_extent_map(em);
 1231			em = search_extent_mapping(em_tree, 0, 0);
 1232			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
 1233				alloc_hint = em->block_start;
 1234			if (em)
 1235				free_extent_map(em);
 1236		} else {
 1237			alloc_hint = em->block_start;
 1238			free_extent_map(em);
 1239		}
 1240	}
 1241	read_unlock(&em_tree->lock);
 1242
 1243	return alloc_hint;
 1244}
 1245
/*
 * When extent_io.c finds a delayed allocation range in the file, the
 * callbacks end up in this code.  The basic idea is to allocate extents on
 * disk for the range, and create ordered data structs in ram to track those
 * extents.
 *
 * @locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * @done_offset, when not NULL, receives the last file offset that was
 * processed: @end on full completion, or the offset just before the first
 * unallocated byte when the function stops early on -EAGAIN from the
 * allocator (in which case 0 is returned).
 *
 * @no_inline suppresses the attempt to create an inline extent.
 *
 * When this function fails, it unlocks all pages except @locked_page.
 *
 * When this function successfully creates an inline extent, it returns 1 and
 * unlocks all pages including locked_page and starts I/O on them.
 * (In reality inline extents are limited to a single page, so locked_page is
 * the only page handled anyway).
 *
 * When this function succeeds and creates a normal extent, the page locking
 * status depends on the passed in flags:
 *
 * - If @keep_locked is set, all pages are kept locked.
 * - Else all pages except for @locked_page are unlocked.
 *
 * When a failure happens in the second or later iteration of the
 * while-loop, the ordered extents created in previous iterations are kept
 * intact. So, the caller must clean them up by calling
 * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
 * example.
 */
static noinline int cow_file_range(struct btrfs_inode *inode,
				   struct page *locked_page, u64 start, u64 end,
				   u64 *done_offset,
				   bool keep_locked, bool no_inline)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 alloc_hint = 0;
	u64 orig_start = start;
	u64 num_bytes;
	unsigned long ram_size;
	u64 cur_alloc_size = 0;
	u64 min_alloc_size;
	u64 blocksize = fs_info->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	unsigned clear_bits;
	unsigned long page_ops;
	bool extent_reserved = false;
	int ret = 0;

	/* Free space inodes are rejected with -EINVAL. */
	if (btrfs_is_free_space_inode(inode)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Work on whole blocks, and on at least one of them. */
	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize,  num_bytes);
	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));

	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);

	/*
	 * Due to the page size limit, for subpage we can only trigger the
	 * writeback for the dirty sectors of page, that means data writeback
	 * is doing more writeback than what we want.
	 *
	 * This is especially unexpected for some call sites like fallocate,
	 * where we only increase i_size after everything is done.
	 * This means we can trigger inline extent even if we didn't want to.
	 * So here we skip inline extent creation completely.
	 */
	if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) {
		u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
				       end + 1);

		/* lets try to make an inline extent */
		ret = cow_file_range_inline(inode, actual_end, 0,
					    BTRFS_COMPRESS_NONE, NULL, false);
		if (ret == 0) {
			/*
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be run _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end,
				     locked_page,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
			/*
			 * locked_page is locked by the caller of
			 * writepage_delalloc(), not locked by
			 * __process_pages_contig().
			 *
			 * We can't let __process_pages_contig() to unlock it,
			 * as it doesn't have any subpage::writers recorded.
			 *
			 * Here we manually unlock the page, since the caller
			 * can't determine if it's an inline extent or a
			 * compressed extent.
			 *
			 * NOTE(review): locked_page is used unconditionally
			 * here while the error path below tests it for NULL;
			 * confirm no caller can reach this branch with a NULL
			 * locked_page.
			 */
			unlock_page(locked_page);
			ret = 1;
			goto done;
		} else if (ret < 0) {
			goto out_unlock;
		}
		/* ret > 0: no inline extent was created, fall through. */
	}

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);

	/*
	 * Relocation relies on the relocated extents to have exactly the same
	 * size as the original extents. Normally writeback for relocation data
	 * extents follows a NOCOW path because relocation preallocates the
	 * extents. However, due to an operation such as scrub turning a block
	 * group to RO mode, it may fallback to COW mode, so we must make sure
	 * an extent allocated during COW has exactly the requested size and can
	 * not be split into smaller extents, otherwise relocation breaks and
	 * fails during the stage where it updates the bytenr of file extent
	 * items.
	 */
	if (btrfs_is_data_reloc_root(root))
		min_alloc_size = num_bytes;
	else
		min_alloc_size = fs_info->sectorsize;

	/*
	 * Each iteration reserves one extent (possibly smaller than what is
	 * left), creates its extent map and ordered extent, and then advances
	 * @start past it.
	 */
	while (num_bytes > 0) {
		struct btrfs_ordered_extent *ordered;

		cur_alloc_size = num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
					   min_alloc_size, 0, alloc_hint,
					   &ins, 1, 1);
		if (ret == -EAGAIN) {
			/*
			 * btrfs_reserve_extent only returns -EAGAIN for zoned
			 * file systems, which is an indication that there are
			 * no active zones to allocate from at the moment.
			 *
			 * If this is the first loop iteration, wait for at
			 * least one zone to finish before retrying the
			 * allocation.  Otherwise ask the caller to write out
			 * the already allocated blocks before coming back to
			 * us, or return -ENOSPC if it can't handle retries.
			 */
			ASSERT(btrfs_is_zoned(fs_info));
			if (start == orig_start) {
				wait_on_bit_io(&inode->root->fs_info->flags,
					       BTRFS_FS_NEED_ZONE_FINISH,
					       TASK_UNINTERRUPTIBLE);
				continue;
			}
			if (done_offset) {
				*done_offset = start - 1;
				return 0;
			}
			ret = -ENOSPC;
		}
		if (ret < 0)
			goto out_unlock;
		/* The allocator reports the size it actually reserved. */
		cur_alloc_size = ins.offset;
		extent_reserved = true;

		ram_size = ins.offset;
		em = create_io_em(inode, start, ins.offset, /* len */
				  start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  ram_size, /* ram_bytes */
				  BTRFS_COMPRESS_NONE, /* compress_type */
				  BTRFS_ORDERED_REGULAR /* type */);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out_reserve;
		}
		free_extent_map(em);

		ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
					ram_size, ins.objectid, cur_alloc_size,
					0, 1 << BTRFS_ORDERED_REGULAR,
					BTRFS_COMPRESS_NONE);
		if (IS_ERR(ordered)) {
			ret = PTR_ERR(ordered);
			goto out_drop_extent_cache;
		}

		if (btrfs_is_data_reloc_root(root)) {
			ret = btrfs_reloc_clone_csums(ordered);

			/*
			 * Only drop cache here, and process as normal.
			 *
			 * We must not allow extent_clear_unlock_delalloc()
			 * at out_unlock label to free meta of this ordered
			 * extent, as its meta should be freed by
			 * btrfs_finish_ordered_io().
			 *
			 * So we must continue until @start is increased to
			 * skip current ordered extent.
			 */
			if (ret)
				btrfs_drop_extent_map_range(inode, start,
							    start + ram_size - 1,
							    false);
		}
		btrfs_put_ordered_extent(ordered);

		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * We're not doing compressed IO, don't unlock the first page
		 * (which the caller expects to stay locked), don't clear any
		 * dirty bits and don't set any writeback bits
		 *
		 * Do set the Ordered (Private2) bit so we know this page was
		 * properly setup for writepage.
		 */
		page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
		page_ops |= PAGE_SET_ORDERED;

		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
					     locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     page_ops);
		/* Clamp instead of letting the unsigned subtraction wrap. */
		if (num_bytes < cur_alloc_size)
			num_bytes = 0;
		else
			num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
		extent_reserved = false;

		/*
		 * btrfs_reloc_clone_csums() error, since start is increased
		 * extent_clear_unlock_delalloc() at out_unlock label won't
		 * free metadata of current ordered extent, we're OK to exit.
		 */
		if (ret)
			goto out_unlock;
	}
done:
	if (done_offset)
		*done_offset = end;
	return ret;

out_drop_extent_cache:
	btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
out_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
	/*
	 * Now, we have three regions to clean up:
	 *
	 * |-------(1)----|---(2)---|-------------(3)----------|
	 * `- orig_start  `- start  `- start + cur_alloc_size  `- end
	 *
	 * We process each region below.
	 */

	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;

	/*
	 * For the range (1). We have already instantiated the ordered extents
	 * for this region. They are cleaned up by
	 * btrfs_cleanup_ordered_extents() in e.g,
	 * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
	 * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
	 * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
	 * function.
	 *
	 * However, in case of @keep_locked, we still need to unlock the pages
	 * (except @locked_page) to ensure all the pages are unlocked.
	 */
	if (keep_locked && orig_start < start) {
		if (!locked_page)
			mapping_set_error(inode->vfs_inode.i_mapping, ret);
		extent_clear_unlock_delalloc(inode, orig_start, start - 1,
					     locked_page, 0, page_ops);
	}

	/*
	 * For the range (2). If we reserved an extent for our delalloc range
	 * (or a subrange) and failed to create the respective ordered extent,
	 * then it means that when we reserved the extent we decremented the
	 * extent's size from the data space_info's bytes_may_use counter and
	 * incremented the space_info's bytes_reserved counter by the same
	 * amount. We must make sure extent_clear_unlock_delalloc() does not try
	 * to decrement again the data space_info's bytes_may_use counter,
	 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
	 */
	if (extent_reserved) {
		extent_clear_unlock_delalloc(inode, start,
					     start + cur_alloc_size - 1,
					     locked_page,
					     clear_bits,
					     page_ops);
		start += cur_alloc_size;
	}

	/*
	 * For the range (3). We never touched the region. In addition to the
	 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
	 * space_info's bytes_may_use counter, reserved in
	 * btrfs_check_data_free_space().
	 */
	if (start < end) {
		clear_bits |= EXTENT_CLEAR_DATA_RESV;
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     clear_bits, page_ops);
	}
	return ret;
}
 1564
 1565/*
 1566 * Phase two of compressed writeback.  This is the ordered portion of the code,
 1567 * which only gets called in the order the work was queued.  We walk all the
 1568 * async extents created by compress_file_range and send them down to the disk.
 1569 *
 1570 * If called with @do_free == true then it'll try to finish the work and free
 1571 * the work struct eventually.
 1572 */
 1573static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
 1574{
 1575	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
 1576						     work);
 1577	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
 1578	struct async_extent *async_extent;
 1579	unsigned long nr_pages;
 1580	u64 alloc_hint = 0;
 1581
 1582	if (do_free) {
 1583		struct async_chunk *async_chunk;
 1584		struct async_cow *async_cow;
 1585
 1586		async_chunk = container_of(work, struct async_chunk, work);
 1587		btrfs_add_delayed_iput(async_chunk->inode);
 1588		if (async_chunk->blkcg_css)
 1589			css_put(async_chunk->blkcg_css);
 1590
 1591		async_cow = async_chunk->async_cow;
 1592		if (atomic_dec_and_test(&async_cow->num_chunks))
 1593			kvfree(async_cow);
 1594		return;
 1595	}
 1596
 1597	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
 1598		PAGE_SHIFT;
 1599
 1600	while (!list_empty(&async_chunk->extents)) {
 1601		async_extent = list_entry(async_chunk->extents.next,
 1602					  struct async_extent, list);
 1603		list_del(&async_extent->list);
 1604		submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
 1605	}
 1606
 1607	/* atomic_sub_return implies a barrier */
 1608	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
 1609	    5 * SZ_1M)
 1610		cond_wake_up_nomb(&fs_info->async_submit_wait);
 1611}
 1612
 1613static bool run_delalloc_compressed(struct btrfs_inode *inode,
 1614				    struct page *locked_page, u64 start,
 1615				    u64 end, struct writeback_control *wbc)
 1616{
 1617	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 1618	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
 1619	struct async_cow *ctx;
 1620	struct async_chunk *async_chunk;
 1621	unsigned long nr_pages;
 1622	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
 1623	int i;
 1624	unsigned nofs_flag;
 1625	const blk_opf_t write_flags = wbc_to_write_flags(wbc);
 1626
 1627	nofs_flag = memalloc_nofs_save();
 1628	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
 1629	memalloc_nofs_restore(nofs_flag);
 1630	if (!ctx)
 1631		return false;
 1632
 1633	unlock_extent(&inode->io_tree, start, end, NULL);
 1634	set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
 1635
 1636	async_chunk = ctx->chunks;
 1637	atomic_set(&ctx->num_chunks, num_chunks);
 1638
 1639	for (i = 0; i < num_chunks; i++) {
 1640		u64 cur_end = min(end, start + SZ_512K - 1);
 1641
 1642		/*
 1643		 * igrab is called higher up in the call chain, take only the
 1644		 * lightweight reference for the callback lifetime
 1645		 */
 1646		ihold(&inode->vfs_inode);
 1647		async_chunk[i].async_cow = ctx;
 1648		async_chunk[i].inode = inode;
 1649		async_chunk[i].start = start;
 1650		async_chunk[i].end = cur_end;
 1651		async_chunk[i].write_flags = write_flags;
 1652		INIT_LIST_HEAD(&async_chunk[i].extents);
 1653
 1654		/*
 1655		 * The locked_page comes all the way from writepage and its
 1656		 * the original page we were actually given.  As we spread
 1657		 * this large delalloc region across multiple async_chunk
 1658		 * structs, only the first struct needs a pointer to locked_page
 1659		 *
 1660		 * This way we don't need racey decisions about who is supposed
 1661		 * to unlock it.
 1662		 */
 1663		if (locked_page) {
 1664			/*
 1665			 * Depending on the compressibility, the pages might or
 1666			 * might not go through async.  We want all of them to
 1667			 * be accounted against wbc once.  Let's do it here
 1668			 * before the paths diverge.  wbc accounting is used
 1669			 * only for foreign writeback detection and doesn't
 1670			 * need full accuracy.  Just account the whole thing
 1671			 * against the first page.
 1672			 */
 1673			wbc_account_cgroup_owner(wbc, locked_page,
 1674						 cur_end - start);
 1675			async_chunk[i].locked_page = locked_page;
 1676			locked_page = NULL;
 1677		} else {
 1678			async_chunk[i].locked_page = NULL;
 1679		}
 1680
 1681		if (blkcg_css != blkcg_root_css) {
 1682			css_get(blkcg_css);
 1683			async_chunk[i].blkcg_css = blkcg_css;
 1684			async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
 1685		} else {
 1686			async_chunk[i].blkcg_css = NULL;
 1687		}
 1688
 1689		btrfs_init_work(&async_chunk[i].work, compress_file_range,
 1690				submit_compressed_extents);
 1691
 1692		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
 1693		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
 1694
 1695		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
 1696
 1697		start = cur_end + 1;
 1698	}
 1699	return true;
 1700}
 1701
 1702/*
 1703 * Run the delalloc range from start to end, and write back any dirty pages
 1704 * covered by the range.
 1705 */
 1706static noinline int run_delalloc_cow(struct btrfs_inode *inode,
 1707				     struct page *locked_page, u64 start,
 1708				     u64 end, struct writeback_control *wbc,
 1709				     bool pages_dirty)
 1710{
 1711	u64 done_offset = end;
 1712	int ret;
 1713
 1714	while (start <= end) {
 1715		ret = cow_file_range(inode, locked_page, start, end, &done_offset,
 1716				     true, false);
 1717		if (ret)
 1718			return ret;
 1719		extent_write_locked_range(&inode->vfs_inode, locked_page, start,
 1720					  done_offset, wbc, pages_dirty);
 1721		start = done_offset + 1;
 1722	}
 1723
 1724	return 1;
 1725}
 1726
 1727static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
 1728					u64 bytenr, u64 num_bytes, bool nowait)
 1729{
 1730	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
 1731	struct btrfs_ordered_sum *sums;
 1732	int ret;
 1733	LIST_HEAD(list);
 1734
 1735	ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
 1736				      &list, 0, nowait);
 1737	if (ret == 0 && list_empty(&list))
 1738		return 0;
 1739
 1740	while (!list_empty(&list)) {
 1741		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
 1742		list_del(&sums->list);
 1743		kfree(sums);
 1744	}
 1745	if (ret < 0)
 1746		return ret;
 1747	return 1;
 1748}
 1749
 1750static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
 1751			   const u64 start, const u64 end)
 1752{
 1753	const bool is_space_ino = btrfs_is_free_space_inode(inode);
 1754	const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
 1755	const u64 range_bytes = end + 1 - start;
 1756	struct extent_io_tree *io_tree = &inode->io_tree;
 1757	u64 range_start = start;
 1758	u64 count;
 1759	int ret;
 1760
 1761	/*
 1762	 * If EXTENT_NORESERVE is set it means that when the buffered write was
 1763	 * made we had not enough available data space and therefore we did not
 1764	 * reserve data space for it, since we though we could do NOCOW for the
 1765	 * respective file range (either there is prealloc extent or the inode
 1766	 * has the NOCOW bit set).
 1767	 *
 1768	 * However when we need to fallback to COW mode (because for example the
 1769	 * block group for the corresponding extent was turned to RO mode by a
 1770	 * scrub or relocation) we need to do the following:
 1771	 *
 1772	 * 1) We increment the bytes_may_use counter of the data space info.
 1773	 *    If COW succeeds, it allocates a new data extent and after doing
 1774	 *    that it decrements the space info's bytes_may_use counter and
 1775	 *    increments its bytes_reserved counter by the same amount (we do
 1776	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
 1777	 *    bytes_may_use counter to compensate (when space is reserved at
 1778	 *    buffered write time, the bytes_may_use counter is incremented);
 1779	 *
 1780	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
 1781	 *    that if the COW path fails for any reason, it decrements (through
 1782	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
 1783	 *    data space info, which we incremented in the step above.
 1784	 *
 1785	 * If we need to fallback to cow and the inode corresponds to a free
 1786	 * space cache inode or an inode of the data relocation tree, we must
 1787	 * also increment bytes_may_use of the data space_info for the same
 1788	 * reason. Space caches and relocated data extents always get a prealloc
 1789	 * extent for them, however scrub or balance may have set the block
 1790	 * group that contains that extent to RO mode and therefore force COW
 1791	 * when starting writeback.
 1792	 */
 1793	count = count_range_bits(io_tree, &range_start, end, range_bytes,
 1794				 EXTENT_NORESERVE, 0, NULL);
 1795	if (count > 0 || is_space_ino || is_reloc_ino) {
 1796		u64 bytes = count;
 1797		struct btrfs_fs_info *fs_info = inode->root->fs_info;
 1798		struct btrfs_space_info *sinfo = fs_info->data_sinfo;
 1799
 1800		if (is_space_ino || is_reloc_ino)
 1801			bytes = range_bytes;
 1802
 1803		spin_lock(&sinfo->lock);
 1804		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
 1805		spin_unlock(&sinfo->lock);
 1806
 1807		if (count > 0)
 1808			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
 1809					 NULL);
 1810	}
 1811
 1812	/*
 1813	 * Don't try to create inline extents, as a mix of inline extent that
 1814	 * is written out and unlocked directly and a normal NOCOW extent
 1815	 * doesn't work.
 1816	 */
 1817	ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
 1818	ASSERT(ret != 1);
 1819	return ret;
 1820}
 1821
/* Input and output arguments of can_nocow_file_extent(). */
struct can_nocow_file_extent_args {
	/* Input fields. */

	/* Start file offset of the range we want to NOCOW. */
	u64 start;
	/* End file offset (inclusive) of the range we want to NOCOW. */
	u64 end;
	/*
	 * When set, a non-zero root->snapshot_force_cow forces COW, except for
	 * free space inodes.
	 */
	bool writeback_path;
	/*
	 * Passed on to btrfs_cross_ref_exist(). When set, the comparison of the
	 * extent's generation against the root's last snapshot is skipped.
	 */
	bool strict;
	/*
	 * Free the path passed to can_nocow_file_extent() once it's not needed
	 * anymore.
	 */
	bool free_path;

	/* Output fields. Only set when can_nocow_file_extent() returns 1. */

	/*
	 * Read from the file extent item and, on success, advanced by
	 * extent_offset and by the distance of @start from the item's key
	 * offset.
	 */
	u64 disk_bytenr;
	/* disk_num_bytes of the file extent item. */
	u64 disk_num_bytes;
	/* Offset field of the file extent item. */
	u64 extent_offset;
	/* Number of bytes that can be written to in NOCOW mode. */
	u64 num_bytes;
};
 1845
 1846/*
 1847 * Check if we can NOCOW the file extent that the path points to.
 1848 * This function may return with the path released, so the caller should check
 1849 * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
 1850 *
 1851 * Returns: < 0 on error
 1852 *            0 if we can not NOCOW
 1853 *            1 if we can NOCOW
 1854 */
 1855static int can_nocow_file_extent(struct btrfs_path *path,
 1856				 struct btrfs_key *key,
 1857				 struct btrfs_inode *inode,
 1858				 struct can_nocow_file_extent_args *args)
 1859{
 1860	const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
 1861	struct extent_buffer *leaf = path->nodes[0];
 1862	struct btrfs_root *root = inode->root;
 1863	struct btrfs_file_extent_item *fi;
 1864	u64 extent_end;
 1865	u8 extent_type;
 1866	int can_nocow = 0;
 1867	int ret = 0;
 1868	bool nowait = path->nowait;
 1869
 1870	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
 1871	extent_type = btrfs_file_extent_type(leaf, fi);
 1872
 1873	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
 1874		goto out;
 1875
 1876	/* Can't access these fields unless we know it's not an inline extent. */
 1877	args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 1878	args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 1879	args->extent_offset = btrfs_file_extent_offset(leaf, fi);
 1880
 1881	if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
 1882	    extent_type == BTRFS_FILE_EXTENT_REG)
 1883		goto out;
 1884
 1885	/*
 1886	 * If the extent was created before the generation where the last snapshot
 1887	 * for its subvolume was created, then this implies the extent is shared,
 1888	 * hence we must COW.
 1889	 */
 1890	if (!args->strict &&
 1891	    btrfs_file_extent_generation(leaf, fi) <=
 1892	    btrfs_root_last_snapshot(&root->root_item))
 1893		goto out;
 1894
 1895	/* An explicit hole, must COW. */
 1896	if (args->disk_bytenr == 0)
 1897		goto out;
 1898
 1899	/* Compressed/encrypted/encoded extents must be COWed. */
 1900	if (btrfs_file_extent_compression(leaf, fi) ||
 1901	    btrfs_file_extent_encryption(leaf, fi) ||
 1902	    btrfs_file_extent_other_encoding(leaf, fi))
 1903		goto out;
 1904
 1905	extent_end = btrfs_file_extent_end(path);
 1906
 1907	/*
 1908	 * The following checks can be expensive, as they need to take other
 1909	 * locks and do btree or rbtree searches, so release the path to avoid
 1910	 * blocking other tasks for too long.
 1911	 */
 1912	btrfs_release_path(path);
 1913
 1914	ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
 1915				    key->offset - args->extent_offset,
 1916				    args->disk_bytenr, args->strict, path);
 1917	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
 1918	if (ret != 0)
 1919		goto out;
 1920
 1921	if (args->free_path) {
 1922		/*
 1923		 * We don't need the path anymore, plus through the
 1924		 * csum_exist_in_range() call below we will end up allocating
 1925		 * another path. So free the path to avoid unnecessary extra
 1926		 * memory usage.
 1927		 */
 1928		btrfs_free_path(path);
 1929		path = NULL;
 1930	}
 1931
 1932	/* If there are pending snapshots for this root, we must COW. */
 1933	if (args->writeback_path && !is_freespace_inode &&
 1934	    atomic_read(&root->snapshot_force_cow))
 1935		goto out;
 1936
 1937	args->disk_bytenr += args->extent_offset;
 1938	args->disk_bytenr += args->start - key->offset;
 1939	args->num_bytes = min(args->end + 1, extent_end) - args->start;
 1940
 1941	/*
 1942	 * Force COW if csums exist in the range. This ensures that csums for a
 1943	 * given extent are either valid or do not exist.
 1944	 */
 1945	ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
 1946				  nowait);
 1947	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
 1948	if (ret != 0)
 1949		goto out;
 1950
 1951	can_nocow = 1;
 1952 out:
 1953	if (args->free_path && path)
 1954		btrfs_free_path(path);
 1955
 1956	return ret < 0 ? ret : can_nocow;
 1957}
 1958
/*
 * NOCOW writeback callback.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 *
 * The range [@start, @end] is inclusive. Sub-ranges that can not be written in
 * place are accumulated, starting at cow_start, and handed to
 * fallback_to_cow().
 *
 * Returns 0 on success, otherwise the error value of the step that failed. On
 * error, the range from cur_offset (or cow_start, if a COW region is pending)
 * up to @end is cleaned up with extent_clear_unlock_delalloc().
 */
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
				       struct page *locked_page,
				       const u64 start, const u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	/* Start of the pending range that must be COWed, (u64)-1 if none. */
	u64 cow_start = (u64)-1;
	/* First file offset that has not been processed yet. */
	u64 cur_offset = start;
	int ret;
	/* Only the very first search steps back to the previous slot. */
	bool check_prev = true;
	u64 ino = btrfs_ino(inode);
	struct can_nocow_file_extent_args nocow_args = { 0 };

	/*
	 * Normally on a zoned device we're only doing COW writes, but
	 * relocation on a zoned filesystem serializes I/O so that we're only
	 * writing sequentially and can end up here as well.
	 */
	ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	nocow_args.end = end;
	nocow_args.writeback_path = true;

	while (1) {
		struct btrfs_block_group *nocow_bg = NULL;
		struct btrfs_ordered_extent *ordered;
		struct btrfs_key found_key;
		struct btrfs_file_extent_item *fi;
		struct extent_buffer *leaf;
		u64 extent_end;
		u64 ram_bytes;
		u64 nocow_end;
		int extent_type;
		bool is_prealloc;

		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
					       cur_offset, 0);
		if (ret < 0)
			goto error;

		/*
		 * If there is no extent for our range when doing the initial
		 * search, then go back to the previous slot as it will be the
		 * one containing the search offset
		 */
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = false;
next_slot:
		/* Go to next leaf if we have exhausted the current one */
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto error;
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* Didn't find anything for our INO */
		if (found_key.objectid > ino)
			break;
		/*
		 * Keep searching until we find an EXTENT_ITEM or there are no
		 * more extents for this inode
		 */
		if (WARN_ON_ONCE(found_key.objectid < ino) ||
		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
			path->slots[0]++;
			goto next_slot;
		}

		/* Found key is not EXTENT_DATA_KEY or starts after req range */
		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		/*
		 * If the found extent starts after requested offset, then
		 * adjust extent_end to be right before this extent begins
		 */
		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto must_cow;
		}

		/*
		 * Found extent which begins before our range and potentially
		 * intersect it
		 */
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);
		/* If this is triggered then we have a memory corruption. */
		ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
		if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
			ret = -EUCLEAN;
			goto error;
		}
		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
		extent_end = btrfs_file_extent_end(path);

		/*
		 * If the extent we got ends before our current offset, skip to
		 * the next extent.
		 */
		if (extent_end <= cur_offset) {
			path->slots[0]++;
			goto next_slot;
		}

		/*
		 * can_nocow_file_extent() may release the path, which is why
		 * everything needed from the leaf was read above.
		 */
		nocow_args.start = cur_offset;
		ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
		if (ret < 0)
			goto error;
		if (ret == 0)
			goto must_cow;

		ret = 0;
		nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
		if (!nocow_bg) {
			/*
			 * Besides the NULL check above, this label is reached
			 * by the two "goto must_cow" statements earlier in the
			 * loop.
			 */
must_cow:
			/*
			 * If we can't perform NOCOW writeback for the range,
			 * then record the beginning of the range that needs to
			 * be COWed.  It will be written out before the next
			 * NOCOW range if we find one, or when exiting this
			 * loop.
			 */
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			/* The path was released, so search again from cur_offset. */
			if (!path->nodes[0])
				continue;
			path->slots[0]++;
			goto next_slot;
		}

		/*
		 * COW range from cow_start to found_key.offset - 1. As the key
		 * will contain the beginning of the first extent that can be
		 * NOCOW, following one which needs to be COW'ed
		 */
		if (cow_start != (u64)-1) {
			ret = fallback_to_cow(inode, locked_page,
					      cow_start, found_key.offset - 1);
			cow_start = (u64)-1;
			if (ret) {
				btrfs_dec_nocow_writers(nocow_bg);
				goto error;
			}
		}

		nocow_end = cur_offset + nocow_args.num_bytes - 1;
		is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
		if (is_prealloc) {
			u64 orig_start = found_key.offset - nocow_args.extent_offset;
			struct extent_map *em;

			em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
					  orig_start,
					  nocow_args.disk_bytenr, /* block_start */
					  nocow_args.num_bytes, /* block_len */
					  nocow_args.disk_num_bytes, /* orig_block_len */
					  ram_bytes, BTRFS_COMPRESS_NONE,
					  BTRFS_ORDERED_PREALLOC);
			if (IS_ERR(em)) {
				btrfs_dec_nocow_writers(nocow_bg);
				ret = PTR_ERR(em);
				goto error;
			}
			free_extent_map(em);
		}

		ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
				nocow_args.num_bytes, nocow_args.num_bytes,
				nocow_args.disk_bytenr, nocow_args.num_bytes, 0,
				is_prealloc
				? (1 << BTRFS_ORDERED_PREALLOC)
				: (1 << BTRFS_ORDERED_NOCOW),
				BTRFS_COMPRESS_NONE);
		btrfs_dec_nocow_writers(nocow_bg);
		if (IS_ERR(ordered)) {
			/* Drop the extent map created above for the prealloc case. */
			if (is_prealloc) {
				btrfs_drop_extent_map_range(inode, cur_offset,
							    nocow_end, false);
			}
			ret = PTR_ERR(ordered);
			goto error;
		}

		if (btrfs_is_data_reloc_root(root))
			/*
			 * Error handled later, as we must prevent
			 * extent_clear_unlock_delalloc() in error handler
			 * from freeing metadata of created ordered extent.
			 */
			ret = btrfs_reloc_clone_csums(ordered);
		btrfs_put_ordered_extent(ordered);

		extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC |
					     EXTENT_CLEAR_DATA_RESV,
					     PAGE_UNLOCK | PAGE_SET_ORDERED);

		cur_offset = extent_end;

		/*
		 * btrfs_reloc_clone_csums() error, now we're OK to call error
		 * handler, as metadata for created ordered extent will only
		 * be freed by btrfs_finish_ordered_io().
		 */
		if (ret)
			goto error;
		if (cur_offset > end)
			break;
	}
	btrfs_release_path(path);

	/* Whatever is left of the range after the loop has to be COWed. */
	if (cur_offset <= end && cow_start == (u64)-1)
		cow_start = cur_offset;

	if (cow_start != (u64)-1) {
		/*
		 * With cur_offset == end and cow_start reset, the error path
		 * below does no range cleanup of its own if this call fails.
		 * NOTE(review): presumably fallback_to_cow() cleans up its own
		 * range on failure - confirm.
		 */
		cur_offset = end;
		ret = fallback_to_cow(inode, locked_page, cow_start, end);
		cow_start = (u64)-1;
		if (ret)
			goto error;
	}

	btrfs_free_path(path);
	return 0;

error:
	/*
	 * If an error happened while a COW region is outstanding, cur_offset
	 * needs to be reset to cow_start to ensure the COW region is unlocked
	 * as well.
	 */
	if (cow_start != (u64)-1)
		cur_offset = cow_start;
	if (cur_offset < end)
		extent_clear_unlock_delalloc(inode, cur_offset, end,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC | EXTENT_DEFRAG |
					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
					     PAGE_START_WRITEBACK |
					     PAGE_END_WRITEBACK);
	btrfs_free_path(path);
	return ret;
}
 2237
 2238static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
 2239{
 2240	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
 2241		if (inode->defrag_bytes &&
 2242		    test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
 2243			return false;
 2244		return true;
 2245	}
 2246	return false;
 2247}
 2248
 2249/*
 2250 * Function to process delayed allocation (create CoW) for ranges which are
 2251 * being touched for the first time.
 2252 */
 2253int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
 2254			     u64 start, u64 end, struct writeback_control *wbc)
 2255{
 2256	const bool zoned = btrfs_is_zoned(inode->root->fs_info);
 2257	int ret;
 2258
 2259	/*
 2260	 * The range must cover part of the @locked_page, or a return of 1
 2261	 * can confuse the caller.
 2262	 */
 2263	ASSERT(!(end <= page_offset(locked_page) ||
 2264		 start >= page_offset(locked_page) + PAGE_SIZE));
 2265
 2266	if (should_nocow(inode, start, end)) {
 2267		ret = run_delalloc_nocow(inode, locked_page, start, end);
 2268		goto out;
 2269	}
 2270
 2271	if (btrfs_inode_can_compress(inode) &&
 2272	    inode_need_compress(inode, start, end) &&
 2273	    run_delalloc_compressed(inode, locked_page, start, end, wbc))
 2274		return 1;
 2275
 2276	if (zoned)
 2277		ret = run_delalloc_cow(inode, locked_page, start, end, wbc,
 2278				       true);
 2279	else
 2280		ret = cow_file_range(inode, locked_page, start, end, NULL,
 2281				     false, false);
 2282
 2283out:
 2284	if (ret < 0)
 2285		btrfs_cleanup_ordered_extents(inode, locked_page, start,
 2286					      end - start + 1);
 2287	return ret;
 2288}
 2289
 2290void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
 2291				 struct extent_state *orig, u64 split)
 2292{
 2293	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2294	u64 size;
 2295
 2296	/* not delalloc, ignore it */
 2297	if (!(orig->state & EXTENT_DELALLOC))
 2298		return;
 2299
 2300	size = orig->end - orig->start + 1;
 2301	if (size > fs_info->max_extent_size) {
 2302		u32 num_extents;
 2303		u64 new_size;
 2304
 2305		/*
 2306		 * See the explanation in btrfs_merge_delalloc_extent, the same
 2307		 * applies here, just in reverse.
 2308		 */
 2309		new_size = orig->end - split + 1;
 2310		num_extents = count_max_extents(fs_info, new_size);
 2311		new_size = split - orig->start;
 2312		num_extents += count_max_extents(fs_info, new_size);
 2313		if (count_max_extents(fs_info, size) >= num_extents)
 2314			return;
 2315	}
 2316
 2317	spin_lock(&inode->lock);
 2318	btrfs_mod_outstanding_extents(inode, 1);
 2319	spin_unlock(&inode->lock);
 2320}
 2321
 2322/*
 2323 * Handle merged delayed allocation extents so we can keep track of new extents
 2324 * that are just merged onto old extents, such as when we are doing sequential
 2325 * writes, so we can properly account for the metadata space we'll need.
 2326 */
 2327void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
 2328				 struct extent_state *other)
 2329{
 2330	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2331	u64 new_size, old_size;
 2332	u32 num_extents;
 2333
 2334	/* not delalloc, ignore it */
 2335	if (!(other->state & EXTENT_DELALLOC))
 2336		return;
 2337
 2338	if (new->start > other->start)
 2339		new_size = new->end - other->start + 1;
 2340	else
 2341		new_size = other->end - new->start + 1;
 2342
 2343	/* we're not bigger than the max, unreserve the space and go */
 2344	if (new_size <= fs_info->max_extent_size) {
 2345		spin_lock(&inode->lock);
 2346		btrfs_mod_outstanding_extents(inode, -1);
 2347		spin_unlock(&inode->lock);
 2348		return;
 2349	}
 2350
 2351	/*
 2352	 * We have to add up either side to figure out how many extents were
 2353	 * accounted for before we merged into one big extent.  If the number of
 2354	 * extents we accounted for is <= the amount we need for the new range
 2355	 * then we can return, otherwise drop.  Think of it like this
 2356	 *
 2357	 * [ 4k][MAX_SIZE]
 2358	 *
 2359	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
 2360	 * need 2 outstanding extents, on one side we have 1 and the other side
 2361	 * we have 1 so they are == and we can return.  But in this case
 2362	 *
 2363	 * [MAX_SIZE+4k][MAX_SIZE+4k]
 2364	 *
 2365	 * Each range on their own accounts for 2 extents, but merged together
 2366	 * they are only 3 extents worth of accounting, so we need to drop in
 2367	 * this case.
 2368	 */
 2369	old_size = other->end - other->start + 1;
 2370	num_extents = count_max_extents(fs_info, old_size);
 2371	old_size = new->end - new->start + 1;
 2372	num_extents += count_max_extents(fs_info, old_size);
 2373	if (count_max_extents(fs_info, new_size) >= num_extents)
 2374		return;
 2375
 2376	spin_lock(&inode->lock);
 2377	btrfs_mod_outstanding_extents(inode, -1);
 2378	spin_unlock(&inode->lock);
 2379}
 2380
 2381static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
 2382				      struct btrfs_inode *inode)
 2383{
 2384	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2385
 2386	spin_lock(&root->delalloc_lock);
 2387	if (list_empty(&inode->delalloc_inodes)) {
 2388		list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
 2389		set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
 2390		root->nr_delalloc_inodes++;
 2391		if (root->nr_delalloc_inodes == 1) {
 2392			spin_lock(&fs_info->delalloc_root_lock);
 2393			BUG_ON(!list_empty(&root->delalloc_root));
 2394			list_add_tail(&root->delalloc_root,
 2395				      &fs_info->delalloc_roots);
 2396			spin_unlock(&fs_info->delalloc_root_lock);
 2397		}
 2398	}
 2399	spin_unlock(&root->delalloc_lock);
 2400}
 2401
 2402void __btrfs_del_delalloc_inode(struct btrfs_root *root,
 2403				struct btrfs_inode *inode)
 2404{
 2405	struct btrfs_fs_info *fs_info = root->fs_info;
 2406
 2407	if (!list_empty(&inode->delalloc_inodes)) {
 2408		list_del_init(&inode->delalloc_inodes);
 2409		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
 2410			  &inode->runtime_flags);
 2411		root->nr_delalloc_inodes--;
 2412		if (!root->nr_delalloc_inodes) {
 2413			ASSERT(list_empty(&root->delalloc_inodes));
 2414			spin_lock(&fs_info->delalloc_root_lock);
 2415			BUG_ON(list_empty(&root->delalloc_root));
 2416			list_del_init(&root->delalloc_root);
 2417			spin_unlock(&fs_info->delalloc_root_lock);
 2418		}
 2419	}
 2420}
 2421
 2422static void btrfs_del_delalloc_inode(struct btrfs_root *root,
 2423				     struct btrfs_inode *inode)
 2424{
 2425	spin_lock(&root->delalloc_lock);
 2426	__btrfs_del_delalloc_inode(root, inode);
 2427	spin_unlock(&root->delalloc_lock);
 2428}
 2429
 2430/*
 2431 * Properly track delayed allocation bytes in the inode and to maintain the
 2432 * list of inodes that have pending delalloc work to be done.
 2433 */
 2434void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
 2435			       u32 bits)
 2436{
 2437	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2438
 2439	if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
 2440		WARN_ON(1);
 2441	/*
 2442	 * set_bit and clear bit hooks normally require _irqsave/restore
 2443	 * but in this case, we are only testing for the DELALLOC
 2444	 * bit, which is only set or cleared with irqs on
 2445	 */
 2446	if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 2447		struct btrfs_root *root = inode->root;
 2448		u64 len = state->end + 1 - state->start;
 2449		u32 num_extents = count_max_extents(fs_info, len);
 2450		bool do_list = !btrfs_is_free_space_inode(inode);
 2451
 2452		spin_lock(&inode->lock);
 2453		btrfs_mod_outstanding_extents(inode, num_extents);
 2454		spin_unlock(&inode->lock);
 2455
 2456		/* For sanity tests */
 2457		if (btrfs_is_testing(fs_info))
 2458			return;
 2459
 2460		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
 2461					 fs_info->delalloc_batch);
 2462		spin_lock(&inode->lock);
 2463		inode->delalloc_bytes += len;
 2464		if (bits & EXTENT_DEFRAG)
 2465			inode->defrag_bytes += len;
 2466		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
 2467					 &inode->runtime_flags))
 2468			btrfs_add_delalloc_inodes(root, inode);
 2469		spin_unlock(&inode->lock);
 2470	}
 2471
 2472	if (!(state->state & EXTENT_DELALLOC_NEW) &&
 2473	    (bits & EXTENT_DELALLOC_NEW)) {
 2474		spin_lock(&inode->lock);
 2475		inode->new_delalloc_bytes += state->end + 1 - state->start;
 2476		spin_unlock(&inode->lock);
 2477	}
 2478}
 2479
 2480/*
 2481 * Once a range is no longer delalloc this function ensures that proper
 2482 * accounting happens.
 2483 */
 2484void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
 2485				 struct extent_state *state, u32 bits)
 2486{
 2487	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2488	u64 len = state->end + 1 - state->start;
 2489	u32 num_extents = count_max_extents(fs_info, len);
 2490
 2491	if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
 2492		spin_lock(&inode->lock);
 2493		inode->defrag_bytes -= len;
 2494		spin_unlock(&inode->lock);
 2495	}
 2496
 2497	/*
 2498	 * set_bit and clear bit hooks normally require _irqsave/restore
 2499	 * but in this case, we are only testing for the DELALLOC
 2500	 * bit, which is only set or cleared with irqs on
 2501	 */
 2502	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 2503		struct btrfs_root *root = inode->root;
 2504		bool do_list = !btrfs_is_free_space_inode(inode);
 2505
 2506		spin_lock(&inode->lock);
 2507		btrfs_mod_outstanding_extents(inode, -num_extents);
 2508		spin_unlock(&inode->lock);
 2509
 2510		/*
 2511		 * We don't reserve metadata space for space cache inodes so we
 2512		 * don't need to call delalloc_release_metadata if there is an
 2513		 * error.
 2514		 */
 2515		if (bits & EXTENT_CLEAR_META_RESV &&
 2516		    root != fs_info->tree_root)
 2517			btrfs_delalloc_release_metadata(inode, len, false);
 2518
 2519		/* For sanity tests. */
 2520		if (btrfs_is_testing(fs_info))
 2521			return;
 2522
 2523		if (!btrfs_is_data_reloc_root(root) &&
 2524		    do_list && !(state->state & EXTENT_NORESERVE) &&
 2525		    (bits & EXTENT_CLEAR_DATA_RESV))
 2526			btrfs_free_reserved_data_space_noquota(fs_info, len);
 2527
 2528		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
 2529					 fs_info->delalloc_batch);
 2530		spin_lock(&inode->lock);
 2531		inode->delalloc_bytes -= len;
 2532		if (do_list && inode->delalloc_bytes == 0 &&
 2533		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
 2534					&inode->runtime_flags))
 2535			btrfs_del_delalloc_inode(root, inode);
 2536		spin_unlock(&inode->lock);
 2537	}
 2538
 2539	if ((state->state & EXTENT_DELALLOC_NEW) &&
 2540	    (bits & EXTENT_DELALLOC_NEW)) {
 2541		spin_lock(&inode->lock);
 2542		ASSERT(inode->new_delalloc_bytes >= len);
 2543		inode->new_delalloc_bytes -= len;
 2544		if (bits & EXTENT_ADD_INODE_BYTES)
 2545			inode_add_bytes(&inode->vfs_inode, len);
 2546		spin_unlock(&inode->lock);
 2547	}
 2548}
 2549
 2550static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
 2551					struct btrfs_ordered_extent *ordered)
 2552{
 2553	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
 2554	u64 len = bbio->bio.bi_iter.bi_size;
 2555	struct btrfs_ordered_extent *new;
 2556	int ret;
 2557
 2558	/* Must always be called for the beginning of an ordered extent. */
 2559	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
 2560		return -EINVAL;
 2561
 2562	/* No need to split if the ordered extent covers the entire bio. */
 2563	if (ordered->disk_num_bytes == len) {
 2564		refcount_inc(&ordered->refs);
 2565		bbio->ordered = ordered;
 2566		return 0;
 2567	}
 2568
 2569	/*
 2570	 * Don't split the extent_map for NOCOW extents, as we're writing into
 2571	 * a pre-existing one.
 2572	 */
 2573	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
 2574		ret = split_extent_map(bbio->inode, bbio->file_offset,
 2575				       ordered->num_bytes, len,
 2576				       ordered->disk_bytenr);
 2577		if (ret)
 2578			return ret;
 2579	}
 2580
 2581	new = btrfs_split_ordered_extent(ordered, len);
 2582	if (IS_ERR(new))
 2583		return PTR_ERR(new);
 2584	bbio->ordered = new;
 2585	return 0;
 2586}
 2587
 2588/*
 2589 * given a list of ordered sums record them in the inode.  This happens
 2590 * at IO completion time based on sums calculated at bio submission time.
 2591 */
 2592static int add_pending_csums(struct btrfs_trans_handle *trans,
 2593			     struct list_head *list)
 2594{
 2595	struct btrfs_ordered_sum *sum;
 2596	struct btrfs_root *csum_root = NULL;
 2597	int ret;
 2598
 2599	list_for_each_entry(sum, list, list) {
 2600		trans->adding_csums = true;
 2601		if (!csum_root)
 2602			csum_root = btrfs_csum_root(trans->fs_info,
 2603						    sum->logical);
 2604		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
 2605		trans->adding_csums = false;
 2606		if (ret)
 2607			return ret;
 2608	}
 2609	return 0;
 2610}
 2611
 2612static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
 2613					 const u64 start,
 2614					 const u64 len,
 2615					 struct extent_state **cached_state)
 2616{
 2617	u64 search_start = start;
 2618	const u64 end = start + len - 1;
 2619
 2620	while (search_start < end) {
 2621		const u64 search_len = end - search_start + 1;
 2622		struct extent_map *em;
 2623		u64 em_len;
 2624		int ret = 0;
 2625
 2626		em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
 2627		if (IS_ERR(em))
 2628			return PTR_ERR(em);
 2629
 2630		if (em->block_start != EXTENT_MAP_HOLE)
 2631			goto next;
 2632
 2633		em_len = em->len;
 2634		if (em->start < search_start)
 2635			em_len -= search_start - em->start;
 2636		if (em_len > search_len)
 2637			em_len = search_len;
 2638
 2639		ret = set_extent_bit(&inode->io_tree, search_start,
 2640				     search_start + em_len - 1,
 2641				     EXTENT_DELALLOC_NEW, cached_state);
 2642next:
 2643		search_start = extent_map_end(em);
 2644		free_extent_map(em);
 2645		if (ret)
 2646			return ret;
 2647	}
 2648	return 0;
 2649}
 2650
 2651int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 2652			      unsigned int extra_bits,
 2653			      struct extent_state **cached_state)
 2654{
 2655	WARN_ON(PAGE_ALIGNED(end));
 2656
 2657	if (start >= i_size_read(&inode->vfs_inode) &&
 2658	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
 2659		/*
 2660		 * There can't be any extents following eof in this case so just
 2661		 * set the delalloc new bit for the range directly.
 2662		 */
 2663		extra_bits |= EXTENT_DELALLOC_NEW;
 2664	} else {
 2665		int ret;
 2666
 2667		ret = btrfs_find_new_delalloc_bytes(inode, start,
 2668						    end + 1 - start,
 2669						    cached_state);
 2670		if (ret)
 2671			return ret;
 2672	}
 2673
 2674	return set_extent_bit(&inode->io_tree, start, end,
 2675			      EXTENT_DELALLOC | extra_bits, cached_state);
 2676}
 2677
/*
 * Work item processed by btrfs_writepage_fixup_worker().
 *
 * see btrfs_writepage_start_hook for details on why this is required
 */
struct btrfs_writepage_fixup {
	/* Page to fix up; a reference on it is taken before the fixup is queued. */
	struct page *page;
	/* Inode the worker reserves delalloc space for and sets delalloc on. */
	struct btrfs_inode *inode;
	/* Embedded work; the worker gets back to this struct via container_of(). */
	struct btrfs_work work;
};
 2684
 2685static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 2686{
 2687	struct btrfs_writepage_fixup *fixup =
 2688		container_of(work, struct btrfs_writepage_fixup, work);
 2689	struct btrfs_ordered_extent *ordered;
 2690	struct extent_state *cached_state = NULL;
 2691	struct extent_changeset *data_reserved = NULL;
 2692	struct page *page = fixup->page;
 2693	struct btrfs_inode *inode = fixup->inode;
 2694	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 2695	u64 page_start = page_offset(page);
 2696	u64 page_end = page_offset(page) + PAGE_SIZE - 1;
 2697	int ret = 0;
 2698	bool free_delalloc_space = true;
 2699
 2700	/*
 2701	 * This is similar to page_mkwrite, we need to reserve the space before
 2702	 * we take the page lock.
 2703	 */
 2704	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
 2705					   PAGE_SIZE);
 2706again:
 2707	lock_page(page);
 2708
 2709	/*
 2710	 * Before we queued this fixup, we took a reference on the page.
 2711	 * page->mapping may go NULL, but it shouldn't be moved to a different
 2712	 * address space.
 2713	 */
 2714	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
 2715		/*
 2716		 * Unfortunately this is a little tricky, either
 2717		 *
 2718		 * 1) We got here and our page had already been dealt with and
 2719		 *    we reserved our space, thus ret == 0, so we need to just
 2720		 *    drop our space reservation and bail.  This can happen the
 2721		 *    first time we come into the fixup worker, or could happen
 2722		 *    while waiting for the ordered extent.
 2723		 * 2) Our page was already dealt with, but we happened to get an
 2724		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
 2725		 *    this case we obviously don't have anything to release, but
 2726		 *    because the page was already dealt with we don't want to
 2727		 *    mark the page with an error, so make sure we're resetting
 2728		 *    ret to 0.  This is why we have this check _before_ the ret
 2729		 *    check, because we do not want to have a surprise ENOSPC
 2730		 *    when the page was already properly dealt with.
 2731		 */
 2732		if (!ret) {
 2733			btrfs_delalloc_release_extents(inode, PAGE_SIZE);
 2734			btrfs_delalloc_release_space(inode, data_reserved,
 2735						     page_start, PAGE_SIZE,
 2736						     true);
 2737		}
 2738		ret = 0;
 2739		goto out_page;
 2740	}
 2741
 2742	/*
 2743	 * We can't mess with the page state unless it is locked, so now that
 2744	 * it is locked bail if we failed to make our space reservation.
 2745	 */
 2746	if (ret)
 2747		goto out_page;
 2748
 2749	lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
 2750
 2751	/* already ordered? We're done */
 2752	if (PageOrdered(page))
 2753		goto out_reserved;
 2754
 2755	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
 2756	if (ordered) {
 2757		unlock_extent(&inode->io_tree, page_start, page_end,
 2758			      &cached_state);
 2759		unlock_page(page);
 2760		btrfs_start_ordered_extent(ordered);
 2761		btrfs_put_ordered_extent(ordered);
 2762		goto again;
 2763	}
 2764
 2765	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
 2766					&cached_state);
 2767	if (ret)
 2768		goto out_reserved;
 2769
 2770	/*
 2771	 * Everything went as planned, we're now the owner of a dirty page with
 2772	 * delayed allocation bits set and space reserved for our COW
 2773	 * destination.
 2774	 *
 2775	 * The page was dirty when we started, nothing should have cleaned it.
 2776	 */
 2777	BUG_ON(!PageDirty(page));
 2778	free_delalloc_space = false;
 2779out_reserved:
 2780	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
 2781	if (free_delalloc_space)
 2782		btrfs_delalloc_release_space(inode, data_reserved, page_start,
 2783					     PAGE_SIZE, true);
 2784	unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
 2785out_page:
 2786	if (ret) {
 2787		/*
 2788		 * We hit ENOSPC or other errors.  Update the mapping and page
 2789		 * to reflect the errors and clean the page.
 2790		 */
 2791		mapping_set_error(page->mapping, ret);
 2792		btrfs_mark_ordered_io_finished(inode, page, page_start,
 2793					       PAGE_SIZE, !ret);
 2794		clear_page_dirty_for_io(page);
 2795	}
 2796	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
 2797	unlock_page(page);
 2798	put_page(page);
 2799	kfree(fixup);
 2800	extent_changeset_free(data_reserved);
 2801	/*
 2802	 * As a precaution, do a delayed iput in case it would be the last iput
 2803	 * that could need flushing space. Recursing back to fixup worker would
 2804	 * deadlock.
 2805	 */
 2806	btrfs_add_delayed_iput(inode);
 2807}
 2808
 2809/*
 2810 * There are a few paths in the higher layers of the kernel that directly
 2811 * set the page dirty bit without asking the filesystem if it is a
 2812 * good idea.  This causes problems because we want to make sure COW
 2813 * properly happens and the data=ordered rules are followed.
 2814 *
 2815 * In our case any range that doesn't have the ORDERED bit set
 2816 * hasn't been properly setup for IO.  We kick off an async process
 2817 * to fix it up.  The async helper will wait for ordered extents, set
 2818 * the delalloc bit and make it safe to write the page.
 2819 */
 2820int btrfs_writepage_cow_fixup(struct page *page)
 2821{
 2822	struct inode *inode = page->mapping->host;
 2823	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 2824	struct btrfs_writepage_fixup *fixup;
 2825
 2826	/* This page has ordered extent covering it already */
 2827	if (PageOrdered(page))
 2828		return 0;
 2829
 2830	/*
 2831	 * PageChecked is set below when we create a fixup worker for this page,
 2832	 * don't try to create another one if we're already PageChecked()
 2833	 *
 2834	 * The extent_io writepage code will redirty the page if we send back
 2835	 * EAGAIN.
 2836	 */
 2837	if (PageChecked(page))
 2838		return -EAGAIN;
 2839
 2840	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
 2841	if (!fixup)
 2842		return -EAGAIN;
 2843
 2844	/*
 2845	 * We are already holding a reference to this inode from
 2846	 * write_cache_pages.  We need to hold it because the space reservation
 2847	 * takes place outside of the page lock, and we can't trust
 2848	 * page->mapping outside of the page lock.
 2849	 */
 2850	ihold(inode);
 2851	btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
 2852	get_page(page);
 2853	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
 2854	fixup->page = page;
 2855	fixup->inode = BTRFS_I(inode);
 2856	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
 2857
 2858	return -EAGAIN;
 2859}
 2860
 2861static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 2862				       struct btrfs_inode *inode, u64 file_pos,
 2863				       struct btrfs_file_extent_item *stack_fi,
 2864				       const bool update_inode_bytes,
 2865				       u64 qgroup_reserved)
 2866{
 2867	struct btrfs_root *root = inode->root;
 2868	const u64 sectorsize = root->fs_info->sectorsize;
 2869	struct btrfs_path *path;
 2870	struct extent_buffer *leaf;
 2871	struct btrfs_key ins;
 2872	u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
 2873	u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
 2874	u64 offset = btrfs_stack_file_extent_offset(stack_fi);
 2875	u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
 2876	u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
 2877	struct btrfs_drop_extents_args drop_args = { 0 };
 2878	int ret;
 2879
 2880	path = btrfs_alloc_path();
 2881	if (!path)
 2882		return -ENOMEM;
 2883
 2884	/*
 2885	 * we may be replacing one extent in the tree with another.
 2886	 * The new extent is pinned in the extent map, and we don't want
 2887	 * to drop it from the cache until it is completely in the btree.
 2888	 *
 2889	 * So, tell btrfs_drop_extents to leave this extent in the cache.
 2890	 * the caller is expected to unpin it and allow it to be merged
 2891	 * with the others.
 2892	 */
 2893	drop_args.path = path;
 2894	drop_args.start = file_pos;
 2895	drop_args.end = file_pos + num_bytes;
 2896	drop_args.replace_extent = true;
 2897	drop_args.extent_item_size = sizeof(*stack_fi);
 2898	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
 2899	if (ret)
 2900		goto out;
 2901
 2902	if (!drop_args.extent_inserted) {
 2903		ins.objectid = btrfs_ino(inode);
 2904		ins.offset = file_pos;
 2905		ins.type = BTRFS_EXTENT_DATA_KEY;
 2906
 2907		ret = btrfs_insert_empty_item(trans, root, path, &ins,
 2908					      sizeof(*stack_fi));
 2909		if (ret)
 2910			goto out;
 2911	}
 2912	leaf = path->nodes[0];
 2913	btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
 2914	write_extent_buffer(leaf, stack_fi,
 2915			btrfs_item_ptr_offset(leaf, path->slots[0]),
 2916			sizeof(struct btrfs_file_extent_item));
 2917
 2918	btrfs_mark_buffer_dirty(trans, leaf);
 2919	btrfs_release_path(path);
 2920
 2921	/*
 2922	 * If we dropped an inline extent here, we know the range where it is
 2923	 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
 2924	 * number of bytes only for that range containing the inline extent.
 2925	 * The remaining of the range will be processed when clearning the
 2926	 * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
 2927	 */
 2928	if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
 2929		u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
 2930
 2931		inline_size = drop_args.bytes_found - inline_size;
 2932		btrfs_update_inode_bytes(inode, sectorsize, inline_size);
 2933		drop_args.bytes_found -= inline_size;
 2934		num_bytes -= sectorsize;
 2935	}
 2936
 2937	if (update_inode_bytes)
 2938		btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
 2939
 2940	ins.objectid = disk_bytenr;
 2941	ins.offset = disk_num_bytes;
 2942	ins.type = BTRFS_EXTENT_ITEM_KEY;
 2943
 2944	ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
 2945	if (ret)
 2946		goto out;
 2947
 2948	ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
 2949					       file_pos - offset,
 2950					       qgroup_reserved, &ins);
 2951out:
 2952	btrfs_free_path(path);
 2953
 2954	return ret;
 2955}
 2956
 2957static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
 2958					 u64 start, u64 len)
 2959{
 2960	struct btrfs_block_group *cache;
 2961
 2962	cache = btrfs_lookup_block_group(fs_info, start);
 2963	ASSERT(cache);
 2964
 2965	spin_lock(&cache->lock);
 2966	cache->delalloc_bytes -= len;
 2967	spin_unlock(&cache->lock);
 2968
 2969	btrfs_put_block_group(cache);
 2970}
 2971
 2972static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
 2973					     struct btrfs_ordered_extent *oe)
 2974{
 2975	struct btrfs_file_extent_item stack_fi;
 2976	bool update_inode_bytes;
 2977	u64 num_bytes = oe->num_bytes;
 2978	u64 ram_bytes = oe->ram_bytes;
 2979
 2980	memset(&stack_fi, 0, sizeof(stack_fi));
 2981	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
 2982	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
 2983	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
 2984						   oe->disk_num_bytes);
 2985	btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
 2986	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
 2987		num_bytes = oe->truncated_len;
 2988		ram_bytes = num_bytes;
 2989	}
 2990	btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
 2991	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
 2992	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
 2993	/* Encryption and other encoding is reserved and all 0 */
 2994
 2995	/*
 2996	 * For delalloc, when completing an ordered extent we update the inode's
 2997	 * bytes when clearing the range in the inode's io tree, so pass false
 2998	 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
 2999	 * except if the ordered extent was truncated.
 3000	 */
 3001	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
 3002			     test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
 3003			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
 3004
 3005	return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
 3006					   oe->file_offset, &stack_fi,
 3007					   update_inode_bytes, oe->qgroup_rsv);
 3008}
 3009
 3010/*
 3011 * As ordered data IO finishes, this gets called so we can finish
 3012 * an ordered extent if the range of bytes in the file it covers are
 3013 * fully written.
 3014 */
 3015int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
 3016{
 3017	struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
 3018	struct btrfs_root *root = inode->root;
 3019	struct btrfs_fs_info *fs_info = root->fs_info;
 3020	struct btrfs_trans_handle *trans = NULL;
 3021	struct extent_io_tree *io_tree = &inode->io_tree;
 3022	struct extent_state *cached_state = NULL;
 3023	u64 start, end;
 3024	int compress_type = 0;
 3025	int ret = 0;
 3026	u64 logical_len = ordered_extent->num_bytes;
 3027	bool freespace_inode;
 3028	bool truncated = false;
 3029	bool clear_reserved_extent = true;
 3030	unsigned int clear_bits = EXTENT_DEFRAG;
 3031
 3032	start = ordered_extent->file_offset;
 3033	end = start + ordered_extent->num_bytes - 1;
 3034
 3035	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
 3036	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
 3037	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
 3038	    !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
 3039		clear_bits |= EXTENT_DELALLOC_NEW;
 3040
 3041	freespace_inode = btrfs_is_free_space_inode(inode);
 3042	if (!freespace_inode)
 3043		btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
 3044
 3045	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
 3046		ret = -EIO;
 3047		goto out;
 3048	}
 3049
 3050	if (btrfs_is_zoned(fs_info))
 3051		btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
 3052					ordered_extent->disk_num_bytes);
 3053
 3054	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
 3055		truncated = true;
 3056		logical_len = ordered_extent->truncated_len;
 3057		/* Truncated the entire extent, don't bother adding */
 3058		if (!logical_len)
 3059			goto out;
 3060	}
 3061
 3062	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
 3063		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
 3064
 3065		btrfs_inode_safe_disk_i_size_write(inode, 0);
 3066		if (freespace_inode)
 3067			trans = btrfs_join_transaction_spacecache(root);
 3068		else
 3069			trans = btrfs_join_transaction(root);
 3070		if (IS_ERR(trans)) {
 3071			ret = PTR_ERR(trans);
 3072			trans = NULL;
 3073			goto out;
 3074		}
 3075		trans->block_rsv = &inode->block_rsv;
 3076		ret = btrfs_update_inode_fallback(trans, inode);
 3077		if (ret) /* -ENOMEM or corruption */
 3078			btrfs_abort_transaction(trans, ret);
 3079		goto out;
 3080	}
 3081
 3082	clear_bits |= EXTENT_LOCKED;
 3083	lock_extent(io_tree, start, end, &cached_state);
 3084
 3085	if (freespace_inode)
 3086		trans = btrfs_join_transaction_spacecache(root);
 3087	else
 3088		trans = btrfs_join_transaction(root);
 3089	if (IS_ERR(trans)) {
 3090		ret = PTR_ERR(trans);
 3091		trans = NULL;
 3092		goto out;
 3093	}
 3094
 3095	trans->block_rsv = &inode->block_rsv;
 3096
 3097	ret = btrfs_insert_raid_extent(trans, ordered_extent);
 3098	if (ret)
 3099		goto out;
 3100
 3101	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
 3102		compress_type = ordered_extent->compress_type;
 3103	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
 3104		BUG_ON(compress_type);
 3105		ret = btrfs_mark_extent_written(trans, inode,
 3106						ordered_extent->file_offset,
 3107						ordered_extent->file_offset +
 3108						logical_len);
 3109		btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
 3110						  ordered_extent->disk_num_bytes);
 3111	} else {
 3112		BUG_ON(root == fs_info->tree_root);
 3113		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
 3114		if (!ret) {
 3115			clear_reserved_extent = false;
 3116			btrfs_release_delalloc_bytes(fs_info,
 3117						ordered_extent->disk_bytenr,
 3118						ordered_extent->disk_num_bytes);
 3119		}
 3120	}
 3121	unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
 3122			   ordered_extent->num_bytes, trans->transid);
 3123	if (ret < 0) {
 3124		btrfs_abort_transaction(trans, ret);
 3125		goto out;
 3126	}
 3127
 3128	ret = add_pending_csums(trans, &ordered_extent->list);
 3129	if (ret) {
 3130		btrfs_abort_transaction(trans, ret);
 3131		goto out;
 3132	}
 3133
 3134	/*
 3135	 * If this is a new delalloc range, clear its new delalloc flag to
 3136	 * update the inode's number of bytes. This needs to be done first
 3137	 * before updating the inode item.
 3138	 */
 3139	if ((clear_bits & EXTENT_DELALLOC_NEW) &&
 3140	    !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
 3141		clear_extent_bit(&inode->io_tree, start, end,
 3142				 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
 3143				 &cached_state);
 3144
 3145	btrfs_inode_safe_disk_i_size_write(inode, 0);
 3146	ret = btrfs_update_inode_fallback(trans, inode);
 3147	if (ret) { /* -ENOMEM or corruption */
 3148		btrfs_abort_transaction(trans, ret);
 3149		goto out;
 3150	}
 3151	ret = 0;
 3152out:
 3153	clear_extent_bit(&inode->io_tree, start, end, clear_bits,
 3154			 &cached_state);
 3155
 3156	if (trans)
 3157		btrfs_end_transaction(trans);
 3158
 3159	if (ret || truncated) {
 3160		u64 unwritten_start = start;
 3161
 3162		/*
 3163		 * If we failed to finish this ordered extent for any reason we
 3164		 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
 3165		 * extent, and mark the inode with the error if it wasn't
 3166		 * already set.  Any error during writeback would have already
 3167		 * set the mapping error, so we need to set it if we're the ones
 3168		 * marking this ordered extent as failed.
 3169		 */
 3170		if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
 3171					     &ordered_extent->flags))
 3172			mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
 3173
 3174		if (truncated)
 3175			unwritten_start += logical_len;
 3176		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
 3177
 3178		/* Drop extent maps for the part of the extent we didn't write. */
 3179		btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
 3180
 3181		/*
 3182		 * If the ordered extent had an IOERR or something else went
 3183		 * wrong we need to return the space for this ordered extent
 3184		 * back to the allocator.  We only free the extent in the
 3185		 * truncated case if we didn't write out the extent at all.
 3186		 *
 3187		 * If we made it past insert_reserved_file_extent before we
 3188		 * errored out then we don't need to do this as the accounting
 3189		 * has already been done.
 3190		 */
 3191		if ((ret || !logical_len) &&
 3192		    clear_reserved_extent &&
 3193		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
 3194		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
 3195			/*
 3196			 * Discard the range before returning it back to the
 3197			 * free space pool
 3198			 */
 3199			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
 3200				btrfs_discard_extent(fs_info,
 3201						ordered_extent->disk_bytenr,
 3202						ordered_extent->disk_num_bytes,
 3203						NULL);
 3204			btrfs_free_reserved_extent(fs_info,
 3205					ordered_extent->disk_bytenr,
 3206					ordered_extent->disk_num_bytes, 1);
 3207			/*
 3208			 * Actually free the qgroup rsv which was released when
 3209			 * the ordered extent was created.
 3210			 */
 3211			btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid,
 3212						  ordered_extent->qgroup_rsv,
 3213						  BTRFS_QGROUP_RSV_DATA);
 3214		}
 3215	}
 3216
 3217	/*
 3218	 * This needs to be done to make sure anybody waiting knows we are done
 3219	 * updating everything for this ordered extent.
 3220	 */
 3221	btrfs_remove_ordered_extent(inode, ordered_extent);
 3222
 3223	/* once for us */
 3224	btrfs_put_ordered_extent(ordered_extent);
 3225	/* once for the tree */
 3226	btrfs_put_ordered_extent(ordered_extent);
 3227
 3228	return ret;
 3229}
 3230
 3231int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
 3232{
 3233	if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
 3234	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
 3235	    list_empty(&ordered->bioc_list))
 3236		btrfs_finish_ordered_zoned(ordered);
 3237	return btrfs_finish_one_ordered(ordered);
 3238}
 3239
 3240/*
 3241 * Verify the checksum for a single sector without any extra action that depend
 3242 * on the type of I/O.
 3243 */
 3244int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
 3245			    u32 pgoff, u8 *csum, const u8 * const csum_expected)
 3246{
 3247	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 3248	char *kaddr;
 3249
 3250	ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
 3251
 3252	shash->tfm = fs_info->csum_shash;
 3253
 3254	kaddr = kmap_local_page(page) + pgoff;
 3255	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
 3256	kunmap_local(kaddr);
 3257
 3258	if (memcmp(csum, csum_expected, fs_info->csum_size))
 3259		return -EIO;
 3260	return 0;
 3261}
 3262
 3263/*
 3264 * Verify the checksum of a single data sector.
 3265 *
 3266 * @bbio:	btrfs_io_bio which contains the csum
 3267 * @dev:	device the sector is on
 3268 * @bio_offset:	offset to the beginning of the bio (in bytes)
 3269 * @bv:		bio_vec to check
 3270 *
 3271 * Check if the checksum on a data block is valid.  When a checksum mismatch is
 3272 * detected, report the error and fill the corrupted range with zero.
 3273 *
 3274 * Return %true if the sector is ok or had no checksum to start with, else %false.
 3275 */
 3276bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
 3277			u32 bio_offset, struct bio_vec *bv)
 3278{
 3279	struct btrfs_inode *inode = bbio->inode;
 3280	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 3281	u64 file_offset = bbio->file_offset + bio_offset;
 3282	u64 end = file_offset + bv->bv_len - 1;
 3283	u8 *csum_expected;
 3284	u8 csum[BTRFS_CSUM_SIZE];
 3285
 3286	ASSERT(bv->bv_len == fs_info->sectorsize);
 3287
 3288	if (!bbio->csum)
 3289		return true;
 3290
 3291	if (btrfs_is_data_reloc_root(inode->root) &&
 3292	    test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
 3293			   NULL)) {
 3294		/* Skip the range without csum for data reloc inode */
 3295		clear_extent_bits(&inode->io_tree, file_offset, end,
 3296				  EXTENT_NODATASUM);
 3297		return true;
 3298	}
 3299
 3300	csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
 3301				fs_info->csum_size;
 3302	if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
 3303				    csum_expected))
 3304		goto zeroit;
 3305	return true;
 3306
 3307zeroit:
 3308	btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
 3309				    bbio->mirror_num);
 3310	if (dev)
 3311		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
 3312	memzero_bvec(bv);
 3313	return false;
 3314}
 3315
 3316/*
 3317 * Perform a delayed iput on @inode.
 3318 *
 3319 * @inode: The inode we want to perform iput on
 3320 *
 3321 * This function uses the generic vfs_inode::i_count to track whether we should
 3322 * just decrement it (in case it's > 1) or if this is the last iput then link
 3323 * the inode to the delayed iput machinery. Delayed iputs are processed at
 3324 * transaction commit time/superblock commit/cleaner kthread.
 3325 */
 3326void btrfs_add_delayed_iput(struct btrfs_inode *inode)
 3327{
 3328	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 3329	unsigned long flags;
 3330
 3331	if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
 3332		return;
 3333
 3334	atomic_inc(&fs_info->nr_delayed_iputs);
 3335	/*
 3336	 * Need to be irq safe here because we can be called from either an irq
 3337	 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
 3338	 * context.
 3339	 */
 3340	spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
 3341	ASSERT(list_empty(&inode->delayed_iput));
 3342	list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
 3343	spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
 3344	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
 3345		wake_up_process(fs_info->cleaner_kthread);
 3346}
 3347
 3348static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
 3349				    struct btrfs_inode *inode)
 3350{
 3351	list_del_init(&inode->delayed_iput);
 3352	spin_unlock_irq(&fs_info->delayed_iput_lock);
 3353	iput(&inode->vfs_inode);
 3354	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
 3355		wake_up(&fs_info->delayed_iputs_wait);
 3356	spin_lock_irq(&fs_info->delayed_iput_lock);
 3357}
 3358
 3359static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
 3360				   struct btrfs_inode *inode)
 3361{
 3362	if (!list_empty(&inode->delayed_iput)) {
 3363		spin_lock_irq(&fs_info->delayed_iput_lock);
 3364		if (!list_empty(&inode->delayed_iput))
 3365			run_delayed_iput_locked(fs_info, inode);
 3366		spin_unlock_irq(&fs_info->delayed_iput_lock);
 3367	}
 3368}
 3369
 3370void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
 3371{
 3372	/*
 3373	 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
 3374	 * calls btrfs_add_delayed_iput() and that needs to lock
 3375	 * fs_info->delayed_iput_lock. So we need to disable irqs here to
 3376	 * prevent a deadlock.
 3377	 */
 3378	spin_lock_irq(&fs_info->delayed_iput_lock);
 3379	while (!list_empty(&fs_info->delayed_iputs)) {
 3380		struct btrfs_inode *inode;
 3381
 3382		inode = list_first_entry(&fs_info->delayed_iputs,
 3383				struct btrfs_inode, delayed_iput);
 3384		run_delayed_iput_locked(fs_info, inode);
 3385		if (need_resched()) {
 3386			spin_unlock_irq(&fs_info->delayed_iput_lock);
 3387			cond_resched();
 3388			spin_lock_irq(&fs_info->delayed_iput_lock);
 3389		}
 3390	}
 3391	spin_unlock_irq(&fs_info->delayed_iput_lock);
 3392}
 3393
 3394/*
 3395 * Wait for flushing all delayed iputs
 3396 *
 3397 * @fs_info:  the filesystem
 3398 *
 3399 * This will wait on any delayed iputs that are currently running with KILLABLE
 3400 * set.  Once they are all done running we will return, unless we are killed in
 3401 * which case we return EINTR. This helps in user operations like fallocate etc
 3402 * that might get blocked on the iputs.
 3403 *
 3404 * Return EINTR if we were killed, 0 if nothing's pending
 3405 */
 3406int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
 3407{
 3408	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
 3409			atomic_read(&fs_info->nr_delayed_iputs) == 0);
 3410	if (ret)
 3411		return -EINTR;
 3412	return 0;
 3413}
 3414
 3415/*
 3416 * This creates an orphan entry for the given inode in case something goes wrong
 3417 * in the middle of an unlink.
 3418 */
 3419int btrfs_orphan_add(struct btrfs_trans_handle *trans,
 3420		     struct btrfs_inode *inode)
 3421{
 3422	int ret;
 3423
 3424	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
 3425	if (ret && ret != -EEXIST) {
 3426		btrfs_abort_transaction(trans, ret);
 3427		return ret;
 3428	}
 3429
 3430	return 0;
 3431}
 3432
 3433/*
 3434 * We have done the delete so we can go ahead and remove the orphan item for
 3435 * this particular inode.
 3436 */
 3437static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
 3438			    struct btrfs_inode *inode)
 3439{
 3440	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
 3441}
 3442
 3443/*
 3444 * this cleans up any orphans that may be left on the list from the last use
 3445 * of this root.
 3446 */
 3447int btrfs_orphan_cleanup(struct btrfs_root *root)
 3448{
 3449	struct btrfs_fs_info *fs_info = root->fs_info;
 3450	struct btrfs_path *path;
 3451	struct extent_buffer *leaf;
 3452	struct btrfs_key key, found_key;
 3453	struct btrfs_trans_handle *trans;
 3454	struct inode *inode;
 3455	u64 last_objectid = 0;
 3456	int ret = 0, nr_unlink = 0;
 3457
 3458	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
 3459		return 0;
 3460
 3461	path = btrfs_alloc_path();
 3462	if (!path) {
 3463		ret = -ENOMEM;
 3464		goto out;
 3465	}
 3466	path->reada = READA_BACK;
 3467
 3468	key.objectid = BTRFS_ORPHAN_OBJECTID;
 3469	key.type = BTRFS_ORPHAN_ITEM_KEY;
 3470	key.offset = (u64)-1;
 3471
 3472	while (1) {
 3473		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 3474		if (ret < 0)
 3475			goto out;
 3476
 3477		/*
 3478		 * if ret == 0 means we found what we were searching for, which
 3479		 * is weird, but possible, so only screw with path if we didn't
 3480		 * find the key and see if we have stuff that matches
 3481		 */
 3482		if (ret > 0) {
 3483			ret = 0;
 3484			if (path->slots[0] == 0)
 3485				break;
 3486			path->slots[0]--;
 3487		}
 3488
 3489		/* pull out the item */
 3490		leaf = path->nodes[0];
 3491		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 3492
 3493		/* make sure the item matches what we want */
 3494		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
 3495			break;
 3496		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
 3497			break;
 3498
 3499		/* release the path since we're done with it */
 3500		btrfs_release_path(path);
 3501
 3502		/*
 3503		 * this is where we are basically btrfs_lookup, without the
 3504		 * crossing root thing.  we store the inode number in the
 3505		 * offset of the orphan item.
 3506		 */
 3507
 3508		if (found_key.offset == last_objectid) {
 3509			/*
 3510			 * We found the same inode as before. This means we were
 3511			 * not able to remove its items via eviction triggered
 3512			 * by an iput(). A transaction abort may have happened,
 3513			 * due to -ENOSPC for example, so try to grab the error
 3514			 * that lead to a transaction abort, if any.
 3515			 */
 3516			btrfs_err(fs_info,
 3517				  "Error removing orphan entry, stopping orphan cleanup");
 3518			ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
 3519			goto out;
 3520		}
 3521
 3522		last_objectid = found_key.offset;
 3523
 3524		found_key.objectid = found_key.offset;
 3525		found_key.type = BTRFS_INODE_ITEM_KEY;
 3526		found_key.offset = 0;
 3527		inode = btrfs_iget(fs_info->sb, last_objectid, root);
 3528		if (IS_ERR(inode)) {
 3529			ret = PTR_ERR(inode);
 3530			inode = NULL;
 3531			if (ret != -ENOENT)
 3532				goto out;
 3533		}
 3534
 3535		if (!inode && root == fs_info->tree_root) {
 3536			struct btrfs_root *dead_root;
 3537			int is_dead_root = 0;
 3538
 3539			/*
 3540			 * This is an orphan in the tree root. Currently these
 3541			 * could come from 2 sources:
 3542			 *  a) a root (snapshot/subvolume) deletion in progress
 3543			 *  b) a free space cache inode
 3544			 * We need to distinguish those two, as the orphan item
 3545			 * for a root must not get deleted before the deletion
 3546			 * of the snapshot/subvolume's tree completes.
 3547			 *
 3548			 * btrfs_find_orphan_roots() ran before us, which has
 3549			 * found all deleted roots and loaded them into
 3550			 * fs_info->fs_roots_radix. So here we can find if an
 3551			 * orphan item corresponds to a deleted root by looking
 3552			 * up the root from that radix tree.
 3553			 */
 3554
 3555			spin_lock(&fs_info->fs_roots_radix_lock);
 3556			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
 3557							 (unsigned long)found_key.objectid);
 3558			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
 3559				is_dead_root = 1;
 3560			spin_unlock(&fs_info->fs_roots_radix_lock);
 3561
 3562			if (is_dead_root) {
 3563				/* prevent this orphan from being found again */
 3564				key.offset = found_key.objectid - 1;
 3565				continue;
 3566			}
 3567
 3568		}
 3569
 3570		/*
 3571		 * If we have an inode with links, there are a couple of
 3572		 * possibilities:
 3573		 *
 3574		 * 1. We were halfway through creating fsverity metadata for the
 3575		 * file. In that case, the orphan item represents incomplete
 3576		 * fsverity metadata which must be cleaned up with
 3577		 * btrfs_drop_verity_items and deleting the orphan item.
 3578
 3579		 * 2. Old kernels (before v3.12) used to create an
 3580		 * orphan item for truncate indicating that there were possibly
 3581		 * extent items past i_size that needed to be deleted. In v3.12,
 3582		 * truncate was changed to update i_size in sync with the extent
 3583		 * items, but the (useless) orphan item was still created. Since
 3584		 * v4.18, we don't create the orphan item for truncate at all.
 3585		 *
 3586		 * So, this item could mean that we need to do a truncate, but
 3587		 * only if this filesystem was last used on a pre-v3.12 kernel
 3588		 * and was not cleanly unmounted. The odds of that are quite
 3589		 * slim, and it's a pain to do the truncate now, so just delete
 3590		 * the orphan item.
 3591		 *
 3592		 * It's also possible that this orphan item was supposed to be
 3593		 * deleted but wasn't. The inode number may have been reused,
 3594		 * but either way, we can delete the orphan item.
 3595		 */
 3596		if (!inode || inode->i_nlink) {
 3597			if (inode) {
 3598				ret = btrfs_drop_verity_items(BTRFS_I(inode));
 3599				iput(inode);
 3600				inode = NULL;
 3601				if (ret)
 3602					goto out;
 3603			}
 3604			trans = btrfs_start_transaction(root, 1);
 3605			if (IS_ERR(trans)) {
 3606				ret = PTR_ERR(trans);
 3607				goto out;
 3608			}
 3609			btrfs_debug(fs_info, "auto deleting %Lu",
 3610				    found_key.objectid);
 3611			ret = btrfs_del_orphan_item(trans, root,
 3612						    found_key.objectid);
 3613			btrfs_end_transaction(trans);
 3614			if (ret)
 3615				goto out;
 3616			continue;
 3617		}
 3618
 3619		nr_unlink++;
 3620
 3621		/* this will do delete_inode and everything for us */
 3622		iput(inode);
 3623	}
 3624	/* release the path since we're done with it */
 3625	btrfs_release_path(path);
 3626
 3627	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
 3628		trans = btrfs_join_transaction(root);
 3629		if (!IS_ERR(trans))
 3630			btrfs_end_transaction(trans);
 3631	}
 3632
 3633	if (nr_unlink)
 3634		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
 3635
 3636out:
 3637	if (ret)
 3638		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
 3639	btrfs_free_path(path);
 3640	return ret;
 3641}
 3642
 3643/*
 3644 * very simple check to peek ahead in the leaf looking for xattrs.  If we
 3645 * don't find any xattrs, we know there can't be any acls.
 3646 *
 3647 * slot is the slot the inode is in, objectid is the objectid of the inode
 3648 */
 3649static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 3650					  int slot, u64 objectid,
 3651					  int *first_xattr_slot)
 3652{
 3653	u32 nritems = btrfs_header_nritems(leaf);
 3654	struct btrfs_key found_key;
 3655	static u64 xattr_access = 0;
 3656	static u64 xattr_default = 0;
 3657	int scanned = 0;
 3658
 3659	if (!xattr_access) {
 3660		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
 3661					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
 3662		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
 3663					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
 3664	}
 3665
 3666	slot++;
 3667	*first_xattr_slot = -1;
 3668	while (slot < nritems) {
 3669		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 3670
 3671		/* we found a different objectid, there must not be acls */
 3672		if (found_key.objectid != objectid)
 3673			return 0;
 3674
 3675		/* we found an xattr, assume we've got an acl */
 3676		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
 3677			if (*first_xattr_slot == -1)
 3678				*first_xattr_slot = slot;
 3679			if (found_key.offset == xattr_access ||
 3680			    found_key.offset == xattr_default)
 3681				return 1;
 3682		}
 3683
 3684		/*
 3685		 * we found a key greater than an xattr key, there can't
 3686		 * be any acls later on
 3687		 */
 3688		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
 3689			return 0;
 3690
 3691		slot++;
 3692		scanned++;
 3693
 3694		/*
 3695		 * it goes inode, inode backrefs, xattrs, extents,
 3696		 * so if there are a ton of hard links to an inode there can
 3697		 * be a lot of backrefs.  Don't waste time searching too hard,
 3698		 * this is just an optimization
 3699		 */
 3700		if (scanned >= 8)
 3701			break;
 3702	}
 3703	/* we hit the end of the leaf before we found an xattr or
 3704	 * something larger than an xattr.  We have to assume the inode
 3705	 * has acls
 3706	 */
 3707	if (*first_xattr_slot == -1)
 3708		*first_xattr_slot = slot;
 3709	return 1;
 3710}
 3711
 3712/*
 3713 * read an inode from the btree into the in-memory inode
 3714 */
 3715static int btrfs_read_locked_inode(struct inode *inode,
 3716				   struct btrfs_path *in_path)
 3717{
 3718	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 3719	struct btrfs_path *path = in_path;
 3720	struct extent_buffer *leaf;
 3721	struct btrfs_inode_item *inode_item;
 3722	struct btrfs_root *root = BTRFS_I(inode)->root;
 3723	struct btrfs_key location;
 3724	unsigned long ptr;
 3725	int maybe_acls;
 3726	u32 rdev;
 3727	int ret;
 3728	bool filled = false;
 3729	int first_xattr_slot;
 3730
 3731	ret = btrfs_fill_inode(inode, &rdev);
 3732	if (!ret)
 3733		filled = true;
 3734
 3735	if (!path) {
 3736		path = btrfs_alloc_path();
 3737		if (!path)
 3738			return -ENOMEM;
 3739	}
 3740
 3741	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 3742
 3743	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
 3744	if (ret) {
 3745		if (path != in_path)
 3746			btrfs_free_path(path);
 3747		return ret;
 3748	}
 3749
 3750	leaf = path->nodes[0];
 3751
 3752	if (filled)
 3753		goto cache_index;
 3754
 3755	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 3756				    struct btrfs_inode_item);
 3757	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
 3758	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
 3759	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
 3760	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
 3761	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
 3762	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
 3763			round_up(i_size_read(inode), fs_info->sectorsize));
 3764
 3765	inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
 3766			btrfs_timespec_nsec(leaf, &inode_item->atime));
 3767
 3768	inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
 3769			btrfs_timespec_nsec(leaf, &inode_item->mtime));
 3770
 3771	inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
 3772			btrfs_timespec_nsec(leaf, &inode_item->ctime));
 3773
 3774	BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
 3775	BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
 3776
 3777	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
 3778	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
 3779	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
 3780
 3781	inode_set_iversion_queried(inode,
 3782				   btrfs_inode_sequence(leaf, inode_item));
 3783	inode->i_generation = BTRFS_I(inode)->generation;
 3784	inode->i_rdev = 0;
 3785	rdev = btrfs_inode_rdev(leaf, inode_item);
 3786
 3787	BTRFS_I(inode)->index_cnt = (u64)-1;
 3788	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
 3789				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
 3790
 3791cache_index:
 3792	/*
 3793	 * If we were modified in the current generation and evicted from memory
 3794	 * and then re-read we need to do a full sync since we don't have any
 3795	 * idea about which extents were modified before we were evicted from
 3796	 * cache.
 3797	 *
 3798	 * This is required for both inode re-read from disk and delayed inode
 3799	 * in delayed_nodes_tree.
 3800	 */
 3801	if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
 3802		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 3803			&BTRFS_I(inode)->runtime_flags);
 3804
 3805	/*
 3806	 * We don't persist the id of the transaction where an unlink operation
 3807	 * against the inode was last made. So here we assume the inode might
 3808	 * have been evicted, and therefore the exact value of last_unlink_trans
 3809	 * lost, and set it to last_trans to avoid metadata inconsistencies
 3810	 * between the inode and its parent if the inode is fsync'ed and the log
 3811	 * replayed. For example, in the scenario:
 3812	 *
 3813	 * touch mydir/foo
 3814	 * ln mydir/foo mydir/bar
 3815	 * sync
 3816	 * unlink mydir/bar
 3817	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
 3818	 * xfs_io -c fsync mydir/foo
 3819	 * <power failure>
 3820	 * mount fs, triggers fsync log replay
 3821	 *
 3822	 * We must make sure that when we fsync our inode foo we also log its
 3823	 * parent inode, otherwise after log replay the parent still has the
 3824	 * dentry with the "bar" name but our inode foo has a link count of 1
 3825	 * and doesn't have an inode ref with the name "bar" anymore.
 3826	 *
 3827	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
 3828	 * but it guarantees correctness at the expense of occasional full
 3829	 * transaction commits on fsync if our inode is a directory, or if our
 3830	 * inode is not a directory, logging its parent unnecessarily.
 3831	 */
 3832	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
 3833
 3834	/*
 3835	 * Same logic as for last_unlink_trans. We don't persist the generation
 3836	 * of the last transaction where this inode was used for a reflink
 3837	 * operation, so after eviction and reloading the inode we must be
 3838	 * pessimistic and assume the last transaction that modified the inode.
 3839	 */
 3840	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
 3841
 3842	path->slots[0]++;
 3843	if (inode->i_nlink != 1 ||
 3844	    path->slots[0] >= btrfs_header_nritems(leaf))
 3845		goto cache_acl;
 3846
 3847	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
 3848	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
 3849		goto cache_acl;
 3850
 3851	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 3852	if (location.type == BTRFS_INODE_REF_KEY) {
 3853		struct btrfs_inode_ref *ref;
 3854
 3855		ref = (struct btrfs_inode_ref *)ptr;
 3856		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
 3857	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
 3858		struct btrfs_inode_extref *extref;
 3859
 3860		extref = (struct btrfs_inode_extref *)ptr;
 3861		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
 3862								     extref);
 3863	}
 3864cache_acl:
 3865	/*
 3866	 * try to precache a NULL acl entry for files that don't have
 3867	 * any xattrs or acls
 3868	 */
 3869	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
 3870			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
 3871	if (first_xattr_slot != -1) {
 3872		path->slots[0] = first_xattr_slot;
 3873		ret = btrfs_load_inode_props(inode, path);
 3874		if (ret)
 3875			btrfs_err(fs_info,
 3876				  "error loading props for ino %llu (root %llu): %d",
 3877				  btrfs_ino(BTRFS_I(inode)),
 3878				  root->root_key.objectid, ret);
 3879	}
 3880	if (path != in_path)
 3881		btrfs_free_path(path);
 3882
 3883	if (!maybe_acls)
 3884		cache_no_acl(inode);
 3885
 3886	switch (inode->i_mode & S_IFMT) {
 3887	case S_IFREG:
 3888		inode->i_mapping->a_ops = &btrfs_aops;
 3889		inode->i_fop = &btrfs_file_operations;
 3890		inode->i_op = &btrfs_file_inode_operations;
 3891		break;
 3892	case S_IFDIR:
 3893		inode->i_fop = &btrfs_dir_file_operations;
 3894		inode->i_op = &btrfs_dir_inode_operations;
 3895		break;
 3896	case S_IFLNK:
 3897		inode->i_op = &btrfs_symlink_inode_operations;
 3898		inode_nohighmem(inode);
 3899		inode->i_mapping->a_ops = &btrfs_aops;
 3900		break;
 3901	default:
 3902		inode->i_op = &btrfs_special_inode_operations;
 3903		init_special_inode(inode, inode->i_mode, rdev);
 3904		break;
 3905	}
 3906
 3907	btrfs_sync_inode_flags_to_i_flags(inode);
 3908	return 0;
 3909}
 3910
 3911/*
 3912 * given a leaf and an inode, copy the inode fields into the leaf
 3913 */
 3914static void fill_inode_item(struct btrfs_trans_handle *trans,
 3915			    struct extent_buffer *leaf,
 3916			    struct btrfs_inode_item *item,
 3917			    struct inode *inode)
 3918{
 3919	struct btrfs_map_token token;
 3920	u64 flags;
 3921
 3922	btrfs_init_map_token(&token, leaf);
 3923
 3924	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
 3925	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
 3926	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
 3927	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
 3928	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
 3929
 3930	btrfs_set_token_timespec_sec(&token, &item->atime,
 3931				     inode_get_atime_sec(inode));
 3932	btrfs_set_token_timespec_nsec(&token, &item->atime,
 3933				      inode_get_atime_nsec(inode));
 3934
 3935	btrfs_set_token_timespec_sec(&token, &item->mtime,
 3936				     inode_get_mtime_sec(inode));
 3937	btrfs_set_token_timespec_nsec(&token, &item->mtime,
 3938				      inode_get_mtime_nsec(inode));
 3939
 3940	btrfs_set_token_timespec_sec(&token, &item->ctime,
 3941				     inode_get_ctime_sec(inode));
 3942	btrfs_set_token_timespec_nsec(&token, &item->ctime,
 3943				      inode_get_ctime_nsec(inode));
 3944
 3945	btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
 3946	btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
 3947
 3948	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
 3949	btrfs_set_token_inode_generation(&token, item,
 3950					 BTRFS_I(inode)->generation);
 3951	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
 3952	btrfs_set_token_inode_transid(&token, item, trans->transid);
 3953	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
 3954	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
 3955					  BTRFS_I(inode)->ro_flags);
 3956	btrfs_set_token_inode_flags(&token, item, flags);
 3957	btrfs_set_token_inode_block_group(&token, item, 0);
 3958}
 3959
 3960/*
 3961 * copy everything in the in-memory inode into the btree.
 3962 */
 3963static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
 3964					    struct btrfs_inode *inode)
 3965{
 3966	struct btrfs_inode_item *inode_item;
 3967	struct btrfs_path *path;
 3968	struct extent_buffer *leaf;
 3969	int ret;
 3970
 3971	path = btrfs_alloc_path();
 3972	if (!path)
 3973		return -ENOMEM;
 3974
 3975	ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1);
 3976	if (ret) {
 3977		if (ret > 0)
 3978			ret = -ENOENT;
 3979		goto failed;
 3980	}
 3981
 3982	leaf = path->nodes[0];
 3983	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 3984				    struct btrfs_inode_item);
 3985
 3986	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
 3987	btrfs_mark_buffer_dirty(trans, leaf);
 3988	btrfs_set_inode_last_trans(trans, inode);
 3989	ret = 0;
 3990failed:
 3991	btrfs_free_path(path);
 3992	return ret;
 3993}
 3994
 3995/*
 3996 * copy everything in the in-memory inode into the btree.
 3997 */
 3998int btrfs_update_inode(struct btrfs_trans_handle *trans,
 3999		       struct btrfs_inode *inode)
 4000{
 4001	struct btrfs_root *root = inode->root;
 4002	struct btrfs_fs_info *fs_info = root->fs_info;
 4003	int ret;
 4004
 4005	/*
 4006	 * If the inode is a free space inode, we can deadlock during commit
 4007	 * if we put it into the delayed code.
 4008	 *
 4009	 * The data relocation inode should also be directly updated
 4010	 * without delay
 4011	 */
 4012	if (!btrfs_is_free_space_inode(inode)
 4013	    && !btrfs_is_data_reloc_root(root)
 4014	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
 4015		btrfs_update_root_times(trans, root);
 4016
 4017		ret = btrfs_delayed_update_inode(trans, inode);
 4018		if (!ret)
 4019			btrfs_set_inode_last_trans(trans, inode);
 4020		return ret;
 4021	}
 4022
 4023	return btrfs_update_inode_item(trans, inode);
 4024}
 4025
 4026int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
 4027				struct btrfs_inode *inode)
 4028{
 4029	int ret;
 4030
 4031	ret = btrfs_update_inode(trans, inode);
 4032	if (ret == -ENOSPC)
 4033		return btrfs_update_inode_item(trans, inode);
 4034	return ret;
 4035}
 4036
 4037/*
 4038 * unlink helper that gets used here in inode.c and in the tree logging
 4039 * recovery code.  It remove a link in a directory with a given name, and
 4040 * also drops the back refs in the inode to the directory
 4041 */
 4042static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 4043				struct btrfs_inode *dir,
 4044				struct btrfs_inode *inode,
 4045				const struct fscrypt_str *name,
 4046				struct btrfs_rename_ctx *rename_ctx)
 4047{
 4048	struct btrfs_root *root = dir->root;
 4049	struct btrfs_fs_info *fs_info = root->fs_info;
 4050	struct btrfs_path *path;
 4051	int ret = 0;
 4052	struct btrfs_dir_item *di;
 4053	u64 index;
 4054	u64 ino = btrfs_ino(inode);
 4055	u64 dir_ino = btrfs_ino(dir);
 4056
 4057	path = btrfs_alloc_path();
 4058	if (!path) {
 4059		ret = -ENOMEM;
 4060		goto out;
 4061	}
 4062
 4063	di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
 4064	if (IS_ERR_OR_NULL(di)) {
 4065		ret = di ? PTR_ERR(di) : -ENOENT;
 4066		goto err;
 4067	}
 4068	ret = btrfs_delete_one_dir_name(trans, root, path, di);
 4069	if (ret)
 4070		goto err;
 4071	btrfs_release_path(path);
 4072
 4073	/*
 4074	 * If we don't have dir index, we have to get it by looking up
 4075	 * the inode ref, since we get the inode ref, remove it directly,
 4076	 * it is unnecessary to do delayed deletion.
 4077	 *
 4078	 * But if we have dir index, needn't search inode ref to get it.
 4079	 * Since the inode ref is close to the inode item, it is better
 4080	 * that we delay to delete it, and just do this deletion when
 4081	 * we update the inode item.
 4082	 */
 4083	if (inode->dir_index) {
 4084		ret = btrfs_delayed_delete_inode_ref(inode);
 4085		if (!ret) {
 4086			index = inode->dir_index;
 4087			goto skip_backref;
 4088		}
 4089	}
 4090
 4091	ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
 4092	if (ret) {
 4093		btrfs_info(fs_info,
 4094			"failed to delete reference to %.*s, inode %llu parent %llu",
 4095			name->len, name->name, ino, dir_ino);
 4096		btrfs_abort_transaction(trans, ret);
 4097		goto err;
 4098	}
 4099skip_backref:
 4100	if (rename_ctx)
 4101		rename_ctx->index = index;
 4102
 4103	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
 4104	if (ret) {
 4105		btrfs_abort_transaction(trans, ret);
 4106		goto err;
 4107	}
 4108
 4109	/*
 4110	 * If we are in a rename context, we don't need to update anything in the
 4111	 * log. That will be done later during the rename by btrfs_log_new_name().
 4112	 * Besides that, doing it here would only cause extra unnecessary btree
 4113	 * operations on the log tree, increasing latency for applications.
 4114	 */
 4115	if (!rename_ctx) {
 4116		btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
 4117		btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
 4118	}
 4119
 4120	/*
 4121	 * If we have a pending delayed iput we could end up with the final iput
 4122	 * being run in btrfs-cleaner context.  If we have enough of these built
 4123	 * up we can end up burning a lot of time in btrfs-cleaner without any
 4124	 * way to throttle the unlinks.  Since we're currently holding a ref on
 4125	 * the inode we can run the delayed iput here without any issues as the
 4126	 * final iput won't be done until after we drop the ref we're currently
 4127	 * holding.
 4128	 */
 4129	btrfs_run_delayed_iput(fs_info, inode);
 4130err:
 4131	btrfs_free_path(path);
 4132	if (ret)
 4133		goto out;
 4134
 4135	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
 4136	inode_inc_iversion(&inode->vfs_inode);
 4137	inode_inc_iversion(&dir->vfs_inode);
 4138 	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
 4139	ret = btrfs_update_inode(trans, dir);
 4140out:
 4141	return ret;
 4142}
 4143
 4144int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 4145		       struct btrfs_inode *dir, struct btrfs_inode *inode,
 4146		       const struct fscrypt_str *name)
 4147{
 4148	int ret;
 4149
 4150	ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
 4151	if (!ret) {
 4152		drop_nlink(&inode->vfs_inode);
 4153		ret = btrfs_update_inode(trans, inode);
 4154	}
 4155	return ret;
 4156}
 4157
 4158/*
 4159 * helper to start transaction for unlink and rmdir.
 4160 *
 4161 * unlink and rmdir are special in btrfs, they do not always free space, so
 4162 * if we cannot make our reservations the normal way try and see if there is
 4163 * plenty of slack room in the global reserve to migrate, otherwise we cannot
 4164 * allow the unlink to occur.
 4165 */
 4166static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
 4167{
 4168	struct btrfs_root *root = dir->root;
 4169
 4170	return btrfs_start_transaction_fallback_global_rsv(root,
 4171						   BTRFS_UNLINK_METADATA_UNITS);
 4172}
 4173
 4174static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 4175{
 4176	struct btrfs_trans_handle *trans;
 4177	struct inode *inode = d_inode(dentry);
 4178	int ret;
 4179	struct fscrypt_name fname;
 4180
 4181	ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
 4182	if (ret)
 4183		return ret;
 4184
 4185	/* This needs to handle no-key deletions later on */
 4186
 4187	trans = __unlink_start_trans(BTRFS_I(dir));
 4188	if (IS_ERR(trans)) {
 4189		ret = PTR_ERR(trans);
 4190		goto fscrypt_free;
 4191	}
 4192
 4193	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
 4194				false);
 4195
 4196	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
 4197				 &fname.disk_name);
 4198	if (ret)
 4199		goto end_trans;
 4200
 4201	if (inode->i_nlink == 0) {
 4202		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
 4203		if (ret)
 4204			goto end_trans;
 4205	}
 4206
 4207end_trans:
 4208	btrfs_end_transaction(trans);
 4209	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
 4210fscrypt_free:
 4211	fscrypt_free_filename(&fname);
 4212	return ret;
 4213}
 4214
 4215static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 4216			       struct btrfs_inode *dir, struct dentry *dentry)
 4217{
 4218	struct btrfs_root *root = dir->root;
 4219	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
 4220	struct btrfs_path *path;
 4221	struct extent_buffer *leaf;
 4222	struct btrfs_dir_item *di;
 4223	struct btrfs_key key;
 4224	u64 index;
 4225	int ret;
 4226	u64 objectid;
 4227	u64 dir_ino = btrfs_ino(dir);
 4228	struct fscrypt_name fname;
 4229
 4230	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
 4231	if (ret)
 4232		return ret;
 4233
 4234	/* This needs to handle no-key deletions later on */
 4235
 4236	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
 4237		objectid = inode->root->root_key.objectid;
 4238	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
 4239		objectid = inode->location.objectid;
 4240	} else {
 4241		WARN_ON(1);
 4242		fscrypt_free_filename(&fname);
 4243		return -EINVAL;
 4244	}
 4245
 4246	path = btrfs_alloc_path();
 4247	if (!path) {
 4248		ret = -ENOMEM;
 4249		goto out;
 4250	}
 4251
 4252	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
 4253				   &fname.disk_name, -1);
 4254	if (IS_ERR_OR_NULL(di)) {
 4255		ret = di ? PTR_ERR(di) : -ENOENT;
 4256		goto out;
 4257	}
 4258
 4259	leaf = path->nodes[0];
 4260	btrfs_dir_item_key_to_cpu(leaf, di, &key);
 4261	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
 4262	ret = btrfs_delete_one_dir_name(trans, root, path, di);
 4263	if (ret) {
 4264		btrfs_abort_transaction(trans, ret);
 4265		goto out;
 4266	}
 4267	btrfs_release_path(path);
 4268
 4269	/*
 4270	 * This is a placeholder inode for a subvolume we didn't have a
 4271	 * reference to at the time of the snapshot creation.  In the meantime
 4272	 * we could have renamed the real subvol link into our snapshot, so
 4273	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
 4274	 * Instead simply lookup the dir_index_item for this entry so we can
 4275	 * remove it.  Otherwise we know we have a ref to the root and we can
 4276	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
 4277	 */
 4278	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
 4279		di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
 4280		if (IS_ERR_OR_NULL(di)) {
 4281			if (!di)
 4282				ret = -ENOENT;
 4283			else
 4284				ret = PTR_ERR(di);
 4285			btrfs_abort_transaction(trans, ret);
 4286			goto out;
 4287		}
 4288
 4289		leaf = path->nodes[0];
 4290		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 4291		index = key.offset;
 4292		btrfs_release_path(path);
 4293	} else {
 4294		ret = btrfs_del_root_ref(trans, objectid,
 4295					 root->root_key.objectid, dir_ino,
 4296					 &index, &fname.disk_name);
 4297		if (ret) {
 4298			btrfs_abort_transaction(trans, ret);
 4299			goto out;
 4300		}
 4301	}
 4302
 4303	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
 4304	if (ret) {
 4305		btrfs_abort_transaction(trans, ret);
 4306		goto out;
 4307	}
 4308
 4309	btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
 4310	inode_inc_iversion(&dir->vfs_inode);
 4311	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
 4312	ret = btrfs_update_inode_fallback(trans, dir);
 4313	if (ret)
 4314		btrfs_abort_transaction(trans, ret);
 4315out:
 4316	btrfs_free_path(path);
 4317	fscrypt_free_filename(&fname);
 4318	return ret;
 4319}
 4320
 4321/*
 4322 * Helper to check if the subvolume references other subvolumes or if it's
 4323 * default.
 4324 */
 4325static noinline int may_destroy_subvol(struct btrfs_root *root)
 4326{
 4327	struct btrfs_fs_info *fs_info = root->fs_info;
 4328	struct btrfs_path *path;
 4329	struct btrfs_dir_item *di;
 4330	struct btrfs_key key;
 4331	struct fscrypt_str name = FSTR_INIT("default", 7);
 4332	u64 dir_id;
 4333	int ret;
 4334
 4335	path = btrfs_alloc_path();
 4336	if (!path)
 4337		return -ENOMEM;
 4338
 4339	/* Make sure this root isn't set as the default subvol */
 4340	dir_id = btrfs_super_root_dir(fs_info->super_copy);
 4341	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
 4342				   dir_id, &name, 0);
 4343	if (di && !IS_ERR(di)) {
 4344		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
 4345		if (key.objectid == root->root_key.objectid) {
 4346			ret = -EPERM;
 4347			btrfs_err(fs_info,
 4348				  "deleting default subvolume %llu is not allowed",
 4349				  key.objectid);
 4350			goto out;
 4351		}
 4352		btrfs_release_path(path);
 4353	}
 4354
 4355	key.objectid = root->root_key.objectid;
 4356	key.type = BTRFS_ROOT_REF_KEY;
 4357	key.offset = (u64)-1;
 4358
 4359	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
 4360	if (ret < 0)
 4361		goto out;
 4362	BUG_ON(ret == 0);
 4363
 4364	ret = 0;
 4365	if (path->slots[0] > 0) {
 4366		path->slots[0]--;
 4367		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 4368		if (key.objectid == root->root_key.objectid &&
 4369		    key.type == BTRFS_ROOT_REF_KEY)
 4370			ret = -ENOTEMPTY;
 4371	}
 4372out:
 4373	btrfs_free_path(path);
 4374	return ret;
 4375}
 4376
 4377/* Delete all dentries for inodes belonging to the root */
 4378static void btrfs_prune_dentries(struct btrfs_root *root)
 4379{
 4380	struct btrfs_fs_info *fs_info = root->fs_info;
 4381	struct rb_node *node;
 4382	struct rb_node *prev;
 4383	struct btrfs_inode *entry;
 4384	struct inode *inode;
 4385	u64 objectid = 0;
 4386
 4387	if (!BTRFS_FS_ERROR(fs_info))
 4388		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
 4389
 4390	spin_lock(&root->inode_lock);
 4391again:
 4392	node = root->inode_tree.rb_node;
 4393	prev = NULL;
 4394	while (node) {
 4395		prev = node;
 4396		entry = rb_entry(node, struct btrfs_inode, rb_node);
 4397
 4398		if (objectid < btrfs_ino(entry))
 4399			node = node->rb_left;
 4400		else if (objectid > btrfs_ino(entry))
 4401			node = node->rb_right;
 4402		else
 4403			break;
 4404	}
 4405	if (!node) {
 4406		while (prev) {
 4407			entry = rb_entry(prev, struct btrfs_inode, rb_node);
 4408			if (objectid <= btrfs_ino(entry)) {
 4409				node = prev;
 4410				break;
 4411			}
 4412			prev = rb_next(prev);
 4413		}
 4414	}
 4415	while (node) {
 4416		entry = rb_entry(node, struct btrfs_inode, rb_node);
 4417		objectid = btrfs_ino(entry) + 1;
 4418		inode = igrab(&entry->vfs_inode);
 4419		if (inode) {
 4420			spin_unlock(&root->inode_lock);
 4421			if (atomic_read(&inode->i_count) > 1)
 4422				d_prune_aliases(inode);
 4423			/*
 4424			 * btrfs_drop_inode will have it removed from the inode
 4425			 * cache when its usage count hits zero.
 4426			 */
 4427			iput(inode);
 4428			cond_resched();
 4429			spin_lock(&root->inode_lock);
 4430			goto again;
 4431		}
 4432
 4433		if (cond_resched_lock(&root->inode_lock))
 4434			goto again;
 4435
 4436		node = rb_next(node);
 4437	}
 4438	spin_unlock(&root->inode_lock);
 4439}
 4440
 4441int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
 4442{
 4443	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
 4444	struct btrfs_root *root = dir->root;
 4445	struct inode *inode = d_inode(dentry);
 4446	struct btrfs_root *dest = BTRFS_I(inode)->root;
 4447	struct btrfs_trans_handle *trans;
 4448	struct btrfs_block_rsv block_rsv;
 4449	u64 root_flags;
 4450	int ret;
 4451
 4452	/*
 4453	 * Don't allow to delete a subvolume with send in progress. This is
 4454	 * inside the inode lock so the error handling that has to drop the bit
 4455	 * again is not run concurrently.
 4456	 */
 4457	spin_lock(&dest->root_item_lock);
 4458	if (dest->send_in_progress) {
 4459		spin_unlock(&dest->root_item_lock);
 4460		btrfs_warn(fs_info,
 4461			   "attempt to delete subvolume %llu during send",
 4462			   dest->root_key.objectid);
 4463		return -EPERM;
 4464	}
 4465	if (atomic_read(&dest->nr_swapfiles)) {
 4466		spin_unlock(&dest->root_item_lock);
 4467		btrfs_warn(fs_info,
 4468			   "attempt to delete subvolume %llu with active swapfile",
 4469			   root->root_key.objectid);
 4470		return -EPERM;
 4471	}
 4472	root_flags = btrfs_root_flags(&dest->root_item);
 4473	btrfs_set_root_flags(&dest->root_item,
 4474			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
 4475	spin_unlock(&dest->root_item_lock);
 4476
 4477	down_write(&fs_info->subvol_sem);
 4478
 4479	ret = may_destroy_subvol(dest);
 4480	if (ret)
 4481		goto out_up_write;
 4482
 4483	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
 4484	/*
 4485	 * One for dir inode,
 4486	 * two for dir entries,
 4487	 * two for root ref/backref.
 4488	 */
 4489	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
 4490	if (ret)
 4491		goto out_up_write;
 4492
 4493	trans = btrfs_start_transaction(root, 0);
 4494	if (IS_ERR(trans)) {
 4495		ret = PTR_ERR(trans);
 4496		goto out_release;
 4497	}
 4498	trans->block_rsv = &block_rsv;
 4499	trans->bytes_reserved = block_rsv.size;
 4500
 4501	btrfs_record_snapshot_destroy(trans, dir);
 4502
 4503	ret = btrfs_unlink_subvol(trans, dir, dentry);
 4504	if (ret) {
 4505		btrfs_abort_transaction(trans, ret);
 4506		goto out_end_trans;
 4507	}
 4508
 4509	ret = btrfs_record_root_in_trans(trans, dest);
 4510	if (ret) {
 4511		btrfs_abort_transaction(trans, ret);
 4512		goto out_end_trans;
 4513	}
 4514
 4515	memset(&dest->root_item.drop_progress, 0,
 4516		sizeof(dest->root_item.drop_progress));
 4517	btrfs_set_root_drop_level(&dest->root_item, 0);
 4518	btrfs_set_root_refs(&dest->root_item, 0);
 4519
 4520	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
 4521		ret = btrfs_insert_orphan_item(trans,
 4522					fs_info->tree_root,
 4523					dest->root_key.objectid);
 4524		if (ret) {
 4525			btrfs_abort_transaction(trans, ret);
 4526			goto out_end_trans;
 4527		}
 4528	}
 4529
 4530	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
 4531				  BTRFS_UUID_KEY_SUBVOL,
 4532				  dest->root_key.objectid);
 4533	if (ret && ret != -ENOENT) {
 4534		btrfs_abort_transaction(trans, ret);
 4535		goto out_end_trans;
 4536	}
 4537	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
 4538		ret = btrfs_uuid_tree_remove(trans,
 4539					  dest->root_item.received_uuid,
 4540					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
 4541					  dest->root_key.objectid);
 4542		if (ret && ret != -ENOENT) {
 4543			btrfs_abort_transaction(trans, ret);
 4544			goto out_end_trans;
 4545		}
 4546	}
 4547
 4548	free_anon_bdev(dest->anon_dev);
 4549	dest->anon_dev = 0;
 4550out_end_trans:
 4551	trans->block_rsv = NULL;
 4552	trans->bytes_reserved = 0;
 4553	ret = btrfs_end_transaction(trans);
 4554	inode->i_flags |= S_DEAD;
 4555out_release:
 4556	btrfs_subvolume_release_metadata(root, &block_rsv);
 4557out_up_write:
 4558	up_write(&fs_info->subvol_sem);
 4559	if (ret) {
 4560		spin_lock(&dest->root_item_lock);
 4561		root_flags = btrfs_root_flags(&dest->root_item);
 4562		btrfs_set_root_flags(&dest->root_item,
 4563				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
 4564		spin_unlock(&dest->root_item_lock);
 4565	} else {
 4566		d_invalidate(dentry);
 4567		btrfs_prune_dentries(dest);
 4568		ASSERT(dest->send_in_progress == 0);
 4569	}
 4570
 4571	return ret;
 4572}
 4573
 4574static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 4575{
 4576	struct inode *inode = d_inode(dentry);
 4577	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 4578	int err = 0;
 4579	struct btrfs_trans_handle *trans;
 4580	u64 last_unlink_trans;
 4581	struct fscrypt_name fname;
 4582
 4583	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 4584		return -ENOTEMPTY;
 4585	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
 4586		if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
 4587			btrfs_err(fs_info,
 4588			"extent tree v2 doesn't support snapshot deletion yet");
 4589			return -EOPNOTSUPP;
 4590		}
 4591		return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
 4592	}
 4593
 4594	err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
 4595	if (err)
 4596		return err;
 4597
 4598	/* This needs to handle no-key deletions later on */
 4599
 4600	trans = __unlink_start_trans(BTRFS_I(dir));
 4601	if (IS_ERR(trans)) {
 4602		err = PTR_ERR(trans);
 4603		goto out_notrans;
 4604	}
 4605
 4606	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
 4607		err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
 4608		goto out;
 4609	}
 4610
 4611	err = btrfs_orphan_add(trans, BTRFS_I(inode));
 4612	if (err)
 4613		goto out;
 4614
 4615	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
 4616
 4617	/* now the directory is empty */
 4618	err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
 4619				 &fname.disk_name);
 4620	if (!err) {
 4621		btrfs_i_size_write(BTRFS_I(inode), 0);
 4622		/*
 4623		 * Propagate the last_unlink_trans value of the deleted dir to
 4624		 * its parent directory. This is to prevent an unrecoverable
 4625		 * log tree in the case we do something like this:
 4626		 * 1) create dir foo
 4627		 * 2) create snapshot under dir foo
 4628		 * 3) delete the snapshot
 4629		 * 4) rmdir foo
 4630		 * 5) mkdir foo
 4631		 * 6) fsync foo or some file inside foo
 4632		 */
 4633		if (last_unlink_trans >= trans->transid)
 4634			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
 4635	}
 4636out:
 4637	btrfs_end_transaction(trans);
 4638out_notrans:
 4639	btrfs_btree_balance_dirty(fs_info);
 4640	fscrypt_free_filename(&fname);
 4641
 4642	return err;
 4643}
 4644
 4645/*
 4646 * Read, zero a chunk and write a block.
 4647 *
 4648 * @inode - inode that we're zeroing
 4649 * @from - the offset to start zeroing
 4650 * @len - the length to zero, 0 to zero the entire range respective to the
 4651 *	offset
 4652 * @front - zero up to the offset instead of from the offset on
 4653 *
 4654 * This will find the block for the "from" offset and cow the block and zero the
 4655 * part we want to zero.  This is used with truncate and hole punching.
 4656 */
 4657int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
 4658			 int front)
 4659{
 4660	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 4661	struct address_space *mapping = inode->vfs_inode.i_mapping;
 4662	struct extent_io_tree *io_tree = &inode->io_tree;
 4663	struct btrfs_ordered_extent *ordered;
 4664	struct extent_state *cached_state = NULL;
 4665	struct extent_changeset *data_reserved = NULL;
 4666	bool only_release_metadata = false;
 4667	u32 blocksize = fs_info->sectorsize;
 4668	pgoff_t index = from >> PAGE_SHIFT;
 4669	unsigned offset = from & (blocksize - 1);
 4670	struct page *page;
 4671	gfp_t mask = btrfs_alloc_write_mask(mapping);
 4672	size_t write_bytes = blocksize;
 4673	int ret = 0;
 4674	u64 block_start;
 4675	u64 block_end;
 4676
 4677	if (IS_ALIGNED(offset, blocksize) &&
 4678	    (!len || IS_ALIGNED(len, blocksize)))
 4679		goto out;
 4680
 4681	block_start = round_down(from, blocksize);
 4682	block_end = block_start + blocksize - 1;
 4683
 4684	ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
 4685					  blocksize, false);
 4686	if (ret < 0) {
 4687		if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
 4688			/* For nocow case, no need to reserve data space */
 4689			only_release_metadata = true;
 4690		} else {
 4691			goto out;
 4692		}
 4693	}
 4694	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
 4695	if (ret < 0) {
 4696		if (!only_release_metadata)
 4697			btrfs_free_reserved_data_space(inode, data_reserved,
 4698						       block_start, blocksize);
 4699		goto out;
 4700	}
 4701again:
 4702	page = find_or_create_page(mapping, index, mask);
 4703	if (!page) {
 4704		btrfs_delalloc_release_space(inode, data_reserved, block_start,
 4705					     blocksize, true);
 4706		btrfs_delalloc_release_extents(inode, blocksize);
 4707		ret = -ENOMEM;
 4708		goto out;
 4709	}
 4710
 4711	if (!PageUptodate(page)) {
 4712		ret = btrfs_read_folio(NULL, page_folio(page));
 4713		lock_page(page);
 4714		if (page->mapping != mapping) {
 4715			unlock_page(page);
 4716			put_page(page);
 4717			goto again;
 4718		}
 4719		if (!PageUptodate(page)) {
 4720			ret = -EIO;
 4721			goto out_unlock;
 4722		}
 4723	}
 4724
 4725	/*
 4726	 * We unlock the page after the io is completed and then re-lock it
 4727	 * above.  release_folio() could have come in between that and cleared
 4728	 * PagePrivate(), but left the page in the mapping.  Set the page mapped
 4729	 * here to make sure it's properly set for the subpage stuff.
 4730	 */
 4731	ret = set_page_extent_mapped(page);
 4732	if (ret < 0)
 4733		goto out_unlock;
 4734
 4735	wait_on_page_writeback(page);
 4736
 4737	lock_extent(io_tree, block_start, block_end, &cached_state);
 4738
 4739	ordered = btrfs_lookup_ordered_extent(inode, block_start);
 4740	if (ordered) {
 4741		unlock_extent(io_tree, block_start, block_end, &cached_state);
 4742		unlock_page(page);
 4743		put_page(page);
 4744		btrfs_start_ordered_extent(ordered);
 4745		btrfs_put_ordered_extent(ordered);
 4746		goto again;
 4747	}
 4748
 4749	clear_extent_bit(&inode->io_tree, block_start, block_end,
 4750			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 4751			 &cached_state);
 4752
 4753	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
 4754					&cached_state);
 4755	if (ret) {
 4756		unlock_extent(io_tree, block_start, block_end, &cached_state);
 4757		goto out_unlock;
 4758	}
 4759
 4760	if (offset != blocksize) {
 4761		if (!len)
 4762			len = blocksize - offset;
 4763		if (front)
 4764			memzero_page(page, (block_start - page_offset(page)),
 4765				     offset);
 4766		else
 4767			memzero_page(page, (block_start - page_offset(page)) + offset,
 4768				     len);
 4769	}
 4770	btrfs_page_clear_checked(fs_info, page, block_start,
 4771				 block_end + 1 - block_start);
 4772	btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
 4773	unlock_extent(io_tree, block_start, block_end, &cached_state);
 4774
 4775	if (only_release_metadata)
 4776		set_extent_bit(&inode->io_tree, block_start, block_end,
 4777			       EXTENT_NORESERVE, NULL);
 4778
 4779out_unlock:
 4780	if (ret) {
 4781		if (only_release_metadata)
 4782			btrfs_delalloc_release_metadata(inode, blocksize, true);
 4783		else
 4784			btrfs_delalloc_release_space(inode, data_reserved,
 4785					block_start, blocksize, true);
 4786	}
 4787	btrfs_delalloc_release_extents(inode, blocksize);
 4788	unlock_page(page);
 4789	put_page(page);
 4790out:
 4791	if (only_release_metadata)
 4792		btrfs_check_nocow_unlock(inode);
 4793	extent_changeset_free(data_reserved);
 4794	return ret;
 4795}
 4796
 4797static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
 4798{
 4799	struct btrfs_root *root = inode->root;
 4800	struct btrfs_fs_info *fs_info = root->fs_info;
 4801	struct btrfs_trans_handle *trans;
 4802	struct btrfs_drop_extents_args drop_args = { 0 };
 4803	int ret;
 4804
 4805	/*
 4806	 * If NO_HOLES is enabled, we don't need to do anything.
 4807	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
 4808	 * or btrfs_update_inode() will be called, which guarantee that the next
 4809	 * fsync will know this inode was changed and needs to be logged.
 4810	 */
 4811	if (btrfs_fs_incompat(fs_info, NO_HOLES))
 4812		return 0;
 4813
 4814	/*
 4815	 * 1 - for the one we're dropping
 4816	 * 1 - for the one we're adding
 4817	 * 1 - for updating the inode.
 4818	 */
 4819	trans = btrfs_start_transaction(root, 3);
 4820	if (IS_ERR(trans))
 4821		return PTR_ERR(trans);
 4822
 4823	drop_args.start = offset;
 4824	drop_args.end = offset + len;
 4825	drop_args.drop_cache = true;
 4826
 4827	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
 4828	if (ret) {
 4829		btrfs_abort_transaction(trans, ret);
 4830		btrfs_end_transaction(trans);
 4831		return ret;
 4832	}
 4833
 4834	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
 4835	if (ret) {
 4836		btrfs_abort_transaction(trans, ret);
 4837	} else {
 4838		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
 4839		btrfs_update_inode(trans, inode);
 4840	}
 4841	btrfs_end_transaction(trans);
 4842	return ret;
 4843}
 4844
 4845/*
 4846 * This function puts in dummy file extents for the area we're creating a hole
 4847 * for.  So if we are truncating this file to a larger size we need to insert
 4848 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
 4849 * the range between oldsize and size
 4850 */
 4851int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
 4852{
 4853	struct btrfs_root *root = inode->root;
 4854	struct btrfs_fs_info *fs_info = root->fs_info;
 4855	struct extent_io_tree *io_tree = &inode->io_tree;
 4856	struct extent_map *em = NULL;
 4857	struct extent_state *cached_state = NULL;
 4858	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
 4859	u64 block_end = ALIGN(size, fs_info->sectorsize);
 4860	u64 last_byte;
 4861	u64 cur_offset;
 4862	u64 hole_size;
 4863	int err = 0;
 4864
 4865	/*
 4866	 * If our size started in the middle of a block we need to zero out the
 4867	 * rest of the block before we expand the i_size, otherwise we could
 4868	 * expose stale data.
 4869	 */
 4870	err = btrfs_truncate_block(inode, oldsize, 0, 0);
 4871	if (err)
 4872		return err;
 4873
 4874	if (size <= hole_start)
 4875		return 0;
 4876
 4877	btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
 4878					   &cached_state);
 4879	cur_offset = hole_start;
 4880	while (1) {
 4881		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
 4882				      block_end - cur_offset);
 4883		if (IS_ERR(em)) {
 4884			err = PTR_ERR(em);
 4885			em = NULL;
 4886			break;
 4887		}
 4888		last_byte = min(extent_map_end(em), block_end);
 4889		last_byte = ALIGN(last_byte, fs_info->sectorsize);
 4890		hole_size = last_byte - cur_offset;
 4891
 4892		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
 4893			struct extent_map *hole_em;
 4894
 4895			err = maybe_insert_hole(inode, cur_offset, hole_size);
 4896			if (err)
 4897				break;
 4898
 4899			err = btrfs_inode_set_file_extent_range(inode,
 4900							cur_offset, hole_size);
 4901			if (err)
 4902				break;
 4903
 4904			hole_em = alloc_extent_map();
 4905			if (!hole_em) {
 4906				btrfs_drop_extent_map_range(inode, cur_offset,
 4907						    cur_offset + hole_size - 1,
 4908						    false);
 4909				btrfs_set_inode_full_sync(inode);
 4910				goto next;
 4911			}
 4912			hole_em->start = cur_offset;
 4913			hole_em->len = hole_size;
 4914			hole_em->orig_start = cur_offset;
 4915
 4916			hole_em->block_start = EXTENT_MAP_HOLE;
 4917			hole_em->block_len = 0;
 4918			hole_em->orig_block_len = 0;
 4919			hole_em->ram_bytes = hole_size;
 4920			hole_em->compress_type = BTRFS_COMPRESS_NONE;
 4921			hole_em->generation = btrfs_get_fs_generation(fs_info);
 4922
 4923			err = btrfs_replace_extent_map_range(inode, hole_em, true);
 4924			free_extent_map(hole_em);
 4925		} else {
 4926			err = btrfs_inode_set_file_extent_range(inode,
 4927							cur_offset, hole_size);
 4928			if (err)
 4929				break;
 4930		}
 4931next:
 4932		free_extent_map(em);
 4933		em = NULL;
 4934		cur_offset = last_byte;
 4935		if (cur_offset >= block_end)
 4936			break;
 4937	}
 4938	free_extent_map(em);
 4939	unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
 4940	return err;
 4941}
 4942
 4943static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 4944{
 4945	struct btrfs_root *root = BTRFS_I(inode)->root;
 4946	struct btrfs_trans_handle *trans;
 4947	loff_t oldsize = i_size_read(inode);
 4948	loff_t newsize = attr->ia_size;
 4949	int mask = attr->ia_valid;
 4950	int ret;
 4951
 4952	/*
 4953	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
 4954	 * special case where we need to update the times despite not having
 4955	 * these flags set.  For all other operations the VFS set these flags
 4956	 * explicitly if it wants a timestamp update.
 4957	 */
 4958	if (newsize != oldsize) {
 4959		inode_inc_iversion(inode);
 4960		if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
 4961			inode_set_mtime_to_ts(inode,
 4962					      inode_set_ctime_current(inode));
 4963		}
 4964	}
 4965
 4966	if (newsize > oldsize) {
 4967		/*
 4968		 * Don't do an expanding truncate while snapshotting is ongoing.
 4969		 * This is to ensure the snapshot captures a fully consistent
 4970		 * state of this file - if the snapshot captures this expanding
 4971		 * truncation, it must capture all writes that happened before
 4972		 * this truncation.
 4973		 */
 4974		btrfs_drew_write_lock(&root->snapshot_lock);
 4975		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
 4976		if (ret) {
 4977			btrfs_drew_write_unlock(&root->snapshot_lock);
 4978			return ret;
 4979		}
 4980
 4981		trans = btrfs_start_transaction(root, 1);
 4982		if (IS_ERR(trans)) {
 4983			btrfs_drew_write_unlock(&root->snapshot_lock);
 4984			return PTR_ERR(trans);
 4985		}
 4986
 4987		i_size_write(inode, newsize);
 4988		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
 4989		pagecache_isize_extended(inode, oldsize, newsize);
 4990		ret = btrfs_update_inode(trans, BTRFS_I(inode));
 4991		btrfs_drew_write_unlock(&root->snapshot_lock);
 4992		btrfs_end_transaction(trans);
 4993	} else {
 4994		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 4995
 4996		if (btrfs_is_zoned(fs_info)) {
 4997			ret = btrfs_wait_ordered_range(inode,
 4998					ALIGN(newsize, fs_info->sectorsize),
 4999					(u64)-1);
 5000			if (ret)
 5001				return ret;
 5002		}
 5003
 5004		/*
 5005		 * We're truncating a file that used to have good data down to
 5006		 * zero. Make sure any new writes to the file get on disk
 5007		 * on close.
 5008		 */
 5009		if (newsize == 0)
 5010			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
 5011				&BTRFS_I(inode)->runtime_flags);
 5012
 5013		truncate_setsize(inode, newsize);
 5014
 5015		inode_dio_wait(inode);
 5016
 5017		ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
 5018		if (ret && inode->i_nlink) {
 5019			int err;
 5020
 5021			/*
 5022			 * Truncate failed, so fix up the in-memory size. We
 5023			 * adjusted disk_i_size down as we removed extents, so
 5024			 * wait for disk_i_size to be stable and then update the
 5025			 * in-memory size to match.
 5026			 */
 5027			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
 5028			if (err)
 5029				return err;
 5030			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
 5031		}
 5032	}
 5033
 5034	return ret;
 5035}
 5036
 5037static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 5038			 struct iattr *attr)
 5039{
 5040	struct inode *inode = d_inode(dentry);
 5041	struct btrfs_root *root = BTRFS_I(inode)->root;
 5042	int err;
 5043
 5044	if (btrfs_root_readonly(root))
 5045		return -EROFS;
 5046
 5047	err = setattr_prepare(idmap, dentry, attr);
 5048	if (err)
 5049		return err;
 5050
 5051	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
 5052		err = btrfs_setsize(inode, attr);
 5053		if (err)
 5054			return err;
 5055	}
 5056
 5057	if (attr->ia_valid) {
 5058		setattr_copy(idmap, inode, attr);
 5059		inode_inc_iversion(inode);
 5060		err = btrfs_dirty_inode(BTRFS_I(inode));
 5061
 5062		if (!err && attr->ia_valid & ATTR_MODE)
 5063			err = posix_acl_chmod(idmap, dentry, inode->i_mode);
 5064	}
 5065
 5066	return err;
 5067}
 5068
 5069/*
 5070 * While truncating the inode pages during eviction, we get the VFS
 5071 * calling btrfs_invalidate_folio() against each folio of the inode. This
 5072 * is slow because the calls to btrfs_invalidate_folio() result in a
 5073 * huge amount of calls to lock_extent() and clear_extent_bit(),
 5074 * which keep merging and splitting extent_state structures over and over,
 5075 * wasting lots of time.
 5076 *
 5077 * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
 5078 * skip all those expensive operations on a per folio basis and do only
 5079 * the ordered io finishing, while we release here the extent_map and
 5080 * extent_state structures, without the excessive merging and splitting.
 5081 */
 5082static void evict_inode_truncate_pages(struct inode *inode)
 5083{
 5084	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 5085	struct rb_node *node;
 5086
 5087	ASSERT(inode->i_state & I_FREEING);
 5088	truncate_inode_pages_final(&inode->i_data);
 5089
 5090	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
 5091
 5092	/*
 5093	 * Keep looping until we have no more ranges in the io tree.
 5094	 * We can have ongoing bios started by readahead that have
 5095	 * their endio callback (extent_io.c:end_bio_extent_readpage)
 5096	 * still in progress (unlocked the pages in the bio but did not yet
 5097	 * unlocked the ranges in the io tree). Therefore this means some
 5098	 * ranges can still be locked and eviction started because before
 5099	 * submitting those bios, which are executed by a separate task (work
 5100	 * queue kthread), inode references (inode->i_count) were not taken
 5101	 * (which would be dropped in the end io callback of each bio).
 5102	 * Therefore here we effectively end up waiting for those bios and
 5103	 * anyone else holding locked ranges without having bumped the inode's
 5104	 * reference count - if we don't do it, when they access the inode's
 5105	 * io_tree to unlock a range it may be too late, leading to an
 5106	 * use-after-free issue.
 5107	 */
 5108	spin_lock(&io_tree->lock);
 5109	while (!RB_EMPTY_ROOT(&io_tree->state)) {
 5110		struct extent_state *state;
 5111		struct extent_state *cached_state = NULL;
 5112		u64 start;
 5113		u64 end;
 5114		unsigned state_flags;
 5115
 5116		node = rb_first(&io_tree->state);
 5117		state = rb_entry(node, struct extent_state, rb_node);
 5118		start = state->start;
 5119		end = state->end;
 5120		state_flags = state->state;
 5121		spin_unlock(&io_tree->lock);
 5122
 5123		lock_extent(io_tree, start, end, &cached_state);
 5124
 5125		/*
 5126		 * If still has DELALLOC flag, the extent didn't reach disk,
 5127		 * and its reserved space won't be freed by delayed_ref.
 5128		 * So we need to free its reserved space here.
 5129		 * (Refer to comment in btrfs_invalidate_folio, case 2)
 5130		 *
 5131		 * Note, end is the bytenr of last byte, so we need + 1 here.
 5132		 */
 5133		if (state_flags & EXTENT_DELALLOC)
 5134			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
 5135					       end - start + 1, NULL);
 5136
 5137		clear_extent_bit(io_tree, start, end,
 5138				 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
 5139				 &cached_state);
 5140
 5141		cond_resched();
 5142		spin_lock(&io_tree->lock);
 5143	}
 5144	spin_unlock(&io_tree->lock);
 5145}
 5146
 5147static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
 5148							struct btrfs_block_rsv *rsv)
 5149{
 5150	struct btrfs_fs_info *fs_info = root->fs_info;
 5151	struct btrfs_trans_handle *trans;
 5152	u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
 5153	int ret;
 5154
 5155	/*
 5156	 * Eviction should be taking place at some place safe because of our
 5157	 * delayed iputs.  However the normal flushing code will run delayed
 5158	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
 5159	 *
 5160	 * We reserve the delayed_refs_extra here again because we can't use
 5161	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
 5162	 * above.  We reserve our extra bit here because we generate a ton of
 5163	 * delayed refs activity by truncating.
 5164	 *
 5165	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
 5166	 * if we fail to make this reservation we can re-try without the
 5167	 * delayed_refs_extra so we can make some forward progress.
 5168	 */
 5169	ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
 5170				     BTRFS_RESERVE_FLUSH_EVICT);
 5171	if (ret) {
 5172		ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
 5173					     BTRFS_RESERVE_FLUSH_EVICT);
 5174		if (ret) {
 5175			btrfs_warn(fs_info,
 5176				   "could not allocate space for delete; will truncate on mount");
 5177			return ERR_PTR(-ENOSPC);
 5178		}
 5179		delayed_refs_extra = 0;
 5180	}
 5181
 5182	trans = btrfs_join_transaction(root);
 5183	if (IS_ERR(trans))
 5184		return trans;
 5185
 5186	if (delayed_refs_extra) {
 5187		trans->block_rsv = &fs_info->trans_block_rsv;
 5188		trans->bytes_reserved = delayed_refs_extra;
 5189		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
 5190					delayed_refs_extra, true);
 5191	}
 5192	return trans;
 5193}
 5194
 5195void btrfs_evict_inode(struct inode *inode)
 5196{
 5197	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 5198	struct btrfs_trans_handle *trans;
 5199	struct btrfs_root *root = BTRFS_I(inode)->root;
 5200	struct btrfs_block_rsv *rsv = NULL;
 5201	int ret;
 5202
 5203	trace_btrfs_inode_evict(inode);
 5204
 5205	if (!root) {
 5206		fsverity_cleanup_inode(inode);
 5207		clear_inode(inode);
 5208		return;
 5209	}
 5210
 5211	evict_inode_truncate_pages(inode);
 5212
 5213	if (inode->i_nlink &&
 5214	    ((btrfs_root_refs(&root->root_item) != 0 &&
 5215	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
 5216	     btrfs_is_free_space_inode(BTRFS_I(inode))))
 5217		goto out;
 5218
 5219	if (is_bad_inode(inode))
 5220		goto out;
 5221
 5222	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
 5223		goto out;
 5224
 5225	if (inode->i_nlink > 0) {
 5226		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
 5227		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
 5228		goto out;
 5229	}
 5230
 5231	/*
 5232	 * This makes sure the inode item in tree is uptodate and the space for
 5233	 * the inode update is released.
 5234	 */
 5235	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
 5236	if (ret)
 5237		goto out;
 5238
 5239	/*
 5240	 * This drops any pending insert or delete operations we have for this
 5241	 * inode.  We could have a delayed dir index deletion queued up, but
 5242	 * we're removing the inode completely so that'll be taken care of in
 5243	 * the truncate.
 5244	 */
 5245	btrfs_kill_delayed_inode_items(BTRFS_I(inode));
 5246
 5247	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
 5248	if (!rsv)
 5249		goto out;
 5250	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
 5251	rsv->failfast = true;
 5252
 5253	btrfs_i_size_write(BTRFS_I(inode), 0);
 5254
 5255	while (1) {
 5256		struct btrfs_truncate_control control = {
 5257			.inode = BTRFS_I(inode),
 5258			.ino = btrfs_ino(BTRFS_I(inode)),
 5259			.new_size = 0,
 5260			.min_type = 0,
 5261		};
 5262
 5263		trans = evict_refill_and_join(root, rsv);
 5264		if (IS_ERR(trans))
 5265			goto out;
 5266
 5267		trans->block_rsv = rsv;
 5268
 5269		ret = btrfs_truncate_inode_items(trans, root, &control);
 5270		trans->block_rsv = &fs_info->trans_block_rsv;
 5271		btrfs_end_transaction(trans);
 5272		/*
 5273		 * We have not added new delayed items for our inode after we
 5274		 * have flushed its delayed items, so no need to throttle on
 5275		 * delayed items. However we have modified extent buffers.
 5276		 */
 5277		btrfs_btree_balance_dirty_nodelay(fs_info);
 5278		if (ret && ret != -ENOSPC && ret != -EAGAIN)
 5279			goto out;
 5280		else if (!ret)
 5281			break;
 5282	}
 5283
 5284	/*
 5285	 * Errors here aren't a big deal, it just means we leave orphan items in
 5286	 * the tree. They will be cleaned up on the next mount. If the inode
 5287	 * number gets reused, cleanup deletes the orphan item without doing
 5288	 * anything, and unlink reuses the existing orphan item.
 5289	 *
 5290	 * If it turns out that we are dropping too many of these, we might want
 5291	 * to add a mechanism for retrying these after a commit.
 5292	 */
 5293	trans = evict_refill_and_join(root, rsv);
 5294	if (!IS_ERR(trans)) {
 5295		trans->block_rsv = rsv;
 5296		btrfs_orphan_del(trans, BTRFS_I(inode));
 5297		trans->block_rsv = &fs_info->trans_block_rsv;
 5298		btrfs_end_transaction(trans);
 5299	}
 5300
 5301out:
 5302	btrfs_free_block_rsv(fs_info, rsv);
 5303	/*
 5304	 * If we didn't successfully delete, the orphan item will still be in
 5305	 * the tree and we'll retry on the next mount. Again, we might also want
 5306	 * to retry these periodically in the future.
 5307	 */
 5308	btrfs_remove_delayed_node(BTRFS_I(inode));
 5309	fsverity_cleanup_inode(inode);
 5310	clear_inode(inode);
 5311}
 5312
 5313/*
 5314 * Return the key found in the dir entry in the location pointer, fill @type
 5315 * with BTRFS_FT_*, and return 0.
 5316 *
 5317 * If no dir entries were found, returns -ENOENT.
 5318 * If found a corrupted location in dir entry, returns -EUCLEAN.
 5319 */
 5320static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
 5321			       struct btrfs_key *location, u8 *type)
 5322{
 5323	struct btrfs_dir_item *di;
 5324	struct btrfs_path *path;
 5325	struct btrfs_root *root = dir->root;
 5326	int ret = 0;
 5327	struct fscrypt_name fname;
 5328
 5329	path = btrfs_alloc_path();
 5330	if (!path)
 5331		return -ENOMEM;
 5332
 5333	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
 5334	if (ret < 0)
 5335		goto out;
 5336	/*
 5337	 * fscrypt_setup_filename() should never return a positive value, but
 5338	 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
 5339	 */
 5340	ASSERT(ret == 0);
 5341
 5342	/* This needs to handle no-key deletions later on */
 5343
 5344	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
 5345				   &fname.disk_name, 0);
 5346	if (IS_ERR_OR_NULL(di)) {
 5347		ret = di ? PTR_ERR(di) : -ENOENT;
 5348		goto out;
 5349	}
 5350
 5351	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
 5352	if (location->type != BTRFS_INODE_ITEM_KEY &&
 5353	    location->type != BTRFS_ROOT_ITEM_KEY) {
 5354		ret = -EUCLEAN;
 5355		btrfs_warn(root->fs_info,
 5356"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
 5357			   __func__, fname.disk_name.name, btrfs_ino(dir),
 5358			   location->objectid, location->type, location->offset);
 5359	}
 5360	if (!ret)
 5361		*type = btrfs_dir_ftype(path->nodes[0], di);
 5362out:
 5363	fscrypt_free_filename(&fname);
 5364	btrfs_free_path(path);
 5365	return ret;
 5366}
 5367
 5368/*
 5369 * when we hit a tree root in a directory, the btrfs part of the inode
 5370 * needs to be changed to reflect the root directory of the tree root.  This
 5371 * is kind of like crossing a mount point.
 5372 */
 5373static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
 5374				    struct btrfs_inode *dir,
 5375				    struct dentry *dentry,
 5376				    struct btrfs_key *location,
 5377				    struct btrfs_root **sub_root)
 5378{
 5379	struct btrfs_path *path;
 5380	struct btrfs_root *new_root;
 5381	struct btrfs_root_ref *ref;
 5382	struct extent_buffer *leaf;
 5383	struct btrfs_key key;
 5384	int ret;
 5385	int err = 0;
 5386	struct fscrypt_name fname;
 5387
 5388	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
 5389	if (ret)
 5390		return ret;
 5391
 5392	path = btrfs_alloc_path();
 5393	if (!path) {
 5394		err = -ENOMEM;
 5395		goto out;
 5396	}
 5397
 5398	err = -ENOENT;
 5399	key.objectid = dir->root->root_key.objectid;
 5400	key.type = BTRFS_ROOT_REF_KEY;
 5401	key.offset = location->objectid;
 5402
 5403	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
 5404	if (ret) {
 5405		if (ret < 0)
 5406			err = ret;
 5407		goto out;
 5408	}
 5409
 5410	leaf = path->nodes[0];
 5411	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
 5412	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
 5413	    btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
 5414		goto out;
 5415
 5416	ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
 5417				   (unsigned long)(ref + 1), fname.disk_name.len);
 5418	if (ret)
 5419		goto out;
 5420
 5421	btrfs_release_path(path);
 5422
 5423	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
 5424	if (IS_ERR(new_root)) {
 5425		err = PTR_ERR(new_root);
 5426		goto out;
 5427	}
 5428
 5429	*sub_root = new_root;
 5430	location->objectid = btrfs_root_dirid(&new_root->root_item);
 5431	location->type = BTRFS_INODE_ITEM_KEY;
 5432	location->offset = 0;
 5433	err = 0;
 5434out:
 5435	btrfs_free_path(path);
 5436	fscrypt_free_filename(&fname);
 5437	return err;
 5438}
 5439
 5440static void inode_tree_add(struct btrfs_inode *inode)
 5441{
 5442	struct btrfs_root *root = inode->root;
 5443	struct btrfs_inode *entry;
 5444	struct rb_node **p;
 5445	struct rb_node *parent;
 5446	struct rb_node *new = &inode->rb_node;
 5447	u64 ino = btrfs_ino(inode);
 5448
 5449	if (inode_unhashed(&inode->vfs_inode))
 5450		return;
 5451	parent = NULL;
 5452	spin_lock(&root->inode_lock);
 5453	p = &root->inode_tree.rb_node;
 5454	while (*p) {
 5455		parent = *p;
 5456		entry = rb_entry(parent, struct btrfs_inode, rb_node);
 5457
 5458		if (ino < btrfs_ino(entry))
 5459			p = &parent->rb_left;
 5460		else if (ino > btrfs_ino(entry))
 5461			p = &parent->rb_right;
 5462		else {
 5463			WARN_ON(!(entry->vfs_inode.i_state &
 5464				  (I_WILL_FREE | I_FREEING)));
 5465			rb_replace_node(parent, new, &root->inode_tree);
 5466			RB_CLEAR_NODE(parent);
 5467			spin_unlock(&root->inode_lock);
 5468			return;
 5469		}
 5470	}
 5471	rb_link_node(new, parent, p);
 5472	rb_insert_color(new, &root->inode_tree);
 5473	spin_unlock(&root->inode_lock);
 5474}
 5475
 5476static void inode_tree_del(struct btrfs_inode *inode)
 5477{
 5478	struct btrfs_root *root = inode->root;
 5479	int empty = 0;
 5480
 5481	spin_lock(&root->inode_lock);
 5482	if (!RB_EMPTY_NODE(&inode->rb_node)) {
 5483		rb_erase(&inode->rb_node, &root->inode_tree);
 5484		RB_CLEAR_NODE(&inode->rb_node);
 5485		empty = RB_EMPTY_ROOT(&root->inode_tree);
 5486	}
 5487	spin_unlock(&root->inode_lock);
 5488
 5489	if (empty && btrfs_root_refs(&root->root_item) == 0) {
 5490		spin_lock(&root->inode_lock);
 5491		empty = RB_EMPTY_ROOT(&root->inode_tree);
 5492		spin_unlock(&root->inode_lock);
 5493		if (empty)
 5494			btrfs_add_dead_root(root);
 5495	}
 5496}
 5497
 5498
 5499static int btrfs_init_locked_inode(struct inode *inode, void *p)
 5500{
 5501	struct btrfs_iget_args *args = p;
 5502
 5503	inode->i_ino = args->ino;
 5504	BTRFS_I(inode)->location.objectid = args->ino;
 5505	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
 5506	BTRFS_I(inode)->location.offset = 0;
 5507	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
 5508	BUG_ON(args->root && !BTRFS_I(inode)->root);
 5509
 5510	if (args->root && args->root == args->root->fs_info->tree_root &&
 5511	    args->ino != BTRFS_BTREE_INODE_OBJECTID)
 5512		set_bit(BTRFS_INODE_FREE_SPACE_INODE,
 5513			&BTRFS_I(inode)->runtime_flags);
 5514	return 0;
 5515}
 5516
 5517static int btrfs_find_actor(struct inode *inode, void *opaque)
 5518{
 5519	struct btrfs_iget_args *args = opaque;
 5520
 5521	return args->ino == BTRFS_I(inode)->location.objectid &&
 5522		args->root == BTRFS_I(inode)->root;
 5523}
 5524
 5525static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
 5526				       struct btrfs_root *root)
 5527{
 5528	struct inode *inode;
 5529	struct btrfs_iget_args args;
 5530	unsigned long hashval = btrfs_inode_hash(ino, root);
 5531
 5532	args.ino = ino;
 5533	args.root = root;
 5534
 5535	inode = iget5_locked(s, hashval, btrfs_find_actor,
 5536			     btrfs_init_locked_inode,
 5537			     (void *)&args);
 5538	return inode;
 5539}
 5540
 5541/*
 5542 * Get an inode object given its inode number and corresponding root.
 5543 * Path can be preallocated to prevent recursing back to iget through
 5544 * allocator. NULL is also valid but may require an additional allocation
 5545 * later.
 5546 */
 5547struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
 5548			      struct btrfs_root *root, struct btrfs_path *path)
 5549{
 5550	struct inode *inode;
 5551
 5552	inode = btrfs_iget_locked(s, ino, root);
 5553	if (!inode)
 5554		return ERR_PTR(-ENOMEM);
 5555
 5556	if (inode->i_state & I_NEW) {
 5557		int ret;
 5558
 5559		ret = btrfs_read_locked_inode(inode, path);
 5560		if (!ret) {
 5561			inode_tree_add(BTRFS_I(inode));
 5562			unlock_new_inode(inode);
 5563		} else {
 5564			iget_failed(inode);
 5565			/*
 5566			 * ret > 0 can come from btrfs_search_slot called by
 5567			 * btrfs_read_locked_inode, this means the inode item
 5568			 * was not found.
 5569			 */
 5570			if (ret > 0)
 5571				ret = -ENOENT;
 5572			inode = ERR_PTR(ret);
 5573		}
 5574	}
 5575
 5576	return inode;
 5577}
 5578
 5579struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
 5580{
 5581	return btrfs_iget_path(s, ino, root, NULL);
 5582}
 5583
 5584static struct inode *new_simple_dir(struct inode *dir,
 5585				    struct btrfs_key *key,
 5586				    struct btrfs_root *root)
 5587{
 5588	struct timespec64 ts;
 5589	struct inode *inode = new_inode(dir->i_sb);
 5590
 5591	if (!inode)
 5592		return ERR_PTR(-ENOMEM);
 5593
 5594	BTRFS_I(inode)->root = btrfs_grab_root(root);
 5595	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
 5596	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
 5597
 5598	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
 5599	/*
 5600	 * We only need lookup, the rest is read-only and there's no inode
 5601	 * associated with the dentry
 5602	 */
 5603	inode->i_op = &simple_dir_inode_operations;
 5604	inode->i_opflags &= ~IOP_XATTR;
 5605	inode->i_fop = &simple_dir_operations;
 5606	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
 5607
 5608	ts = inode_set_ctime_current(inode);
 5609	inode_set_mtime_to_ts(inode, ts);
 5610	inode_set_atime_to_ts(inode, inode_get_atime(dir));
 5611	BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
 5612	BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
 5613
 5614	inode->i_uid = dir->i_uid;
 5615	inode->i_gid = dir->i_gid;
 5616
 5617	return inode;
 5618}
 5619
/*
 * The btrfs file type values must stay numerically identical to the generic
 * FT_* values: btrfs_inode_type() below returns the result of
 * fs_umode_to_ftype() and that value is compared against, and passed around
 * as, a btrfs directory entry type.
 */
static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
static_assert(BTRFS_FT_DIR == FT_DIR);
static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
static_assert(BTRFS_FT_FIFO == FT_FIFO);
static_assert(BTRFS_FT_SOCK == FT_SOCK);
static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
 5628
 5629static inline u8 btrfs_inode_type(struct inode *inode)
 5630{
 5631	return fs_umode_to_ftype(inode->i_mode);
 5632}
 5633
 5634struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 5635{
 5636	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 5637	struct inode *inode;
 5638	struct btrfs_root *root = BTRFS_I(dir)->root;
 5639	struct btrfs_root *sub_root = root;
 5640	struct btrfs_key location;
 5641	u8 di_type = 0;
 5642	int ret = 0;
 5643
 5644	if (dentry->d_name.len > BTRFS_NAME_LEN)
 5645		return ERR_PTR(-ENAMETOOLONG);
 5646
 5647	ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
 5648	if (ret < 0)
 5649		return ERR_PTR(ret);
 5650
 5651	if (location.type == BTRFS_INODE_ITEM_KEY) {
 5652		inode = btrfs_iget(dir->i_sb, location.objectid, root);
 5653		if (IS_ERR(inode))
 5654			return inode;
 5655
 5656		/* Do extra check against inode mode with di_type */
 5657		if (btrfs_inode_type(inode) != di_type) {
 5658			btrfs_crit(fs_info,
 5659"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
 5660				  inode->i_mode, btrfs_inode_type(inode),
 5661				  di_type);
 5662			iput(inode);
 5663			return ERR_PTR(-EUCLEAN);
 5664		}
 5665		return inode;
 5666	}
 5667
 5668	ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
 5669				       &location, &sub_root);
 5670	if (ret < 0) {
 5671		if (ret != -ENOENT)
 5672			inode = ERR_PTR(ret);
 5673		else
 5674			inode = new_simple_dir(dir, &location, root);
 5675	} else {
 5676		inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
 5677		btrfs_put_root(sub_root);
 5678
 5679		if (IS_ERR(inode))
 5680			return inode;
 5681
 5682		down_read(&fs_info->cleanup_work_sem);
 5683		if (!sb_rdonly(inode->i_sb))
 5684			ret = btrfs_orphan_cleanup(sub_root);
 5685		up_read(&fs_info->cleanup_work_sem);
 5686		if (ret) {
 5687			iput(inode);
 5688			inode = ERR_PTR(ret);
 5689		}
 5690	}
 5691
 5692	return inode;
 5693}
 5694
 5695static int btrfs_dentry_delete(const struct dentry *dentry)
 5696{
 5697	struct btrfs_root *root;
 5698	struct inode *inode = d_inode(dentry);
 5699
 5700	if (!inode && !IS_ROOT(dentry))
 5701		inode = d_inode(dentry->d_parent);
 5702
 5703	if (inode) {
 5704		root = BTRFS_I(inode)->root;
 5705		if (btrfs_root_refs(&root->root_item) == 0)
 5706			return 1;
 5707
 5708		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
 5709			return 1;
 5710	}
 5711	return 0;
 5712}
 5713
 5714static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 5715				   unsigned int flags)
 5716{
 5717	struct inode *inode = btrfs_lookup_dentry(dir, dentry);
 5718
 5719	if (inode == ERR_PTR(-ENOENT))
 5720		inode = NULL;
 5721	return d_splice_alias(inode, dentry);
 5722}
 5723
 5724/*
 5725 * Find the highest existing sequence number in a directory and then set the
 5726 * in-memory index_cnt variable to the first free sequence number.
 5727 */
 5728static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
 5729{
 5730	struct btrfs_root *root = inode->root;
 5731	struct btrfs_key key, found_key;
 5732	struct btrfs_path *path;
 5733	struct extent_buffer *leaf;
 5734	int ret;
 5735
 5736	key.objectid = btrfs_ino(inode);
 5737	key.type = BTRFS_DIR_INDEX_KEY;
 5738	key.offset = (u64)-1;
 5739
 5740	path = btrfs_alloc_path();
 5741	if (!path)
 5742		return -ENOMEM;
 5743
 5744	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 5745	if (ret < 0)
 5746		goto out;
 5747	/* FIXME: we should be able to handle this */
 5748	if (ret == 0)
 5749		goto out;
 5750	ret = 0;
 5751
 5752	if (path->slots[0] == 0) {
 5753		inode->index_cnt = BTRFS_DIR_START_INDEX;
 5754		goto out;
 5755	}
 5756
 5757	path->slots[0]--;
 5758
 5759	leaf = path->nodes[0];
 5760	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 5761
 5762	if (found_key.objectid != btrfs_ino(inode) ||
 5763	    found_key.type != BTRFS_DIR_INDEX_KEY) {
 5764		inode->index_cnt = BTRFS_DIR_START_INDEX;
 5765		goto out;
 5766	}
 5767
 5768	inode->index_cnt = found_key.offset + 1;
 5769out:
 5770	btrfs_free_path(path);
 5771	return ret;
 5772}
 5773
 5774static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
 5775{
 5776	int ret = 0;
 5777
 5778	btrfs_inode_lock(dir, 0);
 5779	if (dir->index_cnt == (u64)-1) {
 5780		ret = btrfs_inode_delayed_dir_index_count(dir);
 5781		if (ret) {
 5782			ret = btrfs_set_inode_index_count(dir);
 5783			if (ret)
 5784				goto out;
 5785		}
 5786	}
 5787
 5788	/* index_cnt is the index number of next new entry, so decrement it. */
 5789	*index = dir->index_cnt - 1;
 5790out:
 5791	btrfs_inode_unlock(dir, 0);
 5792
 5793	return ret;
 5794}
 5795
 5796/*
 5797 * All this infrastructure exists because dir_emit can fault, and we are holding
 5798 * the tree lock when doing readdir.  For now just allocate a buffer and copy
 5799 * our information into that, and then dir_emit from the buffer.  This is
 5800 * similar to what NFS does, only we don't keep the buffer around in pagecache
 5801 * because I'm afraid I'll mess that up.  Long term we need to make filldir do
 5802 * copy_to_user_inatomic so we don't have to worry about page faulting under the
 5803 * tree lock.
 5804 */
 5805static int btrfs_opendir(struct inode *inode, struct file *file)
 5806{
 5807	struct btrfs_file_private *private;
 5808	u64 last_index;
 5809	int ret;
 5810
 5811	ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
 5812	if (ret)
 5813		return ret;
 5814
 5815	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
 5816	if (!private)
 5817		return -ENOMEM;
 5818	private->last_index = last_index;
 5819	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
 5820	if (!private->filldir_buf) {
 5821		kfree(private);
 5822		return -ENOMEM;
 5823	}
 5824	file->private_data = private;
 5825	return 0;
 5826}
 5827
 5828static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
 5829{
 5830	struct btrfs_file_private *private = file->private_data;
 5831	int ret;
 5832
 5833	ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
 5834				       &private->last_index);
 5835	if (ret)
 5836		return ret;
 5837
 5838	return generic_file_llseek(file, offset, whence);
 5839}
 5840
 5841struct dir_entry {
 5842	u64 ino;
 5843	u64 offset;
 5844	unsigned type;
 5845	int name_len;
 5846};
 5847
 5848static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
 5849{
 5850	while (entries--) {
 5851		struct dir_entry *entry = addr;
 5852		char *name = (char *)(entry + 1);
 5853
 5854		ctx->pos = get_unaligned(&entry->offset);
 5855		if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
 5856					 get_unaligned(&entry->ino),
 5857					 get_unaligned(&entry->type)))
 5858			return 1;
 5859		addr += sizeof(struct dir_entry) +
 5860			get_unaligned(&entry->name_len);
 5861		ctx->pos++;
 5862	}
 5863	return 0;
 5864}
 5865
/*
 * Emit the entries of the directory behind @file into @ctx, starting at
 * ctx->pos.
 *
 * DIR_INDEX items are walked in the tree and copied as packed struct
 * dir_entry records into the file's private bounce buffer; whenever the next
 * record would not fit, the path is released, the buffer is flushed through
 * btrfs_filldir() and the walk restarts from the updated ctx->pos.  After the
 * tree walk the entries on the delayed insertion list are emitted as well.
 *
 * Returns 0 on success, including when emission stopped early, -ENOMEM if no
 * path could be allocated, or the negative error hit while iterating.
 */
static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_private *private = file->private_data;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;
	void *addr;
	LIST_HEAD(ins_list);
	LIST_HEAD(del_list);
	int ret;
	char *name_ptr;
	int name_len;
	int entries = 0;
	int total_len = 0;
	bool put = false;
	struct btrfs_key location;

	/* "." and ".." come first; nothing more to do if they did not fit. */
	if (!dir_emit_dots(file, ctx))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	addr = private->filldir_buf;
	path->reada = READA_FORWARD;

	/*
	 * Fetch the delayed items for this directory, bounded by last_index.
	 * 'put' remembers whether they have to be released on exit.
	 */
	put = btrfs_readdir_get_delayed_items(inode, private->last_index,
					      &ins_list, &del_list);

again:
	/* (Re)start the walk at the current directory position. */
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = ctx->pos;
	key.objectid = btrfs_ino(BTRFS_I(inode));

	btrfs_for_each_slot(root, &key, &found_key, path, ret) {
		struct dir_entry *entry;
		struct extent_buffer *leaf = path->nodes[0];
		u8 ftype;

		/* Left this directory's DIR_INDEX items: done. */
		if (found_key.objectid != key.objectid)
			break;
		if (found_key.type != BTRFS_DIR_INDEX_KEY)
			break;
		/* Before the requested position: skip. */
		if (found_key.offset < ctx->pos)
			continue;
		/* Beyond the last index recorded for this open file: stop. */
		if (found_key.offset > private->last_index)
			break;
		/* Entry is on the delayed deletion list: skip. */
		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
			continue;
		di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
		name_len = btrfs_dir_name_len(leaf, di);
		/*
		 * This record would not fit in the bounce buffer any more.
		 * Release the path, flush what has been collected so far and
		 * restart the walk from the position filldir advanced to.
		 */
		if ((total_len + sizeof(struct dir_entry) + name_len) >=
		    PAGE_SIZE) {
			btrfs_release_path(path);
			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
			if (ret)
				goto nopos;
			addr = private->filldir_buf;
			entries = 0;
			total_len = 0;
			goto again;
		}

		/* Append one record: header followed directly by the name. */
		ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
		entry = addr;
		name_ptr = (char *)(entry + 1);
		read_extent_buffer(leaf, name_ptr,
				   (unsigned long)(di + 1), name_len);
		put_unaligned(name_len, &entry->name_len);
		put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
		btrfs_dir_item_key_to_cpu(leaf, di, &location);
		put_unaligned(location.objectid, &entry->ino);
		put_unaligned(found_key.offset, &entry->offset);
		entries++;
		addr += sizeof(struct dir_entry) + name_len;
		total_len += sizeof(struct dir_entry) + name_len;
	}
	/* Catch error encountered during iteration */
	if (ret < 0)
		goto err;

	btrfs_release_path(path);

	/* Flush whatever is left in the bounce buffer. */
	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
	if (ret)
		goto nopos;

	/* Then emit the entries from the delayed insertion list. */
	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
	if (ret)
		goto nopos;

	/*
	 * Stop new entries from being returned after we return the last
	 * entry.
	 *
	 * New directory entries are assigned a strictly increasing
	 * offset.  This means that new entries created during readdir
	 * are *guaranteed* to be seen in the future by that readdir.
	 * This has broken buggy programs which operate on names as
	 * they're returned by readdir.  Until we re-use freed offsets
	 * we have this hack to stop new entries from being returned
	 * under the assumption that they'll never reach this huge
	 * offset.
	 *
	 * This is being careful not to overflow 32bit loff_t unless the
	 * last entry requires it because doing so has broken 32bit apps
	 * in the past.
	 */
	if (ctx->pos >= INT_MAX)
		ctx->pos = LLONG_MAX;
	else
		ctx->pos = INT_MAX;
nopos:
	/* Emission stopped early; ctx->pos is left alone and 0 is returned. */
	ret = 0;
err:
	if (put)
		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
	btrfs_free_path(path);
	return ret;
}
 5990
 5991/*
 5992 * This is somewhat expensive, updating the tree every time the
 5993 * inode changes.  But, it is most likely to find the inode in cache.
 5994 * FIXME, needs more benchmarking...there are no reasons other than performance
 5995 * to keep or drop this code.
 5996 */
 5997static int btrfs_dirty_inode(struct btrfs_inode *inode)
 5998{
 5999	struct btrfs_root *root = inode->root;
 6000	struct btrfs_fs_info *fs_info = root->fs_info;
 6001	struct btrfs_trans_handle *trans;
 6002	int ret;
 6003
 6004	if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
 6005		return 0;
 6006
 6007	trans = btrfs_join_transaction(root);
 6008	if (IS_ERR(trans))
 6009		return PTR_ERR(trans);
 6010
 6011	ret = btrfs_update_inode(trans, inode);
 6012	if (ret == -ENOSPC || ret == -EDQUOT) {
 6013		/* whoops, lets try again with the full transaction */
 6014		btrfs_end_transaction(trans);
 6015		trans = btrfs_start_transaction(root, 1);
 6016		if (IS_ERR(trans))
 6017			return PTR_ERR(trans);
 6018
 6019		ret = btrfs_update_inode(trans, inode);
 6020	}
 6021	btrfs_end_transaction(trans);
 6022	if (inode->delayed_node)
 6023		btrfs_balance_delayed_items(fs_info);
 6024
 6025	return ret;
 6026}
 6027
 6028/*
 6029 * This is a copy of file_update_time.  We need this so we can return error on
 6030 * ENOSPC for updating the inode in the case of file write and mmap writes.
 6031 */
 6032static int btrfs_update_time(struct inode *inode, int flags)
 6033{
 6034	struct btrfs_root *root = BTRFS_I(inode)->root;
 6035	bool dirty;
 6036
 6037	if (btrfs_root_readonly(root))
 6038		return -EROFS;
 6039
 6040	dirty = inode_update_timestamps(inode, flags);
 6041	return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
 6042}
 6043
 6044/*
 6045 * helper to find a free sequence number in a given directory.  This current
 6046 * code is very simple, later versions will do smarter things in the btree
 6047 */
 6048int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
 6049{
 6050	int ret = 0;
 6051
 6052	if (dir->index_cnt == (u64)-1) {
 6053		ret = btrfs_inode_delayed_dir_index_count(dir);
 6054		if (ret) {
 6055			ret = btrfs_set_inode_index_count(dir);
 6056			if (ret)
 6057				return ret;
 6058		}
 6059	}
 6060
 6061	*index = dir->index_cnt;
 6062	dir->index_cnt++;
 6063
 6064	return ret;
 6065}
 6066
 6067static int btrfs_insert_inode_locked(struct inode *inode)
 6068{
 6069	struct btrfs_iget_args args;
 6070
 6071	args.ino = BTRFS_I(inode)->location.objectid;
 6072	args.root = BTRFS_I(inode)->root;
 6073
 6074	return insert_inode_locked4(inode,
 6075		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
 6076		   btrfs_find_actor, &args);
 6077}
 6078
 6079int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
 6080			    unsigned int *trans_num_items)
 6081{
 6082	struct inode *dir = args->dir;
 6083	struct inode *inode = args->inode;
 6084	int ret;
 6085
 6086	if (!args->orphan) {
 6087		ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
 6088					     &args->fname);
 6089		if (ret)
 6090			return ret;
 6091	}
 6092
 6093	ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
 6094	if (ret) {
 6095		fscrypt_free_filename(&args->fname);
 6096		return ret;
 6097	}
 6098
 6099	/* 1 to add inode item */
 6100	*trans_num_items = 1;
 6101	/* 1 to add compression property */
 6102	if (BTRFS_I(dir)->prop_compress)
 6103		(*trans_num_items)++;
 6104	/* 1 to add default ACL xattr */
 6105	if (args->default_acl)
 6106		(*trans_num_items)++;
 6107	/* 1 to add access ACL xattr */
 6108	if (args->acl)
 6109		(*trans_num_items)++;
 6110#ifdef CONFIG_SECURITY
 6111	/* 1 to add LSM xattr */
 6112	if (dir->i_security)
 6113		(*trans_num_items)++;
 6114#endif
 6115	if (args->orphan) {
 6116		/* 1 to add orphan item */
 6117		(*trans_num_items)++;
 6118	} else {
 6119		/*
 6120		 * 1 to add dir item
 6121		 * 1 to add dir index
 6122		 * 1 to update parent inode item
 6123		 *
 6124		 * No need for 1 unit for the inode ref item because it is
 6125		 * inserted in a batch together with the inode item at
 6126		 * btrfs_create_new_inode().
 6127		 */
 6128		*trans_num_items += 3;
 6129	}
 6130	return 0;
 6131}
 6132
 6133void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
 6134{
 6135	posix_acl_release(args->acl);
 6136	posix_acl_release(args->default_acl);
 6137	fscrypt_free_filename(&args->fname);
 6138}
 6139
 6140/*
 6141 * Inherit flags from the parent inode.
 6142 *
 6143 * Currently only the compression flags and the cow flags are inherited.
 6144 */
 6145static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
 6146{
 6147	unsigned int flags;
 6148
 6149	flags = dir->flags;
 6150
 6151	if (flags & BTRFS_INODE_NOCOMPRESS) {
 6152		inode->flags &= ~BTRFS_INODE_COMPRESS;
 6153		inode->flags |= BTRFS_INODE_NOCOMPRESS;
 6154	} else if (flags & BTRFS_INODE_COMPRESS) {
 6155		inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
 6156		inode->flags |= BTRFS_INODE_COMPRESS;
 6157	}
 6158
 6159	if (flags & BTRFS_INODE_NODATACOW) {
 6160		inode->flags |= BTRFS_INODE_NODATACOW;
 6161		if (S_ISREG(inode->vfs_inode.i_mode))
 6162			inode->flags |= BTRFS_INODE_NODATASUM;
 6163	}
 6164
 6165	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
 6166}
 6167
/*
 * Create the on-disk and in-memory state for a new inode inside a running
 * transaction.
 *
 * @trans: transaction handle
 * @args:  new inode arguments, as prepared by btrfs_new_inode_prepare()
 *
 * In order, this:
 *  - picks the root (the parent directory's root unless a subvolume is being
 *    created) and allocates a free objectid in it;
 *  - for a non-orphan inode, reserves a directory index in the parent;
 *  - inherits inode flags from the parent (not for subvolumes) and applies
 *    the NODATASUM/NODATACOW mount options to regular files;
 *  - inserts the inode into the inode hash;
 *  - inserts the inode item and, for non-orphans, the inode ref in one batch
 *    and fills them in;
 *  - inherits properties (a failure here is only logged), initialises
 *    security state (not for subvolumes), adds the inode to the root's inode
 *    tree and finally adds either the orphan item or the directory link.
 *
 * Returns 0 on success, otherwise the error of the failing step.  Failures
 * after the inode was hashed abort the transaction and discard the new inode;
 * the caller still owns its reference in every case.
 */
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_new_inode_args *args)
{
	struct timespec64 ts;
	struct inode *dir = args->dir;
	struct inode *inode = args->inode;
	const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_root *root;
	struct btrfs_inode_item *inode_item;
	struct btrfs_key *location;
	struct btrfs_path *path;
	u64 objectid;
	struct btrfs_inode_ref *ref;
	struct btrfs_key key[2];
	u32 sizes[2];
	struct btrfs_item_batch batch;
	unsigned long ptr;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* For a subvolume the inode's root has already been set by the caller. */
	if (!args->subvol)
		BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
	root = BTRFS_I(inode)->root;

	ret = btrfs_get_free_objectid(root, &objectid);
	if (ret)
		goto out;
	inode->i_ino = objectid;

	if (args->orphan) {
		/*
		 * O_TMPFILE, set link count to 0, so that after this point, we
		 * fill in an inode item with the correct link count.
		 */
		set_nlink(inode, 0);
	} else {
		trace_btrfs_inode_request(dir);

		/* Reserve the directory index this inode will be linked at. */
		ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
		if (ret)
			goto out;
	}
	/* index_cnt is ignored for everything but a dir. */
	BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
	BTRFS_I(inode)->generation = trans->transid;
	inode->i_generation = BTRFS_I(inode)->generation;

	/*
	 * Subvolumes don't inherit flags from their parent directory.
	 * Originally this was probably by accident, but we probably can't
	 * change it now without compatibility issues.
	 */
	if (!args->subvol)
		btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));

	/* Mount options apply to regular files only. */
	if (S_ISREG(inode->i_mode)) {
		if (btrfs_test_opt(fs_info, NODATASUM))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
		if (btrfs_test_opt(fs_info, NODATACOW))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
				BTRFS_INODE_NODATASUM;
	}

	location = &BTRFS_I(inode)->location;
	location->objectid = objectid;
	location->offset = 0;
	location->type = BTRFS_INODE_ITEM_KEY;

	ret = btrfs_insert_inode_locked(inode);
	if (ret < 0) {
		/* Give back the directory index reserved above. */
		if (!args->orphan)
			BTRFS_I(dir)->index_cnt--;
		goto out;
	}

	/*
	 * We could have gotten an inode number from somebody who was fsynced
	 * and then removed in this same transaction, so let's just set full
	 * sync since it will be a full sync anyway and this will blow away the
	 * old info in the log.
	 */
	btrfs_set_inode_full_sync(BTRFS_I(inode));

	key[0].objectid = objectid;
	key[0].type = BTRFS_INODE_ITEM_KEY;
	key[0].offset = 0;

	sizes[0] = sizeof(struct btrfs_inode_item);

	if (!args->orphan) {
		/*
		 * Start new inodes with an inode_ref. This is slightly more
		 * efficient for small numbers of hard links since they will
		 * be packed into one item. Extended refs will kick in if we
		 * add more hard links than can fit in the ref item.
		 */
		key[1].objectid = objectid;
		key[1].type = BTRFS_INODE_REF_KEY;
		if (args->subvol) {
			/* A subvolume root refers to itself, under the name "..". */
			key[1].offset = objectid;
			sizes[1] = 2 + sizeof(*ref);
		} else {
			key[1].offset = btrfs_ino(BTRFS_I(dir));
			sizes[1] = name->len + sizeof(*ref);
		}
	}

	/* One batch: the inode item plus, for non-orphans, the inode ref. */
	batch.keys = &key[0];
	batch.data_sizes = &sizes[0];
	batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
	batch.nr = args->orphan ? 1 : 2;
	ret = btrfs_insert_empty_items(trans, root, path, &batch);
	if (ret != 0) {
		btrfs_abort_transaction(trans, ret);
		goto discard;
	}

	ts = simple_inode_init_ts(inode);
	BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
	BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;

	/*
	 * We're going to fill the inode item now, so at this point the inode
	 * must be fully initialized.
	 */

	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				  struct btrfs_inode_item);
	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
			     sizeof(*inode_item));
	fill_inode_item(trans, path->nodes[0], inode_item, inode);

	if (!args->orphan) {
		/* The inode ref sits in the slot right after the inode item. */
		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
				     struct btrfs_inode_ref);
		ptr = (unsigned long)(ref + 1);
		if (args->subvol) {
			btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
			btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
			write_extent_buffer(path->nodes[0], "..", ptr, 2);
		} else {
			btrfs_set_inode_ref_name_len(path->nodes[0], ref,
						     name->len);
			btrfs_set_inode_ref_index(path->nodes[0], ref,
						  BTRFS_I(inode)->dir_index);
			write_extent_buffer(path->nodes[0], name->name, ptr,
					    name->len);
		}
	}

	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
	/*
	 * We don't need the path anymore, plus inheriting properties, adding
	 * ACLs, security xattrs, orphan item or adding the link, will result in
	 * allocating yet another path. So just free our path.
	 */
	btrfs_free_path(path);
	path = NULL;

	if (args->subvol) {
		struct inode *parent;

		/*
		 * Subvolumes inherit properties from their parent subvolume,
		 * not the directory they were created in.
		 */
		parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
				    BTRFS_I(dir)->root);
		if (IS_ERR(parent)) {
			ret = PTR_ERR(parent);
		} else {
			ret = btrfs_inode_inherit_props(trans, inode, parent);
			iput(parent);
		}
	} else {
		ret = btrfs_inode_inherit_props(trans, inode, dir);
	}
	/*
	 * A property inheritance failure is only reported; ret is overwritten
	 * further down and creation carries on.
	 */
	if (ret) {
		btrfs_err(fs_info,
			  "error inheriting props for ino %llu (root %llu): %d",
			  btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
			  ret);
	}

	/*
	 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
	 * probably a bug.
	 */
	if (!args->subvol) {
		ret = btrfs_init_inode_security(trans, args);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto discard;
		}
	}

	inode_tree_add(BTRFS_I(inode));

	trace_btrfs_inode_new(inode);
	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));

	btrfs_update_root_times(trans, root);

	if (args->orphan) {
		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	} else {
		ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
				     0, BTRFS_I(inode)->dir_index);
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto discard;
	}

	return 0;

discard:
	/*
	 * discard_new_inode() calls iput(), but the caller owns the reference
	 * to the inode.
	 */
	ihold(inode);
	discard_new_inode(inode);
out:
	/* path is NULL here if it was already freed above. */
	btrfs_free_path(path);
	return ret;
}
 6399
 6400/*
 6401 * utility function to add 'inode' into 'parent_inode' with
 6402 * a give name and a given sequence number.
 6403 * if 'add_backref' is true, also insert a backref from the
 6404 * inode to the parent directory.
 6405 */
 6406int btrfs_add_link(struct btrfs_trans_handle *trans,
 6407		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
 6408		   const struct fscrypt_str *name, int add_backref, u64 index)
 6409{
 6410	int ret = 0;
 6411	struct btrfs_key key;
 6412	struct btrfs_root *root = parent_inode->root;
 6413	u64 ino = btrfs_ino(inode);
 6414	u64 parent_ino = btrfs_ino(parent_inode);
 6415
 6416	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
 6417		memcpy(&key, &inode->root->root_key, sizeof(key));
 6418	} else {
 6419		key.objectid = ino;
 6420		key.type = BTRFS_INODE_ITEM_KEY;
 6421		key.offset = 0;
 6422	}
 6423
 6424	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
 6425		ret = btrfs_add_root_ref(trans, key.objectid,
 6426					 root->root_key.objectid, parent_ino,
 6427					 index, name);
 6428	} else if (add_backref) {
 6429		ret = btrfs_insert_inode_ref(trans, root, name,
 6430					     ino, parent_ino, index);
 6431	}
 6432
 6433	/* Nothing to clean up yet */
 6434	if (ret)
 6435		return ret;
 6436
 6437	ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
 6438				    btrfs_inode_type(&inode->vfs_inode), index);
 6439	if (ret == -EEXIST || ret == -EOVERFLOW)
 6440		goto fail_dir_item;
 6441	else if (ret) {
 6442		btrfs_abort_transaction(trans, ret);
 6443		return ret;
 6444	}
 6445
 6446	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
 6447			   name->len * 2);
 6448	inode_inc_iversion(&parent_inode->vfs_inode);
 6449	/*
 6450	 * If we are replaying a log tree, we do not want to update the mtime
 6451	 * and ctime of the parent directory with the current time, since the
 6452	 * log replay procedure is responsible for setting them to their correct
 6453	 * values (the ones it had when the fsync was done).
 6454	 */
 6455	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
 6456		inode_set_mtime_to_ts(&parent_inode->vfs_inode,
 6457				      inode_set_ctime_current(&parent_inode->vfs_inode));
 6458
 6459	ret = btrfs_update_inode(trans, parent_inode);
 6460	if (ret)
 6461		btrfs_abort_transaction(trans, ret);
 6462	return ret;
 6463
 6464fail_dir_item:
 6465	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
 6466		u64 local_index;
 6467		int err;
 6468		err = btrfs_del_root_ref(trans, key.objectid,
 6469					 root->root_key.objectid, parent_ino,
 6470					 &local_index, name);
 6471		if (err)
 6472			btrfs_abort_transaction(trans, err);
 6473	} else if (add_backref) {
 6474		u64 local_index;
 6475		int err;
 6476
 6477		err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
 6478					  &local_index);
 6479		if (err)
 6480			btrfs_abort_transaction(trans, err);
 6481	}
 6482
 6483	/* Return the original error code */
 6484	return ret;
 6485}
 6486
 6487static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
 6488			       struct inode *inode)
 6489{
 6490	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 6491	struct btrfs_root *root = BTRFS_I(dir)->root;
 6492	struct btrfs_new_inode_args new_inode_args = {
 6493		.dir = dir,
 6494		.dentry = dentry,
 6495		.inode = inode,
 6496	};
 6497	unsigned int trans_num_items;
 6498	struct btrfs_trans_handle *trans;
 6499	int err;
 6500
 6501	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
 6502	if (err)
 6503		goto out_inode;
 6504
 6505	trans = btrfs_start_transaction(root, trans_num_items);
 6506	if (IS_ERR(trans)) {
 6507		err = PTR_ERR(trans);
 6508		goto out_new_inode_args;
 6509	}
 6510
 6511	err = btrfs_create_new_inode(trans, &new_inode_args);
 6512	if (!err)
 6513		d_instantiate_new(dentry, inode);
 6514
 6515	btrfs_end_transaction(trans);
 6516	btrfs_btree_balance_dirty(fs_info);
 6517out_new_inode_args:
 6518	btrfs_new_inode_args_destroy(&new_inode_args);
 6519out_inode:
 6520	if (err)
 6521		iput(inode);
 6522	return err;
 6523}
 6524
 6525static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 6526		       struct dentry *dentry, umode_t mode, dev_t rdev)
 6527{
 6528	struct inode *inode;
 6529
 6530	inode = new_inode(dir->i_sb);
 6531	if (!inode)
 6532		return -ENOMEM;
 6533	inode_init_owner(idmap, inode, dir, mode);
 6534	inode->i_op = &btrfs_special_inode_operations;
 6535	init_special_inode(inode, inode->i_mode, rdev);
 6536	return btrfs_create_common(dir, dentry, inode);
 6537}
 6538
 6539static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
 6540			struct dentry *dentry, umode_t mode, bool excl)
 6541{
 6542	struct inode *inode;
 6543
 6544	inode = new_inode(dir->i_sb);
 6545	if (!inode)
 6546		return -ENOMEM;
 6547	inode_init_owner(idmap, inode, dir, mode);
 6548	inode->i_fop = &btrfs_file_operations;
 6549	inode->i_op = &btrfs_file_inode_operations;
 6550	inode->i_mapping->a_ops = &btrfs_aops;
 6551	return btrfs_create_common(dir, dentry, inode);
 6552}
 6553
 6554static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 6555		      struct dentry *dentry)
 6556{
 6557	struct btrfs_trans_handle *trans = NULL;
 6558	struct btrfs_root *root = BTRFS_I(dir)->root;
 6559	struct inode *inode = d_inode(old_dentry);
 6560	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 6561	struct fscrypt_name fname;
 6562	u64 index;
 6563	int err;
 6564	int drop_inode = 0;
 6565
 6566	/* do not allow sys_link's with other subvols of the same device */
 6567	if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
 6568		return -EXDEV;
 6569
 6570	if (inode->i_nlink >= BTRFS_LINK_MAX)
 6571		return -EMLINK;
 6572
 6573	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
 6574	if (err)
 6575		goto fail;
 6576
 6577	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
 6578	if (err)
 6579		goto fail;
 6580
 6581	/*
 6582	 * 2 items for inode and inode ref
 6583	 * 2 items for dir items
 6584	 * 1 item for parent inode
 6585	 * 1 item for orphan item deletion if O_TMPFILE
 6586	 */
 6587	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
 6588	if (IS_ERR(trans)) {
 6589		err = PTR_ERR(trans);
 6590		trans = NULL;
 6591		goto fail;
 6592	}
 6593
 6594	/* There are several dir indexes for this inode, clear the cache. */
 6595	BTRFS_I(inode)->dir_index = 0ULL;
 6596	inc_nlink(inode);
 6597	inode_inc_iversion(inode);
 6598	inode_set_ctime_current(inode);
 6599	ihold(inode);
 6600	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
 6601
 6602	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
 6603			     &fname.disk_name, 1, index);
 6604
 6605	if (err) {
 6606		drop_inode = 1;
 6607	} else {
 6608		struct dentry *parent = dentry->d_parent;
 6609
 6610		err = btrfs_update_inode(trans, BTRFS_I(inode));
 6611		if (err)
 6612			goto fail;
 6613		if (inode->i_nlink == 1) {
 6614			/*
 6615			 * If new hard link count is 1, it's a file created
 6616			 * with open(2) O_TMPFILE flag.
 6617			 */
 6618			err = btrfs_orphan_del(trans, BTRFS_I(inode));
 6619			if (err)
 6620				goto fail;
 6621		}
 6622		d_instantiate(dentry, inode);
 6623		btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
 6624	}
 6625
 6626fail:
 6627	fscrypt_free_filename(&fname);
 6628	if (trans)
 6629		btrfs_end_transaction(trans);
 6630	if (drop_inode) {
 6631		inode_dec_link_count(inode);
 6632		iput(inode);
 6633	}
 6634	btrfs_btree_balance_dirty(fs_info);
 6635	return err;
 6636}
 6637
 6638static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 6639		       struct dentry *dentry, umode_t mode)
 6640{
 6641	struct inode *inode;
 6642
 6643	inode = new_inode(dir->i_sb);
 6644	if (!inode)
 6645		return -ENOMEM;
 6646	inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
 6647	inode->i_op = &btrfs_dir_inode_operations;
 6648	inode->i_fop = &btrfs_dir_file_operations;
 6649	return btrfs_create_common(dir, dentry, inode);
 6650}
 6651
 6652static noinline int uncompress_inline(struct btrfs_path *path,
 6653				      struct page *page,
 6654				      struct btrfs_file_extent_item *item)
 6655{
 6656	int ret;
 6657	struct extent_buffer *leaf = path->nodes[0];
 6658	char *tmp;
 6659	size_t max_size;
 6660	unsigned long inline_size;
 6661	unsigned long ptr;
 6662	int compress_type;
 6663
 6664	compress_type = btrfs_file_extent_compression(leaf, item);
 6665	max_size = btrfs_file_extent_ram_bytes(leaf, item);
 6666	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
 6667	tmp = kmalloc(inline_size, GFP_NOFS);
 6668	if (!tmp)
 6669		return -ENOMEM;
 6670	ptr = btrfs_file_extent_inline_start(item);
 6671
 6672	read_extent_buffer(leaf, tmp, ptr, inline_size);
 6673
 6674	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
 6675	ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
 6676
 6677	/*
 6678	 * decompression code contains a memset to fill in any space between the end
 6679	 * of the uncompressed data and the end of max_size in case the decompressed
 6680	 * data ends up shorter than ram_bytes.  That doesn't cover the hole between
 6681	 * the end of an inline extent and the beginning of the next block, so we
 6682	 * cover that region here.
 6683	 */
 6684
 6685	if (max_size < PAGE_SIZE)
 6686		memzero_page(page, max_size, PAGE_SIZE - max_size);
 6687	kfree(tmp);
 6688	return ret;
 6689}
 6690
 6691static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
 6692			      struct page *page)
 6693{
 6694	struct btrfs_file_extent_item *fi;
 6695	void *kaddr;
 6696	size_t copy_size;
 6697
 6698	if (!page || PageUptodate(page))
 6699		return 0;
 6700
 6701	ASSERT(page_offset(page) == 0);
 6702
 6703	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
 6704			    struct btrfs_file_extent_item);
 6705	if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
 6706		return uncompress_inline(path, page, fi);
 6707
 6708	copy_size = min_t(u64, PAGE_SIZE,
 6709			  btrfs_file_extent_ram_bytes(path->nodes[0], fi));
 6710	kaddr = kmap_local_page(page);
 6711	read_extent_buffer(path->nodes[0], kaddr,
 6712			   btrfs_file_extent_inline_start(fi), copy_size);
 6713	kunmap_local(kaddr);
 6714	if (copy_size < PAGE_SIZE)
 6715		memzero_page(page, copy_size, PAGE_SIZE - copy_size);
 6716	return 0;
 6717}
 6718
/*
 * Lookup the first extent overlapping a range in a file.
 *
 * @inode:	file to search in
 * @page:	page to read extent data into if the extent is inline, may be
 *		NULL if the caller does not want the inline data
 * @pg_offset:	offset into @page to copy to; asserted to be 0 when an inline
 *		extent is found
 * @start:	file offset
 * @len:	length of range starting at @start
 *
 * Return the first &struct extent_map which overlaps the given range, reading
 * it from the B-tree and caching it if necessary. Note that there may be more
 * extents which overlap the given range after the returned extent_map.
 *
 * If @page is not NULL and the extent is inline, this also reads the extent
 * data directly into the page (see read_inline_extent()).
 *
 * A range not covered by any file extent item is reported as a hole
 * (block_start == EXTENT_MAP_HOLE).
 *
 * Return: ERR_PTR on error, non-NULL extent_map on success.
 */
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
				    struct page *page, size_t pg_offset,
				    u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret = 0;
	u64 extent_start = 0;
	u64 extent_end = 0;
	u64 objectid = btrfs_ino(inode);
	int extent_type = -1;
	struct btrfs_path *path = NULL;
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *item;
	struct extent_buffer *leaf;
	struct btrfs_key found_key;
	struct extent_map *em = NULL;
	struct extent_map_tree *em_tree = &inode->extent_tree;

	/* Fast path: try the in-memory extent map tree first. */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	read_unlock(&em_tree->lock);

	if (em) {
		/*
		 * The cached map is used only if it covers @start. A cached
		 * inline map is also discarded when a page was passed in, so
		 * that the item is looked up again and its data can be read
		 * into the page below.
		 */
		if (em->start > start || em->start + em->len <= start)
			free_extent_map(em);
		else if (em->block_start == EXTENT_MAP_INLINE && page)
			free_extent_map(em);
		else
			goto out;
	}
	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto out;
	}
	/* Placeholder values, overwritten once we know what we found. */
	em->start = EXTENT_MAP_HOLE;
	em->orig_start = EXTENT_MAP_HOLE;
	em->len = (u64)-1;
	em->block_len = (u64)-1;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/* Chances are we'll be called again, so go ahead and do readahead */
	path->reada = READA_FORWARD;

	/*
	 * The same explanation in load_free_space_cache applies here as well,
	 * we only read when we're loading the free space cache, and at that
	 * point the commit_root has everything we need.
	 */
	if (btrfs_is_free_space_inode(inode)) {
		path->search_commit_root = 1;
		path->skip_locking = 1;
	}

	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
	if (ret < 0) {
		goto out;
	} else if (ret > 0) {
		/*
		 * No item with exactly this key: look at the previous slot,
		 * which may hold an extent that starts before @start and still
		 * covers it.
		 */
		if (path->slots[0] == 0)
			goto not_found;
		path->slots[0]--;
		ret = 0;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0],
			      struct btrfs_file_extent_item);
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
	if (found_key.objectid != objectid ||
	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
		/*
		 * If we backup past the first extent we want to move forward
		 * and see if there is an extent in front of us, otherwise we'll
		 * say there is a hole for our whole search range which can
		 * cause problems.
		 */
		extent_end = start;
		goto next;
	}

	extent_type = btrfs_file_extent_type(leaf, item);
	extent_start = found_key.offset;
	extent_end = btrfs_file_extent_end(path);
	if (extent_type == BTRFS_FILE_EXTENT_REG ||
	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
		/* Only regular file could have regular/prealloc extent */
		if (!S_ISREG(inode->vfs_inode.i_mode)) {
			ret = -EUCLEAN;
			btrfs_crit(fs_info,
		"regular/prealloc extent found for non-regular inode %llu",
				   btrfs_ino(inode));
			goto out;
		}
		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
						       extent_start);
	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
						      path->slots[0],
						      extent_start);
	}
next:
	if (start >= extent_end) {
		/*
		 * The item we are looking at ends at or before @start: walk
		 * forward until we find an item of this inode that starts at
		 * or after @start, or run out of items.
		 */
		path->slots[0]++;
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				goto not_found;

			leaf = path->nodes[0];
		}
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != objectid ||
		    found_key.type != BTRFS_EXTENT_DATA_KEY)
			goto not_found;
		if (start + len <= found_key.offset)
			goto not_found;
		if (start > found_key.offset)
			goto next;

		/*
		 * The next extent starts inside the requested range: return a
		 * hole covering [start, found_key.offset).
		 */
		em->start = start;
		em->orig_start = start;
		em->len = found_key.offset - start;
		em->block_start = EXTENT_MAP_HOLE;
		goto insert;
	}

	/* The item covers @start: translate it into the extent map. */
	btrfs_extent_item_to_extent_map(inode, path, item, em);

	if (extent_type == BTRFS_FILE_EXTENT_REG ||
	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
		goto insert;
	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
		/*
		 * Inline extent can only exist at file offset 0. This is
		 * ensured by tree-checker and inline extent creation path.
		 * Thus all members representing file offsets should be zero.
		 */
		ASSERT(pg_offset == 0);
		ASSERT(extent_start == 0);
		ASSERT(em->start == 0);

		/*
		 * btrfs_extent_item_to_extent_map() should have properly
		 * initialized em members already.
		 *
		 * Other members are not utilized for inline extents.
		 */
		ASSERT(em->block_start == EXTENT_MAP_INLINE);
		ASSERT(em->len == fs_info->sectorsize);

		ret = read_inline_extent(inode, path, page);
		if (ret < 0)
			goto out;
		goto insert;
	}
not_found:
	/* Nothing overlaps the requested range: report all of it as a hole. */
	em->start = start;
	em->orig_start = start;
	em->len = len;
	em->block_start = EXTENT_MAP_HOLE;
insert:
	ret = 0;
	btrfs_release_path(path);
	/* Sanity check: whatever we built must cover @start. */
	if (em->start > start || extent_map_end(em) <= start) {
		btrfs_err(fs_info,
			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
			  em->start, em->len, start, len);
		ret = -EIO;
		goto out;
	}

	/* Note that @em is passed by reference to btrfs_add_extent_mapping(). */
	write_lock(&em_tree->lock);
	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
	write_unlock(&em_tree->lock);
out:
	btrfs_free_path(path);

	trace_btrfs_get_extent(root, inode, em);

	if (ret) {
		free_extent_map(em);
		return ERR_PTR(ret);
	}
	return em;
}
 6930
 6931static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
 6932						  struct btrfs_dio_data *dio_data,
 6933						  const u64 start,
 6934						  const u64 len,
 6935						  const u64 orig_start,
 6936						  const u64 block_start,
 6937						  const u64 block_len,
 6938						  const u64 orig_block_len,
 6939						  const u64 ram_bytes,
 6940						  const int type)
 6941{
 6942	struct extent_map *em = NULL;
 6943	struct btrfs_ordered_extent *ordered;
 6944
 6945	if (type != BTRFS_ORDERED_NOCOW) {
 6946		em = create_io_em(inode, start, len, orig_start, block_start,
 6947				  block_len, orig_block_len, ram_bytes,
 6948				  BTRFS_COMPRESS_NONE, /* compress_type */
 6949				  type);
 6950		if (IS_ERR(em))
 6951			goto out;
 6952	}
 6953	ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
 6954					     block_start, block_len, 0,
 6955					     (1 << type) |
 6956					     (1 << BTRFS_ORDERED_DIRECT),
 6957					     BTRFS_COMPRESS_NONE);
 6958	if (IS_ERR(ordered)) {
 6959		if (em) {
 6960			free_extent_map(em);
 6961			btrfs_drop_extent_map_range(inode, start,
 6962						    start + len - 1, false);
 6963		}
 6964		em = ERR_CAST(ordered);
 6965	} else {
 6966		ASSERT(!dio_data->ordered);
 6967		dio_data->ordered = ordered;
 6968	}
 6969 out:
 6970
 6971	return em;
 6972}
 6973
 6974static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
 6975						  struct btrfs_dio_data *dio_data,
 6976						  u64 start, u64 len)
 6977{
 6978	struct btrfs_root *root = inode->root;
 6979	struct btrfs_fs_info *fs_info = root->fs_info;
 6980	struct extent_map *em;
 6981	struct btrfs_key ins;
 6982	u64 alloc_hint;
 6983	int ret;
 6984
 6985	alloc_hint = get_extent_allocation_hint(inode, start, len);
 6986again:
 6987	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
 6988				   0, alloc_hint, &ins, 1, 1);
 6989	if (ret == -EAGAIN) {
 6990		ASSERT(btrfs_is_zoned(fs_info));
 6991		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
 6992			       TASK_UNINTERRUPTIBLE);
 6993		goto again;
 6994	}
 6995	if (ret)
 6996		return ERR_PTR(ret);
 6997
 6998	em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
 6999				     ins.objectid, ins.offset, ins.offset,
 7000				     ins.offset, BTRFS_ORDERED_REGULAR);
 7001	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 7002	if (IS_ERR(em))
 7003		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
 7004					   1);
 7005
 7006	return em;
 7007}
 7008
 7009static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
 7010{
 7011	struct btrfs_block_group *block_group;
 7012	bool readonly = false;
 7013
 7014	block_group = btrfs_lookup_block_group(fs_info, bytenr);
 7015	if (!block_group || block_group->ro)
 7016		readonly = true;
 7017	if (block_group)
 7018		btrfs_put_block_group(block_group);
 7019	return readonly;
 7020}
 7021
/*
 * Check if we can do nocow write into the range [@offset, @offset + @len)
 *
 * @inode:	The inode to check
 * @offset:	File offset
 * @len:	The length to write, will be updated to the nocow writeable
 *		range
 * @orig_start:	(optional) Return the original file offset of the file extent
 * @orig_block_len:	(optional) Return the original on-disk length of the
 *		file extent
 * @ram_bytes:	(optional) Return the ram_bytes of the file extent. Note that
 *		this is stored as soon as a matching file extent item is found,
 *		even if the function then decides that nocow is not possible.
 * @nowait:	Copied into the search path's nowait flag
 * @strict:	if true, omit optimizations that might force us into unnecessary
 *		cow. e.g., don't trust generation number.
 *
 * Return:
 * >0	and update @len if we can do nocow write
 *  0	if we can't do nocow write
 * <0	if error happened (-ENOMEM, a lookup error, or -EAGAIN if a prealloc
 *	extent of an inode without NODATACOW has delalloc in the range)
 *
 * NOTE: This only checks the file extents, caller is responsible to wait for
 *	 any ordered extents.
 */
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
			      u64 *orig_start, u64 *orig_block_len,
			      u64 *ram_bytes, bool nowait, bool strict)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct can_nocow_file_extent_args nocow_args = { 0 };
	struct btrfs_path *path;
	int ret;
	struct extent_buffer *leaf;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	int found_type;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->nowait = nowait;

	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(inode)), offset, 0);
	if (ret < 0)
		goto out;

	if (ret == 1) {
		if (path->slots[0] == 0) {
			/* can't find the item, must cow */
			ret = 0;
			goto out;
		}
		/* No exact match: check the item preceding the search key. */
		path->slots[0]--;
	}
	/* From here on every "goto out" means "must cow" unless ret is set. */
	ret = 0;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		/* not our file or wrong item type, must cow */
		goto out;
	}

	if (key.offset > offset) {
		/* Wrong offset, must cow */
		goto out;
	}

	/* The extent ends at or before @offset, so it does not cover it. */
	if (btrfs_file_extent_end(path) <= offset)
		goto out;

	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(leaf, fi);
	if (ram_bytes)
		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);

	nocow_args.start = offset;
	nocow_args.end = offset + *len - 1;
	nocow_args.strict = strict;
	nocow_args.free_path = true;

	ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
	/* can_nocow_file_extent() has freed the path. */
	path = NULL;

	if (ret != 1) {
		/* Treat errors as not being able to NOCOW. */
		ret = 0;
		goto out;
	}

	ret = 0;
	/* An extent in a read-only (or missing) block group must be COWed. */
	if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
		goto out;

	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 range_end;

		/* Check the sector aligned range for pending delalloc. */
		range_end = round_up(offset + nocow_args.num_bytes,
				     root->fs_info->sectorsize) - 1;
		ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
		if (ret) {
			ret = -EAGAIN;
			goto out;
		}
	}

	if (orig_start)
		*orig_start = key.offset - nocow_args.extent_offset;
	if (orig_block_len)
		*orig_block_len = nocow_args.disk_num_bytes;

	*len = nocow_args.num_bytes;
	ret = 1;
out:
	/* path is NULL here if can_nocow_file_extent() already freed it. */
	btrfs_free_path(path);
	return ret;
}
 7140
/*
 * Lock the file range [@lockstart, @lockend] in the inode's io tree for a
 * direct IO operation.
 *
 * The range is only left locked once no ordered extent overlaps it and, for
 * writes (IOMAP_WRITE set in @iomap_flags), no page is present in the page
 * cache for it. Otherwise the range is unlocked again and, depending on the
 * situation, the function either starts the ordered extent and retries, or
 * gives up.
 *
 * With IOMAP_NOWAIT set the function never blocks on the lock or on an ordered
 * extent and returns -EAGAIN instead.
 *
 * Return: 0 with the range locked, or -EAGAIN / -ENOTBLK with the range
 * unlocked.
 */
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state,
			      unsigned int iomap_flags)
{
	const bool writing = (iomap_flags & IOMAP_WRITE);
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	while (1) {
		if (nowait) {
			if (!try_lock_extent(io_tree, lockstart, lockend,
					     cached_state))
				return -EAGAIN;
		} else {
			lock_extent(io_tree, lockstart, lockend, cached_state);
		}
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there's no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, we could have raced between the invalidate in
		 * generic_file_direct_write and locking the extent.  The
		 * invalidate needs to happen so that reads after a write do not
		 * get stale data.
		 */
		if (!ordered &&
		    (!writing || !filemap_range_has_page(inode->i_mapping,
							 lockstart, lockend)))
			break;

		/* Something is in the way: drop the lock before dealing with it. */
		unlock_extent(io_tree, lockstart, lockend, cached_state);

		if (ordered) {
			if (nowait) {
				btrfs_put_ordered_extent(ordered);
				ret = -EAGAIN;
				break;
			}
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we can not wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map, if at this point has already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range started (we unlock the ranges
			 * in the io tree only when the bios complete and
			 * buffered writes always lock pages before attempting
			 * to lock range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(ordered);
			else
				/*
				 * nowait requests already bailed out above, so
				 * this evaluates to -ENOTBLK.
				 */
				ret = nowait ? -EAGAIN : -ENOTBLK;
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
			 * call to readahead (a buffered read or a defrag call
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but did not have
			 * yet a corresponding bio submitted (whence it can not
			 * complete), which makes readahead wait for that
			 * ordered extent to complete while holding a lock on
			 * that page.
			 */
			ret = nowait ? -EAGAIN : -ENOTBLK;
		}

		if (ret)
			break;

		/* The ordered extent was started: retry from the top. */
		cond_resched();
	}

	return ret;
}
 7233
 7234/* The callers of this must take lock_extent() */
 7235static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
 7236				       u64 len, u64 orig_start, u64 block_start,
 7237				       u64 block_len, u64 orig_block_len,
 7238				       u64 ram_bytes, int compress_type,
 7239				       int type)
 7240{
 7241	struct extent_map *em;
 7242	int ret;
 7243
 7244	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
 7245	       type == BTRFS_ORDERED_COMPRESSED ||
 7246	       type == BTRFS_ORDERED_NOCOW ||
 7247	       type == BTRFS_ORDERED_REGULAR);
 7248
 7249	em = alloc_extent_map();
 7250	if (!em)
 7251		return ERR_PTR(-ENOMEM);
 7252
 7253	em->start = start;
 7254	em->orig_start = orig_start;
 7255	em->len = len;
 7256	em->block_len = block_len;
 7257	em->block_start = block_start;
 7258	em->orig_block_len = orig_block_len;
 7259	em->ram_bytes = ram_bytes;
 7260	em->generation = -1;
 7261	set_bit(EXTENT_FLAG_PINNED, &em->flags);
 7262	if (type == BTRFS_ORDERED_PREALLOC) {
 7263		set_bit(EXTENT_FLAG_FILLING, &em->flags);
 7264	} else if (type == BTRFS_ORDERED_COMPRESSED) {
 7265		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 7266		em->compress_type = compress_type;
 7267	}
 7268
 7269	ret = btrfs_replace_extent_map_range(inode, em, true);
 7270	if (ret) {
 7271		free_extent_map(em);
 7272		return ERR_PTR(ret);
 7273	}
 7274
 7275	/* em got 2 refs now, callers needs to do free_extent_map once. */
 7276	return em;
 7277}
 7278
 7279
/*
 * Prepare a direct IO write to the file range starting at @start: decide
 * between a NOCOW write into the existing extent and a COW write into a newly
 * allocated one, reserve metadata space and create the ordered extent.
 *
 * @map:	In: the extent map covering @start. Out: on the COW path the
 *		input map is freed and replaced by the new extent map (or NULL
 *		on failure); on the NOCOW path it is replaced only for prealloc
 *		extents, and set to NULL if the metadata reservation fails.
 * @inode:	The inode being written to
 * @dio_data:	Per-operation direct IO state; receives the ordered extent and
 *		has nocow_done set when the write is done NOCOW
 * @start:	File offset of the write
 * @lenp:	In: requested length. Out: the length this call ended up with
 *		(stored on both success and failure).
 * @iomap_flags: iomap flags, only IOMAP_NOWAIT is examined here
 *
 * i_size is increased here if the write ends past it.
 *
 * Return: 0 on success, negative errno on failure. In particular -EAGAIN is
 * returned for IOMAP_NOWAIT requests that would need to COW or whose metadata
 * reservation failed with -ENOSPC/-EDQUOT, and -ENOSPC when a COW is needed but
 * no data space was reserved beforehand.
 */
static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 *lenp,
					 unsigned int iomap_flags)
{
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *em = *map;
	int type;
	u64 block_start, orig_start, orig_block_len, ram_bytes;
	struct btrfs_block_group *bg;
	bool can_nocow = false;
	bool space_reserved = false;
	u64 len = *lenp;
	u64 prev_len;
	int ret = 0;

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 * existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 * just use the extent.
	 *
	 */
	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->block_start != EXTENT_MAP_HOLE)) {
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		/* Clamp to what is left of the extent map past @start. */
		len = min(len, em->len - (start - em->start));
		block_start = em->block_start + (start - em->start);

		/*
		 * can_nocow_extent() may shrink len further; the nocow writer
		 * count on the block group is held until the ordered extent
		 * has been created below.
		 */
		if (can_nocow_extent(inode, start, &len, &orig_start,
				     &orig_block_len, &ram_bytes, false, false) == 1) {
			bg = btrfs_inc_nocow_writers(fs_info, block_start);
			if (bg)
				can_nocow = true;
		}
	}

	/* Remember how much gets reserved, len may shrink on the COW path. */
	prev_len = len;
	if (can_nocow) {
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      nowait);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			free_extent_map(em);
			*map = NULL;
			btrfs_dec_nocow_writers(bg);
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
				ret = -EAGAIN;
			goto out;
		}
		space_reserved = true;

		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
					      orig_start, block_start,
					      len, orig_block_len,
					      ram_bytes, type);
		btrfs_dec_nocow_writers(bg);
		/*
		 * For prealloc extents the input map is replaced by em2 (which
		 * may be an ERR_PTR, checked right below).
		 * NOTE(review): for BTRFS_ORDERED_NOCOW the input em is neither
		 * freed nor is *map cleared here when em2 is an error - confirm
		 * that the caller releases it on that path.
		 */
		if (type == BTRFS_ORDERED_PREALLOC) {
			free_extent_map(em);
			*map = em2;
			em = em2;
		}

		if (IS_ERR(em2)) {
			ret = PTR_ERR(em2);
			goto out;
		}

		dio_data->nocow_done = true;
	} else {
		/* Our caller expects us to free the input extent map. */
		free_extent_map(em);
		*map = NULL;

		/* Falling back to COW is not attempted for NOWAIT requests. */
		if (nowait) {
			ret = -EAGAIN;
			goto out;
		}

		/*
		 * If we could not allocate data space before locking the file
		 * range and we can't do a NOCOW write, then we have to fail.
		 */
		if (!dio_data->data_space_reserved) {
			ret = -ENOSPC;
			goto out;
		}

		/*
		 * We have to COW and we have already reserved data space before,
		 * so now we reserve only metadata.
		 */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      false);
		if (ret < 0)
			goto out;
		space_reserved = true;

		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		*map = em;
		/*
		 * The new extent may be shorter than requested: give back the
		 * metadata reserved for the part that is not covered.
		 */
		len = min(len, em->len - (start - em->start));
		if (len < prev_len)
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							prev_len - len, true);
	}

	/*
	 * We have created our ordered extent, so we can now release our reservation
	 * for an outstanding extent.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (start + len > i_size_read(inode))
		i_size_write(inode, start + len);
out:
	/* On failure undo the metadata reservation made above, if any. */
	if (ret && space_reserved) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
	}
	*lenp = len;
	return ret;
}
 7421
 7422static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 7423		loff_t length, unsigned int flags, struct iomap *iomap,
 7424		struct iomap *srcmap)
 7425{
 7426	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
 7427	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 7428	struct extent_map *em;
 7429	struct extent_state *cached_state = NULL;
 7430	struct btrfs_dio_data *dio_data = iter->private;
 7431	u64 lockstart, lockend;
 7432	const bool write = !!(flags & IOMAP_WRITE);
 7433	int ret = 0;
 7434	u64 len = length;
 7435	const u64 data_alloc_len = length;
 7436	bool unlock_extents = false;
 7437
 7438	/*
 7439	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
 7440	 * we're NOWAIT we may submit a bio for a partial range and return
 7441	 * EIOCBQUEUED, which would result in an errant short read.
 7442	 *
 7443	 * The best way to handle this would be to allow for partial completions
 7444	 * of iocb's, so we could submit the partial bio, return and fault in
 7445	 * the rest of the pages, and then submit the io for the rest of the
 7446	 * range.  However we don't have that currently, so simply return
 7447	 * -EAGAIN at this point so that the normal path is used.
 7448	 */
 7449	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
 7450		return -EAGAIN;
 7451
 7452	/*
 7453	 * Cap the size of reads to that usually seen in buffered I/O as we need
 7454	 * to allocate a contiguous array for the checksums.
 7455	 */
 7456	if (!write)
 7457		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
 7458
 7459	lockstart = start;
 7460	lockend = start + len - 1;
 7461
 7462	/*
 7463	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
 7464	 * enough if we've written compressed pages to this area, so we need to
 7465	 * flush the dirty pages again to make absolutely sure that any
 7466	 * outstanding dirty pages are on disk - the first flush only starts
 7467	 * compression on the data, while keeping the pages locked, so by the
 7468	 * time the second flush returns we know bios for the compressed pages
 7469	 * were submitted and finished, and the pages no longer under writeback.
 7470	 *
 7471	 * If we have a NOWAIT request and we have any pages in the range that
 7472	 * are locked, likely due to compression still in progress, we don't want
 7473	 * to block on page locks. We also don't want to block on pages marked as
 7474	 * dirty or under writeback (same as for the non-compression case).
 7475	 * iomap_dio_rw() did the same check, but after that and before we got
 7476	 * here, mmap'ed writes may have happened or buffered reads started
 7477	 * (readpage() and readahead(), which lock pages), as we haven't locked
 7478	 * the file range yet.
 7479	 */
 7480	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
 7481		     &BTRFS_I(inode)->runtime_flags)) {
 7482		if (flags & IOMAP_NOWAIT) {
 7483			if (filemap_range_needs_writeback(inode->i_mapping,
 7484							  lockstart, lockend))
 7485				return -EAGAIN;
 7486		} else {
 7487			ret = filemap_fdatawrite_range(inode->i_mapping, start,
 7488						       start + length - 1);
 7489			if (ret)
 7490				return ret;
 7491		}
 7492	}
 7493
 7494	memset(dio_data, 0, sizeof(*dio_data));
 7495
 7496	/*
 7497	 * We always try to allocate data space and must do it before locking
 7498	 * the file range, to avoid deadlocks with concurrent writes to the same
 7499	 * range if the range has several extents and the writes don't expand the
 7500	 * current i_size (the inode lock is taken in shared mode). If we fail to
 7501	 * allocate data space here we continue and later, after locking the
 7502	 * file range, we fail with ENOSPC only if we figure out we can not do a
 7503	 * NOCOW write.
 7504	 */
 7505	if (write && !(flags & IOMAP_NOWAIT)) {
 7506		ret = btrfs_check_data_free_space(BTRFS_I(inode),
 7507						  &dio_data->data_reserved,
 7508						  start, data_alloc_len, false);
 7509		if (!ret)
 7510			dio_data->data_space_reserved = true;
 7511		else if (ret && !(BTRFS_I(inode)->flags &
 7512				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
 7513			goto err;
 7514	}
 7515
 7516	/*
 7517	 * If this errors out it's because we couldn't invalidate pagecache for
 7518	 * this range and we need to fallback to buffered IO, or we are doing a
 7519	 * NOWAIT read/write and we need to block.
 7520	 */
 7521	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
 7522	if (ret < 0)
 7523		goto err;
 7524
 7525	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
 7526	if (IS_ERR(em)) {
 7527		ret = PTR_ERR(em);
 7528		goto unlock_err;
 7529	}
 7530
 7531	/*
 7532	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
 7533	 * io.  INLINE is special, and we could probably kludge it in here, but
 7534	 * it's still buffered so for safety lets just fall back to the generic
 7535	 * buffered path.
 7536	 *
 7537	 * For COMPRESSED we _have_ to read the entire extent in so we can
 7538	 * decompress it, so there will be buffering required no matter what we
 7539	 * do, so go ahead and fallback to buffered.
 7540	 *
 7541	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
 7542	 * to buffered IO.  Don't blame me, this is the price we pay for using
 7543	 * the generic code.
 7544	 */
 7545	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
 7546	    em->block_start == EXTENT_MAP_INLINE) {
 7547		free_extent_map(em);
 7548		/*
 7549		 * If we are in a NOWAIT context, return -EAGAIN in order to
 7550		 * fallback to buffered IO. This is not only because we can
 7551		 * block with buffered IO (no support for NOWAIT semantics at
 7552		 * the moment) but also to avoid returning short reads to user
 7553		 * space - this happens if we were able to read some data from
 7554		 * previous non-compressed extents and then when we fallback to
 7555		 * buffered IO, at btrfs_file_read_iter() by calling
 7556		 * filemap_read(), we fail to fault in pages for the read buffer,
 7557		 * in which case filemap_read() returns a short read (the number
 7558		 * of bytes previously read is > 0, so it does not return -EFAULT).
 7559		 */
 7560		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
 7561		goto unlock_err;
 7562	}
 7563
 7564	len = min(len, em->len - (start - em->start));
 7565
 7566	/*
 7567	 * If we have a NOWAIT request and the range contains multiple extents
 7568	 * (or a mix of extents and holes), then we return -EAGAIN to make the
 7569	 * caller fallback to a context where it can do a blocking (without
 7570	 * NOWAIT) request. This way we avoid doing partial IO and returning
 7571	 * success to the caller, which is not optimal for writes and for reads
 7572	 * it can result in unexpected behaviour for an application.
 7573	 *
 7574	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
 7575	 * iomap_dio_rw(), we can end up returning less data then what the caller
 7576	 * asked for, resulting in an unexpected, and incorrect, short read.
 7577	 * That is, the caller asked to read N bytes and we return less than that,
 7578	 * which is wrong unless we are crossing EOF. This happens if we get a
 7579	 * page fault error when trying to fault in pages for the buffer that is
 7580	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
 7581	 * have previously submitted bios for other extents in the range, in
 7582	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
 7583	 * those bios have completed by the time we get the page fault error,
 7584	 * which we return back to our caller - we should only return EIOCBQUEUED
 7585	 * after we have submitted bios for all the extents in the range.
 7586	 */
 7587	if ((flags & IOMAP_NOWAIT) && len < length) {
 7588		free_extent_map(em);
 7589		ret = -EAGAIN;
 7590		goto unlock_err;
 7591	}
 7592
 7593	if (write) {
 7594		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
 7595						    start, &len, flags);
 7596		if (ret < 0)
 7597			goto unlock_err;
 7598		unlock_extents = true;
 7599		/* Recalc len in case the new em is smaller than requested */
 7600		len = min(len, em->len - (start - em->start));
 7601		if (dio_data->data_space_reserved) {
 7602			u64 release_offset;
 7603			u64 release_len = 0;
 7604
 7605			if (dio_data->nocow_done) {
 7606				release_offset = start;
 7607				release_len = data_alloc_len;
 7608			} else if (len < data_alloc_len) {
 7609				release_offset = start + len;
 7610				release_len = data_alloc_len - len;
 7611			}
 7612
 7613			if (release_len > 0)
 7614				btrfs_free_reserved_data_space(BTRFS_I(inode),
 7615							       dio_data->data_reserved,
 7616							       release_offset,
 7617							       release_len);
 7618		}
 7619	} else {
 7620		/*
 7621		 * We need to unlock only the end area that we aren't using.
 7622		 * The rest is going to be unlocked by the endio routine.
 7623		 */
 7624		lockstart = start + len;
 7625		if (lockstart < lockend)
 7626			unlock_extents = true;
 7627	}
 7628
 7629	if (unlock_extents)
 7630		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 7631			      &cached_state);
 7632	else
 7633		free_extent_state(cached_state);
 7634
 7635	/*
 7636	 * Translate extent map information to iomap.
 7637	 * We trim the extents (and move the addr) even though iomap code does
 7638	 * that, since we have locked only the parts we are performing I/O in.
 7639	 */
 7640	if ((em->block_start == EXTENT_MAP_HOLE) ||
 7641	    (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
 7642		iomap->addr = IOMAP_NULL_ADDR;
 7643		iomap->type = IOMAP_HOLE;
 7644	} else {
 7645		iomap->addr = em->block_start + (start - em->start);
 7646		iomap->type = IOMAP_MAPPED;
 7647	}
 7648	iomap->offset = start;
 7649	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
 7650	iomap->length = len;
 7651	free_extent_map(em);
 7652
 7653	return 0;
 7654
 7655unlock_err:
 7656	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 7657		      &cached_state);
 7658err:
 7659	if (dio_data->data_space_reserved) {
 7660		btrfs_free_reserved_data_space(BTRFS_I(inode),
 7661					       dio_data->data_reserved,
 7662					       start, data_alloc_len);
 7663		extent_changeset_free(dio_data->data_reserved);
 7664	}
 7665
 7666	return ret;
 7667}
 7668
 7669static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 7670		ssize_t written, unsigned int flags, struct iomap *iomap)
 7671{
 7672	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
 7673	struct btrfs_dio_data *dio_data = iter->private;
 7674	size_t submitted = dio_data->submitted;
 7675	const bool write = !!(flags & IOMAP_WRITE);
 7676	int ret = 0;
 7677
 7678	if (!write && (iomap->type == IOMAP_HOLE)) {
 7679		/* If reading from a hole, unlock and return */
 7680		unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
 7681			      NULL);
 7682		return 0;
 7683	}
 7684
 7685	if (submitted < length) {
 7686		pos += submitted;
 7687		length -= submitted;
 7688		if (write)
 7689			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
 7690						    pos, length, false);
 7691		else
 7692			unlock_extent(&BTRFS_I(inode)->io_tree, pos,
 7693				      pos + length - 1, NULL);
 7694		ret = -ENOTBLK;
 7695	}
 7696	if (write) {
 7697		btrfs_put_ordered_extent(dio_data->ordered);
 7698		dio_data->ordered = NULL;
 7699	}
 7700
 7701	if (write)
 7702		extent_changeset_free(dio_data->data_reserved);
 7703	return ret;
 7704}
 7705
 7706static void btrfs_dio_end_io(struct btrfs_bio *bbio)
 7707{
 7708	struct btrfs_dio_private *dip =
 7709		container_of(bbio, struct btrfs_dio_private, bbio);
 7710	struct btrfs_inode *inode = bbio->inode;
 7711	struct bio *bio = &bbio->bio;
 7712
 7713	if (bio->bi_status) {
 7714		btrfs_warn(inode->root->fs_info,
 7715		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
 7716			   btrfs_ino(inode), bio->bi_opf,
 7717			   dip->file_offset, dip->bytes, bio->bi_status);
 7718	}
 7719
 7720	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
 7721		btrfs_finish_ordered_extent(bbio->ordered, NULL,
 7722					    dip->file_offset, dip->bytes,
 7723					    !bio->bi_status);
 7724	} else {
 7725		unlock_extent(&inode->io_tree, dip->file_offset,
 7726			      dip->file_offset + dip->bytes - 1, NULL);
 7727	}
 7728
 7729	bbio->bio.bi_private = bbio->private;
 7730	iomap_dio_bio_end_io(bio);
 7731}
 7732
 7733static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
 7734				loff_t file_offset)
 7735{
 7736	struct btrfs_bio *bbio = btrfs_bio(bio);
 7737	struct btrfs_dio_private *dip =
 7738		container_of(bbio, struct btrfs_dio_private, bbio);
 7739	struct btrfs_dio_data *dio_data = iter->private;
 7740
 7741	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
 7742		       btrfs_dio_end_io, bio->bi_private);
 7743	bbio->inode = BTRFS_I(iter->inode);
 7744	bbio->file_offset = file_offset;
 7745
 7746	dip->file_offset = file_offset;
 7747	dip->bytes = bio->bi_iter.bi_size;
 7748
 7749	dio_data->submitted += bio->bi_iter.bi_size;
 7750
 7751	/*
 7752	 * Check if we are doing a partial write.  If we are, we need to split
 7753	 * the ordered extent to match the submitted bio.  Hang on to the
 7754	 * remaining unfinishable ordered_extent in dio_data so that it can be
 7755	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
 7756	 * remaining pages is blocked on the outstanding ordered extent.
 7757	 */
 7758	if (iter->flags & IOMAP_WRITE) {
 7759		int ret;
 7760
 7761		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
 7762		if (ret) {
 7763			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
 7764						    file_offset, dip->bytes,
 7765						    !ret);
 7766			bio->bi_status = errno_to_blk_status(ret);
 7767			iomap_dio_bio_end_io(bio);
 7768			return;
 7769		}
 7770	}
 7771
 7772	btrfs_submit_bio(bbio, 0);
 7773}
 7774
 7775static const struct iomap_ops btrfs_dio_iomap_ops = {
 7776	.iomap_begin            = btrfs_dio_iomap_begin,
 7777	.iomap_end              = btrfs_dio_iomap_end,
 7778};
 7779
 7780static const struct iomap_dio_ops btrfs_dio_ops = {
 7781	.submit_io		= btrfs_dio_submit_io,
 7782	.bio_set		= &btrfs_dio_bioset,
 7783};
 7784
 7785ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
 7786{
 7787	struct btrfs_dio_data data = { 0 };
 7788
 7789	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
 7790			    IOMAP_DIO_PARTIAL, &data, done_before);
 7791}
 7792
 7793struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
 7794				  size_t done_before)
 7795{
 7796	struct btrfs_dio_data data = { 0 };
 7797
 7798	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
 7799			    IOMAP_DIO_PARTIAL, &data, done_before);
 7800}
 7801
 7802static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 7803			u64 start, u64 len)
 7804{
 7805	int	ret;
 7806
 7807	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
 7808	if (ret)
 7809		return ret;
 7810
 7811	/*
 7812	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
 7813	 * file range (0 to LLONG_MAX), but that is not enough if we have
 7814	 * compression enabled. The first filemap_fdatawrite_range() only kicks
 7815	 * in the compression of data (in an async thread) and will return
 7816	 * before the compression is done and writeback is started. A second
 7817	 * filemap_fdatawrite_range() is needed to wait for the compression to
 7818	 * complete and writeback to start. We also need to wait for ordered
 7819	 * extents to complete, because our fiemap implementation uses mainly
 7820	 * file extent items to list the extents, searching for extent maps
 7821	 * only for file ranges with holes or prealloc extents to figure out
 7822	 * if we have delalloc in those ranges.
 7823	 */
 7824	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
 7825		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
 7826		if (ret)
 7827			return ret;
 7828	}
 7829
 7830	return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
 7831}
 7832
 7833static int btrfs_writepages(struct address_space *mapping,
 7834			    struct writeback_control *wbc)
 7835{
 7836	return extent_writepages(mapping, wbc);
 7837}
 7838
 7839static void btrfs_readahead(struct readahead_control *rac)
 7840{
 7841	extent_readahead(rac);
 7842}
 7843
 7844/*
 7845 * For release_folio() and invalidate_folio() we have a race window where
 7846 * folio_end_writeback() is called but the subpage spinlock is not yet released.
 7847 * If we continue to release/invalidate the page, we could cause use-after-free
 7848 * for subpage spinlock.  So this function is to spin and wait for subpage
 7849 * spinlock.
 7850 */
 7851static void wait_subpage_spinlock(struct page *page)
 7852{
 7853	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
 7854	struct btrfs_subpage *subpage;
 7855
 7856	if (!btrfs_is_subpage(fs_info, page))
 7857		return;
 7858
 7859	ASSERT(PagePrivate(page) && page->private);
 7860	subpage = (struct btrfs_subpage *)page->private;
 7861
 7862	/*
 7863	 * This may look insane as we just acquire the spinlock and release it,
 7864	 * without doing anything.  But we just want to make sure no one is
 7865	 * still holding the subpage spinlock.
 7866	 * And since the page is not dirty nor writeback, and we have page
 7867	 * locked, the only possible way to hold a spinlock is from the endio
 7868	 * function to clear page writeback.
 7869	 *
 7870	 * Here we just acquire the spinlock so that all existing callers
 7871	 * should exit and we're safe to release/invalidate the page.
 7872	 */
 7873	spin_lock_irq(&subpage->lock);
 7874	spin_unlock_irq(&subpage->lock);
 7875}
 7876
 7877static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
 7878{
 7879	int ret = try_release_extent_mapping(&folio->page, gfp_flags);
 7880
 7881	if (ret == 1) {
 7882		wait_subpage_spinlock(&folio->page);
 7883		clear_page_extent_mapped(&folio->page);
 7884	}
 7885	return ret;
 7886}
 7887
 7888static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
 7889{
 7890	if (folio_test_writeback(folio) || folio_test_dirty(folio))
 7891		return false;
 7892	return __btrfs_release_folio(folio, gfp_flags);
 7893}
 7894
 7895#ifdef CONFIG_MIGRATION
 7896static int btrfs_migrate_folio(struct address_space *mapping,
 7897			     struct folio *dst, struct folio *src,
 7898			     enum migrate_mode mode)
 7899{
 7900	int ret = filemap_migrate_folio(mapping, dst, src, mode);
 7901
 7902	if (ret != MIGRATEPAGE_SUCCESS)
 7903		return ret;
 7904
 7905	if (folio_test_ordered(src)) {
 7906		folio_clear_ordered(src);
 7907		folio_set_ordered(dst);
 7908	}
 7909
 7910	return MIGRATEPAGE_SUCCESS;
 7911}
 7912#else
 7913#define btrfs_migrate_folio NULL
 7914#endif
 7915
 7916static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
 7917				 size_t length)
 7918{
 7919	struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
 7920	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 7921	struct extent_io_tree *tree = &inode->io_tree;
 7922	struct extent_state *cached_state = NULL;
 7923	u64 page_start = folio_pos(folio);
 7924	u64 page_end = page_start + folio_size(folio) - 1;
 7925	u64 cur;
 7926	int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
 7927
 7928	/*
 7929	 * We have folio locked so no new ordered extent can be created on this
 7930	 * page, nor bio can be submitted for this folio.
 7931	 *
 7932	 * But already submitted bio can still be finished on this folio.
 7933	 * Furthermore, endio function won't skip folio which has Ordered
 7934	 * (Private2) already cleared, so it's possible for endio and
 7935	 * invalidate_folio to do the same ordered extent accounting twice
 7936	 * on one folio.
 7937	 *
 7938	 * So here we wait for any submitted bios to finish, so that we won't
 7939	 * do double ordered extent accounting on the same folio.
 7940	 */
 7941	folio_wait_writeback(folio);
 7942	wait_subpage_spinlock(&folio->page);
 7943
 7944	/*
 7945	 * For subpage case, we have call sites like
 7946	 * btrfs_punch_hole_lock_range() which passes range not aligned to
 7947	 * sectorsize.
 7948	 * If the range doesn't cover the full folio, we don't need to and
 7949	 * shouldn't clear page extent mapped, as folio->private can still
 7950	 * record subpage dirty bits for other part of the range.
 7951	 *
 7952	 * For cases that invalidate the full folio even the range doesn't
 7953	 * cover the full folio, like invalidating the last folio, we're
 7954	 * still safe to wait for ordered extent to finish.
 7955	 */
 7956	if (!(offset == 0 && length == folio_size(folio))) {
 7957		btrfs_release_folio(folio, GFP_NOFS);
 7958		return;
 7959	}
 7960
 7961	if (!inode_evicting)
 7962		lock_extent(tree, page_start, page_end, &cached_state);
 7963
 7964	cur = page_start;
 7965	while (cur < page_end) {
 7966		struct btrfs_ordered_extent *ordered;
 7967		u64 range_end;
 7968		u32 range_len;
 7969		u32 extra_flags = 0;
 7970
 7971		ordered = btrfs_lookup_first_ordered_range(inode, cur,
 7972							   page_end + 1 - cur);
 7973		if (!ordered) {
 7974			range_end = page_end;
 7975			/*
 7976			 * No ordered extent covering this range, we are safe
 7977			 * to delete all extent states in the range.
 7978			 */
 7979			extra_flags = EXTENT_CLEAR_ALL_BITS;
 7980			goto next;
 7981		}
 7982		if (ordered->file_offset > cur) {
 7983			/*
 7984			 * There is a range between [cur, oe->file_offset) not
 7985			 * covered by any ordered extent.
 7986			 * We are safe to delete all extent states, and handle
 7987			 * the ordered extent in the next iteration.
 7988			 */
 7989			range_end = ordered->file_offset - 1;
 7990			extra_flags = EXTENT_CLEAR_ALL_BITS;
 7991			goto next;
 7992		}
 7993
 7994		range_end = min(ordered->file_offset + ordered->num_bytes - 1,
 7995				page_end);
 7996		ASSERT(range_end + 1 - cur < U32_MAX);
 7997		range_len = range_end + 1 - cur;
 7998		if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
 7999			/*
 8000			 * If Ordered (Private2) is cleared, it means endio has
 8001			 * already been executed for the range.
 8002			 * We can't delete the extent states as
 8003			 * btrfs_finish_ordered_io() may still use some of them.
 8004			 */
 8005			goto next;
 8006		}
 8007		btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
 8008
 8009		/*
 8010		 * IO on this page will never be started, so we need to account
 8011		 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
 8012		 * here, must leave that up for the ordered extent completion.
 8013		 *
 8014		 * This will also unlock the range for incoming
 8015		 * btrfs_finish_ordered_io().
 8016		 */
 8017		if (!inode_evicting)
 8018			clear_extent_bit(tree, cur, range_end,
 8019					 EXTENT_DELALLOC |
 8020					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
 8021					 EXTENT_DEFRAG, &cached_state);
 8022
 8023		spin_lock_irq(&inode->ordered_tree_lock);
 8024		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
 8025		ordered->truncated_len = min(ordered->truncated_len,
 8026					     cur - ordered->file_offset);
 8027		spin_unlock_irq(&inode->ordered_tree_lock);
 8028
 8029		/*
 8030		 * If the ordered extent has finished, we're safe to delete all
 8031		 * the extent states of the range, otherwise
 8032		 * btrfs_finish_ordered_io() will get executed by endio for
 8033		 * other pages, so we can't delete extent states.
 8034		 */
 8035		if (btrfs_dec_test_ordered_pending(inode, &ordered,
 8036						   cur, range_end + 1 - cur)) {
 8037			btrfs_finish_ordered_io(ordered);
 8038			/*
 8039			 * The ordered extent has finished, now we're again
 8040			 * safe to delete all extent states of the range.
 8041			 */
 8042			extra_flags = EXTENT_CLEAR_ALL_BITS;
 8043		}
 8044next:
 8045		if (ordered)
 8046			btrfs_put_ordered_extent(ordered);
 8047		/*
 8048		 * Qgroup reserved space handler
 8049		 * Sector(s) here will be either:
 8050		 *
 8051		 * 1) Already written to disk or bio already finished
 8052		 *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
 8053		 *    Qgroup will be handled by its qgroup_record then.
 8054		 *    btrfs_qgroup_free_data() call will do nothing here.
 8055		 *
 8056		 * 2) Not written to disk yet
 8057		 *    Then btrfs_qgroup_free_data() call will clear the
 8058		 *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
 8059		 *    reserved data space.
 8060		 *    Since the IO will never happen for this page.
 8061		 */
 8062		btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
 8063		if (!inode_evicting) {
 8064			clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
 8065				 EXTENT_DELALLOC | EXTENT_UPTODATE |
 8066				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
 8067				 extra_flags, &cached_state);
 8068		}
 8069		cur = range_end + 1;
 8070	}
 8071	/*
 8072	 * We have iterated through all ordered extents of the page, the page
 8073	 * should not have Ordered (Private2) anymore, or the above iteration
 8074	 * did something wrong.
 8075	 */
 8076	ASSERT(!folio_test_ordered(folio));
 8077	btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
 8078	if (!inode_evicting)
 8079		__btrfs_release_folio(folio, GFP_NOFS);
 8080	clear_page_extent_mapped(&folio->page);
 8081}
 8082
 8083/*
 8084 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
 8085 * called from a page fault handler when a page is first dirtied. Hence we must
 8086 * be careful to check for EOF conditions here. We set the page up correctly
 8087 * for a written page which means we get ENOSPC checking when writing into
 8088 * holes and correct delalloc and unwritten extent mapping on filesystems that
 8089 * support these features.
 8090 *
 8091 * We are not allowed to take the i_mutex here so we have to play games to
 8092 * protect against truncate races as the page could now be beyond EOF.  Because
 8093 * truncate_setsize() writes the inode size before removing pages, once we have
 8094 * the page lock we can determine safely if the page is beyond EOF. If it is not
 8095 * beyond EOF, then the page is guaranteed safe against truncation until we
 8096 * unlock the page.
 8097 */
 8098vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 8099{
 8100	struct page *page = vmf->page;
 8101	struct inode *inode = file_inode(vmf->vma->vm_file);
 8102	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 8103	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 8104	struct btrfs_ordered_extent *ordered;
 8105	struct extent_state *cached_state = NULL;
 8106	struct extent_changeset *data_reserved = NULL;
 8107	unsigned long zero_start;
 8108	loff_t size;
 8109	vm_fault_t ret;
 8110	int ret2;
 8111	int reserved = 0;
 8112	u64 reserved_space;
 8113	u64 page_start;
 8114	u64 page_end;
 8115	u64 end;
 8116
 8117	reserved_space = PAGE_SIZE;
 8118
 8119	sb_start_pagefault(inode->i_sb);
 8120	page_start = page_offset(page);
 8121	page_end = page_start + PAGE_SIZE - 1;
 8122	end = page_end;
 8123
 8124	/*
 8125	 * Reserving delalloc space after obtaining the page lock can lead to
 8126	 * deadlock. For example, if a dirty page is locked by this function
 8127	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
 8128	 * dirty page write out, then the btrfs_writepages() function could
 8129	 * end up waiting indefinitely to get a lock on the page currently
 8130	 * being processed by btrfs_page_mkwrite() function.
 8131	 */
 8132	ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
 8133					    page_start, reserved_space);
 8134	if (!ret2) {
 8135		ret2 = file_update_time(vmf->vma->vm_file);
 8136		reserved = 1;
 8137	}
 8138	if (ret2) {
 8139		ret = vmf_error(ret2);
 8140		if (reserved)
 8141			goto out;
 8142		goto out_noreserve;
 8143	}
 8144
 8145	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 8146again:
 8147	down_read(&BTRFS_I(inode)->i_mmap_lock);
 8148	lock_page(page);
 8149	size = i_size_read(inode);
 8150
 8151	if ((page->mapping != inode->i_mapping) ||
 8152	    (page_start >= size)) {
 8153		/* page got truncated out from underneath us */
 8154		goto out_unlock;
 8155	}
 8156	wait_on_page_writeback(page);
 8157
 8158	lock_extent(io_tree, page_start, page_end, &cached_state);
 8159	ret2 = set_page_extent_mapped(page);
 8160	if (ret2 < 0) {
 8161		ret = vmf_error(ret2);
 8162		unlock_extent(io_tree, page_start, page_end, &cached_state);
 8163		goto out_unlock;
 8164	}
 8165
 8166	/*
 8167	 * we can't set the delalloc bits if there are pending ordered
 8168	 * extents.  Drop our locks and wait for them to finish
 8169	 */
 8170	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
 8171			PAGE_SIZE);
 8172	if (ordered) {
 8173		unlock_extent(io_tree, page_start, page_end, &cached_state);
 8174		unlock_page(page);
 8175		up_read(&BTRFS_I(inode)->i_mmap_lock);
 8176		btrfs_start_ordered_extent(ordered);
 8177		btrfs_put_ordered_extent(ordered);
 8178		goto again;
 8179	}
 8180
 8181	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
 8182		reserved_space = round_up(size - page_start,
 8183					  fs_info->sectorsize);
 8184		if (reserved_space < PAGE_SIZE) {
 8185			end = page_start + reserved_space - 1;
 8186			btrfs_delalloc_release_space(BTRFS_I(inode),
 8187					data_reserved, page_start,
 8188					PAGE_SIZE - reserved_space, true);
 8189		}
 8190	}
 8191
 8192	/*
 8193	 * page_mkwrite gets called when the page is firstly dirtied after it's
 8194	 * faulted in, but write(2) could also dirty a page and set delalloc
 8195	 * bits, thus in this case for space account reason, we still need to
 8196	 * clear any delalloc bits within this page range since we have to
 8197	 * reserve data&meta space before lock_page() (see above comments).
 8198	 */
 8199	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
 8200			  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
 8201			  EXTENT_DEFRAG, &cached_state);
 8202
 8203	ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
 8204					&cached_state);
 8205	if (ret2) {
 8206		unlock_extent(io_tree, page_start, page_end, &cached_state);
 8207		ret = VM_FAULT_SIGBUS;
 8208		goto out_unlock;
 8209	}
 8210
 8211	/* page is wholly or partially inside EOF */
 8212	if (page_start + PAGE_SIZE > size)
 8213		zero_start = offset_in_page(size);
 8214	else
 8215		zero_start = PAGE_SIZE;
 8216
 8217	if (zero_start != PAGE_SIZE)
 8218		memzero_page(page, zero_start, PAGE_SIZE - zero_start);
 8219
 8220	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
 8221	btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
 8222	btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
 8223
 8224	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
 8225
 8226	unlock_extent(io_tree, page_start, page_end, &cached_state);
 8227	up_read(&BTRFS_I(inode)->i_mmap_lock);
 8228
 8229	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
 8230	sb_end_pagefault(inode->i_sb);
 8231	extent_changeset_free(data_reserved);
 8232	return VM_FAULT_LOCKED;
 8233
 8234out_unlock:
 8235	unlock_page(page);
 8236	up_read(&BTRFS_I(inode)->i_mmap_lock);
 8237out:
 8238	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
 8239	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
 8240				     reserved_space, (ret != 0));
 8241out_noreserve:
 8242	sb_end_pagefault(inode->i_sb);
 8243	extent_changeset_free(data_reserved);
 8244	return ret;
 8245}
 8246
 8247static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
 8248{
 8249	struct btrfs_truncate_control control = {
 8250		.inode = inode,
 8251		.ino = btrfs_ino(inode),
 8252		.min_type = BTRFS_EXTENT_DATA_KEY,
 8253		.clear_extent_range = true,
 8254	};
 8255	struct btrfs_root *root = inode->root;
 8256	struct btrfs_fs_info *fs_info = root->fs_info;
 8257	struct btrfs_block_rsv *rsv;
 8258	int ret;
 8259	struct btrfs_trans_handle *trans;
 8260	u64 mask = fs_info->sectorsize - 1;
 8261	const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
 8262
 8263	if (!skip_writeback) {
 8264		ret = btrfs_wait_ordered_range(&inode->vfs_inode,
 8265					       inode->vfs_inode.i_size & (~mask),
 8266					       (u64)-1);
 8267		if (ret)
 8268			return ret;
 8269	}
 8270
 8271	/*
 8272	 * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
 8273	 * things going on here:
 8274	 *
 8275	 * 1) We need to reserve space to update our inode.
 8276	 *
 8277	 * 2) We need to have something to cache all the space that is going to
 8278	 * be free'd up by the truncate operation, but also have some slack
 8279	 * space reserved in case it uses space during the truncate (thank you
 8280	 * very much snapshotting).
 8281	 *
 8282	 * And we need these to be separate.  The fact is we can use a lot of
 8283	 * space doing the truncate, and we have no earthly idea how much space
 8284	 * we will use, so we need the truncate reservation to be separate so it
 8285	 * doesn't end up using space reserved for updating the inode.  We also
 8286	 * need to be able to stop the transaction and start a new one, which
 8287	 * means we need to be able to update the inode several times, and we
 8288	 * have no idea of knowing how many times that will be, so we can't just
 8289	 * reserve 1 item for the entirety of the operation, so that has to be
 8290	 * done separately as well.
 8291	 *
 8292	 * So that leaves us with
 8293	 *
 8294	 * 1) rsv - for the truncate reservation, which we will steal from the
 8295	 * transaction reservation.
 8296	 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
 8297	 * updating the inode.
 8298	 */
 8299	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
 8300	if (!rsv)
 8301		return -ENOMEM;
 8302	rsv->size = min_size;
 8303	rsv->failfast = true;
 8304
 8305	/*
 8306	 * 1 for the truncate slack space
 8307	 * 1 for updating the inode.
 8308	 */
 8309	trans = btrfs_start_transaction(root, 2);
 8310	if (IS_ERR(trans)) {
 8311		ret = PTR_ERR(trans);
 8312		goto out;
 8313	}
 8314
 8315	/* Migrate the slack space for the truncate to our reserve */
 8316	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
 8317				      min_size, false);
 8318	/*
 8319	 * We have reserved 2 metadata units when we started the transaction and
 8320	 * min_size matches 1 unit, so this should never fail, but if it does,
 8321	 * it's not critical we just fail truncation.
 8322	 */
 8323	if (WARN_ON(ret)) {
 8324		btrfs_end_transaction(trans);
 8325		goto out;
 8326	}
 8327
 8328	trans->block_rsv = rsv;
 8329
 8330	while (1) {
 8331		struct extent_state *cached_state = NULL;
 8332		const u64 new_size = inode->vfs_inode.i_size;
 8333		const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
 8334
 8335		control.new_size = new_size;
 8336		lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
 8337		/*
 8338		 * We want to drop from the next block forward in case this new
 8339		 * size is not block aligned since we will be keeping the last
 8340		 * block of the extent just the way it is.
 8341		 */
 8342		btrfs_drop_extent_map_range(inode,
 8343					    ALIGN(new_size, fs_info->sectorsize),
 8344					    (u64)-1, false);
 8345
 8346		ret = btrfs_truncate_inode_items(trans, root, &control);
 8347
 8348		inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
 8349		btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
 8350
 8351		unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
 8352
 8353		trans->block_rsv = &fs_info->trans_block_rsv;
 8354		if (ret != -ENOSPC && ret != -EAGAIN)
 8355			break;
 8356
 8357		ret = btrfs_update_inode(trans, inode);
 8358		if (ret)
 8359			break;
 8360
 8361		btrfs_end_transaction(trans);
 8362		btrfs_btree_balance_dirty(fs_info);
 8363
 8364		trans = btrfs_start_transaction(root, 2);
 8365		if (IS_ERR(trans)) {
 8366			ret = PTR_ERR(trans);
 8367			trans = NULL;
 8368			break;
 8369		}
 8370
 8371		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
 8372		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
 8373					      rsv, min_size, false);
 8374		/*
 8375		 * We have reserved 2 metadata units when we started the
 8376		 * transaction and min_size matches 1 unit, so this should never
 8377		 * fail, but if it does, it's not critical we just fail truncation.
 8378		 */
 8379		if (WARN_ON(ret))
 8380			break;
 8381
 8382		trans->block_rsv = rsv;
 8383	}
 8384
 8385	/*
 8386	 * We can't call btrfs_truncate_block inside a trans handle as we could
 8387	 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
 8388	 * know we've truncated everything except the last little bit, and can
 8389	 * do btrfs_truncate_block and then update the disk_i_size.
 8390	 */
 8391	if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
 8392		btrfs_end_transaction(trans);
 8393		btrfs_btree_balance_dirty(fs_info);
 8394
 8395		ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
 8396		if (ret)
 8397			goto out;
 8398		trans = btrfs_start_transaction(root, 1);
 8399		if (IS_ERR(trans)) {
 8400			ret = PTR_ERR(trans);
 8401			goto out;
 8402		}
 8403		btrfs_inode_safe_disk_i_size_write(inode, 0);
 8404	}
 8405
 8406	if (trans) {
 8407		int ret2;
 8408
 8409		trans->block_rsv = &fs_info->trans_block_rsv;
 8410		ret2 = btrfs_update_inode(trans, inode);
 8411		if (ret2 && !ret)
 8412			ret = ret2;
 8413
 8414		ret2 = btrfs_end_transaction(trans);
 8415		if (ret2 && !ret)
 8416			ret = ret2;
 8417		btrfs_btree_balance_dirty(fs_info);
 8418	}
 8419out:
 8420	btrfs_free_block_rsv(fs_info, rsv);
 8421	/*
 8422	 * So if we truncate and then write and fsync we normally would just
 8423	 * write the extents that changed, which is a problem if we need to
 8424	 * first truncate that entire inode.  So set this flag so we write out
 8425	 * all of the extents in the inode to the sync log so we're completely
 8426	 * safe.
 8427	 *
 8428	 * If no extents were dropped or trimmed we don't need to force the next
 8429	 * fsync to truncate all the inode's items from the log and re-log them
 8430	 * all. This means the truncate operation did not change the file size,
 8431	 * or changed it to a smaller size but there was only an implicit hole
 8432	 * between the old i_size and the new i_size, and there were no prealloc
 8433	 * extents beyond i_size to drop.
 8434	 */
 8435	if (control.extents_found > 0)
 8436		btrfs_set_inode_full_sync(inode);
 8437
 8438	return ret;
 8439}
 8440
 8441struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
 8442				     struct inode *dir)
 8443{
 8444	struct inode *inode;
 8445
 8446	inode = new_inode(dir->i_sb);
 8447	if (inode) {
 8448		/*
 8449		 * Subvolumes don't inherit the sgid bit or the parent's gid if
 8450		 * the parent's sgid bit is set. This is probably a bug.
 8451		 */
 8452		inode_init_owner(idmap, inode, NULL,
 8453				 S_IFDIR | (~current_umask() & S_IRWXUGO));
 8454		inode->i_op = &btrfs_dir_inode_operations;
 8455		inode->i_fop = &btrfs_dir_file_operations;
 8456	}
 8457	return inode;
 8458}
 8459
 8460struct inode *btrfs_alloc_inode(struct super_block *sb)
 8461{
 8462	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 8463	struct btrfs_inode *ei;
 8464	struct inode *inode;
 8465
 8466	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
 8467	if (!ei)
 8468		return NULL;
 8469
 8470	ei->root = NULL;
 8471	ei->generation = 0;
 8472	ei->last_trans = 0;
 8473	ei->last_sub_trans = 0;
 8474	ei->logged_trans = 0;
 8475	ei->delalloc_bytes = 0;
 8476	ei->new_delalloc_bytes = 0;
 8477	ei->defrag_bytes = 0;
 8478	ei->disk_i_size = 0;
 8479	ei->flags = 0;
 8480	ei->ro_flags = 0;
 8481	ei->csum_bytes = 0;
 8482	ei->index_cnt = (u64)-1;
 8483	ei->dir_index = 0;
 8484	ei->last_unlink_trans = 0;
 8485	ei->last_reflink_trans = 0;
 8486	ei->last_log_commit = 0;
 8487
 8488	spin_lock_init(&ei->lock);
 8489	ei->outstanding_extents = 0;
 8490	if (sb->s_magic != BTRFS_TEST_MAGIC)
 8491		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
 8492					      BTRFS_BLOCK_RSV_DELALLOC);
 8493	ei->runtime_flags = 0;
 8494	ei->prop_compress = BTRFS_COMPRESS_NONE;
 8495	ei->defrag_compress = BTRFS_COMPRESS_NONE;
 8496
 8497	ei->delayed_node = NULL;
 8498
 8499	ei->i_otime_sec = 0;
 8500	ei->i_otime_nsec = 0;
 8501
 8502	inode = &ei->vfs_inode;
 8503	extent_map_tree_init(&ei->extent_tree);
 8504	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
 8505	ei->io_tree.inode = ei;
 8506	extent_io_tree_init(fs_info, &ei->file_extent_tree,
 8507			    IO_TREE_INODE_FILE_EXTENT);
 8508	mutex_init(&ei->log_mutex);
 8509	spin_lock_init(&ei->ordered_tree_lock);
 8510	ei->ordered_tree = RB_ROOT;
 8511	ei->ordered_tree_last = NULL;
 8512	INIT_LIST_HEAD(&ei->delalloc_inodes);
 8513	INIT_LIST_HEAD(&ei->delayed_iput);
 8514	RB_CLEAR_NODE(&ei->rb_node);
 8515	init_rwsem(&ei->i_mmap_lock);
 8516
 8517	return inode;
 8518}
 8519
 8520#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 8521void btrfs_test_destroy_inode(struct inode *inode)
 8522{
 8523	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
 8524	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 8525}
 8526#endif
 8527
 8528void btrfs_free_inode(struct inode *inode)
 8529{
 8530	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 8531}
 8532
 8533void btrfs_destroy_inode(struct inode *vfs_inode)
 8534{
 8535	struct btrfs_ordered_extent *ordered;
 8536	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
 8537	struct btrfs_root *root = inode->root;
 8538	bool freespace_inode;
 8539
 8540	WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
 8541	WARN_ON(vfs_inode->i_data.nrpages);
 8542	WARN_ON(inode->block_rsv.reserved);
 8543	WARN_ON(inode->block_rsv.size);
 8544	WARN_ON(inode->outstanding_extents);
 8545	if (!S_ISDIR(vfs_inode->i_mode)) {
 8546		WARN_ON(inode->delalloc_bytes);
 8547		WARN_ON(inode->new_delalloc_bytes);
 8548	}
 8549	WARN_ON(inode->csum_bytes);
 8550	WARN_ON(inode->defrag_bytes);
 8551
 8552	/*
 8553	 * This can happen where we create an inode, but somebody else also
 8554	 * created the same inode and we need to destroy the one we already
 8555	 * created.
 8556	 */
 8557	if (!root)
 8558		return;
 8559
 8560	/*
 8561	 * If this is a free space inode do not take the ordered extents lockdep
 8562	 * map.
 8563	 */
 8564	freespace_inode = btrfs_is_free_space_inode(inode);
 8565
 8566	while (1) {
 8567		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 8568		if (!ordered)
 8569			break;
 8570		else {
 8571			btrfs_err(root->fs_info,
 8572				  "found ordered extent %llu %llu on inode cleanup",
 8573				  ordered->file_offset, ordered->num_bytes);
 8574
 8575			if (!freespace_inode)
 8576				btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
 8577
 8578			btrfs_remove_ordered_extent(inode, ordered);
 8579			btrfs_put_ordered_extent(ordered);
 8580			btrfs_put_ordered_extent(ordered);
 8581		}
 8582	}
 8583	btrfs_qgroup_check_reserved_leak(inode);
 8584	inode_tree_del(inode);
 8585	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
 8586	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
 8587	btrfs_put_root(inode->root);
 8588}
 8589
 8590int btrfs_drop_inode(struct inode *inode)
 8591{
 8592	struct btrfs_root *root = BTRFS_I(inode)->root;
 8593
 8594	if (root == NULL)
 8595		return 1;
 8596
 8597	/* the snap/subvol tree is on deleting */
 8598	if (btrfs_root_refs(&root->root_item) == 0)
 8599		return 1;
 8600	else
 8601		return generic_drop_inode(inode);
 8602}
 8603
 8604static void init_once(void *foo)
 8605{
 8606	struct btrfs_inode *ei = foo;
 8607
 8608	inode_init_once(&ei->vfs_inode);
 8609}
 8610
 8611void __cold btrfs_destroy_cachep(void)
 8612{
 8613	/*
 8614	 * Make sure all delayed rcu free inodes are flushed before we
 8615	 * destroy cache.
 8616	 */
 8617	rcu_barrier();
 8618	bioset_exit(&btrfs_dio_bioset);
 8619	kmem_cache_destroy(btrfs_inode_cachep);
 8620}
 8621
 8622int __init btrfs_init_cachep(void)
 8623{
 8624	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
 8625			sizeof(struct btrfs_inode), 0,
 8626			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
 8627			init_once);
 8628	if (!btrfs_inode_cachep)
 8629		goto fail;
 8630
 8631	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
 8632			offsetof(struct btrfs_dio_private, bbio.bio),
 8633			BIOSET_NEED_BVECS))
 8634		goto fail;
 8635
 8636	return 0;
 8637fail:
 8638	btrfs_destroy_cachep();
 8639	return -ENOMEM;
 8640}
 8641
 8642static int btrfs_getattr(struct mnt_idmap *idmap,
 8643			 const struct path *path, struct kstat *stat,
 8644			 u32 request_mask, unsigned int flags)
 8645{
 8646	u64 delalloc_bytes;
 8647	u64 inode_bytes;
 8648	struct inode *inode = d_inode(path->dentry);
 8649	u32 blocksize = inode->i_sb->s_blocksize;
 8650	u32 bi_flags = BTRFS_I(inode)->flags;
 8651	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
 8652
 8653	stat->result_mask |= STATX_BTIME;
 8654	stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
 8655	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
 8656	if (bi_flags & BTRFS_INODE_APPEND)
 8657		stat->attributes |= STATX_ATTR_APPEND;
 8658	if (bi_flags & BTRFS_INODE_COMPRESS)
 8659		stat->attributes |= STATX_ATTR_COMPRESSED;
 8660	if (bi_flags & BTRFS_INODE_IMMUTABLE)
 8661		stat->attributes |= STATX_ATTR_IMMUTABLE;
 8662	if (bi_flags & BTRFS_INODE_NODUMP)
 8663		stat->attributes |= STATX_ATTR_NODUMP;
 8664	if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
 8665		stat->attributes |= STATX_ATTR_VERITY;
 8666
 8667	stat->attributes_mask |= (STATX_ATTR_APPEND |
 8668				  STATX_ATTR_COMPRESSED |
 8669				  STATX_ATTR_IMMUTABLE |
 8670				  STATX_ATTR_NODUMP);
 8671
 8672	generic_fillattr(idmap, request_mask, inode, stat);
 8673	stat->dev = BTRFS_I(inode)->root->anon_dev;
 8674
 8675	spin_lock(&BTRFS_I(inode)->lock);
 8676	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
 8677	inode_bytes = inode_get_bytes(inode);
 8678	spin_unlock(&BTRFS_I(inode)->lock);
 8679	stat->blocks = (ALIGN(inode_bytes, blocksize) +
 8680			ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
 8681	return 0;
 8682}
 8683
 8684static int btrfs_rename_exchange(struct inode *old_dir,
 8685			      struct dentry *old_dentry,
 8686			      struct inode *new_dir,
 8687			      struct dentry *new_dentry)
 8688{
 8689	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
 8690	struct btrfs_trans_handle *trans;
 8691	unsigned int trans_num_items;
 8692	struct btrfs_root *root = BTRFS_I(old_dir)->root;
 8693	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
 8694	struct inode *new_inode = new_dentry->d_inode;
 8695	struct inode *old_inode = old_dentry->d_inode;
 8696	struct btrfs_rename_ctx old_rename_ctx;
 8697	struct btrfs_rename_ctx new_rename_ctx;
 8698	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
 8699	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
 8700	u64 old_idx = 0;
 8701	u64 new_idx = 0;
 8702	int ret;
 8703	int ret2;
 8704	bool need_abort = false;
 8705	struct fscrypt_name old_fname, new_fname;
 8706	struct fscrypt_str *old_name, *new_name;
 8707
 8708	/*
 8709	 * For non-subvolumes allow exchange only within one subvolume, in the
 8710	 * same inode namespace. Two subvolumes (represented as directory) can
 8711	 * be exchanged as they're a logical link and have a fixed inode number.
 8712	 */
 8713	if (root != dest &&
 8714	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
 8715	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
 8716		return -EXDEV;
 8717
 8718	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
 8719	if (ret)
 8720		return ret;
 8721
 8722	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
 8723	if (ret) {
 8724		fscrypt_free_filename(&old_fname);
 8725		return ret;
 8726	}
 8727
 8728	old_name = &old_fname.disk_name;
 8729	new_name = &new_fname.disk_name;
 8730
 8731	/* close the race window with snapshot create/destroy ioctl */
 8732	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
 8733	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
 8734		down_read(&fs_info->subvol_sem);
 8735
 8736	/*
 8737	 * For each inode:
 8738	 * 1 to remove old dir item
 8739	 * 1 to remove old dir index
 8740	 * 1 to add new dir item
 8741	 * 1 to add new dir index
 8742	 * 1 to update parent inode
 8743	 *
 8744	 * If the parents are the same, we only need to account for one
 8745	 */
 8746	trans_num_items = (old_dir == new_dir ? 9 : 10);
 8747	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
 8748		/*
 8749		 * 1 to remove old root ref
 8750		 * 1 to remove old root backref
 8751		 * 1 to add new root ref
 8752		 * 1 to add new root backref
 8753		 */
 8754		trans_num_items += 4;
 8755	} else {
 8756		/*
 8757		 * 1 to update inode item
 8758		 * 1 to remove old inode ref
 8759		 * 1 to add new inode ref
 8760		 */
 8761		trans_num_items += 3;
 8762	}
 8763	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
 8764		trans_num_items += 4;
 8765	else
 8766		trans_num_items += 3;
 8767	trans = btrfs_start_transaction(root, trans_num_items);
 8768	if (IS_ERR(trans)) {
 8769		ret = PTR_ERR(trans);
 8770		goto out_notrans;
 8771	}
 8772
 8773	if (dest != root) {
 8774		ret = btrfs_record_root_in_trans(trans, dest);
 8775		if (ret)
 8776			goto out_fail;
 8777	}
 8778
 8779	/*
 8780	 * We need to find a free sequence number both in the source and
 8781	 * in the destination directory for the exchange.
 8782	 */
 8783	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
 8784	if (ret)
 8785		goto out_fail;
 8786	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
 8787	if (ret)
 8788		goto out_fail;
 8789
 8790	BTRFS_I(old_inode)->dir_index = 0ULL;
 8791	BTRFS_I(new_inode)->dir_index = 0ULL;
 8792
 8793	/* Reference for the source. */
 8794	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
 8795		/* force full log commit if subvolume involved. */
 8796		btrfs_set_log_full_commit(trans);
 8797	} else {
 8798		ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
 8799					     btrfs_ino(BTRFS_I(new_dir)),
 8800					     old_idx);
 8801		if (ret)
 8802			goto out_fail;
 8803		need_abort = true;
 8804	}
 8805
 8806	/* And now for the dest. */
 8807	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
 8808		/* force full log commit if subvolume involved. */
 8809		btrfs_set_log_full_commit(trans);
 8810	} else {
 8811		ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
 8812					     btrfs_ino(BTRFS_I(old_dir)),
 8813					     new_idx);
 8814		if (ret) {
 8815			if (need_abort)
 8816				btrfs_abort_transaction(trans, ret);
 8817			goto out_fail;
 8818		}
 8819	}
 8820
 8821	/* Update inode version and ctime/mtime. */
 8822	inode_inc_iversion(old_dir);
 8823	inode_inc_iversion(new_dir);
 8824	inode_inc_iversion(old_inode);
 8825	inode_inc_iversion(new_inode);
 8826	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
 8827
 8828	if (old_dentry->d_parent != new_dentry->d_parent) {
 8829		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
 8830					BTRFS_I(old_inode), true);
 8831		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
 8832					BTRFS_I(new_inode), true);
 8833	}
 8834
 8835	/* src is a subvolume */
 8836	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
 8837		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
 8838	} else { /* src is an inode */
 8839		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
 8840					   BTRFS_I(old_dentry->d_inode),
 8841					   old_name, &old_rename_ctx);
 8842		if (!ret)
 8843			ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
 8844	}
 8845	if (ret) {
 8846		btrfs_abort_transaction(trans, ret);
 8847		goto out_fail;
 8848	}
 8849
 8850	/* dest is a subvolume */
 8851	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
 8852		ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
 8853	} else { /* dest is an inode */
 8854		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
 8855					   BTRFS_I(new_dentry->d_inode),
 8856					   new_name, &new_rename_ctx);
 8857		if (!ret)
 8858			ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
 8859	}
 8860	if (ret) {
 8861		btrfs_abort_transaction(trans, ret);
 8862		goto out_fail;
 8863	}
 8864
 8865	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
 8866			     new_name, 0, old_idx);
 8867	if (ret) {
 8868		btrfs_abort_transaction(trans, ret);
 8869		goto out_fail;
 8870	}
 8871
 8872	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
 8873			     old_name, 0, new_idx);
 8874	if (ret) {
 8875		btrfs_abort_transaction(trans, ret);
 8876		goto out_fail;
 8877	}
 8878
 8879	if (old_inode->i_nlink == 1)
 8880		BTRFS_I(old_inode)->dir_index = old_idx;
 8881	if (new_inode->i_nlink == 1)
 8882		BTRFS_I(new_inode)->dir_index = new_idx;
 8883
 8884	/*
 8885	 * Now pin the logs of the roots. We do it to ensure that no other task
 8886	 * can sync the logs while we are in progress with the rename, because
 8887	 * that could result in an inconsistency in case any of the inodes that
 8888	 * are part of this rename operation were logged before.
 8889	 */
 8890	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
 8891		btrfs_pin_log_trans(root);
 8892	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
 8893		btrfs_pin_log_trans(dest);
 8894
 8895	/* Do the log updates for all inodes. */
 8896	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
 8897		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
 8898				   old_rename_ctx.index, new_dentry->d_parent);
 8899	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
 8900		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
 8901				   new_rename_ctx.index, old_dentry->d_parent);
 8902
 8903	/* Now unpin the logs. */
 8904	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
 8905		btrfs_end_log_trans(root);
 8906	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
 8907		btrfs_end_log_trans(dest);
 8908out_fail:
 8909	ret2 = btrfs_end_transaction(trans);
 8910	ret = ret ? ret : ret2;
 8911out_notrans:
 8912	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
 8913	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
 8914		up_read(&fs_info->subvol_sem);
 8915
 8916	fscrypt_free_filename(&new_fname);
 8917	fscrypt_free_filename(&old_fname);
 8918	return ret;
 8919}
 8920
 8921static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
 8922					struct inode *dir)
 8923{
 8924	struct inode *inode;
 8925
 8926	inode = new_inode(dir->i_sb);
 8927	if (inode) {
 8928		inode_init_owner(idmap, inode, dir,
 8929				 S_IFCHR | WHITEOUT_MODE);
 8930		inode->i_op = &btrfs_special_inode_operations;
 8931		init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
 8932	}
 8933	return inode;
 8934}
 8935
 8936static int btrfs_rename(struct mnt_idmap *idmap,
 8937			struct inode *old_dir, struct dentry *old_dentry,
 8938			struct inode *new_dir, struct dentry *new_dentry,
 8939			unsigned int flags)
 8940{
 8941	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
 8942	struct btrfs_new_inode_args whiteout_args = {
 8943		.dir = old_dir,
 8944		.dentry = old_dentry,
 8945	};
 8946	struct btrfs_trans_handle *trans;
 8947	unsigned int trans_num_items;
 8948	struct btrfs_root *root = BTRFS_I(old_dir)->root;
 8949	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
 8950	struct inode *new_inode = d_inode(new_dentry);
 8951	struct inode *old_inode = d_inode(old_dentry);
 8952	struct btrfs_rename_ctx rename_ctx;
 8953	u64 index = 0;
 8954	int ret;
 8955	int ret2;
 8956	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
 8957	struct fscrypt_name old_fname, new_fname;
 8958
 8959	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
 8960		return -EPERM;
 8961
 8962	/* we only allow rename subvolume link between subvolumes */
 8963	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
 8964		return -EXDEV;
 8965
 8966	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
 8967	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
 8968		return -ENOTEMPTY;
 8969
 8970	if (S_ISDIR(old_inode->i_mode) && new_inode &&
 8971	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 8972		return -ENOTEMPTY;
 8973
 8974	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
 8975	if (ret)
 8976		return ret;
 8977
 8978	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
 8979	if (ret) {
 8980		fscrypt_free_filename(&old_fname);
 8981		return ret;
 8982	}
 8983
 8984	/* check for collisions, even if the  name isn't there */
 8985	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
 8986	if (ret) {
 8987		if (ret == -EEXIST) {
 8988			/* we shouldn't get
 8989			 * eexist without a new_inode */
 8990			if (WARN_ON(!new_inode)) {
 8991				goto out_fscrypt_names;
 8992			}
 8993		} else {
 8994			/* maybe -EOVERFLOW */
 8995			goto out_fscrypt_names;
 8996		}
 8997	}
 8998	ret = 0;
 8999
 9000	/*
 9001	 * we're using rename to replace one file with another.  Start IO on it
 9002	 * now so  we don't add too much work to the end of the transaction
 9003	 */
 9004	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
 9005		filemap_flush(old_inode->i_mapping);
 9006
 9007	if (flags & RENAME_WHITEOUT) {
 9008		whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
 9009		if (!whiteout_args.inode) {
 9010			ret = -ENOMEM;
 9011			goto out_fscrypt_names;
 9012		}
 9013		ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
 9014		if (ret)
 9015			goto out_whiteout_inode;
 9016	} else {
 9017		/* 1 to update the old parent inode. */
 9018		trans_num_items = 1;
 9019	}
 9020
 9021	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
 9022		/* Close the race window with snapshot create/destroy ioctl */
 9023		down_read(&fs_info->subvol_sem);
 9024		/*
 9025		 * 1 to remove old root ref
 9026		 * 1 to remove old root backref
 9027		 * 1 to add new root ref
 9028		 * 1 to add new root backref
 9029		 */
 9030		trans_num_items += 4;
 9031	} else {
 9032		/*
 9033		 * 1 to update inode
 9034		 * 1 to remove old inode ref
 9035		 * 1 to add new inode ref
 9036		 */
 9037		trans_num_items += 3;
 9038	}
 9039	/*
 9040	 * 1 to remove old dir item
 9041	 * 1 to remove old dir index
 9042	 * 1 to add new dir item
 9043	 * 1 to add new dir index
 9044	 */
 9045	trans_num_items += 4;
 9046	/* 1 to update new parent inode if it's not the same as the old parent */
 9047	if (new_dir != old_dir)
 9048		trans_num_items++;
 9049	if (new_inode) {
 9050		/*
 9051		 * 1 to update inode
 9052		 * 1 to remove inode ref
 9053		 * 1 to remove dir item
 9054		 * 1 to remove dir index
 9055		 * 1 to possibly add orphan item
 9056		 */
 9057		trans_num_items += 5;
 9058	}
 9059	trans = btrfs_start_transaction(root, trans_num_items);
 9060	if (IS_ERR(trans)) {
 9061		ret = PTR_ERR(trans);
 9062		goto out_notrans;
 9063	}
 9064
 9065	if (dest != root) {
 9066		ret = btrfs_record_root_in_trans(trans, dest);
 9067		if (ret)
 9068			goto out_fail;
 9069	}
 9070
 9071	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
 9072	if (ret)
 9073		goto out_fail;
 9074
 9075	BTRFS_I(old_inode)->dir_index = 0ULL;
 9076	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
 9077		/* force full log commit if subvolume involved. */
 9078		btrfs_set_log_full_commit(trans);
 9079	} else {
 9080		ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
 9081					     old_ino, btrfs_ino(BTRFS_I(new_dir)),
 9082					     index);
 9083		if (ret)
 9084			goto out_fail;
 9085	}
 9086
 9087	inode_inc_iversion(old_dir);
 9088	inode_inc_iversion(new_dir);
 9089	inode_inc_iversion(old_inode);
 9090	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
 9091
 9092	if (old_dentry->d_parent != new_dentry->d_parent)
 9093		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
 9094					BTRFS_I(old_inode), true);
 9095
 9096	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
 9097		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
 9098	} else {
 9099		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
 9100					   BTRFS_I(d_inode(old_dentry)),
 9101					   &old_fname.disk_name, &rename_ctx);
 9102		if (!ret)
 9103			ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
 9104	}
 9105	if (ret) {
 9106		btrfs_abort_transaction(trans, ret);
 9107		goto out_fail;
 9108	}
 9109
 9110	if (new_inode) {
 9111		inode_inc_iversion(new_inode);
 9112		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
 9113			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
 9114			ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
 9115			BUG_ON(new_inode->i_nlink == 0);
 9116		} else {
 9117			ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
 9118						 BTRFS_I(d_inode(new_dentry)),
 9119						 &new_fname.disk_name);
 9120		}
 9121		if (!ret && new_inode->i_nlink == 0)
 9122			ret = btrfs_orphan_add(trans,
 9123					BTRFS_I(d_inode(new_dentry)));
 9124		if (ret) {
 9125			btrfs_abort_transaction(trans, ret);
 9126			goto out_fail;
 9127		}
 9128	}
 9129
 9130	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
 9131			     &new_fname.disk_name, 0, index);
 9132	if (ret) {
 9133		btrfs_abort_transaction(trans, ret);
 9134		goto out_fail;
 9135	}
 9136
 9137	if (old_inode->i_nlink == 1)
 9138		BTRFS_I(old_inode)->dir_index = index;
 9139
 9140	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
 9141		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
 9142				   rename_ctx.index, new_dentry->d_parent);
 9143
 9144	if (flags & RENAME_WHITEOUT) {
 9145		ret = btrfs_create_new_inode(trans, &whiteout_args);
 9146		if (ret) {
 9147			btrfs_abort_transaction(trans, ret);
 9148			goto out_fail;
 9149		} else {
 9150			unlock_new_inode(whiteout_args.inode);
 9151			iput(whiteout_args.inode);
 9152			whiteout_args.inode = NULL;
 9153		}
 9154	}
 9155out_fail:
 9156	ret2 = btrfs_end_transaction(trans);
 9157	ret = ret ? ret : ret2;
 9158out_notrans:
 9159	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
 9160		up_read(&fs_info->subvol_sem);
 9161	if (flags & RENAME_WHITEOUT)
 9162		btrfs_new_inode_args_destroy(&whiteout_args);
 9163out_whiteout_inode:
 9164	if (flags & RENAME_WHITEOUT)
 9165		iput(whiteout_args.inode);
 9166out_fscrypt_names:
 9167	fscrypt_free_filename(&old_fname);
 9168	fscrypt_free_filename(&new_fname);
 9169	return ret;
 9170}
 9171
 9172static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
 9173			 struct dentry *old_dentry, struct inode *new_dir,
 9174			 struct dentry *new_dentry, unsigned int flags)
 9175{
 9176	int ret;
 9177
 9178	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 9179		return -EINVAL;
 9180
 9181	if (flags & RENAME_EXCHANGE)
 9182		ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
 9183					    new_dentry);
 9184	else
 9185		ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
 9186				   new_dentry, flags);
 9187
 9188	btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
 9189
 9190	return ret;
 9191}
 9192
 9193struct btrfs_delalloc_work {
 9194	struct inode *inode;
 9195	struct completion completion;
 9196	struct list_head list;
 9197	struct btrfs_work work;
 9198};
 9199
 9200static void btrfs_run_delalloc_work(struct btrfs_work *work)
 9201{
 9202	struct btrfs_delalloc_work *delalloc_work;
 9203	struct inode *inode;
 9204
 9205	delalloc_work = container_of(work, struct btrfs_delalloc_work,
 9206				     work);
 9207	inode = delalloc_work->inode;
 9208	filemap_flush(inode->i_mapping);
 9209	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
 9210				&BTRFS_I(inode)->runtime_flags))
 9211		filemap_flush(inode->i_mapping);
 9212
 9213	iput(inode);
 9214	complete(&delalloc_work->completion);
 9215}
 9216
 9217static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
 9218{
 9219	struct btrfs_delalloc_work *work;
 9220
 9221	work = kmalloc(sizeof(*work), GFP_NOFS);
 9222	if (!work)
 9223		return NULL;
 9224
 9225	init_completion(&work->completion);
 9226	INIT_LIST_HEAD(&work->list);
 9227	work->inode = inode;
 9228	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);
 9229
 9230	return work;
 9231}
 9232
 9233/*
 9234 * some fairly slow code that needs optimization. This walks the list
 9235 * of all the inodes with pending delalloc and forces them to disk.
 9236 */
 9237static int start_delalloc_inodes(struct btrfs_root *root,
 9238				 struct writeback_control *wbc, bool snapshot,
 9239				 bool in_reclaim_context)
 9240{
 9241	struct btrfs_inode *binode;
 9242	struct inode *inode;
 9243	struct btrfs_delalloc_work *work, *next;
 9244	LIST_HEAD(works);
 9245	LIST_HEAD(splice);
 9246	int ret = 0;
 9247	bool full_flush = wbc->nr_to_write == LONG_MAX;
 9248
 9249	mutex_lock(&root->delalloc_mutex);
 9250	spin_lock(&root->delalloc_lock);
 9251	list_splice_init(&root->delalloc_inodes, &splice);
 9252	while (!list_empty(&splice)) {
 9253		binode = list_entry(splice.next, struct btrfs_inode,
 9254				    delalloc_inodes);
 9255
 9256		list_move_tail(&binode->delalloc_inodes,
 9257			       &root->delalloc_inodes);
 9258
 9259		if (in_reclaim_context &&
 9260		    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
 9261			continue;
 9262
 9263		inode = igrab(&binode->vfs_inode);
 9264		if (!inode) {
 9265			cond_resched_lock(&root->delalloc_lock);
 9266			continue;
 9267		}
 9268		spin_unlock(&root->delalloc_lock);
 9269
 9270		if (snapshot)
 9271			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
 9272				&binode->runtime_flags);
 9273		if (full_flush) {
 9274			work = btrfs_alloc_delalloc_work(inode);
 9275			if (!work) {
 9276				iput(inode);
 9277				ret = -ENOMEM;
 9278				goto out;
 9279			}
 9280			list_add_tail(&work->list, &works);
 9281			btrfs_queue_work(root->fs_info->flush_workers,
 9282					 &work->work);
 9283		} else {
 9284			ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
 9285			btrfs_add_delayed_iput(BTRFS_I(inode));
 9286			if (ret || wbc->nr_to_write <= 0)
 9287				goto out;
 9288		}
 9289		cond_resched();
 9290		spin_lock(&root->delalloc_lock);
 9291	}
 9292	spin_unlock(&root->delalloc_lock);
 9293
 9294out:
 9295	list_for_each_entry_safe(work, next, &works, list) {
 9296		list_del_init(&work->list);
 9297		wait_for_completion(&work->completion);
 9298		kfree(work);
 9299	}
 9300
 9301	if (!list_empty(&splice)) {
 9302		spin_lock(&root->delalloc_lock);
 9303		list_splice_tail(&splice, &root->delalloc_inodes);
 9304		spin_unlock(&root->delalloc_lock);
 9305	}
 9306	mutex_unlock(&root->delalloc_mutex);
 9307	return ret;
 9308}
 9309
 9310int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
 9311{
 9312	struct writeback_control wbc = {
 9313		.nr_to_write = LONG_MAX,
 9314		.sync_mode = WB_SYNC_NONE,
 9315		.range_start = 0,
 9316		.range_end = LLONG_MAX,
 9317	};
 9318	struct btrfs_fs_info *fs_info = root->fs_info;
 9319
 9320	if (BTRFS_FS_ERROR(fs_info))
 9321		return -EROFS;
 9322
 9323	return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
 9324}
 9325
 9326int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
 9327			       bool in_reclaim_context)
 9328{
 9329	struct writeback_control wbc = {
 9330		.nr_to_write = nr,
 9331		.sync_mode = WB_SYNC_NONE,
 9332		.range_start = 0,
 9333		.range_end = LLONG_MAX,
 9334	};
 9335	struct btrfs_root *root;
 9336	LIST_HEAD(splice);
 9337	int ret;
 9338
 9339	if (BTRFS_FS_ERROR(fs_info))
 9340		return -EROFS;
 9341
 9342	mutex_lock(&fs_info->delalloc_root_mutex);
 9343	spin_lock(&fs_info->delalloc_root_lock);
 9344	list_splice_init(&fs_info->delalloc_roots, &splice);
 9345	while (!list_empty(&splice)) {
 9346		/*
 9347		 * Reset nr_to_write here so we know that we're doing a full
 9348		 * flush.
 9349		 */
 9350		if (nr == LONG_MAX)
 9351			wbc.nr_to_write = LONG_MAX;
 9352
 9353		root = list_first_entry(&splice, struct btrfs_root,
 9354					delalloc_root);
 9355		root = btrfs_grab_root(root);
 9356		BUG_ON(!root);
 9357		list_move_tail(&root->delalloc_root,
 9358			       &fs_info->delalloc_roots);
 9359		spin_unlock(&fs_info->delalloc_root_lock);
 9360
 9361		ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
 9362		btrfs_put_root(root);
 9363		if (ret < 0 || wbc.nr_to_write <= 0)
 9364			goto out;
 9365		spin_lock(&fs_info->delalloc_root_lock);
 9366	}
 9367	spin_unlock(&fs_info->delalloc_root_lock);
 9368
 9369	ret = 0;
 9370out:
 9371	if (!list_empty(&splice)) {
 9372		spin_lock(&fs_info->delalloc_root_lock);
 9373		list_splice_tail(&splice, &fs_info->delalloc_roots);
 9374		spin_unlock(&fs_info->delalloc_root_lock);
 9375	}
 9376	mutex_unlock(&fs_info->delalloc_root_mutex);
 9377	return ret;
 9378}
 9379
/*
 * Create the symbolic link @dentry in directory @dir with target @symname.
 *
 * The target string is written as an inline file extent item keyed at file
 * offset 0 of the new inode, so targets longer than
 * BTRFS_MAX_INLINE_DATA_SIZE() are rejected with -ENAMETOOLONG.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
			 struct dentry *dentry, const char *symname)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct inode *inode;
	struct btrfs_new_inode_args new_inode_args = {
		.dir = dir,
		.dentry = dentry,
	};
	unsigned int trans_num_items;
	int err;
	int name_len;
	int datasize;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	struct extent_buffer *leaf;

	name_len = strlen(symname);
	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
		return -ENAMETOOLONG;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;
	inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
	inode->i_op = &btrfs_symlink_inode_operations;
	inode_nohighmem(inode);
	inode->i_mapping->a_ops = &btrfs_aops;
	/* Both the size and the byte count of the link are the target length. */
	btrfs_i_size_write(BTRFS_I(inode), name_len);
	inode_set_bytes(inode, name_len);

	new_inode_args.inode = inode;
	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
	if (err)
		goto out_inode;
	/* 1 additional item for the inline extent */
	trans_num_items++;

	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_new_inode_args;
	}

	err = btrfs_create_new_inode(trans, &new_inode_args);
	if (err)
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		btrfs_abort_transaction(trans, err);
		/*
		 * The inode is discarded here and the pointer cleared so that
		 * the common exit path does not drop it a second time.
		 */
		discard_new_inode(inode);
		inode = NULL;
		goto out;
	}
	/* The inline extent holding the target lives at file offset 0. */
	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.offset = 0;
	key.type = BTRFS_EXTENT_DATA_KEY;
	datasize = btrfs_file_extent_calc_inline_size(name_len);
	err = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (err) {
		btrfs_abort_transaction(trans, err);
		btrfs_free_path(path);
		/* Same as above: discard now, clear so the exit path skips it. */
		discard_new_inode(inode);
		inode = NULL;
		goto out;
	}
	/* Fill in the freshly inserted item as an uncompressed inline extent. */
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei,
				   BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_compression(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);

	/* Copy the target string (without a terminator) into the item. */
	ptr = btrfs_file_extent_inline_start(ei);
	write_extent_buffer(leaf, symname, ptr, name_len);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_free_path(path);

	d_instantiate_new(dentry, inode);
	err = 0;
out:
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
	btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
	/* On failure drop the inode reference (already cleared if discarded). */
	if (err)
		iput(inode);
	return err;
}
 9481
 9482static struct btrfs_trans_handle *insert_prealloc_file_extent(
 9483				       struct btrfs_trans_handle *trans_in,
 9484				       struct btrfs_inode *inode,
 9485				       struct btrfs_key *ins,
 9486				       u64 file_offset)
 9487{
 9488	struct btrfs_file_extent_item stack_fi;
 9489	struct btrfs_replace_extent_info extent_info;
 9490	struct btrfs_trans_handle *trans = trans_in;
 9491	struct btrfs_path *path;
 9492	u64 start = ins->objectid;
 9493	u64 len = ins->offset;
 9494	u64 qgroup_released = 0;
 9495	int ret;
 9496
 9497	memset(&stack_fi, 0, sizeof(stack_fi));
 9498
 9499	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
 9500	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
 9501	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
 9502	btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
 9503	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
 9504	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
 9505	/* Encryption and other encoding is reserved and all 0 */
 9506
 9507	ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
 9508	if (ret < 0)
 9509		return ERR_PTR(ret);
 9510
 9511	if (trans) {
 9512		ret = insert_reserved_file_extent(trans, inode,
 9513						  file_offset, &stack_fi,
 9514						  true, qgroup_released);
 9515		if (ret)
 9516			goto free_qgroup;
 9517		return trans;
 9518	}
 9519
 9520	extent_info.disk_offset = start;
 9521	extent_info.disk_len = len;
 9522	extent_info.data_offset = 0;
 9523	extent_info.data_len = len;
 9524	extent_info.file_offset = file_offset;
 9525	extent_info.extent_buf = (char *)&stack_fi;
 9526	extent_info.is_new_extent = true;
 9527	extent_info.update_times = true;
 9528	extent_info.qgroup_reserved = qgroup_released;
 9529	extent_info.insertions = 0;
 9530
 9531	path = btrfs_alloc_path();
 9532	if (!path) {
 9533		ret = -ENOMEM;
 9534		goto free_qgroup;
 9535	}
 9536
 9537	ret = btrfs_replace_file_extents(inode, path, file_offset,
 9538				     file_offset + len - 1, &extent_info,
 9539				     &trans);
 9540	btrfs_free_path(path);
 9541	if (ret)
 9542		goto free_qgroup;
 9543	return trans;
 9544
 9545free_qgroup:
 9546	/*
 9547	 * We have released qgroup data range at the beginning of the function,
 9548	 * and normally qgroup_released bytes will be freed when committing
 9549	 * transaction.
 9550	 * But if we error out early, we have to free what we have released
 9551	 * or we leak qgroup data reservation.
 9552	 */
 9553	btrfs_qgroup_free_refroot(inode->root->fs_info,
 9554			inode->root->root_key.objectid, qgroup_released,
 9555			BTRFS_QGROUP_RSV_DATA);
 9556	return ERR_PTR(ret);
 9557}
 9558
/*
 * Preallocate extents for the file range [@start, @start + @num_bytes) of
 * @inode, one reserved extent at a time.
 *
 * @mode:       fallocate mode; with FALLOC_FL_KEEP_SIZE i_size is not changed
 * @min_size:   minimum size requested from btrfs_reserve_extent()
 * @actual_len: upper bound for the new i_size when the file is extended
 * @alloc_hint: allocation hint, updated to the end of each reserved extent
 * @trans:      caller's transaction, or NULL.  A caller-supplied transaction
 *              is never ended here.  With NULL, the handle returned by
 *              insert_prealloc_file_extent() is ended after every extent.
 *
 * Returns 0 on success or a negative errno.  Data space reserved for the part
 * of the range that was not converted into reserved extents is released
 * before returning.
 */
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
				       u64 start, u64 num_bytes, u64 min_size,
				       loff_t actual_len, u64 *alloc_hint,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key ins;
	u64 cur_offset = start;
	u64 clear_offset = start;
	u64 i_size;
	u64 cur_bytes;
	u64 last_alloc = (u64)-1;
	int ret = 0;
	bool own_trans = true;
	u64 end = start + num_bytes - 1;

	if (trans)
		own_trans = false;
	while (num_bytes > 0) {
		/* Ask for at most 256M per extent, but never less than min_size. */
		cur_bytes = min_t(u64, num_bytes, SZ_256M);
		cur_bytes = max(cur_bytes, min_size);
		/*
		 * If we are severely fragmented we could end up with really
		 * small allocations, so if the allocator is returning small
		 * chunks lets make its job easier by only searching for those
		 * sized chunks.
		 */
		cur_bytes = min(cur_bytes, last_alloc);
		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
				min_size, 0, *alloc_hint, &ins, 1, 0);
		if (ret)
			break;

		/*
		 * We've reserved this space, and thus converted it from
		 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
		 * from here on out we will only need to clear our reservation
		 * for the remaining unreserved area, so advance our
		 * clear_offset by our extent size.
		 */
		clear_offset += ins.offset;

		last_alloc = ins.offset;
		trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
						    &ins, cur_offset);
		/*
		 * Now that we inserted the prealloc extent we can finally
		 * decrement the number of reservations in the block group.
		 * If we did it before, we could race with relocation and have
		 * relocation miss the reserved extent, making it fail later.
		 */
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			/* The item was not inserted: give the extent back. */
			btrfs_free_reserved_extent(fs_info, ins.objectid,
						   ins.offset, 0);
			break;
		}

		em = alloc_extent_map();
		if (!em) {
			/*
			 * No memory for an extent map: drop whatever is cached
			 * for this range and flag the inode for a full sync
			 * instead of failing the preallocation.
			 */
			btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
					    cur_offset + ins.offset - 1, false);
			btrfs_set_inode_full_sync(BTRFS_I(inode));
			goto next;
		}

		/* Describe the new extent as an uncompressed prealloc mapping. */
		em->start = cur_offset;
		em->orig_start = cur_offset;
		em->len = ins.offset;
		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = ins.offset;
		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
		em->generation = trans->transid;

		/*
		 * NOTE(review): this return value is overwritten by
		 * btrfs_update_inode() below before anything checks it.
		 */
		ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
		free_extent_map(em);
next:
		num_bytes -= ins.offset;
		cur_offset += ins.offset;
		*alloc_hint = ins.objectid + ins.offset;

		inode_inc_iversion(inode);
		inode_set_ctime_current(inode);
		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
		/*
		 * Unless the caller asked to keep the size, grow i_size up to
		 * the end of what has been allocated so far, capped at
		 * actual_len.
		 */
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    (actual_len > inode->i_size) &&
		    (cur_offset > inode->i_size)) {
			if (cur_offset > actual_len)
				i_size = actual_len;
			else
				i_size = cur_offset;
			i_size_write(inode, i_size);
			btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
		}

		ret = btrfs_update_inode(trans, BTRFS_I(inode));

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (own_trans)
				btrfs_end_transaction(trans);
			break;
		}

		/*
		 * A transaction we did not get from the caller is ended per
		 * extent; clearing the handle makes the next iteration obtain
		 * a new one from insert_prealloc_file_extent().
		 */
		if (own_trans) {
			btrfs_end_transaction(trans);
			trans = NULL;
		}
	}
	/* Release the data reservation for the part that was never reserved. */
	if (clear_offset < end)
		btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
			end - clear_offset + 1);
	return ret;
}
 9678
 9679int btrfs_prealloc_file_range(struct inode *inode, int mode,
 9680			      u64 start, u64 num_bytes, u64 min_size,
 9681			      loff_t actual_len, u64 *alloc_hint)
 9682{
 9683	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
 9684					   min_size, actual_len, alloc_hint,
 9685					   NULL);
 9686}
 9687
 9688int btrfs_prealloc_file_range_trans(struct inode *inode,
 9689				    struct btrfs_trans_handle *trans, int mode,
 9690				    u64 start, u64 num_bytes, u64 min_size,
 9691				    loff_t actual_len, u64 *alloc_hint)
 9692{
 9693	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
 9694					   min_size, actual_len, alloc_hint, trans);
 9695}
 9696
 9697static int btrfs_permission(struct mnt_idmap *idmap,
 9698			    struct inode *inode, int mask)
 9699{
 9700	struct btrfs_root *root = BTRFS_I(inode)->root;
 9701	umode_t mode = inode->i_mode;
 9702
 9703	if (mask & MAY_WRITE &&
 9704	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
 9705		if (btrfs_root_readonly(root))
 9706			return -EROFS;
 9707		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
 9708			return -EACCES;
 9709	}
 9710	return generic_permission(idmap, inode, mask);
 9711}
 9712
 9713static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
 9714			 struct file *file, umode_t mode)
 9715{
 9716	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 9717	struct btrfs_trans_handle *trans;
 9718	struct btrfs_root *root = BTRFS_I(dir)->root;
 9719	struct inode *inode;
 9720	struct btrfs_new_inode_args new_inode_args = {
 9721		.dir = dir,
 9722		.dentry = file->f_path.dentry,
 9723		.orphan = true,
 9724	};
 9725	unsigned int trans_num_items;
 9726	int ret;
 9727
 9728	inode = new_inode(dir->i_sb);
 9729	if (!inode)
 9730		return -ENOMEM;
 9731	inode_init_owner(idmap, inode, dir, mode);
 9732	inode->i_fop = &btrfs_file_operations;
 9733	inode->i_op = &btrfs_file_inode_operations;
 9734	inode->i_mapping->a_ops = &btrfs_aops;
 9735
 9736	new_inode_args.inode = inode;
 9737	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
 9738	if (ret)
 9739		goto out_inode;
 9740
 9741	trans = btrfs_start_transaction(root, trans_num_items);
 9742	if (IS_ERR(trans)) {
 9743		ret = PTR_ERR(trans);
 9744		goto out_new_inode_args;
 9745	}
 9746
 9747	ret = btrfs_create_new_inode(trans, &new_inode_args);
 9748
 9749	/*
 9750	 * We set number of links to 0 in btrfs_create_new_inode(), and here we
 9751	 * set it to 1 because d_tmpfile() will issue a warning if the count is
 9752	 * 0, through:
 9753	 *
 9754	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
 9755	 */
 9756	set_nlink(inode, 1);
 9757
 9758	if (!ret) {
 9759		d_tmpfile(file, inode);
 9760		unlock_new_inode(inode);
 9761		mark_inode_dirty(inode);
 9762	}
 9763
 9764	btrfs_end_transaction(trans);
 9765	btrfs_btree_balance_dirty(fs_info);
 9766out_new_inode_args:
 9767	btrfs_new_inode_args_destroy(&new_inode_args);
 9768out_inode:
 9769	if (ret)
 9770		iput(inode);
 9771	return finish_open_simple(file, ret);
 9772}
 9773
 9774void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
 9775{
 9776	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 9777	unsigned long index = start >> PAGE_SHIFT;
 9778	unsigned long end_index = end >> PAGE_SHIFT;
 9779	struct page *page;
 9780	u32 len;
 9781
 9782	ASSERT(end + 1 - start <= U32_MAX);
 9783	len = end + 1 - start;
 9784	while (index <= end_index) {
 9785		page = find_get_page(inode->vfs_inode.i_mapping, index);
 9786		ASSERT(page); /* Pages should be in the extent_io_tree */
 9787
 9788		btrfs_page_set_writeback(fs_info, page, start, len);
 9789		put_page(page);
 9790		index++;
 9791	}
 9792}
 9793
 9794int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
 9795					     int compress_type)
 9796{
 9797	switch (compress_type) {
 9798	case BTRFS_COMPRESS_NONE:
 9799		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
 9800	case BTRFS_COMPRESS_ZLIB:
 9801		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
 9802	case BTRFS_COMPRESS_LZO:
 9803		/*
 9804		 * The LZO format depends on the sector size. 64K is the maximum
 9805		 * sector size that we support.
 9806		 */
 9807		if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
 9808			return -EINVAL;
 9809		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
 9810		       (fs_info->sectorsize_bits - 12);
 9811	case BTRFS_COMPRESS_ZSTD:
 9812		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
 9813	default:
 9814		return -EUCLEAN;
 9815	}
 9816}
 9817
 9818static ssize_t btrfs_encoded_read_inline(
 9819				struct kiocb *iocb,
 9820				struct iov_iter *iter, u64 start,
 9821				u64 lockend,
 9822				struct extent_state **cached_state,
 9823				u64 extent_start, size_t count,
 9824				struct btrfs_ioctl_encoded_io_args *encoded,
 9825				bool *unlocked)
 9826{
 9827	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
 9828	struct btrfs_root *root = inode->root;
 9829	struct btrfs_fs_info *fs_info = root->fs_info;
 9830	struct extent_io_tree *io_tree = &inode->io_tree;
 9831	struct btrfs_path *path;
 9832	struct extent_buffer *leaf;
 9833	struct btrfs_file_extent_item *item;
 9834	u64 ram_bytes;
 9835	unsigned long ptr;
 9836	void *tmp;
 9837	ssize_t ret;
 9838
 9839	path = btrfs_alloc_path();
 9840	if (!path) {
 9841		ret = -ENOMEM;
 9842		goto out;
 9843	}
 9844	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
 9845				       extent_start, 0);
 9846	if (ret) {
 9847		if (ret > 0) {
 9848			/* The extent item disappeared? */
 9849			ret = -EIO;
 9850		}
 9851		goto out;
 9852	}
 9853	leaf = path->nodes[0];
 9854	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
 9855
 9856	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
 9857	ptr = btrfs_file_extent_inline_start(item);
 9858
 9859	encoded->len = min_t(u64, extent_start + ram_bytes,
 9860			     inode->vfs_inode.i_size) - iocb->ki_pos;
 9861	ret = btrfs_encoded_io_compression_from_extent(fs_info,
 9862				 btrfs_file_extent_compression(leaf, item));
 9863	if (ret < 0)
 9864		goto out;
 9865	encoded->compression = ret;
 9866	if (encoded->compression) {
 9867		size_t inline_size;
 9868
 9869		inline_size = btrfs_file_extent_inline_item_len(leaf,
 9870								path->slots[0]);
 9871		if (inline_size > count) {
 9872			ret = -ENOBUFS;
 9873			goto out;
 9874		}
 9875		count = inline_size;
 9876		encoded->unencoded_len = ram_bytes;
 9877		encoded->unencoded_offset = iocb->ki_pos - extent_start;
 9878	} else {
 9879		count = min_t(u64, count, encoded->len);
 9880		encoded->len = count;
 9881		encoded->unencoded_len = count;
 9882		ptr += iocb->ki_pos - extent_start;
 9883	}
 9884
 9885	tmp = kmalloc(count, GFP_NOFS);
 9886	if (!tmp) {
 9887		ret = -ENOMEM;
 9888		goto out;
 9889	}
 9890	read_extent_buffer(leaf, tmp, ptr, count);
 9891	btrfs_release_path(path);
 9892	unlock_extent(io_tree, start, lockend, cached_state);
 9893	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 9894	*unlocked = true;
 9895
 9896	ret = copy_to_iter(tmp, count, iter);
 9897	if (ret != count)
 9898		ret = -EFAULT;
 9899	kfree(tmp);
 9900out:
 9901	btrfs_free_path(path);
 9902	return ret;
 9903}
 9904
 9905struct btrfs_encoded_read_private {
 9906	wait_queue_head_t wait;
 9907	atomic_t pending;
 9908	blk_status_t status;
 9909};
 9910
 9911static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
 9912{
 9913	struct btrfs_encoded_read_private *priv = bbio->private;
 9914
 9915	if (bbio->bio.bi_status) {
 9916		/*
 9917		 * The memory barrier implied by the atomic_dec_return() here
 9918		 * pairs with the memory barrier implied by the
 9919		 * atomic_dec_return() or io_wait_event() in
 9920		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
 9921		 * write is observed before the load of status in
 9922		 * btrfs_encoded_read_regular_fill_pages().
 9923		 */
 9924		WRITE_ONCE(priv->status, bbio->bio.bi_status);
 9925	}
 9926	if (!atomic_dec_return(&priv->pending))
 9927		wake_up(&priv->wait);
 9928	bio_put(&bbio->bio);
 9929}
 9930
 9931int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
 9932					  u64 file_offset, u64 disk_bytenr,
 9933					  u64 disk_io_size, struct page **pages)
 9934{
 9935	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 9936	struct btrfs_encoded_read_private priv = {
 9937		.pending = ATOMIC_INIT(1),
 9938	};
 9939	unsigned long i = 0;
 9940	struct btrfs_bio *bbio;
 9941
 9942	init_waitqueue_head(&priv.wait);
 9943
 9944	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
 9945			       btrfs_encoded_read_endio, &priv);
 9946	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
 9947	bbio->inode = inode;
 9948
 9949	do {
 9950		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
 9951
 9952		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
 9953			atomic_inc(&priv.pending);
 9954			btrfs_submit_bio(bbio, 0);
 9955
 9956			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
 9957					       btrfs_encoded_read_endio, &priv);
 9958			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
 9959			bbio->inode = inode;
 9960			continue;
 9961		}
 9962
 9963		i++;
 9964		disk_bytenr += bytes;
 9965		disk_io_size -= bytes;
 9966	} while (disk_io_size);
 9967
 9968	atomic_inc(&priv.pending);
 9969	btrfs_submit_bio(bbio, 0);
 9970
 9971	if (atomic_dec_return(&priv.pending))
 9972		io_wait_event(priv.wait, !atomic_read(&priv.pending));
 9973	/* See btrfs_encoded_read_endio() for ordering. */
 9974	return blk_status_to_errno(READ_ONCE(priv.status));
 9975}
 9976
 9977static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
 9978					  struct iov_iter *iter,
 9979					  u64 start, u64 lockend,
 9980					  struct extent_state **cached_state,
 9981					  u64 disk_bytenr, u64 disk_io_size,
 9982					  size_t count, bool compressed,
 9983					  bool *unlocked)
 9984{
 9985	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
 9986	struct extent_io_tree *io_tree = &inode->io_tree;
 9987	struct page **pages;
 9988	unsigned long nr_pages, i;
 9989	u64 cur;
 9990	size_t page_offset;
 9991	ssize_t ret;
 9992
 9993	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
 9994	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
 9995	if (!pages)
 9996		return -ENOMEM;
 9997	ret = btrfs_alloc_page_array(nr_pages, pages);
 9998	if (ret) {
 9999		ret = -ENOMEM;
10000		goto out;
10001		}
10002
10003	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
10004						    disk_io_size, pages);
10005	if (ret)
10006		goto out;
10007
10008	unlock_extent(io_tree, start, lockend, cached_state);
10009	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10010	*unlocked = true;
10011
10012	if (compressed) {
10013		i = 0;
10014		page_offset = 0;
10015	} else {
10016		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
10017		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
10018	}
10019	cur = 0;
10020	while (cur < count) {
10021		size_t bytes = min_t(size_t, count - cur,
10022				     PAGE_SIZE - page_offset);
10023
10024		if (copy_page_to_iter(pages[i], page_offset, bytes,
10025				      iter) != bytes) {
10026			ret = -EFAULT;
10027			goto out;
10028		}
10029		i++;
10030		cur += bytes;
10031		page_offset = 0;
10032	}
10033	ret = count;
10034out:
10035	for (i = 0; i < nr_pages; i++) {
10036		if (pages[i])
10037			__free_page(pages[i]);
10038	}
10039	kfree(pages);
10040	return ret;
10041}
10042
/*
 * Read the extent containing iocb->ki_pos in its on-disk ("encoded") form.
 *
 * @iocb:    file and position to read from; ki_pos is advanced by
 *           encoded->len when the read does not fail
 * @iter:    destination for the data
 * @encoded: filled in to describe what was copied (len, unencoded_len and,
 *           for compressed data, unencoded_offset and compression)
 *
 * Inline extents are handled by btrfs_encoded_read_inline(), holes and
 * preallocated extents are returned as zeroes, and all other extents go
 * through btrfs_encoded_read_regular().  A compressed extent that does not fit
 * in @iter fails with -ENOBUFS.
 *
 * Returns 0 when ki_pos is at or beyond i_size, the number of bytes copied to
 * @iter otherwise, or a negative errno.
 */
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
			   struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_io_tree *io_tree = &inode->io_tree;
	ssize_t ret;
	size_t count = iov_iter_count(iter);
	u64 start, lockend, disk_bytenr, disk_io_size;
	struct extent_state *cached_state = NULL;
	struct extent_map *em;
	/* Set once the extent range and inode lock have been released. */
	bool unlocked = false;

	file_accessed(iocb->ki_filp);

	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);

	if (iocb->ki_pos >= inode->vfs_inode.i_size) {
		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
		return 0;
	}
	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
	/*
	 * We don't know how long the extent containing iocb->ki_pos is, but if
	 * it's compressed we know that it won't be longer than this.
	 */
	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;

	/*
	 * Wait for ordered extents in the range, lock it, and retry if a new
	 * ordered extent showed up before the lock was taken.
	 */
	for (;;) {
		struct btrfs_ordered_extent *ordered;

		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
					       lockend - start + 1);
		if (ret)
			goto out_unlock_inode;
		lock_extent(io_tree, start, lockend, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, start,
						     lockend - start + 1);
		if (!ordered)
			break;
		btrfs_put_ordered_extent(ordered);
		unlock_extent(io_tree, start, lockend, &cached_state);
		cond_resched();
	}

	em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_unlock_extent;
	}

	if (em->block_start == EXTENT_MAP_INLINE) {
		u64 extent_start = em->start;

		/*
		 * For inline extents we get everything we need out of the
		 * extent item.
		 */
		free_extent_map(em);
		em = NULL;
		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
						&cached_state, extent_start,
						count, encoded, &unlocked);
		goto out;
	}

	/*
	 * We only want to return up to EOF even if the extent extends beyond
	 * that.
	 */
	encoded->len = min_t(u64, extent_map_end(em),
			     inode->vfs_inode.i_size) - iocb->ki_pos;
	if (em->block_start == EXTENT_MAP_HOLE ||
	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
		/* Nothing to read from disk: the range is returned as zeroes. */
		disk_bytenr = EXTENT_MAP_HOLE;
		count = min_t(u64, count, encoded->len);
		encoded->len = count;
		encoded->unencoded_len = count;
	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
		disk_bytenr = em->block_start;
		/*
		 * Bail if the buffer isn't large enough to return the whole
		 * compressed extent.
		 */
		if (em->block_len > count) {
			ret = -ENOBUFS;
			goto out_em;
		}
		disk_io_size = em->block_len;
		count = em->block_len;
		encoded->unencoded_len = em->ram_bytes;
		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
		ret = btrfs_encoded_io_compression_from_extent(fs_info,
							     em->compress_type);
		if (ret < 0)
			goto out_em;
		encoded->compression = ret;
	} else {
		/* Uncompressed extent: read from the sector holding ki_pos. */
		disk_bytenr = em->block_start + (start - em->start);
		if (encoded->len > count)
			encoded->len = count;
		/*
		 * Don't read beyond what we locked. This also limits the page
		 * allocations that we'll do.
		 */
		disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
		count = start + disk_io_size - iocb->ki_pos;
		encoded->len = count;
		encoded->unencoded_len = count;
		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
	}
	free_extent_map(em);
	em = NULL;

	if (disk_bytenr == EXTENT_MAP_HOLE) {
		/* Drop the locks before touching the destination buffer. */
		unlock_extent(io_tree, start, lockend, &cached_state);
		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
		unlocked = true;
		ret = iov_iter_zero(count, iter);
		if (ret != count)
			ret = -EFAULT;
	} else {
		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
						 &cached_state, disk_bytenr,
						 disk_io_size, count,
						 encoded->compression,
						 &unlocked);
	}

out:
	if (ret >= 0)
		iocb->ki_pos += encoded->len;
out_em:
	free_extent_map(em);
out_unlock_extent:
	/* A helper may already have released both locks on our behalf. */
	if (!unlocked)
		unlock_extent(io_tree, start, lockend, &cached_state);
out_unlock_inode:
	if (!unlocked)
		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	return ret;
}
10185
10186ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
10187			       const struct btrfs_ioctl_encoded_io_args *encoded)
10188{
10189	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10190	struct btrfs_root *root = inode->root;
10191	struct btrfs_fs_info *fs_info = root->fs_info;
10192	struct extent_io_tree *io_tree = &inode->io_tree;
10193	struct extent_changeset *data_reserved = NULL;
10194	struct extent_state *cached_state = NULL;
10195	struct btrfs_ordered_extent *ordered;
10196	int compression;
10197	size_t orig_count;
10198	u64 start, end;
10199	u64 num_bytes, ram_bytes, disk_num_bytes;
10200	unsigned long nr_pages, i;
10201	struct page **pages;
10202	struct btrfs_key ins;
10203	bool extent_reserved = false;
10204	struct extent_map *em;
10205	ssize_t ret;
10206
10207	switch (encoded->compression) {
10208	case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
10209		compression = BTRFS_COMPRESS_ZLIB;
10210		break;
10211	case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
10212		compression = BTRFS_COMPRESS_ZSTD;
10213		break;
10214	case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
10215	case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
10216	case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
10217	case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
10218	case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
10219		/* The sector size must match for LZO. */
10220		if (encoded->compression -
10221		    BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
10222		    fs_info->sectorsize_bits)
10223			return -EINVAL;
10224		compression = BTRFS_COMPRESS_LZO;
10225		break;
10226	default:
10227		return -EINVAL;
10228	}
10229	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
10230		return -EINVAL;
10231
10232	orig_count = iov_iter_count(from);
10233
10234	/* The extent size must be sane. */
10235	if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
10236	    orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
10237		return -EINVAL;
10238
10239	/*
10240	 * The compressed data must be smaller than the decompressed data.
10241	 *
10242	 * It's of course possible for data to compress to larger or the same
10243	 * size, but the buffered I/O path falls back to no compression for such
10244	 * data, and we don't want to break any assumptions by creating these
10245	 * extents.
10246	 *
10247	 * Note that this is less strict than the current check we have that the
10248	 * compressed data must be at least one sector smaller than the
10249	 * decompressed data. We only want to enforce the weaker requirement
10250	 * from old kernels that it is at least one byte smaller.
10251	 */
10252	if (orig_count >= encoded->unencoded_len)
10253		return -EINVAL;
10254
10255	/* The extent must start on a sector boundary. */
10256	start = iocb->ki_pos;
10257	if (!IS_ALIGNED(start, fs_info->sectorsize))
10258		return -EINVAL;
10259
10260	/*
10261	 * The extent must end on a sector boundary. However, we allow a write
10262	 * which ends at or extends i_size to have an unaligned length; we round
10263	 * up the extent size and set i_size to the unaligned end.
10264	 */
10265	if (start + encoded->len < inode->vfs_inode.i_size &&
10266	    !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
10267		return -EINVAL;
10268
10269	/* Finally, the offset in the unencoded data must be sector-aligned. */
10270	if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
10271		return -EINVAL;
10272
10273	num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
10274	ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
10275	end = start + num_bytes - 1;
10276
10277	/*
10278	 * If the extent cannot be inline, the compressed data on disk must be
10279	 * sector-aligned. For convenience, we extend it with zeroes if it
10280	 * isn't.
10281	 */
10282	disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
10283	nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
10284	pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
10285	if (!pages)
10286		return -ENOMEM;
10287	for (i = 0; i < nr_pages; i++) {
10288		size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
10289		char *kaddr;
10290
10291		pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
10292		if (!pages[i]) {
10293			ret = -ENOMEM;
10294			goto out_pages;
10295		}
10296		kaddr = kmap_local_page(pages[i]);
10297		if (copy_from_iter(kaddr, bytes, from) != bytes) {
10298			kunmap_local(kaddr);
10299			ret = -EFAULT;
10300			goto out_pages;
10301		}
10302		if (bytes < PAGE_SIZE)
10303			memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
10304		kunmap_local(kaddr);
10305	}
10306
10307	for (;;) {
10308		struct btrfs_ordered_extent *ordered;
10309
10310		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
10311		if (ret)
10312			goto out_pages;
10313		ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
10314						    start >> PAGE_SHIFT,
10315						    end >> PAGE_SHIFT);
10316		if (ret)
10317			goto out_pages;
10318		lock_extent(io_tree, start, end, &cached_state);
10319		ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
10320		if (!ordered &&
10321		    !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
10322			break;
10323		if (ordered)
10324			btrfs_put_ordered_extent(ordered);
10325		unlock_extent(io_tree, start, end, &cached_state);
10326		cond_resched();
10327	}
10328
10329	/*
10330	 * We don't use the higher-level delalloc space functions because our
10331	 * num_bytes and disk_num_bytes are different.
10332	 */
10333	ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
10334	if (ret)
10335		goto out_unlock;
10336	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
10337	if (ret)
10338		goto out_free_data_space;
10339	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
10340					      false);
10341	if (ret)
10342		goto out_qgroup_free_data;
10343
10344	/* Try an inline extent first. */
10345	if (start == 0 && encoded->unencoded_len == encoded->len &&
10346	    encoded->unencoded_offset == 0) {
10347		ret = cow_file_range_inline(inode, encoded->len, orig_count,
10348					    compression, pages, true);
10349		if (ret <= 0) {
10350			if (ret == 0)
10351				ret = orig_count;
10352			goto out_delalloc_release;
10353		}
10354	}
10355
10356	ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
10357				   disk_num_bytes, 0, 0, &ins, 1, 1);
10358	if (ret)
10359		goto out_delalloc_release;
10360	extent_reserved = true;
10361
10362	em = create_io_em(inode, start, num_bytes,
10363			  start - encoded->unencoded_offset, ins.objectid,
10364			  ins.offset, ins.offset, ram_bytes, compression,
10365			  BTRFS_ORDERED_COMPRESSED);
10366	if (IS_ERR(em)) {
10367		ret = PTR_ERR(em);
10368		goto out_free_reserved;
10369	}
10370	free_extent_map(em);
10371
10372	ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
10373				       ins.objectid, ins.offset,
10374				       encoded->unencoded_offset,
10375				       (1 << BTRFS_ORDERED_ENCODED) |
10376				       (1 << BTRFS_ORDERED_COMPRESSED),
10377				       compression);
10378	if (IS_ERR(ordered)) {
10379		btrfs_drop_extent_map_range(inode, start, end, false);
10380		ret = PTR_ERR(ordered);
10381		goto out_free_reserved;
10382	}
10383	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10384
10385	if (start + encoded->len > inode->vfs_inode.i_size)
10386		i_size_write(&inode->vfs_inode, start + encoded->len);
10387
10388	unlock_extent(io_tree, start, end, &cached_state);
10389
10390	btrfs_delalloc_release_extents(inode, num_bytes);
10391
10392	btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
10393	ret = orig_count;
10394	goto out;
10395
10396out_free_reserved:
10397	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10398	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
10399out_delalloc_release:
10400	btrfs_delalloc_release_extents(inode, num_bytes);
10401	btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
10402out_qgroup_free_data:
10403	if (ret < 0)
10404		btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
10405out_free_data_space:
10406	/*
10407	 * If btrfs_reserve_extent() succeeded, then we already decremented
10408	 * bytes_may_use.
10409	 */
10410	if (!extent_reserved)
10411		btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
10412out_unlock:
10413	unlock_extent(io_tree, start, end, &cached_state);
10414out_pages:
10415	for (i = 0; i < nr_pages; i++) {
10416		if (pages[i])
10417			__free_page(pages[i]);
10418	}
10419	kvfree(pages);
10420out:
10421	if (ret >= 0)
10422		iocb->ki_pos += encoded->len;
10423	return ret;
10424}
10425
10426#ifdef CONFIG_SWAP
10427/*
10428 * Add an entry indicating a block group or device which is pinned by a
10429 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
10430 * negative errno on failure.
10431 */
10432static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
10433				  bool is_block_group)
10434{
10435	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10436	struct btrfs_swapfile_pin *sp, *entry;
10437	struct rb_node **p;
10438	struct rb_node *parent = NULL;
10439
10440	sp = kmalloc(sizeof(*sp), GFP_NOFS);
10441	if (!sp)
10442		return -ENOMEM;
10443	sp->ptr = ptr;
10444	sp->inode = inode;
10445	sp->is_block_group = is_block_group;
10446	sp->bg_extent_count = 1;
10447
10448	spin_lock(&fs_info->swapfile_pins_lock);
10449	p = &fs_info->swapfile_pins.rb_node;
10450	while (*p) {
10451		parent = *p;
10452		entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
10453		if (sp->ptr < entry->ptr ||
10454		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
10455			p = &(*p)->rb_left;
10456		} else if (sp->ptr > entry->ptr ||
10457			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
10458			p = &(*p)->rb_right;
10459		} else {
10460			if (is_block_group)
10461				entry->bg_extent_count++;
10462			spin_unlock(&fs_info->swapfile_pins_lock);
10463			kfree(sp);
10464			return 1;
10465		}
10466	}
10467	rb_link_node(&sp->node, parent, p);
10468	rb_insert_color(&sp->node, &fs_info->swapfile_pins);
10469	spin_unlock(&fs_info->swapfile_pins_lock);
10470	return 0;
10471}
10472
10473/* Free all of the entries pinned by this swapfile. */
10474static void btrfs_free_swapfile_pins(struct inode *inode)
10475{
10476	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10477	struct btrfs_swapfile_pin *sp;
10478	struct rb_node *node, *next;
10479
10480	spin_lock(&fs_info->swapfile_pins_lock);
10481	node = rb_first(&fs_info->swapfile_pins);
10482	while (node) {
10483		next = rb_next(node);
10484		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10485		if (sp->inode == inode) {
10486			rb_erase(&sp->node, &fs_info->swapfile_pins);
10487			if (sp->is_block_group) {
10488				btrfs_dec_block_group_swap_extents(sp->ptr,
10489							   sp->bg_extent_count);
10490				btrfs_put_block_group(sp->ptr);
10491			}
10492			kfree(sp);
10493		}
10494		node = next;
10495	}
10496	spin_unlock(&fs_info->swapfile_pins_lock);
10497}
10498
/*
 * Accumulator used while walking a swapfile's extents during activation.
 *
 * Physically contiguous extents are merged into one pending run
 * (start/block_start/block_len) before being handed to the swap core by
 * btrfs_add_swap_extent(); the remaining fields are running totals.
 */
struct btrfs_swap_info {
	/* File offset at which the pending run begins. */
	u64 start;
	/* Physical (on-device) byte address at which the pending run begins. */
	u64 block_start;
	/* Length in bytes of the pending run; 0 means no run is pending. */
	u64 block_len;
	/* Lowest physical page number reported so far. */
	u64 lowest_ppage;
	/* Highest physical page number reported so far. */
	u64 highest_ppage;
	/* Total number of pages passed to add_swap_extent() so far. */
	unsigned long nr_pages;
	/* Sum of the (non-negative) return values of add_swap_extent(). */
	int nr_extents;
};
10508
10509static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10510				 struct btrfs_swap_info *bsi)
10511{
10512	unsigned long nr_pages;
10513	unsigned long max_pages;
10514	u64 first_ppage, first_ppage_reported, next_ppage;
10515	int ret;
10516
10517	/*
10518	 * Our swapfile may have had its size extended after the swap header was
10519	 * written. In that case activating the swapfile should not go beyond
10520	 * the max size set in the swap header.
10521	 */
10522	if (bsi->nr_pages >= sis->max)
10523		return 0;
10524
10525	max_pages = sis->max - bsi->nr_pages;
10526	first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
10527	next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
10528
10529	if (first_ppage >= next_ppage)
10530		return 0;
10531	nr_pages = next_ppage - first_ppage;
10532	nr_pages = min(nr_pages, max_pages);
10533
10534	first_ppage_reported = first_ppage;
10535	if (bsi->start == 0)
10536		first_ppage_reported++;
10537	if (bsi->lowest_ppage > first_ppage_reported)
10538		bsi->lowest_ppage = first_ppage_reported;
10539	if (bsi->highest_ppage < (next_ppage - 1))
10540		bsi->highest_ppage = next_ppage - 1;
10541
10542	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
10543	if (ret < 0)
10544		return ret;
10545	bsi->nr_extents += ret;
10546	bsi->nr_pages += nr_pages;
10547	return 0;
10548}
10549
10550static void btrfs_swap_deactivate(struct file *file)
10551{
10552	struct inode *inode = file_inode(file);
10553
10554	btrfs_free_swapfile_pins(inode);
10555	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
10556}
10557
10558static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10559			       sector_t *span)
10560{
10561	struct inode *inode = file_inode(file);
10562	struct btrfs_root *root = BTRFS_I(inode)->root;
10563	struct btrfs_fs_info *fs_info = root->fs_info;
10564	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
10565	struct extent_state *cached_state = NULL;
10566	struct extent_map *em = NULL;
10567	struct btrfs_device *device = NULL;
10568	struct btrfs_swap_info bsi = {
10569		.lowest_ppage = (sector_t)-1ULL,
10570	};
10571	int ret = 0;
10572	u64 isize;
10573	u64 start;
10574
10575	/*
10576	 * If the swap file was just created, make sure delalloc is done. If the
10577	 * file changes again after this, the user is doing something stupid and
10578	 * we don't really care.
10579	 */
10580	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
10581	if (ret)
10582		return ret;
10583
10584	/*
10585	 * The inode is locked, so these flags won't change after we check them.
10586	 */
10587	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
10588		btrfs_warn(fs_info, "swapfile must not be compressed");
10589		return -EINVAL;
10590	}
10591	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
10592		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
10593		return -EINVAL;
10594	}
10595	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
10596		btrfs_warn(fs_info, "swapfile must not be checksummed");
10597		return -EINVAL;
10598	}
10599
10600	/*
10601	 * Balance or device remove/replace/resize can move stuff around from
10602	 * under us. The exclop protection makes sure they aren't running/won't
10603	 * run concurrently while we are mapping the swap extents, and
10604	 * fs_info->swapfile_pins prevents them from running while the swap
10605	 * file is active and moving the extents. Note that this also prevents
10606	 * a concurrent device add which isn't actually necessary, but it's not
10607	 * really worth the trouble to allow it.
10608	 */
10609	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
10610		btrfs_warn(fs_info,
10611	   "cannot activate swapfile while exclusive operation is running");
10612		return -EBUSY;
10613	}
10614
10615	/*
10616	 * Prevent snapshot creation while we are activating the swap file.
10617	 * We do not want to race with snapshot creation. If snapshot creation
10618	 * already started before we bumped nr_swapfiles from 0 to 1 and
10619	 * completes before the first write into the swap file after it is
10620	 * activated, than that write would fallback to COW.
10621	 */
10622	if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
10623		btrfs_exclop_finish(fs_info);
10624		btrfs_warn(fs_info,
10625	   "cannot activate swapfile because snapshot creation is in progress");
10626		return -EINVAL;
10627	}
10628	/*
10629	 * Snapshots can create extents which require COW even if NODATACOW is
10630	 * set. We use this counter to prevent snapshots. We must increment it
10631	 * before walking the extents because we don't want a concurrent
10632	 * snapshot to run after we've already checked the extents.
10633	 *
10634	 * It is possible that subvolume is marked for deletion but still not
10635	 * removed yet. To prevent this race, we check the root status before
10636	 * activating the swapfile.
10637	 */
10638	spin_lock(&root->root_item_lock);
10639	if (btrfs_root_dead(root)) {
10640		spin_unlock(&root->root_item_lock);
10641
10642		btrfs_exclop_finish(fs_info);
10643		btrfs_warn(fs_info,
10644		"cannot activate swapfile because subvolume %llu is being deleted",
10645			root->root_key.objectid);
10646		return -EPERM;
10647	}
10648	atomic_inc(&root->nr_swapfiles);
10649	spin_unlock(&root->root_item_lock);
10650
10651	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10652
10653	lock_extent(io_tree, 0, isize - 1, &cached_state);
10654	start = 0;
10655	while (start < isize) {
10656		u64 logical_block_start, physical_block_start;
10657		struct btrfs_block_group *bg;
10658		u64 len = isize - start;
10659
10660		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
10661		if (IS_ERR(em)) {
10662			ret = PTR_ERR(em);
10663			goto out;
10664		}
10665
10666		if (em->block_start == EXTENT_MAP_HOLE) {
10667			btrfs_warn(fs_info, "swapfile must not have holes");
10668			ret = -EINVAL;
10669			goto out;
10670		}
10671		if (em->block_start == EXTENT_MAP_INLINE) {
10672			/*
10673			 * It's unlikely we'll ever actually find ourselves
10674			 * here, as a file small enough to fit inline won't be
10675			 * big enough to store more than the swap header, but in
10676			 * case something changes in the future, let's catch it
10677			 * here rather than later.
10678			 */
10679			btrfs_warn(fs_info, "swapfile must not be inline");
10680			ret = -EINVAL;
10681			goto out;
10682		}
10683		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10684			btrfs_warn(fs_info, "swapfile must not be compressed");
10685			ret = -EINVAL;
10686			goto out;
10687		}
10688
10689		logical_block_start = em->block_start + (start - em->start);
10690		len = min(len, em->len - (start - em->start));
10691		free_extent_map(em);
10692		em = NULL;
10693
10694		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
10695		if (ret < 0) {
10696			goto out;
10697		} else if (ret) {
10698			ret = 0;
10699		} else {
10700			btrfs_warn(fs_info,
10701				   "swapfile must not be copy-on-write");
10702			ret = -EINVAL;
10703			goto out;
10704		}
10705
10706		em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10707		if (IS_ERR(em)) {
10708			ret = PTR_ERR(em);
10709			goto out;
10710		}
10711
10712		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10713			btrfs_warn(fs_info,
10714				   "swapfile must have single data profile");
10715			ret = -EINVAL;
10716			goto out;
10717		}
10718
10719		if (device == NULL) {
10720			device = em->map_lookup->stripes[0].dev;
10721			ret = btrfs_add_swapfile_pin(inode, device, false);
10722			if (ret == 1)
10723				ret = 0;
10724			else if (ret)
10725				goto out;
10726		} else if (device != em->map_lookup->stripes[0].dev) {
10727			btrfs_warn(fs_info, "swapfile must be on one device");
10728			ret = -EINVAL;
10729			goto out;
10730		}
10731
10732		physical_block_start = (em->map_lookup->stripes[0].physical +
10733					(logical_block_start - em->start));
10734		len = min(len, em->len - (logical_block_start - em->start));
10735		free_extent_map(em);
10736		em = NULL;
10737
10738		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10739		if (!bg) {
10740			btrfs_warn(fs_info,
10741			   "could not find block group containing swapfile");
10742			ret = -EINVAL;
10743			goto out;
10744		}
10745
10746		if (!btrfs_inc_block_group_swap_extents(bg)) {
10747			btrfs_warn(fs_info,
10748			   "block group for swapfile at %llu is read-only%s",
10749			   bg->start,
10750			   atomic_read(&fs_info->scrubs_running) ?
10751				       " (scrub running)" : "");
10752			btrfs_put_block_group(bg);
10753			ret = -EINVAL;
10754			goto out;
10755		}
10756
10757		ret = btrfs_add_swapfile_pin(inode, bg, true);
10758		if (ret) {
10759			btrfs_put_block_group(bg);
10760			if (ret == 1)
10761				ret = 0;
10762			else
10763				goto out;
10764		}
10765
10766		if (bsi.block_len &&
10767		    bsi.block_start + bsi.block_len == physical_block_start) {
10768			bsi.block_len += len;
10769		} else {
10770			if (bsi.block_len) {
10771				ret = btrfs_add_swap_extent(sis, &bsi);
10772				if (ret)
10773					goto out;
10774			}
10775			bsi.start = start;
10776			bsi.block_start = physical_block_start;
10777			bsi.block_len = len;
10778		}
10779
10780		start += len;
10781	}
10782
10783	if (bsi.block_len)
10784		ret = btrfs_add_swap_extent(sis, &bsi);
10785
10786out:
10787	if (!IS_ERR_OR_NULL(em))
10788		free_extent_map(em);
10789
10790	unlock_extent(io_tree, 0, isize - 1, &cached_state);
10791
10792	if (ret)
10793		btrfs_swap_deactivate(file);
10794
10795	btrfs_drew_write_unlock(&root->snapshot_lock);
10796
10797	btrfs_exclop_finish(fs_info);
10798
10799	if (ret)
10800		return ret;
10801
10802	if (device)
10803		sis->bdev = device->bdev;
10804	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10805	sis->max = bsi.nr_pages;
10806	sis->pages = bsi.nr_pages - 1;
10807	sis->highest_bit = bsi.nr_pages - 1;
10808	return bsi.nr_extents;
10809}
10810#else
/* Stub used when CONFIG_SWAP is not set; intentionally does nothing. */
static void btrfs_swap_deactivate(struct file *file)
{
}
10814
/*
 * Stub used when CONFIG_SWAP is not set: swapfile activation always fails
 * with -EOPNOTSUPP and none of the arguments are touched.
 */
static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
			       sector_t *span)
{
	return -EOPNOTSUPP;
}
10820#endif
10821
10822/*
10823 * Update the number of bytes used in the VFS' inode. When we replace extents in
10824 * a range (clone, dedupe, fallocate's zero range), we must update the number of
10825 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10826 * always get a correct value.
10827 */
10828void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10829			      const u64 add_bytes,
10830			      const u64 del_bytes)
10831{
10832	if (add_bytes == del_bytes)
10833		return;
10834
10835	spin_lock(&inode->lock);
10836	if (del_bytes > 0)
10837		inode_sub_bytes(&inode->vfs_inode, del_bytes);
10838	if (add_bytes > 0)
10839		inode_add_bytes(&inode->vfs_inode, add_bytes);
10840	spin_unlock(&inode->lock);
10841}
10842
10843/*
10844 * Verify that there are no ordered extents for a given file range.
10845 *
10846 * @inode:   The target inode.
10847 * @start:   Start offset of the file range, should be sector size aligned.
10848 * @end:     End offset (inclusive) of the file range, its value +1 should be
10849 *           sector size aligned.
10850 *
10851 * This should typically be used for cases where we locked an inode's VFS lock in
10852 * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode,
10853 * we have flushed all delalloc in the range, we have waited for all ordered
10854 * extents in the range to complete and finally we have locked the file range in
10855 * the inode's io_tree.
10856 */
10857void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
10858{
10859	struct btrfs_root *root = inode->root;
10860	struct btrfs_ordered_extent *ordered;
10861
10862	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
10863		return;
10864
10865	ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
10866	if (ordered) {
10867		btrfs_err(root->fs_info,
10868"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
10869			  start, end, btrfs_ino(inode), root->root_key.objectid,
10870			  ordered->file_offset,
10871			  ordered->file_offset + ordered->num_bytes - 1);
10872		btrfs_put_ordered_extent(ordered);
10873	}
10874
10875	ASSERT(ordered == NULL);
10876}
10877
/*
 * Inode operations for directories: name lookup, creation and removal of
 * entries, rename, plus the attribute/xattr/ACL handlers.
 * (Where this table is attached to an inode is outside this part of the file.)
 */
static const struct inode_operations btrfs_dir_inode_operations = {
	.getattr	= btrfs_getattr,
	.lookup		= btrfs_lookup,
	.create		= btrfs_create,
	.unlink		= btrfs_unlink,
	.link		= btrfs_link,
	.mkdir		= btrfs_mkdir,
	.rmdir		= btrfs_rmdir,
	.rename		= btrfs_rename2,
	.symlink	= btrfs_symlink,
	.setattr	= btrfs_setattr,
	.mknod		= btrfs_mknod,
	.listxattr	= btrfs_listxattr,
	.permission	= btrfs_permission,
	.get_inode_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
	.tmpfile        = btrfs_tmpfile,
	.fileattr_get	= btrfs_fileattr_get,
	.fileattr_set	= btrfs_fileattr_set,
};
10899
/*
 * File operations for open directories. read() is routed to the generic
 * generic_read_dir() helper; directory iteration goes through
 * btrfs_real_readdir().
 */
static const struct file_operations btrfs_dir_file_operations = {
	.llseek		= btrfs_dir_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= btrfs_real_readdir,
	.open		= btrfs_opendir,
	.unlocked_ioctl	= btrfs_ioctl,
	/* The compat ioctl entry point exists only on CONFIG_COMPAT builds. */
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.release        = btrfs_release_file,
	.fsync		= btrfs_sync_file,
};
10912
/*
 * Address space operations for btrfs file data.
 *
 * No .bmap operation is provided. Swapfile code that relies on bmap assumes
 * the extents it maps won't change over the life of the file and uses the
 * bmap result to do IO directly to the drive. A btrfs bmap call would return
 * logical addresses that aren't suitable for IO, and they would also change
 * frequently as COW operations happen, so swapfile-via-bmap on btrfs would
 * mean corruption.
 *
 * Swapfiles are instead handled through .swap_activate/.swap_deactivate:
 * with CONFIG_SWAP, btrfs_swap_activate() validates the file, translates its
 * extents to physical addresses and pins the device and block groups; without
 * CONFIG_SWAP it returns -EOPNOTSUPP.
 */
static const struct address_space_operations btrfs_aops = {
	.read_folio	= btrfs_read_folio,
	.writepages	= btrfs_writepages,
	.readahead	= btrfs_readahead,
	.invalidate_folio = btrfs_invalidate_folio,
	.release_folio	= btrfs_release_folio,
	.migrate_folio	= btrfs_migrate_folio,
	.dirty_folio	= filemap_dirty_folio,
	.error_remove_page = generic_error_remove_page,
	.swap_activate	= btrfs_swap_activate,
	.swap_deactivate = btrfs_swap_deactivate,
};
10937
/*
 * Inode operations for regular files: attribute, xattr, ACL and fileattr
 * handlers plus fiemap. No namespace operations (lookup, create, ...) are set.
 */
static const struct inode_operations btrfs_file_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.listxattr      = btrfs_listxattr,
	.permission	= btrfs_permission,
	.fiemap		= btrfs_fiemap,
	.get_inode_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
	.fileattr_get	= btrfs_fileattr_get,
	.fileattr_set	= btrfs_fileattr_set,
};
/*
 * Inode operations for "special" inodes (presumably device nodes, FIFOs and
 * sockets -- the users of this table are outside this part of the file).
 * Only attribute, permission, xattr-listing, ACL and timestamp handlers are
 * set.
 */
static const struct inode_operations btrfs_special_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.get_inode_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
};
/*
 * Inode operations for symbolic links. Link resolution uses the generic
 * page_get_link() helper; no ACL handlers are set in this table.
 */
static const struct inode_operations btrfs_symlink_inode_operations = {
	.get_link	= page_get_link,
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.update_time	= btrfs_update_time,
};
10967
/*
 * Dentry operations exported to the rest of btrfs (non-static). Only
 * ->d_delete is overridden, with btrfs_dentry_delete().
 */
const struct dentry_operations btrfs_dentry_operations = {
	.d_delete	= btrfs_dentry_delete,
};
Configure Feed

Configure Feed