fs/btrfs/extent-tree.c at v5.0-rc4

tjh.dev / kernel
fork atom
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork atom
kernel / fs / btrfs / extent-tree.c
at v5.0-rc4 11404 lines 320 kB view raw
wrap content
    1// SPDX-License-Identifier: GPL-2.0
    2/*
    3 * Copyright (C) 2007 Oracle.  All rights reserved.
    4 */
    5
    6#include <linux/sched.h>
    7#include <linux/sched/signal.h>
    8#include <linux/pagemap.h>
    9#include <linux/writeback.h>
   10#include <linux/blkdev.h>
   11#include <linux/sort.h>
   12#include <linux/rcupdate.h>
   13#include <linux/kthread.h>
   14#include <linux/slab.h>
   15#include <linux/ratelimit.h>
   16#include <linux/percpu_counter.h>
   17#include <linux/lockdep.h>
   18#include <linux/crc32c.h>
   19#include "tree-log.h"
   20#include "disk-io.h"
   21#include "print-tree.h"
   22#include "volumes.h"
   23#include "raid56.h"
   24#include "locking.h"
   25#include "free-space-cache.h"
   26#include "free-space-tree.h"
   27#include "math.h"
   28#include "sysfs.h"
   29#include "qgroup.h"
   30#include "ref-verify.h"
   31
   32#undef SCRAMBLE_DELAYED_REFS
   33
   34/*
   35 * control flags for do_chunk_alloc's force field
   36 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
   37 * if we really need one.
   38 *
   39 * CHUNK_ALLOC_LIMITED means to only try and allocate one
   40 * if we have very few chunks already allocated.  This is
   41 * used as part of the clustering code to help make sure
   42 * we have a good pool of storage to cluster in, without
   43 * filling the FS with empty chunks
   44 *
   45 * CHUNK_ALLOC_FORCE means it must try to allocate one
   46 *
   47 */
   48enum {
   49	CHUNK_ALLOC_NO_FORCE = 0,
   50	CHUNK_ALLOC_LIMITED = 1,
   51	CHUNK_ALLOC_FORCE = 2,
   52};
   53
   54/*
   55 * Declare a helper function to detect underflow of various space info members
   56 */
   57#define DECLARE_SPACE_INFO_UPDATE(name)					\
   58static inline void update_##name(struct btrfs_space_info *sinfo,	\
   59				 s64 bytes)				\
   60{									\
   61	if (bytes < 0 && sinfo->name < -bytes) {			\
   62		WARN_ON(1);						\
   63		sinfo->name = 0;					\
   64		return;							\
   65	}								\
   66	sinfo->name += bytes;						\
   67}
   68
   69DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
   70DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
   71
/*
 * Forward declarations of file-local (static) helpers that are referenced
 * before their definitions appear.
 */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
/* 'force' takes one of the CHUNK_ALLOC_* values declared above. */
static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);
static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);
  102
  103static noinline int
  104block_group_cache_done(struct btrfs_block_group_cache *cache)
  105{
  106	smp_mb();
  107	return cache->cached == BTRFS_CACHE_FINISHED ||
  108		cache->cached == BTRFS_CACHE_ERROR;
  109}
  110
  111static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
  112{
  113	return (cache->flags & bits) == bits;
  114}
  115
  116void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
  117{
  118	atomic_inc(&cache->count);
  119}
  120
  121void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
  122{
  123	if (atomic_dec_and_test(&cache->count)) {
  124		WARN_ON(cache->pinned > 0);
  125		WARN_ON(cache->reserved > 0);
  126
  127		/*
  128		 * If not empty, someone is still holding mutex of
  129		 * full_stripe_lock, which can only be released by caller.
  130		 * And it will definitely cause use-after-free when caller
  131		 * tries to release full stripe lock.
  132		 *
  133		 * No better way to resolve, but only to warn.
  134		 */
  135		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
  136		kfree(cache->free_space_ctl);
  137		kfree(cache);
  138	}
  139}
  140
  141/*
  142 * this adds the block group to the fs_info rb tree for the block group
  143 * cache
  144 */
  145static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
  146				struct btrfs_block_group_cache *block_group)
  147{
  148	struct rb_node **p;
  149	struct rb_node *parent = NULL;
  150	struct btrfs_block_group_cache *cache;
  151
  152	spin_lock(&info->block_group_cache_lock);
  153	p = &info->block_group_cache_tree.rb_node;
  154
  155	while (*p) {
  156		parent = *p;
  157		cache = rb_entry(parent, struct btrfs_block_group_cache,
  158				 cache_node);
  159		if (block_group->key.objectid < cache->key.objectid) {
  160			p = &(*p)->rb_left;
  161		} else if (block_group->key.objectid > cache->key.objectid) {
  162			p = &(*p)->rb_right;
  163		} else {
  164			spin_unlock(&info->block_group_cache_lock);
  165			return -EEXIST;
  166		}
  167	}
  168
  169	rb_link_node(&block_group->cache_node, parent, p);
  170	rb_insert_color(&block_group->cache_node,
  171			&info->block_group_cache_tree);
  172
  173	if (info->first_logical_byte > block_group->key.objectid)
  174		info->first_logical_byte = block_group->key.objectid;
  175
  176	spin_unlock(&info->block_group_cache_lock);
  177
  178	return 0;
  179}
  180
  181/*
  182 * This will return the block group at or after bytenr if contains is 0, else
  183 * it will return the block group that contains the bytenr
  184 */
  185static struct btrfs_block_group_cache *
  186block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
  187			      int contains)
  188{
  189	struct btrfs_block_group_cache *cache, *ret = NULL;
  190	struct rb_node *n;
  191	u64 end, start;
  192
  193	spin_lock(&info->block_group_cache_lock);
  194	n = info->block_group_cache_tree.rb_node;
  195
  196	while (n) {
  197		cache = rb_entry(n, struct btrfs_block_group_cache,
  198				 cache_node);
  199		end = cache->key.objectid + cache->key.offset - 1;
  200		start = cache->key.objectid;
  201
  202		if (bytenr < start) {
  203			if (!contains && (!ret || start < ret->key.objectid))
  204				ret = cache;
  205			n = n->rb_left;
  206		} else if (bytenr > start) {
  207			if (contains && bytenr <= end) {
  208				ret = cache;
  209				break;
  210			}
  211			n = n->rb_right;
  212		} else {
  213			ret = cache;
  214			break;
  215		}
  216	}
  217	if (ret) {
  218		btrfs_get_block_group(ret);
  219		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
  220			info->first_logical_byte = ret->key.objectid;
  221	}
  222	spin_unlock(&info->block_group_cache_lock);
  223
  224	return ret;
  225}
  226
  227static int add_excluded_extent(struct btrfs_fs_info *fs_info,
  228			       u64 start, u64 num_bytes)
  229{
  230	u64 end = start + num_bytes - 1;
  231	set_extent_bits(&fs_info->freed_extents[0],
  232			start, end, EXTENT_UPTODATE);
  233	set_extent_bits(&fs_info->freed_extents[1],
  234			start, end, EXTENT_UPTODATE);
  235	return 0;
  236}
  237
  238static void free_excluded_extents(struct btrfs_block_group_cache *cache)
  239{
  240	struct btrfs_fs_info *fs_info = cache->fs_info;
  241	u64 start, end;
  242
  243	start = cache->key.objectid;
  244	end = start + cache->key.offset - 1;
  245
  246	clear_extent_bits(&fs_info->freed_extents[0],
  247			  start, end, EXTENT_UPTODATE);
  248	clear_extent_bits(&fs_info->freed_extents[1],
  249			  start, end, EXTENT_UPTODATE);
  250}
  251
  252static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
  253{
  254	struct btrfs_fs_info *fs_info = cache->fs_info;
  255	u64 bytenr;
  256	u64 *logical;
  257	int stripe_len;
  258	int i, nr, ret;
  259
  260	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
  261		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
  262		cache->bytes_super += stripe_len;
  263		ret = add_excluded_extent(fs_info, cache->key.objectid,
  264					  stripe_len);
  265		if (ret)
  266			return ret;
  267	}
  268
  269	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
  270		bytenr = btrfs_sb_offset(i);
  271		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
  272				       bytenr, &logical, &nr, &stripe_len);
  273		if (ret)
  274			return ret;
  275
  276		while (nr--) {
  277			u64 start, len;
  278
  279			if (logical[nr] > cache->key.objectid +
  280			    cache->key.offset)
  281				continue;
  282
  283			if (logical[nr] + stripe_len <= cache->key.objectid)
  284				continue;
  285
  286			start = logical[nr];
  287			if (start < cache->key.objectid) {
  288				start = cache->key.objectid;
  289				len = (logical[nr] + stripe_len) - start;
  290			} else {
  291				len = min_t(u64, stripe_len,
  292					    cache->key.objectid +
  293					    cache->key.offset - start);
  294			}
  295
  296			cache->bytes_super += len;
  297			ret = add_excluded_extent(fs_info, start, len);
  298			if (ret) {
  299				kfree(logical);
  300				return ret;
  301			}
  302		}
  303
  304		kfree(logical);
  305	}
  306	return 0;
  307}
  308
  309static struct btrfs_caching_control *
  310get_caching_control(struct btrfs_block_group_cache *cache)
  311{
  312	struct btrfs_caching_control *ctl;
  313
  314	spin_lock(&cache->lock);
  315	if (!cache->caching_ctl) {
  316		spin_unlock(&cache->lock);
  317		return NULL;
  318	}
  319
  320	ctl = cache->caching_ctl;
  321	refcount_inc(&ctl->count);
  322	spin_unlock(&cache->lock);
  323	return ctl;
  324}
  325
  326static void put_caching_control(struct btrfs_caching_control *ctl)
  327{
  328	if (refcount_dec_and_test(&ctl->count))
  329		kfree(ctl);
  330}
  331
  332#ifdef CONFIG_BTRFS_DEBUG
  333static void fragment_free_space(struct btrfs_block_group_cache *block_group)
  334{
  335	struct btrfs_fs_info *fs_info = block_group->fs_info;
  336	u64 start = block_group->key.objectid;
  337	u64 len = block_group->key.offset;
  338	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
  339		fs_info->nodesize : fs_info->sectorsize;
  340	u64 step = chunk << 1;
  341
  342	while (len > chunk) {
  343		btrfs_remove_free_space(block_group, start, chunk);
  344		start += step;
  345		if (len < step)
  346			len = 0;
  347		else
  348			len -= step;
  349	}
  350}
  351#endif
  352
  353/*
  354 * this is only called by cache_block_group, since we could have freed extents
  355 * we need to check the pinned_extents for any extents that can't be used yet
  356 * since their free space will be released as soon as the transaction commits.
  357 */
  358u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
  359		       u64 start, u64 end)
  360{
  361	struct btrfs_fs_info *info = block_group->fs_info;
  362	u64 extent_start, extent_end, size, total_added = 0;
  363	int ret;
  364
  365	while (start < end) {
  366		ret = find_first_extent_bit(info->pinned_extents, start,
  367					    &extent_start, &extent_end,
  368					    EXTENT_DIRTY | EXTENT_UPTODATE,
  369					    NULL);
  370		if (ret)
  371			break;
  372
  373		if (extent_start <= start) {
  374			start = extent_end + 1;
  375		} else if (extent_start > start && extent_start < end) {
  376			size = extent_start - start;
  377			total_added += size;
  378			ret = btrfs_add_free_space(block_group, start,
  379						   size);
  380			BUG_ON(ret); /* -ENOMEM or logic error */
  381			start = extent_end + 1;
  382		} else {
  383			break;
  384		}
  385	}
  386
  387	if (start < end) {
  388		size = end - start;
  389		total_added += size;
  390		ret = btrfs_add_free_space(block_group, start, size);
  391		BUG_ON(ret); /* -ENOMEM or logic error */
  392	}
  393
  394	return total_added;
  395}
  396
/*
 * Build the free space information of caching_ctl->block_group by walking
 * the extent items in the commit root of the extent tree and handing every
 * gap between two consecutive extents to add_new_free_space().
 *
 * The relock sequence in the loop drops and retakes caching_ctl->mutex and
 * fs_info->commit_root_sem (read side), so the caller is expected to hold
 * both on entry.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the
 * negative error from btrfs_search_slot() / btrfs_next_leaf().
 */
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* 'last' is the end of the range already accounted for. */
	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since its read-only
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		/*
		 * Setting last to (u64)-1 makes the final
		 * add_new_free_space() call below a no-op, because its
		 * start is then never below its end.
		 */
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			/* Ran off the current leaf: fetch the next key. */
			ret = find_next_key(path, 0, &key);
			if (ret)
				break;

			/*
			 * Somebody wants the CPU or commit_root_sem: publish
			 * the progress, drop the path and both locks, yield,
			 * retake the locks and restart the search from 'key'.
			 */
			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				if (wakeup)
					caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		/*
		 * The item starts before the range already accounted for:
		 * search again starting from 'last'.
		 */
		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;

			if (wakeup)
				caching_ctl->progress = last;
			btrfs_release_path(path);
			goto next;
		}

		/* Skip items located in front of this block group. */
		if (key.objectid < block_group->key.objectid) {
			path->slots[0]++;
			continue;
		}

		/* Past the end of this block group: done walking. */
		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			/* The gap between 'last' and this extent is free. */
			total_found += add_new_free_space(block_group, last,
							  key.objectid);
			/*
			 * For metadata items nodesize is used as the extent
			 * length; for extent items it is key.offset.
			 */
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->nodesize;
			else
				last = key.objectid + key.offset;

			/* Wake waiters once enough new space was found. */
			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup)
					wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	/* Account for the tail between the last extent and the group end. */
	total_found += add_new_free_space(block_group, last,
					  block_group->key.objectid +
					  block_group->key.offset);
	caching_ctl->progress = (u64)-1;

out:
	btrfs_free_path(path);
	return ret;
}
  533
  534static noinline void caching_thread(struct btrfs_work *work)
  535{
  536	struct btrfs_block_group_cache *block_group;
  537	struct btrfs_fs_info *fs_info;
  538	struct btrfs_caching_control *caching_ctl;
  539	int ret;
  540
  541	caching_ctl = container_of(work, struct btrfs_caching_control, work);
  542	block_group = caching_ctl->block_group;
  543	fs_info = block_group->fs_info;
  544
  545	mutex_lock(&caching_ctl->mutex);
  546	down_read(&fs_info->commit_root_sem);
  547
  548	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
  549		ret = load_free_space_tree(caching_ctl);
  550	else
  551		ret = load_extent_tree_free(caching_ctl);
  552
  553	spin_lock(&block_group->lock);
  554	block_group->caching_ctl = NULL;
  555	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
  556	spin_unlock(&block_group->lock);
  557
  558#ifdef CONFIG_BTRFS_DEBUG
  559	if (btrfs_should_fragment_free_space(block_group)) {
  560		u64 bytes_used;
  561
  562		spin_lock(&block_group->space_info->lock);
  563		spin_lock(&block_group->lock);
  564		bytes_used = block_group->key.offset -
  565			btrfs_block_group_used(&block_group->item);
  566		block_group->space_info->bytes_used += bytes_used >> 1;
  567		spin_unlock(&block_group->lock);
  568		spin_unlock(&block_group->space_info->lock);
  569		fragment_free_space(block_group);
  570	}
  571#endif
  572
  573	caching_ctl->progress = (u64)-1;
  574
  575	up_read(&fs_info->commit_root_sem);
  576	free_excluded_extents(block_group);
  577	mutex_unlock(&caching_ctl->mutex);
  578
  579	wake_up(&caching_ctl->wait);
  580
  581	put_caching_control(caching_ctl);
  582	btrfs_put_block_group(block_group);
  583}
  584
/*
 * Start caching the free space of @cache.
 *
 * A caching control is allocated and attached to the block group unless
 * caching has already been started or finished.  With the SPACE_CACHE mount
 * option the on-disk free space cache is tried first via
 * load_free_space_cache(); if that returns 1 the block group is marked
 * BTRFS_CACHE_FINISHED right away.  Otherwise, unless @load_cache_only is
 * set, the caching control is put on fs_info->caching_block_groups and
 * caching_thread() is queued on the caching workers.
 *
 * With @load_cache_only set, a block group whose cache could not be loaded
 * is put back to BTRFS_CACHE_NO and no work is queued.
 *
 * Returns -ENOMEM if the caching control cannot be allocated, 0 when there
 * is nothing (more) to do, and otherwise 'ret', which is 0 or whatever
 * load_free_space_cache() returned.
 */
static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	/* Allocate up front, before cache->lock is taken below. */
	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	refcount_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
			caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but this could happen I think in the
	 * case where one thread starts to load the space cache info, and then
	 * some other thread starts a transaction commit which tries to do an
	 * allocation while the other thread is still loading the space cache
	 * info.  The previous loop should have kept us from choosing this block
	 * group, but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load here,
	 * so we can wait for it to finish, otherwise we could end up allocating
	 * from a block group who's cache gets evicted for one reason or
	 * another.
	 */
	while (cache->cached == BTRFS_CACHE_FAST) {
		struct btrfs_caching_control *ctl;

		/* Pin the other loader's control while sleeping on it. */
		ctl = cache->caching_ctl;
		refcount_inc(&ctl->count);
		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&cache->lock);

		schedule();

		finish_wait(&ctl->wait, &wait);
		put_caching_control(ctl);
		spin_lock(&cache->lock);
	}

	/* Caching already started or finished: our control is not needed. */
	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
		mutex_lock(&caching_ctl->mutex);
		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			/* The cache was loaded: caching is complete. */
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
			caching_ctl->progress = (u64)-1;
		} else {
			if (load_cache_only) {
				/* Give up; leave the group uncached. */
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				/* Fall back to the caching worker below. */
				cache->cached = BTRFS_CACHE_STARTED;
				cache->has_caching_ctl = 1;
			}
		}
		spin_unlock(&cache->lock);
#ifdef CONFIG_BTRFS_DEBUG
		if (ret == 1 &&
		    btrfs_should_fragment_free_space(cache)) {
			u64 bytes_used;

			/*
			 * Despite its name, bytes_used holds the bytes of the
			 * group that are not used yet; half of them is
			 * charged to the space info as used.
			 */
			spin_lock(&cache->space_info->lock);
			spin_lock(&cache->lock);
			bytes_used = cache->key.offset -
				btrfs_block_group_used(&cache->item);
			cache->space_info->bytes_used += bytes_used >> 1;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);
			fragment_free_space(cache);
		}
#endif
		mutex_unlock(&caching_ctl->mutex);

		/* Release anybody sleeping in the BTRFS_CACHE_FAST loop. */
		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
			free_excluded_extents(cache);
			return 0;
		}
	} else {
		/*
		 * We're either using the free space tree or no caching at all.
		 * Set cached to the appropriate value and wakeup any waiters.
		 */
		spin_lock(&cache->lock);
		if (load_cache_only) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_NO;
		} else {
			cache->cached = BTRFS_CACHE_STARTED;
			cache->has_caching_ctl = 1;
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
	}

	/* The control was detached from the group above: drop our ref. */
	if (load_cache_only) {
		put_caching_control(caching_ctl);
		return 0;
	}

	/* Extra reference for the caching_block_groups list. */
	down_write(&fs_info->commit_root_sem);
	refcount_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->commit_root_sem);

	/* Block group reference that caching_thread() drops when done. */
	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);

	return ret;
}
  720
  721/*
  722 * return the block group that starts at or after bytenr
  723 */
  724static struct btrfs_block_group_cache *
  725btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
  726{
  727	return block_group_cache_tree_search(info, bytenr, 0);
  728}
  729
  730/*
  731 * return the block group that contains the given bytenr
  732 */
  733struct btrfs_block_group_cache *btrfs_lookup_block_group(
  734						 struct btrfs_fs_info *info,
  735						 u64 bytenr)
  736{
  737	return block_group_cache_tree_search(info, bytenr, 1);
  738}
  739
  740static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
  741						  u64 flags)
  742{
  743	struct list_head *head = &info->space_info;
  744	struct btrfs_space_info *found;
  745
  746	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
  747
  748	rcu_read_lock();
  749	list_for_each_entry_rcu(found, head, list) {
  750		if (found->flags & flags) {
  751			rcu_read_unlock();
  752			return found;
  753		}
  754	}
  755	rcu_read_unlock();
  756	return NULL;
  757}
  758
  759static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
  760			     bool metadata, u64 root_objectid)
  761{
  762	struct btrfs_space_info *space_info;
  763	u64 flags;
  764
  765	if (metadata) {
  766		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
  767			flags = BTRFS_BLOCK_GROUP_SYSTEM;
  768		else
  769			flags = BTRFS_BLOCK_GROUP_METADATA;
  770	} else {
  771		flags = BTRFS_BLOCK_GROUP_DATA;
  772	}
  773
  774	space_info = __find_space_info(fs_info, flags);
  775	ASSERT(space_info);
  776	percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes,
  777		    BTRFS_TOTAL_BYTES_PINNED_BATCH);
  778}
  779
  780/*
  781 * after adding space to the filesystem, we need to clear the full flags
  782 * on all the space infos.
  783 */
  784void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
  785{
  786	struct list_head *head = &info->space_info;
  787	struct btrfs_space_info *found;
  788
  789	rcu_read_lock();
  790	list_for_each_entry_rcu(found, head, list)
  791		found->full = 0;
  792	rcu_read_unlock();
  793}
  794
  795/* simple helper to search for an existing data extent at a given offset */
  796int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
  797{
  798	int ret;
  799	struct btrfs_key key;
  800	struct btrfs_path *path;
  801
  802	path = btrfs_alloc_path();
  803	if (!path)
  804		return -ENOMEM;
  805
  806	key.objectid = start;
  807	key.offset = len;
  808	key.type = BTRFS_EXTENT_ITEM_KEY;
  809	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
  810	btrfs_free_path(path);
  811	return ret;
  812}
  813
/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are not processed.
 *
 * @trans:    transaction handle; when NULL the commit root is searched
 *            without locking and delayed refs are not consulted
 * @bytenr:   start of the extent to look up
 * @offset:   key offset to search for; replaced by nodesize when @metadata
 *            is set but the filesystem has no SKINNY_METADATA
 * @metadata: non-zero to look for a METADATA_ITEM key first
 * @refs:     if not NULL, receives the resulting reference count
 * @flags:    if not NULL, receives the resulting extent flags
 *
 * Returns 0 on success (also when no extent item is found, in which case
 * the on-disk contribution is a count and flags of 0), -ENOMEM if no path
 * can be allocated, -EINVAL for an extent item smaller than
 * struct btrfs_extent_item, or the negative error from btrfs_search_slot().
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Without a transaction, do an unlocked commit root lookup. */
	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	/*
	 * No METADATA_ITEM was found: accept the item in the previous slot
	 * if it is an EXTENT_ITEM for the same bytenr whose offset is
	 * nodesize.
	 */
	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
			/* Item too small for struct btrfs_extent_item. */
			ret = -EINVAL;
			btrfs_print_v0_err(fs_info);
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		/* Nothing on disk; delayed refs may still add references. */
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	/* Fold in the modifications queued in the delayed ref head. */
	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			/*
			 * Keep the head alive with a reference while the
			 * spinlock and the path are dropped.
			 */
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}
  949
  950/*
  951 * Back reference rules.  Back refs have three main goals:
  952 *
  953 * 1) differentiate between all holders of references to an extent so that
  954 *    when a reference is dropped we can make sure it was a valid reference
  955 *    before freeing the extent.
  956 *
  957 * 2) Provide enough information to quickly find the holders of an extent
  958 *    if we notice a given block is corrupted or bad.
  959 *
  960 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
  961 *    maintenance.  This is actually the same as #2, but with a slightly
  962 *    different use case.
  963 *
  964 * There are two kinds of back refs. The implicit back refs is optimized
  965 * for pointers in non-shared tree blocks. For a given pointer in a block,
  966 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
  968 * b-tree searching. The full back refs is for pointers in tree blocks not
  969 * referenced by their owner trees. The location of tree block is recorded
  970 * in the back refs. Actually the full back refs is generic, and can be
  971 * used in all cases the implicit back refs is used. The major shortcoming
  972 * of the full back refs is its overhead. Every time a tree block gets
  973 * COWed, we have to update back refs entry for all pointers in it.
  974 *
  975 * For a newly allocated tree block, we use implicit back refs for
  976 * pointers in it. This means most tree related operations only involve
  977 * implicit back refs. For a tree block created in old transaction, the
  978 * only way to drop a reference to it is COW it. So we can detect the
  979 * event that tree block loses its owner tree's reference and do the
  980 * back refs conversion.
  981 *
  982 * When a tree block is COWed through a tree, there are four cases:
  983 *
  984 * The reference count of the block is one and the tree is the block's
  985 * owner tree. Nothing to do in this case.
  986 *
  987 * The reference count of the block is one and the tree is not the
  988 * block's owner tree. In this case, full back refs is used for pointers
  989 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
  991 *
  992 * The reference count of the block is greater than one and the tree is
  993 * the block's owner tree. In this case, implicit back refs is used for
  994 * pointers in the block. Add full back refs for every pointer in the
  995 * block, increase lower level extents' reference counts. The original
  996 * implicit back refs are entailed to the new block.
  997 *
  998 * The reference count of the block is greater than one and the tree is
  999 * not the block's owner tree. Add implicit back refs for every pointer in
 1000 * the new block, increase lower level extents' reference count.
 1001 *
 1002 * Back Reference Key composing:
 1003 *
 1004 * The key objectid corresponds to the first byte in the extent,
 1005 * The key type is used to differentiate between types of back refs.
 1006 * There are different meanings of the key offset for different types
 1007 * of back refs.
 1008 *
 1009 * File extents can be referenced by:
 1010 *
 1011 * - multiple snapshots, subvolumes, or different generations in one subvol
 1012 * - different files inside a single subvolume
 1013 * - different offsets inside a file (bookend extents in file.c)
 1014 *
 1015 * The extent ref structure for the implicit back refs has fields for:
 1016 *
 1017 * - Objectid of the subvolume root
 1018 * - objectid of the file holding the reference
 1019 * - original offset in the file
 1020 * - how many bookend extents
 1021 *
 1022 * The key offset for the implicit back refs is hash of the first
 1023 * three fields.
 1024 *
 1025 * The extent ref structure for the full back refs has field for:
 1026 *
 1027 * - number of pointers in the tree leaf
 1028 *
 1029 * The key offset for the full back refs is the first byte of
 1030 * the tree leaf
 1031 *
 1032 * When a file extent is allocated, the implicit back refs is used.
 1033 * The fields are filled in:
 1034 *
 1035 *     (root_key.objectid, inode objectid, offset in file, 1)
 1036 *
 1037 * When a file extent is removed by file truncation, we find the
 1038 * corresponding implicit back refs and check the following fields:
 1039 *
 1040 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 1041 *
 1042 * Btree extents can be referenced by:
 1043 *
 1044 * - Different subvolumes
 1045 *
 1046 * Both the implicit back refs and the full back refs for tree blocks
 1047 * only consist of key. The key offset for the implicit back refs is
 1048 * objectid of block's owner tree. The key offset for the full back refs
 1049 * is the first byte of parent block.
 1050 *
 1051 * When implicit back refs is used, information about the lowest key and
 1052 * level of the tree block are required. These information are stored in
 1053 * tree block info structure.
 1054 */
 1055
 1056/*
 1057 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 1058 * is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
 1059 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 1060 */
 1061int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 1062				     struct btrfs_extent_inline_ref *iref,
 1063				     enum btrfs_inline_ref_type is_data)
 1064{
 1065	int type = btrfs_extent_inline_ref_type(eb, iref);
 1066	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
 1067
 1068	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
 1069	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
 1070	    type == BTRFS_SHARED_DATA_REF_KEY ||
 1071	    type == BTRFS_EXTENT_DATA_REF_KEY) {
 1072		if (is_data == BTRFS_REF_TYPE_BLOCK) {
 1073			if (type == BTRFS_TREE_BLOCK_REF_KEY)
 1074				return type;
 1075			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
 1076				ASSERT(eb->fs_info);
 1077				/*
 1078				 * Every shared one has parent tree
 1079				 * block, which must be aligned to
 1080				 * nodesize.
 1081				 */
 1082				if (offset &&
 1083				    IS_ALIGNED(offset, eb->fs_info->nodesize))
 1084					return type;
 1085			}
 1086		} else if (is_data == BTRFS_REF_TYPE_DATA) {
 1087			if (type == BTRFS_EXTENT_DATA_REF_KEY)
 1088				return type;
 1089			if (type == BTRFS_SHARED_DATA_REF_KEY) {
 1090				ASSERT(eb->fs_info);
 1091				/*
 1092				 * Every shared one has parent tree
 1093				 * block, which must be aligned to
 1094				 * nodesize.
 1095				 */
 1096				if (offset &&
 1097				    IS_ALIGNED(offset, eb->fs_info->nodesize))
 1098					return type;
 1099			}
 1100		} else {
 1101			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
 1102			return type;
 1103		}
 1104	}
 1105
 1106	btrfs_print_leaf((struct extent_buffer *)eb);
 1107	btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
 1108		  eb->start, type);
 1109	WARN_ON(1);
 1110
 1111	return BTRFS_REF_TYPE_INVALID;
 1112}
 1113
 1114static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 1115{
 1116	u32 high_crc = ~(u32)0;
 1117	u32 low_crc = ~(u32)0;
 1118	__le64 lenum;
 1119
 1120	lenum = cpu_to_le64(root_objectid);
 1121	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
 1122	lenum = cpu_to_le64(owner);
 1123	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 1124	lenum = cpu_to_le64(offset);
 1125	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 1126
 1127	return ((u64)high_crc << 31) ^ (u64)low_crc;
 1128}
 1129
 1130static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
 1131				     struct btrfs_extent_data_ref *ref)
 1132{
 1133	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
 1134				    btrfs_extent_data_ref_objectid(leaf, ref),
 1135				    btrfs_extent_data_ref_offset(leaf, ref));
 1136}
 1137
 1138static int match_extent_data_ref(struct extent_buffer *leaf,
 1139				 struct btrfs_extent_data_ref *ref,
 1140				 u64 root_objectid, u64 owner, u64 offset)
 1141{
 1142	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
 1143	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
 1144	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
 1145		return 0;
 1146	return 1;
 1147}
 1148
 1149static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
 1150					   struct btrfs_path *path,
 1151					   u64 bytenr, u64 parent,
 1152					   u64 root_objectid,
 1153					   u64 owner, u64 offset)
 1154{
 1155	struct btrfs_root *root = trans->fs_info->extent_root;
 1156	struct btrfs_key key;
 1157	struct btrfs_extent_data_ref *ref;
 1158	struct extent_buffer *leaf;
 1159	u32 nritems;
 1160	int ret;
 1161	int recow;
 1162	int err = -ENOENT;
 1163
 1164	key.objectid = bytenr;
 1165	if (parent) {
 1166		key.type = BTRFS_SHARED_DATA_REF_KEY;
 1167		key.offset = parent;
 1168	} else {
 1169		key.type = BTRFS_EXTENT_DATA_REF_KEY;
 1170		key.offset = hash_extent_data_ref(root_objectid,
 1171						  owner, offset);
 1172	}
 1173again:
 1174	recow = 0;
 1175	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 1176	if (ret < 0) {
 1177		err = ret;
 1178		goto fail;
 1179	}
 1180
 1181	if (parent) {
 1182		if (!ret)
 1183			return 0;
 1184		goto fail;
 1185	}
 1186
 1187	leaf = path->nodes[0];
 1188	nritems = btrfs_header_nritems(leaf);
 1189	while (1) {
 1190		if (path->slots[0] >= nritems) {
 1191			ret = btrfs_next_leaf(root, path);
 1192			if (ret < 0)
 1193				err = ret;
 1194			if (ret)
 1195				goto fail;
 1196
 1197			leaf = path->nodes[0];
 1198			nritems = btrfs_header_nritems(leaf);
 1199			recow = 1;
 1200		}
 1201
 1202		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 1203		if (key.objectid != bytenr ||
 1204		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
 1205			goto fail;
 1206
 1207		ref = btrfs_item_ptr(leaf, path->slots[0],
 1208				     struct btrfs_extent_data_ref);
 1209
 1210		if (match_extent_data_ref(leaf, ref, root_objectid,
 1211					  owner, offset)) {
 1212			if (recow) {
 1213				btrfs_release_path(path);
 1214				goto again;
 1215			}
 1216			err = 0;
 1217			break;
 1218		}
 1219		path->slots[0]++;
 1220	}
 1221fail:
 1222	return err;
 1223}
 1224
 1225static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
 1226					   struct btrfs_path *path,
 1227					   u64 bytenr, u64 parent,
 1228					   u64 root_objectid, u64 owner,
 1229					   u64 offset, int refs_to_add)
 1230{
 1231	struct btrfs_root *root = trans->fs_info->extent_root;
 1232	struct btrfs_key key;
 1233	struct extent_buffer *leaf;
 1234	u32 size;
 1235	u32 num_refs;
 1236	int ret;
 1237
 1238	key.objectid = bytenr;
 1239	if (parent) {
 1240		key.type = BTRFS_SHARED_DATA_REF_KEY;
 1241		key.offset = parent;
 1242		size = sizeof(struct btrfs_shared_data_ref);
 1243	} else {
 1244		key.type = BTRFS_EXTENT_DATA_REF_KEY;
 1245		key.offset = hash_extent_data_ref(root_objectid,
 1246						  owner, offset);
 1247		size = sizeof(struct btrfs_extent_data_ref);
 1248	}
 1249
 1250	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
 1251	if (ret && ret != -EEXIST)
 1252		goto fail;
 1253
 1254	leaf = path->nodes[0];
 1255	if (parent) {
 1256		struct btrfs_shared_data_ref *ref;
 1257		ref = btrfs_item_ptr(leaf, path->slots[0],
 1258				     struct btrfs_shared_data_ref);
 1259		if (ret == 0) {
 1260			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
 1261		} else {
 1262			num_refs = btrfs_shared_data_ref_count(leaf, ref);
 1263			num_refs += refs_to_add;
 1264			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
 1265		}
 1266	} else {
 1267		struct btrfs_extent_data_ref *ref;
 1268		while (ret == -EEXIST) {
 1269			ref = btrfs_item_ptr(leaf, path->slots[0],
 1270					     struct btrfs_extent_data_ref);
 1271			if (match_extent_data_ref(leaf, ref, root_objectid,
 1272						  owner, offset))
 1273				break;
 1274			btrfs_release_path(path);
 1275			key.offset++;
 1276			ret = btrfs_insert_empty_item(trans, root, path, &key,
 1277						      size);
 1278			if (ret && ret != -EEXIST)
 1279				goto fail;
 1280
 1281			leaf = path->nodes[0];
 1282		}
 1283		ref = btrfs_item_ptr(leaf, path->slots[0],
 1284				     struct btrfs_extent_data_ref);
 1285		if (ret == 0) {
 1286			btrfs_set_extent_data_ref_root(leaf, ref,
 1287						       root_objectid);
 1288			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
 1289			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
 1290			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
 1291		} else {
 1292			num_refs = btrfs_extent_data_ref_count(leaf, ref);
 1293			num_refs += refs_to_add;
 1294			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
 1295		}
 1296	}
 1297	btrfs_mark_buffer_dirty(leaf);
 1298	ret = 0;
 1299fail:
 1300	btrfs_release_path(path);
 1301	return ret;
 1302}
 1303
 1304static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
 1305					   struct btrfs_path *path,
 1306					   int refs_to_drop, int *last_ref)
 1307{
 1308	struct btrfs_key key;
 1309	struct btrfs_extent_data_ref *ref1 = NULL;
 1310	struct btrfs_shared_data_ref *ref2 = NULL;
 1311	struct extent_buffer *leaf;
 1312	u32 num_refs = 0;
 1313	int ret = 0;
 1314
 1315	leaf = path->nodes[0];
 1316	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 1317
 1318	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
 1319		ref1 = btrfs_item_ptr(leaf, path->slots[0],
 1320				      struct btrfs_extent_data_ref);
 1321		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
 1322	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
 1323		ref2 = btrfs_item_ptr(leaf, path->slots[0],
 1324				      struct btrfs_shared_data_ref);
 1325		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
 1326	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
 1327		btrfs_print_v0_err(trans->fs_info);
 1328		btrfs_abort_transaction(trans, -EINVAL);
 1329		return -EINVAL;
 1330	} else {
 1331		BUG();
 1332	}
 1333
 1334	BUG_ON(num_refs < refs_to_drop);
 1335	num_refs -= refs_to_drop;
 1336
 1337	if (num_refs == 0) {
 1338		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
 1339		*last_ref = 1;
 1340	} else {
 1341		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
 1342			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
 1343		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
 1344			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
 1345		btrfs_mark_buffer_dirty(leaf);
 1346	}
 1347	return ret;
 1348}
 1349
 1350static noinline u32 extent_data_ref_count(struct btrfs_path *path,
 1351					  struct btrfs_extent_inline_ref *iref)
 1352{
 1353	struct btrfs_key key;
 1354	struct extent_buffer *leaf;
 1355	struct btrfs_extent_data_ref *ref1;
 1356	struct btrfs_shared_data_ref *ref2;
 1357	u32 num_refs = 0;
 1358	int type;
 1359
 1360	leaf = path->nodes[0];
 1361	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 1362
 1363	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
 1364	if (iref) {
 1365		/*
 1366		 * If type is invalid, we should have bailed out earlier than
 1367		 * this call.
 1368		 */
 1369		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
 1370		ASSERT(type != BTRFS_REF_TYPE_INVALID);
 1371		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 1372			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
 1373			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
 1374		} else {
 1375			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
 1376			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
 1377		}
 1378	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
 1379		ref1 = btrfs_item_ptr(leaf, path->slots[0],
 1380				      struct btrfs_extent_data_ref);
 1381		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
 1382	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
 1383		ref2 = btrfs_item_ptr(leaf, path->slots[0],
 1384				      struct btrfs_shared_data_ref);
 1385		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
 1386	} else {
 1387		WARN_ON(1);
 1388	}
 1389	return num_refs;
 1390}
 1391
 1392static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
 1393					  struct btrfs_path *path,
 1394					  u64 bytenr, u64 parent,
 1395					  u64 root_objectid)
 1396{
 1397	struct btrfs_root *root = trans->fs_info->extent_root;
 1398	struct btrfs_key key;
 1399	int ret;
 1400
 1401	key.objectid = bytenr;
 1402	if (parent) {
 1403		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
 1404		key.offset = parent;
 1405	} else {
 1406		key.type = BTRFS_TREE_BLOCK_REF_KEY;
 1407		key.offset = root_objectid;
 1408	}
 1409
 1410	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 1411	if (ret > 0)
 1412		ret = -ENOENT;
 1413	return ret;
 1414}
 1415
 1416static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
 1417					  struct btrfs_path *path,
 1418					  u64 bytenr, u64 parent,
 1419					  u64 root_objectid)
 1420{
 1421	struct btrfs_key key;
 1422	int ret;
 1423
 1424	key.objectid = bytenr;
 1425	if (parent) {
 1426		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
 1427		key.offset = parent;
 1428	} else {
 1429		key.type = BTRFS_TREE_BLOCK_REF_KEY;
 1430		key.offset = root_objectid;
 1431	}
 1432
 1433	ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
 1434				      path, &key, 0);
 1435	btrfs_release_path(path);
 1436	return ret;
 1437}
 1438
 1439static inline int extent_ref_type(u64 parent, u64 owner)
 1440{
 1441	int type;
 1442	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 1443		if (parent > 0)
 1444			type = BTRFS_SHARED_BLOCK_REF_KEY;
 1445		else
 1446			type = BTRFS_TREE_BLOCK_REF_KEY;
 1447	} else {
 1448		if (parent > 0)
 1449			type = BTRFS_SHARED_DATA_REF_KEY;
 1450		else
 1451			type = BTRFS_EXTENT_DATA_REF_KEY;
 1452	}
 1453	return type;
 1454}
 1455
 1456static int find_next_key(struct btrfs_path *path, int level,
 1457			 struct btrfs_key *key)
 1458
 1459{
 1460	for (; level < BTRFS_MAX_LEVEL; level++) {
 1461		if (!path->nodes[level])
 1462			break;
 1463		if (path->slots[level] + 1 >=
 1464		    btrfs_header_nritems(path->nodes[level]))
 1465			continue;
 1466		if (level == 0)
 1467			btrfs_item_key_to_cpu(path->nodes[level], key,
 1468					      path->slots[level] + 1);
 1469		else
 1470			btrfs_node_key_to_cpu(path->nodes[level], key,
 1471					      path->slots[level] + 1);
 1472		return 0;
 1473	}
 1474	return 1;
 1475}
 1476
/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * *ref_ret is left untouched on every other exit:
 *  - -ENOENT because the extent item itself was not found (insert false),
 *  - -EIO because the extent item was not found although insert is true,
 *  - -EINVAL (transaction aborted) because the item is smaller than
 *    struct btrfs_extent_item,
 *  - -EUCLEAN because an inline ref has an invalid type,
 *  - -EAGAIN,
 *  - any negative error from btrfs_search_slot().
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
	int needed;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	/* The ref type we are looking for, derived from parent and owner. */
	want = extent_ref_type(parent, owner);
	if (insert) {
		/* Search with room for one more inline ref of that type. */
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		/* Lookup only: -1 is handed to btrfs_search_slot() as is. */
		extra_size = -1;

	/*
	 * For tree blocks on a skinny metadata filesystem the extent is
	 * keyed by a METADATA_ITEM whose offset is owner (per the original
	 * note here, owner carries the block level for tree blocks).
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		/* Cleared so this fallback is attempted only once. */
		skinny_metadata = false;
		if (path->slots[0]) {
			/* Check the item just before the returned slot. */
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			/* Not adjacent: search again with the EXTENT_ITEM key. */
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	/*
	 * No extent item: plain "not found" for a lookup, but unexpected
	 * (warned about and reported as -EIO) when the caller wants to insert.
	 */
	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	/* Too small to hold a struct btrfs_extent_item: treated as V0. */
	if (unlikely(item_size < sizeof(*ei))) {
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	/* Inline refs occupy [ptr, end) after the extent item header. */
	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	/* Non-skinny tree block items carry a tree block info first. */
	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	/* Class of ref type every inline ref in this item must belong to. */
	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
		needed = BTRFS_REF_TYPE_DATA;
	else
		needed = BTRFS_REF_TYPE_BLOCK;

	/*
	 * Walk the inline refs.  On a hit err becomes 0 and ptr points at
	 * the ref; otherwise err stays -ENOENT and ptr ends up at the
	 * position where the ref would have to be inserted.
	 */
	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
		if (type == BTRFS_REF_TYPE_INVALID) {
			err = -EUCLEAN;
			goto out;
		}

		/* Already past where the wanted type would be stored. */
		if (want < type)
			break;
		/* Not there yet: skip this ref. */
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		/* Same type: compare the identifying fields. */
		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			/*
			 * Stored hash is below the wanted one: stop, this is
			 * the insertion point.
			 */
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				/* Shared refs are identified by parent. */
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				/* Non-shared refs are identified by root. */
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		/* Growing the item by extra_size would make it too large. */
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	/* Either the matching ref (err == 0) or the insertion point. */
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		/* Undo the keep_locks setting made for the insert search. */
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}
 1679
 1680/*
 1681 * helper to add new inline back ref
 1682 */
 1683static noinline_for_stack
 1684void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
 1685				 struct btrfs_path *path,
 1686				 struct btrfs_extent_inline_ref *iref,
 1687				 u64 parent, u64 root_objectid,
 1688				 u64 owner, u64 offset, int refs_to_add,
 1689				 struct btrfs_delayed_extent_op *extent_op)
 1690{
 1691	struct extent_buffer *leaf;
 1692	struct btrfs_extent_item *ei;
 1693	unsigned long ptr;
 1694	unsigned long end;
 1695	unsigned long item_offset;
 1696	u64 refs;
 1697	int size;
 1698	int type;
 1699
 1700	leaf = path->nodes[0];
 1701	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 1702	item_offset = (unsigned long)iref - (unsigned long)ei;
 1703
 1704	type = extent_ref_type(parent, owner);
 1705	size = btrfs_extent_inline_ref_size(type);
 1706
 1707	btrfs_extend_item(fs_info, path, size);
 1708
 1709	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 1710	refs = btrfs_extent_refs(leaf, ei);
 1711	refs += refs_to_add;
 1712	btrfs_set_extent_refs(leaf, ei, refs);
 1713	if (extent_op)
 1714		__run_delayed_extent_op(extent_op, leaf, ei);
 1715
 1716	ptr = (unsigned long)ei + item_offset;
 1717	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
 1718	if (ptr < end - size)
 1719		memmove_extent_buffer(leaf, ptr + size, ptr,
 1720				      end - size - ptr);
 1721
 1722	iref = (struct btrfs_extent_inline_ref *)ptr;
 1723	btrfs_set_extent_inline_ref_type(leaf, iref, type);
 1724	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 1725		struct btrfs_extent_data_ref *dref;
 1726		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
 1727		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
 1728		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
 1729		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
 1730		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
 1731	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
 1732		struct btrfs_shared_data_ref *sref;
 1733		sref = (struct btrfs_shared_data_ref *)(iref + 1);
 1734		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
 1735		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
 1736	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
 1737		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
 1738	} else {
 1739		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
 1740	}
 1741	btrfs_mark_buffer_dirty(leaf);
 1742}
 1743
 1744static int lookup_extent_backref(struct btrfs_trans_handle *trans,
 1745				 struct btrfs_path *path,
 1746				 struct btrfs_extent_inline_ref **ref_ret,
 1747				 u64 bytenr, u64 num_bytes, u64 parent,
 1748				 u64 root_objectid, u64 owner, u64 offset)
 1749{
 1750	int ret;
 1751
 1752	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
 1753					   num_bytes, parent, root_objectid,
 1754					   owner, offset, 0);
 1755	if (ret != -ENOENT)
 1756		return ret;
 1757
 1758	btrfs_release_path(path);
 1759	*ref_ret = NULL;
 1760
 1761	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 1762		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
 1763					    root_objectid);
 1764	} else {
 1765		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
 1766					     root_objectid, owner, offset);
 1767	}
 1768	return ret;
 1769}
 1770
 1771/*
 1772 * helper to update/remove inline back ref
 1773 */
 1774static noinline_for_stack
 1775void update_inline_extent_backref(struct btrfs_path *path,
 1776				  struct btrfs_extent_inline_ref *iref,
 1777				  int refs_to_mod,
 1778				  struct btrfs_delayed_extent_op *extent_op,
 1779				  int *last_ref)
 1780{
 1781	struct extent_buffer *leaf = path->nodes[0];
 1782	struct btrfs_fs_info *fs_info = leaf->fs_info;
 1783	struct btrfs_extent_item *ei;
 1784	struct btrfs_extent_data_ref *dref = NULL;
 1785	struct btrfs_shared_data_ref *sref = NULL;
 1786	unsigned long ptr;
 1787	unsigned long end;
 1788	u32 item_size;
 1789	int size;
 1790	int type;
 1791	u64 refs;
 1792
 1793	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 1794	refs = btrfs_extent_refs(leaf, ei);
 1795	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
 1796	refs += refs_to_mod;
 1797	btrfs_set_extent_refs(leaf, ei, refs);
 1798	if (extent_op)
 1799		__run_delayed_extent_op(extent_op, leaf, ei);
 1800
 1801	/*
 1802	 * If type is invalid, we should have bailed out after
 1803	 * lookup_inline_extent_backref().
 1804	 */
 1805	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
 1806	ASSERT(type != BTRFS_REF_TYPE_INVALID);
 1807
 1808	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 1809		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
 1810		refs = btrfs_extent_data_ref_count(leaf, dref);
 1811	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
 1812		sref = (struct btrfs_shared_data_ref *)(iref + 1);
 1813		refs = btrfs_shared_data_ref_count(leaf, sref);
 1814	} else {
 1815		refs = 1;
 1816		BUG_ON(refs_to_mod != -1);
 1817	}
 1818
 1819	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
 1820	refs += refs_to_mod;
 1821
 1822	if (refs > 0) {
 1823		if (type == BTRFS_EXTENT_DATA_REF_KEY)
 1824			btrfs_set_extent_data_ref_count(leaf, dref, refs);
 1825		else
 1826			btrfs_set_shared_data_ref_count(leaf, sref, refs);
 1827	} else {
 1828		*last_ref = 1;
 1829		size =  btrfs_extent_inline_ref_size(type);
 1830		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 1831		ptr = (unsigned long)iref;
 1832		end = (unsigned long)ei + item_size;
 1833		if (ptr + size < end)
 1834			memmove_extent_buffer(leaf, ptr, ptr + size,
 1835					      end - ptr - size);
 1836		item_size -= size;
 1837		btrfs_truncate_item(fs_info, path, item_size, 1);
 1838	}
 1839	btrfs_mark_buffer_dirty(leaf);
 1840}
 1841
 1842static noinline_for_stack
 1843int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
 1844				 struct btrfs_path *path,
 1845				 u64 bytenr, u64 num_bytes, u64 parent,
 1846				 u64 root_objectid, u64 owner,
 1847				 u64 offset, int refs_to_add,
 1848				 struct btrfs_delayed_extent_op *extent_op)
 1849{
 1850	struct btrfs_extent_inline_ref *iref;
 1851	int ret;
 1852
 1853	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
 1854					   num_bytes, parent, root_objectid,
 1855					   owner, offset, 1);
 1856	if (ret == 0) {
 1857		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
 1858		update_inline_extent_backref(path, iref, refs_to_add,
 1859					     extent_op, NULL);
 1860	} else if (ret == -ENOENT) {
 1861		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
 1862					    root_objectid, owner, offset,
 1863					    refs_to_add, extent_op);
 1864		ret = 0;
 1865	}
 1866	return ret;
 1867}
 1868
 1869static int insert_extent_backref(struct btrfs_trans_handle *trans,
 1870				 struct btrfs_path *path,
 1871				 u64 bytenr, u64 parent, u64 root_objectid,
 1872				 u64 owner, u64 offset, int refs_to_add)
 1873{
 1874	int ret;
 1875	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 1876		BUG_ON(refs_to_add != 1);
 1877		ret = insert_tree_block_ref(trans, path, bytenr, parent,
 1878					    root_objectid);
 1879	} else {
 1880		ret = insert_extent_data_ref(trans, path, bytenr, parent,
 1881					     root_objectid, owner, offset,
 1882					     refs_to_add);
 1883	}
 1884	return ret;
 1885}
 1886
 1887static int remove_extent_backref(struct btrfs_trans_handle *trans,
 1888				 struct btrfs_path *path,
 1889				 struct btrfs_extent_inline_ref *iref,
 1890				 int refs_to_drop, int is_data, int *last_ref)
 1891{
 1892	int ret = 0;
 1893
 1894	BUG_ON(!is_data && refs_to_drop != 1);
 1895	if (iref) {
 1896		update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
 1897					     last_ref);
 1898	} else if (is_data) {
 1899		ret = remove_extent_data_ref(trans, path, refs_to_drop,
 1900					     last_ref);
 1901	} else {
 1902		*last_ref = 1;
 1903		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
 1904	}
 1905	return ret;
 1906}
 1907
 1908#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
 1909static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
 1910			       u64 *discarded_bytes)
 1911{
 1912	int j, ret = 0;
 1913	u64 bytes_left, end;
 1914	u64 aligned_start = ALIGN(start, 1 << 9);
 1915
 1916	if (WARN_ON(start != aligned_start)) {
 1917		len -= aligned_start - start;
 1918		len = round_down(len, 1 << 9);
 1919		start = aligned_start;
 1920	}
 1921
 1922	*discarded_bytes = 0;
 1923
 1924	if (!len)
 1925		return 0;
 1926
 1927	end = start + len;
 1928	bytes_left = len;
 1929
 1930	/* Skip any superblocks on this device. */
 1931	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
 1932		u64 sb_start = btrfs_sb_offset(j);
 1933		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
 1934		u64 size = sb_start - start;
 1935
 1936		if (!in_range(sb_start, start, bytes_left) &&
 1937		    !in_range(sb_end, start, bytes_left) &&
 1938		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
 1939			continue;
 1940
 1941		/*
 1942		 * Superblock spans beginning of range.  Adjust start and
 1943		 * try again.
 1944		 */
 1945		if (sb_start <= start) {
 1946			start += sb_end - start;
 1947			if (start > end) {
 1948				bytes_left = 0;
 1949				break;
 1950			}
 1951			bytes_left = end - start;
 1952			continue;
 1953		}
 1954
 1955		if (size) {
 1956			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
 1957						   GFP_NOFS, 0);
 1958			if (!ret)
 1959				*discarded_bytes += size;
 1960			else if (ret != -EOPNOTSUPP)
 1961				return ret;
 1962		}
 1963
 1964		start = sb_end;
 1965		if (start > end) {
 1966			bytes_left = 0;
 1967			break;
 1968		}
 1969		bytes_left = end - start;
 1970	}
 1971
 1972	if (bytes_left) {
 1973		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
 1974					   GFP_NOFS, 0);
 1975		if (!ret)
 1976			*discarded_bytes += bytes_left;
 1977	}
 1978	return ret;
 1979}
 1980
 1981int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
 1982			 u64 num_bytes, u64 *actual_bytes)
 1983{
 1984	int ret;
 1985	u64 discarded_bytes = 0;
 1986	struct btrfs_bio *bbio = NULL;
 1987
 1988
 1989	/*
 1990	 * Avoid races with device replace and make sure our bbio has devices
 1991	 * associated to its stripes that don't go away while we are discarding.
 1992	 */
 1993	btrfs_bio_counter_inc_blocked(fs_info);
 1994	/* Tell the block device(s) that the sectors can be discarded */
 1995	ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
 1996			      &bbio, 0);
 1997	/* Error condition is -ENOMEM */
 1998	if (!ret) {
 1999		struct btrfs_bio_stripe *stripe = bbio->stripes;
 2000		int i;
 2001
 2002
 2003		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
 2004			u64 bytes;
 2005			struct request_queue *req_q;
 2006
 2007			if (!stripe->dev->bdev) {
 2008				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
 2009				continue;
 2010			}
 2011			req_q = bdev_get_queue(stripe->dev->bdev);
 2012			if (!blk_queue_discard(req_q))
 2013				continue;
 2014
 2015			ret = btrfs_issue_discard(stripe->dev->bdev,
 2016						  stripe->physical,
 2017						  stripe->length,
 2018						  &bytes);
 2019			if (!ret)
 2020				discarded_bytes += bytes;
 2021			else if (ret != -EOPNOTSUPP)
 2022				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
 2023
 2024			/*
 2025			 * Just in case we get back EOPNOTSUPP for some reason,
 2026			 * just ignore the return value so we don't screw up
 2027			 * people calling discard_extent.
 2028			 */
 2029			ret = 0;
 2030		}
 2031		btrfs_put_bbio(bbio);
 2032	}
 2033	btrfs_bio_counter_dec(fs_info);
 2034
 2035	if (actual_bytes)
 2036		*actual_bytes = discarded_bytes;
 2037
 2038
 2039	if (ret == -EOPNOTSUPP)
 2040		ret = 0;
 2041	return ret;
 2042}
 2043
 2044/* Can return -ENOMEM */
 2045int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 2046			 struct btrfs_root *root,
 2047			 u64 bytenr, u64 num_bytes, u64 parent,
 2048			 u64 root_objectid, u64 owner, u64 offset)
 2049{
 2050	struct btrfs_fs_info *fs_info = root->fs_info;
 2051	int old_ref_mod, new_ref_mod;
 2052	int ret;
 2053
 2054	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
 2055	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
 2056
 2057	btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
 2058			   owner, offset, BTRFS_ADD_DELAYED_REF);
 2059
 2060	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 2061		ret = btrfs_add_delayed_tree_ref(trans, bytenr,
 2062						 num_bytes, parent,
 2063						 root_objectid, (int)owner,
 2064						 BTRFS_ADD_DELAYED_REF, NULL,
 2065						 &old_ref_mod, &new_ref_mod);
 2066	} else {
 2067		ret = btrfs_add_delayed_data_ref(trans, bytenr,
 2068						 num_bytes, parent,
 2069						 root_objectid, owner, offset,
 2070						 0, BTRFS_ADD_DELAYED_REF,
 2071						 &old_ref_mod, &new_ref_mod);
 2072	}
 2073
 2074	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
 2075		bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
 2076
 2077		add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
 2078	}
 2079
 2080	return ret;
 2081}
 2082
 2083/*
 2084 * __btrfs_inc_extent_ref - insert backreference for a given extent
 2085 *
 2086 * @trans:	    Handle of transaction
 2087 *
 2088 * @node:	    The delayed ref node used to get the bytenr/length for
 2089 *		    extent whose references are incremented.
 2090 *
 2091 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
 2092 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
 2093 *		    bytenr of the parent block. Since new extents are always
 2094 *		    created with indirect references, this will only be the case
 2095 *		    when relocating a shared extent. In that case, root_objectid
 *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
 2097 *		    be 0
 2098 *
 2099 * @root_objectid:  The id of the root where this modification has originated,
 2100 *		    this can be either one of the well-known metadata trees or
 2101 *		    the subvolume id which references this extent.
 2102 *
 2103 * @owner:	    For data extents it is the inode number of the owning file.
 2104 *		    For metadata extents this parameter holds the level in the
 2105 *		    tree of the extent.
 2106 *
 2107 * @offset:	    For metadata extents the offset is ignored and is currently
 2108 *		    always passed as 0. For data extents it is the fileoffset
 2109 *		    this extent belongs to.
 2110 *
 2111 * @refs_to_add     Number of references to add
 2112 *
 2113 * @extent_op       Pointer to a structure, holding information necessary when
 2114 *                  updating a tree block's flags
 2115 *
 2116 */
 2117static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 2118				  struct btrfs_delayed_ref_node *node,
 2119				  u64 parent, u64 root_objectid,
 2120				  u64 owner, u64 offset, int refs_to_add,
 2121				  struct btrfs_delayed_extent_op *extent_op)
 2122{
 2123	struct btrfs_path *path;
 2124	struct extent_buffer *leaf;
 2125	struct btrfs_extent_item *item;
 2126	struct btrfs_key key;
 2127	u64 bytenr = node->bytenr;
 2128	u64 num_bytes = node->num_bytes;
 2129	u64 refs;
 2130	int ret;
 2131
 2132	path = btrfs_alloc_path();
 2133	if (!path)
 2134		return -ENOMEM;
 2135
 2136	path->reada = READA_FORWARD;
 2137	path->leave_spinning = 1;
 2138	/* this will setup the path even if it fails to insert the back ref */
 2139	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
 2140					   parent, root_objectid, owner,
 2141					   offset, refs_to_add, extent_op);
 2142	if ((ret < 0 && ret != -EAGAIN) || !ret)
 2143		goto out;
 2144
 2145	/*
 2146	 * Ok we had -EAGAIN which means we didn't have space to insert and
 2147	 * inline extent ref, so just update the reference count and add a
 2148	 * normal backref.
 2149	 */
 2150	leaf = path->nodes[0];
 2151	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 2152	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 2153	refs = btrfs_extent_refs(leaf, item);
 2154	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
 2155	if (extent_op)
 2156		__run_delayed_extent_op(extent_op, leaf, item);
 2157
 2158	btrfs_mark_buffer_dirty(leaf);
 2159	btrfs_release_path(path);
 2160
 2161	path->reada = READA_FORWARD;
 2162	path->leave_spinning = 1;
 2163	/* now insert the actual backref */
 2164	ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
 2165				    owner, offset, refs_to_add);
 2166	if (ret)
 2167		btrfs_abort_transaction(trans, ret);
 2168out:
 2169	btrfs_free_path(path);
 2170	return ret;
 2171}
 2172
 2173static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 2174				struct btrfs_delayed_ref_node *node,
 2175				struct btrfs_delayed_extent_op *extent_op,
 2176				int insert_reserved)
 2177{
 2178	int ret = 0;
 2179	struct btrfs_delayed_data_ref *ref;
 2180	struct btrfs_key ins;
 2181	u64 parent = 0;
 2182	u64 ref_root = 0;
 2183	u64 flags = 0;
 2184
 2185	ins.objectid = node->bytenr;
 2186	ins.offset = node->num_bytes;
 2187	ins.type = BTRFS_EXTENT_ITEM_KEY;
 2188
 2189	ref = btrfs_delayed_node_to_data_ref(node);
 2190	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
 2191
 2192	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
 2193		parent = ref->parent;
 2194	ref_root = ref->root;
 2195
 2196	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
 2197		if (extent_op)
 2198			flags |= extent_op->flags_to_set;
 2199		ret = alloc_reserved_file_extent(trans, parent, ref_root,
 2200						 flags, ref->objectid,
 2201						 ref->offset, &ins,
 2202						 node->ref_mod);
 2203	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
 2204		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
 2205					     ref->objectid, ref->offset,
 2206					     node->ref_mod, extent_op);
 2207	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
 2208		ret = __btrfs_free_extent(trans, node, parent,
 2209					  ref_root, ref->objectid,
 2210					  ref->offset, node->ref_mod,
 2211					  extent_op);
 2212	} else {
 2213		BUG();
 2214	}
 2215	return ret;
 2216}
 2217
 2218static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
 2219				    struct extent_buffer *leaf,
 2220				    struct btrfs_extent_item *ei)
 2221{
 2222	u64 flags = btrfs_extent_flags(leaf, ei);
 2223	if (extent_op->update_flags) {
 2224		flags |= extent_op->flags_to_set;
 2225		btrfs_set_extent_flags(leaf, ei, flags);
 2226	}
 2227
 2228	if (extent_op->update_key) {
 2229		struct btrfs_tree_block_info *bi;
 2230		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
 2231		bi = (struct btrfs_tree_block_info *)(ei + 1);
 2232		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
 2233	}
 2234}
 2235
 2236static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
 2237				 struct btrfs_delayed_ref_head *head,
 2238				 struct btrfs_delayed_extent_op *extent_op)
 2239{
 2240	struct btrfs_fs_info *fs_info = trans->fs_info;
 2241	struct btrfs_key key;
 2242	struct btrfs_path *path;
 2243	struct btrfs_extent_item *ei;
 2244	struct extent_buffer *leaf;
 2245	u32 item_size;
 2246	int ret;
 2247	int err = 0;
 2248	int metadata = !extent_op->is_data;
 2249
 2250	if (trans->aborted)
 2251		return 0;
 2252
 2253	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
 2254		metadata = 0;
 2255
 2256	path = btrfs_alloc_path();
 2257	if (!path)
 2258		return -ENOMEM;
 2259
 2260	key.objectid = head->bytenr;
 2261
 2262	if (metadata) {
 2263		key.type = BTRFS_METADATA_ITEM_KEY;
 2264		key.offset = extent_op->level;
 2265	} else {
 2266		key.type = BTRFS_EXTENT_ITEM_KEY;
 2267		key.offset = head->num_bytes;
 2268	}
 2269
 2270again:
 2271	path->reada = READA_FORWARD;
 2272	path->leave_spinning = 1;
 2273	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
 2274	if (ret < 0) {
 2275		err = ret;
 2276		goto out;
 2277	}
 2278	if (ret > 0) {
 2279		if (metadata) {
 2280			if (path->slots[0] > 0) {
 2281				path->slots[0]--;
 2282				btrfs_item_key_to_cpu(path->nodes[0], &key,
 2283						      path->slots[0]);
 2284				if (key.objectid == head->bytenr &&
 2285				    key.type == BTRFS_EXTENT_ITEM_KEY &&
 2286				    key.offset == head->num_bytes)
 2287					ret = 0;
 2288			}
 2289			if (ret > 0) {
 2290				btrfs_release_path(path);
 2291				metadata = 0;
 2292
 2293				key.objectid = head->bytenr;
 2294				key.offset = head->num_bytes;
 2295				key.type = BTRFS_EXTENT_ITEM_KEY;
 2296				goto again;
 2297			}
 2298		} else {
 2299			err = -EIO;
 2300			goto out;
 2301		}
 2302	}
 2303
 2304	leaf = path->nodes[0];
 2305	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 2306
 2307	if (unlikely(item_size < sizeof(*ei))) {
 2308		err = -EINVAL;
 2309		btrfs_print_v0_err(fs_info);
 2310		btrfs_abort_transaction(trans, err);
 2311		goto out;
 2312	}
 2313
 2314	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 2315	__run_delayed_extent_op(extent_op, leaf, ei);
 2316
 2317	btrfs_mark_buffer_dirty(leaf);
 2318out:
 2319	btrfs_free_path(path);
 2320	return err;
 2321}
 2322
 2323static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 2324				struct btrfs_delayed_ref_node *node,
 2325				struct btrfs_delayed_extent_op *extent_op,
 2326				int insert_reserved)
 2327{
 2328	int ret = 0;
 2329	struct btrfs_delayed_tree_ref *ref;
 2330	u64 parent = 0;
 2331	u64 ref_root = 0;
 2332
 2333	ref = btrfs_delayed_node_to_tree_ref(node);
 2334	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
 2335
 2336	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
 2337		parent = ref->parent;
 2338	ref_root = ref->root;
 2339
 2340	if (node->ref_mod != 1) {
 2341		btrfs_err(trans->fs_info,
 2342	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
 2343			  node->bytenr, node->ref_mod, node->action, ref_root,
 2344			  parent);
 2345		return -EIO;
 2346	}
 2347	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
 2348		BUG_ON(!extent_op || !extent_op->update_flags);
 2349		ret = alloc_reserved_tree_block(trans, node, extent_op);
 2350	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
 2351		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
 2352					     ref->level, 0, 1, extent_op);
 2353	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
 2354		ret = __btrfs_free_extent(trans, node, parent, ref_root,
 2355					  ref->level, 0, 1, extent_op);
 2356	} else {
 2357		BUG();
 2358	}
 2359	return ret;
 2360}
 2361
 2362/* helper function to actually process a single delayed ref entry */
 2363static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 2364			       struct btrfs_delayed_ref_node *node,
 2365			       struct btrfs_delayed_extent_op *extent_op,
 2366			       int insert_reserved)
 2367{
 2368	int ret = 0;
 2369
 2370	if (trans->aborted) {
 2371		if (insert_reserved)
 2372			btrfs_pin_extent(trans->fs_info, node->bytenr,
 2373					 node->num_bytes, 1);
 2374		return 0;
 2375	}
 2376
 2377	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
 2378	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
 2379		ret = run_delayed_tree_ref(trans, node, extent_op,
 2380					   insert_reserved);
 2381	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
 2382		 node->type == BTRFS_SHARED_DATA_REF_KEY)
 2383		ret = run_delayed_data_ref(trans, node, extent_op,
 2384					   insert_reserved);
 2385	else
 2386		BUG();
 2387	if (ret && insert_reserved)
 2388		btrfs_pin_extent(trans->fs_info, node->bytenr,
 2389				 node->num_bytes, 1);
 2390	return ret;
 2391}
 2392
 2393static inline struct btrfs_delayed_ref_node *
 2394select_delayed_ref(struct btrfs_delayed_ref_head *head)
 2395{
 2396	struct btrfs_delayed_ref_node *ref;
 2397
 2398	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
 2399		return NULL;
 2400
 2401	/*
 2402	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
 2403	 * This is to prevent a ref count from going down to zero, which deletes
 2404	 * the extent item from the extent tree, when there still are references
 2405	 * to add, which would fail because they would not find the extent item.
 2406	 */
 2407	if (!list_empty(&head->ref_add_list))
 2408		return list_first_entry(&head->ref_add_list,
 2409				struct btrfs_delayed_ref_node, add_list);
 2410
 2411	ref = rb_entry(rb_first_cached(&head->ref_tree),
 2412		       struct btrfs_delayed_ref_node, ref_node);
 2413	ASSERT(list_empty(&ref->add_list));
 2414	return ref;
 2415}
 2416
/*
 * Hand @head back without finishing it: under delayed_refs->lock clear its
 * processing flag and count it in num_heads_ready again, then release it with
 * btrfs_delayed_ref_unlock().
 */
static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
				      struct btrfs_delayed_ref_head *head)
{
	spin_lock(&delayed_refs->lock);
	head->processing = 0;
	delayed_refs->num_heads_ready++;
	spin_unlock(&delayed_refs->lock);
	btrfs_delayed_ref_unlock(head);
}
 2426
 2427static struct btrfs_delayed_extent_op *cleanup_extent_op(
 2428				struct btrfs_delayed_ref_head *head)
 2429{
 2430	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 2431
 2432	if (!extent_op)
 2433		return NULL;
 2434
 2435	if (head->must_insert_reserved) {
 2436		head->extent_op = NULL;
 2437		btrfs_free_delayed_extent_op(extent_op);
 2438		return NULL;
 2439	}
 2440	return extent_op;
 2441}
 2442
 2443static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
 2444				     struct btrfs_delayed_ref_head *head)
 2445{
 2446	struct btrfs_delayed_extent_op *extent_op;
 2447	int ret;
 2448
 2449	extent_op = cleanup_extent_op(head);
 2450	if (!extent_op)
 2451		return 0;
 2452	head->extent_op = NULL;
 2453	spin_unlock(&head->lock);
 2454	ret = run_delayed_extent_op(trans, head, extent_op);
 2455	btrfs_free_delayed_extent_op(extent_op);
 2456	return ret ? ret : 1;
 2457}
 2458
 2459void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
 2460				  struct btrfs_delayed_ref_root *delayed_refs,
 2461				  struct btrfs_delayed_ref_head *head)
 2462{
 2463	int nr_items = 1;	/* Dropping this ref head update. */
 2464
 2465	if (head->total_ref_mod < 0) {
 2466		struct btrfs_space_info *space_info;
 2467		u64 flags;
 2468
 2469		if (head->is_data)
 2470			flags = BTRFS_BLOCK_GROUP_DATA;
 2471		else if (head->is_system)
 2472			flags = BTRFS_BLOCK_GROUP_SYSTEM;
 2473		else
 2474			flags = BTRFS_BLOCK_GROUP_METADATA;
 2475		space_info = __find_space_info(fs_info, flags);
 2476		ASSERT(space_info);
 2477		percpu_counter_add_batch(&space_info->total_bytes_pinned,
 2478				   -head->num_bytes,
 2479				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
 2480
 2481		/*
 2482		 * We had csum deletions accounted for in our delayed refs rsv,
 2483		 * we need to drop the csum leaves for this update from our
 2484		 * delayed_refs_rsv.
 2485		 */
 2486		if (head->is_data) {
 2487			spin_lock(&delayed_refs->lock);
 2488			delayed_refs->pending_csums -= head->num_bytes;
 2489			spin_unlock(&delayed_refs->lock);
 2490			nr_items += btrfs_csum_bytes_to_leaves(fs_info,
 2491				head->num_bytes);
 2492		}
 2493	}
 2494
 2495	/* Also free its reserved qgroup space */
 2496	btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
 2497				      head->qgroup_reserved);
 2498	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
 2499}
 2500
 2501static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 2502			    struct btrfs_delayed_ref_head *head)
 2503{
 2504
 2505	struct btrfs_fs_info *fs_info = trans->fs_info;
 2506	struct btrfs_delayed_ref_root *delayed_refs;
 2507	int ret;
 2508
 2509	delayed_refs = &trans->transaction->delayed_refs;
 2510
 2511	ret = run_and_cleanup_extent_op(trans, head);
 2512	if (ret < 0) {
 2513		unselect_delayed_ref_head(delayed_refs, head);
 2514		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
 2515		return ret;
 2516	} else if (ret) {
 2517		return ret;
 2518	}
 2519
 2520	/*
 2521	 * Need to drop our head ref lock and re-acquire the delayed ref lock
 2522	 * and then re-check to make sure nobody got added.
 2523	 */
 2524	spin_unlock(&head->lock);
 2525	spin_lock(&delayed_refs->lock);
 2526	spin_lock(&head->lock);
 2527	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
 2528		spin_unlock(&head->lock);
 2529		spin_unlock(&delayed_refs->lock);
 2530		return 1;
 2531	}
 2532	btrfs_delete_ref_head(delayed_refs, head);
 2533	spin_unlock(&head->lock);
 2534	spin_unlock(&delayed_refs->lock);
 2535
 2536	if (head->must_insert_reserved) {
 2537		btrfs_pin_extent(fs_info, head->bytenr,
 2538				 head->num_bytes, 1);
 2539		if (head->is_data) {
 2540			ret = btrfs_del_csums(trans, fs_info, head->bytenr,
 2541					      head->num_bytes);
 2542		}
 2543	}
 2544
 2545	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
 2546
 2547	trace_run_delayed_ref_head(fs_info, head, 0);
 2548	btrfs_delayed_ref_unlock(head);
 2549	btrfs_put_delayed_ref_head(head);
 2550	return 0;
 2551}
 2552
 2553static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
 2554					struct btrfs_trans_handle *trans)
 2555{
 2556	struct btrfs_delayed_ref_root *delayed_refs =
 2557		&trans->transaction->delayed_refs;
 2558	struct btrfs_delayed_ref_head *head = NULL;
 2559	int ret;
 2560
 2561	spin_lock(&delayed_refs->lock);
 2562	head = btrfs_select_ref_head(delayed_refs);
 2563	if (!head) {
 2564		spin_unlock(&delayed_refs->lock);
 2565		return head;
 2566	}
 2567
 2568	/*
 2569	 * Grab the lock that says we are going to process all the refs for
 2570	 * this head
 2571	 */
 2572	ret = btrfs_delayed_ref_lock(delayed_refs, head);
 2573	spin_unlock(&delayed_refs->lock);
 2574
 2575	/*
 2576	 * We may have dropped the spin lock to get the head mutex lock, and
 2577	 * that might have given someone else time to free the head.  If that's
 2578	 * true, it has been removed from our list and we can move on.
 2579	 */
 2580	if (ret == -EAGAIN)
 2581		head = ERR_PTR(-EAGAIN);
 2582
 2583	return head;
 2584}
 2585
/*
 * Run every delayed ref currently queued on @locked_ref.
 *
 * Must be entered with both locked_ref->mutex and locked_ref->lock held.
 * @run_refs is incremented once for each ref taken off the head.
 *
 * Returns:
 *   0        no refs are left on the head; locked_ref->lock is still held;
 *   -EAGAIN  btrfs_check_delayed_seq() returned non-zero for a ref with a
 *            seq; the spinlock was dropped and the head unselected;
 *   other    non-zero return of run_one_delayed_ref(); the spinlock was
 *            dropped and the head unselected.
 */
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
				    struct btrfs_delayed_ref_head *locked_ref,
				    unsigned long *run_refs)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_delayed_ref_node *ref;
	int must_insert_reserved = 0;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	lockdep_assert_held(&locked_ref->mutex);
	lockdep_assert_held(&locked_ref->lock);

	while ((ref = select_delayed_ref(locked_ref))) {
		/*
		 * NOTE(review): presumably a non-zero result means the ref
		 * must not be run yet; confirm against
		 * btrfs_check_delayed_seq().  The head is given back so it
		 * can be selected again.
		 */
		if (ref->seq &&
		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
			spin_unlock(&locked_ref->lock);
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			return -EAGAIN;
		}

		/* Detach the ref from the head's tree and from the add list. */
		(*run_refs)++;
		ref->in_tree = 0;
		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
		RB_CLEAR_NODE(&ref->ref_node);
		if (!list_empty(&ref->add_list))
			list_del(&ref->add_list);
		/*
		 * When we play the delayed ref, also correct the ref_mod on
		 * head
		 */
		switch (ref->action) {
		case BTRFS_ADD_DELAYED_REF:
		case BTRFS_ADD_DELAYED_EXTENT:
			locked_ref->ref_mod -= ref->ref_mod;
			break;
		case BTRFS_DROP_DELAYED_REF:
			locked_ref->ref_mod += ref->ref_mod;
			break;
		default:
			WARN_ON(1);
		}
		atomic_dec(&delayed_refs->num_entries);

		/*
		 * Record the must_insert_reserved flag before we drop the
		 * spin lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		locked_ref->must_insert_reserved = 0;

		/* Take the extent op off the head; it is freed below. */
		extent_op = locked_ref->extent_op;
		locked_ref->extent_op = NULL;
		spin_unlock(&locked_ref->lock);

		ret = run_one_delayed_ref(trans, ref, extent_op,
					  must_insert_reserved);

		btrfs_free_delayed_extent_op(extent_op);
		if (ret) {
			/* The spinlock is already dropped; give the head back. */
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			btrfs_put_delayed_ref(ref);
			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
				    ret);
			return ret;
		}

		btrfs_put_delayed_ref(ref);
		cond_resched();

		/* Retake the spinlock before selecting the next ref. */
		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
	}

	return 0;
}
 2665
 2666/*
 2667 * Returns 0 on success or if called with an already aborted transaction.
 2668 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
 2669 */
 2670static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 2671					     unsigned long nr)
 2672{
 2673	struct btrfs_fs_info *fs_info = trans->fs_info;
 2674	struct btrfs_delayed_ref_root *delayed_refs;
 2675	struct btrfs_delayed_ref_head *locked_ref = NULL;
 2676	ktime_t start = ktime_get();
 2677	int ret;
 2678	unsigned long count = 0;
 2679	unsigned long actual_count = 0;
 2680
 2681	delayed_refs = &trans->transaction->delayed_refs;
 2682	do {
 2683		if (!locked_ref) {
 2684			locked_ref = btrfs_obtain_ref_head(trans);
 2685			if (IS_ERR_OR_NULL(locked_ref)) {
 2686				if (PTR_ERR(locked_ref) == -EAGAIN) {
 2687					continue;
 2688				} else {
 2689					break;
 2690				}
 2691			}
 2692			count++;
 2693		}
 2694		/*
 2695		 * We need to try and merge add/drops of the same ref since we
 2696		 * can run into issues with relocate dropping the implicit ref
 2697		 * and then it being added back again before the drop can
 2698		 * finish.  If we merged anything we need to re-loop so we can
 2699		 * get a good ref.
 2700		 * Or we can get node references of the same type that weren't
 2701		 * merged when created due to bumps in the tree mod seq, and
 2702		 * we need to merge them to prevent adding an inline extent
 2703		 * backref before dropping it (triggering a BUG_ON at
 2704		 * insert_inline_extent_backref()).
 2705		 */
 2706		spin_lock(&locked_ref->lock);
 2707		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
 2708
 2709		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
 2710						      &actual_count);
 2711		if (ret < 0 && ret != -EAGAIN) {
 2712			/*
 2713			 * Error, btrfs_run_delayed_refs_for_head already
 2714			 * unlocked everything so just bail out
 2715			 */
 2716			return ret;
 2717		} else if (!ret) {
 2718			/*
 2719			 * Success, perform the usual cleanup of a processed
 2720			 * head
 2721			 */
 2722			ret = cleanup_ref_head(trans, locked_ref);
 2723			if (ret > 0 ) {
 2724				/* We dropped our lock, we need to loop. */
 2725				ret = 0;
 2726				continue;
 2727			} else if (ret) {
 2728				return ret;
 2729			}
 2730		}
 2731
 2732		/*
 2733		 * Either success case or btrfs_run_delayed_refs_for_head
 2734		 * returned -EAGAIN, meaning we need to select another head
 2735		 */
 2736
 2737		locked_ref = NULL;
 2738		cond_resched();
 2739	} while ((nr != -1 && count < nr) || locked_ref);
 2740
 2741	/*
 2742	 * We don't want to include ref heads since we can have empty ref heads
 2743	 * and those will drastically skew our runtime down since we just do
 2744	 * accounting, no actual extent tree updates.
 2745	 */
 2746	if (actual_count > 0) {
 2747		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
 2748		u64 avg;
 2749
 2750		/*
 2751		 * We weigh the current average higher than our current runtime
 2752		 * to avoid large swings in the average.
 2753		 */
 2754		spin_lock(&delayed_refs->lock);
 2755		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
 2756		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
 2757		spin_unlock(&delayed_refs->lock);
 2758	}
 2759	return 0;
 2760}
 2761
 2762#ifdef SCRAMBLE_DELAYED_REFS
 2763/*
 2764 * Normally delayed refs get processed in ascending bytenr order. This
 2765 * correlates in most cases to the order added. To expose dependencies on this
 2766 * order, we start to process the tree in the middle instead of the beginning
 2767 */
 2768static u64 find_middle(struct rb_root *root)
 2769{
 2770	struct rb_node *n = root->rb_node;
 2771	struct btrfs_delayed_ref_node *entry;
 2772	int alt = 1;
 2773	u64 middle;
 2774	u64 first = 0, last = 0;
 2775
 2776	n = rb_first(root);
 2777	if (n) {
 2778		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
 2779		first = entry->bytenr;
 2780	}
 2781	n = rb_last(root);
 2782	if (n) {
 2783		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
 2784		last = entry->bytenr;
 2785	}
 2786	n = root->rb_node;
 2787
 2788	while (n) {
 2789		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
 2790		WARN_ON(!entry->in_tree);
 2791
 2792		middle = entry->bytenr;
 2793
 2794		if (alt)
 2795			n = n->rb_left;
 2796		else
 2797			n = n->rb_right;
 2798
 2799		alt = 1 - alt;
 2800	}
 2801	return middle;
 2802}
 2803#endif
 2804
 2805static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
 2806{
 2807	u64 num_bytes;
 2808
 2809	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
 2810			     sizeof(struct btrfs_extent_inline_ref));
 2811	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
 2812		num_bytes += heads * sizeof(struct btrfs_tree_block_info);
 2813
 2814	/*
 2815	 * We don't ever fill up leaves all the way so multiply by 2 just to be
 2816	 * closer to what we're really going to want to use.
 2817	 */
 2818	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
 2819}
 2820
 2821/*
 2822 * Takes the number of bytes to be csumm'ed and figures out how many leaves it
 2823 * would require to store the csums for that many bytes.
 2824 */
 2825u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
 2826{
 2827	u64 csum_size;
 2828	u64 num_csums_per_leaf;
 2829	u64 num_csums;
 2830
 2831	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
 2832	num_csums_per_leaf = div64_u64(csum_size,
 2833			(u64)btrfs_super_csum_size(fs_info->super_copy));
 2834	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
 2835	num_csums += num_csums_per_leaf - 1;
 2836	num_csums = div64_u64(num_csums, num_csums_per_leaf);
 2837	return num_csums;
 2838}
 2839
 2840bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
 2841{
 2842	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
 2843	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 2844	bool ret = false;
 2845	u64 reserved;
 2846
 2847	spin_lock(&global_rsv->lock);
 2848	reserved = global_rsv->reserved;
 2849	spin_unlock(&global_rsv->lock);
 2850
 2851	/*
 2852	 * Since the global reserve is just kind of magic we don't really want
 2853	 * to rely on it to save our bacon, so if our size is more than the
 2854	 * delayed_refs_rsv and the global rsv then it's time to think about
 2855	 * bailing.
 2856	 */
 2857	spin_lock(&delayed_refs_rsv->lock);
 2858	reserved += delayed_refs_rsv->reserved;
 2859	if (delayed_refs_rsv->size >= reserved)
 2860		ret = true;
 2861	spin_unlock(&delayed_refs_rsv->lock);
 2862	return ret;
 2863}
 2864
 2865int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
 2866{
 2867	u64 num_entries =
 2868		atomic_read(&trans->transaction->delayed_refs.num_entries);
 2869	u64 avg_runtime;
 2870	u64 val;
 2871
 2872	smp_mb();
 2873	avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
 2874	val = num_entries * avg_runtime;
 2875	if (val >= NSEC_PER_SEC)
 2876		return 1;
 2877	if (val >= NSEC_PER_SEC / 2)
 2878		return 2;
 2879
 2880	return btrfs_check_space_for_delayed_refs(trans->fs_info);
 2881}
 2882
/*
 * Request handed to the worker that runs delayed refs asynchronously; filled
 * in by btrfs_async_run_delayed_refs() and consumed by
 * delayed_ref_async_start().
 */
struct async_delayed_refs {
	/* Root used to join the transaction in the worker. */
	struct btrfs_root *root;
	/* The worker skips the run if it joins a transaction newer than this. */
	u64 transid;
	/* Passed to btrfs_run_delayed_refs() as its count argument. */
	int count;
	/* First error hit by the worker, 0 if none. */
	int error;
	/*
	 * Non-zero: the submitter waits on @wait and frees the request.
	 * Zero: the worker frees the request when it is done.
	 */
	int sync;
	/* Completed by the worker when @sync is set. */
	struct completion wait;
	/* Work item queued on the extent workers. */
	struct btrfs_work work;
};
 2892
/* Map an embedded btrfs_work back to its enclosing async_delayed_refs. */
static inline struct async_delayed_refs *
to_async_delayed_refs(struct btrfs_work *work)
{
	return container_of(work, struct async_delayed_refs, work);
}
 2898
 2899static void delayed_ref_async_start(struct btrfs_work *work)
 2900{
 2901	struct async_delayed_refs *async = to_async_delayed_refs(work);
 2902	struct btrfs_trans_handle *trans;
 2903	struct btrfs_fs_info *fs_info = async->root->fs_info;
 2904	int ret;
 2905
 2906	/* if the commit is already started, we don't need to wait here */
 2907	if (btrfs_transaction_blocked(fs_info))
 2908		goto done;
 2909
 2910	trans = btrfs_join_transaction(async->root);
 2911	if (IS_ERR(trans)) {
 2912		async->error = PTR_ERR(trans);
 2913		goto done;
 2914	}
 2915
 2916	/*
 2917	 * trans->sync means that when we call end_transaction, we won't
 2918	 * wait on delayed refs
 2919	 */
 2920	trans->sync = true;
 2921
 2922	/* Don't bother flushing if we got into a different transaction */
 2923	if (trans->transid > async->transid)
 2924		goto end;
 2925
 2926	ret = btrfs_run_delayed_refs(trans, async->count);
 2927	if (ret)
 2928		async->error = ret;
 2929end:
 2930	ret = btrfs_end_transaction(trans);
 2931	if (ret && !async->error)
 2932		async->error = ret;
 2933done:
 2934	if (async->sync)
 2935		complete(&async->wait);
 2936	else
 2937		kfree(async);
 2938}
 2939
 2940int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
 2941				 unsigned long count, u64 transid, int wait)
 2942{
 2943	struct async_delayed_refs *async;
 2944	int ret;
 2945
 2946	async = kmalloc(sizeof(*async), GFP_NOFS);
 2947	if (!async)
 2948		return -ENOMEM;
 2949
 2950	async->root = fs_info->tree_root;
 2951	async->count = count;
 2952	async->error = 0;
 2953	async->transid = transid;
 2954	if (wait)
 2955		async->sync = 1;
 2956	else
 2957		async->sync = 0;
 2958	init_completion(&async->wait);
 2959
 2960	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
 2961			delayed_ref_async_start, NULL, NULL);
 2962
 2963	btrfs_queue_work(fs_info->extent_workers, &async->work);
 2964
 2965	if (wait) {
 2966		wait_for_completion(&async->wait);
 2967		ret = async->error;
 2968		kfree(async);
 2969		return ret;
 2970	}
 2971	return 0;
 2972}
 2973
 2974/*
 2975 * this starts processing the delayed reference count updates and
 2976 * extent insertions we have queued up so far.  count can be
 2977 * 0, which means to process everything in the tree at the start
 2978 * of the run (but not newly added entries), or it can be some target
 2979 * number you'd like to process.
 2980 *
 2981 * Returns 0 on success or if called with an aborted transaction
 2982 * Returns <0 on error and aborts the transaction
 2983 */
 2984int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 2985			   unsigned long count)
 2986{
 2987	struct btrfs_fs_info *fs_info = trans->fs_info;
 2988	struct rb_node *node;
 2989	struct btrfs_delayed_ref_root *delayed_refs;
 2990	struct btrfs_delayed_ref_head *head;
 2991	int ret;
 2992	int run_all = count == (unsigned long)-1;
 2993
 2994	/* We'll clean this up in btrfs_cleanup_transaction */
 2995	if (trans->aborted)
 2996		return 0;
 2997
 2998	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
 2999		return 0;
 3000
 3001	delayed_refs = &trans->transaction->delayed_refs;
 3002	if (count == 0)
 3003		count = atomic_read(&delayed_refs->num_entries) * 2;
 3004
 3005again:
 3006#ifdef SCRAMBLE_DELAYED_REFS
 3007	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 3008#endif
 3009	ret = __btrfs_run_delayed_refs(trans, count);
 3010	if (ret < 0) {
 3011		btrfs_abort_transaction(trans, ret);
 3012		return ret;
 3013	}
 3014
 3015	if (run_all) {
 3016		if (!list_empty(&trans->new_bgs))
 3017			btrfs_create_pending_block_groups(trans);
 3018
 3019		spin_lock(&delayed_refs->lock);
 3020		node = rb_first_cached(&delayed_refs->href_root);
 3021		if (!node) {
 3022			spin_unlock(&delayed_refs->lock);
 3023			goto out;
 3024		}
 3025		head = rb_entry(node, struct btrfs_delayed_ref_head,
 3026				href_node);
 3027		refcount_inc(&head->refs);
 3028		spin_unlock(&delayed_refs->lock);
 3029
 3030		/* Mutex was contended, block until it's released and retry. */
 3031		mutex_lock(&head->mutex);
 3032		mutex_unlock(&head->mutex);
 3033
 3034		btrfs_put_delayed_ref_head(head);
 3035		cond_resched();
 3036		goto again;
 3037	}
 3038out:
 3039	return 0;
 3040}
 3041
 3042int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 3043				struct btrfs_fs_info *fs_info,
 3044				u64 bytenr, u64 num_bytes, u64 flags,
 3045				int level, int is_data)
 3046{
 3047	struct btrfs_delayed_extent_op *extent_op;
 3048	int ret;
 3049
 3050	extent_op = btrfs_alloc_delayed_extent_op();
 3051	if (!extent_op)
 3052		return -ENOMEM;
 3053
 3054	extent_op->flags_to_set = flags;
 3055	extent_op->update_flags = true;
 3056	extent_op->update_key = false;
 3057	extent_op->is_data = is_data ? true : false;
 3058	extent_op->level = level;
 3059
 3060	ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
 3061					  num_bytes, extent_op);
 3062	if (ret)
 3063		btrfs_free_delayed_extent_op(extent_op);
 3064	return ret;
 3065}
 3066
 3067static noinline int check_delayed_ref(struct btrfs_root *root,
 3068				      struct btrfs_path *path,
 3069				      u64 objectid, u64 offset, u64 bytenr)
 3070{
 3071	struct btrfs_delayed_ref_head *head;
 3072	struct btrfs_delayed_ref_node *ref;
 3073	struct btrfs_delayed_data_ref *data_ref;
 3074	struct btrfs_delayed_ref_root *delayed_refs;
 3075	struct btrfs_transaction *cur_trans;
 3076	struct rb_node *node;
 3077	int ret = 0;
 3078
 3079	spin_lock(&root->fs_info->trans_lock);
 3080	cur_trans = root->fs_info->running_transaction;
 3081	if (cur_trans)
 3082		refcount_inc(&cur_trans->use_count);
 3083	spin_unlock(&root->fs_info->trans_lock);
 3084	if (!cur_trans)
 3085		return 0;
 3086
 3087	delayed_refs = &cur_trans->delayed_refs;
 3088	spin_lock(&delayed_refs->lock);
 3089	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
 3090	if (!head) {
 3091		spin_unlock(&delayed_refs->lock);
 3092		btrfs_put_transaction(cur_trans);
 3093		return 0;
 3094	}
 3095
 3096	if (!mutex_trylock(&head->mutex)) {
 3097		refcount_inc(&head->refs);
 3098		spin_unlock(&delayed_refs->lock);
 3099
 3100		btrfs_release_path(path);
 3101
 3102		/*
 3103		 * Mutex was contended, block until it's released and let
 3104		 * caller try again
 3105		 */
 3106		mutex_lock(&head->mutex);
 3107		mutex_unlock(&head->mutex);
 3108		btrfs_put_delayed_ref_head(head);
 3109		btrfs_put_transaction(cur_trans);
 3110		return -EAGAIN;
 3111	}
 3112	spin_unlock(&delayed_refs->lock);
 3113
 3114	spin_lock(&head->lock);
 3115	/*
 3116	 * XXX: We should replace this with a proper search function in the
 3117	 * future.
 3118	 */
 3119	for (node = rb_first_cached(&head->ref_tree); node;
 3120	     node = rb_next(node)) {
 3121		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
 3122		/* If it's a shared ref we know a cross reference exists */
 3123		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
 3124			ret = 1;
 3125			break;
 3126		}
 3127
 3128		data_ref = btrfs_delayed_node_to_data_ref(ref);
 3129
 3130		/*
 3131		 * If our ref doesn't match the one we're currently looking at
 3132		 * then we have a cross reference.
 3133		 */
 3134		if (data_ref->root != root->root_key.objectid ||
 3135		    data_ref->objectid != objectid ||
 3136		    data_ref->offset != offset) {
 3137			ret = 1;
 3138			break;
 3139		}
 3140	}
 3141	spin_unlock(&head->lock);
 3142	mutex_unlock(&head->mutex);
 3143	btrfs_put_transaction(cur_trans);
 3144	return ret;
 3145}
 3146
 3147static noinline int check_committed_ref(struct btrfs_root *root,
 3148					struct btrfs_path *path,
 3149					u64 objectid, u64 offset, u64 bytenr)
 3150{
 3151	struct btrfs_fs_info *fs_info = root->fs_info;
 3152	struct btrfs_root *extent_root = fs_info->extent_root;
 3153	struct extent_buffer *leaf;
 3154	struct btrfs_extent_data_ref *ref;
 3155	struct btrfs_extent_inline_ref *iref;
 3156	struct btrfs_extent_item *ei;
 3157	struct btrfs_key key;
 3158	u32 item_size;
 3159	int type;
 3160	int ret;
 3161
 3162	key.objectid = bytenr;
 3163	key.offset = (u64)-1;
 3164	key.type = BTRFS_EXTENT_ITEM_KEY;
 3165
 3166	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 3167	if (ret < 0)
 3168		goto out;
 3169	BUG_ON(ret == 0); /* Corruption */
 3170
 3171	ret = -ENOENT;
 3172	if (path->slots[0] == 0)
 3173		goto out;
 3174
 3175	path->slots[0]--;
 3176	leaf = path->nodes[0];
 3177	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 3178
 3179	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
 3180		goto out;
 3181
 3182	ret = 1;
 3183	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 3184	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 3185
 3186	if (item_size != sizeof(*ei) +
 3187	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
 3188		goto out;
 3189
 3190	if (btrfs_extent_generation(leaf, ei) <=
 3191	    btrfs_root_last_snapshot(&root->root_item))
 3192		goto out;
 3193
 3194	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
 3195
 3196	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
 3197	if (type != BTRFS_EXTENT_DATA_REF_KEY)
 3198		goto out;
 3199
 3200	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
 3201	if (btrfs_extent_refs(leaf, ei) !=
 3202	    btrfs_extent_data_ref_count(leaf, ref) ||
 3203	    btrfs_extent_data_ref_root(leaf, ref) !=
 3204	    root->root_key.objectid ||
 3205	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
 3206	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
 3207		goto out;
 3208
 3209	ret = 0;
 3210out:
 3211	return ret;
 3212}
 3213
 3214int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
 3215			  u64 bytenr)
 3216{
 3217	struct btrfs_path *path;
 3218	int ret;
 3219
 3220	path = btrfs_alloc_path();
 3221	if (!path)
 3222		return -ENOMEM;
 3223
 3224	do {
 3225		ret = check_committed_ref(root, path, objectid,
 3226					  offset, bytenr);
 3227		if (ret && ret != -ENOENT)
 3228			goto out;
 3229
 3230		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
 3231	} while (ret == -EAGAIN);
 3232
 3233out:
 3234	btrfs_free_path(path);
 3235	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
 3236		WARN_ON(ret > 0);
 3237	return ret;
 3238}
 3239
 3240static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 3241			   struct btrfs_root *root,
 3242			   struct extent_buffer *buf,
 3243			   int full_backref, int inc)
 3244{
 3245	struct btrfs_fs_info *fs_info = root->fs_info;
 3246	u64 bytenr;
 3247	u64 num_bytes;
 3248	u64 parent;
 3249	u64 ref_root;
 3250	u32 nritems;
 3251	struct btrfs_key key;
 3252	struct btrfs_file_extent_item *fi;
 3253	int i;
 3254	int level;
 3255	int ret = 0;
 3256	int (*process_func)(struct btrfs_trans_handle *,
 3257			    struct btrfs_root *,
 3258			    u64, u64, u64, u64, u64, u64);
 3259
 3260
 3261	if (btrfs_is_testing(fs_info))
 3262		return 0;
 3263
 3264	ref_root = btrfs_header_owner(buf);
 3265	nritems = btrfs_header_nritems(buf);
 3266	level = btrfs_header_level(buf);
 3267
 3268	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
 3269		return 0;
 3270
 3271	if (inc)
 3272		process_func = btrfs_inc_extent_ref;
 3273	else
 3274		process_func = btrfs_free_extent;
 3275
 3276	if (full_backref)
 3277		parent = buf->start;
 3278	else
 3279		parent = 0;
 3280
 3281	for (i = 0; i < nritems; i++) {
 3282		if (level == 0) {
 3283			btrfs_item_key_to_cpu(buf, &key, i);
 3284			if (key.type != BTRFS_EXTENT_DATA_KEY)
 3285				continue;
 3286			fi = btrfs_item_ptr(buf, i,
 3287					    struct btrfs_file_extent_item);
 3288			if (btrfs_file_extent_type(buf, fi) ==
 3289			    BTRFS_FILE_EXTENT_INLINE)
 3290				continue;
 3291			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
 3292			if (bytenr == 0)
 3293				continue;
 3294
 3295			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
 3296			key.offset -= btrfs_file_extent_offset(buf, fi);
 3297			ret = process_func(trans, root, bytenr, num_bytes,
 3298					   parent, ref_root, key.objectid,
 3299					   key.offset);
 3300			if (ret)
 3301				goto fail;
 3302		} else {
 3303			bytenr = btrfs_node_blockptr(buf, i);
 3304			num_bytes = fs_info->nodesize;
 3305			ret = process_func(trans, root, bytenr, num_bytes,
 3306					   parent, ref_root, level - 1, 0);
 3307			if (ret)
 3308				goto fail;
 3309		}
 3310	}
 3311	return 0;
 3312fail:
 3313	return ret;
 3314}
 3315
 3316int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 3317		  struct extent_buffer *buf, int full_backref)
 3318{
 3319	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
 3320}
 3321
 3322int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 3323		  struct extent_buffer *buf, int full_backref)
 3324{
 3325	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
 3326}
 3327
 3328static int write_one_cache_group(struct btrfs_trans_handle *trans,
 3329				 struct btrfs_fs_info *fs_info,
 3330				 struct btrfs_path *path,
 3331				 struct btrfs_block_group_cache *cache)
 3332{
 3333	int ret;
 3334	struct btrfs_root *extent_root = fs_info->extent_root;
 3335	unsigned long bi;
 3336	struct extent_buffer *leaf;
 3337
 3338	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
 3339	if (ret) {
 3340		if (ret > 0)
 3341			ret = -ENOENT;
 3342		goto fail;
 3343	}
 3344
 3345	leaf = path->nodes[0];
 3346	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
 3347	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
 3348	btrfs_mark_buffer_dirty(leaf);
 3349fail:
 3350	btrfs_release_path(path);
 3351	return ret;
 3352
 3353}
 3354
 3355static struct btrfs_block_group_cache *
 3356next_block_group(struct btrfs_fs_info *fs_info,
 3357		 struct btrfs_block_group_cache *cache)
 3358{
 3359	struct rb_node *node;
 3360
 3361	spin_lock(&fs_info->block_group_cache_lock);
 3362
 3363	/* If our block group was removed, we need a full search. */
 3364	if (RB_EMPTY_NODE(&cache->cache_node)) {
 3365		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
 3366
 3367		spin_unlock(&fs_info->block_group_cache_lock);
 3368		btrfs_put_block_group(cache);
 3369		cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
 3370	}
 3371	node = rb_next(&cache->cache_node);
 3372	btrfs_put_block_group(cache);
 3373	if (node) {
 3374		cache = rb_entry(node, struct btrfs_block_group_cache,
 3375				 cache_node);
 3376		btrfs_get_block_group(cache);
 3377	} else
 3378		cache = NULL;
 3379	spin_unlock(&fs_info->block_group_cache_lock);
 3380	return cache;
 3381}
 3382
/*
 * Prepare the free space cache inode of @block_group so that the cache can
 * be written out in the transaction of @trans.
 *
 * The function looks up the block group's free space inode (creating it on
 * the first -ENOENT), resets the inode generation to 0, truncates existing
 * cache contents and finally preallocates room for the new cache.  The
 * outcome is stored in block_group->disk_cache_state:
 *   BTRFS_DC_SETUP    the cache is ready to be written
 *   BTRFS_DC_WRITTEN  nothing needs to be written for this block group
 *   BTRFS_DC_ERROR    every other outcome (the initial value of 'dcs')
 *
 * Every path that goes past the two early returns releases @path.
 *
 * Returns 0 or a negative errno.  Note that 0 does not imply that the cache
 * was set up; callers look at disk_cache_state for that.
 */
static int cache_save_setup(struct btrfs_block_group_cache *block_group,
			    struct btrfs_trans_handle *trans,
			    struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *root = fs_info->tree_root;
	struct inode *inode = NULL;
	struct extent_changeset *data_reserved = NULL;
	u64 alloc_hint = 0;
	int dcs = BTRFS_DC_ERROR;
	u64 num_pages = 0;
	int retries = 0;
	int ret = 0;

	/*
	 * If this block group is smaller than 100 megs don't bother caching the
	 * block group.
	 */
	if (block_group->key.offset < (100 * SZ_1M)) {
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
		spin_unlock(&block_group->lock);
		return 0;
	}

	/* Nothing to do on an aborted transaction; the state is left alone. */
	if (trans->aborted)
		return 0;
again:
	inode = lookup_free_space_inode(fs_info, block_group, path);
	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
		ret = PTR_ERR(inode);
		btrfs_release_path(path);
		goto out;
	}

	/* -ENOENT: create the inode once and look it up again. */
	if (IS_ERR(inode)) {
		BUG_ON(retries);
		retries++;

		/* No inode is created for a read-only block group. */
		if (block_group->ro)
			goto out_free;

		ret = create_free_space_inode(fs_info, trans, block_group,
					      path);
		if (ret)
			goto out_free;
		goto again;
	}

	/*
	 * We want to set the generation to 0, that way if anything goes wrong
	 * from here on out we know not to trust this cache when we load up next
	 * time.
	 */
	BTRFS_I(inode)->generation = 0;
	ret = btrfs_update_inode(trans, root, inode);
	if (ret) {
		/*
		 * So theoretically we could recover from this, simply set the
		 * super cache generation to 0 so we know to invalidate the
		 * cache, but then we'd have to keep track of the block groups
		 * that fail this way so we know we _have_ to reset this cache
		 * before the next commit or risk reading stale cache.  So to
		 * limit our exposure to horrible edge cases lets just abort the
		 * transaction, this only happens in really bad situations
		 * anyway.
		 */
		btrfs_abort_transaction(trans, ret);
		goto out_put;
	}
	/* NOTE(review): ret is always 0 here, so this WARN_ON cannot fire. */
	WARN_ON(ret);

	/* We've already setup this transaction, go ahead and exit */
	if (block_group->cache_generation == trans->transid &&
	    i_size_read(inode)) {
		dcs = BTRFS_DC_SETUP;
		goto out_put;
	}

	/* Drop the contents of a cache left over from an earlier setup. */
	if (i_size_read(inode) > 0) {
		ret = btrfs_check_trunc_cache_free_space(fs_info,
					&fs_info->global_block_rsv);
		if (ret)
			goto out_put;

		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
		if (ret)
			goto out_put;
	}

	spin_lock(&block_group->lock);
	if (block_group->cached != BTRFS_CACHE_FINISHED ||
	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
		/*
		 * don't bother trying to write stuff out _if_
		 * a) we're not cached,
		 * b) we're with nospace_cache mount option,
		 * c) we're with v2 space_cache (FREE_SPACE_TREE).
		 */
		dcs = BTRFS_DC_WRITTEN;
		spin_unlock(&block_group->lock);
		goto out_put;
	}
	spin_unlock(&block_group->lock);

	/*
	 * We hit an ENOSPC when setting up the cache in this transaction, just
	 * skip doing the setup, we've already cleared the cache so we're safe.
	 */
	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
		ret = -ENOSPC;
		goto out_put;
	}

	/*
	 * Try to preallocate enough space based on how big the block group is.
	 * Keep in mind this has to include any pinned space which could end up
	 * taking up quite a bit since it's not folded into the other space
	 * cache.
	 */
	num_pages = div_u64(block_group->key.offset, SZ_256M);
	if (!num_pages)
		num_pages = 1;

	/*
	 * 16 pages per started 256M of block group; despite its name,
	 * num_pages holds a byte count from here on.
	 */
	num_pages *= 16;
	num_pages *= PAGE_SIZE;

	ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
	if (ret)
		goto out_put;

	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
					      num_pages, num_pages,
					      &alloc_hint);
	/*
	 * Our cache requires contiguous chunks so that we don't modify a bunch
	 * of metadata or split extents when writing the cache out, which means
	 * we can enospc if we are heavily fragmented in addition to just normal
	 * out of space conditions.  So if we hit this just skip setting up any
	 * other block groups for this transaction, maybe we'll unpin enough
	 * space the next time around.
	 */
	if (!ret)
		dcs = BTRFS_DC_SETUP;
	else if (ret == -ENOSPC)
		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);

out_put:
	iput(inode);
out_free:
	btrfs_release_path(path);
out:
	/* Publish the result; the generation is only bumped on a real setup. */
	spin_lock(&block_group->lock);
	if (!ret && dcs == BTRFS_DC_SETUP)
		block_group->cache_generation = trans->transid;
	block_group->disk_cache_state = dcs;
	spin_unlock(&block_group->lock);

	extent_changeset_free(data_reserved);
	return ret;
}
 3544
 3545int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
 3546			    struct btrfs_fs_info *fs_info)
 3547{
 3548	struct btrfs_block_group_cache *cache, *tmp;
 3549	struct btrfs_transaction *cur_trans = trans->transaction;
 3550	struct btrfs_path *path;
 3551
 3552	if (list_empty(&cur_trans->dirty_bgs) ||
 3553	    !btrfs_test_opt(fs_info, SPACE_CACHE))
 3554		return 0;
 3555
 3556	path = btrfs_alloc_path();
 3557	if (!path)
 3558		return -ENOMEM;
 3559
 3560	/* Could add new block groups, use _safe just in case */
 3561	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
 3562				 dirty_list) {
 3563		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
 3564			cache_save_setup(cache, trans, path);
 3565	}
 3566
 3567	btrfs_free_path(path);
 3568	return 0;
 3569}
 3570
 3571/*
 3572 * transaction commit does final block group cache writeback during a
 3573 * critical section where nothing is allowed to change the FS.  This is
 3574 * required in order for the cache to actually match the block group,
 3575 * but can introduce a lot of latency into the commit.
 3576 *
 3577 * So, btrfs_start_dirty_block_groups is here to kick off block group
 3578 * cache IO.  There's a chance we'll have to redo some of it if the
 3579 * block group changes again during the commit, but it greatly reduces
 3580 * the commit latency by getting rid of the easy block groups while
 3581 * we're still allowing others to join the commit.
 3582 */
 3583int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
 3584{
 3585	struct btrfs_fs_info *fs_info = trans->fs_info;
 3586	struct btrfs_block_group_cache *cache;
 3587	struct btrfs_transaction *cur_trans = trans->transaction;
 3588	int ret = 0;
 3589	int should_put;
 3590	struct btrfs_path *path = NULL;
 3591	LIST_HEAD(dirty);
 3592	struct list_head *io = &cur_trans->io_bgs;
 3593	int num_started = 0;
 3594	int loops = 0;
 3595
 3596	spin_lock(&cur_trans->dirty_bgs_lock);
 3597	if (list_empty(&cur_trans->dirty_bgs)) {
 3598		spin_unlock(&cur_trans->dirty_bgs_lock);
 3599		return 0;
 3600	}
 3601	list_splice_init(&cur_trans->dirty_bgs, &dirty);
 3602	spin_unlock(&cur_trans->dirty_bgs_lock);
 3603
 3604again:
 3605	/*
 3606	 * make sure all the block groups on our dirty list actually
 3607	 * exist
 3608	 */
 3609	btrfs_create_pending_block_groups(trans);
 3610
 3611	if (!path) {
 3612		path = btrfs_alloc_path();
 3613		if (!path)
 3614			return -ENOMEM;
 3615	}
 3616
 3617	/*
 3618	 * cache_write_mutex is here only to save us from balance or automatic
 3619	 * removal of empty block groups deleting this block group while we are
 3620	 * writing out the cache
 3621	 */
 3622	mutex_lock(&trans->transaction->cache_write_mutex);
 3623	while (!list_empty(&dirty)) {
 3624		bool drop_reserve = true;
 3625
 3626		cache = list_first_entry(&dirty,
 3627					 struct btrfs_block_group_cache,
 3628					 dirty_list);
 3629		/*
 3630		 * this can happen if something re-dirties a block
 3631		 * group that is already under IO.  Just wait for it to
 3632		 * finish and then do it all again
 3633		 */
 3634		if (!list_empty(&cache->io_list)) {
 3635			list_del_init(&cache->io_list);
 3636			btrfs_wait_cache_io(trans, cache, path);
 3637			btrfs_put_block_group(cache);
 3638		}
 3639
 3640
 3641		/*
 3642		 * btrfs_wait_cache_io uses the cache->dirty_list to decide
 3643		 * if it should update the cache_state.  Don't delete
 3644		 * until after we wait.
 3645		 *
 3646		 * Since we're not running in the commit critical section
 3647		 * we need the dirty_bgs_lock to protect from update_block_group
 3648		 */
 3649		spin_lock(&cur_trans->dirty_bgs_lock);
 3650		list_del_init(&cache->dirty_list);
 3651		spin_unlock(&cur_trans->dirty_bgs_lock);
 3652
 3653		should_put = 1;
 3654
 3655		cache_save_setup(cache, trans, path);
 3656
 3657		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
 3658			cache->io_ctl.inode = NULL;
 3659			ret = btrfs_write_out_cache(fs_info, trans,
 3660						    cache, path);
 3661			if (ret == 0 && cache->io_ctl.inode) {
 3662				num_started++;
 3663				should_put = 0;
 3664
 3665				/*
 3666				 * The cache_write_mutex is protecting the
 3667				 * io_list, also refer to the definition of
 3668				 * btrfs_transaction::io_bgs for more details
 3669				 */
 3670				list_add_tail(&cache->io_list, io);
 3671			} else {
 3672				/*
 3673				 * if we failed to write the cache, the
 3674				 * generation will be bad and life goes on
 3675				 */
 3676				ret = 0;
 3677			}
 3678		}
 3679		if (!ret) {
 3680			ret = write_one_cache_group(trans, fs_info,
 3681						    path, cache);
 3682			/*
 3683			 * Our block group might still be attached to the list
 3684			 * of new block groups in the transaction handle of some
 3685			 * other task (struct btrfs_trans_handle->new_bgs). This
 3686			 * means its block group item isn't yet in the extent
 3687			 * tree. If this happens ignore the error, as we will
 3688			 * try again later in the critical section of the
 3689			 * transaction commit.
 3690			 */
 3691			if (ret == -ENOENT) {
 3692				ret = 0;
 3693				spin_lock(&cur_trans->dirty_bgs_lock);
 3694				if (list_empty(&cache->dirty_list)) {
 3695					list_add_tail(&cache->dirty_list,
 3696						      &cur_trans->dirty_bgs);
 3697					btrfs_get_block_group(cache);
 3698					drop_reserve = false;
 3699				}
 3700				spin_unlock(&cur_trans->dirty_bgs_lock);
 3701			} else if (ret) {
 3702				btrfs_abort_transaction(trans, ret);
 3703			}
 3704		}
 3705
 3706		/* if it's not on the io list, we need to put the block group */
 3707		if (should_put)
 3708			btrfs_put_block_group(cache);
 3709		if (drop_reserve)
 3710			btrfs_delayed_refs_rsv_release(fs_info, 1);
 3711
 3712		if (ret)
 3713			break;
 3714
 3715		/*
 3716		 * Avoid blocking other tasks for too long. It might even save
 3717		 * us from writing caches for block groups that are going to be
 3718		 * removed.
 3719		 */
 3720		mutex_unlock(&trans->transaction->cache_write_mutex);
 3721		mutex_lock(&trans->transaction->cache_write_mutex);
 3722	}
 3723	mutex_unlock(&trans->transaction->cache_write_mutex);
 3724
 3725	/*
 3726	 * go through delayed refs for all the stuff we've just kicked off
 3727	 * and then loop back (just once)
 3728	 */
 3729	ret = btrfs_run_delayed_refs(trans, 0);
 3730	if (!ret && loops == 0) {
 3731		loops++;
 3732		spin_lock(&cur_trans->dirty_bgs_lock);
 3733		list_splice_init(&cur_trans->dirty_bgs, &dirty);
 3734		/*
 3735		 * dirty_bgs_lock protects us from concurrent block group
 3736		 * deletes too (not just cache_write_mutex).
 3737		 */
 3738		if (!list_empty(&dirty)) {
 3739			spin_unlock(&cur_trans->dirty_bgs_lock);
 3740			goto again;
 3741		}
 3742		spin_unlock(&cur_trans->dirty_bgs_lock);
 3743	} else if (ret < 0) {
 3744		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
 3745	}
 3746
 3747	btrfs_free_path(path);
 3748	return ret;
 3749}
 3750
/*
 * Write out every block group on the transaction's dirty_bgs list: set up
 * and write its free space cache when possible and update its block group
 * item in the extent tree.  Once the list is drained, wait for all cache IO
 * queued on the transaction's io_bgs list.
 *
 * After the first failure the remaining block groups are still taken off the
 * dirty list and have their reference and reservation dropped, and
 * cache_save_setup() is still called for them, but delayed refs are no
 * longer run and neither caches nor block group items are written.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the first
 * error hit while processing the list.
 */
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group_cache *cache;
	struct btrfs_transaction *cur_trans = trans->transaction;
	int ret = 0;
	int should_put;
	struct btrfs_path *path;
	struct list_head *io = &cur_trans->io_bgs;
	/* NOTE(review): num_started is only ever incremented, never read. */
	int num_started = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * Even though we are in the critical section of the transaction commit,
	 * we can still have concurrent tasks adding elements to this
	 * transaction's list of dirty block groups. These tasks correspond to
	 * endio free space workers started when writeback finishes for a
	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
	 * allocate new block groups as a result of COWing nodes of the root
	 * tree when updating the free space inode. The writeback for the space
	 * caches is triggered by an earlier call to
	 * btrfs_start_dirty_block_groups() and iterations of the following
	 * loop.
	 * Also we want to do the cache_save_setup first and then run the
	 * delayed refs to make sure we have the best chance at doing this all
	 * in one shot.
	 */
	spin_lock(&cur_trans->dirty_bgs_lock);
	while (!list_empty(&cur_trans->dirty_bgs)) {
		cache = list_first_entry(&cur_trans->dirty_bgs,
					 struct btrfs_block_group_cache,
					 dirty_list);

		/*
		 * this can happen if cache_save_setup re-dirties a block
		 * group that is already under IO.  Just wait for it to
		 * finish and then do it all again
		 */
		if (!list_empty(&cache->io_list)) {
			/* The lock is dropped around the wait for the IO. */
			spin_unlock(&cur_trans->dirty_bgs_lock);
			list_del_init(&cache->io_list);
			btrfs_wait_cache_io(trans, cache, path);
			btrfs_put_block_group(cache);
			spin_lock(&cur_trans->dirty_bgs_lock);
		}

		/*
		 * don't remove from the dirty list until after we've waited
		 * on any pending IO
		 */
		list_del_init(&cache->dirty_list);
		spin_unlock(&cur_trans->dirty_bgs_lock);
		should_put = 1;

		cache_save_setup(cache, trans, path);

		/* (unsigned long)-1 asks for all delayed refs to be run. */
		if (!ret)
			ret = btrfs_run_delayed_refs(trans,
						     (unsigned long) -1);

		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
			cache->io_ctl.inode = NULL;
			ret = btrfs_write_out_cache(fs_info, trans,
						    cache, path);
			if (ret == 0 && cache->io_ctl.inode) {
				/* IO started: the io list keeps our reference. */
				num_started++;
				should_put = 0;
				list_add_tail(&cache->io_list, io);
			} else {
				/*
				 * if we failed to write the cache, the
				 * generation will be bad and life goes on
				 */
				ret = 0;
			}
		}
		if (!ret) {
			ret = write_one_cache_group(trans, fs_info,
						    path, cache);
			/*
			 * One of the free space endio workers might have
			 * created a new block group while updating a free space
			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
			 * and hasn't released its transaction handle yet, in
			 * which case the new block group is still attached to
			 * its transaction handle and its creation has not
			 * finished yet (no block group item in the extent tree
			 * yet, etc). If this is the case, wait for all free
			 * space endio workers to finish and retry. This is a
			 * very rare case so no need for a more efficient and
			 * complex approach.
			 */
			if (ret == -ENOENT) {
				wait_event(cur_trans->writer_wait,
				   atomic_read(&cur_trans->num_writers) == 1);
				ret = write_one_cache_group(trans, fs_info,
							    path, cache);
			}
			if (ret)
				btrfs_abort_transaction(trans, ret);
		}

		/* if its not on the io list, we need to put the block group */
		if (should_put)
			btrfs_put_block_group(cache);
		btrfs_delayed_refs_rsv_release(fs_info, 1);
		spin_lock(&cur_trans->dirty_bgs_lock);
	}
	spin_unlock(&cur_trans->dirty_bgs_lock);

	/*
	 * Refer to the definition of io_bgs member for details why it's safe
	 * to use it without any locking
	 */
	while (!list_empty(io)) {
		cache = list_first_entry(io, struct btrfs_block_group_cache,
					 io_list);
		list_del_init(&cache->io_list);
		btrfs_wait_cache_io(trans, cache, path);
		btrfs_put_block_group(cache);
	}

	btrfs_free_path(path);
	return ret;
}
 3879
 3880int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
 3881{
 3882	struct btrfs_block_group_cache *block_group;
 3883	int readonly = 0;
 3884
 3885	block_group = btrfs_lookup_block_group(fs_info, bytenr);
 3886	if (!block_group || block_group->ro)
 3887		readonly = 1;
 3888	if (block_group)
 3889		btrfs_put_block_group(block_group);
 3890	return readonly;
 3891}
 3892
 3893bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 3894{
 3895	struct btrfs_block_group_cache *bg;
 3896	bool ret = true;
 3897
 3898	bg = btrfs_lookup_block_group(fs_info, bytenr);
 3899	if (!bg)
 3900		return false;
 3901
 3902	spin_lock(&bg->lock);
 3903	if (bg->ro)
 3904		ret = false;
 3905	else
 3906		atomic_inc(&bg->nocow_writers);
 3907	spin_unlock(&bg->lock);
 3908
 3909	/* no put on block group, done by btrfs_dec_nocow_writers */
 3910	if (!ret)
 3911		btrfs_put_block_group(bg);
 3912
 3913	return ret;
 3914
 3915}
 3916
 3917void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 3918{
 3919	struct btrfs_block_group_cache *bg;
 3920
 3921	bg = btrfs_lookup_block_group(fs_info, bytenr);
 3922	ASSERT(bg);
 3923	if (atomic_dec_and_test(&bg->nocow_writers))
 3924		wake_up_var(&bg->nocow_writers);
 3925	/*
 3926	 * Once for our lookup and once for the lookup done by a previous call
 3927	 * to btrfs_inc_nocow_writers()
 3928	 */
 3929	btrfs_put_block_group(bg);
 3930	btrfs_put_block_group(bg);
 3931}
 3932
 3933void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
 3934{
 3935	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
 3936}
 3937
 3938static const char *alloc_name(u64 flags)
 3939{
 3940	switch (flags) {
 3941	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
 3942		return "mixed";
 3943	case BTRFS_BLOCK_GROUP_METADATA:
 3944		return "metadata";
 3945	case BTRFS_BLOCK_GROUP_DATA:
 3946		return "data";
 3947	case BTRFS_BLOCK_GROUP_SYSTEM:
 3948		return "system";
 3949	default:
 3950		WARN_ON(1);
 3951		return "invalid-combination";
 3952	};
 3953}
 3954
 3955static int create_space_info(struct btrfs_fs_info *info, u64 flags)
 3956{
 3957
 3958	struct btrfs_space_info *space_info;
 3959	int i;
 3960	int ret;
 3961
 3962	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
 3963	if (!space_info)
 3964		return -ENOMEM;
 3965
 3966	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
 3967				 GFP_KERNEL);
 3968	if (ret) {
 3969		kfree(space_info);
 3970		return ret;
 3971	}
 3972
 3973	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
 3974		INIT_LIST_HEAD(&space_info->block_groups[i]);
 3975	init_rwsem(&space_info->groups_sem);
 3976	spin_lock_init(&space_info->lock);
 3977	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
 3978	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
 3979	init_waitqueue_head(&space_info->wait);
 3980	INIT_LIST_HEAD(&space_info->ro_bgs);
 3981	INIT_LIST_HEAD(&space_info->tickets);
 3982	INIT_LIST_HEAD(&space_info->priority_tickets);
 3983
 3984	ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
 3985				    info->space_info_kobj, "%s",
 3986				    alloc_name(space_info->flags));
 3987	if (ret) {
 3988		percpu_counter_destroy(&space_info->total_bytes_pinned);
 3989		kfree(space_info);
 3990		return ret;
 3991	}
 3992
 3993	list_add_rcu(&space_info->list, &info->space_info);
 3994	if (flags & BTRFS_BLOCK_GROUP_DATA)
 3995		info->data_sinfo = space_info;
 3996
 3997	return ret;
 3998}
 3999
 4000static void update_space_info(struct btrfs_fs_info *info, u64 flags,
 4001			     u64 total_bytes, u64 bytes_used,
 4002			     u64 bytes_readonly,
 4003			     struct btrfs_space_info **space_info)
 4004{
 4005	struct btrfs_space_info *found;
 4006	int factor;
 4007
 4008	factor = btrfs_bg_type_to_factor(flags);
 4009
 4010	found = __find_space_info(info, flags);
 4011	ASSERT(found);
 4012	spin_lock(&found->lock);
 4013	found->total_bytes += total_bytes;
 4014	found->disk_total += total_bytes * factor;
 4015	found->bytes_used += bytes_used;
 4016	found->disk_used += bytes_used * factor;
 4017	found->bytes_readonly += bytes_readonly;
 4018	if (total_bytes > 0)
 4019		found->full = 0;
 4020	space_info_add_new_bytes(info, found, total_bytes -
 4021				 bytes_used - bytes_readonly);
 4022	spin_unlock(&found->lock);
 4023	*space_info = found;
 4024}
 4025
 4026static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 4027{
 4028	u64 extra_flags = chunk_to_extended(flags) &
 4029				BTRFS_EXTENDED_PROFILE_MASK;
 4030
 4031	write_seqlock(&fs_info->profiles_lock);
 4032	if (flags & BTRFS_BLOCK_GROUP_DATA)
 4033		fs_info->avail_data_alloc_bits |= extra_flags;
 4034	if (flags & BTRFS_BLOCK_GROUP_METADATA)
 4035		fs_info->avail_metadata_alloc_bits |= extra_flags;
 4036	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 4037		fs_info->avail_system_alloc_bits |= extra_flags;
 4038	write_sequnlock(&fs_info->profiles_lock);
 4039}
 4040
 4041/*
 4042 * returns target flags in extended format or 0 if restripe for this
 4043 * chunk_type is not in progress
 4044 *
 4045 * should be called with balance_lock held
 4046 */
 4047static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
 4048{
 4049	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 4050	u64 target = 0;
 4051
 4052	if (!bctl)
 4053		return 0;
 4054
 4055	if (flags & BTRFS_BLOCK_GROUP_DATA &&
 4056	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 4057		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
 4058	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
 4059		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 4060		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
 4061	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
 4062		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 4063		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
 4064	}
 4065
 4066	return target;
 4067}
 4068
 4069/*
 4070 * @flags: available profiles in extended format (see ctree.h)
 4071 *
 4072 * Returns reduced profile in chunk format.  If profile changing is in
 4073 * progress (either running or paused) picks the target profile (if it's
 4074 * already available), otherwise falls back to plain reducing.
 4075 */
 4076static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
 4077{
 4078	u64 num_devices = fs_info->fs_devices->rw_devices;
 4079	u64 target;
 4080	u64 raid_type;
 4081	u64 allowed = 0;
 4082
 4083	/*
 4084	 * see if restripe for this chunk_type is in progress, if so
 4085	 * try to reduce to the target profile
 4086	 */
 4087	spin_lock(&fs_info->balance_lock);
 4088	target = get_restripe_target(fs_info, flags);
 4089	if (target) {
 4090		/* pick target profile only if it's already available */
 4091		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
 4092			spin_unlock(&fs_info->balance_lock);
 4093			return extended_to_chunk(target);
 4094		}
 4095	}
 4096	spin_unlock(&fs_info->balance_lock);
 4097
 4098	/* First, mask out the RAID levels which aren't possible */
 4099	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
 4100		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
 4101			allowed |= btrfs_raid_array[raid_type].bg_flag;
 4102	}
 4103	allowed &= flags;
 4104
 4105	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
 4106		allowed = BTRFS_BLOCK_GROUP_RAID6;
 4107	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
 4108		allowed = BTRFS_BLOCK_GROUP_RAID5;
 4109	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
 4110		allowed = BTRFS_BLOCK_GROUP_RAID10;
 4111	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
 4112		allowed = BTRFS_BLOCK_GROUP_RAID1;
 4113	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
 4114		allowed = BTRFS_BLOCK_GROUP_RAID0;
 4115
 4116	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
 4117
 4118	return extended_to_chunk(flags | allowed);
 4119}
 4120
 4121static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
 4122{
 4123	unsigned seq;
 4124	u64 flags;
 4125
 4126	do {
 4127		flags = orig_flags;
 4128		seq = read_seqbegin(&fs_info->profiles_lock);
 4129
 4130		if (flags & BTRFS_BLOCK_GROUP_DATA)
 4131			flags |= fs_info->avail_data_alloc_bits;
 4132		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 4133			flags |= fs_info->avail_system_alloc_bits;
 4134		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
 4135			flags |= fs_info->avail_metadata_alloc_bits;
 4136	} while (read_seqretry(&fs_info->profiles_lock, seq));
 4137
 4138	return btrfs_reduce_alloc_profile(fs_info, flags);
 4139}
 4140
 4141static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
 4142{
 4143	struct btrfs_fs_info *fs_info = root->fs_info;
 4144	u64 flags;
 4145	u64 ret;
 4146
 4147	if (data)
 4148		flags = BTRFS_BLOCK_GROUP_DATA;
 4149	else if (root == fs_info->chunk_root)
 4150		flags = BTRFS_BLOCK_GROUP_SYSTEM;
 4151	else
 4152		flags = BTRFS_BLOCK_GROUP_METADATA;
 4153
 4154	ret = get_alloc_profile(fs_info, flags);
 4155	return ret;
 4156}
 4157
 4158u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
 4159{
 4160	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
 4161}
 4162
 4163u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
 4164{
 4165	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 4166}
 4167
 4168u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
 4169{
 4170	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 4171}
 4172
 4173static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
 4174				 bool may_use_included)
 4175{
 4176	ASSERT(s_info);
 4177	return s_info->bytes_used + s_info->bytes_reserved +
 4178		s_info->bytes_pinned + s_info->bytes_readonly +
 4179		(may_use_included ? s_info->bytes_may_use : 0);
 4180}
 4181
 4182int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
 4183{
 4184	struct btrfs_root *root = inode->root;
 4185	struct btrfs_fs_info *fs_info = root->fs_info;
 4186	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
 4187	u64 used;
 4188	int ret = 0;
 4189	int need_commit = 2;
 4190	int have_pinned_space;
 4191
 4192	/* make sure bytes are sectorsize aligned */
 4193	bytes = ALIGN(bytes, fs_info->sectorsize);
 4194
 4195	if (btrfs_is_free_space_inode(inode)) {
 4196		need_commit = 0;
 4197		ASSERT(current->journal_info);
 4198	}
 4199
 4200again:
 4201	/* make sure we have enough space to handle the data first */
 4202	spin_lock(&data_sinfo->lock);
 4203	used = btrfs_space_info_used(data_sinfo, true);
 4204
 4205	if (used + bytes > data_sinfo->total_bytes) {
 4206		struct btrfs_trans_handle *trans;
 4207
 4208		/*
 4209		 * if we don't have enough free bytes in this space then we need
 4210		 * to alloc a new chunk.
 4211		 */
 4212		if (!data_sinfo->full) {
 4213			u64 alloc_target;
 4214
 4215			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
 4216			spin_unlock(&data_sinfo->lock);
 4217
 4218			alloc_target = btrfs_data_alloc_profile(fs_info);
 4219			/*
 4220			 * It is ugly that we don't call nolock join
 4221			 * transaction for the free space inode case here.
 4222			 * But it is safe because we only do the data space
 4223			 * reservation for the free space cache in the
 4224			 * transaction context, the common join transaction
 4225			 * just increase the counter of the current transaction
 4226			 * handler, doesn't try to acquire the trans_lock of
 4227			 * the fs.
 4228			 */
 4229			trans = btrfs_join_transaction(root);
 4230			if (IS_ERR(trans))
 4231				return PTR_ERR(trans);
 4232
 4233			ret = do_chunk_alloc(trans, alloc_target,
 4234					     CHUNK_ALLOC_NO_FORCE);
 4235			btrfs_end_transaction(trans);
 4236			if (ret < 0) {
 4237				if (ret != -ENOSPC)
 4238					return ret;
 4239				else {
 4240					have_pinned_space = 1;
 4241					goto commit_trans;
 4242				}
 4243			}
 4244
 4245			goto again;
 4246		}
 4247
 4248		/*
 4249		 * If we don't have enough pinned space to deal with this
 4250		 * allocation, and no removed chunk in current transaction,
 4251		 * don't bother committing the transaction.
 4252		 */
 4253		have_pinned_space = __percpu_counter_compare(
 4254			&data_sinfo->total_bytes_pinned,
 4255			used + bytes - data_sinfo->total_bytes,
 4256			BTRFS_TOTAL_BYTES_PINNED_BATCH);
 4257		spin_unlock(&data_sinfo->lock);
 4258
 4259		/* commit the current transaction and try again */
 4260commit_trans:
 4261		if (need_commit) {
 4262			need_commit--;
 4263
 4264			if (need_commit > 0) {
 4265				btrfs_start_delalloc_roots(fs_info, -1);
 4266				btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
 4267							 (u64)-1);
 4268			}
 4269
 4270			trans = btrfs_join_transaction(root);
 4271			if (IS_ERR(trans))
 4272				return PTR_ERR(trans);
 4273			if (have_pinned_space >= 0 ||
 4274			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
 4275				     &trans->transaction->flags) ||
 4276			    need_commit > 0) {
 4277				ret = btrfs_commit_transaction(trans);
 4278				if (ret)
 4279					return ret;
 4280				/*
 4281				 * The cleaner kthread might still be doing iput
 4282				 * operations. Wait for it to finish so that
 4283				 * more space is released.
 4284				 */
 4285				mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
 4286				mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
 4287				goto again;
 4288			} else {
 4289				btrfs_end_transaction(trans);
 4290			}
 4291		}
 4292
 4293		trace_btrfs_space_reservation(fs_info,
 4294					      "space_info:enospc",
 4295					      data_sinfo->flags, bytes, 1);
 4296		return -ENOSPC;
 4297	}
 4298	update_bytes_may_use(data_sinfo, bytes);
 4299	trace_btrfs_space_reservation(fs_info, "space_info",
 4300				      data_sinfo->flags, bytes, 1);
 4301	spin_unlock(&data_sinfo->lock);
 4302
 4303	return 0;
 4304}
 4305
 4306int btrfs_check_data_free_space(struct inode *inode,
 4307			struct extent_changeset **reserved, u64 start, u64 len)
 4308{
 4309	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 4310	int ret;
 4311
 4312	/* align the range */
 4313	len = round_up(start + len, fs_info->sectorsize) -
 4314	      round_down(start, fs_info->sectorsize);
 4315	start = round_down(start, fs_info->sectorsize);
 4316
 4317	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
 4318	if (ret < 0)
 4319		return ret;
 4320
 4321	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
 4322	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
 4323	if (ret < 0)
 4324		btrfs_free_reserved_data_space_noquota(inode, start, len);
 4325	else
 4326		ret = 0;
 4327	return ret;
 4328}
 4329
 4330/*
 4331 * Called if we need to clear a data reservation for this inode
 4332 * Normally in a error case.
 4333 *
 4334 * This one will *NOT* use accurate qgroup reserved space API, just for case
 4335 * which we can't sleep and is sure it won't affect qgroup reserved space.
 4336 * Like clear_bit_hook().
 4337 */
 4338void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 4339					    u64 len)
 4340{
 4341	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 4342	struct btrfs_space_info *data_sinfo;
 4343
 4344	/* Make sure the range is aligned to sectorsize */
 4345	len = round_up(start + len, fs_info->sectorsize) -
 4346	      round_down(start, fs_info->sectorsize);
 4347	start = round_down(start, fs_info->sectorsize);
 4348
 4349	data_sinfo = fs_info->data_sinfo;
 4350	spin_lock(&data_sinfo->lock);
 4351	update_bytes_may_use(data_sinfo, -len);
 4352	trace_btrfs_space_reservation(fs_info, "space_info",
 4353				      data_sinfo->flags, len, 0);
 4354	spin_unlock(&data_sinfo->lock);
 4355}
 4356
 4357/*
 4358 * Called if we need to clear a data reservation for this inode
 4359 * Normally in a error case.
 4360 *
 4361 * This one will handle the per-inode data rsv map for accurate reserved
 4362 * space framework.
 4363 */
 4364void btrfs_free_reserved_data_space(struct inode *inode,
 4365			struct extent_changeset *reserved, u64 start, u64 len)
 4366{
 4367	struct btrfs_root *root = BTRFS_I(inode)->root;
 4368
 4369	/* Make sure the range is aligned to sectorsize */
 4370	len = round_up(start + len, root->fs_info->sectorsize) -
 4371	      round_down(start, root->fs_info->sectorsize);
 4372	start = round_down(start, root->fs_info->sectorsize);
 4373
 4374	btrfs_free_reserved_data_space_noquota(inode, start, len);
 4375	btrfs_qgroup_free_data(inode, reserved, start, len);
 4376}
 4377
 4378static void force_metadata_allocation(struct btrfs_fs_info *info)
 4379{
 4380	struct list_head *head = &info->space_info;
 4381	struct btrfs_space_info *found;
 4382
 4383	rcu_read_lock();
 4384	list_for_each_entry_rcu(found, head, list) {
 4385		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
 4386			found->force_alloc = CHUNK_ALLOC_FORCE;
 4387	}
 4388	rcu_read_unlock();
 4389}
 4390
 4391static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
 4392{
 4393	return (global->size << 1);
 4394}
 4395
 4396static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
 4397			      struct btrfs_space_info *sinfo, int force)
 4398{
 4399	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 4400	u64 bytes_used = btrfs_space_info_used(sinfo, false);
 4401	u64 thresh;
 4402
 4403	if (force == CHUNK_ALLOC_FORCE)
 4404		return 1;
 4405
 4406	/*
 4407	 * We need to take into account the global rsv because for all intents
 4408	 * and purposes it's used space.  Don't worry about locking the
 4409	 * global_rsv, it doesn't change except when the transaction commits.
 4410	 */
 4411	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
 4412		bytes_used += calc_global_rsv_need_space(global_rsv);
 4413
 4414	/*
 4415	 * in limited mode, we want to have some free space up to
 4416	 * about 1% of the FS size.
 4417	 */
 4418	if (force == CHUNK_ALLOC_LIMITED) {
 4419		thresh = btrfs_super_total_bytes(fs_info->super_copy);
 4420		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
 4421
 4422		if (sinfo->total_bytes - bytes_used < thresh)
 4423			return 1;
 4424	}
 4425
 4426	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
 4427		return 0;
 4428	return 1;
 4429}
 4430
 4431static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
 4432{
 4433	u64 num_dev;
 4434
 4435	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
 4436		    BTRFS_BLOCK_GROUP_RAID0 |
 4437		    BTRFS_BLOCK_GROUP_RAID5 |
 4438		    BTRFS_BLOCK_GROUP_RAID6))
 4439		num_dev = fs_info->fs_devices->rw_devices;
 4440	else if (type & BTRFS_BLOCK_GROUP_RAID1)
 4441		num_dev = 2;
 4442	else
 4443		num_dev = 1;	/* DUP or single */
 4444
 4445	return num_dev;
 4446}
 4447
 4448/*
 4449 * If @is_allocation is true, reserve space in the system space info necessary
 4450 * for allocating a chunk, otherwise if it's false, reserve space necessary for
 4451 * removing a chunk.
 4452 */
 4453void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
 4454{
 4455	struct btrfs_fs_info *fs_info = trans->fs_info;
 4456	struct btrfs_space_info *info;
 4457	u64 left;
 4458	u64 thresh;
 4459	int ret = 0;
 4460	u64 num_devs;
 4461
 4462	/*
 4463	 * Needed because we can end up allocating a system chunk and for an
 4464	 * atomic and race free space reservation in the chunk block reserve.
 4465	 */
 4466	lockdep_assert_held(&fs_info->chunk_mutex);
 4467
 4468	info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 4469	spin_lock(&info->lock);
 4470	left = info->total_bytes - btrfs_space_info_used(info, true);
 4471	spin_unlock(&info->lock);
 4472
 4473	num_devs = get_profile_num_devs(fs_info, type);
 4474
 4475	/* num_devs device items to update and 1 chunk item to add or remove */
 4476	thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
 4477		btrfs_calc_trans_metadata_size(fs_info, 1);
 4478
 4479	if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
 4480		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
 4481			   left, thresh, type);
 4482		dump_space_info(fs_info, info, 0, 0);
 4483	}
 4484
 4485	if (left < thresh) {
 4486		u64 flags = btrfs_system_alloc_profile(fs_info);
 4487
 4488		/*
 4489		 * Ignore failure to create system chunk. We might end up not
 4490		 * needing it, as we might not need to COW all nodes/leafs from
 4491		 * the paths we visit in the chunk tree (they were already COWed
 4492		 * or created in the current transaction for example).
 4493		 */
 4494		ret = btrfs_alloc_chunk(trans, flags);
 4495	}
 4496
 4497	if (!ret) {
 4498		ret = btrfs_block_rsv_add(fs_info->chunk_root,
 4499					  &fs_info->chunk_block_rsv,
 4500					  thresh, BTRFS_RESERVE_NO_FLUSH);
 4501		if (!ret)
 4502			trans->chunk_bytes_reserved += thresh;
 4503	}
 4504}
 4505
 4506/*
 4507 * If force is CHUNK_ALLOC_FORCE:
 4508 *    - return 1 if it successfully allocates a chunk,
 4509 *    - return errors including -ENOSPC otherwise.
 4510 * If force is NOT CHUNK_ALLOC_FORCE:
 4511 *    - return 0 if it doesn't need to allocate a new chunk,
 4512 *    - return 1 if it successfully allocates a chunk,
 4513 *    - return errors including -ENOSPC otherwise.
 4514 */
 4515static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
 4516			  int force)
 4517{
 4518	struct btrfs_fs_info *fs_info = trans->fs_info;
 4519	struct btrfs_space_info *space_info;
 4520	bool wait_for_alloc = false;
 4521	bool should_alloc = false;
 4522	int ret = 0;
 4523
 4524	/* Don't re-enter if we're already allocating a chunk */
 4525	if (trans->allocating_chunk)
 4526		return -ENOSPC;
 4527
 4528	space_info = __find_space_info(fs_info, flags);
 4529	ASSERT(space_info);
 4530
 4531	do {
 4532		spin_lock(&space_info->lock);
 4533		if (force < space_info->force_alloc)
 4534			force = space_info->force_alloc;
 4535		should_alloc = should_alloc_chunk(fs_info, space_info, force);
 4536		if (space_info->full) {
 4537			/* No more free physical space */
 4538			if (should_alloc)
 4539				ret = -ENOSPC;
 4540			else
 4541				ret = 0;
 4542			spin_unlock(&space_info->lock);
 4543			return ret;
 4544		} else if (!should_alloc) {
 4545			spin_unlock(&space_info->lock);
 4546			return 0;
 4547		} else if (space_info->chunk_alloc) {
 4548			/*
 4549			 * Someone is already allocating, so we need to block
 4550			 * until this someone is finished and then loop to
 4551			 * recheck if we should continue with our allocation
 4552			 * attempt.
 4553			 */
 4554			wait_for_alloc = true;
 4555			spin_unlock(&space_info->lock);
 4556			mutex_lock(&fs_info->chunk_mutex);
 4557			mutex_unlock(&fs_info->chunk_mutex);
 4558		} else {
 4559			/* Proceed with allocation */
 4560			space_info->chunk_alloc = 1;
 4561			wait_for_alloc = false;
 4562			spin_unlock(&space_info->lock);
 4563		}
 4564
 4565		cond_resched();
 4566	} while (wait_for_alloc);
 4567
 4568	mutex_lock(&fs_info->chunk_mutex);
 4569	trans->allocating_chunk = true;
 4570
 4571	/*
 4572	 * If we have mixed data/metadata chunks we want to make sure we keep
 4573	 * allocating mixed chunks instead of individual chunks.
 4574	 */
 4575	if (btrfs_mixed_space_info(space_info))
 4576		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
 4577
 4578	/*
 4579	 * if we're doing a data chunk, go ahead and make sure that
 4580	 * we keep a reasonable number of metadata chunks allocated in the
 4581	 * FS as well.
 4582	 */
 4583	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
 4584		fs_info->data_chunk_allocations++;
 4585		if (!(fs_info->data_chunk_allocations %
 4586		      fs_info->metadata_ratio))
 4587			force_metadata_allocation(fs_info);
 4588	}
 4589
 4590	/*
 4591	 * Check if we have enough space in SYSTEM chunk because we may need
 4592	 * to update devices.
 4593	 */
 4594	check_system_chunk(trans, flags);
 4595
 4596	ret = btrfs_alloc_chunk(trans, flags);
 4597	trans->allocating_chunk = false;
 4598
 4599	spin_lock(&space_info->lock);
 4600	if (ret < 0) {
 4601		if (ret == -ENOSPC)
 4602			space_info->full = 1;
 4603		else
 4604			goto out;
 4605	} else {
 4606		ret = 1;
 4607		space_info->max_extent_size = 0;
 4608	}
 4609
 4610	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
 4611out:
 4612	space_info->chunk_alloc = 0;
 4613	spin_unlock(&space_info->lock);
 4614	mutex_unlock(&fs_info->chunk_mutex);
 4615	/*
 4616	 * When we allocate a new chunk we reserve space in the chunk block
 4617	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
 4618	 * add new nodes/leafs to it if we end up needing to do it when
 4619	 * inserting the chunk item and updating device items as part of the
 4620	 * second phase of chunk allocation, performed by
 4621	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
 4622	 * large number of new block groups to create in our transaction
 4623	 * handle's new_bgs list to avoid exhausting the chunk block reserve
 4624	 * in extreme cases - like having a single transaction create many new
 4625	 * block groups when starting to write out the free space caches of all
 4626	 * the block groups that were made dirty during the lifetime of the
 4627	 * transaction.
 4628	 */
 4629	if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
 4630		btrfs_create_pending_block_groups(trans);
 4631
 4632	return ret;
 4633}
 4634
 4635static int can_overcommit(struct btrfs_fs_info *fs_info,
 4636			  struct btrfs_space_info *space_info, u64 bytes,
 4637			  enum btrfs_reserve_flush_enum flush,
 4638			  bool system_chunk)
 4639{
 4640	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 4641	u64 profile;
 4642	u64 space_size;
 4643	u64 avail;
 4644	u64 used;
 4645	int factor;
 4646
 4647	/* Don't overcommit when in mixed mode. */
 4648	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
 4649		return 0;
 4650
 4651	if (system_chunk)
 4652		profile = btrfs_system_alloc_profile(fs_info);
 4653	else
 4654		profile = btrfs_metadata_alloc_profile(fs_info);
 4655
 4656	used = btrfs_space_info_used(space_info, false);
 4657
 4658	/*
 4659	 * We only want to allow over committing if we have lots of actual space
 4660	 * free, but if we don't have enough space to handle the global reserve
 4661	 * space then we could end up having a real enospc problem when trying
 4662	 * to allocate a chunk or some other such important allocation.
 4663	 */
 4664	spin_lock(&global_rsv->lock);
 4665	space_size = calc_global_rsv_need_space(global_rsv);
 4666	spin_unlock(&global_rsv->lock);
 4667	if (used + space_size >= space_info->total_bytes)
 4668		return 0;
 4669
 4670	used += space_info->bytes_may_use;
 4671
 4672	avail = atomic64_read(&fs_info->free_chunk_space);
 4673
 4674	/*
 4675	 * If we have dup, raid1 or raid10 then only half of the free
 4676	 * space is actually usable.  For raid56, the space info used
 4677	 * doesn't include the parity drive, so we don't have to
 4678	 * change the math
 4679	 */
 4680	factor = btrfs_bg_type_to_factor(profile);
 4681	avail = div_u64(avail, factor);
 4682
 4683	/*
 4684	 * If we aren't flushing all things, let us overcommit up to
 4685	 * 1/2th of the space. If we can flush, don't let us overcommit
 4686	 * too much, let it overcommit up to 1/8 of the space.
 4687	 */
 4688	if (flush == BTRFS_RESERVE_FLUSH_ALL)
 4689		avail >>= 3;
 4690	else
 4691		avail >>= 1;
 4692
 4693	if (used + bytes < space_info->total_bytes + avail)
 4694		return 1;
 4695	return 0;
 4696}
 4697
 4698static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
 4699					 unsigned long nr_pages, int nr_items)
 4700{
 4701	struct super_block *sb = fs_info->sb;
 4702
 4703	if (down_read_trylock(&sb->s_umount)) {
 4704		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
 4705		up_read(&sb->s_umount);
 4706	} else {
 4707		/*
 4708		 * We needn't worry the filesystem going from r/w to r/o though
 4709		 * we don't acquire ->s_umount mutex, because the filesystem
 4710		 * should guarantee the delalloc inodes list be empty after
 4711		 * the filesystem is readonly(all dirty pages are written to
 4712		 * the disk).
 4713		 */
 4714		btrfs_start_delalloc_roots(fs_info, nr_items);
 4715		if (!current->journal_info)
 4716			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
 4717	}
 4718}
 4719
 4720static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
 4721					u64 to_reclaim)
 4722{
 4723	u64 bytes;
 4724	u64 nr;
 4725
 4726	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 4727	nr = div64_u64(to_reclaim, bytes);
 4728	if (!nr)
 4729		nr = 1;
 4730	return nr;
 4731}
 4732
 4733#define EXTENT_SIZE_PER_ITEM	SZ_256K
 4734
 4735/*
 4736 * shrink metadata reservation for delalloc
 4737 */
 4738static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 4739			    u64 orig, bool wait_ordered)
 4740{
 4741	struct btrfs_space_info *space_info;
 4742	struct btrfs_trans_handle *trans;
 4743	u64 delalloc_bytes;
 4744	u64 max_reclaim;
 4745	u64 items;
 4746	long time_left;
 4747	unsigned long nr_pages;
 4748	int loops;
 4749
 4750	/* Calc the number of the pages we need flush for space reservation */
 4751	items = calc_reclaim_items_nr(fs_info, to_reclaim);
 4752	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
 4753
 4754	trans = (struct btrfs_trans_handle *)current->journal_info;
 4755	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 4756
 4757	delalloc_bytes = percpu_counter_sum_positive(
 4758						&fs_info->delalloc_bytes);
 4759	if (delalloc_bytes == 0) {
 4760		if (trans)
 4761			return;
 4762		if (wait_ordered)
 4763			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
 4764		return;
 4765	}
 4766
 4767	loops = 0;
 4768	while (delalloc_bytes && loops < 3) {
 4769		max_reclaim = min(delalloc_bytes, to_reclaim);
 4770		nr_pages = max_reclaim >> PAGE_SHIFT;
 4771		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
 4772		/*
 4773		 * We need to wait for the async pages to actually start before
 4774		 * we do anything.
 4775		 */
 4776		max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
 4777		if (!max_reclaim)
 4778			goto skip_async;
 4779
 4780		if (max_reclaim <= nr_pages)
 4781			max_reclaim = 0;
 4782		else
 4783			max_reclaim -= nr_pages;
 4784
 4785		wait_event(fs_info->async_submit_wait,
 4786			   atomic_read(&fs_info->async_delalloc_pages) <=
 4787			   (int)max_reclaim);
 4788skip_async:
 4789		spin_lock(&space_info->lock);
 4790		if (list_empty(&space_info->tickets) &&
 4791		    list_empty(&space_info->priority_tickets)) {
 4792			spin_unlock(&space_info->lock);
 4793			break;
 4794		}
 4795		spin_unlock(&space_info->lock);
 4796
 4797		loops++;
 4798		if (wait_ordered && !trans) {
 4799			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
 4800		} else {
 4801			time_left = schedule_timeout_killable(1);
 4802			if (time_left)
 4803				break;
 4804		}
 4805		delalloc_bytes = percpu_counter_sum_positive(
 4806						&fs_info->delalloc_bytes);
 4807	}
 4808}
 4809
 4810struct reserve_ticket {
 4811	u64 bytes;
 4812	int error;
 4813	struct list_head list;
 4814	wait_queue_head_t wait;
 4815};
 4816
 4817/**
 4818 * maybe_commit_transaction - possibly commit the transaction if its ok to
 4819 * @root - the root we're allocating for
 4820 * @bytes - the number of bytes we want to reserve
 4821 * @force - force the commit
 4822 *
 4823 * This will check to make sure that committing the transaction will actually
 4824 * get us somewhere and then commit the transaction if it does.  Otherwise it
 4825 * will return -ENOSPC.
 4826 */
 4827static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 4828				  struct btrfs_space_info *space_info)
 4829{
 4830	struct reserve_ticket *ticket = NULL;
 4831	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
 4832	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
 4833	struct btrfs_trans_handle *trans;
 4834	u64 bytes_needed;
 4835	u64 reclaim_bytes = 0;
 4836
 4837	trans = (struct btrfs_trans_handle *)current->journal_info;
 4838	if (trans)
 4839		return -EAGAIN;
 4840
 4841	spin_lock(&space_info->lock);
 4842	if (!list_empty(&space_info->priority_tickets))
 4843		ticket = list_first_entry(&space_info->priority_tickets,
 4844					  struct reserve_ticket, list);
 4845	else if (!list_empty(&space_info->tickets))
 4846		ticket = list_first_entry(&space_info->tickets,
 4847					  struct reserve_ticket, list);
 4848	bytes_needed = (ticket) ? ticket->bytes : 0;
 4849	spin_unlock(&space_info->lock);
 4850
 4851	if (!bytes_needed)
 4852		return 0;
 4853
 4854	/* See if there is enough pinned space to make this reservation */
 4855	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
 4856				   bytes_needed,
 4857				   BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
 4858		goto commit;
 4859
 4860	/*
 4861	 * See if there is some space in the delayed insertion reservation for
 4862	 * this reservation.
 4863	 */
 4864	if (space_info != delayed_rsv->space_info)
 4865		return -ENOSPC;
 4866
 4867	spin_lock(&delayed_rsv->lock);
 4868	reclaim_bytes += delayed_rsv->reserved;
 4869	spin_unlock(&delayed_rsv->lock);
 4870
 4871	spin_lock(&delayed_refs_rsv->lock);
 4872	reclaim_bytes += delayed_refs_rsv->reserved;
 4873	spin_unlock(&delayed_refs_rsv->lock);
 4874	if (reclaim_bytes >= bytes_needed)
 4875		goto commit;
 4876	bytes_needed -= reclaim_bytes;
 4877
 4878	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
 4879				   bytes_needed,
 4880				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
 4881		return -ENOSPC;
 4882	}
 4883
 4884commit:
 4885	trans = btrfs_join_transaction(fs_info->extent_root);
 4886	if (IS_ERR(trans))
 4887		return -ENOSPC;
 4888
 4889	return btrfs_commit_transaction(trans);
 4890}
 4891
 4892/*
 4893 * Try to flush some data based on policy set by @state. This is only advisory
 4894 * and may fail for various reasons. The caller is supposed to examine the
 4895 * state of @space_info to detect the outcome.
 4896 */
 4897static void flush_space(struct btrfs_fs_info *fs_info,
 4898		       struct btrfs_space_info *space_info, u64 num_bytes,
 4899		       int state)
 4900{
 4901	struct btrfs_root *root = fs_info->extent_root;
 4902	struct btrfs_trans_handle *trans;
 4903	int nr;
 4904	int ret = 0;
 4905
 4906	switch (state) {
 4907	case FLUSH_DELAYED_ITEMS_NR:
 4908	case FLUSH_DELAYED_ITEMS:
 4909		if (state == FLUSH_DELAYED_ITEMS_NR)
 4910			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
 4911		else
 4912			nr = -1;
 4913
 4914		trans = btrfs_join_transaction(root);
 4915		if (IS_ERR(trans)) {
 4916			ret = PTR_ERR(trans);
 4917			break;
 4918		}
 4919		ret = btrfs_run_delayed_items_nr(trans, nr);
 4920		btrfs_end_transaction(trans);
 4921		break;
 4922	case FLUSH_DELALLOC:
 4923	case FLUSH_DELALLOC_WAIT:
 4924		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
 4925				state == FLUSH_DELALLOC_WAIT);
 4926		break;
 4927	case FLUSH_DELAYED_REFS_NR:
 4928	case FLUSH_DELAYED_REFS:
 4929		trans = btrfs_join_transaction(root);
 4930		if (IS_ERR(trans)) {
 4931			ret = PTR_ERR(trans);
 4932			break;
 4933		}
 4934		if (state == FLUSH_DELAYED_REFS_NR)
 4935			nr = calc_reclaim_items_nr(fs_info, num_bytes);
 4936		else
 4937			nr = 0;
 4938		btrfs_run_delayed_refs(trans, nr);
 4939		btrfs_end_transaction(trans);
 4940		break;
 4941	case ALLOC_CHUNK:
 4942		trans = btrfs_join_transaction(root);
 4943		if (IS_ERR(trans)) {
 4944			ret = PTR_ERR(trans);
 4945			break;
 4946		}
 4947		ret = do_chunk_alloc(trans,
 4948				     btrfs_metadata_alloc_profile(fs_info),
 4949				     CHUNK_ALLOC_NO_FORCE);
 4950		btrfs_end_transaction(trans);
 4951		if (ret > 0 || ret == -ENOSPC)
 4952			ret = 0;
 4953		break;
 4954	case COMMIT_TRANS:
 4955		/*
 4956		 * If we have pending delayed iputs then we could free up a
 4957		 * bunch of pinned space, so make sure we run the iputs before
 4958		 * we do our pinned bytes check below.
 4959		 */
 4960		mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
 4961		btrfs_run_delayed_iputs(fs_info);
 4962		mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
 4963
 4964		ret = may_commit_transaction(fs_info, space_info);
 4965		break;
 4966	default:
 4967		ret = -ENOSPC;
 4968		break;
 4969	}
 4970
 4971	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
 4972				ret);
 4973	return;
 4974}
 4975
 4976static inline u64
 4977btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
 4978				 struct btrfs_space_info *space_info,
 4979				 bool system_chunk)
 4980{
 4981	struct reserve_ticket *ticket;
 4982	u64 used;
 4983	u64 expected;
 4984	u64 to_reclaim = 0;
 4985
 4986	list_for_each_entry(ticket, &space_info->tickets, list)
 4987		to_reclaim += ticket->bytes;
 4988	list_for_each_entry(ticket, &space_info->priority_tickets, list)
 4989		to_reclaim += ticket->bytes;
 4990	if (to_reclaim)
 4991		return to_reclaim;
 4992
 4993	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
 4994	if (can_overcommit(fs_info, space_info, to_reclaim,
 4995			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
 4996		return 0;
 4997
 4998	used = btrfs_space_info_used(space_info, true);
 4999
 5000	if (can_overcommit(fs_info, space_info, SZ_1M,
 5001			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
 5002		expected = div_factor_fine(space_info->total_bytes, 95);
 5003	else
 5004		expected = div_factor_fine(space_info->total_bytes, 90);
 5005
 5006	if (used > expected)
 5007		to_reclaim = used - expected;
 5008	else
 5009		to_reclaim = 0;
 5010	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
 5011				     space_info->bytes_reserved);
 5012	return to_reclaim;
 5013}
 5014
 5015static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
 5016					struct btrfs_space_info *space_info,
 5017					u64 used, bool system_chunk)
 5018{
 5019	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
 5020
 5021	/* If we're just plain full then async reclaim just slows us down. */
 5022	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
 5023		return 0;
 5024
 5025	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
 5026					      system_chunk))
 5027		return 0;
 5028
 5029	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
 5030		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 5031}
 5032
 5033static void wake_all_tickets(struct list_head *head)
 5034{
 5035	struct reserve_ticket *ticket;
 5036
 5037	while (!list_empty(head)) {
 5038		ticket = list_first_entry(head, struct reserve_ticket, list);
 5039		list_del_init(&ticket->list);
 5040		ticket->error = -ENOSPC;
 5041		wake_up(&ticket->wait);
 5042	}
 5043}
 5044
 5045/*
 5046 * This is for normal flushers, we can wait all goddamned day if we want to.  We
 5047 * will loop and continuously try to flush as long as we are making progress.
 5048 * We count progress as clearing off tickets each time we have to loop.
 5049 */
 5050static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 5051{
 5052	struct btrfs_fs_info *fs_info;
 5053	struct btrfs_space_info *space_info;
 5054	u64 to_reclaim;
 5055	int flush_state;
 5056	int commit_cycles = 0;
 5057	u64 last_tickets_id;
 5058
 5059	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
 5060	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 5061
 5062	spin_lock(&space_info->lock);
 5063	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
 5064						      false);
 5065	if (!to_reclaim) {
 5066		space_info->flush = 0;
 5067		spin_unlock(&space_info->lock);
 5068		return;
 5069	}
 5070	last_tickets_id = space_info->tickets_id;
 5071	spin_unlock(&space_info->lock);
 5072
 5073	flush_state = FLUSH_DELAYED_ITEMS_NR;
 5074	do {
 5075		flush_space(fs_info, space_info, to_reclaim, flush_state);
 5076		spin_lock(&space_info->lock);
 5077		if (list_empty(&space_info->tickets)) {
 5078			space_info->flush = 0;
 5079			spin_unlock(&space_info->lock);
 5080			return;
 5081		}
 5082		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
 5083							      space_info,
 5084							      false);
 5085		if (last_tickets_id == space_info->tickets_id) {
 5086			flush_state++;
 5087		} else {
 5088			last_tickets_id = space_info->tickets_id;
 5089			flush_state = FLUSH_DELAYED_ITEMS_NR;
 5090			if (commit_cycles)
 5091				commit_cycles--;
 5092		}
 5093
 5094		if (flush_state > COMMIT_TRANS) {
 5095			commit_cycles++;
 5096			if (commit_cycles > 2) {
 5097				wake_all_tickets(&space_info->tickets);
 5098				space_info->flush = 0;
 5099			} else {
 5100				flush_state = FLUSH_DELAYED_ITEMS_NR;
 5101			}
 5102		}
 5103		spin_unlock(&space_info->lock);
 5104	} while (flush_state <= COMMIT_TRANS);
 5105}
 5106
 5107void btrfs_init_async_reclaim_work(struct work_struct *work)
 5108{
 5109	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
 5110}
 5111
 5112static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 5113					    struct btrfs_space_info *space_info,
 5114					    struct reserve_ticket *ticket)
 5115{
 5116	u64 to_reclaim;
 5117	int flush_state = FLUSH_DELAYED_ITEMS_NR;
 5118
 5119	spin_lock(&space_info->lock);
 5120	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
 5121						      false);
 5122	if (!to_reclaim) {
 5123		spin_unlock(&space_info->lock);
 5124		return;
 5125	}
 5126	spin_unlock(&space_info->lock);
 5127
 5128	do {
 5129		flush_space(fs_info, space_info, to_reclaim, flush_state);
 5130		flush_state++;
 5131		spin_lock(&space_info->lock);
 5132		if (ticket->bytes == 0) {
 5133			spin_unlock(&space_info->lock);
 5134			return;
 5135		}
 5136		spin_unlock(&space_info->lock);
 5137
 5138		/*
 5139		 * Priority flushers can't wait on delalloc without
 5140		 * deadlocking.
 5141		 */
 5142		if (flush_state == FLUSH_DELALLOC ||
 5143		    flush_state == FLUSH_DELALLOC_WAIT)
 5144			flush_state = ALLOC_CHUNK;
 5145	} while (flush_state < COMMIT_TRANS);
 5146}
 5147
 5148static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
 5149			       struct btrfs_space_info *space_info,
 5150			       struct reserve_ticket *ticket, u64 orig_bytes)
 5151
 5152{
 5153	DEFINE_WAIT(wait);
 5154	int ret = 0;
 5155
 5156	spin_lock(&space_info->lock);
 5157	while (ticket->bytes > 0 && ticket->error == 0) {
 5158		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
 5159		if (ret) {
 5160			ret = -EINTR;
 5161			break;
 5162		}
 5163		spin_unlock(&space_info->lock);
 5164
 5165		schedule();
 5166
 5167		finish_wait(&ticket->wait, &wait);
 5168		spin_lock(&space_info->lock);
 5169	}
 5170	if (!ret)
 5171		ret = ticket->error;
 5172	if (!list_empty(&ticket->list))
 5173		list_del_init(&ticket->list);
 5174	if (ticket->bytes && ticket->bytes < orig_bytes) {
 5175		u64 num_bytes = orig_bytes - ticket->bytes;
 5176		update_bytes_may_use(space_info, -num_bytes);
 5177		trace_btrfs_space_reservation(fs_info, "space_info",
 5178					      space_info->flags, num_bytes, 0);
 5179	}
 5180	spin_unlock(&space_info->lock);
 5181
 5182	return ret;
 5183}
 5184
 5185/**
 5186 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 5187 * @root - the root we're allocating for
 5188 * @space_info - the space info we want to allocate from
 5189 * @orig_bytes - the number of bytes we want
 5190 * @flush - whether or not we can flush to make our reservation
 5191 *
 5192 * This will reserve orig_bytes number of bytes from the space info associated
 5193 * with the block_rsv.  If there is not enough space it will make an attempt to
 5194 * flush out space to make room.  It will do this by flushing delalloc if
 5195 * possible or committing the transaction.  If flush is 0 then no attempts to
 5196 * regain reservations will be made and this will fail if there is not enough
 5197 * space already.
 5198 */
 5199static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 5200				    struct btrfs_space_info *space_info,
 5201				    u64 orig_bytes,
 5202				    enum btrfs_reserve_flush_enum flush,
 5203				    bool system_chunk)
 5204{
 5205	struct reserve_ticket ticket;
 5206	u64 used;
 5207	int ret = 0;
 5208
 5209	ASSERT(orig_bytes);
 5210	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
 5211
 5212	spin_lock(&space_info->lock);
 5213	ret = -ENOSPC;
 5214	used = btrfs_space_info_used(space_info, true);
 5215
 5216	/*
 5217	 * If we have enough space then hooray, make our reservation and carry
 5218	 * on.  If not see if we can overcommit, and if we can, hooray carry on.
 5219	 * If not things get more complicated.
 5220	 */
 5221	if (used + orig_bytes <= space_info->total_bytes) {
 5222		update_bytes_may_use(space_info, orig_bytes);
 5223		trace_btrfs_space_reservation(fs_info, "space_info",
 5224					      space_info->flags, orig_bytes, 1);
 5225		ret = 0;
 5226	} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
 5227				  system_chunk)) {
 5228		update_bytes_may_use(space_info, orig_bytes);
 5229		trace_btrfs_space_reservation(fs_info, "space_info",
 5230					      space_info->flags, orig_bytes, 1);
 5231		ret = 0;
 5232	}
 5233
 5234	/*
 5235	 * If we couldn't make a reservation then setup our reservation ticket
 5236	 * and kick the async worker if it's not already running.
 5237	 *
 5238	 * If we are a priority flusher then we just need to add our ticket to
 5239	 * the list and we will do our own flushing further down.
 5240	 */
 5241	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
 5242		ticket.bytes = orig_bytes;
 5243		ticket.error = 0;
 5244		init_waitqueue_head(&ticket.wait);
 5245		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
 5246			list_add_tail(&ticket.list, &space_info->tickets);
 5247			if (!space_info->flush) {
 5248				space_info->flush = 1;
 5249				trace_btrfs_trigger_flush(fs_info,
 5250							  space_info->flags,
 5251							  orig_bytes, flush,
 5252							  "enospc");
 5253				queue_work(system_unbound_wq,
 5254					   &fs_info->async_reclaim_work);
 5255			}
 5256		} else {
 5257			list_add_tail(&ticket.list,
 5258				      &space_info->priority_tickets);
 5259		}
 5260	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
 5261		used += orig_bytes;
 5262		/*
 5263		 * We will do the space reservation dance during log replay,
 5264		 * which means we won't have fs_info->fs_root set, so don't do
 5265		 * the async reclaim as we will panic.
 5266		 */
 5267		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
 5268		    need_do_async_reclaim(fs_info, space_info,
 5269					  used, system_chunk) &&
 5270		    !work_busy(&fs_info->async_reclaim_work)) {
 5271			trace_btrfs_trigger_flush(fs_info, space_info->flags,
 5272						  orig_bytes, flush, "preempt");
 5273			queue_work(system_unbound_wq,
 5274				   &fs_info->async_reclaim_work);
 5275		}
 5276	}
 5277	spin_unlock(&space_info->lock);
 5278	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
 5279		return ret;
 5280
 5281	if (flush == BTRFS_RESERVE_FLUSH_ALL)
 5282		return wait_reserve_ticket(fs_info, space_info, &ticket,
 5283					   orig_bytes);
 5284
 5285	ret = 0;
 5286	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
 5287	spin_lock(&space_info->lock);
 5288	if (ticket.bytes) {
 5289		if (ticket.bytes < orig_bytes) {
 5290			u64 num_bytes = orig_bytes - ticket.bytes;
 5291			update_bytes_may_use(space_info, -num_bytes);
 5292			trace_btrfs_space_reservation(fs_info, "space_info",
 5293						      space_info->flags,
 5294						      num_bytes, 0);
 5295
 5296		}
 5297		list_del_init(&ticket.list);
 5298		ret = -ENOSPC;
 5299	}
 5300	spin_unlock(&space_info->lock);
 5301	ASSERT(list_empty(&ticket.list));
 5302	return ret;
 5303}
 5304
 5305/**
 5306 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 5307 * @root - the root we're allocating for
 5308 * @block_rsv - the block_rsv we're allocating for
 5309 * @orig_bytes - the number of bytes we want
 5310 * @flush - whether or not we can flush to make our reservation
 5311 *
 5312 * This will reserve orig_bytes number of bytes from the space info associated
 5313 * with the block_rsv.  If there is not enough space it will make an attempt to
 5314 * flush out space to make room.  It will do this by flushing delalloc if
 5315 * possible or committing the transaction.  If flush is 0 then no attempts to
 5316 * regain reservations will be made and this will fail if there is not enough
 5317 * space already.
 5318 */
 5319static int reserve_metadata_bytes(struct btrfs_root *root,
 5320				  struct btrfs_block_rsv *block_rsv,
 5321				  u64 orig_bytes,
 5322				  enum btrfs_reserve_flush_enum flush)
 5323{
 5324	struct btrfs_fs_info *fs_info = root->fs_info;
 5325	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 5326	int ret;
 5327	bool system_chunk = (root == fs_info->chunk_root);
 5328
 5329	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
 5330				       orig_bytes, flush, system_chunk);
 5331	if (ret == -ENOSPC &&
 5332	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
 5333		if (block_rsv != global_rsv &&
 5334		    !block_rsv_use_bytes(global_rsv, orig_bytes))
 5335			ret = 0;
 5336	}
 5337	if (ret == -ENOSPC) {
 5338		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
 5339					      block_rsv->space_info->flags,
 5340					      orig_bytes, 1);
 5341
 5342		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
 5343			dump_space_info(fs_info, block_rsv->space_info,
 5344					orig_bytes, 0);
 5345	}
 5346	return ret;
 5347}
 5348
 5349static struct btrfs_block_rsv *get_block_rsv(
 5350					const struct btrfs_trans_handle *trans,
 5351					const struct btrfs_root *root)
 5352{
 5353	struct btrfs_fs_info *fs_info = root->fs_info;
 5354	struct btrfs_block_rsv *block_rsv = NULL;
 5355
 5356	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 5357	    (root == fs_info->csum_root && trans->adding_csums) ||
 5358	    (root == fs_info->uuid_root))
 5359		block_rsv = trans->block_rsv;
 5360
 5361	if (!block_rsv)
 5362		block_rsv = root->block_rsv;
 5363
 5364	if (!block_rsv)
 5365		block_rsv = &fs_info->empty_block_rsv;
 5366
 5367	return block_rsv;
 5368}
 5369
 5370static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
 5371			       u64 num_bytes)
 5372{
 5373	int ret = -ENOSPC;
 5374	spin_lock(&block_rsv->lock);
 5375	if (block_rsv->reserved >= num_bytes) {
 5376		block_rsv->reserved -= num_bytes;
 5377		if (block_rsv->reserved < block_rsv->size)
 5378			block_rsv->full = 0;
 5379		ret = 0;
 5380	}
 5381	spin_unlock(&block_rsv->lock);
 5382	return ret;
 5383}
 5384
 5385static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
 5386				u64 num_bytes, bool update_size)
 5387{
 5388	spin_lock(&block_rsv->lock);
 5389	block_rsv->reserved += num_bytes;
 5390	if (update_size)
 5391		block_rsv->size += num_bytes;
 5392	else if (block_rsv->reserved >= block_rsv->size)
 5393		block_rsv->full = 1;
 5394	spin_unlock(&block_rsv->lock);
 5395}
 5396
 5397int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
 5398			     struct btrfs_block_rsv *dest, u64 num_bytes,
 5399			     int min_factor)
 5400{
 5401	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 5402	u64 min_bytes;
 5403
 5404	if (global_rsv->space_info != dest->space_info)
 5405		return -ENOSPC;
 5406
 5407	spin_lock(&global_rsv->lock);
 5408	min_bytes = div_factor(global_rsv->size, min_factor);
 5409	if (global_rsv->reserved < min_bytes + num_bytes) {
 5410		spin_unlock(&global_rsv->lock);
 5411		return -ENOSPC;
 5412	}
 5413	global_rsv->reserved -= num_bytes;
 5414	if (global_rsv->reserved < global_rsv->size)
 5415		global_rsv->full = 0;
 5416	spin_unlock(&global_rsv->lock);
 5417
 5418	block_rsv_add_bytes(dest, num_bytes, true);
 5419	return 0;
 5420}
 5421
 5422/**
 5423 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
 5424 * @fs_info - the fs info for our fs.
 5425 * @src - the source block rsv to transfer from.
 5426 * @num_bytes - the number of bytes to transfer.
 5427 *
 5428 * This transfers up to the num_bytes amount from the src rsv to the
 5429 * delayed_refs_rsv.  Any extra bytes are returned to the space info.
 5430 */
 5431void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
 5432				       struct btrfs_block_rsv *src,
 5433				       u64 num_bytes)
 5434{
 5435	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
 5436	u64 to_free = 0;
 5437
 5438	spin_lock(&src->lock);
 5439	src->reserved -= num_bytes;
 5440	src->size -= num_bytes;
 5441	spin_unlock(&src->lock);
 5442
 5443	spin_lock(&delayed_refs_rsv->lock);
 5444	if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
 5445		u64 delta = delayed_refs_rsv->size -
 5446			delayed_refs_rsv->reserved;
 5447		if (num_bytes > delta) {
 5448			to_free = num_bytes - delta;
 5449			num_bytes = delta;
 5450		}
 5451	} else {
 5452		to_free = num_bytes;
 5453		num_bytes = 0;
 5454	}
 5455
 5456	if (num_bytes)
 5457		delayed_refs_rsv->reserved += num_bytes;
 5458	if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
 5459		delayed_refs_rsv->full = 1;
 5460	spin_unlock(&delayed_refs_rsv->lock);
 5461
 5462	if (num_bytes)
 5463		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
 5464					      0, num_bytes, 1);
 5465	if (to_free)
 5466		space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info,
 5467					 to_free);
 5468}
 5469
 5470/**
 5471 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
 5472 * @fs_info - the fs_info for our fs.
 5473 * @flush - control how we can flush for this reservation.
 5474 *
 5475 * This will refill the delayed block_rsv up to 1 items size worth of space and
 5476 * will return -ENOSPC if we can't make the reservation.
 5477 */
 5478int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 5479				  enum btrfs_reserve_flush_enum flush)
 5480{
 5481	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
 5482	u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
 5483	u64 num_bytes = 0;
 5484	int ret = -ENOSPC;
 5485
 5486	spin_lock(&block_rsv->lock);
 5487	if (block_rsv->reserved < block_rsv->size) {
 5488		num_bytes = block_rsv->size - block_rsv->reserved;
 5489		num_bytes = min(num_bytes, limit);
 5490	}
 5491	spin_unlock(&block_rsv->lock);
 5492
 5493	if (!num_bytes)
 5494		return 0;
 5495
 5496	ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
 5497				     num_bytes, flush);
 5498	if (ret)
 5499		return ret;
 5500	block_rsv_add_bytes(block_rsv, num_bytes, 0);
 5501	trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
 5502				      0, num_bytes, 1);
 5503	return 0;
 5504}
 5505
 5506/*
 5507 * This is for space we already have accounted in space_info->bytes_may_use, so
 5508 * basically when we're returning space from block_rsv's.
 5509 */
 5510static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
 5511				     struct btrfs_space_info *space_info,
 5512				     u64 num_bytes)
 5513{
 5514	struct reserve_ticket *ticket;
 5515	struct list_head *head;
 5516	u64 used;
 5517	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
 5518	bool check_overcommit = false;
 5519
 5520	spin_lock(&space_info->lock);
 5521	head = &space_info->priority_tickets;
 5522
 5523	/*
 5524	 * If we are over our limit then we need to check and see if we can
 5525	 * overcommit, and if we can't then we just need to free up our space
 5526	 * and not satisfy any requests.
 5527	 */
 5528	used = btrfs_space_info_used(space_info, true);
 5529	if (used - num_bytes >= space_info->total_bytes)
 5530		check_overcommit = true;
 5531again:
 5532	while (!list_empty(head) && num_bytes) {
 5533		ticket = list_first_entry(head, struct reserve_ticket,
 5534					  list);
 5535		/*
 5536		 * We use 0 bytes because this space is already reserved, so
 5537		 * adding the ticket space would be a double count.
 5538		 */
 5539		if (check_overcommit &&
 5540		    !can_overcommit(fs_info, space_info, 0, flush, false))
 5541			break;
 5542		if (num_bytes >= ticket->bytes) {
 5543			list_del_init(&ticket->list);
 5544			num_bytes -= ticket->bytes;
 5545			ticket->bytes = 0;
 5546			space_info->tickets_id++;
 5547			wake_up(&ticket->wait);
 5548		} else {
 5549			ticket->bytes -= num_bytes;
 5550			num_bytes = 0;
 5551		}
 5552	}
 5553
 5554	if (num_bytes && head == &space_info->priority_tickets) {
 5555		head = &space_info->tickets;
 5556		flush = BTRFS_RESERVE_FLUSH_ALL;
 5557		goto again;
 5558	}
 5559	update_bytes_may_use(space_info, -num_bytes);
 5560	trace_btrfs_space_reservation(fs_info, "space_info",
 5561				      space_info->flags, num_bytes, 0);
 5562	spin_unlock(&space_info->lock);
 5563}
 5564
 5565/*
 5566 * This is for newly allocated space that isn't accounted in
 5567 * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
 5568 * we use this helper.
 5569 */
 5570static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
 5571				     struct btrfs_space_info *space_info,
 5572				     u64 num_bytes)
 5573{
 5574	struct reserve_ticket *ticket;
 5575	struct list_head *head = &space_info->priority_tickets;
 5576
 5577again:
 5578	while (!list_empty(head) && num_bytes) {
 5579		ticket = list_first_entry(head, struct reserve_ticket,
 5580					  list);
 5581		if (num_bytes >= ticket->bytes) {
 5582			trace_btrfs_space_reservation(fs_info, "space_info",
 5583						      space_info->flags,
 5584						      ticket->bytes, 1);
 5585			list_del_init(&ticket->list);
 5586			num_bytes -= ticket->bytes;
 5587			update_bytes_may_use(space_info, ticket->bytes);
 5588			ticket->bytes = 0;
 5589			space_info->tickets_id++;
 5590			wake_up(&ticket->wait);
 5591		} else {
 5592			trace_btrfs_space_reservation(fs_info, "space_info",
 5593						      space_info->flags,
 5594						      num_bytes, 1);
 5595			update_bytes_may_use(space_info, num_bytes);
 5596			ticket->bytes -= num_bytes;
 5597			num_bytes = 0;
 5598		}
 5599	}
 5600
 5601	if (num_bytes && head == &space_info->priority_tickets) {
 5602		head = &space_info->tickets;
 5603		goto again;
 5604	}
 5605}
 5606
 5607static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 5608				    struct btrfs_block_rsv *block_rsv,
 5609				    struct btrfs_block_rsv *dest, u64 num_bytes,
 5610				    u64 *qgroup_to_release_ret)
 5611{
 5612	struct btrfs_space_info *space_info = block_rsv->space_info;
 5613	u64 qgroup_to_release = 0;
 5614	u64 ret;
 5615
 5616	spin_lock(&block_rsv->lock);
 5617	if (num_bytes == (u64)-1) {
 5618		num_bytes = block_rsv->size;
 5619		qgroup_to_release = block_rsv->qgroup_rsv_size;
 5620	}
 5621	block_rsv->size -= num_bytes;
 5622	if (block_rsv->reserved >= block_rsv->size) {
 5623		num_bytes = block_rsv->reserved - block_rsv->size;
 5624		block_rsv->reserved = block_rsv->size;
 5625		block_rsv->full = 1;
 5626	} else {
 5627		num_bytes = 0;
 5628	}
 5629	if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
 5630		qgroup_to_release = block_rsv->qgroup_rsv_reserved -
 5631				    block_rsv->qgroup_rsv_size;
 5632		block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
 5633	} else {
 5634		qgroup_to_release = 0;
 5635	}
 5636	spin_unlock(&block_rsv->lock);
 5637
 5638	ret = num_bytes;
 5639	if (num_bytes > 0) {
 5640		if (dest) {
 5641			spin_lock(&dest->lock);
 5642			if (!dest->full) {
 5643				u64 bytes_to_add;
 5644
 5645				bytes_to_add = dest->size - dest->reserved;
 5646				bytes_to_add = min(num_bytes, bytes_to_add);
 5647				dest->reserved += bytes_to_add;
 5648				if (dest->reserved >= dest->size)
 5649					dest->full = 1;
 5650				num_bytes -= bytes_to_add;
 5651			}
 5652			spin_unlock(&dest->lock);
 5653		}
 5654		if (num_bytes)
 5655			space_info_add_old_bytes(fs_info, space_info,
 5656						 num_bytes);
 5657	}
 5658	if (qgroup_to_release_ret)
 5659		*qgroup_to_release_ret = qgroup_to_release;
 5660	return ret;
 5661}
 5662
 5663int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
 5664			    struct btrfs_block_rsv *dst, u64 num_bytes,
 5665			    bool update_size)
 5666{
 5667	int ret;
 5668
 5669	ret = block_rsv_use_bytes(src, num_bytes);
 5670	if (ret)
 5671		return ret;
 5672
 5673	block_rsv_add_bytes(dst, num_bytes, update_size);
 5674	return 0;
 5675}
 5676
 5677void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
 5678{
 5679	memset(rsv, 0, sizeof(*rsv));
 5680	spin_lock_init(&rsv->lock);
 5681	rsv->type = type;
 5682}
 5683
 5684void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
 5685				   struct btrfs_block_rsv *rsv,
 5686				   unsigned short type)
 5687{
 5688	btrfs_init_block_rsv(rsv, type);
 5689	rsv->space_info = __find_space_info(fs_info,
 5690					    BTRFS_BLOCK_GROUP_METADATA);
 5691}
 5692
 5693struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
 5694					      unsigned short type)
 5695{
 5696	struct btrfs_block_rsv *block_rsv;
 5697
 5698	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
 5699	if (!block_rsv)
 5700		return NULL;
 5701
 5702	btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
 5703	return block_rsv;
 5704}
 5705
 5706void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
 5707			  struct btrfs_block_rsv *rsv)
 5708{
 5709	if (!rsv)
 5710		return;
 5711	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
 5712	kfree(rsv);
 5713}
 5714
 5715int btrfs_block_rsv_add(struct btrfs_root *root,
 5716			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 5717			enum btrfs_reserve_flush_enum flush)
 5718{
 5719	int ret;
 5720
 5721	if (num_bytes == 0)
 5722		return 0;
 5723
 5724	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 5725	if (!ret)
 5726		block_rsv_add_bytes(block_rsv, num_bytes, true);
 5727
 5728	return ret;
 5729}
 5730
 5731int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
 5732{
 5733	u64 num_bytes = 0;
 5734	int ret = -ENOSPC;
 5735
 5736	if (!block_rsv)
 5737		return 0;
 5738
 5739	spin_lock(&block_rsv->lock);
 5740	num_bytes = div_factor(block_rsv->size, min_factor);
 5741	if (block_rsv->reserved >= num_bytes)
 5742		ret = 0;
 5743	spin_unlock(&block_rsv->lock);
 5744
 5745	return ret;
 5746}
 5747
 5748int btrfs_block_rsv_refill(struct btrfs_root *root,
 5749			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
 5750			   enum btrfs_reserve_flush_enum flush)
 5751{
 5752	u64 num_bytes = 0;
 5753	int ret = -ENOSPC;
 5754
 5755	if (!block_rsv)
 5756		return 0;
 5757
 5758	spin_lock(&block_rsv->lock);
 5759	num_bytes = min_reserved;
 5760	if (block_rsv->reserved >= num_bytes)
 5761		ret = 0;
 5762	else
 5763		num_bytes -= block_rsv->reserved;
 5764	spin_unlock(&block_rsv->lock);
 5765
 5766	if (!ret)
 5767		return 0;
 5768
 5769	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 5770	if (!ret) {
 5771		block_rsv_add_bytes(block_rsv, num_bytes, false);
 5772		return 0;
 5773	}
 5774
 5775	return ret;
 5776}
 5777
 5778/**
 5779 * btrfs_inode_rsv_refill - refill the inode block rsv.
 5780 * @inode - the inode we are refilling.
 5781 * @flush - the flushing restriction.
 5782 *
 5783 * Essentially the same as btrfs_block_rsv_refill, except it uses the
 5784 * block_rsv->size as the minimum size.  We'll either refill the missing amount
 5785 * or return if we already have enough space.  This will also handle the reserve
 5786 * tracepoint for the reserved amount.
 5787 */
 5788static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 5789				  enum btrfs_reserve_flush_enum flush)
 5790{
 5791	struct btrfs_root *root = inode->root;
 5792	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 5793	u64 num_bytes = 0;
 5794	u64 qgroup_num_bytes = 0;
 5795	int ret = -ENOSPC;
 5796
 5797	spin_lock(&block_rsv->lock);
 5798	if (block_rsv->reserved < block_rsv->size)
 5799		num_bytes = block_rsv->size - block_rsv->reserved;
 5800	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
 5801		qgroup_num_bytes = block_rsv->qgroup_rsv_size -
 5802				   block_rsv->qgroup_rsv_reserved;
 5803	spin_unlock(&block_rsv->lock);
 5804
 5805	if (num_bytes == 0)
 5806		return 0;
 5807
 5808	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
 5809	if (ret)
 5810		return ret;
 5811	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 5812	if (!ret) {
 5813		block_rsv_add_bytes(block_rsv, num_bytes, false);
 5814		trace_btrfs_space_reservation(root->fs_info, "delalloc",
 5815					      btrfs_ino(inode), num_bytes, 1);
 5816
 5817		/* Don't forget to increase qgroup_rsv_reserved */
 5818		spin_lock(&block_rsv->lock);
 5819		block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
 5820		spin_unlock(&block_rsv->lock);
 5821	} else
 5822		btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
 5823	return ret;
 5824}
 5825
 5826static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
 5827				     struct btrfs_block_rsv *block_rsv,
 5828				     u64 num_bytes, u64 *qgroup_to_release)
 5829{
 5830	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 5831	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
 5832	struct btrfs_block_rsv *target = delayed_rsv;
 5833
 5834	if (target->full || target == block_rsv)
 5835		target = global_rsv;
 5836
 5837	if (block_rsv->space_info != target->space_info)
 5838		target = NULL;
 5839
 5840	return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
 5841				       qgroup_to_release);
 5842}
 5843
 5844void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
 5845			     struct btrfs_block_rsv *block_rsv,
 5846			     u64 num_bytes)
 5847{
 5848	__btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
 5849}
 5850
 5851/**
 5852 * btrfs_inode_rsv_release - release any excessive reservation.
 5853 * @inode - the inode we need to release from.
 5854 * @qgroup_free - free or convert qgroup meta.
 5855 *   Unlike normal operation, qgroup meta reservation needs to know if we are
 5856 *   freeing qgroup reservation or just converting it into per-trans.  Normally
 5857 *   @qgroup_free is true for error handling, and false for normal release.
 5858 *
 5859 * This is the same as btrfs_block_rsv_release, except that it handles the
 5860 * tracepoint for the reservation.
 5861 */
 5862static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
 5863{
 5864	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 5865	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 5866	u64 released = 0;
 5867	u64 qgroup_to_release = 0;
 5868
 5869	/*
 5870	 * Since we statically set the block_rsv->size we just want to say we
 5871	 * are releasing 0 bytes, and then we'll just get the reservation over
 5872	 * the size free'd.
 5873	 */
 5874	released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
 5875					     &qgroup_to_release);
 5876	if (released > 0)
 5877		trace_btrfs_space_reservation(fs_info, "delalloc",
 5878					      btrfs_ino(inode), released, 0);
 5879	if (qgroup_free)
 5880		btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
 5881	else
 5882		btrfs_qgroup_convert_reserved_meta(inode->root,
 5883						   qgroup_to_release);
 5884}
 5885
 5886/**
 5887 * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
 5888 * @fs_info - the fs_info for our fs.
 5889 * @nr - the number of items to drop.
 5890 *
 5891 * This drops the delayed ref head's count from the delayed refs rsv and frees
 5892 * any excess reservation we had.
 5893 */
 5894void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
 5895{
 5896	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
 5897	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 5898	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
 5899	u64 released = 0;
 5900
 5901	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
 5902					   num_bytes, NULL);
 5903	if (released)
 5904		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
 5905					      0, released, 0);
 5906}
 5907
 5908static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 5909{
 5910	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
 5911	struct btrfs_space_info *sinfo = block_rsv->space_info;
 5912	u64 num_bytes;
 5913
 5914	/*
 5915	 * The global block rsv is based on the size of the extent tree, the
 5916	 * checksum tree and the root tree.  If the fs is empty we want to set
 5917	 * it to a minimal amount for safety.
 5918	 */
 5919	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
 5920		btrfs_root_used(&fs_info->csum_root->root_item) +
 5921		btrfs_root_used(&fs_info->tree_root->root_item);
 5922	num_bytes = max_t(u64, num_bytes, SZ_16M);
 5923
 5924	spin_lock(&sinfo->lock);
 5925	spin_lock(&block_rsv->lock);
 5926
 5927	block_rsv->size = min_t(u64, num_bytes, SZ_512M);
 5928
 5929	if (block_rsv->reserved < block_rsv->size) {
 5930		num_bytes = btrfs_space_info_used(sinfo, true);
 5931		if (sinfo->total_bytes > num_bytes) {
 5932			num_bytes = sinfo->total_bytes - num_bytes;
 5933			num_bytes = min(num_bytes,
 5934					block_rsv->size - block_rsv->reserved);
 5935			block_rsv->reserved += num_bytes;
 5936			update_bytes_may_use(sinfo, num_bytes);
 5937			trace_btrfs_space_reservation(fs_info, "space_info",
 5938						      sinfo->flags, num_bytes,
 5939						      1);
 5940		}
 5941	} else if (block_rsv->reserved > block_rsv->size) {
 5942		num_bytes = block_rsv->reserved - block_rsv->size;
 5943		update_bytes_may_use(sinfo, -num_bytes);
 5944		trace_btrfs_space_reservation(fs_info, "space_info",
 5945				      sinfo->flags, num_bytes, 0);
 5946		block_rsv->reserved = block_rsv->size;
 5947	}
 5948
 5949	if (block_rsv->reserved == block_rsv->size)
 5950		block_rsv->full = 1;
 5951	else
 5952		block_rsv->full = 0;
 5953
 5954	spin_unlock(&block_rsv->lock);
 5955	spin_unlock(&sinfo->lock);
 5956}
 5957
 5958static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 5959{
 5960	struct btrfs_space_info *space_info;
 5961
 5962	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 5963	fs_info->chunk_block_rsv.space_info = space_info;
 5964
 5965	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 5966	fs_info->global_block_rsv.space_info = space_info;
 5967	fs_info->trans_block_rsv.space_info = space_info;
 5968	fs_info->empty_block_rsv.space_info = space_info;
 5969	fs_info->delayed_block_rsv.space_info = space_info;
 5970	fs_info->delayed_refs_rsv.space_info = space_info;
 5971
 5972	fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
 5973	fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
 5974	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
 5975	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
 5976	if (fs_info->quota_root)
 5977		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
 5978	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
 5979
 5980	update_global_block_rsv(fs_info);
 5981}
 5982
 5983static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 5984{
 5985	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
 5986				(u64)-1, NULL);
 5987	WARN_ON(fs_info->trans_block_rsv.size > 0);
 5988	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
 5989	WARN_ON(fs_info->chunk_block_rsv.size > 0);
 5990	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
 5991	WARN_ON(fs_info->delayed_block_rsv.size > 0);
 5992	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
 5993	WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
 5994	WARN_ON(fs_info->delayed_refs_rsv.size > 0);
 5995}
 5996
 5997/*
 5998 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
 5999 * @trans - the trans that may have generated delayed refs
 6000 *
 6001 * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
 6002 * it'll calculate the additional size and add it to the delayed_refs_rsv.
 6003 */
 6004void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
 6005{
 6006	struct btrfs_fs_info *fs_info = trans->fs_info;
 6007	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
 6008	u64 num_bytes;
 6009
 6010	if (!trans->delayed_ref_updates)
 6011		return;
 6012
 6013	num_bytes = btrfs_calc_trans_metadata_size(fs_info,
 6014						   trans->delayed_ref_updates);
 6015	spin_lock(&delayed_rsv->lock);
 6016	delayed_rsv->size += num_bytes;
 6017	delayed_rsv->full = 0;
 6018	spin_unlock(&delayed_rsv->lock);
 6019	trans->delayed_ref_updates = 0;
 6020}
 6021
 6022/*
 6023 * To be called after all the new block groups attached to the transaction
 6024 * handle have been created (btrfs_create_pending_block_groups()).
 6025 */
 6026void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
 6027{
 6028	struct btrfs_fs_info *fs_info = trans->fs_info;
 6029
 6030	if (!trans->chunk_bytes_reserved)
 6031		return;
 6032
 6033	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
 6034
 6035	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
 6036				trans->chunk_bytes_reserved, NULL);
 6037	trans->chunk_bytes_reserved = 0;
 6038}
 6039
 6040/*
 6041 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
 6042 * root: the root of the parent directory
 6043 * rsv: block reservation
 6044 * items: the number of items that we need do reservation
 6045 * use_global_rsv: allow fallback to the global block reservation
 6046 *
 6047 * This function is used to reserve the space for snapshot/subvolume
 6048 * creation and deletion. Those operations are different with the
 6049 * common file/directory operations, they change two fs/file trees
 6050 * and root tree, the number of items that the qgroup reserves is
 6051 * different with the free space reservation. So we can not use
 6052 * the space reservation mechanism in start_transaction().
 6053 */
 6054int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 6055				     struct btrfs_block_rsv *rsv, int items,
 6056				     bool use_global_rsv)
 6057{
 6058	u64 qgroup_num_bytes = 0;
 6059	u64 num_bytes;
 6060	int ret;
 6061	struct btrfs_fs_info *fs_info = root->fs_info;
 6062	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 6063
 6064	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
 6065		/* One for parent inode, two for dir entries */
 6066		qgroup_num_bytes = 3 * fs_info->nodesize;
 6067		ret = btrfs_qgroup_reserve_meta_prealloc(root,
 6068				qgroup_num_bytes, true);
 6069		if (ret)
 6070			return ret;
 6071	}
 6072
 6073	num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
 6074	rsv->space_info = __find_space_info(fs_info,
 6075					    BTRFS_BLOCK_GROUP_METADATA);
 6076	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
 6077				  BTRFS_RESERVE_FLUSH_ALL);
 6078
 6079	if (ret == -ENOSPC && use_global_rsv)
 6080		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
 6081
 6082	if (ret && qgroup_num_bytes)
 6083		btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
 6084
 6085	return ret;
 6086}
 6087
 6088void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
 6089				      struct btrfs_block_rsv *rsv)
 6090{
 6091	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
 6092}
 6093
 6094static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 6095						 struct btrfs_inode *inode)
 6096{
 6097	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 6098	u64 reserve_size = 0;
 6099	u64 qgroup_rsv_size = 0;
 6100	u64 csum_leaves;
 6101	unsigned outstanding_extents;
 6102
 6103	lockdep_assert_held(&inode->lock);
 6104	outstanding_extents = inode->outstanding_extents;
 6105	if (outstanding_extents)
 6106		reserve_size = btrfs_calc_trans_metadata_size(fs_info,
 6107						outstanding_extents + 1);
 6108	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
 6109						 inode->csum_bytes);
 6110	reserve_size += btrfs_calc_trans_metadata_size(fs_info,
 6111						       csum_leaves);
 6112	/*
 6113	 * For qgroup rsv, the calculation is very simple:
 6114	 * account one nodesize for each outstanding extent
 6115	 *
 6116	 * This is overestimating in most cases.
 6117	 */
 6118	qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
 6119
 6120	spin_lock(&block_rsv->lock);
 6121	block_rsv->size = reserve_size;
 6122	block_rsv->qgroup_rsv_size = qgroup_rsv_size;
 6123	spin_unlock(&block_rsv->lock);
 6124}
 6125
 6126int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 6127{
 6128	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 6129	unsigned nr_extents;
 6130	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
 6131	int ret = 0;
 6132	bool delalloc_lock = true;
 6133
 6134	/* If we are a free space inode we need to not flush since we will be in
 6135	 * the middle of a transaction commit.  We also don't need the delalloc
 6136	 * mutex since we won't race with anybody.  We need this mostly to make
 6137	 * lockdep shut its filthy mouth.
 6138	 *
 6139	 * If we have a transaction open (can happen if we call truncate_block
 6140	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
 6141	 */
 6142	if (btrfs_is_free_space_inode(inode)) {
 6143		flush = BTRFS_RESERVE_NO_FLUSH;
 6144		delalloc_lock = false;
 6145	} else {
 6146		if (current->journal_info)
 6147			flush = BTRFS_RESERVE_FLUSH_LIMIT;
 6148
 6149		if (btrfs_transaction_in_commit(fs_info))
 6150			schedule_timeout(1);
 6151	}
 6152
 6153	if (delalloc_lock)
 6154		mutex_lock(&inode->delalloc_mutex);
 6155
 6156	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
 6157
 6158	/* Add our new extents and calculate the new rsv size. */
 6159	spin_lock(&inode->lock);
 6160	nr_extents = count_max_extents(num_bytes);
 6161	btrfs_mod_outstanding_extents(inode, nr_extents);
 6162	inode->csum_bytes += num_bytes;
 6163	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 6164	spin_unlock(&inode->lock);
 6165
 6166	ret = btrfs_inode_rsv_refill(inode, flush);
 6167	if (unlikely(ret))
 6168		goto out_fail;
 6169
 6170	if (delalloc_lock)
 6171		mutex_unlock(&inode->delalloc_mutex);
 6172	return 0;
 6173
 6174out_fail:
 6175	spin_lock(&inode->lock);
 6176	nr_extents = count_max_extents(num_bytes);
 6177	btrfs_mod_outstanding_extents(inode, -nr_extents);
 6178	inode->csum_bytes -= num_bytes;
 6179	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 6180	spin_unlock(&inode->lock);
 6181
 6182	btrfs_inode_rsv_release(inode, true);
 6183	if (delalloc_lock)
 6184		mutex_unlock(&inode->delalloc_mutex);
 6185	return ret;
 6186}
 6187
 6188/**
 6189 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
 6190 * @inode: the inode to release the reservation for.
 6191 * @num_bytes: the number of bytes we are releasing.
 6192 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
 6193 *
 6194 * This will release the metadata reservation for an inode.  This can be called
 6195 * once we complete IO for a given set of bytes to release their metadata
 6196 * reservations, or on error for the same reason.
 6197 */
 6198void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
 6199				     bool qgroup_free)
 6200{
 6201	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 6202
 6203	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
 6204	spin_lock(&inode->lock);
 6205	inode->csum_bytes -= num_bytes;
 6206	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 6207	spin_unlock(&inode->lock);
 6208
 6209	if (btrfs_is_testing(fs_info))
 6210		return;
 6211
 6212	btrfs_inode_rsv_release(inode, qgroup_free);
 6213}
 6214
 6215/**
 6216 * btrfs_delalloc_release_extents - release our outstanding_extents
 6217 * @inode: the inode to balance the reservation for.
 6218 * @num_bytes: the number of bytes we originally reserved with
 6219 * @qgroup_free: do we need to free qgroup meta reservation or convert them.
 6220 *
 6221 * When we reserve space we increase outstanding_extents for the extents we may
 6222 * add.  Once we've set the range as delalloc or created our ordered extents we
 6223 * have outstanding_extents to track the real usage, so we use this to free our
 6224 * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
 6225 * with btrfs_delalloc_reserve_metadata.
 6226 */
 6227void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
 6228				    bool qgroup_free)
 6229{
 6230	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 6231	unsigned num_extents;
 6232
 6233	spin_lock(&inode->lock);
 6234	num_extents = count_max_extents(num_bytes);
 6235	btrfs_mod_outstanding_extents(inode, -num_extents);
 6236	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 6237	spin_unlock(&inode->lock);
 6238
 6239	if (btrfs_is_testing(fs_info))
 6240		return;
 6241
 6242	btrfs_inode_rsv_release(inode, qgroup_free);
 6243}
 6244
 6245/**
 6246 * btrfs_delalloc_reserve_space - reserve data and metadata space for
 6247 * delalloc
 6248 * @inode: inode we're writing to
 6249 * @start: start range we are writing to
 6250 * @len: how long the range we are writing to
 6251 * @reserved: mandatory parameter, record actually reserved qgroup ranges of
 6252 * 	      current reservation.
 6253 *
 6254 * This will do the following things
 6255 *
 6256 * o reserve space in data space info for num bytes
 6257 *   and reserve precious corresponding qgroup space
 6258 *   (Done in check_data_free_space)
 6259 *
 6260 * o reserve space for metadata space, based on the number of outstanding
 6261 *   extents and how much csums will be needed
 6262 *   also reserve metadata space in a per root over-reserve method.
 6263 * o add to the inodes->delalloc_bytes
 6264 * o add it to the fs_info's delalloc inodes list.
 6265 *   (Above 3 all done in delalloc_reserve_metadata)
 6266 *
 6267 * Return 0 for success
 6268 * Return <0 for error(-ENOSPC or -EQUOT)
 6269 */
 6270int btrfs_delalloc_reserve_space(struct inode *inode,
 6271			struct extent_changeset **reserved, u64 start, u64 len)
 6272{
 6273	int ret;
 6274
 6275	ret = btrfs_check_data_free_space(inode, reserved, start, len);
 6276	if (ret < 0)
 6277		return ret;
 6278	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
 6279	if (ret < 0)
 6280		btrfs_free_reserved_data_space(inode, *reserved, start, len);
 6281	return ret;
 6282}
 6283
 6284/**
 6285 * btrfs_delalloc_release_space - release data and metadata space for delalloc
 6286 * @inode: inode we're releasing space for
 6287 * @start: start position of the space already reserved
 6288 * @len: the len of the space already reserved
 6289 * @release_bytes: the len of the space we consumed or didn't use
 6290 *
 6291 * This function will release the metadata space that was not used and will
 6292 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
 6293 * list if there are no delalloc bytes left.
 6294 * Also it will handle the qgroup reserved space.
 6295 */
 6296void btrfs_delalloc_release_space(struct inode *inode,
 6297				  struct extent_changeset *reserved,
 6298				  u64 start, u64 len, bool qgroup_free)
 6299{
 6300	btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
 6301	btrfs_free_reserved_data_space(inode, reserved, start, len);
 6302}
 6303
/*
 * Account the byte range [@bytenr, @bytenr + @num_bytes) as newly allocated
 * (@alloc non-zero) or newly freed (@alloc zero).
 *
 * The bytes_used value in the super block copy is adjusted first.  The range
 * is then walked block group by block group; for each one:
 *  - the block group item's 'used' value is raised or lowered;
 *  - on allocation the bytes move from reserved to used, both in the block
 *    group and in its space_info;
 *  - on free the bytes move from used to pinned and the range is marked dirty
 *    in info->pinned_extents;
 *  - the block group is put on the transaction's dirty_bgs list if it is not
 *    on a dirty list yet, which also bumps trans->delayed_ref_updates;
 *  - on free, a block group whose 'used' value reached zero is handed to
 *    btrfs_mark_bg_unused().
 *
 * Finally the delayed refs rsv is resized for any block groups dirtied here.
 *
 * Returns 0 on success, or -ENOENT when no block group covers the current
 * position.  NOTE: in the -ENOENT case the super block adjustment and any
 * block groups already processed are not rolled back.
 */
static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *info, u64 bytenr,
			      u64 num_bytes, int alloc)
{
	struct btrfs_block_group_cache *cache = NULL;
	u64 total = num_bytes;
	u64 old_val;
	u64 byte_in_group;
	int factor;
	int ret = 0;

	/* block accounting for super block */
	spin_lock(&info->delalloc_root_lock);
	old_val = btrfs_super_bytes_used(info->super_copy);
	if (alloc)
		old_val += num_bytes;
	else
		old_val -= num_bytes;
	btrfs_set_super_bytes_used(info->super_copy, old_val);
	spin_unlock(&info->delalloc_root_lock);

	/* 'total' is what is left to account; num_bytes is reused per group. */
	while (total) {
		cache = btrfs_lookup_block_group(info, bytenr);
		if (!cache) {
			ret = -ENOENT;
			break;
		}
		/* Multiplier applied to disk_used below. */
		factor = btrfs_bg_type_to_factor(cache->flags);

		/*
		 * If this block group has free space cache written out, we
		 * need to make sure to load it if we are removing space.  This
		 * is because we need the unpinning stage to actually add the
		 * space back to the block group, otherwise we will leak space.
		 */
		if (!alloc && cache->cached == BTRFS_CACHE_NO)
			cache_block_group(cache, 1);

		/* Offset of bytenr from the start of this block group. */
		byte_in_group = bytenr - cache->key.objectid;
		WARN_ON(byte_in_group > cache->key.offset);

		/* Lock order: space_info first, then the block group. */
		spin_lock(&cache->space_info->lock);
		spin_lock(&cache->lock);

		if (btrfs_test_opt(info, SPACE_CACHE) &&
		    cache->disk_cache_state < BTRFS_DC_CLEAR)
			cache->disk_cache_state = BTRFS_DC_CLEAR;

		old_val = btrfs_block_group_used(&cache->item);
		/* Clamp this step to what is left of the current block group. */
		num_bytes = min(total, cache->key.offset - byte_in_group);
		if (alloc) {
			old_val += num_bytes;
			btrfs_set_block_group_used(&cache->item, old_val);
			cache->reserved -= num_bytes;
			cache->space_info->bytes_reserved -= num_bytes;
			cache->space_info->bytes_used += num_bytes;
			cache->space_info->disk_used += num_bytes * factor;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);
		} else {
			old_val -= num_bytes;
			btrfs_set_block_group_used(&cache->item, old_val);
			cache->pinned += num_bytes;
			update_bytes_pinned(cache->space_info, num_bytes);
			cache->space_info->bytes_used -= num_bytes;
			cache->space_info->disk_used -= num_bytes * factor;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);

			/* Done after both spinlocks have been dropped. */
			trace_btrfs_space_reservation(info, "pinned",
						      cache->space_info->flags,
						      num_bytes, 1);
			percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
					   num_bytes,
					   BTRFS_TOTAL_BYTES_PINNED_BATCH);
			set_extent_dirty(info->pinned_extents,
					 bytenr, bytenr + num_bytes - 1,
					 GFP_NOFS | __GFP_NOFAIL);
		}

		/*
		 * Queue the block group on the transaction's dirty list once;
		 * the list takes its own reference on the block group.
		 */
		spin_lock(&trans->transaction->dirty_bgs_lock);
		if (list_empty(&cache->dirty_list)) {
			list_add_tail(&cache->dirty_list,
				      &trans->transaction->dirty_bgs);
			trans->transaction->num_dirty_bgs++;
			trans->delayed_ref_updates++;
			btrfs_get_block_group(cache);
		}
		spin_unlock(&trans->transaction->dirty_bgs_lock);

		/*
		 * No longer have used bytes in this block group, queue it for
		 * deletion. We do this after adding the block group to the
		 * dirty list to avoid races between cleaner kthread and space
		 * cache writeout.
		 */
		if (!alloc && old_val == 0)
			btrfs_mark_bg_unused(cache);

		/* Drop the reference taken by the lookup above. */
		btrfs_put_block_group(cache);
		total -= num_bytes;
		bytenr += num_bytes;
	}

	/* Modified block groups are accounted for in the delayed_refs_rsv. */
	btrfs_update_delayed_refs_rsv(trans);
	return ret;
}
 6412
 6413static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
 6414{
 6415	struct btrfs_block_group_cache *cache;
 6416	u64 bytenr;
 6417
 6418	spin_lock(&fs_info->block_group_cache_lock);
 6419	bytenr = fs_info->first_logical_byte;
 6420	spin_unlock(&fs_info->block_group_cache_lock);
 6421
 6422	if (bytenr < (u64)-1)
 6423		return bytenr;
 6424
 6425	cache = btrfs_lookup_first_block_group(fs_info, search_start);
 6426	if (!cache)
 6427		return 0;
 6428
 6429	bytenr = cache->key.objectid;
 6430	btrfs_put_block_group(cache);
 6431
 6432	return bytenr;
 6433}
 6434
 6435static int pin_down_extent(struct btrfs_fs_info *fs_info,
 6436			   struct btrfs_block_group_cache *cache,
 6437			   u64 bytenr, u64 num_bytes, int reserved)
 6438{
 6439	spin_lock(&cache->space_info->lock);
 6440	spin_lock(&cache->lock);
 6441	cache->pinned += num_bytes;
 6442	update_bytes_pinned(cache->space_info, num_bytes);
 6443	if (reserved) {
 6444		cache->reserved -= num_bytes;
 6445		cache->space_info->bytes_reserved -= num_bytes;
 6446	}
 6447	spin_unlock(&cache->lock);
 6448	spin_unlock(&cache->space_info->lock);
 6449
 6450	trace_btrfs_space_reservation(fs_info, "pinned",
 6451				      cache->space_info->flags, num_bytes, 1);
 6452	percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
 6453		    num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
 6454	set_extent_dirty(fs_info->pinned_extents, bytenr,
 6455			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
 6456	return 0;
 6457}
 6458
 6459/*
 6460 * this function must be called within transaction
 6461 */
 6462int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
 6463		     u64 bytenr, u64 num_bytes, int reserved)
 6464{
 6465	struct btrfs_block_group_cache *cache;
 6466
 6467	cache = btrfs_lookup_block_group(fs_info, bytenr);
 6468	BUG_ON(!cache); /* Logic error */
 6469
 6470	pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
 6471
 6472	btrfs_put_block_group(cache);
 6473	return 0;
 6474}
 6475
 6476/*
 6477 * this function must be called within transaction
 6478 */
 6479int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
 6480				    u64 bytenr, u64 num_bytes)
 6481{
 6482	struct btrfs_block_group_cache *cache;
 6483	int ret;
 6484
 6485	cache = btrfs_lookup_block_group(fs_info, bytenr);
 6486	if (!cache)
 6487		return -EINVAL;
 6488
 6489	/*
 6490	 * pull in the free space cache (if any) so that our pin
 6491	 * removes the free space from the cache.  We have load_only set
 6492	 * to one because the slow code to read in the free extents does check
 6493	 * the pinned extents.
 6494	 */
 6495	cache_block_group(cache, 1);
 6496
 6497	pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
 6498
 6499	/* remove us from the free space cache (if we're there at all) */
 6500	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
 6501	btrfs_put_block_group(cache);
 6502	return ret;
 6503}
 6504
 6505static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
 6506				   u64 start, u64 num_bytes)
 6507{
 6508	int ret;
 6509	struct btrfs_block_group_cache *block_group;
 6510	struct btrfs_caching_control *caching_ctl;
 6511
 6512	block_group = btrfs_lookup_block_group(fs_info, start);
 6513	if (!block_group)
 6514		return -EINVAL;
 6515
 6516	cache_block_group(block_group, 0);
 6517	caching_ctl = get_caching_control(block_group);
 6518
 6519	if (!caching_ctl) {
 6520		/* Logic error */
 6521		BUG_ON(!block_group_cache_done(block_group));
 6522		ret = btrfs_remove_free_space(block_group, start, num_bytes);
 6523	} else {
 6524		mutex_lock(&caching_ctl->mutex);
 6525
 6526		if (start >= caching_ctl->progress) {
 6527			ret = add_excluded_extent(fs_info, start, num_bytes);
 6528		} else if (start + num_bytes <= caching_ctl->progress) {
 6529			ret = btrfs_remove_free_space(block_group,
 6530						      start, num_bytes);
 6531		} else {
 6532			num_bytes = caching_ctl->progress - start;
 6533			ret = btrfs_remove_free_space(block_group,
 6534						      start, num_bytes);
 6535			if (ret)
 6536				goto out_lock;
 6537
 6538			num_bytes = (start + num_bytes) -
 6539				caching_ctl->progress;
 6540			start = caching_ctl->progress;
 6541			ret = add_excluded_extent(fs_info, start, num_bytes);
 6542		}
 6543out_lock:
 6544		mutex_unlock(&caching_ctl->mutex);
 6545		put_caching_control(caching_ctl);
 6546	}
 6547	btrfs_put_block_group(block_group);
 6548	return ret;
 6549}
 6550
 6551int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
 6552				 struct extent_buffer *eb)
 6553{
 6554	struct btrfs_file_extent_item *item;
 6555	struct btrfs_key key;
 6556	int found_type;
 6557	int i;
 6558	int ret = 0;
 6559
 6560	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
 6561		return 0;
 6562
 6563	for (i = 0; i < btrfs_header_nritems(eb); i++) {
 6564		btrfs_item_key_to_cpu(eb, &key, i);
 6565		if (key.type != BTRFS_EXTENT_DATA_KEY)
 6566			continue;
 6567		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
 6568		found_type = btrfs_file_extent_type(eb, item);
 6569		if (found_type == BTRFS_FILE_EXTENT_INLINE)
 6570			continue;
 6571		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 6572			continue;
 6573		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 6574		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 6575		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
 6576		if (ret)
 6577			break;
 6578	}
 6579
 6580	return ret;
 6581}
 6582
 6583static void
 6584btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
 6585{
 6586	atomic_inc(&bg->reservations);
 6587}
 6588
 6589void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 6590					const u64 start)
 6591{
 6592	struct btrfs_block_group_cache *bg;
 6593
 6594	bg = btrfs_lookup_block_group(fs_info, start);
 6595	ASSERT(bg);
 6596	if (atomic_dec_and_test(&bg->reservations))
 6597		wake_up_var(&bg->reservations);
 6598	btrfs_put_block_group(bg);
 6599}
 6600
 6601void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
 6602{
 6603	struct btrfs_space_info *space_info = bg->space_info;
 6604
 6605	ASSERT(bg->ro);
 6606
 6607	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
 6608		return;
 6609
 6610	/*
 6611	 * Our block group is read only but before we set it to read only,
 6612	 * some task might have had allocated an extent from it already, but it
 6613	 * has not yet created a respective ordered extent (and added it to a
 6614	 * root's list of ordered extents).
 6615	 * Therefore wait for any task currently allocating extents, since the
 6616	 * block group's reservations counter is incremented while a read lock
 6617	 * on the groups' semaphore is held and decremented after releasing
 6618	 * the read access on that semaphore and creating the ordered extent.
 6619	 */
 6620	down_write(&space_info->groups_sem);
 6621	up_write(&space_info->groups_sem);
 6622
 6623	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
 6624}
 6625
 6626/**
 6627 * btrfs_add_reserved_bytes - update the block_group and space info counters
 6628 * @cache:	The cache we are manipulating
 6629 * @ram_bytes:  The number of bytes of file content, and will be same to
 6630 *              @num_bytes except for the compress path.
 6631 * @num_bytes:	The number of bytes in question
 6632 * @delalloc:   The blocks are allocated for the delalloc write
 6633 *
 6634 * This is called by the allocator when it reserves space. If this is a
 6635 * reservation and the block group has become read only we cannot make the
 6636 * reservation and return -EAGAIN, otherwise this function always succeeds.
 6637 */
 6638static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
 6639				    u64 ram_bytes, u64 num_bytes, int delalloc)
 6640{
 6641	struct btrfs_space_info *space_info = cache->space_info;
 6642	int ret = 0;
 6643
 6644	spin_lock(&space_info->lock);
 6645	spin_lock(&cache->lock);
 6646	if (cache->ro) {
 6647		ret = -EAGAIN;
 6648	} else {
 6649		cache->reserved += num_bytes;
 6650		space_info->bytes_reserved += num_bytes;
 6651		update_bytes_may_use(space_info, -ram_bytes);
 6652		if (delalloc)
 6653			cache->delalloc_bytes += num_bytes;
 6654	}
 6655	spin_unlock(&cache->lock);
 6656	spin_unlock(&space_info->lock);
 6657	return ret;
 6658}
 6659
 6660/**
 6661 * btrfs_free_reserved_bytes - update the block_group and space info counters
 6662 * @cache:      The cache we are manipulating
 6663 * @num_bytes:  The number of bytes in question
 6664 * @delalloc:   The blocks are allocated for the delalloc write
 6665 *
 6666 * This is called by somebody who is freeing space that was never actually used
 6667 * on disk.  For example if you reserve some space for a new leaf in transaction
 6668 * A and before transaction A commits you free that leaf, you call this with
 6669 * reserve set to 0 in order to clear the reservation.
 6670 */
 6671
 6672static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
 6673				      u64 num_bytes, int delalloc)
 6674{
 6675	struct btrfs_space_info *space_info = cache->space_info;
 6676
 6677	spin_lock(&space_info->lock);
 6678	spin_lock(&cache->lock);
 6679	if (cache->ro)
 6680		space_info->bytes_readonly += num_bytes;
 6681	cache->reserved -= num_bytes;
 6682	space_info->bytes_reserved -= num_bytes;
 6683	space_info->max_extent_size = 0;
 6684
 6685	if (delalloc)
 6686		cache->delalloc_bytes -= num_bytes;
 6687	spin_unlock(&cache->lock);
 6688	spin_unlock(&space_info->lock);
 6689}
 6690void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
 6691{
 6692	struct btrfs_caching_control *next;
 6693	struct btrfs_caching_control *caching_ctl;
 6694	struct btrfs_block_group_cache *cache;
 6695
 6696	down_write(&fs_info->commit_root_sem);
 6697
 6698	list_for_each_entry_safe(caching_ctl, next,
 6699				 &fs_info->caching_block_groups, list) {
 6700		cache = caching_ctl->block_group;
 6701		if (block_group_cache_done(cache)) {
 6702			cache->last_byte_to_unpin = (u64)-1;
 6703			list_del_init(&caching_ctl->list);
 6704			put_caching_control(caching_ctl);
 6705		} else {
 6706			cache->last_byte_to_unpin = caching_ctl->progress;
 6707		}
 6708	}
 6709
 6710	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
 6711		fs_info->pinned_extents = &fs_info->freed_extents[1];
 6712	else
 6713		fs_info->pinned_extents = &fs_info->freed_extents[0];
 6714
 6715	up_write(&fs_info->commit_root_sem);
 6716
 6717	update_global_block_rsv(fs_info);
 6718}
 6719
 6720/*
 6721 * Returns the free cluster for the given space info and sets empty_cluster to
 6722 * what it should be based on the mount options.
 6723 */
 6724static struct btrfs_free_cluster *
 6725fetch_cluster_info(struct btrfs_fs_info *fs_info,
 6726		   struct btrfs_space_info *space_info, u64 *empty_cluster)
 6727{
 6728	struct btrfs_free_cluster *ret = NULL;
 6729
 6730	*empty_cluster = 0;
 6731	if (btrfs_mixed_space_info(space_info))
 6732		return ret;
 6733
 6734	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
 6735		ret = &fs_info->meta_alloc_cluster;
 6736		if (btrfs_test_opt(fs_info, SSD))
 6737			*empty_cluster = SZ_2M;
 6738		else
 6739			*empty_cluster = SZ_64K;
 6740	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
 6741		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
 6742		*empty_cluster = SZ_2M;
 6743		ret = &fs_info->data_alloc_cluster;
 6744	}
 6745
 6746	return ret;
 6747}
 6748
 6749static int unpin_extent_range(struct btrfs_fs_info *fs_info,
 6750			      u64 start, u64 end,
 6751			      const bool return_free_space)
 6752{
 6753	struct btrfs_block_group_cache *cache = NULL;
 6754	struct btrfs_space_info *space_info;
 6755	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 6756	struct btrfs_free_cluster *cluster = NULL;
 6757	u64 len;
 6758	u64 total_unpinned = 0;
 6759	u64 empty_cluster = 0;
 6760	bool readonly;
 6761
 6762	while (start <= end) {
 6763		readonly = false;
 6764		if (!cache ||
 6765		    start >= cache->key.objectid + cache->key.offset) {
 6766			if (cache)
 6767				btrfs_put_block_group(cache);
 6768			total_unpinned = 0;
 6769			cache = btrfs_lookup_block_group(fs_info, start);
 6770			BUG_ON(!cache); /* Logic error */
 6771
 6772			cluster = fetch_cluster_info(fs_info,
 6773						     cache->space_info,
 6774						     &empty_cluster);
 6775			empty_cluster <<= 1;
 6776		}
 6777
 6778		len = cache->key.objectid + cache->key.offset - start;
 6779		len = min(len, end + 1 - start);
 6780
 6781		if (start < cache->last_byte_to_unpin) {
 6782			len = min(len, cache->last_byte_to_unpin - start);
 6783			if (return_free_space)
 6784				btrfs_add_free_space(cache, start, len);
 6785		}
 6786
 6787		start += len;
 6788		total_unpinned += len;
 6789		space_info = cache->space_info;
 6790
 6791		/*
 6792		 * If this space cluster has been marked as fragmented and we've
 6793		 * unpinned enough in this block group to potentially allow a
 6794		 * cluster to be created inside of it go ahead and clear the
 6795		 * fragmented check.
 6796		 */
 6797		if (cluster && cluster->fragmented &&
 6798		    total_unpinned > empty_cluster) {
 6799			spin_lock(&cluster->lock);
 6800			cluster->fragmented = 0;
 6801			spin_unlock(&cluster->lock);
 6802		}
 6803
 6804		spin_lock(&space_info->lock);
 6805		spin_lock(&cache->lock);
 6806		cache->pinned -= len;
 6807		update_bytes_pinned(space_info, -len);
 6808
 6809		trace_btrfs_space_reservation(fs_info, "pinned",
 6810					      space_info->flags, len, 0);
 6811		space_info->max_extent_size = 0;
 6812		percpu_counter_add_batch(&space_info->total_bytes_pinned,
 6813			    -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
 6814		if (cache->ro) {
 6815			space_info->bytes_readonly += len;
 6816			readonly = true;
 6817		}
 6818		spin_unlock(&cache->lock);
 6819		if (!readonly && return_free_space &&
 6820		    global_rsv->space_info == space_info) {
 6821			u64 to_add = len;
 6822
 6823			spin_lock(&global_rsv->lock);
 6824			if (!global_rsv->full) {
 6825				to_add = min(len, global_rsv->size -
 6826					     global_rsv->reserved);
 6827				global_rsv->reserved += to_add;
 6828				update_bytes_may_use(space_info, to_add);
 6829				if (global_rsv->reserved >= global_rsv->size)
 6830					global_rsv->full = 1;
 6831				trace_btrfs_space_reservation(fs_info,
 6832							      "space_info",
 6833							      space_info->flags,
 6834							      to_add, 1);
 6835				len -= to_add;
 6836			}
 6837			spin_unlock(&global_rsv->lock);
 6838			/* Add to any tickets we may have */
 6839			if (len)
 6840				space_info_add_new_bytes(fs_info, space_info,
 6841							 len);
 6842		}
 6843		spin_unlock(&space_info->lock);
 6844	}
 6845
 6846	if (cache)
 6847		btrfs_put_block_group(cache);
 6848	return 0;
 6849}
 6850
 6851int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
 6852{
 6853	struct btrfs_fs_info *fs_info = trans->fs_info;
 6854	struct btrfs_block_group_cache *block_group, *tmp;
 6855	struct list_head *deleted_bgs;
 6856	struct extent_io_tree *unpin;
 6857	u64 start;
 6858	u64 end;
 6859	int ret;
 6860
 6861	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
 6862		unpin = &fs_info->freed_extents[1];
 6863	else
 6864		unpin = &fs_info->freed_extents[0];
 6865
 6866	while (!trans->aborted) {
 6867		struct extent_state *cached_state = NULL;
 6868
 6869		mutex_lock(&fs_info->unused_bg_unpin_mutex);
 6870		ret = find_first_extent_bit(unpin, 0, &start, &end,
 6871					    EXTENT_DIRTY, &cached_state);
 6872		if (ret) {
 6873			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 6874			break;
 6875		}
 6876
 6877		if (btrfs_test_opt(fs_info, DISCARD))
 6878			ret = btrfs_discard_extent(fs_info, start,
 6879						   end + 1 - start, NULL);
 6880
 6881		clear_extent_dirty(unpin, start, end, &cached_state);
 6882		unpin_extent_range(fs_info, start, end, true);
 6883		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 6884		free_extent_state(cached_state);
 6885		cond_resched();
 6886	}
 6887
 6888	/*
 6889	 * Transaction is finished.  We don't need the lock anymore.  We
 6890	 * do need to clean up the block groups in case of a transaction
 6891	 * abort.
 6892	 */
 6893	deleted_bgs = &trans->transaction->deleted_bgs;
 6894	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
 6895		u64 trimmed = 0;
 6896
 6897		ret = -EROFS;
 6898		if (!trans->aborted)
 6899			ret = btrfs_discard_extent(fs_info,
 6900						   block_group->key.objectid,
 6901						   block_group->key.offset,
 6902						   &trimmed);
 6903
 6904		list_del_init(&block_group->bg_list);
 6905		btrfs_put_block_group_trimming(block_group);
 6906		btrfs_put_block_group(block_group);
 6907
 6908		if (ret) {
 6909			const char *errstr = btrfs_decode_error(ret);
 6910			btrfs_warn(fs_info,
 6911			   "discard failed while removing blockgroup: errno=%d %s",
 6912				   ret, errstr);
 6913		}
 6914	}
 6915
 6916	return 0;
 6917}
 6918
 6919static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 6920			       struct btrfs_delayed_ref_node *node, u64 parent,
 6921			       u64 root_objectid, u64 owner_objectid,
 6922			       u64 owner_offset, int refs_to_drop,
 6923			       struct btrfs_delayed_extent_op *extent_op)
 6924{
 6925	struct btrfs_fs_info *info = trans->fs_info;
 6926	struct btrfs_key key;
 6927	struct btrfs_path *path;
 6928	struct btrfs_root *extent_root = info->extent_root;
 6929	struct extent_buffer *leaf;
 6930	struct btrfs_extent_item *ei;
 6931	struct btrfs_extent_inline_ref *iref;
 6932	int ret;
 6933	int is_data;
 6934	int extent_slot = 0;
 6935	int found_extent = 0;
 6936	int num_to_del = 1;
 6937	u32 item_size;
 6938	u64 refs;
 6939	u64 bytenr = node->bytenr;
 6940	u64 num_bytes = node->num_bytes;
 6941	int last_ref = 0;
 6942	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
 6943
 6944	path = btrfs_alloc_path();
 6945	if (!path)
 6946		return -ENOMEM;
 6947
 6948	path->reada = READA_FORWARD;
 6949	path->leave_spinning = 1;
 6950
 6951	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
 6952	BUG_ON(!is_data && refs_to_drop != 1);
 6953
 6954	if (is_data)
 6955		skinny_metadata = false;
 6956
 6957	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
 6958				    parent, root_objectid, owner_objectid,
 6959				    owner_offset);
 6960	if (ret == 0) {
 6961		extent_slot = path->slots[0];
 6962		while (extent_slot >= 0) {
 6963			btrfs_item_key_to_cpu(path->nodes[0], &key,
 6964					      extent_slot);
 6965			if (key.objectid != bytenr)
 6966				break;
 6967			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
 6968			    key.offset == num_bytes) {
 6969				found_extent = 1;
 6970				break;
 6971			}
 6972			if (key.type == BTRFS_METADATA_ITEM_KEY &&
 6973			    key.offset == owner_objectid) {
 6974				found_extent = 1;
 6975				break;
 6976			}
 6977			if (path->slots[0] - extent_slot > 5)
 6978				break;
 6979			extent_slot--;
 6980		}
 6981
 6982		if (!found_extent) {
 6983			BUG_ON(iref);
 6984			ret = remove_extent_backref(trans, path, NULL,
 6985						    refs_to_drop,
 6986						    is_data, &last_ref);
 6987			if (ret) {
 6988				btrfs_abort_transaction(trans, ret);
 6989				goto out;
 6990			}
 6991			btrfs_release_path(path);
 6992			path->leave_spinning = 1;
 6993
 6994			key.objectid = bytenr;
 6995			key.type = BTRFS_EXTENT_ITEM_KEY;
 6996			key.offset = num_bytes;
 6997
 6998			if (!is_data && skinny_metadata) {
 6999				key.type = BTRFS_METADATA_ITEM_KEY;
 7000				key.offset = owner_objectid;
 7001			}
 7002
 7003			ret = btrfs_search_slot(trans, extent_root,
 7004						&key, path, -1, 1);
 7005			if (ret > 0 && skinny_metadata && path->slots[0]) {
 7006				/*
 7007				 * Couldn't find our skinny metadata item,
 7008				 * see if we have ye olde extent item.
 7009				 */
 7010				path->slots[0]--;
 7011				btrfs_item_key_to_cpu(path->nodes[0], &key,
 7012						      path->slots[0]);
 7013				if (key.objectid == bytenr &&
 7014				    key.type == BTRFS_EXTENT_ITEM_KEY &&
 7015				    key.offset == num_bytes)
 7016					ret = 0;
 7017			}
 7018
 7019			if (ret > 0 && skinny_metadata) {
 7020				skinny_metadata = false;
 7021				key.objectid = bytenr;
 7022				key.type = BTRFS_EXTENT_ITEM_KEY;
 7023				key.offset = num_bytes;
 7024				btrfs_release_path(path);
 7025				ret = btrfs_search_slot(trans, extent_root,
 7026							&key, path, -1, 1);
 7027			}
 7028
 7029			if (ret) {
 7030				btrfs_err(info,
 7031					  "umm, got %d back from search, was looking for %llu",
 7032					  ret, bytenr);
 7033				if (ret > 0)
 7034					btrfs_print_leaf(path->nodes[0]);
 7035			}
 7036			if (ret < 0) {
 7037				btrfs_abort_transaction(trans, ret);
 7038				goto out;
 7039			}
 7040			extent_slot = path->slots[0];
 7041		}
 7042	} else if (WARN_ON(ret == -ENOENT)) {
 7043		btrfs_print_leaf(path->nodes[0]);
 7044		btrfs_err(info,
 7045			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
 7046			bytenr, parent, root_objectid, owner_objectid,
 7047			owner_offset);
 7048		btrfs_abort_transaction(trans, ret);
 7049		goto out;
 7050	} else {
 7051		btrfs_abort_transaction(trans, ret);
 7052		goto out;
 7053	}
 7054
 7055	leaf = path->nodes[0];
 7056	item_size = btrfs_item_size_nr(leaf, extent_slot);
 7057	if (unlikely(item_size < sizeof(*ei))) {
 7058		ret = -EINVAL;
 7059		btrfs_print_v0_err(info);
 7060		btrfs_abort_transaction(trans, ret);
 7061		goto out;
 7062	}
 7063	ei = btrfs_item_ptr(leaf, extent_slot,
 7064			    struct btrfs_extent_item);
 7065	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
 7066	    key.type == BTRFS_EXTENT_ITEM_KEY) {
 7067		struct btrfs_tree_block_info *bi;
 7068		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
 7069		bi = (struct btrfs_tree_block_info *)(ei + 1);
 7070		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
 7071	}
 7072
 7073	refs = btrfs_extent_refs(leaf, ei);
 7074	if (refs < refs_to_drop) {
 7075		btrfs_err(info,
 7076			  "trying to drop %d refs but we only have %Lu for bytenr %Lu",
 7077			  refs_to_drop, refs, bytenr);
 7078		ret = -EINVAL;
 7079		btrfs_abort_transaction(trans, ret);
 7080		goto out;
 7081	}
 7082	refs -= refs_to_drop;
 7083
 7084	if (refs > 0) {
 7085		if (extent_op)
 7086			__run_delayed_extent_op(extent_op, leaf, ei);
 7087		/*
 7088		 * In the case of inline back ref, reference count will
 7089		 * be updated by remove_extent_backref
 7090		 */
 7091		if (iref) {
 7092			BUG_ON(!found_extent);
 7093		} else {
 7094			btrfs_set_extent_refs(leaf, ei, refs);
 7095			btrfs_mark_buffer_dirty(leaf);
 7096		}
 7097		if (found_extent) {
 7098			ret = remove_extent_backref(trans, path, iref,
 7099						    refs_to_drop, is_data,
 7100						    &last_ref);
 7101			if (ret) {
 7102				btrfs_abort_transaction(trans, ret);
 7103				goto out;
 7104			}
 7105		}
 7106	} else {
 7107		if (found_extent) {
 7108			BUG_ON(is_data && refs_to_drop !=
 7109			       extent_data_ref_count(path, iref));
 7110			if (iref) {
 7111				BUG_ON(path->slots[0] != extent_slot);
 7112			} else {
 7113				BUG_ON(path->slots[0] != extent_slot + 1);
 7114				path->slots[0] = extent_slot;
 7115				num_to_del = 2;
 7116			}
 7117		}
 7118
 7119		last_ref = 1;
 7120		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 7121				      num_to_del);
 7122		if (ret) {
 7123			btrfs_abort_transaction(trans, ret);
 7124			goto out;
 7125		}
 7126		btrfs_release_path(path);
 7127
 7128		if (is_data) {
 7129			ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
 7130			if (ret) {
 7131				btrfs_abort_transaction(trans, ret);
 7132				goto out;
 7133			}
 7134		}
 7135
 7136		ret = add_to_free_space_tree(trans, bytenr, num_bytes);
 7137		if (ret) {
 7138			btrfs_abort_transaction(trans, ret);
 7139			goto out;
 7140		}
 7141
 7142		ret = update_block_group(trans, info, bytenr, num_bytes, 0);
 7143		if (ret) {
 7144			btrfs_abort_transaction(trans, ret);
 7145			goto out;
 7146		}
 7147	}
 7148	btrfs_release_path(path);
 7149
 7150out:
 7151	btrfs_free_path(path);
 7152	return ret;
 7153}
 7154
 7155/*
 7156 * when we free an block, it is possible (and likely) that we free the last
 7157 * delayed ref for that extent as well.  This searches the delayed ref tree for
 7158 * a given extent, and if there are no other delayed refs to be processed, it
 7159 * removes it from the tree.
 7160 */
 7161static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 7162				      u64 bytenr)
 7163{
 7164	struct btrfs_delayed_ref_head *head;
 7165	struct btrfs_delayed_ref_root *delayed_refs;
 7166	int ret = 0;
 7167
 7168	delayed_refs = &trans->transaction->delayed_refs;
 7169	spin_lock(&delayed_refs->lock);
 7170	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
 7171	if (!head)
 7172		goto out_delayed_unlock;
 7173
 7174	spin_lock(&head->lock);
 7175	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
 7176		goto out;
 7177
 7178	if (cleanup_extent_op(head) != NULL)
 7179		goto out;
 7180
 7181	/*
 7182	 * waiting for the lock here would deadlock.  If someone else has it
 7183	 * locked they are already in the process of dropping it anyway
 7184	 */
 7185	if (!mutex_trylock(&head->mutex))
 7186		goto out;
 7187
 7188	btrfs_delete_ref_head(delayed_refs, head);
 7189	head->processing = 0;
 7190
 7191	spin_unlock(&head->lock);
 7192	spin_unlock(&delayed_refs->lock);
 7193
 7194	BUG_ON(head->extent_op);
 7195	if (head->must_insert_reserved)
 7196		ret = 1;
 7197
 7198	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
 7199	mutex_unlock(&head->mutex);
 7200	btrfs_put_delayed_ref_head(head);
 7201	return ret;
 7202out:
 7203	spin_unlock(&head->lock);
 7204
 7205out_delayed_unlock:
 7206	spin_unlock(&delayed_refs->lock);
 7207	return 0;
 7208}
 7209
 7210void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 7211			   struct btrfs_root *root,
 7212			   struct extent_buffer *buf,
 7213			   u64 parent, int last_ref)
 7214{
 7215	struct btrfs_fs_info *fs_info = root->fs_info;
 7216	int pin = 1;
 7217	int ret;
 7218
 7219	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 7220		int old_ref_mod, new_ref_mod;
 7221
 7222		btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
 7223				   root->root_key.objectid,
 7224				   btrfs_header_level(buf), 0,
 7225				   BTRFS_DROP_DELAYED_REF);
 7226		ret = btrfs_add_delayed_tree_ref(trans, buf->start,
 7227						 buf->len, parent,
 7228						 root->root_key.objectid,
 7229						 btrfs_header_level(buf),
 7230						 BTRFS_DROP_DELAYED_REF, NULL,
 7231						 &old_ref_mod, &new_ref_mod);
 7232		BUG_ON(ret); /* -ENOMEM */
 7233		pin = old_ref_mod >= 0 && new_ref_mod < 0;
 7234	}
 7235
 7236	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
 7237		struct btrfs_block_group_cache *cache;
 7238
 7239		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 7240			ret = check_ref_cleanup(trans, buf->start);
 7241			if (!ret)
 7242				goto out;
 7243		}
 7244
 7245		pin = 0;
 7246		cache = btrfs_lookup_block_group(fs_info, buf->start);
 7247
 7248		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 7249			pin_down_extent(fs_info, cache, buf->start,
 7250					buf->len, 1);
 7251			btrfs_put_block_group(cache);
 7252			goto out;
 7253		}
 7254
 7255		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
 7256
 7257		btrfs_add_free_space(cache, buf->start, buf->len);
 7258		btrfs_free_reserved_bytes(cache, buf->len, 0);
 7259		btrfs_put_block_group(cache);
 7260		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
 7261	}
 7262out:
 7263	if (pin)
 7264		add_pinned_bytes(fs_info, buf->len, true,
 7265				 root->root_key.objectid);
 7266
 7267	if (last_ref) {
 7268		/*
 7269		 * Deleting the buffer, clear the corrupt flag since it doesn't
 7270		 * matter anymore.
 7271		 */
 7272		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
 7273	}
 7274}
 7275
 7276/* Can return -ENOMEM */
 7277int btrfs_free_extent(struct btrfs_trans_handle *trans,
 7278		      struct btrfs_root *root,
 7279		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
 7280		      u64 owner, u64 offset)
 7281{
 7282	struct btrfs_fs_info *fs_info = root->fs_info;
 7283	int old_ref_mod, new_ref_mod;
 7284	int ret;
 7285
 7286	if (btrfs_is_testing(fs_info))
 7287		return 0;
 7288
 7289	if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
 7290		btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
 7291				   root_objectid, owner, offset,
 7292				   BTRFS_DROP_DELAYED_REF);
 7293
 7294	/*
 7295	 * tree log blocks never actually go into the extent allocation
 7296	 * tree, just update pinning info and exit early.
 7297	 */
 7298	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
 7299		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
 7300		/* unlocks the pinned mutex */
 7301		btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
 7302		old_ref_mod = new_ref_mod = 0;
 7303		ret = 0;
 7304	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 7305		ret = btrfs_add_delayed_tree_ref(trans, bytenr,
 7306						 num_bytes, parent,
 7307						 root_objectid, (int)owner,
 7308						 BTRFS_DROP_DELAYED_REF, NULL,
 7309						 &old_ref_mod, &new_ref_mod);
 7310	} else {
 7311		ret = btrfs_add_delayed_data_ref(trans, bytenr,
 7312						 num_bytes, parent,
 7313						 root_objectid, owner, offset,
 7314						 0, BTRFS_DROP_DELAYED_REF,
 7315						 &old_ref_mod, &new_ref_mod);
 7316	}
 7317
 7318	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
 7319		bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
 7320
 7321		add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
 7322	}
 7323
 7324	return ret;
 7325}
 7326
 7327/*
 7328 * when we wait for progress in the block group caching, its because
 7329 * our allocation attempt failed at least once.  So, we must sleep
 7330 * and let some progress happen before we try again.
 7331 *
 7332 * This function will sleep at least once waiting for new free space to
 7333 * show up, and then it will check the block group free space numbers
 7334 * for our min num_bytes.  Another option is to have it go ahead
 7335 * and look in the rbtree for a free extent of a given size, but this
 7336 * is a good start.
 7337 *
 7338 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 7339 * any of the information in this block group.
 7340 */
 7341static noinline void
 7342wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
 7343				u64 num_bytes)
 7344{
 7345	struct btrfs_caching_control *caching_ctl;
 7346
 7347	caching_ctl = get_caching_control(cache);
 7348	if (!caching_ctl)
 7349		return;
 7350
 7351	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
 7352		   (cache->free_space_ctl->free_space >= num_bytes));
 7353
 7354	put_caching_control(caching_ctl);
 7355}
 7356
 7357static noinline int
 7358wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 7359{
 7360	struct btrfs_caching_control *caching_ctl;
 7361	int ret = 0;
 7362
 7363	caching_ctl = get_caching_control(cache);
 7364	if (!caching_ctl)
 7365		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
 7366
 7367	wait_event(caching_ctl->wait, block_group_cache_done(cache));
 7368	if (cache->cached == BTRFS_CACHE_ERROR)
 7369		ret = -EIO;
 7370	put_caching_control(caching_ctl);
 7371	return ret;
 7372}
 7373
 7374enum btrfs_loop_type {
 7375	LOOP_CACHING_NOWAIT = 0,
 7376	LOOP_CACHING_WAIT = 1,
 7377	LOOP_ALLOC_CHUNK = 2,
 7378	LOOP_NO_EMPTY_SIZE = 3,
 7379};
 7380
 7381static inline void
 7382btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
 7383		       int delalloc)
 7384{
 7385	if (delalloc)
 7386		down_read(&cache->data_rwsem);
 7387}
 7388
 7389static inline void
 7390btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
 7391		       int delalloc)
 7392{
 7393	btrfs_get_block_group(cache);
 7394	if (delalloc)
 7395		down_read(&cache->data_rwsem);
 7396}
 7397
 7398static struct btrfs_block_group_cache *
 7399btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
 7400		   struct btrfs_free_cluster *cluster,
 7401		   int delalloc)
 7402{
 7403	struct btrfs_block_group_cache *used_bg = NULL;
 7404
 7405	spin_lock(&cluster->refill_lock);
 7406	while (1) {
 7407		used_bg = cluster->block_group;
 7408		if (!used_bg)
 7409			return NULL;
 7410
 7411		if (used_bg == block_group)
 7412			return used_bg;
 7413
 7414		btrfs_get_block_group(used_bg);
 7415
 7416		if (!delalloc)
 7417			return used_bg;
 7418
 7419		if (down_read_trylock(&used_bg->data_rwsem))
 7420			return used_bg;
 7421
 7422		spin_unlock(&cluster->refill_lock);
 7423
 7424		/* We should only have one-level nested. */
 7425		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
 7426
 7427		spin_lock(&cluster->refill_lock);
 7428		if (used_bg == cluster->block_group)
 7429			return used_bg;
 7430
 7431		up_read(&used_bg->data_rwsem);
 7432		btrfs_put_block_group(used_bg);
 7433	}
 7434}
 7435
 7436static inline void
 7437btrfs_release_block_group(struct btrfs_block_group_cache *cache,
 7438			 int delalloc)
 7439{
 7440	if (delalloc)
 7441		up_read(&cache->data_rwsem);
 7442	btrfs_put_block_group(cache);
 7443}
 7444
 7445/*
 7446 * Structure used internally for find_free_extent() function.  Wraps needed
 7447 * parameters.
 7448 */
 7449struct find_free_extent_ctl {
 7450	/* Basic allocation info */
 7451	u64 ram_bytes;
 7452	u64 num_bytes;
 7453	u64 empty_size;
 7454	u64 flags;
 7455	int delalloc;
 7456
 7457	/* Where to start the search inside the bg */
 7458	u64 search_start;
 7459
 7460	/* For clustered allocation */
 7461	u64 empty_cluster;
 7462
 7463	bool have_caching_bg;
 7464	bool orig_have_caching_bg;
 7465
 7466	/* RAID index, converted from flags */
 7467	int index;
 7468
 7469	/*
 7470	 * Current loop number, check find_free_extent_update_loop() for details
 7471	 */
 7472	int loop;
 7473
 7474	/*
 7475	 * Whether we're refilling a cluster, if true we need to re-search
 7476	 * current block group but don't try to refill the cluster again.
 7477	 */
 7478	bool retry_clustered;
 7479
 7480	/*
 7481	 * Whether we're updating free space cache, if true we need to re-search
 7482	 * current block group but don't try updating free space cache again.
 7483	 */
 7484	bool retry_unclustered;
 7485
 7486	/* If current block group is cached */
 7487	int cached;
 7488
 7489	/* Max contiguous hole found */
 7490	u64 max_extent_size;
 7491
 7492	/* Total free space from free space cache, not always contiguous */
 7493	u64 total_free_space;
 7494
 7495	/* Found result */
 7496	u64 found_offset;
 7497};
 7498
 7499
 7500/*
 7501 * Helper function for find_free_extent().
 7502 *
 7503 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
 7504 * Return -EAGAIN to inform caller that we need to re-search this block group
 7505 * Return >0 to inform caller that we find nothing
 7506 * Return 0 means we have found a location and set ffe_ctl->found_offset.
 7507 */
 7508static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
 7509		struct btrfs_free_cluster *last_ptr,
 7510		struct find_free_extent_ctl *ffe_ctl,
 7511		struct btrfs_block_group_cache **cluster_bg_ret)
 7512{
 7513	struct btrfs_fs_info *fs_info = bg->fs_info;
 7514	struct btrfs_block_group_cache *cluster_bg;
 7515	u64 aligned_cluster;
 7516	u64 offset;
 7517	int ret;
 7518
 7519	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
 7520	if (!cluster_bg)
 7521		goto refill_cluster;
 7522	if (cluster_bg != bg && (cluster_bg->ro ||
 7523	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
 7524		goto release_cluster;
 7525
 7526	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
 7527			ffe_ctl->num_bytes, cluster_bg->key.objectid,
 7528			&ffe_ctl->max_extent_size);
 7529	if (offset) {
 7530		/* We have a block, we're done */
 7531		spin_unlock(&last_ptr->refill_lock);
 7532		trace_btrfs_reserve_extent_cluster(cluster_bg,
 7533				ffe_ctl->search_start, ffe_ctl->num_bytes);
 7534		*cluster_bg_ret = cluster_bg;
 7535		ffe_ctl->found_offset = offset;
 7536		return 0;
 7537	}
 7538	WARN_ON(last_ptr->block_group != cluster_bg);
 7539
 7540release_cluster:
 7541	/*
 7542	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
 7543	 * lets just skip it and let the allocator find whatever block it can
 7544	 * find. If we reach this point, we will have tried the cluster
 7545	 * allocator plenty of times and not have found anything, so we are
 7546	 * likely way too fragmented for the clustering stuff to find anything.
 7547	 *
 7548	 * However, if the cluster is taken from the current block group,
 7549	 * release the cluster first, so that we stand a better chance of
 7550	 * succeeding in the unclustered allocation.
 7551	 */
 7552	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
 7553		spin_unlock(&last_ptr->refill_lock);
 7554		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
 7555		return -ENOENT;
 7556	}
 7557
 7558	/* This cluster didn't work out, free it and start over */
 7559	btrfs_return_cluster_to_free_space(NULL, last_ptr);
 7560
 7561	if (cluster_bg != bg)
 7562		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
 7563
 7564refill_cluster:
 7565	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
 7566		spin_unlock(&last_ptr->refill_lock);
 7567		return -ENOENT;
 7568	}
 7569
 7570	aligned_cluster = max_t(u64,
 7571			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
 7572			bg->full_stripe_len);
 7573	ret = btrfs_find_space_cluster(fs_info, bg, last_ptr,
 7574			ffe_ctl->search_start, ffe_ctl->num_bytes,
 7575			aligned_cluster);
 7576	if (ret == 0) {
 7577		/* Now pull our allocation out of this cluster */
 7578		offset = btrfs_alloc_from_cluster(bg, last_ptr,
 7579				ffe_ctl->num_bytes, ffe_ctl->search_start,
 7580				&ffe_ctl->max_extent_size);
 7581		if (offset) {
 7582			/* We found one, proceed */
 7583			spin_unlock(&last_ptr->refill_lock);
 7584			trace_btrfs_reserve_extent_cluster(bg,
 7585					ffe_ctl->search_start,
 7586					ffe_ctl->num_bytes);
 7587			ffe_ctl->found_offset = offset;
 7588			return 0;
 7589		}
 7590	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
 7591		   !ffe_ctl->retry_clustered) {
 7592		spin_unlock(&last_ptr->refill_lock);
 7593
 7594		ffe_ctl->retry_clustered = true;
 7595		wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
 7596				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
 7597		return -EAGAIN;
 7598	}
 7599	/*
 7600	 * At this point we either didn't find a cluster or we weren't able to
 7601	 * allocate a block from our cluster.  Free the cluster we've been
 7602	 * trying to use, and go to the next block group.
 7603	 */
 7604	btrfs_return_cluster_to_free_space(NULL, last_ptr);
 7605	spin_unlock(&last_ptr->refill_lock);
 7606	return 1;
 7607}
 7608
 7609/*
 7610 * Return >0 to inform caller that we find nothing
 7611 * Return 0 when we found an free extent and set ffe_ctrl->found_offset
 7612 * Return -EAGAIN to inform caller that we need to re-search this block group
 7613 */
 7614static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
 7615		struct btrfs_free_cluster *last_ptr,
 7616		struct find_free_extent_ctl *ffe_ctl)
 7617{
 7618	u64 offset;
 7619
 7620	/*
 7621	 * We are doing an unclustered allocation, set the fragmented flag so
 7622	 * we don't bother trying to setup a cluster again until we get more
 7623	 * space.
 7624	 */
 7625	if (unlikely(last_ptr)) {
 7626		spin_lock(&last_ptr->lock);
 7627		last_ptr->fragmented = 1;
 7628		spin_unlock(&last_ptr->lock);
 7629	}
 7630	if (ffe_ctl->cached) {
 7631		struct btrfs_free_space_ctl *free_space_ctl;
 7632
 7633		free_space_ctl = bg->free_space_ctl;
 7634		spin_lock(&free_space_ctl->tree_lock);
 7635		if (free_space_ctl->free_space <
 7636		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
 7637		    ffe_ctl->empty_size) {
 7638			ffe_ctl->total_free_space = max_t(u64,
 7639					ffe_ctl->total_free_space,
 7640					free_space_ctl->free_space);
 7641			spin_unlock(&free_space_ctl->tree_lock);
 7642			return 1;
 7643		}
 7644		spin_unlock(&free_space_ctl->tree_lock);
 7645	}
 7646
 7647	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
 7648			ffe_ctl->num_bytes, ffe_ctl->empty_size,
 7649			&ffe_ctl->max_extent_size);
 7650
 7651	/*
 7652	 * If we didn't find a chunk, and we haven't failed on this block group
 7653	 * before, and this block group is in the middle of caching and we are
 7654	 * ok with waiting, then go ahead and wait for progress to be made, and
 7655	 * set @retry_unclustered to true.
 7656	 *
 7657	 * If @retry_unclustered is true then we've already waited on this
 7658	 * block group once and should move on to the next block group.
 7659	 */
 7660	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
 7661	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
 7662		wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
 7663						ffe_ctl->empty_size);
 7664		ffe_ctl->retry_unclustered = true;
 7665		return -EAGAIN;
 7666	} else if (!offset) {
 7667		return 1;
 7668	}
 7669	ffe_ctl->found_offset = offset;
 7670	return 0;
 7671}
 7672
 7673/*
 7674 * Return >0 means caller needs to re-search for free extent
 7675 * Return 0 means we have the needed free extent.
 7676 * Return <0 means we failed to locate any free extent.
 7677 */
 7678static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
 7679					struct btrfs_free_cluster *last_ptr,
 7680					struct btrfs_key *ins,
 7681					struct find_free_extent_ctl *ffe_ctl,
 7682					int full_search, bool use_cluster)
 7683{
 7684	struct btrfs_root *root = fs_info->extent_root;
 7685	int ret;
 7686
 7687	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
 7688	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
 7689		ffe_ctl->orig_have_caching_bg = true;
 7690
 7691	if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
 7692	    ffe_ctl->have_caching_bg)
 7693		return 1;
 7694
 7695	if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
 7696		return 1;
 7697
 7698	if (ins->objectid) {
 7699		if (!use_cluster && last_ptr) {
 7700			spin_lock(&last_ptr->lock);
 7701			last_ptr->window_start = ins->objectid;
 7702			spin_unlock(&last_ptr->lock);
 7703		}
 7704		return 0;
 7705	}
 7706
 7707	/*
 7708	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
 7709	 *			caching kthreads as we move along
 7710	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
 7711	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
 7712	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
 7713	 *		       again
 7714	 */
 7715	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
 7716		ffe_ctl->index = 0;
 7717		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
 7718			/*
 7719			 * We want to skip the LOOP_CACHING_WAIT step if we
 7720			 * don't have any uncached bgs and we've already done a
 7721			 * full search through.
 7722			 */
 7723			if (ffe_ctl->orig_have_caching_bg || !full_search)
 7724				ffe_ctl->loop = LOOP_CACHING_WAIT;
 7725			else
 7726				ffe_ctl->loop = LOOP_ALLOC_CHUNK;
 7727		} else {
 7728			ffe_ctl->loop++;
 7729		}
 7730
 7731		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
 7732			struct btrfs_trans_handle *trans;
 7733			int exist = 0;
 7734
 7735			trans = current->journal_info;
 7736			if (trans)
 7737				exist = 1;
 7738			else
 7739				trans = btrfs_join_transaction(root);
 7740
 7741			if (IS_ERR(trans)) {
 7742				ret = PTR_ERR(trans);
 7743				return ret;
 7744			}
 7745
 7746			ret = do_chunk_alloc(trans, ffe_ctl->flags,
 7747					     CHUNK_ALLOC_FORCE);
 7748
 7749			/*
 7750			 * If we can't allocate a new chunk we've already looped
 7751			 * through at least once, move on to the NO_EMPTY_SIZE
 7752			 * case.
 7753			 */
 7754			if (ret == -ENOSPC)
 7755				ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
 7756
 7757			/* Do not bail out on ENOSPC since we can do more. */
 7758			if (ret < 0 && ret != -ENOSPC)
 7759				btrfs_abort_transaction(trans, ret);
 7760			else
 7761				ret = 0;
 7762			if (!exist)
 7763				btrfs_end_transaction(trans);
 7764			if (ret)
 7765				return ret;
 7766		}
 7767
 7768		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
 7769			/*
 7770			 * Don't loop again if we already have no empty_size and
 7771			 * no empty_cluster.
 7772			 */
 7773			if (ffe_ctl->empty_size == 0 &&
 7774			    ffe_ctl->empty_cluster == 0)
 7775				return -ENOSPC;
 7776			ffe_ctl->empty_size = 0;
 7777			ffe_ctl->empty_cluster = 0;
 7778		}
 7779		return 1;
 7780	}
 7781	return -ENOSPC;
 7782}
 7783
 7784/*
 7785 * walks the btree of allocated extents and find a hole of a given size.
 7786 * The key ins is changed to record the hole:
 7787 * ins->objectid == start position
 7788 * ins->flags = BTRFS_EXTENT_ITEM_KEY
 7789 * ins->offset == the size of the hole.
 7790 * Any available blocks before search_start are skipped.
 7791 *
 7792 * If there is no suitable free space, we will record the max size of
 7793 * the free space extent currently.
 7794 *
 7795 * The overall logic and call chain:
 7796 *
 7797 * find_free_extent()
 7798 * |- Iterate through all block groups
 7799 * |  |- Get a valid block group
 7800 * |  |- Try to do clustered allocation in that block group
 7801 * |  |- Try to do unclustered allocation in that block group
 7802 * |  |- Check if the result is valid
 7803 * |  |  |- If valid, then exit
 7804 * |  |- Jump to next block group
 7805 * |
 7806 * |- Push harder to find free extents
 7807 *    |- If not found, re-iterate all block groups
 7808 */
 7809static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
 7810				u64 ram_bytes, u64 num_bytes, u64 empty_size,
 7811				u64 hint_byte, struct btrfs_key *ins,
 7812				u64 flags, int delalloc)
 7813{
 7814	int ret = 0;
 7815	struct btrfs_free_cluster *last_ptr = NULL;
 7816	struct btrfs_block_group_cache *block_group = NULL;
 7817	struct find_free_extent_ctl ffe_ctl = {0};
 7818	struct btrfs_space_info *space_info;
 7819	bool use_cluster = true;
 7820	bool full_search = false;
 7821
 7822	WARN_ON(num_bytes < fs_info->sectorsize);
 7823
 7824	ffe_ctl.ram_bytes = ram_bytes;
 7825	ffe_ctl.num_bytes = num_bytes;
 7826	ffe_ctl.empty_size = empty_size;
 7827	ffe_ctl.flags = flags;
 7828	ffe_ctl.search_start = 0;
 7829	ffe_ctl.retry_clustered = false;
 7830	ffe_ctl.retry_unclustered = false;
 7831	ffe_ctl.delalloc = delalloc;
 7832	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
 7833	ffe_ctl.have_caching_bg = false;
 7834	ffe_ctl.orig_have_caching_bg = false;
 7835	ffe_ctl.found_offset = 0;
 7836
 7837	ins->type = BTRFS_EXTENT_ITEM_KEY;
 7838	ins->objectid = 0;
 7839	ins->offset = 0;
 7840
 7841	trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
 7842
 7843	space_info = __find_space_info(fs_info, flags);
 7844	if (!space_info) {
 7845		btrfs_err(fs_info, "No space info for %llu", flags);
 7846		return -ENOSPC;
 7847	}
 7848
 7849	/*
 7850	 * If our free space is heavily fragmented we may not be able to make
 7851	 * big contiguous allocations, so instead of doing the expensive search
 7852	 * for free space, simply return ENOSPC with our max_extent_size so we
 7853	 * can go ahead and search for a more manageable chunk.
 7854	 *
 7855	 * If our max_extent_size is large enough for our allocation simply
 7856	 * disable clustering since we will likely not be able to find enough
 7857	 * space to create a cluster and induce latency trying.
 7858	 */
 7859	if (unlikely(space_info->max_extent_size)) {
 7860		spin_lock(&space_info->lock);
 7861		if (space_info->max_extent_size &&
 7862		    num_bytes > space_info->max_extent_size) {
 7863			ins->offset = space_info->max_extent_size;
 7864			spin_unlock(&space_info->lock);
 7865			return -ENOSPC;
 7866		} else if (space_info->max_extent_size) {
 7867			use_cluster = false;
 7868		}
 7869		spin_unlock(&space_info->lock);
 7870	}
 7871
 7872	last_ptr = fetch_cluster_info(fs_info, space_info,
 7873				      &ffe_ctl.empty_cluster);
 7874	if (last_ptr) {
 7875		spin_lock(&last_ptr->lock);
 7876		if (last_ptr->block_group)
 7877			hint_byte = last_ptr->window_start;
 7878		if (last_ptr->fragmented) {
 7879			/*
 7880			 * We still set window_start so we can keep track of the
 7881			 * last place we found an allocation to try and save
 7882			 * some time.
 7883			 */
 7884			hint_byte = last_ptr->window_start;
 7885			use_cluster = false;
 7886		}
 7887		spin_unlock(&last_ptr->lock);
 7888	}
 7889
 7890	ffe_ctl.search_start = max(ffe_ctl.search_start,
 7891				   first_logical_byte(fs_info, 0));
 7892	ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
 7893	if (ffe_ctl.search_start == hint_byte) {
 7894		block_group = btrfs_lookup_block_group(fs_info,
 7895						       ffe_ctl.search_start);
 7896		/*
 7897		 * we don't want to use the block group if it doesn't match our
 7898		 * allocation bits, or if its not cached.
 7899		 *
 7900		 * However if we are re-searching with an ideal block group
 7901		 * picked out then we don't care that the block group is cached.
 7902		 */
 7903		if (block_group && block_group_bits(block_group, flags) &&
 7904		    block_group->cached != BTRFS_CACHE_NO) {
 7905			down_read(&space_info->groups_sem);
 7906			if (list_empty(&block_group->list) ||
 7907			    block_group->ro) {
 7908				/*
 7909				 * someone is removing this block group,
 7910				 * we can't jump into the have_block_group
 7911				 * target because our list pointers are not
 7912				 * valid
 7913				 */
 7914				btrfs_put_block_group(block_group);
 7915				up_read(&space_info->groups_sem);
 7916			} else {
 7917				ffe_ctl.index = btrfs_bg_flags_to_raid_index(
 7918						block_group->flags);
 7919				btrfs_lock_block_group(block_group, delalloc);
 7920				goto have_block_group;
 7921			}
 7922		} else if (block_group) {
 7923			btrfs_put_block_group(block_group);
 7924		}
 7925	}
 7926search:
 7927	ffe_ctl.have_caching_bg = false;
 7928	if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
 7929	    ffe_ctl.index == 0)
 7930		full_search = true;
 7931	down_read(&space_info->groups_sem);
 7932	list_for_each_entry(block_group,
 7933			    &space_info->block_groups[ffe_ctl.index], list) {
 7934		/* If the block group is read-only, we can skip it entirely. */
 7935		if (unlikely(block_group->ro))
 7936			continue;
 7937
 7938		btrfs_grab_block_group(block_group, delalloc);
 7939		ffe_ctl.search_start = block_group->key.objectid;
 7940
 7941		/*
 7942		 * this can happen if we end up cycling through all the
 7943		 * raid types, but we want to make sure we only allocate
 7944		 * for the proper type.
 7945		 */
 7946		if (!block_group_bits(block_group, flags)) {
 7947			u64 extra = BTRFS_BLOCK_GROUP_DUP |
 7948				BTRFS_BLOCK_GROUP_RAID1 |
 7949				BTRFS_BLOCK_GROUP_RAID5 |
 7950				BTRFS_BLOCK_GROUP_RAID6 |
 7951				BTRFS_BLOCK_GROUP_RAID10;
 7952
 7953			/*
 7954			 * if they asked for extra copies and this block group
 7955			 * doesn't provide them, bail.  This does allow us to
 7956			 * fill raid0 from raid1.
 7957			 */
 7958			if ((flags & extra) && !(block_group->flags & extra))
 7959				goto loop;
 7960		}
 7961
 7962have_block_group:
 7963		ffe_ctl.cached = block_group_cache_done(block_group);
 7964		if (unlikely(!ffe_ctl.cached)) {
 7965			ffe_ctl.have_caching_bg = true;
 7966			ret = cache_block_group(block_group, 0);
 7967			BUG_ON(ret < 0);
 7968			ret = 0;
 7969		}
 7970
 7971		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
 7972			goto loop;
 7973
 7974		/*
 7975		 * Ok we want to try and use the cluster allocator, so
 7976		 * lets look there
 7977		 */
 7978		if (last_ptr && use_cluster) {
 7979			struct btrfs_block_group_cache *cluster_bg = NULL;
 7980
 7981			ret = find_free_extent_clustered(block_group, last_ptr,
 7982							 &ffe_ctl, &cluster_bg);
 7983
 7984			if (ret == 0) {
 7985				if (cluster_bg && cluster_bg != block_group) {
 7986					btrfs_release_block_group(block_group,
 7987								  delalloc);
 7988					block_group = cluster_bg;
 7989				}
 7990				goto checks;
 7991			} else if (ret == -EAGAIN) {
 7992				goto have_block_group;
 7993			} else if (ret > 0) {
 7994				goto loop;
 7995			}
 7996			/* ret == -ENOENT case falls through */
 7997		}
 7998
 7999		ret = find_free_extent_unclustered(block_group, last_ptr,
 8000						   &ffe_ctl);
 8001		if (ret == -EAGAIN)
 8002			goto have_block_group;
 8003		else if (ret > 0)
 8004			goto loop;
 8005		/* ret == 0 case falls through */
 8006checks:
 8007		ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
 8008					     fs_info->stripesize);
 8009
 8010		/* move on to the next group */
 8011		if (ffe_ctl.search_start + num_bytes >
 8012		    block_group->key.objectid + block_group->key.offset) {
 8013			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
 8014					     num_bytes);
 8015			goto loop;
 8016		}
 8017
 8018		if (ffe_ctl.found_offset < ffe_ctl.search_start)
 8019			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
 8020				ffe_ctl.search_start - ffe_ctl.found_offset);
 8021
 8022		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
 8023				num_bytes, delalloc);
 8024		if (ret == -EAGAIN) {
 8025			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
 8026					     num_bytes);
 8027			goto loop;
 8028		}
 8029		btrfs_inc_block_group_reservations(block_group);
 8030
 8031		/* we are all good, lets return */
 8032		ins->objectid = ffe_ctl.search_start;
 8033		ins->offset = num_bytes;
 8034
 8035		trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
 8036					   num_bytes);
 8037		btrfs_release_block_group(block_group, delalloc);
 8038		break;
 8039loop:
 8040		ffe_ctl.retry_clustered = false;
 8041		ffe_ctl.retry_unclustered = false;
 8042		BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
 8043		       ffe_ctl.index);
 8044		btrfs_release_block_group(block_group, delalloc);
 8045		cond_resched();
 8046	}
 8047	up_read(&space_info->groups_sem);
 8048
 8049	ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
 8050					   full_search, use_cluster);
 8051	if (ret > 0)
 8052		goto search;
 8053
 8054	if (ret == -ENOSPC) {
 8055		/*
 8056		 * Use ffe_ctl->total_free_space as fallback if we can't find
 8057		 * any contiguous hole.
 8058		 */
 8059		if (!ffe_ctl.max_extent_size)
 8060			ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
 8061		spin_lock(&space_info->lock);
 8062		space_info->max_extent_size = ffe_ctl.max_extent_size;
 8063		spin_unlock(&space_info->lock);
 8064		ins->offset = ffe_ctl.max_extent_size;
 8065	}
 8066	return ret;
 8067}
 8068
 8069static void dump_space_info(struct btrfs_fs_info *fs_info,
 8070			    struct btrfs_space_info *info, u64 bytes,
 8071			    int dump_block_groups)
 8072{
 8073	struct btrfs_block_group_cache *cache;
 8074	int index = 0;
 8075
 8076	spin_lock(&info->lock);
 8077	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
 8078		   info->flags,
 8079		   info->total_bytes - btrfs_space_info_used(info, true),
 8080		   info->full ? "" : "not ");
 8081	btrfs_info(fs_info,
 8082		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
 8083		info->total_bytes, info->bytes_used, info->bytes_pinned,
 8084		info->bytes_reserved, info->bytes_may_use,
 8085		info->bytes_readonly);
 8086	spin_unlock(&info->lock);
 8087
 8088	if (!dump_block_groups)
 8089		return;
 8090
 8091	down_read(&info->groups_sem);
 8092again:
 8093	list_for_each_entry(cache, &info->block_groups[index], list) {
 8094		spin_lock(&cache->lock);
 8095		btrfs_info(fs_info,
 8096			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
 8097			cache->key.objectid, cache->key.offset,
 8098			btrfs_block_group_used(&cache->item), cache->pinned,
 8099			cache->reserved, cache->ro ? "[readonly]" : "");
 8100		btrfs_dump_free_space(cache, bytes);
 8101		spin_unlock(&cache->lock);
 8102	}
 8103	if (++index < BTRFS_NR_RAID_TYPES)
 8104		goto again;
 8105	up_read(&info->groups_sem);
 8106}
 8107
 8108/*
 8109 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
 8110 *			  hole that is at least as big as @num_bytes.
 8111 *
 8112 * @root           -	The root that will contain this extent
 8113 *
 8114 * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
 8115 *			is used for accounting purposes. This value differs
 8116 *			from @num_bytes only in the case of compressed extents.
 8117 *
 8118 * @num_bytes      -	Number of bytes to allocate on-disk.
 8119 *
 8120 * @min_alloc_size -	Indicates the minimum amount of space that the
 8121 *			allocator should try to satisfy. In some cases
 8122 *			@num_bytes may be larger than what is required and if
 8123 *			the filesystem is fragmented then allocation fails.
 8124 *			However, the presence of @min_alloc_size gives a
 8125 *			chance to try and satisfy the smaller allocation.
 8126 *
 8127 * @empty_size     -	A hint that you plan on doing more COW. This is the
 8128 *			size in bytes the allocator should try to find free
 8129 *			next to the block it returns.  This is just a hint and
 8130 *			may be ignored by the allocator.
 8131 *
 8132 * @hint_byte      -	Hint to the allocator to start searching above the byte
 8133 *			address passed. It might be ignored.
 8134 *
 8135 * @ins            -	This key is modified to record the found hole. It will
 8136 *			have the following values:
 8137 *			ins->objectid == start position
 8138 *			ins->flags = BTRFS_EXTENT_ITEM_KEY
 8139 *			ins->offset == the size of the hole.
 8140 *
 8141 * @is_data        -	Boolean flag indicating whether an extent is
 8142 *			allocated for data (true) or metadata (false)
 8143 *
 8144 * @delalloc       -	Boolean flag indicating whether this allocation is for
 8145 *			delalloc or not. If 'true' data_rwsem of block groups
 8146 *			is going to be acquired.
 8147 *
 8148 *
 8149 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
 8150 * case -ENOSPC is returned then @ins->offset will contain the size of the
 8151 * largest available hole the allocator managed to find.
 8152 */
 8153int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
 8154			 u64 num_bytes, u64 min_alloc_size,
 8155			 u64 empty_size, u64 hint_byte,
 8156			 struct btrfs_key *ins, int is_data, int delalloc)
 8157{
 8158	struct btrfs_fs_info *fs_info = root->fs_info;
 8159	bool final_tried = num_bytes == min_alloc_size;
 8160	u64 flags;
 8161	int ret;
 8162
 8163	flags = get_alloc_profile_by_root(root, is_data);
 8164again:
 8165	WARN_ON(num_bytes < fs_info->sectorsize);
 8166	ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
 8167			       hint_byte, ins, flags, delalloc);
 8168	if (!ret && !is_data) {
 8169		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
 8170	} else if (ret == -ENOSPC) {
 8171		if (!final_tried && ins->offset) {
 8172			num_bytes = min(num_bytes >> 1, ins->offset);
 8173			num_bytes = round_down(num_bytes,
 8174					       fs_info->sectorsize);
 8175			num_bytes = max(num_bytes, min_alloc_size);
 8176			ram_bytes = num_bytes;
 8177			if (num_bytes == min_alloc_size)
 8178				final_tried = true;
 8179			goto again;
 8180		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
 8181			struct btrfs_space_info *sinfo;
 8182
 8183			sinfo = __find_space_info(fs_info, flags);
 8184			btrfs_err(fs_info,
 8185				  "allocation failed flags %llu, wanted %llu",
 8186				  flags, num_bytes);
 8187			if (sinfo)
 8188				dump_space_info(fs_info, sinfo, num_bytes, 1);
 8189		}
 8190	}
 8191
 8192	return ret;
 8193}
 8194
 8195static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
 8196					u64 start, u64 len,
 8197					int pin, int delalloc)
 8198{
 8199	struct btrfs_block_group_cache *cache;
 8200	int ret = 0;
 8201
 8202	cache = btrfs_lookup_block_group(fs_info, start);
 8203	if (!cache) {
 8204		btrfs_err(fs_info, "Unable to find block group for %llu",
 8205			  start);
 8206		return -ENOSPC;
 8207	}
 8208
 8209	if (pin)
 8210		pin_down_extent(fs_info, cache, start, len, 1);
 8211	else {
 8212		if (btrfs_test_opt(fs_info, DISCARD))
 8213			ret = btrfs_discard_extent(fs_info, start, len, NULL);
 8214		btrfs_add_free_space(cache, start, len);
 8215		btrfs_free_reserved_bytes(cache, len, delalloc);
 8216		trace_btrfs_reserved_extent_free(fs_info, start, len);
 8217	}
 8218
 8219	btrfs_put_block_group(cache);
 8220	return ret;
 8221}
 8222
 8223int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
 8224			       u64 start, u64 len, int delalloc)
 8225{
 8226	return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
 8227}
 8228
 8229int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
 8230				       u64 start, u64 len)
 8231{
 8232	return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
 8233}
 8234
 8235static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 8236				      u64 parent, u64 root_objectid,
 8237				      u64 flags, u64 owner, u64 offset,
 8238				      struct btrfs_key *ins, int ref_mod)
 8239{
 8240	struct btrfs_fs_info *fs_info = trans->fs_info;
 8241	int ret;
 8242	struct btrfs_extent_item *extent_item;
 8243	struct btrfs_extent_inline_ref *iref;
 8244	struct btrfs_path *path;
 8245	struct extent_buffer *leaf;
 8246	int type;
 8247	u32 size;
 8248
 8249	if (parent > 0)
 8250		type = BTRFS_SHARED_DATA_REF_KEY;
 8251	else
 8252		type = BTRFS_EXTENT_DATA_REF_KEY;
 8253
 8254	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
 8255
 8256	path = btrfs_alloc_path();
 8257	if (!path)
 8258		return -ENOMEM;
 8259
 8260	path->leave_spinning = 1;
 8261	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
 8262				      ins, size);
 8263	if (ret) {
 8264		btrfs_free_path(path);
 8265		return ret;
 8266	}
 8267
 8268	leaf = path->nodes[0];
 8269	extent_item = btrfs_item_ptr(leaf, path->slots[0],
 8270				     struct btrfs_extent_item);
 8271	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
 8272	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
 8273	btrfs_set_extent_flags(leaf, extent_item,
 8274			       flags | BTRFS_EXTENT_FLAG_DATA);
 8275
 8276	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
 8277	btrfs_set_extent_inline_ref_type(leaf, iref, type);
 8278	if (parent > 0) {
 8279		struct btrfs_shared_data_ref *ref;
 8280		ref = (struct btrfs_shared_data_ref *)(iref + 1);
 8281		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
 8282		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
 8283	} else {
 8284		struct btrfs_extent_data_ref *ref;
 8285		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
 8286		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
 8287		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
 8288		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
 8289		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
 8290	}
 8291
 8292	btrfs_mark_buffer_dirty(path->nodes[0]);
 8293	btrfs_free_path(path);
 8294
 8295	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
 8296	if (ret)
 8297		return ret;
 8298
 8299	ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
 8300	if (ret) { /* -ENOENT, logic error */
 8301		btrfs_err(fs_info, "update block group failed for %llu %llu",
 8302			ins->objectid, ins->offset);
 8303		BUG();
 8304	}
 8305	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
 8306	return ret;
 8307}
 8308
 8309static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 8310				     struct btrfs_delayed_ref_node *node,
 8311				     struct btrfs_delayed_extent_op *extent_op)
 8312{
 8313	struct btrfs_fs_info *fs_info = trans->fs_info;
 8314	int ret;
 8315	struct btrfs_extent_item *extent_item;
 8316	struct btrfs_key extent_key;
 8317	struct btrfs_tree_block_info *block_info;
 8318	struct btrfs_extent_inline_ref *iref;
 8319	struct btrfs_path *path;
 8320	struct extent_buffer *leaf;
 8321	struct btrfs_delayed_tree_ref *ref;
 8322	u32 size = sizeof(*extent_item) + sizeof(*iref);
 8323	u64 num_bytes;
 8324	u64 flags = extent_op->flags_to_set;
 8325	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
 8326
 8327	ref = btrfs_delayed_node_to_tree_ref(node);
 8328
 8329	extent_key.objectid = node->bytenr;
 8330	if (skinny_metadata) {
 8331		extent_key.offset = ref->level;
 8332		extent_key.type = BTRFS_METADATA_ITEM_KEY;
 8333		num_bytes = fs_info->nodesize;
 8334	} else {
 8335		extent_key.offset = node->num_bytes;
 8336		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
 8337		size += sizeof(*block_info);
 8338		num_bytes = node->num_bytes;
 8339	}
 8340
 8341	path = btrfs_alloc_path();
 8342	if (!path)
 8343		return -ENOMEM;
 8344
 8345	path->leave_spinning = 1;
 8346	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
 8347				      &extent_key, size);
 8348	if (ret) {
 8349		btrfs_free_path(path);
 8350		return ret;
 8351	}
 8352
 8353	leaf = path->nodes[0];
 8354	extent_item = btrfs_item_ptr(leaf, path->slots[0],
 8355				     struct btrfs_extent_item);
 8356	btrfs_set_extent_refs(leaf, extent_item, 1);
 8357	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
 8358	btrfs_set_extent_flags(leaf, extent_item,
 8359			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
 8360
 8361	if (skinny_metadata) {
 8362		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
 8363	} else {
 8364		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
 8365		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
 8366		btrfs_set_tree_block_level(leaf, block_info, ref->level);
 8367		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
 8368	}
 8369
 8370	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
 8371		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
 8372		btrfs_set_extent_inline_ref_type(leaf, iref,
 8373						 BTRFS_SHARED_BLOCK_REF_KEY);
 8374		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
 8375	} else {
 8376		btrfs_set_extent_inline_ref_type(leaf, iref,
 8377						 BTRFS_TREE_BLOCK_REF_KEY);
 8378		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
 8379	}
 8380
 8381	btrfs_mark_buffer_dirty(leaf);
 8382	btrfs_free_path(path);
 8383
 8384	ret = remove_from_free_space_tree(trans, extent_key.objectid,
 8385					  num_bytes);
 8386	if (ret)
 8387		return ret;
 8388
 8389	ret = update_block_group(trans, fs_info, extent_key.objectid,
 8390				 fs_info->nodesize, 1);
 8391	if (ret) { /* -ENOENT, logic error */
 8392		btrfs_err(fs_info, "update block group failed for %llu %llu",
 8393			extent_key.objectid, extent_key.offset);
 8394		BUG();
 8395	}
 8396
 8397	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
 8398					  fs_info->nodesize);
 8399	return ret;
 8400}
 8401
 8402int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 8403				     struct btrfs_root *root, u64 owner,
 8404				     u64 offset, u64 ram_bytes,
 8405				     struct btrfs_key *ins)
 8406{
 8407	int ret;
 8408
 8409	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
 8410
 8411	btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
 8412			   root->root_key.objectid, owner, offset,
 8413			   BTRFS_ADD_DELAYED_EXTENT);
 8414
 8415	ret = btrfs_add_delayed_data_ref(trans, ins->objectid,
 8416					 ins->offset, 0,
 8417					 root->root_key.objectid, owner,
 8418					 offset, ram_bytes,
 8419					 BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
 8420	return ret;
 8421}
 8422
 8423/*
 8424 * this is used by the tree logging recovery code.  It records that
 8425 * an extent has been allocated and makes sure to clear the free
 8426 * space cache bits as well
 8427 */
 8428int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 8429				   u64 root_objectid, u64 owner, u64 offset,
 8430				   struct btrfs_key *ins)
 8431{
 8432	struct btrfs_fs_info *fs_info = trans->fs_info;
 8433	int ret;
 8434	struct btrfs_block_group_cache *block_group;
 8435	struct btrfs_space_info *space_info;
 8436
 8437	/*
 8438	 * Mixed block groups will exclude before processing the log so we only
 8439	 * need to do the exclude dance if this fs isn't mixed.
 8440	 */
 8441	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 8442		ret = __exclude_logged_extent(fs_info, ins->objectid,
 8443					      ins->offset);
 8444		if (ret)
 8445			return ret;
 8446	}
 8447
 8448	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
 8449	if (!block_group)
 8450		return -EINVAL;
 8451
 8452	space_info = block_group->space_info;
 8453	spin_lock(&space_info->lock);
 8454	spin_lock(&block_group->lock);
 8455	space_info->bytes_reserved += ins->offset;
 8456	block_group->reserved += ins->offset;
 8457	spin_unlock(&block_group->lock);
 8458	spin_unlock(&space_info->lock);
 8459
 8460	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
 8461					 offset, ins, 1);
 8462	btrfs_put_block_group(block_group);
 8463	return ret;
 8464}
 8465
 8466static struct extent_buffer *
 8467btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 8468		      u64 bytenr, int level, u64 owner)
 8469{
 8470	struct btrfs_fs_info *fs_info = root->fs_info;
 8471	struct extent_buffer *buf;
 8472
 8473	buf = btrfs_find_create_tree_block(fs_info, bytenr);
 8474	if (IS_ERR(buf))
 8475		return buf;
 8476
 8477	/*
 8478	 * Extra safety check in case the extent tree is corrupted and extent
 8479	 * allocator chooses to use a tree block which is already used and
 8480	 * locked.
 8481	 */
 8482	if (buf->lock_owner == current->pid) {
 8483		btrfs_err_rl(fs_info,
 8484"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
 8485			buf->start, btrfs_header_owner(buf), current->pid);
 8486		free_extent_buffer(buf);
 8487		return ERR_PTR(-EUCLEAN);
 8488	}
 8489
 8490	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
 8491	btrfs_tree_lock(buf);
 8492	clean_tree_block(fs_info, buf);
 8493	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 8494
 8495	btrfs_set_lock_blocking(buf);
 8496	set_extent_buffer_uptodate(buf);
 8497
 8498	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
 8499	btrfs_set_header_level(buf, level);
 8500	btrfs_set_header_bytenr(buf, buf->start);
 8501	btrfs_set_header_generation(buf, trans->transid);
 8502	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
 8503	btrfs_set_header_owner(buf, owner);
 8504	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
 8505	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
 8506	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
 8507		buf->log_index = root->log_transid % 2;
 8508		/*
 8509		 * we allow two log transactions at a time, use different
 8510		 * EXTENT bit to differentiate dirty pages.
 8511		 */
 8512		if (buf->log_index == 0)
 8513			set_extent_dirty(&root->dirty_log_pages, buf->start,
 8514					buf->start + buf->len - 1, GFP_NOFS);
 8515		else
 8516			set_extent_new(&root->dirty_log_pages, buf->start,
 8517					buf->start + buf->len - 1);
 8518	} else {
 8519		buf->log_index = -1;
 8520		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 8521			 buf->start + buf->len - 1, GFP_NOFS);
 8522	}
 8523	trans->dirty = true;
 8524	/* this returns a buffer locked for blocking */
 8525	return buf;
 8526}
 8527
 8528static struct btrfs_block_rsv *
 8529use_block_rsv(struct btrfs_trans_handle *trans,
 8530	      struct btrfs_root *root, u32 blocksize)
 8531{
 8532	struct btrfs_fs_info *fs_info = root->fs_info;
 8533	struct btrfs_block_rsv *block_rsv;
 8534	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 8535	int ret;
 8536	bool global_updated = false;
 8537
 8538	block_rsv = get_block_rsv(trans, root);
 8539
 8540	if (unlikely(block_rsv->size == 0))
 8541		goto try_reserve;
 8542again:
 8543	ret = block_rsv_use_bytes(block_rsv, blocksize);
 8544	if (!ret)
 8545		return block_rsv;
 8546
 8547	if (block_rsv->failfast)
 8548		return ERR_PTR(ret);
 8549
 8550	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
 8551		global_updated = true;
 8552		update_global_block_rsv(fs_info);
 8553		goto again;
 8554	}
 8555
 8556	/*
 8557	 * The global reserve still exists to save us from ourselves, so don't
 8558	 * warn_on if we are short on our delayed refs reserve.
 8559	 */
 8560	if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
 8561	    btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
 8562		static DEFINE_RATELIMIT_STATE(_rs,
 8563				DEFAULT_RATELIMIT_INTERVAL * 10,
 8564				/*DEFAULT_RATELIMIT_BURST*/ 1);
 8565		if (__ratelimit(&_rs))
 8566			WARN(1, KERN_DEBUG
 8567				"BTRFS: block rsv returned %d\n", ret);
 8568	}
 8569try_reserve:
 8570	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
 8571				     BTRFS_RESERVE_NO_FLUSH);
 8572	if (!ret)
 8573		return block_rsv;
 8574	/*
 8575	 * If we couldn't reserve metadata bytes try and use some from
 8576	 * the global reserve if its space type is the same as the global
 8577	 * reservation.
 8578	 */
 8579	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
 8580	    block_rsv->space_info == global_rsv->space_info) {
 8581		ret = block_rsv_use_bytes(global_rsv, blocksize);
 8582		if (!ret)
 8583			return global_rsv;
 8584	}
 8585	return ERR_PTR(ret);
 8586}
 8587
 8588static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
 8589			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
 8590{
 8591	block_rsv_add_bytes(block_rsv, blocksize, false);
 8592	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
 8593}
 8594
 8595/*
 8596 * finds a free extent and does all the dirty work required for allocation
 8597 * returns the tree buffer or an ERR_PTR on error.
 8598 */
 8599struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 8600					     struct btrfs_root *root,
 8601					     u64 parent, u64 root_objectid,
 8602					     const struct btrfs_disk_key *key,
 8603					     int level, u64 hint,
 8604					     u64 empty_size)
 8605{
 8606	struct btrfs_fs_info *fs_info = root->fs_info;
 8607	struct btrfs_key ins;
 8608	struct btrfs_block_rsv *block_rsv;
 8609	struct extent_buffer *buf;
 8610	struct btrfs_delayed_extent_op *extent_op;
 8611	u64 flags = 0;
 8612	int ret;
 8613	u32 blocksize = fs_info->nodesize;
 8614	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
 8615
 8616#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 8617	if (btrfs_is_testing(fs_info)) {
 8618		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
 8619					    level, root_objectid);
 8620		if (!IS_ERR(buf))
 8621			root->alloc_bytenr += blocksize;
 8622		return buf;
 8623	}
 8624#endif
 8625
 8626	block_rsv = use_block_rsv(trans, root, blocksize);
 8627	if (IS_ERR(block_rsv))
 8628		return ERR_CAST(block_rsv);
 8629
 8630	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
 8631				   empty_size, hint, &ins, 0, 0);
 8632	if (ret)
 8633		goto out_unuse;
 8634
 8635	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
 8636				    root_objectid);
 8637	if (IS_ERR(buf)) {
 8638		ret = PTR_ERR(buf);
 8639		goto out_free_reserved;
 8640	}
 8641
 8642	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
 8643		if (parent == 0)
 8644			parent = ins.objectid;
 8645		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
 8646	} else
 8647		BUG_ON(parent > 0);
 8648
 8649	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
 8650		extent_op = btrfs_alloc_delayed_extent_op();
 8651		if (!extent_op) {
 8652			ret = -ENOMEM;
 8653			goto out_free_buf;
 8654		}
 8655		if (key)
 8656			memcpy(&extent_op->key, key, sizeof(extent_op->key));
 8657		else
 8658			memset(&extent_op->key, 0, sizeof(extent_op->key));
 8659		extent_op->flags_to_set = flags;
 8660		extent_op->update_key = skinny_metadata ? false : true;
 8661		extent_op->update_flags = true;
 8662		extent_op->is_data = false;
 8663		extent_op->level = level;
 8664
 8665		btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
 8666				   root_objectid, level, 0,
 8667				   BTRFS_ADD_DELAYED_EXTENT);
 8668		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
 8669						 ins.offset, parent,
 8670						 root_objectid, level,
 8671						 BTRFS_ADD_DELAYED_EXTENT,
 8672						 extent_op, NULL, NULL);
 8673		if (ret)
 8674			goto out_free_delayed;
 8675	}
 8676	return buf;
 8677
 8678out_free_delayed:
 8679	btrfs_free_delayed_extent_op(extent_op);
 8680out_free_buf:
 8681	free_extent_buffer(buf);
 8682out_free_reserved:
 8683	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
 8684out_unuse:
 8685	unuse_block_rsv(fs_info, block_rsv, blocksize);
 8686	return ERR_PTR(ret);
 8687}
 8688
 8689struct walk_control {
 8690	u64 refs[BTRFS_MAX_LEVEL];
 8691	u64 flags[BTRFS_MAX_LEVEL];
 8692	struct btrfs_key update_progress;
 8693	int stage;
 8694	int level;
 8695	int shared_level;
 8696	int update_ref;
 8697	int keep_locks;
 8698	int reada_slot;
 8699	int reada_count;
 8700};
 8701
 8702#define DROP_REFERENCE	1
 8703#define UPDATE_BACKREF	2
 8704
 8705static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
 8706				     struct btrfs_root *root,
 8707				     struct walk_control *wc,
 8708				     struct btrfs_path *path)
 8709{
 8710	struct btrfs_fs_info *fs_info = root->fs_info;
 8711	u64 bytenr;
 8712	u64 generation;
 8713	u64 refs;
 8714	u64 flags;
 8715	u32 nritems;
 8716	struct btrfs_key key;
 8717	struct extent_buffer *eb;
 8718	int ret;
 8719	int slot;
 8720	int nread = 0;
 8721
 8722	if (path->slots[wc->level] < wc->reada_slot) {
 8723		wc->reada_count = wc->reada_count * 2 / 3;
 8724		wc->reada_count = max(wc->reada_count, 2);
 8725	} else {
 8726		wc->reada_count = wc->reada_count * 3 / 2;
 8727		wc->reada_count = min_t(int, wc->reada_count,
 8728					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
 8729	}
 8730
 8731	eb = path->nodes[wc->level];
 8732	nritems = btrfs_header_nritems(eb);
 8733
 8734	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
 8735		if (nread >= wc->reada_count)
 8736			break;
 8737
 8738		cond_resched();
 8739		bytenr = btrfs_node_blockptr(eb, slot);
 8740		generation = btrfs_node_ptr_generation(eb, slot);
 8741
 8742		if (slot == path->slots[wc->level])
 8743			goto reada;
 8744
 8745		if (wc->stage == UPDATE_BACKREF &&
 8746		    generation <= root->root_key.offset)
 8747			continue;
 8748
 8749		/* We don't lock the tree block, it's OK to be racy here */
 8750		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
 8751					       wc->level - 1, 1, &refs,
 8752					       &flags);
 8753		/* We don't care about errors in readahead. */
 8754		if (ret < 0)
 8755			continue;
 8756		BUG_ON(refs == 0);
 8757
 8758		if (wc->stage == DROP_REFERENCE) {
 8759			if (refs == 1)
 8760				goto reada;
 8761
 8762			if (wc->level == 1 &&
 8763			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
 8764				continue;
 8765			if (!wc->update_ref ||
 8766			    generation <= root->root_key.offset)
 8767				continue;
 8768			btrfs_node_key_to_cpu(eb, &key, slot);
 8769			ret = btrfs_comp_cpu_keys(&key,
 8770						  &wc->update_progress);
 8771			if (ret < 0)
 8772				continue;
 8773		} else {
 8774			if (wc->level == 1 &&
 8775			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
 8776				continue;
 8777		}
 8778reada:
 8779		readahead_tree_block(fs_info, bytenr);
 8780		nread++;
 8781	}
 8782	wc->reada_slot = slot;
 8783}
 8784
 8785/*
 8786 * helper to process tree block while walking down the tree.
 8787 *
 8788 * when wc->stage == UPDATE_BACKREF, this function updates
 8789 * back refs for pointers in the block.
 8790 *
 8791 * NOTE: return value 1 means we should stop walking down.
 8792 */
 8793static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 8794				   struct btrfs_root *root,
 8795				   struct btrfs_path *path,
 8796				   struct walk_control *wc, int lookup_info)
 8797{
 8798	struct btrfs_fs_info *fs_info = root->fs_info;
 8799	int level = wc->level;
 8800	struct extent_buffer *eb = path->nodes[level];
 8801	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 8802	int ret;
 8803
 8804	if (wc->stage == UPDATE_BACKREF &&
 8805	    btrfs_header_owner(eb) != root->root_key.objectid)
 8806		return 1;
 8807
 8808	/*
 8809	 * when reference count of tree block is 1, it won't increase
 8810	 * again. once full backref flag is set, we never clear it.
 8811	 */
 8812	if (lookup_info &&
 8813	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
 8814	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
 8815		BUG_ON(!path->locks[level]);
 8816		ret = btrfs_lookup_extent_info(trans, fs_info,
 8817					       eb->start, level, 1,
 8818					       &wc->refs[level],
 8819					       &wc->flags[level]);
 8820		BUG_ON(ret == -ENOMEM);
 8821		if (ret)
 8822			return ret;
 8823		BUG_ON(wc->refs[level] == 0);
 8824	}
 8825
 8826	if (wc->stage == DROP_REFERENCE) {
 8827		if (wc->refs[level] > 1)
 8828			return 1;
 8829
 8830		if (path->locks[level] && !wc->keep_locks) {
 8831			btrfs_tree_unlock_rw(eb, path->locks[level]);
 8832			path->locks[level] = 0;
 8833		}
 8834		return 0;
 8835	}
 8836
 8837	/* wc->stage == UPDATE_BACKREF */
 8838	if (!(wc->flags[level] & flag)) {
 8839		BUG_ON(!path->locks[level]);
 8840		ret = btrfs_inc_ref(trans, root, eb, 1);
 8841		BUG_ON(ret); /* -ENOMEM */
 8842		ret = btrfs_dec_ref(trans, root, eb, 0);
 8843		BUG_ON(ret); /* -ENOMEM */
 8844		ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
 8845						  eb->len, flag,
 8846						  btrfs_header_level(eb), 0);
 8847		BUG_ON(ret); /* -ENOMEM */
 8848		wc->flags[level] |= flag;
 8849	}
 8850
 8851	/*
 8852	 * the block is shared by multiple trees, so it's not good to
 8853	 * keep the tree lock
 8854	 */
 8855	if (path->locks[level] && level > 0) {
 8856		btrfs_tree_unlock_rw(eb, path->locks[level]);
 8857		path->locks[level] = 0;
 8858	}
 8859	return 0;
 8860}
 8861
 8862/*
 8863 * helper to process tree block pointer.
 8864 *
 8865 * when wc->stage == DROP_REFERENCE, this function checks
 8866 * reference count of the block pointed to. if the block
 8867 * is shared and we need update back refs for the subtree
 8868 * rooted at the block, this function changes wc->stage to
 8869 * UPDATE_BACKREF. if the block is shared and there is no
 8870 * need to update back, this function drops the reference
 8871 * to the block.
 8872 *
 8873 * NOTE: return value 1 means we should stop walking down.
 8874 */
 8875static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 8876				 struct btrfs_root *root,
 8877				 struct btrfs_path *path,
 8878				 struct walk_control *wc, int *lookup_info)
 8879{
 8880	struct btrfs_fs_info *fs_info = root->fs_info;
 8881	u64 bytenr;
 8882	u64 generation;
 8883	u64 parent;
 8884	struct btrfs_key key;
 8885	struct btrfs_key first_key;
 8886	struct extent_buffer *next;
 8887	int level = wc->level;
 8888	int reada = 0;
 8889	int ret = 0;
 8890	bool need_account = false;
 8891
 8892	generation = btrfs_node_ptr_generation(path->nodes[level],
 8893					       path->slots[level]);
 8894	/*
 8895	 * if the lower level block was created before the snapshot
 8896	 * was created, we know there is no need to update back refs
 8897	 * for the subtree
 8898	 */
 8899	if (wc->stage == UPDATE_BACKREF &&
 8900	    generation <= root->root_key.offset) {
 8901		*lookup_info = 1;
 8902		return 1;
 8903	}
 8904
 8905	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
 8906	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
 8907			      path->slots[level]);
 8908
 8909	next = find_extent_buffer(fs_info, bytenr);
 8910	if (!next) {
 8911		next = btrfs_find_create_tree_block(fs_info, bytenr);
 8912		if (IS_ERR(next))
 8913			return PTR_ERR(next);
 8914
 8915		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
 8916					       level - 1);
 8917		reada = 1;
 8918	}
 8919	btrfs_tree_lock(next);
 8920	btrfs_set_lock_blocking(next);
 8921
 8922	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
 8923				       &wc->refs[level - 1],
 8924				       &wc->flags[level - 1]);
 8925	if (ret < 0)
 8926		goto out_unlock;
 8927
 8928	if (unlikely(wc->refs[level - 1] == 0)) {
 8929		btrfs_err(fs_info, "Missing references.");
 8930		ret = -EIO;
 8931		goto out_unlock;
 8932	}
 8933	*lookup_info = 0;
 8934
 8935	if (wc->stage == DROP_REFERENCE) {
 8936		if (wc->refs[level - 1] > 1) {
 8937			need_account = true;
 8938			if (level == 1 &&
 8939			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
 8940				goto skip;
 8941
 8942			if (!wc->update_ref ||
 8943			    generation <= root->root_key.offset)
 8944				goto skip;
 8945
 8946			btrfs_node_key_to_cpu(path->nodes[level], &key,
 8947					      path->slots[level]);
 8948			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
 8949			if (ret < 0)
 8950				goto skip;
 8951
 8952			wc->stage = UPDATE_BACKREF;
 8953			wc->shared_level = level - 1;
 8954		}
 8955	} else {
 8956		if (level == 1 &&
 8957		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
 8958			goto skip;
 8959	}
 8960
 8961	if (!btrfs_buffer_uptodate(next, generation, 0)) {
 8962		btrfs_tree_unlock(next);
 8963		free_extent_buffer(next);
 8964		next = NULL;
 8965		*lookup_info = 1;
 8966	}
 8967
 8968	if (!next) {
 8969		if (reada && level == 1)
 8970			reada_walk_down(trans, root, wc, path);
 8971		next = read_tree_block(fs_info, bytenr, generation, level - 1,
 8972				       &first_key);
 8973		if (IS_ERR(next)) {
 8974			return PTR_ERR(next);
 8975		} else if (!extent_buffer_uptodate(next)) {
 8976			free_extent_buffer(next);
 8977			return -EIO;
 8978		}
 8979		btrfs_tree_lock(next);
 8980		btrfs_set_lock_blocking(next);
 8981	}
 8982
 8983	level--;
 8984	ASSERT(level == btrfs_header_level(next));
 8985	if (level != btrfs_header_level(next)) {
 8986		btrfs_err(root->fs_info, "mismatched level");
 8987		ret = -EIO;
 8988		goto out_unlock;
 8989	}
 8990	path->nodes[level] = next;
 8991	path->slots[level] = 0;
 8992	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 8993	wc->level = level;
 8994	if (wc->level == 1)
 8995		wc->reada_slot = 0;
 8996	return 0;
 8997skip:
 8998	wc->refs[level - 1] = 0;
 8999	wc->flags[level - 1] = 0;
 9000	if (wc->stage == DROP_REFERENCE) {
 9001		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
 9002			parent = path->nodes[level]->start;
 9003		} else {
 9004			ASSERT(root->root_key.objectid ==
 9005			       btrfs_header_owner(path->nodes[level]));
 9006			if (root->root_key.objectid !=
 9007			    btrfs_header_owner(path->nodes[level])) {
 9008				btrfs_err(root->fs_info,
 9009						"mismatched block owner");
 9010				ret = -EIO;
 9011				goto out_unlock;
 9012			}
 9013			parent = 0;
 9014		}
 9015
 9016		/*
 9017		 * Reloc tree doesn't contribute to qgroup numbers, and we have
 9018		 * already accounted them at merge time (replace_path),
 9019		 * thus we could skip expensive subtree trace here.
 9020		 */
 9021		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
 9022		    need_account) {
 9023			ret = btrfs_qgroup_trace_subtree(trans, next,
 9024							 generation, level - 1);
 9025			if (ret) {
 9026				btrfs_err_rl(fs_info,
 9027					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
 9028					     ret);
 9029			}
 9030		}
 9031		ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize,
 9032					parent, root->root_key.objectid,
 9033					level - 1, 0);
 9034		if (ret)
 9035			goto out_unlock;
 9036	}
 9037
 9038	*lookup_info = 1;
 9039	ret = 1;
 9040
 9041out_unlock:
 9042	btrfs_tree_unlock(next);
 9043	free_extent_buffer(next);
 9044
 9045	return ret;
 9046}
 9047
 9048/*
 9049 * helper to process tree block while walking up the tree.
 9050 *
 9051 * when wc->stage == DROP_REFERENCE, this function drops
 9052 * reference count on the block.
 9053 *
 9054 * when wc->stage == UPDATE_BACKREF, this function changes
 9055 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 9056 * to UPDATE_BACKREF previously while processing the block.
 9057 *
 9058 * NOTE: return value 1 means we should stop walking up.
 9059 */
 9060static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 9061				 struct btrfs_root *root,
 9062				 struct btrfs_path *path,
 9063				 struct walk_control *wc)
 9064{
 9065	struct btrfs_fs_info *fs_info = root->fs_info;
 9066	int ret;
 9067	int level = wc->level;
 9068	struct extent_buffer *eb = path->nodes[level];
 9069	u64 parent = 0;
 9070
 9071	if (wc->stage == UPDATE_BACKREF) {
 9072		BUG_ON(wc->shared_level < level);
 9073		if (level < wc->shared_level)
 9074			goto out;
 9075
 9076		ret = find_next_key(path, level + 1, &wc->update_progress);
 9077		if (ret > 0)
 9078			wc->update_ref = 0;
 9079
 9080		wc->stage = DROP_REFERENCE;
 9081		wc->shared_level = -1;
 9082		path->slots[level] = 0;
 9083
 9084		/*
 9085		 * check reference count again if the block isn't locked.
 9086		 * we should start walking down the tree again if reference
 9087		 * count is one.
 9088		 */
 9089		if (!path->locks[level]) {
 9090			BUG_ON(level == 0);
 9091			btrfs_tree_lock(eb);
 9092			btrfs_set_lock_blocking(eb);
 9093			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 9094
 9095			ret = btrfs_lookup_extent_info(trans, fs_info,
 9096						       eb->start, level, 1,
 9097						       &wc->refs[level],
 9098						       &wc->flags[level]);
 9099			if (ret < 0) {
 9100				btrfs_tree_unlock_rw(eb, path->locks[level]);
 9101				path->locks[level] = 0;
 9102				return ret;
 9103			}
 9104			BUG_ON(wc->refs[level] == 0);
 9105			if (wc->refs[level] == 1) {
 9106				btrfs_tree_unlock_rw(eb, path->locks[level]);
 9107				path->locks[level] = 0;
 9108				return 1;
 9109			}
 9110		}
 9111	}
 9112
 9113	/* wc->stage == DROP_REFERENCE */
 9114	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
 9115
 9116	if (wc->refs[level] == 1) {
 9117		if (level == 0) {
 9118			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
 9119				ret = btrfs_dec_ref(trans, root, eb, 1);
 9120			else
 9121				ret = btrfs_dec_ref(trans, root, eb, 0);
 9122			BUG_ON(ret); /* -ENOMEM */
 9123			ret = btrfs_qgroup_trace_leaf_items(trans, eb);
 9124			if (ret) {
 9125				btrfs_err_rl(fs_info,
 9126					     "error %d accounting leaf items. Quota is out of sync, rescan required.",
 9127					     ret);
 9128			}
 9129		}
 9130		/* make block locked assertion in clean_tree_block happy */
 9131		if (!path->locks[level] &&
 9132		    btrfs_header_generation(eb) == trans->transid) {
 9133			btrfs_tree_lock(eb);
 9134			btrfs_set_lock_blocking(eb);
 9135			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 9136		}
 9137		clean_tree_block(fs_info, eb);
 9138	}
 9139
 9140	if (eb == root->node) {
 9141		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
 9142			parent = eb->start;
 9143		else if (root->root_key.objectid != btrfs_header_owner(eb))
 9144			goto owner_mismatch;
 9145	} else {
 9146		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
 9147			parent = path->nodes[level + 1]->start;
 9148		else if (root->root_key.objectid !=
 9149			 btrfs_header_owner(path->nodes[level + 1]))
 9150			goto owner_mismatch;
 9151	}
 9152
 9153	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
 9154out:
 9155	wc->refs[level] = 0;
 9156	wc->flags[level] = 0;
 9157	return 0;
 9158
 9159owner_mismatch:
 9160	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
 9161		     btrfs_header_owner(eb), root->root_key.objectid);
 9162	return -EUCLEAN;
 9163}
 9164
 9165static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 9166				   struct btrfs_root *root,
 9167				   struct btrfs_path *path,
 9168				   struct walk_control *wc)
 9169{
 9170	int level = wc->level;
 9171	int lookup_info = 1;
 9172	int ret;
 9173
 9174	while (level >= 0) {
 9175		ret = walk_down_proc(trans, root, path, wc, lookup_info);
 9176		if (ret > 0)
 9177			break;
 9178
 9179		if (level == 0)
 9180			break;
 9181
 9182		if (path->slots[level] >=
 9183		    btrfs_header_nritems(path->nodes[level]))
 9184			break;
 9185
 9186		ret = do_walk_down(trans, root, path, wc, &lookup_info);
 9187		if (ret > 0) {
 9188			path->slots[level]++;
 9189			continue;
 9190		} else if (ret < 0)
 9191			return ret;
 9192		level = wc->level;
 9193	}
 9194	return 0;
 9195}
 9196
 9197static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 9198				 struct btrfs_root *root,
 9199				 struct btrfs_path *path,
 9200				 struct walk_control *wc, int max_level)
 9201{
 9202	int level = wc->level;
 9203	int ret;
 9204
 9205	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
 9206	while (level < max_level && path->nodes[level]) {
 9207		wc->level = level;
 9208		if (path->slots[level] + 1 <
 9209		    btrfs_header_nritems(path->nodes[level])) {
 9210			path->slots[level]++;
 9211			return 0;
 9212		} else {
 9213			ret = walk_up_proc(trans, root, path, wc);
 9214			if (ret > 0)
 9215				return 0;
 9216			if (ret < 0)
 9217				return ret;
 9218
 9219			if (path->locks[level]) {
 9220				btrfs_tree_unlock_rw(path->nodes[level],
 9221						     path->locks[level]);
 9222				path->locks[level] = 0;
 9223			}
 9224			free_extent_buffer(path->nodes[level]);
 9225			path->nodes[level] = NULL;
 9226			level++;
 9227		}
 9228	}
 9229	return 1;
 9230}
 9231
 9232/*
 9233 * drop a subvolume tree.
 9234 *
 9235 * this function traverses the tree freeing any blocks that only
 9236 * referenced by the tree.
 9237 *
 9238 * when a shared tree block is found. this function decreases its
 9239 * reference count by one. if update_ref is true, this function
 9240 * also make sure backrefs for the shared block and all lower level
 9241 * blocks are properly updated.
 9242 *
 9243 * If called with for_reloc == 0, may exit early with -EAGAIN
 9244 */
 9245int btrfs_drop_snapshot(struct btrfs_root *root,
 9246			 struct btrfs_block_rsv *block_rsv, int update_ref,
 9247			 int for_reloc)
 9248{
 9249	struct btrfs_fs_info *fs_info = root->fs_info;
 9250	struct btrfs_path *path;
 9251	struct btrfs_trans_handle *trans;
 9252	struct btrfs_root *tree_root = fs_info->tree_root;
 9253	struct btrfs_root_item *root_item = &root->root_item;
 9254	struct walk_control *wc;
 9255	struct btrfs_key key;
 9256	int err = 0;
 9257	int ret;
 9258	int level;
 9259	bool root_dropped = false;
 9260
 9261	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
 9262
 9263	path = btrfs_alloc_path();
 9264	if (!path) {
 9265		err = -ENOMEM;
 9266		goto out;
 9267	}
 9268
 9269	wc = kzalloc(sizeof(*wc), GFP_NOFS);
 9270	if (!wc) {
 9271		btrfs_free_path(path);
 9272		err = -ENOMEM;
 9273		goto out;
 9274	}
 9275
 9276	trans = btrfs_start_transaction(tree_root, 0);
 9277	if (IS_ERR(trans)) {
 9278		err = PTR_ERR(trans);
 9279		goto out_free;
 9280	}
 9281
 9282	err = btrfs_run_delayed_items(trans);
 9283	if (err)
 9284		goto out_end_trans;
 9285
 9286	if (block_rsv)
 9287		trans->block_rsv = block_rsv;
 9288
 9289	/*
 9290	 * This will help us catch people modifying the fs tree while we're
 9291	 * dropping it.  It is unsafe to mess with the fs tree while it's being
 9292	 * dropped as we unlock the root node and parent nodes as we walk down
 9293	 * the tree, assuming nothing will change.  If something does change
 9294	 * then we'll have stale information and drop references to blocks we've
 9295	 * already dropped.
 9296	 */
 9297	set_bit(BTRFS_ROOT_DELETING, &root->state);
 9298	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
 9299		level = btrfs_header_level(root->node);
 9300		path->nodes[level] = btrfs_lock_root_node(root);
 9301		btrfs_set_lock_blocking(path->nodes[level]);
 9302		path->slots[level] = 0;
 9303		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 9304		memset(&wc->update_progress, 0,
 9305		       sizeof(wc->update_progress));
 9306	} else {
 9307		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
 9308		memcpy(&wc->update_progress, &key,
 9309		       sizeof(wc->update_progress));
 9310
 9311		level = root_item->drop_level;
 9312		BUG_ON(level == 0);
 9313		path->lowest_level = level;
 9314		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 9315		path->lowest_level = 0;
 9316		if (ret < 0) {
 9317			err = ret;
 9318			goto out_end_trans;
 9319		}
 9320		WARN_ON(ret > 0);
 9321
 9322		/*
 9323		 * unlock our path, this is safe because only this
 9324		 * function is allowed to delete this snapshot
 9325		 */
 9326		btrfs_unlock_up_safe(path, 0);
 9327
 9328		level = btrfs_header_level(root->node);
 9329		while (1) {
 9330			btrfs_tree_lock(path->nodes[level]);
 9331			btrfs_set_lock_blocking(path->nodes[level]);
 9332			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 9333
 9334			ret = btrfs_lookup_extent_info(trans, fs_info,
 9335						path->nodes[level]->start,
 9336						level, 1, &wc->refs[level],
 9337						&wc->flags[level]);
 9338			if (ret < 0) {
 9339				err = ret;
 9340				goto out_end_trans;
 9341			}
 9342			BUG_ON(wc->refs[level] == 0);
 9343
 9344			if (level == root_item->drop_level)
 9345				break;
 9346
 9347			btrfs_tree_unlock(path->nodes[level]);
 9348			path->locks[level] = 0;
 9349			WARN_ON(wc->refs[level] != 1);
 9350			level--;
 9351		}
 9352	}
 9353
 9354	wc->level = level;
 9355	wc->shared_level = -1;
 9356	wc->stage = DROP_REFERENCE;
 9357	wc->update_ref = update_ref;
 9358	wc->keep_locks = 0;
 9359	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
 9360
 9361	while (1) {
 9362
 9363		ret = walk_down_tree(trans, root, path, wc);
 9364		if (ret < 0) {
 9365			err = ret;
 9366			break;
 9367		}
 9368
 9369		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
 9370		if (ret < 0) {
 9371			err = ret;
 9372			break;
 9373		}
 9374
 9375		if (ret > 0) {
 9376			BUG_ON(wc->stage != DROP_REFERENCE);
 9377			break;
 9378		}
 9379
 9380		if (wc->stage == DROP_REFERENCE) {
 9381			level = wc->level;
 9382			btrfs_node_key(path->nodes[level],
 9383				       &root_item->drop_progress,
 9384				       path->slots[level]);
 9385			root_item->drop_level = level;
 9386		}
 9387
 9388		BUG_ON(wc->level == 0);
 9389		if (btrfs_should_end_transaction(trans) ||
 9390		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
 9391			ret = btrfs_update_root(trans, tree_root,
 9392						&root->root_key,
 9393						root_item);
 9394			if (ret) {
 9395				btrfs_abort_transaction(trans, ret);
 9396				err = ret;
 9397				goto out_end_trans;
 9398			}
 9399
 9400			btrfs_end_transaction_throttle(trans);
 9401			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
 9402				btrfs_debug(fs_info,
 9403					    "drop snapshot early exit");
 9404				err = -EAGAIN;
 9405				goto out_free;
 9406			}
 9407
 9408			trans = btrfs_start_transaction(tree_root, 0);
 9409			if (IS_ERR(trans)) {
 9410				err = PTR_ERR(trans);
 9411				goto out_free;
 9412			}
 9413			if (block_rsv)
 9414				trans->block_rsv = block_rsv;
 9415		}
 9416	}
 9417	btrfs_release_path(path);
 9418	if (err)
 9419		goto out_end_trans;
 9420
 9421	ret = btrfs_del_root(trans, &root->root_key);
 9422	if (ret) {
 9423		btrfs_abort_transaction(trans, ret);
 9424		err = ret;
 9425		goto out_end_trans;
 9426	}
 9427
 9428	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
 9429		ret = btrfs_find_root(tree_root, &root->root_key, path,
 9430				      NULL, NULL);
 9431		if (ret < 0) {
 9432			btrfs_abort_transaction(trans, ret);
 9433			err = ret;
 9434			goto out_end_trans;
 9435		} else if (ret > 0) {
 9436			/* if we fail to delete the orphan item this time
 9437			 * around, it'll get picked up the next time.
 9438			 *
 9439			 * The most common failure here is just -ENOENT.
 9440			 */
 9441			btrfs_del_orphan_item(trans, tree_root,
 9442					      root->root_key.objectid);
 9443		}
 9444	}
 9445
 9446	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
 9447		btrfs_add_dropped_root(trans, root);
 9448	} else {
 9449		free_extent_buffer(root->node);
 9450		free_extent_buffer(root->commit_root);
 9451		btrfs_put_fs_root(root);
 9452	}
 9453	root_dropped = true;
 9454out_end_trans:
 9455	btrfs_end_transaction_throttle(trans);
 9456out_free:
 9457	kfree(wc);
 9458	btrfs_free_path(path);
 9459out:
 9460	/*
 9461	 * So if we need to stop dropping the snapshot for whatever reason we
 9462	 * need to make sure to add it back to the dead root list so that we
 9463	 * keep trying to do the work later.  This also cleans up roots if we
 9464	 * don't have it in the radix (like when we recover after a power fail
 9465	 * or unmount) so we don't leak memory.
 9466	 */
 9467	if (!for_reloc && !root_dropped)
 9468		btrfs_add_dead_root(root);
 9469	if (err && err != -EAGAIN)
 9470		btrfs_handle_fs_error(fs_info, err, NULL);
 9471	return err;
 9472}
 9473
 9474/*
 9475 * drop subtree rooted at tree block 'node'.
 9476 *
 9477 * NOTE: this function will unlock and release tree block 'node'
 9478 * only used by relocation code
 9479 */
 9480int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 9481			struct btrfs_root *root,
 9482			struct extent_buffer *node,
 9483			struct extent_buffer *parent)
 9484{
 9485	struct btrfs_fs_info *fs_info = root->fs_info;
 9486	struct btrfs_path *path;
 9487	struct walk_control *wc;
 9488	int level;
 9489	int parent_level;
 9490	int ret = 0;
 9491	int wret;
 9492
 9493	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
 9494
 9495	path = btrfs_alloc_path();
 9496	if (!path)
 9497		return -ENOMEM;
 9498
 9499	wc = kzalloc(sizeof(*wc), GFP_NOFS);
 9500	if (!wc) {
 9501		btrfs_free_path(path);
 9502		return -ENOMEM;
 9503	}
 9504
 9505	btrfs_assert_tree_locked(parent);
 9506	parent_level = btrfs_header_level(parent);
 9507	extent_buffer_get(parent);
 9508	path->nodes[parent_level] = parent;
 9509	path->slots[parent_level] = btrfs_header_nritems(parent);
 9510
 9511	btrfs_assert_tree_locked(node);
 9512	level = btrfs_header_level(node);
 9513	path->nodes[level] = node;
 9514	path->slots[level] = 0;
 9515	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 9516
 9517	wc->refs[parent_level] = 1;
 9518	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 9519	wc->level = level;
 9520	wc->shared_level = -1;
 9521	wc->stage = DROP_REFERENCE;
 9522	wc->update_ref = 0;
 9523	wc->keep_locks = 1;
 9524	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
 9525
 9526	while (1) {
 9527		wret = walk_down_tree(trans, root, path, wc);
 9528		if (wret < 0) {
 9529			ret = wret;
 9530			break;
 9531		}
 9532
 9533		wret = walk_up_tree(trans, root, path, wc, parent_level);
 9534		if (wret < 0)
 9535			ret = wret;
 9536		if (wret != 0)
 9537			break;
 9538	}
 9539
 9540	kfree(wc);
 9541	btrfs_free_path(path);
 9542	return ret;
 9543}
 9544
 9545static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
 9546{
 9547	u64 num_devices;
 9548	u64 stripped;
 9549
 9550	/*
 9551	 * if restripe for this chunk_type is on pick target profile and
 9552	 * return, otherwise do the usual balance
 9553	 */
 9554	stripped = get_restripe_target(fs_info, flags);
 9555	if (stripped)
 9556		return extended_to_chunk(stripped);
 9557
 9558	num_devices = fs_info->fs_devices->rw_devices;
 9559
 9560	stripped = BTRFS_BLOCK_GROUP_RAID0 |
 9561		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
 9562		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 9563
 9564	if (num_devices == 1) {
 9565		stripped |= BTRFS_BLOCK_GROUP_DUP;
 9566		stripped = flags & ~stripped;
 9567
 9568		/* turn raid0 into single device chunks */
 9569		if (flags & BTRFS_BLOCK_GROUP_RAID0)
 9570			return stripped;
 9571
 9572		/* turn mirroring into duplication */
 9573		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
 9574			     BTRFS_BLOCK_GROUP_RAID10))
 9575			return stripped | BTRFS_BLOCK_GROUP_DUP;
 9576	} else {
 9577		/* they already had raid on here, just return */
 9578		if (flags & stripped)
 9579			return flags;
 9580
 9581		stripped |= BTRFS_BLOCK_GROUP_DUP;
 9582		stripped = flags & ~stripped;
 9583
 9584		/* switch duplicated blocks with raid1 */
 9585		if (flags & BTRFS_BLOCK_GROUP_DUP)
 9586			return stripped | BTRFS_BLOCK_GROUP_RAID1;
 9587
 9588		/* this is drive concat, leave it alone */
 9589	}
 9590
 9591	return flags;
 9592}
 9593
 9594static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 9595{
 9596	struct btrfs_space_info *sinfo = cache->space_info;
 9597	u64 num_bytes;
 9598	u64 min_allocable_bytes;
 9599	int ret = -ENOSPC;
 9600
 9601	/*
 9602	 * We need some metadata space and system metadata space for
 9603	 * allocating chunks in some corner cases until we force to set
 9604	 * it to be readonly.
 9605	 */
 9606	if ((sinfo->flags &
 9607	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
 9608	    !force)
 9609		min_allocable_bytes = SZ_1M;
 9610	else
 9611		min_allocable_bytes = 0;
 9612
 9613	spin_lock(&sinfo->lock);
 9614	spin_lock(&cache->lock);
 9615
 9616	if (cache->ro) {
 9617		cache->ro++;
 9618		ret = 0;
 9619		goto out;
 9620	}
 9621
 9622	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
 9623		    cache->bytes_super - btrfs_block_group_used(&cache->item);
 9624
 9625	if (btrfs_space_info_used(sinfo, true) + num_bytes +
 9626	    min_allocable_bytes <= sinfo->total_bytes) {
 9627		sinfo->bytes_readonly += num_bytes;
 9628		cache->ro++;
 9629		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
 9630		ret = 0;
 9631	}
 9632out:
 9633	spin_unlock(&cache->lock);
 9634	spin_unlock(&sinfo->lock);
 9635	return ret;
 9636}
 9637
 9638int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
 9639
 9640{
 9641	struct btrfs_fs_info *fs_info = cache->fs_info;
 9642	struct btrfs_trans_handle *trans;
 9643	u64 alloc_flags;
 9644	int ret;
 9645
 9646again:
 9647	trans = btrfs_join_transaction(fs_info->extent_root);
 9648	if (IS_ERR(trans))
 9649		return PTR_ERR(trans);
 9650
 9651	/*
 9652	 * we're not allowed to set block groups readonly after the dirty
 9653	 * block groups cache has started writing.  If it already started,
 9654	 * back off and let this transaction commit
 9655	 */
 9656	mutex_lock(&fs_info->ro_block_group_mutex);
 9657	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
 9658		u64 transid = trans->transid;
 9659
 9660		mutex_unlock(&fs_info->ro_block_group_mutex);
 9661		btrfs_end_transaction(trans);
 9662
 9663		ret = btrfs_wait_for_commit(fs_info, transid);
 9664		if (ret)
 9665			return ret;
 9666		goto again;
 9667	}
 9668
 9669	/*
 9670	 * if we are changing raid levels, try to allocate a corresponding
 9671	 * block group with the new raid level.
 9672	 */
 9673	alloc_flags = update_block_group_flags(fs_info, cache->flags);
 9674	if (alloc_flags != cache->flags) {
 9675		ret = do_chunk_alloc(trans, alloc_flags,
 9676				     CHUNK_ALLOC_FORCE);
 9677		/*
 9678		 * ENOSPC is allowed here, we may have enough space
 9679		 * already allocated at the new raid level to
 9680		 * carry on
 9681		 */
 9682		if (ret == -ENOSPC)
 9683			ret = 0;
 9684		if (ret < 0)
 9685			goto out;
 9686	}
 9687
 9688	ret = inc_block_group_ro(cache, 0);
 9689	if (!ret)
 9690		goto out;
 9691	alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
 9692	ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
 9693	if (ret < 0)
 9694		goto out;
 9695	ret = inc_block_group_ro(cache, 0);
 9696out:
 9697	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
 9698		alloc_flags = update_block_group_flags(fs_info, cache->flags);
 9699		mutex_lock(&fs_info->chunk_mutex);
 9700		check_system_chunk(trans, alloc_flags);
 9701		mutex_unlock(&fs_info->chunk_mutex);
 9702	}
 9703	mutex_unlock(&fs_info->ro_block_group_mutex);
 9704
 9705	btrfs_end_transaction(trans);
 9706	return ret;
 9707}
 9708
 9709int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
 9710{
 9711	u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
 9712
 9713	return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
 9714}
 9715
 9716/*
 9717 * helper to account the unused space of all the readonly block group in the
 9718 * space_info. takes mirrors into account.
 9719 */
 9720u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
 9721{
 9722	struct btrfs_block_group_cache *block_group;
 9723	u64 free_bytes = 0;
 9724	int factor;
 9725
 9726	/* It's df, we don't care if it's racy */
 9727	if (list_empty(&sinfo->ro_bgs))
 9728		return 0;
 9729
 9730	spin_lock(&sinfo->lock);
 9731	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
 9732		spin_lock(&block_group->lock);
 9733
 9734		if (!block_group->ro) {
 9735			spin_unlock(&block_group->lock);
 9736			continue;
 9737		}
 9738
 9739		factor = btrfs_bg_type_to_factor(block_group->flags);
 9740		free_bytes += (block_group->key.offset -
 9741			       btrfs_block_group_used(&block_group->item)) *
 9742			       factor;
 9743
 9744		spin_unlock(&block_group->lock);
 9745	}
 9746	spin_unlock(&sinfo->lock);
 9747
 9748	return free_bytes;
 9749}
 9750
 9751void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
 9752{
 9753	struct btrfs_space_info *sinfo = cache->space_info;
 9754	u64 num_bytes;
 9755
 9756	BUG_ON(!cache->ro);
 9757
 9758	spin_lock(&sinfo->lock);
 9759	spin_lock(&cache->lock);
 9760	if (!--cache->ro) {
 9761		num_bytes = cache->key.offset - cache->reserved -
 9762			    cache->pinned - cache->bytes_super -
 9763			    btrfs_block_group_used(&cache->item);
 9764		sinfo->bytes_readonly -= num_bytes;
 9765		list_del_init(&cache->ro_list);
 9766	}
 9767	spin_unlock(&cache->lock);
 9768	spin_unlock(&sinfo->lock);
 9769}
 9770
 9771/*
 9772 * Checks to see if it's even possible to relocate this block group.
 9773 *
 9774 * @return - -1 if it's not a good idea to relocate this block group, 0 if its
 9775 * ok to go ahead and try.
 9776 */
 9777int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
 9778{
 9779	struct btrfs_root *root = fs_info->extent_root;
 9780	struct btrfs_block_group_cache *block_group;
 9781	struct btrfs_space_info *space_info;
 9782	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 9783	struct btrfs_device *device;
 9784	struct btrfs_trans_handle *trans;
 9785	u64 min_free;
 9786	u64 dev_min = 1;
 9787	u64 dev_nr = 0;
 9788	u64 target;
 9789	int debug;
 9790	int index;
 9791	int full = 0;
 9792	int ret = 0;
 9793
 9794	debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
 9795
 9796	block_group = btrfs_lookup_block_group(fs_info, bytenr);
 9797
 9798	/* odd, couldn't find the block group, leave it alone */
 9799	if (!block_group) {
 9800		if (debug)
 9801			btrfs_warn(fs_info,
 9802				   "can't find block group for bytenr %llu",
 9803				   bytenr);
 9804		return -1;
 9805	}
 9806
 9807	min_free = btrfs_block_group_used(&block_group->item);
 9808
 9809	/* no bytes used, we're good */
 9810	if (!min_free)
 9811		goto out;
 9812
 9813	space_info = block_group->space_info;
 9814	spin_lock(&space_info->lock);
 9815
 9816	full = space_info->full;
 9817
 9818	/*
 9819	 * if this is the last block group we have in this space, we can't
 9820	 * relocate it unless we're able to allocate a new chunk below.
 9821	 *
 9822	 * Otherwise, we need to make sure we have room in the space to handle
 9823	 * all of the extents from this block group.  If we can, we're good
 9824	 */
 9825	if ((space_info->total_bytes != block_group->key.offset) &&
 9826	    (btrfs_space_info_used(space_info, false) + min_free <
 9827	     space_info->total_bytes)) {
 9828		spin_unlock(&space_info->lock);
 9829		goto out;
 9830	}
 9831	spin_unlock(&space_info->lock);
 9832
 9833	/*
 9834	 * ok we don't have enough space, but maybe we have free space on our
 9835	 * devices to allocate new chunks for relocation, so loop through our
 9836	 * alloc devices and guess if we have enough space.  if this block
 9837	 * group is going to be restriped, run checks against the target
 9838	 * profile instead of the current one.
 9839	 */
 9840	ret = -1;
 9841
 9842	/*
 9843	 * index:
 9844	 *      0: raid10
 9845	 *      1: raid1
 9846	 *      2: dup
 9847	 *      3: raid0
 9848	 *      4: single
 9849	 */
 9850	target = get_restripe_target(fs_info, block_group->flags);
 9851	if (target) {
 9852		index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
 9853	} else {
 9854		/*
 9855		 * this is just a balance, so if we were marked as full
 9856		 * we know there is no space for a new chunk
 9857		 */
 9858		if (full) {
 9859			if (debug)
 9860				btrfs_warn(fs_info,
 9861					   "no space to alloc new chunk for block group %llu",
 9862					   block_group->key.objectid);
 9863			goto out;
 9864		}
 9865
 9866		index = btrfs_bg_flags_to_raid_index(block_group->flags);
 9867	}
 9868
 9869	if (index == BTRFS_RAID_RAID10) {
 9870		dev_min = 4;
 9871		/* Divide by 2 */
 9872		min_free >>= 1;
 9873	} else if (index == BTRFS_RAID_RAID1) {
 9874		dev_min = 2;
 9875	} else if (index == BTRFS_RAID_DUP) {
 9876		/* Multiply by 2 */
 9877		min_free <<= 1;
 9878	} else if (index == BTRFS_RAID_RAID0) {
 9879		dev_min = fs_devices->rw_devices;
 9880		min_free = div64_u64(min_free, dev_min);
 9881	}
 9882
 9883	/* We need to do this so that we can look at pending chunks */
 9884	trans = btrfs_join_transaction(root);
 9885	if (IS_ERR(trans)) {
 9886		ret = PTR_ERR(trans);
 9887		goto out;
 9888	}
 9889
 9890	mutex_lock(&fs_info->chunk_mutex);
 9891	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
 9892		u64 dev_offset;
 9893
 9894		/*
 9895		 * check to make sure we can actually find a chunk with enough
 9896		 * space to fit our block group in.
 9897		 */
 9898		if (device->total_bytes > device->bytes_used + min_free &&
 9899		    !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
 9900			ret = find_free_dev_extent(trans, device, min_free,
 9901						   &dev_offset, NULL);
 9902			if (!ret)
 9903				dev_nr++;
 9904
 9905			if (dev_nr >= dev_min)
 9906				break;
 9907
 9908			ret = -1;
 9909		}
 9910	}
 9911	if (debug && ret == -1)
 9912		btrfs_warn(fs_info,
 9913			   "no space to allocate a new chunk for block group %llu",
 9914			   block_group->key.objectid);
 9915	mutex_unlock(&fs_info->chunk_mutex);
 9916	btrfs_end_transaction(trans);
 9917out:
 9918	btrfs_put_block_group(block_group);
 9919	return ret;
 9920}
 9921
 9922static int find_first_block_group(struct btrfs_fs_info *fs_info,
 9923				  struct btrfs_path *path,
 9924				  struct btrfs_key *key)
 9925{
 9926	struct btrfs_root *root = fs_info->extent_root;
 9927	int ret = 0;
 9928	struct btrfs_key found_key;
 9929	struct extent_buffer *leaf;
 9930	struct btrfs_block_group_item bg;
 9931	u64 flags;
 9932	int slot;
 9933
 9934	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 9935	if (ret < 0)
 9936		goto out;
 9937
 9938	while (1) {
 9939		slot = path->slots[0];
 9940		leaf = path->nodes[0];
 9941		if (slot >= btrfs_header_nritems(leaf)) {
 9942			ret = btrfs_next_leaf(root, path);
 9943			if (ret == 0)
 9944				continue;
 9945			if (ret < 0)
 9946				goto out;
 9947			break;
 9948		}
 9949		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 9950
 9951		if (found_key.objectid >= key->objectid &&
 9952		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
 9953			struct extent_map_tree *em_tree;
 9954			struct extent_map *em;
 9955
 9956			em_tree = &root->fs_info->mapping_tree.map_tree;
 9957			read_lock(&em_tree->lock);
 9958			em = lookup_extent_mapping(em_tree, found_key.objectid,
 9959						   found_key.offset);
 9960			read_unlock(&em_tree->lock);
 9961			if (!em) {
 9962				btrfs_err(fs_info,
 9963			"logical %llu len %llu found bg but no related chunk",
 9964					  found_key.objectid, found_key.offset);
 9965				ret = -ENOENT;
 9966			} else if (em->start != found_key.objectid ||
 9967				   em->len != found_key.offset) {
 9968				btrfs_err(fs_info,
 9969		"block group %llu len %llu mismatch with chunk %llu len %llu",
 9970					  found_key.objectid, found_key.offset,
 9971					  em->start, em->len);
 9972				ret = -EUCLEAN;
 9973			} else {
 9974				read_extent_buffer(leaf, &bg,
 9975					btrfs_item_ptr_offset(leaf, slot),
 9976					sizeof(bg));
 9977				flags = btrfs_block_group_flags(&bg) &
 9978					BTRFS_BLOCK_GROUP_TYPE_MASK;
 9979
 9980				if (flags != (em->map_lookup->type &
 9981					      BTRFS_BLOCK_GROUP_TYPE_MASK)) {
 9982					btrfs_err(fs_info,
 9983"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
 9984						found_key.objectid,
 9985						found_key.offset, flags,
 9986						(BTRFS_BLOCK_GROUP_TYPE_MASK &
 9987						 em->map_lookup->type));
 9988					ret = -EUCLEAN;
 9989				} else {
 9990					ret = 0;
 9991				}
 9992			}
 9993			free_extent_map(em);
 9994			goto out;
 9995		}
 9996		path->slots[0]++;
 9997	}
 9998out:
 9999	return ret;
10000}
10001
10002void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
10003{
10004	struct btrfs_block_group_cache *block_group;
10005	u64 last = 0;
10006
10007	while (1) {
10008		struct inode *inode;
10009
10010		block_group = btrfs_lookup_first_block_group(info, last);
10011		while (block_group) {
10012			wait_block_group_cache_done(block_group);
10013			spin_lock(&block_group->lock);
10014			if (block_group->iref)
10015				break;
10016			spin_unlock(&block_group->lock);
10017			block_group = next_block_group(info, block_group);
10018		}
10019		if (!block_group) {
10020			if (last == 0)
10021				break;
10022			last = 0;
10023			continue;
10024		}
10025
10026		inode = block_group->inode;
10027		block_group->iref = 0;
10028		block_group->inode = NULL;
10029		spin_unlock(&block_group->lock);
10030		ASSERT(block_group->io_ctl.inode == NULL);
10031		iput(inode);
10032		last = block_group->key.objectid + block_group->key.offset;
10033		btrfs_put_block_group(block_group);
10034	}
10035}
10036
10037/*
10038 * Must be called only after stopping all workers, since we could have block
10039 * group caching kthreads running, and therefore they could race with us if we
10040 * freed the block groups before stopping them.
10041 */
10042int btrfs_free_block_groups(struct btrfs_fs_info *info)
10043{
10044	struct btrfs_block_group_cache *block_group;
10045	struct btrfs_space_info *space_info;
10046	struct btrfs_caching_control *caching_ctl;
10047	struct rb_node *n;
10048
10049	down_write(&info->commit_root_sem);
10050	while (!list_empty(&info->caching_block_groups)) {
10051		caching_ctl = list_entry(info->caching_block_groups.next,
10052					 struct btrfs_caching_control, list);
10053		list_del(&caching_ctl->list);
10054		put_caching_control(caching_ctl);
10055	}
10056	up_write(&info->commit_root_sem);
10057
10058	spin_lock(&info->unused_bgs_lock);
10059	while (!list_empty(&info->unused_bgs)) {
10060		block_group = list_first_entry(&info->unused_bgs,
10061					       struct btrfs_block_group_cache,
10062					       bg_list);
10063		list_del_init(&block_group->bg_list);
10064		btrfs_put_block_group(block_group);
10065	}
10066	spin_unlock(&info->unused_bgs_lock);
10067
10068	spin_lock(&info->block_group_cache_lock);
10069	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
10070		block_group = rb_entry(n, struct btrfs_block_group_cache,
10071				       cache_node);
10072		rb_erase(&block_group->cache_node,
10073			 &info->block_group_cache_tree);
10074		RB_CLEAR_NODE(&block_group->cache_node);
10075		spin_unlock(&info->block_group_cache_lock);
10076
10077		down_write(&block_group->space_info->groups_sem);
10078		list_del(&block_group->list);
10079		up_write(&block_group->space_info->groups_sem);
10080
10081		/*
10082		 * We haven't cached this block group, which means we could
10083		 * possibly have excluded extents on this block group.
10084		 */
10085		if (block_group->cached == BTRFS_CACHE_NO ||
10086		    block_group->cached == BTRFS_CACHE_ERROR)
10087			free_excluded_extents(block_group);
10088
10089		btrfs_remove_free_space_cache(block_group);
10090		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
10091		ASSERT(list_empty(&block_group->dirty_list));
10092		ASSERT(list_empty(&block_group->io_list));
10093		ASSERT(list_empty(&block_group->bg_list));
10094		ASSERT(atomic_read(&block_group->count) == 1);
10095		btrfs_put_block_group(block_group);
10096
10097		spin_lock(&info->block_group_cache_lock);
10098	}
10099	spin_unlock(&info->block_group_cache_lock);
10100
10101	/* now that all the block groups are freed, go through and
10102	 * free all the space_info structs.  This is only called during
10103	 * the final stages of unmount, and so we know nobody is
10104	 * using them.  We call synchronize_rcu() once before we start,
10105	 * just to be on the safe side.
10106	 */
10107	synchronize_rcu();
10108
10109	release_global_block_rsv(info);
10110
10111	while (!list_empty(&info->space_info)) {
10112		int i;
10113
10114		space_info = list_entry(info->space_info.next,
10115					struct btrfs_space_info,
10116					list);
10117
10118		/*
10119		 * Do not hide this behind enospc_debug, this is actually
10120		 * important and indicates a real bug if this happens.
10121		 */
10122		if (WARN_ON(space_info->bytes_pinned > 0 ||
10123			    space_info->bytes_reserved > 0 ||
10124			    space_info->bytes_may_use > 0))
10125			dump_space_info(info, space_info, 0, 0);
10126		list_del(&space_info->list);
10127		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
10128			struct kobject *kobj;
10129			kobj = space_info->block_group_kobjs[i];
10130			space_info->block_group_kobjs[i] = NULL;
10131			if (kobj) {
10132				kobject_del(kobj);
10133				kobject_put(kobj);
10134			}
10135		}
10136		kobject_del(&space_info->kobj);
10137		kobject_put(&space_info->kobj);
10138	}
10139	return 0;
10140}
10141
10142/* link_block_group will queue up kobjects to add when we're reclaim-safe */
10143void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
10144{
10145	struct btrfs_space_info *space_info;
10146	struct raid_kobject *rkobj;
10147	LIST_HEAD(list);
10148	int index;
10149	int ret = 0;
10150
10151	spin_lock(&fs_info->pending_raid_kobjs_lock);
10152	list_splice_init(&fs_info->pending_raid_kobjs, &list);
10153	spin_unlock(&fs_info->pending_raid_kobjs_lock);
10154
10155	list_for_each_entry(rkobj, &list, list) {
10156		space_info = __find_space_info(fs_info, rkobj->flags);
10157		index = btrfs_bg_flags_to_raid_index(rkobj->flags);
10158
10159		ret = kobject_add(&rkobj->kobj, &space_info->kobj,
10160				  "%s", get_raid_name(index));
10161		if (ret) {
10162			kobject_put(&rkobj->kobj);
10163			break;
10164		}
10165	}
10166	if (ret)
10167		btrfs_warn(fs_info,
10168			   "failed to add kobject for block cache, ignoring");
10169}
10170
10171static void link_block_group(struct btrfs_block_group_cache *cache)
10172{
10173	struct btrfs_space_info *space_info = cache->space_info;
10174	struct btrfs_fs_info *fs_info = cache->fs_info;
10175	int index = btrfs_bg_flags_to_raid_index(cache->flags);
10176	bool first = false;
10177
10178	down_write(&space_info->groups_sem);
10179	if (list_empty(&space_info->block_groups[index]))
10180		first = true;
10181	list_add_tail(&cache->list, &space_info->block_groups[index]);
10182	up_write(&space_info->groups_sem);
10183
10184	if (first) {
10185		struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
10186		if (!rkobj) {
10187			btrfs_warn(cache->fs_info,
10188				"couldn't alloc memory for raid level kobject");
10189			return;
10190		}
10191		rkobj->flags = cache->flags;
10192		kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
10193
10194		spin_lock(&fs_info->pending_raid_kobjs_lock);
10195		list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
10196		spin_unlock(&fs_info->pending_raid_kobjs_lock);
10197		space_info->block_group_kobjs[index] = &rkobj->kobj;
10198	}
10199}
10200
10201static struct btrfs_block_group_cache *
10202btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
10203			       u64 start, u64 size)
10204{
10205	struct btrfs_block_group_cache *cache;
10206
10207	cache = kzalloc(sizeof(*cache), GFP_NOFS);
10208	if (!cache)
10209		return NULL;
10210
10211	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
10212					GFP_NOFS);
10213	if (!cache->free_space_ctl) {
10214		kfree(cache);
10215		return NULL;
10216	}
10217
10218	cache->key.objectid = start;
10219	cache->key.offset = size;
10220	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10221
10222	cache->fs_info = fs_info;
10223	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
10224	set_free_space_tree_thresholds(cache);
10225
10226	atomic_set(&cache->count, 1);
10227	spin_lock_init(&cache->lock);
10228	init_rwsem(&cache->data_rwsem);
10229	INIT_LIST_HEAD(&cache->list);
10230	INIT_LIST_HEAD(&cache->cluster_list);
10231	INIT_LIST_HEAD(&cache->bg_list);
10232	INIT_LIST_HEAD(&cache->ro_list);
10233	INIT_LIST_HEAD(&cache->dirty_list);
10234	INIT_LIST_HEAD(&cache->io_list);
10235	btrfs_init_free_space_ctl(cache);
10236	atomic_set(&cache->trimming, 0);
10237	mutex_init(&cache->free_space_lock);
10238	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
10239
10240	return cache;
10241}
10242
10243
10244/*
10245 * Iterate all chunks and verify that each of them has the corresponding block
10246 * group
10247 */
10248static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
10249{
10250	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
10251	struct extent_map *em;
10252	struct btrfs_block_group_cache *bg;
10253	u64 start = 0;
10254	int ret = 0;
10255
10256	while (1) {
10257		read_lock(&map_tree->map_tree.lock);
10258		/*
10259		 * lookup_extent_mapping will return the first extent map
10260		 * intersecting the range, so setting @len to 1 is enough to
10261		 * get the first chunk.
10262		 */
10263		em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
10264		read_unlock(&map_tree->map_tree.lock);
10265		if (!em)
10266			break;
10267
10268		bg = btrfs_lookup_block_group(fs_info, em->start);
10269		if (!bg) {
10270			btrfs_err(fs_info,
10271	"chunk start=%llu len=%llu doesn't have corresponding block group",
10272				     em->start, em->len);
10273			ret = -EUCLEAN;
10274			free_extent_map(em);
10275			break;
10276		}
10277		if (bg->key.objectid != em->start ||
10278		    bg->key.offset != em->len ||
10279		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
10280		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
10281			btrfs_err(fs_info,
10282"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
10283				em->start, em->len,
10284				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
10285				bg->key.objectid, bg->key.offset,
10286				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
10287			ret = -EUCLEAN;
10288			free_extent_map(em);
10289			btrfs_put_block_group(bg);
10290			break;
10291		}
10292		start = em->start + em->len;
10293		free_extent_map(em);
10294		btrfs_put_block_group(bg);
10295	}
10296	return ret;
10297}
10298
10299int btrfs_read_block_groups(struct btrfs_fs_info *info)
10300{
10301	struct btrfs_path *path;
10302	int ret;
10303	struct btrfs_block_group_cache *cache;
10304	struct btrfs_space_info *space_info;
10305	struct btrfs_key key;
10306	struct btrfs_key found_key;
10307	struct extent_buffer *leaf;
10308	int need_clear = 0;
10309	u64 cache_gen;
10310	u64 feature;
10311	int mixed;
10312
10313	feature = btrfs_super_incompat_flags(info->super_copy);
10314	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
10315
10316	key.objectid = 0;
10317	key.offset = 0;
10318	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10319	path = btrfs_alloc_path();
10320	if (!path)
10321		return -ENOMEM;
10322	path->reada = READA_FORWARD;
10323
10324	cache_gen = btrfs_super_cache_generation(info->super_copy);
10325	if (btrfs_test_opt(info, SPACE_CACHE) &&
10326	    btrfs_super_generation(info->super_copy) != cache_gen)
10327		need_clear = 1;
10328	if (btrfs_test_opt(info, CLEAR_CACHE))
10329		need_clear = 1;
10330
10331	while (1) {
10332		ret = find_first_block_group(info, path, &key);
10333		if (ret > 0)
10334			break;
10335		if (ret != 0)
10336			goto error;
10337
10338		leaf = path->nodes[0];
10339		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10340
10341		cache = btrfs_create_block_group_cache(info, found_key.objectid,
10342						       found_key.offset);
10343		if (!cache) {
10344			ret = -ENOMEM;
10345			goto error;
10346		}
10347
10348		if (need_clear) {
10349			/*
10350			 * When we mount with old space cache, we need to
10351			 * set BTRFS_DC_CLEAR and set dirty flag.
10352			 *
10353			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10354			 *    truncate the old free space cache inode and
10355			 *    setup a new one.
10356			 * b) Setting 'dirty flag' makes sure that we flush
10357			 *    the new space cache info onto disk.
10358			 */
10359			if (btrfs_test_opt(info, SPACE_CACHE))
10360				cache->disk_cache_state = BTRFS_DC_CLEAR;
10361		}
10362
10363		read_extent_buffer(leaf, &cache->item,
10364				   btrfs_item_ptr_offset(leaf, path->slots[0]),
10365				   sizeof(cache->item));
10366		cache->flags = btrfs_block_group_flags(&cache->item);
10367		if (!mixed &&
10368		    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10369		    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10370			btrfs_err(info,
10371"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10372				  cache->key.objectid);
10373			ret = -EINVAL;
10374			goto error;
10375		}
10376
10377		key.objectid = found_key.objectid + found_key.offset;
10378		btrfs_release_path(path);
10379
10380		/*
10381		 * We need to exclude the super stripes now so that the space
10382		 * info has super bytes accounted for, otherwise we'll think
10383		 * we have more space than we actually do.
10384		 */
10385		ret = exclude_super_stripes(cache);
10386		if (ret) {
10387			/*
10388			 * We may have excluded something, so call this just in
10389			 * case.
10390			 */
10391			free_excluded_extents(cache);
10392			btrfs_put_block_group(cache);
10393			goto error;
10394		}
10395
10396		/*
10397		 * check for two cases, either we are full, and therefore
10398		 * don't need to bother with the caching work since we won't
10399		 * find any space, or we are empty, and we can just add all
10400		 * the space in and be done with it.  This saves us _a_lot_ of
10401		 * time, particularly in the full case.
10402		 */
10403		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10404			cache->last_byte_to_unpin = (u64)-1;
10405			cache->cached = BTRFS_CACHE_FINISHED;
10406			free_excluded_extents(cache);
10407		} else if (btrfs_block_group_used(&cache->item) == 0) {
10408			cache->last_byte_to_unpin = (u64)-1;
10409			cache->cached = BTRFS_CACHE_FINISHED;
10410			add_new_free_space(cache, found_key.objectid,
10411					   found_key.objectid +
10412					   found_key.offset);
10413			free_excluded_extents(cache);
10414		}
10415
10416		ret = btrfs_add_block_group_cache(info, cache);
10417		if (ret) {
10418			btrfs_remove_free_space_cache(cache);
10419			btrfs_put_block_group(cache);
10420			goto error;
10421		}
10422
10423		trace_btrfs_add_block_group(info, cache, 0);
10424		update_space_info(info, cache->flags, found_key.offset,
10425				  btrfs_block_group_used(&cache->item),
10426				  cache->bytes_super, &space_info);
10427
10428		cache->space_info = space_info;
10429
10430		link_block_group(cache);
10431
10432		set_avail_alloc_bits(info, cache->flags);
10433		if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10434			inc_block_group_ro(cache, 1);
10435		} else if (btrfs_block_group_used(&cache->item) == 0) {
10436			ASSERT(list_empty(&cache->bg_list));
10437			btrfs_mark_bg_unused(cache);
10438		}
10439	}
10440
10441	list_for_each_entry_rcu(space_info, &info->space_info, list) {
10442		if (!(get_alloc_profile(info, space_info->flags) &
10443		      (BTRFS_BLOCK_GROUP_RAID10 |
10444		       BTRFS_BLOCK_GROUP_RAID1 |
10445		       BTRFS_BLOCK_GROUP_RAID5 |
10446		       BTRFS_BLOCK_GROUP_RAID6 |
10447		       BTRFS_BLOCK_GROUP_DUP)))
10448			continue;
10449		/*
10450		 * avoid allocating from un-mirrored block group if there are
10451		 * mirrored block groups.
10452		 */
10453		list_for_each_entry(cache,
10454				&space_info->block_groups[BTRFS_RAID_RAID0],
10455				list)
10456			inc_block_group_ro(cache, 1);
10457		list_for_each_entry(cache,
10458				&space_info->block_groups[BTRFS_RAID_SINGLE],
10459				list)
10460			inc_block_group_ro(cache, 1);
10461	}
10462
10463	btrfs_add_raid_kobjects(info);
10464	init_global_block_rsv(info);
10465	ret = check_chunk_block_group_mappings(info);
10466error:
10467	btrfs_free_path(path);
10468	return ret;
10469}
10470
10471void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10472{
10473	struct btrfs_fs_info *fs_info = trans->fs_info;
10474	struct btrfs_block_group_cache *block_group;
10475	struct btrfs_root *extent_root = fs_info->extent_root;
10476	struct btrfs_block_group_item item;
10477	struct btrfs_key key;
10478	int ret = 0;
10479
10480	if (!trans->can_flush_pending_bgs)
10481		return;
10482
10483	while (!list_empty(&trans->new_bgs)) {
10484		block_group = list_first_entry(&trans->new_bgs,
10485					       struct btrfs_block_group_cache,
10486					       bg_list);
10487		if (ret)
10488			goto next;
10489
10490		spin_lock(&block_group->lock);
10491		memcpy(&item, &block_group->item, sizeof(item));
10492		memcpy(&key, &block_group->key, sizeof(key));
10493		spin_unlock(&block_group->lock);
10494
10495		ret = btrfs_insert_item(trans, extent_root, &key, &item,
10496					sizeof(item));
10497		if (ret)
10498			btrfs_abort_transaction(trans, ret);
10499		ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
10500		if (ret)
10501			btrfs_abort_transaction(trans, ret);
10502		add_block_group_free_space(trans, block_group);
10503		/* already aborted the transaction if it failed. */
10504next:
10505		btrfs_delayed_refs_rsv_release(fs_info, 1);
10506		list_del_init(&block_group->bg_list);
10507	}
10508	btrfs_trans_release_chunk_metadata(trans);
10509}
10510
10511int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10512			   u64 type, u64 chunk_offset, u64 size)
10513{
10514	struct btrfs_fs_info *fs_info = trans->fs_info;
10515	struct btrfs_block_group_cache *cache;
10516	int ret;
10517
10518	btrfs_set_log_full_commit(fs_info, trans);
10519
10520	cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10521	if (!cache)
10522		return -ENOMEM;
10523
10524	btrfs_set_block_group_used(&cache->item, bytes_used);
10525	btrfs_set_block_group_chunk_objectid(&cache->item,
10526					     BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10527	btrfs_set_block_group_flags(&cache->item, type);
10528
10529	cache->flags = type;
10530	cache->last_byte_to_unpin = (u64)-1;
10531	cache->cached = BTRFS_CACHE_FINISHED;
10532	cache->needs_free_space = 1;
10533	ret = exclude_super_stripes(cache);
10534	if (ret) {
10535		/*
10536		 * We may have excluded something, so call this just in
10537		 * case.
10538		 */
10539		free_excluded_extents(cache);
10540		btrfs_put_block_group(cache);
10541		return ret;
10542	}
10543
10544	add_new_free_space(cache, chunk_offset, chunk_offset + size);
10545
10546	free_excluded_extents(cache);
10547
10548#ifdef CONFIG_BTRFS_DEBUG
10549	if (btrfs_should_fragment_free_space(cache)) {
10550		u64 new_bytes_used = size - bytes_used;
10551
10552		bytes_used += new_bytes_used >> 1;
10553		fragment_free_space(cache);
10554	}
10555#endif
10556	/*
10557	 * Ensure the corresponding space_info object is created and
10558	 * assigned to our block group. We want our bg to be added to the rbtree
10559	 * with its ->space_info set.
10560	 */
10561	cache->space_info = __find_space_info(fs_info, cache->flags);
10562	ASSERT(cache->space_info);
10563
10564	ret = btrfs_add_block_group_cache(fs_info, cache);
10565	if (ret) {
10566		btrfs_remove_free_space_cache(cache);
10567		btrfs_put_block_group(cache);
10568		return ret;
10569	}
10570
10571	/*
10572	 * Now that our block group has its ->space_info set and is inserted in
10573	 * the rbtree, update the space info's counters.
10574	 */
10575	trace_btrfs_add_block_group(fs_info, cache, 1);
10576	update_space_info(fs_info, cache->flags, size, bytes_used,
10577				cache->bytes_super, &cache->space_info);
10578	update_global_block_rsv(fs_info);
10579
10580	link_block_group(cache);
10581
10582	list_add_tail(&cache->bg_list, &trans->new_bgs);
10583	trans->delayed_ref_updates++;
10584	btrfs_update_delayed_refs_rsv(trans);
10585
10586	set_avail_alloc_bits(fs_info, type);
10587	return 0;
10588}
10589
10590static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10591{
10592	u64 extra_flags = chunk_to_extended(flags) &
10593				BTRFS_EXTENDED_PROFILE_MASK;
10594
10595	write_seqlock(&fs_info->profiles_lock);
10596	if (flags & BTRFS_BLOCK_GROUP_DATA)
10597		fs_info->avail_data_alloc_bits &= ~extra_flags;
10598	if (flags & BTRFS_BLOCK_GROUP_METADATA)
10599		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10600	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10601		fs_info->avail_system_alloc_bits &= ~extra_flags;
10602	write_sequnlock(&fs_info->profiles_lock);
10603}
10604
10605int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10606			     u64 group_start, struct extent_map *em)
10607{
10608	struct btrfs_fs_info *fs_info = trans->fs_info;
10609	struct btrfs_root *root = fs_info->extent_root;
10610	struct btrfs_path *path;
10611	struct btrfs_block_group_cache *block_group;
10612	struct btrfs_free_cluster *cluster;
10613	struct btrfs_root *tree_root = fs_info->tree_root;
10614	struct btrfs_key key;
10615	struct inode *inode;
10616	struct kobject *kobj = NULL;
10617	int ret;
10618	int index;
10619	int factor;
10620	struct btrfs_caching_control *caching_ctl = NULL;
10621	bool remove_em;
10622	bool remove_rsv = false;
10623
10624	block_group = btrfs_lookup_block_group(fs_info, group_start);
10625	BUG_ON(!block_group);
10626	BUG_ON(!block_group->ro);
10627
10628	trace_btrfs_remove_block_group(block_group);
10629	/*
10630	 * Free the reserved super bytes from this block group before
10631	 * remove it.
10632	 */
10633	free_excluded_extents(block_group);
10634	btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10635				  block_group->key.offset);
10636
10637	memcpy(&key, &block_group->key, sizeof(key));
10638	index = btrfs_bg_flags_to_raid_index(block_group->flags);
10639	factor = btrfs_bg_type_to_factor(block_group->flags);
10640
10641	/* make sure this block group isn't part of an allocation cluster */
10642	cluster = &fs_info->data_alloc_cluster;
10643	spin_lock(&cluster->refill_lock);
10644	btrfs_return_cluster_to_free_space(block_group, cluster);
10645	spin_unlock(&cluster->refill_lock);
10646
10647	/*
10648	 * make sure this block group isn't part of a metadata
10649	 * allocation cluster
10650	 */
10651	cluster = &fs_info->meta_alloc_cluster;
10652	spin_lock(&cluster->refill_lock);
10653	btrfs_return_cluster_to_free_space(block_group, cluster);
10654	spin_unlock(&cluster->refill_lock);
10655
10656	path = btrfs_alloc_path();
10657	if (!path) {
10658		ret = -ENOMEM;
10659		goto out;
10660	}
10661
10662	/*
10663	 * get the inode first so any iput calls done for the io_list
10664	 * aren't the final iput (no unlinks allowed now)
10665	 */
10666	inode = lookup_free_space_inode(fs_info, block_group, path);
10667
10668	mutex_lock(&trans->transaction->cache_write_mutex);
10669	/*
10670	 * Make sure our free space cache IO is done before removing the
10671	 * free space inode
10672	 */
10673	spin_lock(&trans->transaction->dirty_bgs_lock);
10674	if (!list_empty(&block_group->io_list)) {
10675		list_del_init(&block_group->io_list);
10676
10677		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10678
10679		spin_unlock(&trans->transaction->dirty_bgs_lock);
10680		btrfs_wait_cache_io(trans, block_group, path);
10681		btrfs_put_block_group(block_group);
10682		spin_lock(&trans->transaction->dirty_bgs_lock);
10683	}
10684
10685	if (!list_empty(&block_group->dirty_list)) {
10686		list_del_init(&block_group->dirty_list);
10687		remove_rsv = true;
10688		btrfs_put_block_group(block_group);
10689	}
10690	spin_unlock(&trans->transaction->dirty_bgs_lock);
10691	mutex_unlock(&trans->transaction->cache_write_mutex);
10692
10693	if (!IS_ERR(inode)) {
10694		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10695		if (ret) {
10696			btrfs_add_delayed_iput(inode);
10697			goto out;
10698		}
10699		clear_nlink(inode);
10700		/* One for the block groups ref */
10701		spin_lock(&block_group->lock);
10702		if (block_group->iref) {
10703			block_group->iref = 0;
10704			block_group->inode = NULL;
10705			spin_unlock(&block_group->lock);
10706			iput(inode);
10707		} else {
10708			spin_unlock(&block_group->lock);
10709		}
10710		/* One for our lookup ref */
10711		btrfs_add_delayed_iput(inode);
10712	}
10713
10714	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10715	key.offset = block_group->key.objectid;
10716	key.type = 0;
10717
10718	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10719	if (ret < 0)
10720		goto out;
10721	if (ret > 0)
10722		btrfs_release_path(path);
10723	if (ret == 0) {
10724		ret = btrfs_del_item(trans, tree_root, path);
10725		if (ret)
10726			goto out;
10727		btrfs_release_path(path);
10728	}
10729
10730	spin_lock(&fs_info->block_group_cache_lock);
10731	rb_erase(&block_group->cache_node,
10732		 &fs_info->block_group_cache_tree);
10733	RB_CLEAR_NODE(&block_group->cache_node);
10734
10735	if (fs_info->first_logical_byte == block_group->key.objectid)
10736		fs_info->first_logical_byte = (u64)-1;
10737	spin_unlock(&fs_info->block_group_cache_lock);
10738
10739	down_write(&block_group->space_info->groups_sem);
10740	/*
10741	 * we must use list_del_init so people can check to see if they
10742	 * are still on the list after taking the semaphore
10743	 */
10744	list_del_init(&block_group->list);
10745	if (list_empty(&block_group->space_info->block_groups[index])) {
10746		kobj = block_group->space_info->block_group_kobjs[index];
10747		block_group->space_info->block_group_kobjs[index] = NULL;
10748		clear_avail_alloc_bits(fs_info, block_group->flags);
10749	}
10750	up_write(&block_group->space_info->groups_sem);
10751	if (kobj) {
10752		kobject_del(kobj);
10753		kobject_put(kobj);
10754	}
10755
10756	if (block_group->has_caching_ctl)
10757		caching_ctl = get_caching_control(block_group);
10758	if (block_group->cached == BTRFS_CACHE_STARTED)
10759		wait_block_group_cache_done(block_group);
10760	if (block_group->has_caching_ctl) {
10761		down_write(&fs_info->commit_root_sem);
10762		if (!caching_ctl) {
10763			struct btrfs_caching_control *ctl;
10764
10765			list_for_each_entry(ctl,
10766				    &fs_info->caching_block_groups, list)
10767				if (ctl->block_group == block_group) {
10768					caching_ctl = ctl;
10769					refcount_inc(&caching_ctl->count);
10770					break;
10771				}
10772		}
10773		if (caching_ctl)
10774			list_del_init(&caching_ctl->list);
10775		up_write(&fs_info->commit_root_sem);
10776		if (caching_ctl) {
10777			/* Once for the caching bgs list and once for us. */
10778			put_caching_control(caching_ctl);
10779			put_caching_control(caching_ctl);
10780		}
10781	}
10782
10783	spin_lock(&trans->transaction->dirty_bgs_lock);
10784	if (!list_empty(&block_group->dirty_list)) {
10785		WARN_ON(1);
10786	}
10787	if (!list_empty(&block_group->io_list)) {
10788		WARN_ON(1);
10789	}
10790	spin_unlock(&trans->transaction->dirty_bgs_lock);
10791	btrfs_remove_free_space_cache(block_group);
10792
10793	spin_lock(&block_group->space_info->lock);
10794	list_del_init(&block_group->ro_list);
10795
10796	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10797		WARN_ON(block_group->space_info->total_bytes
10798			< block_group->key.offset);
10799		WARN_ON(block_group->space_info->bytes_readonly
10800			< block_group->key.offset);
10801		WARN_ON(block_group->space_info->disk_total
10802			< block_group->key.offset * factor);
10803	}
10804	block_group->space_info->total_bytes -= block_group->key.offset;
10805	block_group->space_info->bytes_readonly -= block_group->key.offset;
10806	block_group->space_info->disk_total -= block_group->key.offset * factor;
10807
10808	spin_unlock(&block_group->space_info->lock);
10809
10810	memcpy(&key, &block_group->key, sizeof(key));
10811
10812	mutex_lock(&fs_info->chunk_mutex);
10813	if (!list_empty(&em->list)) {
10814		/* We're in the transaction->pending_chunks list. */
10815		free_extent_map(em);
10816	}
10817	spin_lock(&block_group->lock);
10818	block_group->removed = 1;
10819	/*
10820	 * At this point trimming can't start on this block group, because we
10821	 * removed the block group from the tree fs_info->block_group_cache_tree
10822	 * so no one can't find it anymore and even if someone already got this
10823	 * block group before we removed it from the rbtree, they have already
10824	 * incremented block_group->trimming - if they didn't, they won't find
10825	 * any free space entries because we already removed them all when we
10826	 * called btrfs_remove_free_space_cache().
10827	 *
10828	 * And we must not remove the extent map from the fs_info->mapping_tree
10829	 * to prevent the same logical address range and physical device space
10830	 * ranges from being reused for a new block group. This is because our
10831	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10832	 * completely transactionless, so while it is trimming a range the
10833	 * currently running transaction might finish and a new one start,
10834	 * allowing for new block groups to be created that can reuse the same
10835	 * physical device locations unless we take this special care.
10836	 *
10837	 * There may also be an implicit trim operation if the file system
10838	 * is mounted with -odiscard. The same protections must remain
10839	 * in place until the extents have been discarded completely when
10840	 * the transaction commit has completed.
10841	 */
10842	remove_em = (atomic_read(&block_group->trimming) == 0);
10843	/*
10844	 * Make sure a trimmer task always sees the em in the pinned_chunks list
10845	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
10846	 * before checking block_group->removed).
10847	 */
10848	if (!remove_em) {
10849		/*
10850		 * Our em might be in trans->transaction->pending_chunks which
10851		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10852		 * and so is the fs_info->pinned_chunks list.
10853		 *
10854		 * So at this point we must be holding the chunk_mutex to avoid
10855		 * any races with chunk allocation (more specifically at
10856		 * volumes.c:contains_pending_extent()), to ensure it always
10857		 * sees the em, either in the pending_chunks list or in the
10858		 * pinned_chunks list.
10859		 */
10860		list_move_tail(&em->list, &fs_info->pinned_chunks);
10861	}
10862	spin_unlock(&block_group->lock);
10863
10864	if (remove_em) {
10865		struct extent_map_tree *em_tree;
10866
10867		em_tree = &fs_info->mapping_tree.map_tree;
10868		write_lock(&em_tree->lock);
10869		/*
10870		 * The em might be in the pending_chunks list, so make sure the
10871		 * chunk mutex is locked, since remove_extent_mapping() will
10872		 * delete us from that list.
10873		 */
10874		remove_extent_mapping(em_tree, em);
10875		write_unlock(&em_tree->lock);
10876		/* once for the tree */
10877		free_extent_map(em);
10878	}
10879
10880	mutex_unlock(&fs_info->chunk_mutex);
10881
10882	ret = remove_block_group_free_space(trans, block_group);
10883	if (ret)
10884		goto out;
10885
10886	btrfs_put_block_group(block_group);
10887	btrfs_put_block_group(block_group);
10888
10889	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10890	if (ret > 0)
10891		ret = -EIO;
10892	if (ret < 0)
10893		goto out;
10894
10895	ret = btrfs_del_item(trans, root, path);
10896out:
10897	if (remove_rsv)
10898		btrfs_delayed_refs_rsv_release(fs_info, 1);
10899	btrfs_free_path(path);
10900	return ret;
10901}
10902
10903struct btrfs_trans_handle *
10904btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10905				     const u64 chunk_offset)
10906{
10907	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10908	struct extent_map *em;
10909	struct map_lookup *map;
10910	unsigned int num_items;
10911
10912	read_lock(&em_tree->lock);
10913	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10914	read_unlock(&em_tree->lock);
10915	ASSERT(em && em->start == chunk_offset);
10916
10917	/*
10918	 * We need to reserve 3 + N units from the metadata space info in order
10919	 * to remove a block group (done at btrfs_remove_chunk() and at
10920	 * btrfs_remove_block_group()), which are used for:
10921	 *
10922	 * 1 unit for adding the free space inode's orphan (located in the tree
10923	 * of tree roots).
10924	 * 1 unit for deleting the block group item (located in the extent
10925	 * tree).
10926	 * 1 unit for deleting the free space item (located in tree of tree
10927	 * roots).
10928	 * N units for deleting N device extent items corresponding to each
10929	 * stripe (located in the device tree).
10930	 *
10931	 * In order to remove a block group we also need to reserve units in the
10932	 * system space info in order to update the chunk tree (update one or
10933	 * more device items and remove one chunk item), but this is done at
10934	 * btrfs_remove_chunk() through a call to check_system_chunk().
10935	 */
10936	map = em->map_lookup;
10937	num_items = 3 + map->num_stripes;
10938	free_extent_map(em);
10939
10940	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10941							   num_items, 1);
10942}
10943
10944/*
10945 * Process the unused_bgs list and remove any that don't have any allocated
10946 * space inside of them.
10947 */
10948void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10949{
10950	struct btrfs_block_group_cache *block_group;
10951	struct btrfs_space_info *space_info;
10952	struct btrfs_trans_handle *trans;
10953	int ret = 0;
10954
10955	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10956		return;
10957
10958	spin_lock(&fs_info->unused_bgs_lock);
10959	while (!list_empty(&fs_info->unused_bgs)) {
10960		u64 start, end;
10961		int trimming;
10962
10963		block_group = list_first_entry(&fs_info->unused_bgs,
10964					       struct btrfs_block_group_cache,
10965					       bg_list);
10966		list_del_init(&block_group->bg_list);
10967
10968		space_info = block_group->space_info;
10969
10970		if (ret || btrfs_mixed_space_info(space_info)) {
10971			btrfs_put_block_group(block_group);
10972			continue;
10973		}
10974		spin_unlock(&fs_info->unused_bgs_lock);
10975
10976		mutex_lock(&fs_info->delete_unused_bgs_mutex);
10977
10978		/* Don't want to race with allocators so take the groups_sem */
10979		down_write(&space_info->groups_sem);
10980		spin_lock(&block_group->lock);
10981		if (block_group->reserved || block_group->pinned ||
10982		    btrfs_block_group_used(&block_group->item) ||
10983		    block_group->ro ||
10984		    list_is_singular(&block_group->list)) {
10985			/*
10986			 * We want to bail if we made new allocations or have
10987			 * outstanding allocations in this block group.  We do
10988			 * the ro check in case balance is currently acting on
10989			 * this block group.
10990			 */
10991			trace_btrfs_skip_unused_block_group(block_group);
10992			spin_unlock(&block_group->lock);
10993			up_write(&space_info->groups_sem);
10994			goto next;
10995		}
10996		spin_unlock(&block_group->lock);
10997
10998		/* We don't want to force the issue, only flip if it's ok. */
10999		ret = inc_block_group_ro(block_group, 0);
11000		up_write(&space_info->groups_sem);
11001		if (ret < 0) {
11002			ret = 0;
11003			goto next;
11004		}
11005
11006		/*
11007		 * Want to do this before we do anything else so we can recover
11008		 * properly if we fail to join the transaction.
11009		 */
11010		trans = btrfs_start_trans_remove_block_group(fs_info,
11011						     block_group->key.objectid);
11012		if (IS_ERR(trans)) {
11013			btrfs_dec_block_group_ro(block_group);
11014			ret = PTR_ERR(trans);
11015			goto next;
11016		}
11017
11018		/*
11019		 * We could have pending pinned extents for this block group,
11020		 * just delete them, we don't care about them anymore.
11021		 */
11022		start = block_group->key.objectid;
11023		end = start + block_group->key.offset - 1;
11024		/*
11025		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
11026		 * btrfs_finish_extent_commit(). If we are at transaction N,
11027		 * another task might be running finish_extent_commit() for the
11028		 * previous transaction N - 1, and have seen a range belonging
11029		 * to the block group in freed_extents[] before we were able to
11030		 * clear the whole block group range from freed_extents[]. This
11031		 * means that task can lookup for the block group after we
11032		 * unpinned it from freed_extents[] and removed it, leading to
11033		 * a BUG_ON() at btrfs_unpin_extent_range().
11034		 */
11035		mutex_lock(&fs_info->unused_bg_unpin_mutex);
11036		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
11037				  EXTENT_DIRTY);
11038		if (ret) {
11039			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
11040			btrfs_dec_block_group_ro(block_group);
11041			goto end_trans;
11042		}
11043		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
11044				  EXTENT_DIRTY);
11045		if (ret) {
11046			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
11047			btrfs_dec_block_group_ro(block_group);
11048			goto end_trans;
11049		}
11050		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
11051
11052		/* Reset pinned so btrfs_put_block_group doesn't complain */
11053		spin_lock(&space_info->lock);
11054		spin_lock(&block_group->lock);
11055
11056		update_bytes_pinned(space_info, -block_group->pinned);
11057		space_info->bytes_readonly += block_group->pinned;
11058		percpu_counter_add_batch(&space_info->total_bytes_pinned,
11059				   -block_group->pinned,
11060				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
11061		block_group->pinned = 0;
11062
11063		spin_unlock(&block_group->lock);
11064		spin_unlock(&space_info->lock);
11065
11066		/* DISCARD can flip during remount */
11067		trimming = btrfs_test_opt(fs_info, DISCARD);
11068
11069		/* Implicit trim during transaction commit. */
11070		if (trimming)
11071			btrfs_get_block_group_trimming(block_group);
11072
11073		/*
11074		 * Btrfs_remove_chunk will abort the transaction if things go
11075		 * horribly wrong.
11076		 */
11077		ret = btrfs_remove_chunk(trans, block_group->key.objectid);
11078
11079		if (ret) {
11080			if (trimming)
11081				btrfs_put_block_group_trimming(block_group);
11082			goto end_trans;
11083		}
11084
11085		/*
11086		 * If we're not mounted with -odiscard, we can just forget
11087		 * about this block group. Otherwise we'll need to wait
11088		 * until transaction commit to do the actual discard.
11089		 */
11090		if (trimming) {
11091			spin_lock(&fs_info->unused_bgs_lock);
11092			/*
11093			 * A concurrent scrub might have added us to the list
11094			 * fs_info->unused_bgs, so use a list_move operation
11095			 * to add the block group to the deleted_bgs list.
11096			 */
11097			list_move(&block_group->bg_list,
11098				  &trans->transaction->deleted_bgs);
11099			spin_unlock(&fs_info->unused_bgs_lock);
11100			btrfs_get_block_group(block_group);
11101		}
11102end_trans:
11103		btrfs_end_transaction(trans);
11104next:
11105		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
11106		btrfs_put_block_group(block_group);
11107		spin_lock(&fs_info->unused_bgs_lock);
11108	}
11109	spin_unlock(&fs_info->unused_bgs_lock);
11110}
11111
11112int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
11113{
11114	struct btrfs_super_block *disk_super;
11115	u64 features;
11116	u64 flags;
11117	int mixed = 0;
11118	int ret;
11119
11120	disk_super = fs_info->super_copy;
11121	if (!btrfs_super_root(disk_super))
11122		return -EINVAL;
11123
11124	features = btrfs_super_incompat_flags(disk_super);
11125	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
11126		mixed = 1;
11127
11128	flags = BTRFS_BLOCK_GROUP_SYSTEM;
11129	ret = create_space_info(fs_info, flags);
11130	if (ret)
11131		goto out;
11132
11133	if (mixed) {
11134		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
11135		ret = create_space_info(fs_info, flags);
11136	} else {
11137		flags = BTRFS_BLOCK_GROUP_METADATA;
11138		ret = create_space_info(fs_info, flags);
11139		if (ret)
11140			goto out;
11141
11142		flags = BTRFS_BLOCK_GROUP_DATA;
11143		ret = create_space_info(fs_info, flags);
11144	}
11145out:
11146	return ret;
11147}
11148
11149int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
11150				   u64 start, u64 end)
11151{
11152	return unpin_extent_range(fs_info, start, end, false);
11153}
11154
11155/*
11156 * It used to be that old block groups would be left around forever.
11157 * Iterating over them would be enough to trim unused space.  Since we
11158 * now automatically remove them, we also need to iterate over unallocated
11159 * space.
11160 *
11161 * We don't want a transaction for this since the discard may take a
11162 * substantial amount of time.  We don't require that a transaction be
11163 * running, but we do need to take a running transaction into account
11164 * to ensure that we're not discarding chunks that were released or
11165 * allocated in the current transaction.
11166 *
11167 * Holding the chunks lock will prevent other threads from allocating
11168 * or releasing chunks, but it won't prevent a running transaction
11169 * from committing and releasing the memory that the pending chunks
11170 * list head uses.  For that, we need to take a reference to the
11171 * transaction and hold the commit root sem.  We only need to hold
11172 * it while performing the free space search since we have already
11173 * held back allocations.
11174 */
11175static int btrfs_trim_free_extents(struct btrfs_device *device,
11176				   u64 minlen, u64 *trimmed)
11177{
11178	u64 start = 0, len = 0;
11179	int ret;
11180
11181	*trimmed = 0;
11182
11183	/* Discard not supported = nothing to do. */
11184	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
11185		return 0;
11186
11187	/* Not writable = nothing to do. */
11188	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
11189		return 0;
11190
11191	/* No free space = nothing to do. */
11192	if (device->total_bytes <= device->bytes_used)
11193		return 0;
11194
11195	ret = 0;
11196
11197	while (1) {
11198		struct btrfs_fs_info *fs_info = device->fs_info;
11199		struct btrfs_transaction *trans;
11200		u64 bytes;
11201
11202		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
11203		if (ret)
11204			break;
11205
11206		ret = down_read_killable(&fs_info->commit_root_sem);
11207		if (ret) {
11208			mutex_unlock(&fs_info->chunk_mutex);
11209			break;
11210		}
11211
11212		spin_lock(&fs_info->trans_lock);
11213		trans = fs_info->running_transaction;
11214		if (trans)
11215			refcount_inc(&trans->use_count);
11216		spin_unlock(&fs_info->trans_lock);
11217
11218		if (!trans)
11219			up_read(&fs_info->commit_root_sem);
11220
11221		ret = find_free_dev_extent_start(trans, device, minlen, start,
11222						 &start, &len);
11223		if (trans) {
11224			up_read(&fs_info->commit_root_sem);
11225			btrfs_put_transaction(trans);
11226		}
11227
11228		if (ret) {
11229			mutex_unlock(&fs_info->chunk_mutex);
11230			if (ret == -ENOSPC)
11231				ret = 0;
11232			break;
11233		}
11234
11235		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
11236		mutex_unlock(&fs_info->chunk_mutex);
11237
11238		if (ret)
11239			break;
11240
11241		start += len;
11242		*trimmed += bytes;
11243
11244		if (fatal_signal_pending(current)) {
11245			ret = -ERESTARTSYS;
11246			break;
11247		}
11248
11249		cond_resched();
11250	}
11251
11252	return ret;
11253}
11254
11255/*
11256 * Trim the whole filesystem by:
11257 * 1) trimming the free space in each block group
11258 * 2) trimming the unallocated space on each device
11259 *
11260 * This will also continue trimming even if a block group or device encounters
11261 * an error.  The return value will be the last error, or 0 if nothing bad
11262 * happens.
11263 */
11264int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
11265{
11266	struct btrfs_block_group_cache *cache = NULL;
11267	struct btrfs_device *device;
11268	struct list_head *devices;
11269	u64 group_trimmed;
11270	u64 start;
11271	u64 end;
11272	u64 trimmed = 0;
11273	u64 bg_failed = 0;
11274	u64 dev_failed = 0;
11275	int bg_ret = 0;
11276	int dev_ret = 0;
11277	int ret = 0;
11278
11279	cache = btrfs_lookup_first_block_group(fs_info, range->start);
11280	for (; cache; cache = next_block_group(fs_info, cache)) {
11281		if (cache->key.objectid >= (range->start + range->len)) {
11282			btrfs_put_block_group(cache);
11283			break;
11284		}
11285
11286		start = max(range->start, cache->key.objectid);
11287		end = min(range->start + range->len,
11288				cache->key.objectid + cache->key.offset);
11289
11290		if (end - start >= range->minlen) {
11291			if (!block_group_cache_done(cache)) {
11292				ret = cache_block_group(cache, 0);
11293				if (ret) {
11294					bg_failed++;
11295					bg_ret = ret;
11296					continue;
11297				}
11298				ret = wait_block_group_cache_done(cache);
11299				if (ret) {
11300					bg_failed++;
11301					bg_ret = ret;
11302					continue;
11303				}
11304			}
11305			ret = btrfs_trim_block_group(cache,
11306						     &group_trimmed,
11307						     start,
11308						     end,
11309						     range->minlen);
11310
11311			trimmed += group_trimmed;
11312			if (ret) {
11313				bg_failed++;
11314				bg_ret = ret;
11315				continue;
11316			}
11317		}
11318	}
11319
11320	if (bg_failed)
11321		btrfs_warn(fs_info,
11322			"failed to trim %llu block group(s), last error %d",
11323			bg_failed, bg_ret);
11324	mutex_lock(&fs_info->fs_devices->device_list_mutex);
11325	devices = &fs_info->fs_devices->devices;
11326	list_for_each_entry(device, devices, dev_list) {
11327		ret = btrfs_trim_free_extents(device, range->minlen,
11328					      &group_trimmed);
11329		if (ret) {
11330			dev_failed++;
11331			dev_ret = ret;
11332			break;
11333		}
11334
11335		trimmed += group_trimmed;
11336	}
11337	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
11338
11339	if (dev_failed)
11340		btrfs_warn(fs_info,
11341			"failed to trim %llu device(s), last error %d",
11342			dev_failed, dev_ret);
11343	range->len = trimmed;
11344	if (bg_ret)
11345		return bg_ret;
11346	return dev_ret;
11347}
11348
11349/*
11350 * btrfs_{start,end}_write_no_snapshotting() are similar to
11351 * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
11352 * data into the page cache through nocow before the subvolume is snapshoted,
11353 * but flush the data into disk after the snapshot creation, or to prevent
11354 * operations while snapshotting is ongoing and that cause the snapshot to be
11355 * inconsistent (writes followed by expanding truncates for example).
11356 */
11357void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
11358{
11359	percpu_counter_dec(&root->subv_writers->counter);
11360	cond_wake_up(&root->subv_writers->wait);
11361}
11362
11363int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
11364{
11365	if (atomic_read(&root->will_be_snapshotted))
11366		return 0;
11367
11368	percpu_counter_inc(&root->subv_writers->counter);
11369	/*
11370	 * Make sure counter is updated before we check for snapshot creation.
11371	 */
11372	smp_mb();
11373	if (atomic_read(&root->will_be_snapshotted)) {
11374		btrfs_end_write_no_snapshotting(root);
11375		return 0;
11376	}
11377	return 1;
11378}
11379
11380void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11381{
11382	while (true) {
11383		int ret;
11384
11385		ret = btrfs_start_write_no_snapshotting(root);
11386		if (ret)
11387			break;
11388		wait_var_event(&root->will_be_snapshotted,
11389			       !atomic_read(&root->will_be_snapshotted));
11390	}
11391}
11392
11393void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
11394{
11395	struct btrfs_fs_info *fs_info = bg->fs_info;
11396
11397	spin_lock(&fs_info->unused_bgs_lock);
11398	if (list_empty(&bg->bg_list)) {
11399		btrfs_get_block_group(bg);
11400		trace_btrfs_add_unused_block_group(bg);
11401		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
11402	}
11403	spin_unlock(&fs_info->unused_bgs_lock);
11404}