fs/btrfs/extent-tree.c at v4.19-rc1

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / btrfs / extent-tree.c
at v4.19-rc1 10965 lines 306 kB view raw
wrap content
    1// SPDX-License-Identifier: GPL-2.0
    2/*
    3 * Copyright (C) 2007 Oracle.  All rights reserved.
    4 */
    5
    6#include <linux/sched.h>
    7#include <linux/sched/signal.h>
    8#include <linux/pagemap.h>
    9#include <linux/writeback.h>
   10#include <linux/blkdev.h>
   11#include <linux/sort.h>
   12#include <linux/rcupdate.h>
   13#include <linux/kthread.h>
   14#include <linux/slab.h>
   15#include <linux/ratelimit.h>
   16#include <linux/percpu_counter.h>
   17#include <linux/lockdep.h>
   18#include <linux/crc32c.h>
   19#include "tree-log.h"
   20#include "disk-io.h"
   21#include "print-tree.h"
   22#include "volumes.h"
   23#include "raid56.h"
   24#include "locking.h"
   25#include "free-space-cache.h"
   26#include "free-space-tree.h"
   27#include "math.h"
   28#include "sysfs.h"
   29#include "qgroup.h"
   30#include "ref-verify.h"
   31
   32#undef SCRAMBLE_DELAYED_REFS
   33
/*
 * Control flags for do_chunk_alloc()'s 'force' argument.
 */
enum {
	/* Only allocate a chunk if we really need one. */
	CHUNK_ALLOC_NO_FORCE = 0,
	/*
	 * Only try to allocate a chunk if we have very few chunks already
	 * allocated.  This is used as part of the clustering code to help make
	 * sure we have a good pool of storage to cluster in, without filling
	 * the FS with empty chunks.
	 */
	CHUNK_ALLOC_LIMITED = 1,
	/* A chunk allocation must be attempted. */
	CHUNK_ALLOC_FORCE = 2,
};
   53
/*
 * Prototypes for file-local (static) helpers that are referenced before their
 * definitions.  The definitions themselves are not part of this excerpt.
 */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);
static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);
   84
   85static noinline int
   86block_group_cache_done(struct btrfs_block_group_cache *cache)
   87{
   88	smp_mb();
   89	return cache->cached == BTRFS_CACHE_FINISHED ||
   90		cache->cached == BTRFS_CACHE_ERROR;
   91}
   92
   93static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
   94{
   95	return (cache->flags & bits) == bits;
   96}
   97
/*
 * Take an additional reference on @cache by atomically incrementing its
 * reference count.  Each reference is dropped with btrfs_put_block_group().
 */
void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}
  102
  103void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
  104{
  105	if (atomic_dec_and_test(&cache->count)) {
  106		WARN_ON(cache->pinned > 0);
  107		WARN_ON(cache->reserved > 0);
  108
  109		/*
  110		 * If not empty, someone is still holding mutex of
  111		 * full_stripe_lock, which can only be released by caller.
  112		 * And it will definitely cause use-after-free when caller
  113		 * tries to release full stripe lock.
  114		 *
  115		 * No better way to resolve, but only to warn.
  116		 */
  117		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
  118		kfree(cache->free_space_ctl);
  119		kfree(cache);
  120	}
  121}
  122
  123/*
  124 * this adds the block group to the fs_info rb tree for the block group
  125 * cache
  126 */
  127static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
  128				struct btrfs_block_group_cache *block_group)
  129{
  130	struct rb_node **p;
  131	struct rb_node *parent = NULL;
  132	struct btrfs_block_group_cache *cache;
  133
  134	spin_lock(&info->block_group_cache_lock);
  135	p = &info->block_group_cache_tree.rb_node;
  136
  137	while (*p) {
  138		parent = *p;
  139		cache = rb_entry(parent, struct btrfs_block_group_cache,
  140				 cache_node);
  141		if (block_group->key.objectid < cache->key.objectid) {
  142			p = &(*p)->rb_left;
  143		} else if (block_group->key.objectid > cache->key.objectid) {
  144			p = &(*p)->rb_right;
  145		} else {
  146			spin_unlock(&info->block_group_cache_lock);
  147			return -EEXIST;
  148		}
  149	}
  150
  151	rb_link_node(&block_group->cache_node, parent, p);
  152	rb_insert_color(&block_group->cache_node,
  153			&info->block_group_cache_tree);
  154
  155	if (info->first_logical_byte > block_group->key.objectid)
  156		info->first_logical_byte = block_group->key.objectid;
  157
  158	spin_unlock(&info->block_group_cache_lock);
  159
  160	return 0;
  161}
  162
  163/*
  164 * This will return the block group at or after bytenr if contains is 0, else
  165 * it will return the block group that contains the bytenr
  166 */
  167static struct btrfs_block_group_cache *
  168block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
  169			      int contains)
  170{
  171	struct btrfs_block_group_cache *cache, *ret = NULL;
  172	struct rb_node *n;
  173	u64 end, start;
  174
  175	spin_lock(&info->block_group_cache_lock);
  176	n = info->block_group_cache_tree.rb_node;
  177
  178	while (n) {
  179		cache = rb_entry(n, struct btrfs_block_group_cache,
  180				 cache_node);
  181		end = cache->key.objectid + cache->key.offset - 1;
  182		start = cache->key.objectid;
  183
  184		if (bytenr < start) {
  185			if (!contains && (!ret || start < ret->key.objectid))
  186				ret = cache;
  187			n = n->rb_left;
  188		} else if (bytenr > start) {
  189			if (contains && bytenr <= end) {
  190				ret = cache;
  191				break;
  192			}
  193			n = n->rb_right;
  194		} else {
  195			ret = cache;
  196			break;
  197		}
  198	}
  199	if (ret) {
  200		btrfs_get_block_group(ret);
  201		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
  202			info->first_logical_byte = ret->key.objectid;
  203	}
  204	spin_unlock(&info->block_group_cache_lock);
  205
  206	return ret;
  207}
  208
  209static int add_excluded_extent(struct btrfs_fs_info *fs_info,
  210			       u64 start, u64 num_bytes)
  211{
  212	u64 end = start + num_bytes - 1;
  213	set_extent_bits(&fs_info->freed_extents[0],
  214			start, end, EXTENT_UPTODATE);
  215	set_extent_bits(&fs_info->freed_extents[1],
  216			start, end, EXTENT_UPTODATE);
  217	return 0;
  218}
  219
  220static void free_excluded_extents(struct btrfs_block_group_cache *cache)
  221{
  222	struct btrfs_fs_info *fs_info = cache->fs_info;
  223	u64 start, end;
  224
  225	start = cache->key.objectid;
  226	end = start + cache->key.offset - 1;
  227
  228	clear_extent_bits(&fs_info->freed_extents[0],
  229			  start, end, EXTENT_UPTODATE);
  230	clear_extent_bits(&fs_info->freed_extents[1],
  231			  start, end, EXTENT_UPTODATE);
  232}
  233
  234static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
  235{
  236	struct btrfs_fs_info *fs_info = cache->fs_info;
  237	u64 bytenr;
  238	u64 *logical;
  239	int stripe_len;
  240	int i, nr, ret;
  241
  242	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
  243		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
  244		cache->bytes_super += stripe_len;
  245		ret = add_excluded_extent(fs_info, cache->key.objectid,
  246					  stripe_len);
  247		if (ret)
  248			return ret;
  249	}
  250
  251	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
  252		bytenr = btrfs_sb_offset(i);
  253		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
  254				       bytenr, &logical, &nr, &stripe_len);
  255		if (ret)
  256			return ret;
  257
  258		while (nr--) {
  259			u64 start, len;
  260
  261			if (logical[nr] > cache->key.objectid +
  262			    cache->key.offset)
  263				continue;
  264
  265			if (logical[nr] + stripe_len <= cache->key.objectid)
  266				continue;
  267
  268			start = logical[nr];
  269			if (start < cache->key.objectid) {
  270				start = cache->key.objectid;
  271				len = (logical[nr] + stripe_len) - start;
  272			} else {
  273				len = min_t(u64, stripe_len,
  274					    cache->key.objectid +
  275					    cache->key.offset - start);
  276			}
  277
  278			cache->bytes_super += len;
  279			ret = add_excluded_extent(fs_info, start, len);
  280			if (ret) {
  281				kfree(logical);
  282				return ret;
  283			}
  284		}
  285
  286		kfree(logical);
  287	}
  288	return 0;
  289}
  290
  291static struct btrfs_caching_control *
  292get_caching_control(struct btrfs_block_group_cache *cache)
  293{
  294	struct btrfs_caching_control *ctl;
  295
  296	spin_lock(&cache->lock);
  297	if (!cache->caching_ctl) {
  298		spin_unlock(&cache->lock);
  299		return NULL;
  300	}
  301
  302	ctl = cache->caching_ctl;
  303	refcount_inc(&ctl->count);
  304	spin_unlock(&cache->lock);
  305	return ctl;
  306}
  307
  308static void put_caching_control(struct btrfs_caching_control *ctl)
  309{
  310	if (refcount_dec_and_test(&ctl->count))
  311		kfree(ctl);
  312}
  313
  314#ifdef CONFIG_BTRFS_DEBUG
  315static void fragment_free_space(struct btrfs_block_group_cache *block_group)
  316{
  317	struct btrfs_fs_info *fs_info = block_group->fs_info;
  318	u64 start = block_group->key.objectid;
  319	u64 len = block_group->key.offset;
  320	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
  321		fs_info->nodesize : fs_info->sectorsize;
  322	u64 step = chunk << 1;
  323
  324	while (len > chunk) {
  325		btrfs_remove_free_space(block_group, start, chunk);
  326		start += step;
  327		if (len < step)
  328			len = 0;
  329		else
  330			len -= step;
  331	}
  332}
  333#endif
  334
  335/*
  336 * this is only called by cache_block_group, since we could have freed extents
  337 * we need to check the pinned_extents for any extents that can't be used yet
  338 * since their free space will be released as soon as the transaction commits.
  339 */
  340u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
  341		       u64 start, u64 end)
  342{
  343	struct btrfs_fs_info *info = block_group->fs_info;
  344	u64 extent_start, extent_end, size, total_added = 0;
  345	int ret;
  346
  347	while (start < end) {
  348		ret = find_first_extent_bit(info->pinned_extents, start,
  349					    &extent_start, &extent_end,
  350					    EXTENT_DIRTY | EXTENT_UPTODATE,
  351					    NULL);
  352		if (ret)
  353			break;
  354
  355		if (extent_start <= start) {
  356			start = extent_end + 1;
  357		} else if (extent_start > start && extent_start < end) {
  358			size = extent_start - start;
  359			total_added += size;
  360			ret = btrfs_add_free_space(block_group, start,
  361						   size);
  362			BUG_ON(ret); /* -ENOMEM or logic error */
  363			start = extent_end + 1;
  364		} else {
  365			break;
  366		}
  367	}
  368
  369	if (start < end) {
  370		size = end - start;
  371		total_added += size;
  372		ret = btrfs_add_free_space(block_group, start, size);
  373		BUG_ON(ret); /* -ENOMEM or logic error */
  374	}
  375
  376	return total_added;
  377}
  378
/*
 * Build the free space information of a block group by walking the extent
 * tree.
 *
 * The commit root of the extent tree is scanned for EXTENT_ITEM and
 * METADATA_ITEM keys inside the block group.  The gap between the end of the
 * previously seen extent ('last') and the start of the current one is handed
 * to add_new_free_space(); the tail of the block group is handled after the
 * loop.
 *
 * The function temporarily drops and re-takes caching_ctl->mutex and the read
 * side of fs_info->commit_root_sem when it needs to reschedule, so the caller
 * must hold both on entry (caching_thread() does).
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the
 * negative value returned by btrfs_search_slot() / btrfs_next_leaf().
 */
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Never start the scan below BTRFS_SUPER_INFO_OFFSET. */
	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	/* (Re)position the path at 'key'; also the restart point after a yield. */
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		/* Stop scanning once the filesystem is being closed. */
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			/* Ran off the end of this leaf. */
			ret = find_next_key(path, 0, &key);
			if (ret)
				break;

			/*
			 * Yield if needed: publish progress, drop the path,
			 * the semaphore and the mutex, reschedule, then take
			 * the locks again and restart the search from 'key'.
			 */
			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				if (wakeup)
					caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		/* Key lies before the scan position: search again from 'last'. */
		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;

			if (wakeup)
				caching_ctl->progress = last;
			btrfs_release_path(path);
			goto next;
		}

		/* Skip items that start before this block group. */
		if (key.objectid < block_group->key.objectid) {
			path->slots[0]++;
			continue;
		}

		/* Past the end of the block group: scan finished. */
		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			/* The gap [last, key.objectid) is free space. */
			total_found += add_new_free_space(block_group, last,
							  key.objectid);
			/*
			 * Advance 'last' past this extent: nodesize bytes for
			 * a METADATA_ITEM, key.offset bytes otherwise.
			 */
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->nodesize;
			else
				last = key.objectid + key.offset;

			/* Wake waiters each time enough free space was found. */
			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup)
					wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	/* Everything from 'last' to the end of the block group is free. */
	total_found += add_new_free_space(block_group, last,
					  block_group->key.objectid +
					  block_group->key.offset);
	caching_ctl->progress = (u64)-1;

out:
	btrfs_free_path(path);
	return ret;
}
  515
  516static noinline void caching_thread(struct btrfs_work *work)
  517{
  518	struct btrfs_block_group_cache *block_group;
  519	struct btrfs_fs_info *fs_info;
  520	struct btrfs_caching_control *caching_ctl;
  521	int ret;
  522
  523	caching_ctl = container_of(work, struct btrfs_caching_control, work);
  524	block_group = caching_ctl->block_group;
  525	fs_info = block_group->fs_info;
  526
  527	mutex_lock(&caching_ctl->mutex);
  528	down_read(&fs_info->commit_root_sem);
  529
  530	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
  531		ret = load_free_space_tree(caching_ctl);
  532	else
  533		ret = load_extent_tree_free(caching_ctl);
  534
  535	spin_lock(&block_group->lock);
  536	block_group->caching_ctl = NULL;
  537	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
  538	spin_unlock(&block_group->lock);
  539
  540#ifdef CONFIG_BTRFS_DEBUG
  541	if (btrfs_should_fragment_free_space(block_group)) {
  542		u64 bytes_used;
  543
  544		spin_lock(&block_group->space_info->lock);
  545		spin_lock(&block_group->lock);
  546		bytes_used = block_group->key.offset -
  547			btrfs_block_group_used(&block_group->item);
  548		block_group->space_info->bytes_used += bytes_used >> 1;
  549		spin_unlock(&block_group->lock);
  550		spin_unlock(&block_group->space_info->lock);
  551		fragment_free_space(block_group);
  552	}
  553#endif
  554
  555	caching_ctl->progress = (u64)-1;
  556
  557	up_read(&fs_info->commit_root_sem);
  558	free_excluded_extents(block_group);
  559	mutex_unlock(&caching_ctl->mutex);
  560
  561	wake_up(&caching_ctl->wait);
  562
  563	put_caching_control(caching_ctl);
  564	btrfs_put_block_group(block_group);
  565}
  566
/*
 * Start (or complete) loading the free space information of @cache.
 *
 * A caching control is allocated and attached to the block group while its
 * state is BTRFS_CACHE_NO.  If the SPACE_CACHE mount option is set, the free
 * space cache is tried first via load_free_space_cache(); a return value of 1
 * from it marks the block group BTRFS_CACHE_FINISHED right away.  Otherwise
 * the block group is either put back to BTRFS_CACHE_NO (@load_cache_only
 * set) or moved to BTRFS_CACHE_STARTED and caching_thread() is queued on
 * fs_info->caching_workers.
 *
 * @cache:           block group to cache
 * @load_cache_only: non-zero to only try the quick load and never queue the
 *                   background caching work
 *
 * Returns -ENOMEM if the caching control cannot be allocated and 0 on the
 * early-return paths.  When the work is queued, 'ret' is returned as left by
 * load_free_space_cache() (it stays 0 when SPACE_CACHE is not set).
 */
static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	refcount_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
			caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but this could happen I think in the
	 * case where one thread starts to load the space cache info, and then
	 * some other thread starts a transaction commit which tries to do an
	 * allocation while the other thread is still loading the space cache
	 * info.  The previous loop should have kept us from choosing this block
	 * group, but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load here,
	 * so we can wait for it to finish, otherwise we could end up allocating
	 * from a block group whose cache gets evicted for one reason or
	 * another.
	 */
	while (cache->cached == BTRFS_CACHE_FAST) {
		struct btrfs_caching_control *ctl;

		/* Pin the other loader's control while sleeping on its queue. */
		ctl = cache->caching_ctl;
		refcount_inc(&ctl->count);
		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&cache->lock);

		schedule();

		finish_wait(&ctl->wait, &wait);
		put_caching_control(ctl);
		spin_lock(&cache->lock);
	}

	/* Caching already started or finished: our control is not needed. */
	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
		mutex_lock(&caching_ctl->mutex);
		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			/* The free space cache was loaded: caching is done. */
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
			caching_ctl->progress = (u64)-1;
		} else {
			if (load_cache_only) {
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				cache->cached = BTRFS_CACHE_STARTED;
				cache->has_caching_ctl = 1;
			}
		}
		spin_unlock(&cache->lock);
#ifdef CONFIG_BTRFS_DEBUG
		if (ret == 1 &&
		    btrfs_should_fragment_free_space(cache)) {
			u64 bytes_used;

			/* Charge half of the unused bytes as used, then fragment. */
			spin_lock(&cache->space_info->lock);
			spin_lock(&cache->lock);
			bytes_used = cache->key.offset -
				btrfs_block_group_used(&cache->item);
			cache->space_info->bytes_used += bytes_used >> 1;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);
			fragment_free_space(cache);
		}
#endif
		mutex_unlock(&caching_ctl->mutex);

		/* Release anybody who waited for the BTRFS_CACHE_FAST state. */
		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
			free_excluded_extents(cache);
			return 0;
		}
	} else {
		/*
		 * We're either using the free space tree or no caching at all.
		 * Set cached to the appropriate value and wakeup any waiters.
		 */
		spin_lock(&cache->lock);
		if (load_cache_only) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_NO;
		} else {
			cache->cached = BTRFS_CACHE_STARTED;
			cache->has_caching_ctl = 1;
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
	}

	/* Only the quick load was requested: drop our reference and stop. */
	if (load_cache_only) {
		put_caching_control(caching_ctl);
		return 0;
	}

	/* The list of caching block groups holds its own reference. */
	down_write(&fs_info->commit_root_sem);
	refcount_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->commit_root_sem);

	/* Block group reference for the worker; caching_thread() drops it. */
	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);

	return ret;
}
  702
  703/*
  704 * return the block group that starts at or after bytenr
  705 */
  706static struct btrfs_block_group_cache *
  707btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
  708{
  709	return block_group_cache_tree_search(info, bytenr, 0);
  710}
  711
  712/*
  713 * return the block group that contains the given bytenr
  714 */
  715struct btrfs_block_group_cache *btrfs_lookup_block_group(
  716						 struct btrfs_fs_info *info,
  717						 u64 bytenr)
  718{
  719	return block_group_cache_tree_search(info, bytenr, 1);
  720}
  721
  722static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
  723						  u64 flags)
  724{
  725	struct list_head *head = &info->space_info;
  726	struct btrfs_space_info *found;
  727
  728	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
  729
  730	rcu_read_lock();
  731	list_for_each_entry_rcu(found, head, list) {
  732		if (found->flags & flags) {
  733			rcu_read_unlock();
  734			return found;
  735		}
  736	}
  737	rcu_read_unlock();
  738	return NULL;
  739}
  740
  741static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
  742			     bool metadata, u64 root_objectid)
  743{
  744	struct btrfs_space_info *space_info;
  745	u64 flags;
  746
  747	if (metadata) {
  748		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
  749			flags = BTRFS_BLOCK_GROUP_SYSTEM;
  750		else
  751			flags = BTRFS_BLOCK_GROUP_METADATA;
  752	} else {
  753		flags = BTRFS_BLOCK_GROUP_DATA;
  754	}
  755
  756	space_info = __find_space_info(fs_info, flags);
  757	ASSERT(space_info);
  758	percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes,
  759		    BTRFS_TOTAL_BYTES_PINNED_BATCH);
  760}
  761
  762/*
  763 * after adding space to the filesystem, we need to clear the full flags
  764 * on all the space infos.
  765 */
  766void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
  767{
  768	struct list_head *head = &info->space_info;
  769	struct btrfs_space_info *found;
  770
  771	rcu_read_lock();
  772	list_for_each_entry_rcu(found, head, list)
  773		found->full = 0;
  774	rcu_read_unlock();
  775}
  776
  777/* simple helper to search for an existing data extent at a given offset */
  778int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
  779{
  780	int ret;
  781	struct btrfs_key key;
  782	struct btrfs_path *path;
  783
  784	path = btrfs_alloc_path();
  785	if (!path)
  786		return -ENOMEM;
  787
  788	key.objectid = start;
  789	key.offset = len;
  790	key.type = BTRFS_EXTENT_ITEM_KEY;
  791	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
  792	btrfs_free_path(path);
  793	return ret;
  794}
  795
/*
 * Helper function to look up the reference count and flags of a tree block.
 *
 * The head node for delayed refs is used to store the sum of all the
 * reference count modifications queued up in the rbtree.  The head node may
 * also store the extent flags to set.  This way you can check what the
 * reference count and extent flags would be once all the delayed refs have
 * been processed.
 *
 * @trans:    transaction handle; when NULL the commit root is searched
 *            without locking and delayed refs are not consulted
 * @fs_info:  filesystem the extent belongs to
 * @bytenr:   start of the extent
 * @offset:   key offset to search for
 * @metadata: non-zero to search for a METADATA_ITEM key; ignored (treated as
 *            0, with @offset replaced by the nodesize) when the filesystem
 *            lacks the SKINNY_METADATA incompat feature
 * @refs:     if not NULL, receives the reference count
 * @flags:    if not NULL, receives the extent flags
 *
 * Returns 0 on success (including when no extent item was found, in which
 * case the on-disk contribution is 0 refs and 0 flags), -ENOMEM if no path
 * could be allocated, -EINVAL for an extent item smaller than
 * struct btrfs_extent_item, or the negative value returned by
 * btrfs_search_slot().  @refs and @flags are left untouched on error.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Without a transaction, read the commit root and take no tree locks. */
	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	/*
	 * No exact METADATA_ITEM match: check whether the previous slot holds
	 * an EXTENT_ITEM for the same bytenr with offset == nodesize and, if
	 * so, use that item instead.
	 */
	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
			/* Item too small for struct btrfs_extent_item. */
			ret = -EINVAL;
			btrfs_print_v0_err(fs_info);
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		/* Nothing in the tree; delayed refs may still contribute. */
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			/* Pin the head so it is still valid once the lock is dropped. */
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		/* Apply the summed pending reference count modification. */
		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}
  931
  932/*
  933 * Back reference rules.  Back refs have three main goals:
  934 *
  935 * 1) differentiate between all holders of references to an extent so that
  936 *    when a reference is dropped we can make sure it was a valid reference
  937 *    before freeing the extent.
  938 *
  939 * 2) Provide enough information to quickly find the holders of an extent
  940 *    if we notice a given block is corrupted or bad.
  941 *
  942 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
  943 *    maintenance.  This is actually the same as #2, but with a slightly
  944 *    different use case.
  945 *
  946 * There are two kinds of back refs. The implicit back refs is optimized
  947 * for pointers in non-shared tree blocks. For a given pointer in a block,
  948 * back refs of this kind provide information about the block's owner tree
  949 * and the pointer's key. These information allow us to find the block by
  950 * b-tree searching. The full back refs is for pointers in tree blocks not
  951 * referenced by their owner trees. The location of tree block is recorded
  952 * in the back refs. Actually the full back refs is generic, and can be
  953 * used in all cases the implicit back refs is used. The major shortcoming
  954 * of the full back refs is its overhead. Every time a tree block gets
  955 * COWed, we have to update back refs entry for all pointers in it.
  956 *
  957 * For a newly allocated tree block, we use implicit back refs for
  958 * pointers in it. This means most tree related operations only involve
  959 * implicit back refs. For a tree block created in old transaction, the
  960 * only way to drop a reference to it is COW it. So we can detect the
  961 * event that tree block loses its owner tree's reference and do the
  962 * back refs conversion.
  963 *
  964 * When a tree block is COWed through a tree, there are four cases:
  965 *
  966 * The reference count of the block is one and the tree is the block's
  967 * owner tree. Nothing to do in this case.
  968 *
  969 * The reference count of the block is one and the tree is not the
  970 * block's owner tree. In this case, full back refs is used for pointers
  971 * in the block. Remove these full back refs, add implicit back refs for
  972 * every pointers in the new block.
  973 *
  974 * The reference count of the block is greater than one and the tree is
  975 * the block's owner tree. In this case, implicit back refs is used for
  976 * pointers in the block. Add full back refs for every pointers in the
  977 * block, increase lower level extents' reference counts. The original
  978 * implicit back refs are entailed to the new block.
  979 *
  980 * The reference count of the block is greater than one and the tree is
  981 * not the block's owner tree. Add implicit back refs for every pointer in
  982 * the new block, increase lower level extents' reference count.
  983 *
  984 * Back Reference Key composing:
  985 *
  986 * The key objectid corresponds to the first byte in the extent,
  987 * The key type is used to differentiate between types of back refs.
  988 * There are different meanings of the key offset for different types
  989 * of back refs.
  990 *
  991 * File extents can be referenced by:
  992 *
  993 * - multiple snapshots, subvolumes, or different generations in one subvol
  994 * - different files inside a single subvolume
  995 * - different offsets inside a file (bookend extents in file.c)
  996 *
  997 * The extent ref structure for the implicit back refs has fields for:
  998 *
  999 * - Objectid of the subvolume root
 1000 * - objectid of the file holding the reference
 1001 * - original offset in the file
 1002 * - how many bookend extents
 1003 *
 1004 * The key offset for the implicit back refs is hash of the first
 1005 * three fields.
 1006 *
 1007 * The extent ref structure for the full back refs has field for:
 1008 *
 1009 * - number of pointers in the tree leaf
 1010 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
 1013 *
 * When a file extent is allocated, the implicit back refs is used.
 * The fields are filled in:
 1016 *
 1017 *     (root_key.objectid, inode objectid, offset in file, 1)
 1018 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 1021 *
 1022 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 1023 *
 1024 * Btree extents can be referenced by:
 1025 *
 1026 * - Different subvolumes
 1027 *
 1028 * Both the implicit back refs and the full back refs for tree blocks
 1029 * only consist of key. The key offset for the implicit back refs is
 1030 * objectid of block's owner tree. The key offset for the full back refs
 1031 * is the first byte of parent block.
 1032 *
 1033 * When implicit back refs is used, information about the lowest key and
 1034 * level of the tree block are required. These information are stored in
 1035 * tree block info structure.
 1036 */
 1037
 1038/*
 1039 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 1040 * is_data == BTRFS_REF_TYPE_DATA, data type is requried,
 1041 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 1042 */
 1043int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
 1044				     struct btrfs_extent_inline_ref *iref,
 1045				     enum btrfs_inline_ref_type is_data)
 1046{
 1047	int type = btrfs_extent_inline_ref_type(eb, iref);
 1048	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
 1049
 1050	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
 1051	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
 1052	    type == BTRFS_SHARED_DATA_REF_KEY ||
 1053	    type == BTRFS_EXTENT_DATA_REF_KEY) {
 1054		if (is_data == BTRFS_REF_TYPE_BLOCK) {
 1055			if (type == BTRFS_TREE_BLOCK_REF_KEY)
 1056				return type;
 1057			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
 1058				ASSERT(eb->fs_info);
 1059				/*
 1060				 * Every shared one has parent tree
 1061				 * block, which must be aligned to
 1062				 * nodesize.
 1063				 */
 1064				if (offset &&
 1065				    IS_ALIGNED(offset, eb->fs_info->nodesize))
 1066					return type;
 1067			}
 1068		} else if (is_data == BTRFS_REF_TYPE_DATA) {
 1069			if (type == BTRFS_EXTENT_DATA_REF_KEY)
 1070				return type;
 1071			if (type == BTRFS_SHARED_DATA_REF_KEY) {
 1072				ASSERT(eb->fs_info);
 1073				/*
 1074				 * Every shared one has parent tree
 1075				 * block, which must be aligned to
 1076				 * nodesize.
 1077				 */
 1078				if (offset &&
 1079				    IS_ALIGNED(offset, eb->fs_info->nodesize))
 1080					return type;
 1081			}
 1082		} else {
 1083			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
 1084			return type;
 1085		}
 1086	}
 1087
 1088	btrfs_print_leaf((struct extent_buffer *)eb);
 1089	btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
 1090		  eb->start, type);
 1091	WARN_ON(1);
 1092
 1093	return BTRFS_REF_TYPE_INVALID;
 1094}
 1095
 1096static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 1097{
 1098	u32 high_crc = ~(u32)0;
 1099	u32 low_crc = ~(u32)0;
 1100	__le64 lenum;
 1101
 1102	lenum = cpu_to_le64(root_objectid);
 1103	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
 1104	lenum = cpu_to_le64(owner);
 1105	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 1106	lenum = cpu_to_le64(offset);
 1107	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 1108
 1109	return ((u64)high_crc << 31) ^ (u64)low_crc;
 1110}
 1111
 1112static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
 1113				     struct btrfs_extent_data_ref *ref)
 1114{
 1115	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
 1116				    btrfs_extent_data_ref_objectid(leaf, ref),
 1117				    btrfs_extent_data_ref_offset(leaf, ref));
 1118}
 1119
 1120static int match_extent_data_ref(struct extent_buffer *leaf,
 1121				 struct btrfs_extent_data_ref *ref,
 1122				 u64 root_objectid, u64 owner, u64 offset)
 1123{
 1124	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
 1125	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
 1126	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
 1127		return 0;
 1128	return 1;
 1129}
 1130
 1131static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
 1132					   struct btrfs_path *path,
 1133					   u64 bytenr, u64 parent,
 1134					   u64 root_objectid,
 1135					   u64 owner, u64 offset)
 1136{
 1137	struct btrfs_root *root = trans->fs_info->extent_root;
 1138	struct btrfs_key key;
 1139	struct btrfs_extent_data_ref *ref;
 1140	struct extent_buffer *leaf;
 1141	u32 nritems;
 1142	int ret;
 1143	int recow;
 1144	int err = -ENOENT;
 1145
 1146	key.objectid = bytenr;
 1147	if (parent) {
 1148		key.type = BTRFS_SHARED_DATA_REF_KEY;
 1149		key.offset = parent;
 1150	} else {
 1151		key.type = BTRFS_EXTENT_DATA_REF_KEY;
 1152		key.offset = hash_extent_data_ref(root_objectid,
 1153						  owner, offset);
 1154	}
 1155again:
 1156	recow = 0;
 1157	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 1158	if (ret < 0) {
 1159		err = ret;
 1160		goto fail;
 1161	}
 1162
 1163	if (parent) {
 1164		if (!ret)
 1165			return 0;
 1166		goto fail;
 1167	}
 1168
 1169	leaf = path->nodes[0];
 1170	nritems = btrfs_header_nritems(leaf);
 1171	while (1) {
 1172		if (path->slots[0] >= nritems) {
 1173			ret = btrfs_next_leaf(root, path);
 1174			if (ret < 0)
 1175				err = ret;
 1176			if (ret)
 1177				goto fail;
 1178
 1179			leaf = path->nodes[0];
 1180			nritems = btrfs_header_nritems(leaf);
 1181			recow = 1;
 1182		}
 1183
 1184		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 1185		if (key.objectid != bytenr ||
 1186		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
 1187			goto fail;
 1188
 1189		ref = btrfs_item_ptr(leaf, path->slots[0],
 1190				     struct btrfs_extent_data_ref);
 1191
 1192		if (match_extent_data_ref(leaf, ref, root_objectid,
 1193					  owner, offset)) {
 1194			if (recow) {
 1195				btrfs_release_path(path);
 1196				goto again;
 1197			}
 1198			err = 0;
 1199			break;
 1200		}
 1201		path->slots[0]++;
 1202	}
 1203fail:
 1204	return err;
 1205}
 1206
 1207static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
 1208					   struct btrfs_path *path,
 1209					   u64 bytenr, u64 parent,
 1210					   u64 root_objectid, u64 owner,
 1211					   u64 offset, int refs_to_add)
 1212{
 1213	struct btrfs_root *root = trans->fs_info->extent_root;
 1214	struct btrfs_key key;
 1215	struct extent_buffer *leaf;
 1216	u32 size;
 1217	u32 num_refs;
 1218	int ret;
 1219
 1220	key.objectid = bytenr;
 1221	if (parent) {
 1222		key.type = BTRFS_SHARED_DATA_REF_KEY;
 1223		key.offset = parent;
 1224		size = sizeof(struct btrfs_shared_data_ref);
 1225	} else {
 1226		key.type = BTRFS_EXTENT_DATA_REF_KEY;
 1227		key.offset = hash_extent_data_ref(root_objectid,
 1228						  owner, offset);
 1229		size = sizeof(struct btrfs_extent_data_ref);
 1230	}
 1231
 1232	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
 1233	if (ret && ret != -EEXIST)
 1234		goto fail;
 1235
 1236	leaf = path->nodes[0];
 1237	if (parent) {
 1238		struct btrfs_shared_data_ref *ref;
 1239		ref = btrfs_item_ptr(leaf, path->slots[0],
 1240				     struct btrfs_shared_data_ref);
 1241		if (ret == 0) {
 1242			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
 1243		} else {
 1244			num_refs = btrfs_shared_data_ref_count(leaf, ref);
 1245			num_refs += refs_to_add;
 1246			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
 1247		}
 1248	} else {
 1249		struct btrfs_extent_data_ref *ref;
 1250		while (ret == -EEXIST) {
 1251			ref = btrfs_item_ptr(leaf, path->slots[0],
 1252					     struct btrfs_extent_data_ref);
 1253			if (match_extent_data_ref(leaf, ref, root_objectid,
 1254						  owner, offset))
 1255				break;
 1256			btrfs_release_path(path);
 1257			key.offset++;
 1258			ret = btrfs_insert_empty_item(trans, root, path, &key,
 1259						      size);
 1260			if (ret && ret != -EEXIST)
 1261				goto fail;
 1262
 1263			leaf = path->nodes[0];
 1264		}
 1265		ref = btrfs_item_ptr(leaf, path->slots[0],
 1266				     struct btrfs_extent_data_ref);
 1267		if (ret == 0) {
 1268			btrfs_set_extent_data_ref_root(leaf, ref,
 1269						       root_objectid);
 1270			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
 1271			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
 1272			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
 1273		} else {
 1274			num_refs = btrfs_extent_data_ref_count(leaf, ref);
 1275			num_refs += refs_to_add;
 1276			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
 1277		}
 1278	}
 1279	btrfs_mark_buffer_dirty(leaf);
 1280	ret = 0;
 1281fail:
 1282	btrfs_release_path(path);
 1283	return ret;
 1284}
 1285
 1286static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
 1287					   struct btrfs_path *path,
 1288					   int refs_to_drop, int *last_ref)
 1289{
 1290	struct btrfs_key key;
 1291	struct btrfs_extent_data_ref *ref1 = NULL;
 1292	struct btrfs_shared_data_ref *ref2 = NULL;
 1293	struct extent_buffer *leaf;
 1294	u32 num_refs = 0;
 1295	int ret = 0;
 1296
 1297	leaf = path->nodes[0];
 1298	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 1299
 1300	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
 1301		ref1 = btrfs_item_ptr(leaf, path->slots[0],
 1302				      struct btrfs_extent_data_ref);
 1303		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
 1304	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
 1305		ref2 = btrfs_item_ptr(leaf, path->slots[0],
 1306				      struct btrfs_shared_data_ref);
 1307		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
 1308	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
 1309		btrfs_print_v0_err(trans->fs_info);
 1310		btrfs_abort_transaction(trans, -EINVAL);
 1311		return -EINVAL;
 1312	} else {
 1313		BUG();
 1314	}
 1315
 1316	BUG_ON(num_refs < refs_to_drop);
 1317	num_refs -= refs_to_drop;
 1318
 1319	if (num_refs == 0) {
 1320		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
 1321		*last_ref = 1;
 1322	} else {
 1323		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
 1324			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
 1325		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
 1326			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
 1327		btrfs_mark_buffer_dirty(leaf);
 1328	}
 1329	return ret;
 1330}
 1331
 1332static noinline u32 extent_data_ref_count(struct btrfs_path *path,
 1333					  struct btrfs_extent_inline_ref *iref)
 1334{
 1335	struct btrfs_key key;
 1336	struct extent_buffer *leaf;
 1337	struct btrfs_extent_data_ref *ref1;
 1338	struct btrfs_shared_data_ref *ref2;
 1339	u32 num_refs = 0;
 1340	int type;
 1341
 1342	leaf = path->nodes[0];
 1343	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 1344
 1345	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
 1346	if (iref) {
 1347		/*
 1348		 * If type is invalid, we should have bailed out earlier than
 1349		 * this call.
 1350		 */
 1351		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
 1352		ASSERT(type != BTRFS_REF_TYPE_INVALID);
 1353		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 1354			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
 1355			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
 1356		} else {
 1357			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
 1358			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
 1359		}
 1360	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
 1361		ref1 = btrfs_item_ptr(leaf, path->slots[0],
 1362				      struct btrfs_extent_data_ref);
 1363		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
 1364	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
 1365		ref2 = btrfs_item_ptr(leaf, path->slots[0],
 1366				      struct btrfs_shared_data_ref);
 1367		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
 1368	} else {
 1369		WARN_ON(1);
 1370	}
 1371	return num_refs;
 1372}
 1373
 1374static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
 1375					  struct btrfs_path *path,
 1376					  u64 bytenr, u64 parent,
 1377					  u64 root_objectid)
 1378{
 1379	struct btrfs_root *root = trans->fs_info->extent_root;
 1380	struct btrfs_key key;
 1381	int ret;
 1382
 1383	key.objectid = bytenr;
 1384	if (parent) {
 1385		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
 1386		key.offset = parent;
 1387	} else {
 1388		key.type = BTRFS_TREE_BLOCK_REF_KEY;
 1389		key.offset = root_objectid;
 1390	}
 1391
 1392	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 1393	if (ret > 0)
 1394		ret = -ENOENT;
 1395	return ret;
 1396}
 1397
 1398static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
 1399					  struct btrfs_path *path,
 1400					  u64 bytenr, u64 parent,
 1401					  u64 root_objectid)
 1402{
 1403	struct btrfs_key key;
 1404	int ret;
 1405
 1406	key.objectid = bytenr;
 1407	if (parent) {
 1408		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
 1409		key.offset = parent;
 1410	} else {
 1411		key.type = BTRFS_TREE_BLOCK_REF_KEY;
 1412		key.offset = root_objectid;
 1413	}
 1414
 1415	ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
 1416				      path, &key, 0);
 1417	btrfs_release_path(path);
 1418	return ret;
 1419}
 1420
 1421static inline int extent_ref_type(u64 parent, u64 owner)
 1422{
 1423	int type;
 1424	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 1425		if (parent > 0)
 1426			type = BTRFS_SHARED_BLOCK_REF_KEY;
 1427		else
 1428			type = BTRFS_TREE_BLOCK_REF_KEY;
 1429	} else {
 1430		if (parent > 0)
 1431			type = BTRFS_SHARED_DATA_REF_KEY;
 1432		else
 1433			type = BTRFS_EXTENT_DATA_REF_KEY;
 1434	}
 1435	return type;
 1436}
 1437
 1438static int find_next_key(struct btrfs_path *path, int level,
 1439			 struct btrfs_key *key)
 1440
 1441{
 1442	for (; level < BTRFS_MAX_LEVEL; level++) {
 1443		if (!path->nodes[level])
 1444			break;
 1445		if (path->slots[level] + 1 >=
 1446		    btrfs_header_nritems(path->nodes[level]))
 1447			continue;
 1448		if (level == 0)
 1449			btrfs_item_key_to_cpu(path->nodes[level], key,
 1450					      path->slots[level] + 1);
 1451		else
 1452			btrfs_node_key_to_cpu(path->nodes[level], key,
 1453					      path->slots[level] + 1);
 1454		return 0;
 1455	}
 1456	return 1;
 1457}
 1458
 1459/*
 1460 * look for inline back ref. if back ref is found, *ref_ret is set
 1461 * to the address of inline back ref, and 0 is returned.
 1462 *
 1463 * if back ref isn't found, *ref_ret is set to the address where it
 1464 * should be inserted, and -ENOENT is returned.
 1465 *
 1466 * if insert is true and there are too many inline back refs, the path
 1467 * points to the extent item, and -EAGAIN is returned.
 1468 *
 1469 * NOTE: inline back refs are ordered in the same way that back ref
 1470 *	 items in the tree are ordered.
 1471 */
 1472static noinline_for_stack
 1473int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 1474				 struct btrfs_path *path,
 1475				 struct btrfs_extent_inline_ref **ref_ret,
 1476				 u64 bytenr, u64 num_bytes,
 1477				 u64 parent, u64 root_objectid,
 1478				 u64 owner, u64 offset, int insert)
 1479{
 1480	struct btrfs_fs_info *fs_info = trans->fs_info;
 1481	struct btrfs_root *root = fs_info->extent_root;
 1482	struct btrfs_key key;
 1483	struct extent_buffer *leaf;
 1484	struct btrfs_extent_item *ei;
 1485	struct btrfs_extent_inline_ref *iref;
 1486	u64 flags;
 1487	u64 item_size;
 1488	unsigned long ptr;
 1489	unsigned long end;
 1490	int extra_size;
 1491	int type;
 1492	int want;
 1493	int ret;
 1494	int err = 0;
 1495	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
 1496	int needed;
 1497
 1498	key.objectid = bytenr;
 1499	key.type = BTRFS_EXTENT_ITEM_KEY;
 1500	key.offset = num_bytes;
 1501
 1502	want = extent_ref_type(parent, owner);
 1503	if (insert) {
 1504		extra_size = btrfs_extent_inline_ref_size(want);
 1505		path->keep_locks = 1;
 1506	} else
 1507		extra_size = -1;
 1508
 1509	/*
 1510	 * Owner is our level, so we can just add one to get the level for the
 1511	 * block we are interested in.
 1512	 */
 1513	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
 1514		key.type = BTRFS_METADATA_ITEM_KEY;
 1515		key.offset = owner;
 1516	}
 1517
 1518again:
 1519	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
 1520	if (ret < 0) {
 1521		err = ret;
 1522		goto out;
 1523	}
 1524
 1525	/*
 1526	 * We may be a newly converted file system which still has the old fat
 1527	 * extent entries for metadata, so try and see if we have one of those.
 1528	 */
 1529	if (ret > 0 && skinny_metadata) {
 1530		skinny_metadata = false;
 1531		if (path->slots[0]) {
 1532			path->slots[0]--;
 1533			btrfs_item_key_to_cpu(path->nodes[0], &key,
 1534					      path->slots[0]);
 1535			if (key.objectid == bytenr &&
 1536			    key.type == BTRFS_EXTENT_ITEM_KEY &&
 1537			    key.offset == num_bytes)
 1538				ret = 0;
 1539		}
 1540		if (ret) {
 1541			key.objectid = bytenr;
 1542			key.type = BTRFS_EXTENT_ITEM_KEY;
 1543			key.offset = num_bytes;
 1544			btrfs_release_path(path);
 1545			goto again;
 1546		}
 1547	}
 1548
 1549	if (ret && !insert) {
 1550		err = -ENOENT;
 1551		goto out;
 1552	} else if (WARN_ON(ret)) {
 1553		err = -EIO;
 1554		goto out;
 1555	}
 1556
 1557	leaf = path->nodes[0];
 1558	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 1559	if (unlikely(item_size < sizeof(*ei))) {
 1560		err = -EINVAL;
 1561		btrfs_print_v0_err(fs_info);
 1562		btrfs_abort_transaction(trans, err);
 1563		goto out;
 1564	}
 1565
 1566	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 1567	flags = btrfs_extent_flags(leaf, ei);
 1568
 1569	ptr = (unsigned long)(ei + 1);
 1570	end = (unsigned long)ei + item_size;
 1571
 1572	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
 1573		ptr += sizeof(struct btrfs_tree_block_info);
 1574		BUG_ON(ptr > end);
 1575	}
 1576
 1577	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
 1578		needed = BTRFS_REF_TYPE_DATA;
 1579	else
 1580		needed = BTRFS_REF_TYPE_BLOCK;
 1581
 1582	err = -ENOENT;
 1583	while (1) {
 1584		if (ptr >= end) {
 1585			WARN_ON(ptr > end);
 1586			break;
 1587		}
 1588		iref = (struct btrfs_extent_inline_ref *)ptr;
 1589		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
 1590		if (type == BTRFS_REF_TYPE_INVALID) {
 1591			err = -EUCLEAN;
 1592			goto out;
 1593		}
 1594
 1595		if (want < type)
 1596			break;
 1597		if (want > type) {
 1598			ptr += btrfs_extent_inline_ref_size(type);
 1599			continue;
 1600		}
 1601
 1602		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 1603			struct btrfs_extent_data_ref *dref;
 1604			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
 1605			if (match_extent_data_ref(leaf, dref, root_objectid,
 1606						  owner, offset)) {
 1607				err = 0;
 1608				break;
 1609			}
 1610			if (hash_extent_data_ref_item(leaf, dref) <
 1611			    hash_extent_data_ref(root_objectid, owner, offset))
 1612				break;
 1613		} else {
 1614			u64 ref_offset;
 1615			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
 1616			if (parent > 0) {
 1617				if (parent == ref_offset) {
 1618					err = 0;
 1619					break;
 1620				}
 1621				if (ref_offset < parent)
 1622					break;
 1623			} else {
 1624				if (root_objectid == ref_offset) {
 1625					err = 0;
 1626					break;
 1627				}
 1628				if (ref_offset < root_objectid)
 1629					break;
 1630			}
 1631		}
 1632		ptr += btrfs_extent_inline_ref_size(type);
 1633	}
 1634	if (err == -ENOENT && insert) {
 1635		if (item_size + extra_size >=
 1636		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
 1637			err = -EAGAIN;
 1638			goto out;
 1639		}
 1640		/*
 1641		 * To add new inline back ref, we have to make sure
 1642		 * there is no corresponding back ref item.
 1643		 * For simplicity, we just do not add new inline back
 1644		 * ref if there is any kind of item for this block
 1645		 */
 1646		if (find_next_key(path, 0, &key) == 0 &&
 1647		    key.objectid == bytenr &&
 1648		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
 1649			err = -EAGAIN;
 1650			goto out;
 1651		}
 1652	}
 1653	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
 1654out:
 1655	if (insert) {
 1656		path->keep_locks = 0;
 1657		btrfs_unlock_up_safe(path, 1);
 1658	}
 1659	return err;
 1660}
 1661
 1662/*
 1663 * helper to add new inline back ref
 1664 */
 1665static noinline_for_stack
 1666void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
 1667				 struct btrfs_path *path,
 1668				 struct btrfs_extent_inline_ref *iref,
 1669				 u64 parent, u64 root_objectid,
 1670				 u64 owner, u64 offset, int refs_to_add,
 1671				 struct btrfs_delayed_extent_op *extent_op)
 1672{
 1673	struct extent_buffer *leaf;
 1674	struct btrfs_extent_item *ei;
 1675	unsigned long ptr;
 1676	unsigned long end;
 1677	unsigned long item_offset;
 1678	u64 refs;
 1679	int size;
 1680	int type;
 1681
 1682	leaf = path->nodes[0];
 1683	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 1684	item_offset = (unsigned long)iref - (unsigned long)ei;
 1685
 1686	type = extent_ref_type(parent, owner);
 1687	size = btrfs_extent_inline_ref_size(type);
 1688
 1689	btrfs_extend_item(fs_info, path, size);
 1690
 1691	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 1692	refs = btrfs_extent_refs(leaf, ei);
 1693	refs += refs_to_add;
 1694	btrfs_set_extent_refs(leaf, ei, refs);
 1695	if (extent_op)
 1696		__run_delayed_extent_op(extent_op, leaf, ei);
 1697
 1698	ptr = (unsigned long)ei + item_offset;
 1699	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
 1700	if (ptr < end - size)
 1701		memmove_extent_buffer(leaf, ptr + size, ptr,
 1702				      end - size - ptr);
 1703
 1704	iref = (struct btrfs_extent_inline_ref *)ptr;
 1705	btrfs_set_extent_inline_ref_type(leaf, iref, type);
 1706	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 1707		struct btrfs_extent_data_ref *dref;
 1708		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
 1709		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
 1710		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
 1711		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
 1712		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
 1713	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
 1714		struct btrfs_shared_data_ref *sref;
 1715		sref = (struct btrfs_shared_data_ref *)(iref + 1);
 1716		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
 1717		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
 1718	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
 1719		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
 1720	} else {
 1721		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
 1722	}
 1723	btrfs_mark_buffer_dirty(leaf);
 1724}
 1725
 1726static int lookup_extent_backref(struct btrfs_trans_handle *trans,
 1727				 struct btrfs_path *path,
 1728				 struct btrfs_extent_inline_ref **ref_ret,
 1729				 u64 bytenr, u64 num_bytes, u64 parent,
 1730				 u64 root_objectid, u64 owner, u64 offset)
 1731{
 1732	int ret;
 1733
 1734	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
 1735					   num_bytes, parent, root_objectid,
 1736					   owner, offset, 0);
 1737	if (ret != -ENOENT)
 1738		return ret;
 1739
 1740	btrfs_release_path(path);
 1741	*ref_ret = NULL;
 1742
 1743	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 1744		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
 1745					    root_objectid);
 1746	} else {
 1747		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
 1748					     root_objectid, owner, offset);
 1749	}
 1750	return ret;
 1751}
 1752
 1753/*
 1754 * helper to update/remove inline back ref
 1755 */
 1756static noinline_for_stack
 1757void update_inline_extent_backref(struct btrfs_path *path,
 1758				  struct btrfs_extent_inline_ref *iref,
 1759				  int refs_to_mod,
 1760				  struct btrfs_delayed_extent_op *extent_op,
 1761				  int *last_ref)
 1762{
 1763	struct extent_buffer *leaf = path->nodes[0];
 1764	struct btrfs_fs_info *fs_info = leaf->fs_info;
 1765	struct btrfs_extent_item *ei;
 1766	struct btrfs_extent_data_ref *dref = NULL;
 1767	struct btrfs_shared_data_ref *sref = NULL;
 1768	unsigned long ptr;
 1769	unsigned long end;
 1770	u32 item_size;
 1771	int size;
 1772	int type;
 1773	u64 refs;
 1774
 1775	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 1776	refs = btrfs_extent_refs(leaf, ei);
 1777	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
 1778	refs += refs_to_mod;
 1779	btrfs_set_extent_refs(leaf, ei, refs);
 1780	if (extent_op)
 1781		__run_delayed_extent_op(extent_op, leaf, ei);
 1782
 1783	/*
 1784	 * If type is invalid, we should have bailed out after
 1785	 * lookup_inline_extent_backref().
 1786	 */
 1787	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
 1788	ASSERT(type != BTRFS_REF_TYPE_INVALID);
 1789
 1790	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
 1791		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
 1792		refs = btrfs_extent_data_ref_count(leaf, dref);
 1793	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
 1794		sref = (struct btrfs_shared_data_ref *)(iref + 1);
 1795		refs = btrfs_shared_data_ref_count(leaf, sref);
 1796	} else {
 1797		refs = 1;
 1798		BUG_ON(refs_to_mod != -1);
 1799	}
 1800
 1801	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
 1802	refs += refs_to_mod;
 1803
 1804	if (refs > 0) {
 1805		if (type == BTRFS_EXTENT_DATA_REF_KEY)
 1806			btrfs_set_extent_data_ref_count(leaf, dref, refs);
 1807		else
 1808			btrfs_set_shared_data_ref_count(leaf, sref, refs);
 1809	} else {
 1810		*last_ref = 1;
 1811		size =  btrfs_extent_inline_ref_size(type);
 1812		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 1813		ptr = (unsigned long)iref;
 1814		end = (unsigned long)ei + item_size;
 1815		if (ptr + size < end)
 1816			memmove_extent_buffer(leaf, ptr, ptr + size,
 1817					      end - ptr - size);
 1818		item_size -= size;
 1819		btrfs_truncate_item(fs_info, path, item_size, 1);
 1820	}
 1821	btrfs_mark_buffer_dirty(leaf);
 1822}
 1823
 1824static noinline_for_stack
 1825int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
 1826				 struct btrfs_path *path,
 1827				 u64 bytenr, u64 num_bytes, u64 parent,
 1828				 u64 root_objectid, u64 owner,
 1829				 u64 offset, int refs_to_add,
 1830				 struct btrfs_delayed_extent_op *extent_op)
 1831{
 1832	struct btrfs_extent_inline_ref *iref;
 1833	int ret;
 1834
 1835	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
 1836					   num_bytes, parent, root_objectid,
 1837					   owner, offset, 1);
 1838	if (ret == 0) {
 1839		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
 1840		update_inline_extent_backref(path, iref, refs_to_add,
 1841					     extent_op, NULL);
 1842	} else if (ret == -ENOENT) {
 1843		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
 1844					    root_objectid, owner, offset,
 1845					    refs_to_add, extent_op);
 1846		ret = 0;
 1847	}
 1848	return ret;
 1849}
 1850
 1851static int insert_extent_backref(struct btrfs_trans_handle *trans,
 1852				 struct btrfs_path *path,
 1853				 u64 bytenr, u64 parent, u64 root_objectid,
 1854				 u64 owner, u64 offset, int refs_to_add)
 1855{
 1856	int ret;
 1857	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 1858		BUG_ON(refs_to_add != 1);
 1859		ret = insert_tree_block_ref(trans, path, bytenr, parent,
 1860					    root_objectid);
 1861	} else {
 1862		ret = insert_extent_data_ref(trans, path, bytenr, parent,
 1863					     root_objectid, owner, offset,
 1864					     refs_to_add);
 1865	}
 1866	return ret;
 1867}
 1868
 1869static int remove_extent_backref(struct btrfs_trans_handle *trans,
 1870				 struct btrfs_path *path,
 1871				 struct btrfs_extent_inline_ref *iref,
 1872				 int refs_to_drop, int is_data, int *last_ref)
 1873{
 1874	int ret = 0;
 1875
 1876	BUG_ON(!is_data && refs_to_drop != 1);
 1877	if (iref) {
 1878		update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
 1879					     last_ref);
 1880	} else if (is_data) {
 1881		ret = remove_extent_data_ref(trans, path, refs_to_drop,
 1882					     last_ref);
 1883	} else {
 1884		*last_ref = 1;
 1885		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
 1886	}
 1887	return ret;
 1888}
 1889
 1890#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
 1891static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
 1892			       u64 *discarded_bytes)
 1893{
 1894	int j, ret = 0;
 1895	u64 bytes_left, end;
 1896	u64 aligned_start = ALIGN(start, 1 << 9);
 1897
 1898	if (WARN_ON(start != aligned_start)) {
 1899		len -= aligned_start - start;
 1900		len = round_down(len, 1 << 9);
 1901		start = aligned_start;
 1902	}
 1903
 1904	*discarded_bytes = 0;
 1905
 1906	if (!len)
 1907		return 0;
 1908
 1909	end = start + len;
 1910	bytes_left = len;
 1911
 1912	/* Skip any superblocks on this device. */
 1913	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
 1914		u64 sb_start = btrfs_sb_offset(j);
 1915		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
 1916		u64 size = sb_start - start;
 1917
 1918		if (!in_range(sb_start, start, bytes_left) &&
 1919		    !in_range(sb_end, start, bytes_left) &&
 1920		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
 1921			continue;
 1922
 1923		/*
 1924		 * Superblock spans beginning of range.  Adjust start and
 1925		 * try again.
 1926		 */
 1927		if (sb_start <= start) {
 1928			start += sb_end - start;
 1929			if (start > end) {
 1930				bytes_left = 0;
 1931				break;
 1932			}
 1933			bytes_left = end - start;
 1934			continue;
 1935		}
 1936
 1937		if (size) {
 1938			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
 1939						   GFP_NOFS, 0);
 1940			if (!ret)
 1941				*discarded_bytes += size;
 1942			else if (ret != -EOPNOTSUPP)
 1943				return ret;
 1944		}
 1945
 1946		start = sb_end;
 1947		if (start > end) {
 1948			bytes_left = 0;
 1949			break;
 1950		}
 1951		bytes_left = end - start;
 1952	}
 1953
 1954	if (bytes_left) {
 1955		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
 1956					   GFP_NOFS, 0);
 1957		if (!ret)
 1958			*discarded_bytes += bytes_left;
 1959	}
 1960	return ret;
 1961}
 1962
 1963int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
 1964			 u64 num_bytes, u64 *actual_bytes)
 1965{
 1966	int ret;
 1967	u64 discarded_bytes = 0;
 1968	struct btrfs_bio *bbio = NULL;
 1969
 1970
 1971	/*
 1972	 * Avoid races with device replace and make sure our bbio has devices
 1973	 * associated to its stripes that don't go away while we are discarding.
 1974	 */
 1975	btrfs_bio_counter_inc_blocked(fs_info);
 1976	/* Tell the block device(s) that the sectors can be discarded */
 1977	ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
 1978			      &bbio, 0);
 1979	/* Error condition is -ENOMEM */
 1980	if (!ret) {
 1981		struct btrfs_bio_stripe *stripe = bbio->stripes;
 1982		int i;
 1983
 1984
 1985		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
 1986			u64 bytes;
 1987			struct request_queue *req_q;
 1988
 1989			if (!stripe->dev->bdev) {
 1990				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
 1991				continue;
 1992			}
 1993			req_q = bdev_get_queue(stripe->dev->bdev);
 1994			if (!blk_queue_discard(req_q))
 1995				continue;
 1996
 1997			ret = btrfs_issue_discard(stripe->dev->bdev,
 1998						  stripe->physical,
 1999						  stripe->length,
 2000						  &bytes);
 2001			if (!ret)
 2002				discarded_bytes += bytes;
 2003			else if (ret != -EOPNOTSUPP)
 2004				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
 2005
 2006			/*
 2007			 * Just in case we get back EOPNOTSUPP for some reason,
 2008			 * just ignore the return value so we don't screw up
 2009			 * people calling discard_extent.
 2010			 */
 2011			ret = 0;
 2012		}
 2013		btrfs_put_bbio(bbio);
 2014	}
 2015	btrfs_bio_counter_dec(fs_info);
 2016
 2017	if (actual_bytes)
 2018		*actual_bytes = discarded_bytes;
 2019
 2020
 2021	if (ret == -EOPNOTSUPP)
 2022		ret = 0;
 2023	return ret;
 2024}
 2025
 2026/* Can return -ENOMEM */
 2027int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 2028			 struct btrfs_root *root,
 2029			 u64 bytenr, u64 num_bytes, u64 parent,
 2030			 u64 root_objectid, u64 owner, u64 offset)
 2031{
 2032	struct btrfs_fs_info *fs_info = root->fs_info;
 2033	int old_ref_mod, new_ref_mod;
 2034	int ret;
 2035
 2036	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
 2037	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
 2038
 2039	btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
 2040			   owner, offset, BTRFS_ADD_DELAYED_REF);
 2041
 2042	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 2043		ret = btrfs_add_delayed_tree_ref(trans, bytenr,
 2044						 num_bytes, parent,
 2045						 root_objectid, (int)owner,
 2046						 BTRFS_ADD_DELAYED_REF, NULL,
 2047						 &old_ref_mod, &new_ref_mod);
 2048	} else {
 2049		ret = btrfs_add_delayed_data_ref(trans, bytenr,
 2050						 num_bytes, parent,
 2051						 root_objectid, owner, offset,
 2052						 0, BTRFS_ADD_DELAYED_REF,
 2053						 &old_ref_mod, &new_ref_mod);
 2054	}
 2055
 2056	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
 2057		bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
 2058
 2059		add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
 2060	}
 2061
 2062	return ret;
 2063}
 2064
 2065/*
 2066 * __btrfs_inc_extent_ref - insert backreference for a given extent
 2067 *
 2068 * @trans:	    Handle of transaction
 2069 *
 2070 * @node:	    The delayed ref node used to get the bytenr/length for
 2071 *		    extent whose references are incremented.
 2072 *
 2073 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
 2074 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
 2075 *		    bytenr of the parent block. Since new extents are always
 2076 *		    created with indirect references, this will only be the case
 2077 *		    when relocating a shared extent. In that case, root_objectid
 2078 *		    will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
 2079 *		    be 0
 2080 *
 2081 * @root_objectid:  The id of the root where this modification has originated,
 2082 *		    this can be either one of the well-known metadata trees or
 2083 *		    the subvolume id which references this extent.
 2084 *
 2085 * @owner:	    For data extents it is the inode number of the owning file.
 2086 *		    For metadata extents this parameter holds the level in the
 2087 *		    tree of the extent.
 2088 *
 2089 * @offset:	    For metadata extents the offset is ignored and is currently
 2090 *		    always passed as 0. For data extents it is the fileoffset
 2091 *		    this extent belongs to.
 2092 *
 2093 * @refs_to_add     Number of references to add
 2094 *
 2095 * @extent_op       Pointer to a structure, holding information necessary when
 2096 *                  updating a tree block's flags
 2097 *
 2098 */
 2099static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 2100				  struct btrfs_delayed_ref_node *node,
 2101				  u64 parent, u64 root_objectid,
 2102				  u64 owner, u64 offset, int refs_to_add,
 2103				  struct btrfs_delayed_extent_op *extent_op)
 2104{
 2105	struct btrfs_path *path;
 2106	struct extent_buffer *leaf;
 2107	struct btrfs_extent_item *item;
 2108	struct btrfs_key key;
 2109	u64 bytenr = node->bytenr;
 2110	u64 num_bytes = node->num_bytes;
 2111	u64 refs;
 2112	int ret;
 2113
 2114	path = btrfs_alloc_path();
 2115	if (!path)
 2116		return -ENOMEM;
 2117
 2118	path->reada = READA_FORWARD;
 2119	path->leave_spinning = 1;
 2120	/* this will setup the path even if it fails to insert the back ref */
 2121	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
 2122					   parent, root_objectid, owner,
 2123					   offset, refs_to_add, extent_op);
 2124	if ((ret < 0 && ret != -EAGAIN) || !ret)
 2125		goto out;
 2126
 2127	/*
 2128	 * Ok we had -EAGAIN which means we didn't have space to insert and
 2129	 * inline extent ref, so just update the reference count and add a
 2130	 * normal backref.
 2131	 */
 2132	leaf = path->nodes[0];
 2133	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 2134	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 2135	refs = btrfs_extent_refs(leaf, item);
 2136	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
 2137	if (extent_op)
 2138		__run_delayed_extent_op(extent_op, leaf, item);
 2139
 2140	btrfs_mark_buffer_dirty(leaf);
 2141	btrfs_release_path(path);
 2142
 2143	path->reada = READA_FORWARD;
 2144	path->leave_spinning = 1;
 2145	/* now insert the actual backref */
 2146	ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
 2147				    owner, offset, refs_to_add);
 2148	if (ret)
 2149		btrfs_abort_transaction(trans, ret);
 2150out:
 2151	btrfs_free_path(path);
 2152	return ret;
 2153}
 2154
 2155static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 2156				struct btrfs_delayed_ref_node *node,
 2157				struct btrfs_delayed_extent_op *extent_op,
 2158				int insert_reserved)
 2159{
 2160	int ret = 0;
 2161	struct btrfs_delayed_data_ref *ref;
 2162	struct btrfs_key ins;
 2163	u64 parent = 0;
 2164	u64 ref_root = 0;
 2165	u64 flags = 0;
 2166
 2167	ins.objectid = node->bytenr;
 2168	ins.offset = node->num_bytes;
 2169	ins.type = BTRFS_EXTENT_ITEM_KEY;
 2170
 2171	ref = btrfs_delayed_node_to_data_ref(node);
 2172	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
 2173
 2174	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
 2175		parent = ref->parent;
 2176	ref_root = ref->root;
 2177
 2178	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
 2179		if (extent_op)
 2180			flags |= extent_op->flags_to_set;
 2181		ret = alloc_reserved_file_extent(trans, parent, ref_root,
 2182						 flags, ref->objectid,
 2183						 ref->offset, &ins,
 2184						 node->ref_mod);
 2185	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
 2186		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
 2187					     ref->objectid, ref->offset,
 2188					     node->ref_mod, extent_op);
 2189	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
 2190		ret = __btrfs_free_extent(trans, node, parent,
 2191					  ref_root, ref->objectid,
 2192					  ref->offset, node->ref_mod,
 2193					  extent_op);
 2194	} else {
 2195		BUG();
 2196	}
 2197	return ret;
 2198}
 2199
 2200static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
 2201				    struct extent_buffer *leaf,
 2202				    struct btrfs_extent_item *ei)
 2203{
 2204	u64 flags = btrfs_extent_flags(leaf, ei);
 2205	if (extent_op->update_flags) {
 2206		flags |= extent_op->flags_to_set;
 2207		btrfs_set_extent_flags(leaf, ei, flags);
 2208	}
 2209
 2210	if (extent_op->update_key) {
 2211		struct btrfs_tree_block_info *bi;
 2212		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
 2213		bi = (struct btrfs_tree_block_info *)(ei + 1);
 2214		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
 2215	}
 2216}
 2217
 2218static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
 2219				 struct btrfs_delayed_ref_head *head,
 2220				 struct btrfs_delayed_extent_op *extent_op)
 2221{
 2222	struct btrfs_fs_info *fs_info = trans->fs_info;
 2223	struct btrfs_key key;
 2224	struct btrfs_path *path;
 2225	struct btrfs_extent_item *ei;
 2226	struct extent_buffer *leaf;
 2227	u32 item_size;
 2228	int ret;
 2229	int err = 0;
 2230	int metadata = !extent_op->is_data;
 2231
 2232	if (trans->aborted)
 2233		return 0;
 2234
 2235	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
 2236		metadata = 0;
 2237
 2238	path = btrfs_alloc_path();
 2239	if (!path)
 2240		return -ENOMEM;
 2241
 2242	key.objectid = head->bytenr;
 2243
 2244	if (metadata) {
 2245		key.type = BTRFS_METADATA_ITEM_KEY;
 2246		key.offset = extent_op->level;
 2247	} else {
 2248		key.type = BTRFS_EXTENT_ITEM_KEY;
 2249		key.offset = head->num_bytes;
 2250	}
 2251
 2252again:
 2253	path->reada = READA_FORWARD;
 2254	path->leave_spinning = 1;
 2255	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
 2256	if (ret < 0) {
 2257		err = ret;
 2258		goto out;
 2259	}
 2260	if (ret > 0) {
 2261		if (metadata) {
 2262			if (path->slots[0] > 0) {
 2263				path->slots[0]--;
 2264				btrfs_item_key_to_cpu(path->nodes[0], &key,
 2265						      path->slots[0]);
 2266				if (key.objectid == head->bytenr &&
 2267				    key.type == BTRFS_EXTENT_ITEM_KEY &&
 2268				    key.offset == head->num_bytes)
 2269					ret = 0;
 2270			}
 2271			if (ret > 0) {
 2272				btrfs_release_path(path);
 2273				metadata = 0;
 2274
 2275				key.objectid = head->bytenr;
 2276				key.offset = head->num_bytes;
 2277				key.type = BTRFS_EXTENT_ITEM_KEY;
 2278				goto again;
 2279			}
 2280		} else {
 2281			err = -EIO;
 2282			goto out;
 2283		}
 2284	}
 2285
 2286	leaf = path->nodes[0];
 2287	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 2288
 2289	if (unlikely(item_size < sizeof(*ei))) {
 2290		err = -EINVAL;
 2291		btrfs_print_v0_err(fs_info);
 2292		btrfs_abort_transaction(trans, err);
 2293		goto out;
 2294	}
 2295
 2296	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 2297	__run_delayed_extent_op(extent_op, leaf, ei);
 2298
 2299	btrfs_mark_buffer_dirty(leaf);
 2300out:
 2301	btrfs_free_path(path);
 2302	return err;
 2303}
 2304
 2305static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 2306				struct btrfs_delayed_ref_node *node,
 2307				struct btrfs_delayed_extent_op *extent_op,
 2308				int insert_reserved)
 2309{
 2310	int ret = 0;
 2311	struct btrfs_delayed_tree_ref *ref;
 2312	u64 parent = 0;
 2313	u64 ref_root = 0;
 2314
 2315	ref = btrfs_delayed_node_to_tree_ref(node);
 2316	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
 2317
 2318	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
 2319		parent = ref->parent;
 2320	ref_root = ref->root;
 2321
 2322	if (node->ref_mod != 1) {
 2323		btrfs_err(trans->fs_info,
 2324	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
 2325			  node->bytenr, node->ref_mod, node->action, ref_root,
 2326			  parent);
 2327		return -EIO;
 2328	}
 2329	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
 2330		BUG_ON(!extent_op || !extent_op->update_flags);
 2331		ret = alloc_reserved_tree_block(trans, node, extent_op);
 2332	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
 2333		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
 2334					     ref->level, 0, 1, extent_op);
 2335	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
 2336		ret = __btrfs_free_extent(trans, node, parent, ref_root,
 2337					  ref->level, 0, 1, extent_op);
 2338	} else {
 2339		BUG();
 2340	}
 2341	return ret;
 2342}
 2343
 2344/* helper function to actually process a single delayed ref entry */
 2345static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 2346			       struct btrfs_delayed_ref_node *node,
 2347			       struct btrfs_delayed_extent_op *extent_op,
 2348			       int insert_reserved)
 2349{
 2350	int ret = 0;
 2351
 2352	if (trans->aborted) {
 2353		if (insert_reserved)
 2354			btrfs_pin_extent(trans->fs_info, node->bytenr,
 2355					 node->num_bytes, 1);
 2356		return 0;
 2357	}
 2358
 2359	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
 2360	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
 2361		ret = run_delayed_tree_ref(trans, node, extent_op,
 2362					   insert_reserved);
 2363	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
 2364		 node->type == BTRFS_SHARED_DATA_REF_KEY)
 2365		ret = run_delayed_data_ref(trans, node, extent_op,
 2366					   insert_reserved);
 2367	else
 2368		BUG();
 2369	return ret;
 2370}
 2371
 2372static inline struct btrfs_delayed_ref_node *
 2373select_delayed_ref(struct btrfs_delayed_ref_head *head)
 2374{
 2375	struct btrfs_delayed_ref_node *ref;
 2376
 2377	if (RB_EMPTY_ROOT(&head->ref_tree))
 2378		return NULL;
 2379
 2380	/*
 2381	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
 2382	 * This is to prevent a ref count from going down to zero, which deletes
 2383	 * the extent item from the extent tree, when there still are references
 2384	 * to add, which would fail because they would not find the extent item.
 2385	 */
 2386	if (!list_empty(&head->ref_add_list))
 2387		return list_first_entry(&head->ref_add_list,
 2388				struct btrfs_delayed_ref_node, add_list);
 2389
 2390	ref = rb_entry(rb_first(&head->ref_tree),
 2391		       struct btrfs_delayed_ref_node, ref_node);
 2392	ASSERT(list_empty(&ref->add_list));
 2393	return ref;
 2394}
 2395
 2396static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
 2397				      struct btrfs_delayed_ref_head *head)
 2398{
 2399	spin_lock(&delayed_refs->lock);
 2400	head->processing = 0;
 2401	delayed_refs->num_heads_ready++;
 2402	spin_unlock(&delayed_refs->lock);
 2403	btrfs_delayed_ref_unlock(head);
 2404}
 2405
 2406static int cleanup_extent_op(struct btrfs_trans_handle *trans,
 2407			     struct btrfs_delayed_ref_head *head)
 2408{
 2409	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 2410	int ret;
 2411
 2412	if (!extent_op)
 2413		return 0;
 2414	head->extent_op = NULL;
 2415	if (head->must_insert_reserved) {
 2416		btrfs_free_delayed_extent_op(extent_op);
 2417		return 0;
 2418	}
 2419	spin_unlock(&head->lock);
 2420	ret = run_delayed_extent_op(trans, head, extent_op);
 2421	btrfs_free_delayed_extent_op(extent_op);
 2422	return ret ? ret : 1;
 2423}
 2424
 2425static int cleanup_ref_head(struct btrfs_trans_handle *trans,
 2426			    struct btrfs_delayed_ref_head *head)
 2427{
 2428
 2429	struct btrfs_fs_info *fs_info = trans->fs_info;
 2430	struct btrfs_delayed_ref_root *delayed_refs;
 2431	int ret;
 2432
 2433	delayed_refs = &trans->transaction->delayed_refs;
 2434
 2435	ret = cleanup_extent_op(trans, head);
 2436	if (ret < 0) {
 2437		unselect_delayed_ref_head(delayed_refs, head);
 2438		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
 2439		return ret;
 2440	} else if (ret) {
 2441		return ret;
 2442	}
 2443
 2444	/*
 2445	 * Need to drop our head ref lock and re-acquire the delayed ref lock
 2446	 * and then re-check to make sure nobody got added.
 2447	 */
 2448	spin_unlock(&head->lock);
 2449	spin_lock(&delayed_refs->lock);
 2450	spin_lock(&head->lock);
 2451	if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
 2452		spin_unlock(&head->lock);
 2453		spin_unlock(&delayed_refs->lock);
 2454		return 1;
 2455	}
 2456	delayed_refs->num_heads--;
 2457	rb_erase(&head->href_node, &delayed_refs->href_root);
 2458	RB_CLEAR_NODE(&head->href_node);
 2459	spin_unlock(&head->lock);
 2460	spin_unlock(&delayed_refs->lock);
 2461	atomic_dec(&delayed_refs->num_entries);
 2462
 2463	trace_run_delayed_ref_head(fs_info, head, 0);
 2464
 2465	if (head->total_ref_mod < 0) {
 2466		struct btrfs_space_info *space_info;
 2467		u64 flags;
 2468
 2469		if (head->is_data)
 2470			flags = BTRFS_BLOCK_GROUP_DATA;
 2471		else if (head->is_system)
 2472			flags = BTRFS_BLOCK_GROUP_SYSTEM;
 2473		else
 2474			flags = BTRFS_BLOCK_GROUP_METADATA;
 2475		space_info = __find_space_info(fs_info, flags);
 2476		ASSERT(space_info);
 2477		percpu_counter_add_batch(&space_info->total_bytes_pinned,
 2478				   -head->num_bytes,
 2479				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
 2480
 2481		if (head->is_data) {
 2482			spin_lock(&delayed_refs->lock);
 2483			delayed_refs->pending_csums -= head->num_bytes;
 2484			spin_unlock(&delayed_refs->lock);
 2485		}
 2486	}
 2487
 2488	if (head->must_insert_reserved) {
 2489		btrfs_pin_extent(fs_info, head->bytenr,
 2490				 head->num_bytes, 1);
 2491		if (head->is_data) {
 2492			ret = btrfs_del_csums(trans, fs_info, head->bytenr,
 2493					      head->num_bytes);
 2494		}
 2495	}
 2496
 2497	/* Also free its reserved qgroup space */
 2498	btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
 2499				      head->qgroup_reserved);
 2500	btrfs_delayed_ref_unlock(head);
 2501	btrfs_put_delayed_ref_head(head);
 2502	return 0;
 2503}
 2504
/*
 * Run delayed refs of the current transaction, one ref head at a time.
 *
 * @trans: transaction handle whose transaction owns the delayed refs
 * @nr:    budget; no new ref head is selected once the internal counter has
 *         reached this value.  The counter is bumped for every ref that was
 *         run, every head that was cleaned up and every head that had to be
 *         skipped.
 *
 * Returns 0 on success or if called with an already aborted transaction.
 * On failure the negative error from cleanup_ref_head() or
 * run_one_delayed_ref() is returned; the transaction itself is aborted by the
 * caller, btrfs_run_delayed_refs().
 */
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
					     unsigned long nr)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_head *locked_ref = NULL;
	struct btrfs_delayed_extent_op *extent_op;
	ktime_t start = ktime_get();
	int ret;
	unsigned long count = 0;
	unsigned long actual_count = 0;
	int must_insert_reserved = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	while (1) {
		/* No head in hand: pick and lock the next one, budget permitting. */
		if (!locked_ref) {
			if (count >= nr)
				break;

			spin_lock(&delayed_refs->lock);
			locked_ref = btrfs_select_ref_head(trans);
			if (!locked_ref) {
				spin_unlock(&delayed_refs->lock);
				break;
			}

			/* grab the lock that says we are going to process
			 * all the refs for this head */
			ret = btrfs_delayed_ref_lock(trans, locked_ref);
			spin_unlock(&delayed_refs->lock);
			/*
			 * we may have dropped the spin lock to get the head
			 * mutex lock, and that might have given someone else
			 * time to free the head.  If that's true, it has been
			 * removed from our list and we can move on.
			 */
			if (ret == -EAGAIN) {
				locked_ref = NULL;
				count++;
				continue;
			}
		}

		/*
		 * We need to try and merge add/drops of the same ref since we
		 * can run into issues with relocate dropping the implicit ref
		 * and then it being added back again before the drop can
		 * finish.  If we merged anything we need to re-loop so we can
		 * get a good ref.
		 * Or we can get node references of the same type that weren't
		 * merged when created due to bumps in the tree mod seq, and
		 * we need to merge them to prevent adding an inline extent
		 * backref before dropping it (triggering a BUG_ON at
		 * insert_inline_extent_backref()).
		 */
		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);

		ref = select_delayed_ref(locked_ref);

		/*
		 * The sequence check says this ref must not run yet: give the
		 * head back and move on to another one.
		 */
		if (ref && ref->seq &&
		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
			spin_unlock(&locked_ref->lock);
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			locked_ref = NULL;
			cond_resched();
			count++;
			continue;
		}

		/*
		 * We're done processing refs in this ref_head, clean everything
		 * up and move on to the next ref_head.
		 */
		if (!ref) {
			ret = cleanup_ref_head(trans, locked_ref);
			if (ret > 0 ) {
				/* We dropped our lock, we need to loop. */
				ret = 0;
				continue;
			} else if (ret) {
				return ret;
			}
			locked_ref = NULL;
			count++;
			continue;
		}

		/* Detach the ref from the head before running it. */
		actual_count++;
		ref->in_tree = 0;
		rb_erase(&ref->ref_node, &locked_ref->ref_tree);
		RB_CLEAR_NODE(&ref->ref_node);
		if (!list_empty(&ref->add_list))
			list_del(&ref->add_list);
		/*
		 * When we play the delayed ref, also correct the ref_mod on
		 * head
		 */
		switch (ref->action) {
		case BTRFS_ADD_DELAYED_REF:
		case BTRFS_ADD_DELAYED_EXTENT:
			locked_ref->ref_mod -= ref->ref_mod;
			break;
		case BTRFS_DROP_DELAYED_REF:
			locked_ref->ref_mod += ref->ref_mod;
			break;
		default:
			WARN_ON(1);
		}
		atomic_dec(&delayed_refs->num_entries);

		/*
		 * Record the must-insert_reserved flag before we drop the spin
		 * lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		locked_ref->must_insert_reserved = 0;

		/* The head's extent op is taken over and run with this ref. */
		extent_op = locked_ref->extent_op;
		locked_ref->extent_op = NULL;
		spin_unlock(&locked_ref->lock);

		ret = run_one_delayed_ref(trans, ref, extent_op,
					  must_insert_reserved);

		btrfs_free_delayed_extent_op(extent_op);
		if (ret) {
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			btrfs_put_delayed_ref(ref);
			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
				    ret);
			return ret;
		}

		/* Keep locked_ref: the next pass continues with the same head. */
		btrfs_put_delayed_ref(ref);
		count++;
		cond_resched();
	}

	/*
	 * We don't want to include ref heads since we can have empty ref heads
	 * and those will drastically skew our runtime down since we just do
	 * accounting, no actual extent tree updates.
	 */
	if (actual_count > 0) {
		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
		u64 avg;

		/*
		 * We weigh the current average higher than our current runtime
		 * to avoid large swings in the average.
		 */
		spin_lock(&delayed_refs->lock);
		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
		spin_unlock(&delayed_refs->lock);
	}
	return 0;
}
 2669
 2670#ifdef SCRAMBLE_DELAYED_REFS
 2671/*
 2672 * Normally delayed refs get processed in ascending bytenr order. This
 2673 * correlates in most cases to the order added. To expose dependencies on this
 2674 * order, we start to process the tree in the middle instead of the beginning
 2675 */
 2676static u64 find_middle(struct rb_root *root)
 2677{
 2678	struct rb_node *n = root->rb_node;
 2679	struct btrfs_delayed_ref_node *entry;
 2680	int alt = 1;
 2681	u64 middle;
 2682	u64 first = 0, last = 0;
 2683
 2684	n = rb_first(root);
 2685	if (n) {
 2686		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
 2687		first = entry->bytenr;
 2688	}
 2689	n = rb_last(root);
 2690	if (n) {
 2691		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
 2692		last = entry->bytenr;
 2693	}
 2694	n = root->rb_node;
 2695
 2696	while (n) {
 2697		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
 2698		WARN_ON(!entry->in_tree);
 2699
 2700		middle = entry->bytenr;
 2701
 2702		if (alt)
 2703			n = n->rb_left;
 2704		else
 2705			n = n->rb_right;
 2706
 2707		alt = 1 - alt;
 2708	}
 2709	return middle;
 2710}
 2711#endif
 2712
 2713static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
 2714{
 2715	u64 num_bytes;
 2716
 2717	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
 2718			     sizeof(struct btrfs_extent_inline_ref));
 2719	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
 2720		num_bytes += heads * sizeof(struct btrfs_tree_block_info);
 2721
 2722	/*
 2723	 * We don't ever fill up leaves all the way so multiply by 2 just to be
 2724	 * closer to what we're really going to want to use.
 2725	 */
 2726	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
 2727}
 2728
 2729/*
 2730 * Takes the number of bytes to be csumm'ed and figures out how many leaves it
 2731 * would require to store the csums for that many bytes.
 2732 */
 2733u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
 2734{
 2735	u64 csum_size;
 2736	u64 num_csums_per_leaf;
 2737	u64 num_csums;
 2738
 2739	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
 2740	num_csums_per_leaf = div64_u64(csum_size,
 2741			(u64)btrfs_super_csum_size(fs_info->super_copy));
 2742	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
 2743	num_csums += num_csums_per_leaf - 1;
 2744	num_csums = div64_u64(num_csums, num_csums_per_leaf);
 2745	return num_csums;
 2746}
 2747
 2748int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 2749				       struct btrfs_fs_info *fs_info)
 2750{
 2751	struct btrfs_block_rsv *global_rsv;
 2752	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
 2753	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
 2754	unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
 2755	u64 num_bytes, num_dirty_bgs_bytes;
 2756	int ret = 0;
 2757
 2758	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 2759	num_heads = heads_to_leaves(fs_info, num_heads);
 2760	if (num_heads > 1)
 2761		num_bytes += (num_heads - 1) * fs_info->nodesize;
 2762	num_bytes <<= 1;
 2763	num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
 2764							fs_info->nodesize;
 2765	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
 2766							     num_dirty_bgs);
 2767	global_rsv = &fs_info->global_block_rsv;
 2768
 2769	/*
 2770	 * If we can't allocate any more chunks lets make sure we have _lots_ of
 2771	 * wiggle room since running delayed refs can create more delayed refs.
 2772	 */
 2773	if (global_rsv->space_info->full) {
 2774		num_dirty_bgs_bytes <<= 1;
 2775		num_bytes <<= 1;
 2776	}
 2777
 2778	spin_lock(&global_rsv->lock);
 2779	if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
 2780		ret = 1;
 2781	spin_unlock(&global_rsv->lock);
 2782	return ret;
 2783}
 2784
 2785int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
 2786				       struct btrfs_fs_info *fs_info)
 2787{
 2788	u64 num_entries =
 2789		atomic_read(&trans->transaction->delayed_refs.num_entries);
 2790	u64 avg_runtime;
 2791	u64 val;
 2792
 2793	smp_mb();
 2794	avg_runtime = fs_info->avg_delayed_ref_runtime;
 2795	val = num_entries * avg_runtime;
 2796	if (val >= NSEC_PER_SEC)
 2797		return 1;
 2798	if (val >= NSEC_PER_SEC / 2)
 2799		return 2;
 2800
 2801	return btrfs_check_space_for_delayed_refs(trans, fs_info);
 2802}
 2803
/*
 * Context of one asynchronous delayed-ref run.  It is filled in by
 * btrfs_async_run_delayed_refs() and consumed by delayed_ref_async_start().
 */
struct async_delayed_refs {
	struct btrfs_root *root;	/* root used to join the transaction */
	u64 transid;			/* run is skipped if the joined transaction is newer */
	int count;			/* count handed to btrfs_run_delayed_refs() */
	int error;			/* first error hit by the worker, 0 if none */
	int sync;			/* set: submitter waits and frees; clear: worker frees */
	struct completion wait;		/* completed by the worker when sync is set */
	struct btrfs_work work;		/* work item queued on the extent workers */
};
 2813
 2814static inline struct async_delayed_refs *
 2815to_async_delayed_refs(struct btrfs_work *work)
 2816{
 2817	return container_of(work, struct async_delayed_refs, work);
 2818}
 2819
 2820static void delayed_ref_async_start(struct btrfs_work *work)
 2821{
 2822	struct async_delayed_refs *async = to_async_delayed_refs(work);
 2823	struct btrfs_trans_handle *trans;
 2824	struct btrfs_fs_info *fs_info = async->root->fs_info;
 2825	int ret;
 2826
 2827	/* if the commit is already started, we don't need to wait here */
 2828	if (btrfs_transaction_blocked(fs_info))
 2829		goto done;
 2830
 2831	trans = btrfs_join_transaction(async->root);
 2832	if (IS_ERR(trans)) {
 2833		async->error = PTR_ERR(trans);
 2834		goto done;
 2835	}
 2836
 2837	/*
 2838	 * trans->sync means that when we call end_transaction, we won't
 2839	 * wait on delayed refs
 2840	 */
 2841	trans->sync = true;
 2842
 2843	/* Don't bother flushing if we got into a different transaction */
 2844	if (trans->transid > async->transid)
 2845		goto end;
 2846
 2847	ret = btrfs_run_delayed_refs(trans, async->count);
 2848	if (ret)
 2849		async->error = ret;
 2850end:
 2851	ret = btrfs_end_transaction(trans);
 2852	if (ret && !async->error)
 2853		async->error = ret;
 2854done:
 2855	if (async->sync)
 2856		complete(&async->wait);
 2857	else
 2858		kfree(async);
 2859}
 2860
 2861int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
 2862				 unsigned long count, u64 transid, int wait)
 2863{
 2864	struct async_delayed_refs *async;
 2865	int ret;
 2866
 2867	async = kmalloc(sizeof(*async), GFP_NOFS);
 2868	if (!async)
 2869		return -ENOMEM;
 2870
 2871	async->root = fs_info->tree_root;
 2872	async->count = count;
 2873	async->error = 0;
 2874	async->transid = transid;
 2875	if (wait)
 2876		async->sync = 1;
 2877	else
 2878		async->sync = 0;
 2879	init_completion(&async->wait);
 2880
 2881	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
 2882			delayed_ref_async_start, NULL, NULL);
 2883
 2884	btrfs_queue_work(fs_info->extent_workers, &async->work);
 2885
 2886	if (wait) {
 2887		wait_for_completion(&async->wait);
 2888		ret = async->error;
 2889		kfree(async);
 2890		return ret;
 2891	}
 2892	return 0;
 2893}
 2894
 2895/*
 2896 * this starts processing the delayed reference count updates and
 2897 * extent insertions we have queued up so far.  count can be
 2898 * 0, which means to process everything in the tree at the start
 2899 * of the run (but not newly added entries), or it can be some target
 2900 * number you'd like to process.
 2901 *
 2902 * Returns 0 on success or if called with an aborted transaction
 2903 * Returns <0 on error and aborts the transaction
 2904 */
 2905int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 2906			   unsigned long count)
 2907{
 2908	struct btrfs_fs_info *fs_info = trans->fs_info;
 2909	struct rb_node *node;
 2910	struct btrfs_delayed_ref_root *delayed_refs;
 2911	struct btrfs_delayed_ref_head *head;
 2912	int ret;
 2913	int run_all = count == (unsigned long)-1;
 2914	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
 2915
 2916	/* We'll clean this up in btrfs_cleanup_transaction */
 2917	if (trans->aborted)
 2918		return 0;
 2919
 2920	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
 2921		return 0;
 2922
 2923	delayed_refs = &trans->transaction->delayed_refs;
 2924	if (count == 0)
 2925		count = atomic_read(&delayed_refs->num_entries) * 2;
 2926
 2927again:
 2928#ifdef SCRAMBLE_DELAYED_REFS
 2929	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 2930#endif
 2931	trans->can_flush_pending_bgs = false;
 2932	ret = __btrfs_run_delayed_refs(trans, count);
 2933	if (ret < 0) {
 2934		btrfs_abort_transaction(trans, ret);
 2935		return ret;
 2936	}
 2937
 2938	if (run_all) {
 2939		if (!list_empty(&trans->new_bgs))
 2940			btrfs_create_pending_block_groups(trans);
 2941
 2942		spin_lock(&delayed_refs->lock);
 2943		node = rb_first(&delayed_refs->href_root);
 2944		if (!node) {
 2945			spin_unlock(&delayed_refs->lock);
 2946			goto out;
 2947		}
 2948		head = rb_entry(node, struct btrfs_delayed_ref_head,
 2949				href_node);
 2950		refcount_inc(&head->refs);
 2951		spin_unlock(&delayed_refs->lock);
 2952
 2953		/* Mutex was contended, block until it's released and retry. */
 2954		mutex_lock(&head->mutex);
 2955		mutex_unlock(&head->mutex);
 2956
 2957		btrfs_put_delayed_ref_head(head);
 2958		cond_resched();
 2959		goto again;
 2960	}
 2961out:
 2962	trans->can_flush_pending_bgs = can_flush_pending_bgs;
 2963	return 0;
 2964}
 2965
 2966int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 2967				struct btrfs_fs_info *fs_info,
 2968				u64 bytenr, u64 num_bytes, u64 flags,
 2969				int level, int is_data)
 2970{
 2971	struct btrfs_delayed_extent_op *extent_op;
 2972	int ret;
 2973
 2974	extent_op = btrfs_alloc_delayed_extent_op();
 2975	if (!extent_op)
 2976		return -ENOMEM;
 2977
 2978	extent_op->flags_to_set = flags;
 2979	extent_op->update_flags = true;
 2980	extent_op->update_key = false;
 2981	extent_op->is_data = is_data ? true : false;
 2982	extent_op->level = level;
 2983
 2984	ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
 2985					  num_bytes, extent_op);
 2986	if (ret)
 2987		btrfs_free_delayed_extent_op(extent_op);
 2988	return ret;
 2989}
 2990
 2991static noinline int check_delayed_ref(struct btrfs_root *root,
 2992				      struct btrfs_path *path,
 2993				      u64 objectid, u64 offset, u64 bytenr)
 2994{
 2995	struct btrfs_delayed_ref_head *head;
 2996	struct btrfs_delayed_ref_node *ref;
 2997	struct btrfs_delayed_data_ref *data_ref;
 2998	struct btrfs_delayed_ref_root *delayed_refs;
 2999	struct btrfs_transaction *cur_trans;
 3000	struct rb_node *node;
 3001	int ret = 0;
 3002
 3003	spin_lock(&root->fs_info->trans_lock);
 3004	cur_trans = root->fs_info->running_transaction;
 3005	if (cur_trans)
 3006		refcount_inc(&cur_trans->use_count);
 3007	spin_unlock(&root->fs_info->trans_lock);
 3008	if (!cur_trans)
 3009		return 0;
 3010
 3011	delayed_refs = &cur_trans->delayed_refs;
 3012	spin_lock(&delayed_refs->lock);
 3013	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
 3014	if (!head) {
 3015		spin_unlock(&delayed_refs->lock);
 3016		btrfs_put_transaction(cur_trans);
 3017		return 0;
 3018	}
 3019
 3020	if (!mutex_trylock(&head->mutex)) {
 3021		refcount_inc(&head->refs);
 3022		spin_unlock(&delayed_refs->lock);
 3023
 3024		btrfs_release_path(path);
 3025
 3026		/*
 3027		 * Mutex was contended, block until it's released and let
 3028		 * caller try again
 3029		 */
 3030		mutex_lock(&head->mutex);
 3031		mutex_unlock(&head->mutex);
 3032		btrfs_put_delayed_ref_head(head);
 3033		btrfs_put_transaction(cur_trans);
 3034		return -EAGAIN;
 3035	}
 3036	spin_unlock(&delayed_refs->lock);
 3037
 3038	spin_lock(&head->lock);
 3039	/*
 3040	 * XXX: We should replace this with a proper search function in the
 3041	 * future.
 3042	 */
 3043	for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
 3044		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
 3045		/* If it's a shared ref we know a cross reference exists */
 3046		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
 3047			ret = 1;
 3048			break;
 3049		}
 3050
 3051		data_ref = btrfs_delayed_node_to_data_ref(ref);
 3052
 3053		/*
 3054		 * If our ref doesn't match the one we're currently looking at
 3055		 * then we have a cross reference.
 3056		 */
 3057		if (data_ref->root != root->root_key.objectid ||
 3058		    data_ref->objectid != objectid ||
 3059		    data_ref->offset != offset) {
 3060			ret = 1;
 3061			break;
 3062		}
 3063	}
 3064	spin_unlock(&head->lock);
 3065	mutex_unlock(&head->mutex);
 3066	btrfs_put_transaction(cur_trans);
 3067	return ret;
 3068}
 3069
 3070static noinline int check_committed_ref(struct btrfs_root *root,
 3071					struct btrfs_path *path,
 3072					u64 objectid, u64 offset, u64 bytenr)
 3073{
 3074	struct btrfs_fs_info *fs_info = root->fs_info;
 3075	struct btrfs_root *extent_root = fs_info->extent_root;
 3076	struct extent_buffer *leaf;
 3077	struct btrfs_extent_data_ref *ref;
 3078	struct btrfs_extent_inline_ref *iref;
 3079	struct btrfs_extent_item *ei;
 3080	struct btrfs_key key;
 3081	u32 item_size;
 3082	int type;
 3083	int ret;
 3084
 3085	key.objectid = bytenr;
 3086	key.offset = (u64)-1;
 3087	key.type = BTRFS_EXTENT_ITEM_KEY;
 3088
 3089	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 3090	if (ret < 0)
 3091		goto out;
 3092	BUG_ON(ret == 0); /* Corruption */
 3093
 3094	ret = -ENOENT;
 3095	if (path->slots[0] == 0)
 3096		goto out;
 3097
 3098	path->slots[0]--;
 3099	leaf = path->nodes[0];
 3100	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 3101
 3102	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
 3103		goto out;
 3104
 3105	ret = 1;
 3106	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 3107	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 3108
 3109	if (item_size != sizeof(*ei) +
 3110	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
 3111		goto out;
 3112
 3113	if (btrfs_extent_generation(leaf, ei) <=
 3114	    btrfs_root_last_snapshot(&root->root_item))
 3115		goto out;
 3116
 3117	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
 3118
 3119	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
 3120	if (type != BTRFS_EXTENT_DATA_REF_KEY)
 3121		goto out;
 3122
 3123	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
 3124	if (btrfs_extent_refs(leaf, ei) !=
 3125	    btrfs_extent_data_ref_count(leaf, ref) ||
 3126	    btrfs_extent_data_ref_root(leaf, ref) !=
 3127	    root->root_key.objectid ||
 3128	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
 3129	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
 3130		goto out;
 3131
 3132	ret = 0;
 3133out:
 3134	return ret;
 3135}
 3136
 3137int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
 3138			  u64 bytenr)
 3139{
 3140	struct btrfs_path *path;
 3141	int ret;
 3142	int ret2;
 3143
 3144	path = btrfs_alloc_path();
 3145	if (!path)
 3146		return -ENOMEM;
 3147
 3148	do {
 3149		ret = check_committed_ref(root, path, objectid,
 3150					  offset, bytenr);
 3151		if (ret && ret != -ENOENT)
 3152			goto out;
 3153
 3154		ret2 = check_delayed_ref(root, path, objectid,
 3155					 offset, bytenr);
 3156	} while (ret2 == -EAGAIN);
 3157
 3158	if (ret2 && ret2 != -ENOENT) {
 3159		ret = ret2;
 3160		goto out;
 3161	}
 3162
 3163	if (ret != -ENOENT || ret2 != -ENOENT)
 3164		ret = 0;
 3165out:
 3166	btrfs_free_path(path);
 3167	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
 3168		WARN_ON(ret > 0);
 3169	return ret;
 3170}
 3171
 3172static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 3173			   struct btrfs_root *root,
 3174			   struct extent_buffer *buf,
 3175			   int full_backref, int inc)
 3176{
 3177	struct btrfs_fs_info *fs_info = root->fs_info;
 3178	u64 bytenr;
 3179	u64 num_bytes;
 3180	u64 parent;
 3181	u64 ref_root;
 3182	u32 nritems;
 3183	struct btrfs_key key;
 3184	struct btrfs_file_extent_item *fi;
 3185	int i;
 3186	int level;
 3187	int ret = 0;
 3188	int (*process_func)(struct btrfs_trans_handle *,
 3189			    struct btrfs_root *,
 3190			    u64, u64, u64, u64, u64, u64);
 3191
 3192
 3193	if (btrfs_is_testing(fs_info))
 3194		return 0;
 3195
 3196	ref_root = btrfs_header_owner(buf);
 3197	nritems = btrfs_header_nritems(buf);
 3198	level = btrfs_header_level(buf);
 3199
 3200	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
 3201		return 0;
 3202
 3203	if (inc)
 3204		process_func = btrfs_inc_extent_ref;
 3205	else
 3206		process_func = btrfs_free_extent;
 3207
 3208	if (full_backref)
 3209		parent = buf->start;
 3210	else
 3211		parent = 0;
 3212
 3213	for (i = 0; i < nritems; i++) {
 3214		if (level == 0) {
 3215			btrfs_item_key_to_cpu(buf, &key, i);
 3216			if (key.type != BTRFS_EXTENT_DATA_KEY)
 3217				continue;
 3218			fi = btrfs_item_ptr(buf, i,
 3219					    struct btrfs_file_extent_item);
 3220			if (btrfs_file_extent_type(buf, fi) ==
 3221			    BTRFS_FILE_EXTENT_INLINE)
 3222				continue;
 3223			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
 3224			if (bytenr == 0)
 3225				continue;
 3226
 3227			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
 3228			key.offset -= btrfs_file_extent_offset(buf, fi);
 3229			ret = process_func(trans, root, bytenr, num_bytes,
 3230					   parent, ref_root, key.objectid,
 3231					   key.offset);
 3232			if (ret)
 3233				goto fail;
 3234		} else {
 3235			bytenr = btrfs_node_blockptr(buf, i);
 3236			num_bytes = fs_info->nodesize;
 3237			ret = process_func(trans, root, bytenr, num_bytes,
 3238					   parent, ref_root, level - 1, 0);
 3239			if (ret)
 3240				goto fail;
 3241		}
 3242	}
 3243	return 0;
 3244fail:
 3245	return ret;
 3246}
 3247
 3248int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 3249		  struct extent_buffer *buf, int full_backref)
 3250{
 3251	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
 3252}
 3253
 3254int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 3255		  struct extent_buffer *buf, int full_backref)
 3256{
 3257	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
 3258}
 3259
 3260static int write_one_cache_group(struct btrfs_trans_handle *trans,
 3261				 struct btrfs_fs_info *fs_info,
 3262				 struct btrfs_path *path,
 3263				 struct btrfs_block_group_cache *cache)
 3264{
 3265	int ret;
 3266	struct btrfs_root *extent_root = fs_info->extent_root;
 3267	unsigned long bi;
 3268	struct extent_buffer *leaf;
 3269
 3270	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
 3271	if (ret) {
 3272		if (ret > 0)
 3273			ret = -ENOENT;
 3274		goto fail;
 3275	}
 3276
 3277	leaf = path->nodes[0];
 3278	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
 3279	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
 3280	btrfs_mark_buffer_dirty(leaf);
 3281fail:
 3282	btrfs_release_path(path);
 3283	return ret;
 3284
 3285}
 3286
 3287static struct btrfs_block_group_cache *
 3288next_block_group(struct btrfs_fs_info *fs_info,
 3289		 struct btrfs_block_group_cache *cache)
 3290{
 3291	struct rb_node *node;
 3292
 3293	spin_lock(&fs_info->block_group_cache_lock);
 3294
 3295	/* If our block group was removed, we need a full search. */
 3296	if (RB_EMPTY_NODE(&cache->cache_node)) {
 3297		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
 3298
 3299		spin_unlock(&fs_info->block_group_cache_lock);
 3300		btrfs_put_block_group(cache);
 3301		cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
 3302	}
 3303	node = rb_next(&cache->cache_node);
 3304	btrfs_put_block_group(cache);
 3305	if (node) {
 3306		cache = rb_entry(node, struct btrfs_block_group_cache,
 3307				 cache_node);
 3308		btrfs_get_block_group(cache);
 3309	} else
 3310		cache = NULL;
 3311	spin_unlock(&fs_info->block_group_cache_lock);
 3312	return cache;
 3313}
 3314
 3315static int cache_save_setup(struct btrfs_block_group_cache *block_group,
 3316			    struct btrfs_trans_handle *trans,
 3317			    struct btrfs_path *path)
 3318{
 3319	struct btrfs_fs_info *fs_info = block_group->fs_info;
 3320	struct btrfs_root *root = fs_info->tree_root;
 3321	struct inode *inode = NULL;
 3322	struct extent_changeset *data_reserved = NULL;
 3323	u64 alloc_hint = 0;
 3324	int dcs = BTRFS_DC_ERROR;
 3325	u64 num_pages = 0;
 3326	int retries = 0;
 3327	int ret = 0;
 3328
 3329	/*
 3330	 * If this block group is smaller than 100 megs don't bother caching the
 3331	 * block group.
 3332	 */
 3333	if (block_group->key.offset < (100 * SZ_1M)) {
 3334		spin_lock(&block_group->lock);
 3335		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
 3336		spin_unlock(&block_group->lock);
 3337		return 0;
 3338	}
 3339
 3340	if (trans->aborted)
 3341		return 0;
 3342again:
 3343	inode = lookup_free_space_inode(fs_info, block_group, path);
 3344	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
 3345		ret = PTR_ERR(inode);
 3346		btrfs_release_path(path);
 3347		goto out;
 3348	}
 3349
 3350	if (IS_ERR(inode)) {
 3351		BUG_ON(retries);
 3352		retries++;
 3353
 3354		if (block_group->ro)
 3355			goto out_free;
 3356
 3357		ret = create_free_space_inode(fs_info, trans, block_group,
 3358					      path);
 3359		if (ret)
 3360			goto out_free;
 3361		goto again;
 3362	}
 3363
 3364	/*
 3365	 * We want to set the generation to 0, that way if anything goes wrong
 3366	 * from here on out we know not to trust this cache when we load up next
 3367	 * time.
 3368	 */
 3369	BTRFS_I(inode)->generation = 0;
 3370	ret = btrfs_update_inode(trans, root, inode);
 3371	if (ret) {
 3372		/*
 3373		 * So theoretically we could recover from this, simply set the
 3374		 * super cache generation to 0 so we know to invalidate the
 3375		 * cache, but then we'd have to keep track of the block groups
 3376		 * that fail this way so we know we _have_ to reset this cache
 3377		 * before the next commit or risk reading stale cache.  So to
 3378		 * limit our exposure to horrible edge cases lets just abort the
 3379		 * transaction, this only happens in really bad situations
 3380		 * anyway.
 3381		 */
 3382		btrfs_abort_transaction(trans, ret);
 3383		goto out_put;
 3384	}
 3385	WARN_ON(ret);
 3386
 3387	/* We've already setup this transaction, go ahead and exit */
 3388	if (block_group->cache_generation == trans->transid &&
 3389	    i_size_read(inode)) {
 3390		dcs = BTRFS_DC_SETUP;
 3391		goto out_put;
 3392	}
 3393
 3394	if (i_size_read(inode) > 0) {
 3395		ret = btrfs_check_trunc_cache_free_space(fs_info,
 3396					&fs_info->global_block_rsv);
 3397		if (ret)
 3398			goto out_put;
 3399
 3400		ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
 3401		if (ret)
 3402			goto out_put;
 3403	}
 3404
 3405	spin_lock(&block_group->lock);
 3406	if (block_group->cached != BTRFS_CACHE_FINISHED ||
 3407	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
 3408		/*
 3409		 * don't bother trying to write stuff out _if_
 3410		 * a) we're not cached,
 3411		 * b) we're with nospace_cache mount option,
 3412		 * c) we're with v2 space_cache (FREE_SPACE_TREE).
 3413		 */
 3414		dcs = BTRFS_DC_WRITTEN;
 3415		spin_unlock(&block_group->lock);
 3416		goto out_put;
 3417	}
 3418	spin_unlock(&block_group->lock);
 3419
 3420	/*
 3421	 * We hit an ENOSPC when setting up the cache in this transaction, just
 3422	 * skip doing the setup, we've already cleared the cache so we're safe.
 3423	 */
 3424	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
 3425		ret = -ENOSPC;
 3426		goto out_put;
 3427	}
 3428
 3429	/*
 3430	 * Try to preallocate enough space based on how big the block group is.
 3431	 * Keep in mind this has to include any pinned space which could end up
 3432	 * taking up quite a bit since it's not folded into the other space
 3433	 * cache.
 3434	 */
 3435	num_pages = div_u64(block_group->key.offset, SZ_256M);
 3436	if (!num_pages)
 3437		num_pages = 1;
 3438
 3439	num_pages *= 16;
 3440	num_pages *= PAGE_SIZE;
 3441
 3442	ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
 3443	if (ret)
 3444		goto out_put;
 3445
 3446	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
 3447					      num_pages, num_pages,
 3448					      &alloc_hint);
 3449	/*
 3450	 * Our cache requires contiguous chunks so that we don't modify a bunch
 3451	 * of metadata or split extents when writing the cache out, which means
 3452	 * we can enospc if we are heavily fragmented in addition to just normal
 3453	 * out of space conditions.  So if we hit this just skip setting up any
 3454	 * other block groups for this transaction, maybe we'll unpin enough
 3455	 * space the next time around.
 3456	 */
 3457	if (!ret)
 3458		dcs = BTRFS_DC_SETUP;
 3459	else if (ret == -ENOSPC)
 3460		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
 3461
 3462out_put:
 3463	iput(inode);
 3464out_free:
 3465	btrfs_release_path(path);
 3466out:
 3467	spin_lock(&block_group->lock);
 3468	if (!ret && dcs == BTRFS_DC_SETUP)
 3469		block_group->cache_generation = trans->transid;
 3470	block_group->disk_cache_state = dcs;
 3471	spin_unlock(&block_group->lock);
 3472
 3473	extent_changeset_free(data_reserved);
 3474	return ret;
 3475}
 3476
 3477int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
 3478			    struct btrfs_fs_info *fs_info)
 3479{
 3480	struct btrfs_block_group_cache *cache, *tmp;
 3481	struct btrfs_transaction *cur_trans = trans->transaction;
 3482	struct btrfs_path *path;
 3483
 3484	if (list_empty(&cur_trans->dirty_bgs) ||
 3485	    !btrfs_test_opt(fs_info, SPACE_CACHE))
 3486		return 0;
 3487
 3488	path = btrfs_alloc_path();
 3489	if (!path)
 3490		return -ENOMEM;
 3491
 3492	/* Could add new block groups, use _safe just in case */
 3493	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
 3494				 dirty_list) {
 3495		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
 3496			cache_save_setup(cache, trans, path);
 3497	}
 3498
 3499	btrfs_free_path(path);
 3500	return 0;
 3501}
 3502
 3503/*
 3504 * transaction commit does final block group cache writeback during a
 3505 * critical section where nothing is allowed to change the FS.  This is
 3506 * required in order for the cache to actually match the block group,
 3507 * but can introduce a lot of latency into the commit.
 3508 *
 3509 * So, btrfs_start_dirty_block_groups is here to kick off block group
 3510 * cache IO.  There's a chance we'll have to redo some of it if the
 3511 * block group changes again during the commit, but it greatly reduces
 3512 * the commit latency by getting rid of the easy block groups while
 3513 * we're still allowing others to join the commit.
 3514 */
 3515int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
 3516{
 3517	struct btrfs_fs_info *fs_info = trans->fs_info;
 3518	struct btrfs_block_group_cache *cache;
 3519	struct btrfs_transaction *cur_trans = trans->transaction;
 3520	int ret = 0;
 3521	int should_put;
 3522	struct btrfs_path *path = NULL;
 3523	LIST_HEAD(dirty);
 3524	struct list_head *io = &cur_trans->io_bgs;
 3525	int num_started = 0;
 3526	int loops = 0;
 3527
 3528	spin_lock(&cur_trans->dirty_bgs_lock);
 3529	if (list_empty(&cur_trans->dirty_bgs)) {
 3530		spin_unlock(&cur_trans->dirty_bgs_lock);
 3531		return 0;
 3532	}
 3533	list_splice_init(&cur_trans->dirty_bgs, &dirty);
 3534	spin_unlock(&cur_trans->dirty_bgs_lock);
 3535
 3536again:
 3537	/*
 3538	 * make sure all the block groups on our dirty list actually
 3539	 * exist
 3540	 */
 3541	btrfs_create_pending_block_groups(trans);
 3542
 3543	if (!path) {
 3544		path = btrfs_alloc_path();
 3545		if (!path)
 3546			return -ENOMEM;
 3547	}
 3548
 3549	/*
 3550	 * cache_write_mutex is here only to save us from balance or automatic
 3551	 * removal of empty block groups deleting this block group while we are
 3552	 * writing out the cache
 3553	 */
 3554	mutex_lock(&trans->transaction->cache_write_mutex);
 3555	while (!list_empty(&dirty)) {
 3556		cache = list_first_entry(&dirty,
 3557					 struct btrfs_block_group_cache,
 3558					 dirty_list);
 3559		/*
 3560		 * this can happen if something re-dirties a block
 3561		 * group that is already under IO.  Just wait for it to
 3562		 * finish and then do it all again
 3563		 */
 3564		if (!list_empty(&cache->io_list)) {
 3565			list_del_init(&cache->io_list);
 3566			btrfs_wait_cache_io(trans, cache, path);
 3567			btrfs_put_block_group(cache);
 3568		}
 3569
 3570
 3571		/*
 3572		 * btrfs_wait_cache_io uses the cache->dirty_list to decide
 3573		 * if it should update the cache_state.  Don't delete
 3574		 * until after we wait.
 3575		 *
 3576		 * Since we're not running in the commit critical section
 3577		 * we need the dirty_bgs_lock to protect from update_block_group
 3578		 */
 3579		spin_lock(&cur_trans->dirty_bgs_lock);
 3580		list_del_init(&cache->dirty_list);
 3581		spin_unlock(&cur_trans->dirty_bgs_lock);
 3582
 3583		should_put = 1;
 3584
 3585		cache_save_setup(cache, trans, path);
 3586
 3587		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
 3588			cache->io_ctl.inode = NULL;
 3589			ret = btrfs_write_out_cache(fs_info, trans,
 3590						    cache, path);
 3591			if (ret == 0 && cache->io_ctl.inode) {
 3592				num_started++;
 3593				should_put = 0;
 3594
 3595				/*
 3596				 * The cache_write_mutex is protecting the
 3597				 * io_list, also refer to the definition of
 3598				 * btrfs_transaction::io_bgs for more details
 3599				 */
 3600				list_add_tail(&cache->io_list, io);
 3601			} else {
 3602				/*
 3603				 * if we failed to write the cache, the
 3604				 * generation will be bad and life goes on
 3605				 */
 3606				ret = 0;
 3607			}
 3608		}
 3609		if (!ret) {
 3610			ret = write_one_cache_group(trans, fs_info,
 3611						    path, cache);
 3612			/*
 3613			 * Our block group might still be attached to the list
 3614			 * of new block groups in the transaction handle of some
 3615			 * other task (struct btrfs_trans_handle->new_bgs). This
 3616			 * means its block group item isn't yet in the extent
 3617			 * tree. If this happens ignore the error, as we will
 3618			 * try again later in the critical section of the
 3619			 * transaction commit.
 3620			 */
 3621			if (ret == -ENOENT) {
 3622				ret = 0;
 3623				spin_lock(&cur_trans->dirty_bgs_lock);
 3624				if (list_empty(&cache->dirty_list)) {
 3625					list_add_tail(&cache->dirty_list,
 3626						      &cur_trans->dirty_bgs);
 3627					btrfs_get_block_group(cache);
 3628				}
 3629				spin_unlock(&cur_trans->dirty_bgs_lock);
 3630			} else if (ret) {
 3631				btrfs_abort_transaction(trans, ret);
 3632			}
 3633		}
 3634
 3635		/* if its not on the io list, we need to put the block group */
 3636		if (should_put)
 3637			btrfs_put_block_group(cache);
 3638
 3639		if (ret)
 3640			break;
 3641
 3642		/*
 3643		 * Avoid blocking other tasks for too long. It might even save
 3644		 * us from writing caches for block groups that are going to be
 3645		 * removed.
 3646		 */
 3647		mutex_unlock(&trans->transaction->cache_write_mutex);
 3648		mutex_lock(&trans->transaction->cache_write_mutex);
 3649	}
 3650	mutex_unlock(&trans->transaction->cache_write_mutex);
 3651
 3652	/*
 3653	 * go through delayed refs for all the stuff we've just kicked off
 3654	 * and then loop back (just once)
 3655	 */
 3656	ret = btrfs_run_delayed_refs(trans, 0);
 3657	if (!ret && loops == 0) {
 3658		loops++;
 3659		spin_lock(&cur_trans->dirty_bgs_lock);
 3660		list_splice_init(&cur_trans->dirty_bgs, &dirty);
 3661		/*
 3662		 * dirty_bgs_lock protects us from concurrent block group
 3663		 * deletes too (not just cache_write_mutex).
 3664		 */
 3665		if (!list_empty(&dirty)) {
 3666			spin_unlock(&cur_trans->dirty_bgs_lock);
 3667			goto again;
 3668		}
 3669		spin_unlock(&cur_trans->dirty_bgs_lock);
 3670	} else if (ret < 0) {
 3671		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
 3672	}
 3673
 3674	btrfs_free_path(path);
 3675	return ret;
 3676}
 3677
 3678int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 3679				   struct btrfs_fs_info *fs_info)
 3680{
 3681	struct btrfs_block_group_cache *cache;
 3682	struct btrfs_transaction *cur_trans = trans->transaction;
 3683	int ret = 0;
 3684	int should_put;
 3685	struct btrfs_path *path;
 3686	struct list_head *io = &cur_trans->io_bgs;
 3687	int num_started = 0;
 3688
 3689	path = btrfs_alloc_path();
 3690	if (!path)
 3691		return -ENOMEM;
 3692
 3693	/*
 3694	 * Even though we are in the critical section of the transaction commit,
 3695	 * we can still have concurrent tasks adding elements to this
 3696	 * transaction's list of dirty block groups. These tasks correspond to
 3697	 * endio free space workers started when writeback finishes for a
 3698	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
 3699	 * allocate new block groups as a result of COWing nodes of the root
 3700	 * tree when updating the free space inode. The writeback for the space
 3701	 * caches is triggered by an earlier call to
 3702	 * btrfs_start_dirty_block_groups() and iterations of the following
 3703	 * loop.
 3704	 * Also we want to do the cache_save_setup first and then run the
 3705	 * delayed refs to make sure we have the best chance at doing this all
 3706	 * in one shot.
 3707	 */
 3708	spin_lock(&cur_trans->dirty_bgs_lock);
 3709	while (!list_empty(&cur_trans->dirty_bgs)) {
 3710		cache = list_first_entry(&cur_trans->dirty_bgs,
 3711					 struct btrfs_block_group_cache,
 3712					 dirty_list);
 3713
 3714		/*
 3715		 * this can happen if cache_save_setup re-dirties a block
 3716		 * group that is already under IO.  Just wait for it to
 3717		 * finish and then do it all again
 3718		 */
 3719		if (!list_empty(&cache->io_list)) {
 3720			spin_unlock(&cur_trans->dirty_bgs_lock);
 3721			list_del_init(&cache->io_list);
 3722			btrfs_wait_cache_io(trans, cache, path);
 3723			btrfs_put_block_group(cache);
 3724			spin_lock(&cur_trans->dirty_bgs_lock);
 3725		}
 3726
 3727		/*
 3728		 * don't remove from the dirty list until after we've waited
 3729		 * on any pending IO
 3730		 */
 3731		list_del_init(&cache->dirty_list);
 3732		spin_unlock(&cur_trans->dirty_bgs_lock);
 3733		should_put = 1;
 3734
 3735		cache_save_setup(cache, trans, path);
 3736
 3737		if (!ret)
 3738			ret = btrfs_run_delayed_refs(trans,
 3739						     (unsigned long) -1);
 3740
 3741		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
 3742			cache->io_ctl.inode = NULL;
 3743			ret = btrfs_write_out_cache(fs_info, trans,
 3744						    cache, path);
 3745			if (ret == 0 && cache->io_ctl.inode) {
 3746				num_started++;
 3747				should_put = 0;
 3748				list_add_tail(&cache->io_list, io);
 3749			} else {
 3750				/*
 3751				 * if we failed to write the cache, the
 3752				 * generation will be bad and life goes on
 3753				 */
 3754				ret = 0;
 3755			}
 3756		}
 3757		if (!ret) {
 3758			ret = write_one_cache_group(trans, fs_info,
 3759						    path, cache);
 3760			/*
 3761			 * One of the free space endio workers might have
 3762			 * created a new block group while updating a free space
 3763			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
 3764			 * and hasn't released its transaction handle yet, in
 3765			 * which case the new block group is still attached to
 3766			 * its transaction handle and its creation has not
 3767			 * finished yet (no block group item in the extent tree
 3768			 * yet, etc). If this is the case, wait for all free
 3769			 * space endio workers to finish and retry. This is a
 3770			 * a very rare case so no need for a more efficient and
 3771			 * complex approach.
 3772			 */
 3773			if (ret == -ENOENT) {
 3774				wait_event(cur_trans->writer_wait,
 3775				   atomic_read(&cur_trans->num_writers) == 1);
 3776				ret = write_one_cache_group(trans, fs_info,
 3777							    path, cache);
 3778			}
 3779			if (ret)
 3780				btrfs_abort_transaction(trans, ret);
 3781		}
 3782
 3783		/* if its not on the io list, we need to put the block group */
 3784		if (should_put)
 3785			btrfs_put_block_group(cache);
 3786		spin_lock(&cur_trans->dirty_bgs_lock);
 3787	}
 3788	spin_unlock(&cur_trans->dirty_bgs_lock);
 3789
 3790	/*
 3791	 * Refer to the definition of io_bgs member for details why it's safe
 3792	 * to use it without any locking
 3793	 */
 3794	while (!list_empty(io)) {
 3795		cache = list_first_entry(io, struct btrfs_block_group_cache,
 3796					 io_list);
 3797		list_del_init(&cache->io_list);
 3798		btrfs_wait_cache_io(trans, cache, path);
 3799		btrfs_put_block_group(cache);
 3800	}
 3801
 3802	btrfs_free_path(path);
 3803	return ret;
 3804}
 3805
 3806int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
 3807{
 3808	struct btrfs_block_group_cache *block_group;
 3809	int readonly = 0;
 3810
 3811	block_group = btrfs_lookup_block_group(fs_info, bytenr);
 3812	if (!block_group || block_group->ro)
 3813		readonly = 1;
 3814	if (block_group)
 3815		btrfs_put_block_group(block_group);
 3816	return readonly;
 3817}
 3818
 3819bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 3820{
 3821	struct btrfs_block_group_cache *bg;
 3822	bool ret = true;
 3823
 3824	bg = btrfs_lookup_block_group(fs_info, bytenr);
 3825	if (!bg)
 3826		return false;
 3827
 3828	spin_lock(&bg->lock);
 3829	if (bg->ro)
 3830		ret = false;
 3831	else
 3832		atomic_inc(&bg->nocow_writers);
 3833	spin_unlock(&bg->lock);
 3834
 3835	/* no put on block group, done by btrfs_dec_nocow_writers */
 3836	if (!ret)
 3837		btrfs_put_block_group(bg);
 3838
 3839	return ret;
 3840
 3841}
 3842
 3843void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 3844{
 3845	struct btrfs_block_group_cache *bg;
 3846
 3847	bg = btrfs_lookup_block_group(fs_info, bytenr);
 3848	ASSERT(bg);
 3849	if (atomic_dec_and_test(&bg->nocow_writers))
 3850		wake_up_var(&bg->nocow_writers);
 3851	/*
 3852	 * Once for our lookup and once for the lookup done by a previous call
 3853	 * to btrfs_inc_nocow_writers()
 3854	 */
 3855	btrfs_put_block_group(bg);
 3856	btrfs_put_block_group(bg);
 3857}
 3858
 3859void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
 3860{
 3861	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
 3862}
 3863
 3864static const char *alloc_name(u64 flags)
 3865{
 3866	switch (flags) {
 3867	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
 3868		return "mixed";
 3869	case BTRFS_BLOCK_GROUP_METADATA:
 3870		return "metadata";
 3871	case BTRFS_BLOCK_GROUP_DATA:
 3872		return "data";
 3873	case BTRFS_BLOCK_GROUP_SYSTEM:
 3874		return "system";
 3875	default:
 3876		WARN_ON(1);
 3877		return "invalid-combination";
 3878	};
 3879}
 3880
 3881static int create_space_info(struct btrfs_fs_info *info, u64 flags)
 3882{
 3883
 3884	struct btrfs_space_info *space_info;
 3885	int i;
 3886	int ret;
 3887
 3888	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
 3889	if (!space_info)
 3890		return -ENOMEM;
 3891
 3892	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
 3893				 GFP_KERNEL);
 3894	if (ret) {
 3895		kfree(space_info);
 3896		return ret;
 3897	}
 3898
 3899	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
 3900		INIT_LIST_HEAD(&space_info->block_groups[i]);
 3901	init_rwsem(&space_info->groups_sem);
 3902	spin_lock_init(&space_info->lock);
 3903	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
 3904	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
 3905	init_waitqueue_head(&space_info->wait);
 3906	INIT_LIST_HEAD(&space_info->ro_bgs);
 3907	INIT_LIST_HEAD(&space_info->tickets);
 3908	INIT_LIST_HEAD(&space_info->priority_tickets);
 3909
 3910	ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
 3911				    info->space_info_kobj, "%s",
 3912				    alloc_name(space_info->flags));
 3913	if (ret) {
 3914		percpu_counter_destroy(&space_info->total_bytes_pinned);
 3915		kfree(space_info);
 3916		return ret;
 3917	}
 3918
 3919	list_add_rcu(&space_info->list, &info->space_info);
 3920	if (flags & BTRFS_BLOCK_GROUP_DATA)
 3921		info->data_sinfo = space_info;
 3922
 3923	return ret;
 3924}
 3925
 3926static void update_space_info(struct btrfs_fs_info *info, u64 flags,
 3927			     u64 total_bytes, u64 bytes_used,
 3928			     u64 bytes_readonly,
 3929			     struct btrfs_space_info **space_info)
 3930{
 3931	struct btrfs_space_info *found;
 3932	int factor;
 3933
 3934	factor = btrfs_bg_type_to_factor(flags);
 3935
 3936	found = __find_space_info(info, flags);
 3937	ASSERT(found);
 3938	spin_lock(&found->lock);
 3939	found->total_bytes += total_bytes;
 3940	found->disk_total += total_bytes * factor;
 3941	found->bytes_used += bytes_used;
 3942	found->disk_used += bytes_used * factor;
 3943	found->bytes_readonly += bytes_readonly;
 3944	if (total_bytes > 0)
 3945		found->full = 0;
 3946	space_info_add_new_bytes(info, found, total_bytes -
 3947				 bytes_used - bytes_readonly);
 3948	spin_unlock(&found->lock);
 3949	*space_info = found;
 3950}
 3951
 3952static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 3953{
 3954	u64 extra_flags = chunk_to_extended(flags) &
 3955				BTRFS_EXTENDED_PROFILE_MASK;
 3956
 3957	write_seqlock(&fs_info->profiles_lock);
 3958	if (flags & BTRFS_BLOCK_GROUP_DATA)
 3959		fs_info->avail_data_alloc_bits |= extra_flags;
 3960	if (flags & BTRFS_BLOCK_GROUP_METADATA)
 3961		fs_info->avail_metadata_alloc_bits |= extra_flags;
 3962	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 3963		fs_info->avail_system_alloc_bits |= extra_flags;
 3964	write_sequnlock(&fs_info->profiles_lock);
 3965}
 3966
 3967/*
 3968 * returns target flags in extended format or 0 if restripe for this
 3969 * chunk_type is not in progress
 3970 *
 3971 * should be called with balance_lock held
 3972 */
 3973static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
 3974{
 3975	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 3976	u64 target = 0;
 3977
 3978	if (!bctl)
 3979		return 0;
 3980
 3981	if (flags & BTRFS_BLOCK_GROUP_DATA &&
 3982	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 3983		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
 3984	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
 3985		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 3986		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
 3987	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
 3988		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 3989		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
 3990	}
 3991
 3992	return target;
 3993}
 3994
 3995/*
 3996 * @flags: available profiles in extended format (see ctree.h)
 3997 *
 3998 * Returns reduced profile in chunk format.  If profile changing is in
 3999 * progress (either running or paused) picks the target profile (if it's
 4000 * already available), otherwise falls back to plain reducing.
 4001 */
 4002static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
 4003{
 4004	u64 num_devices = fs_info->fs_devices->rw_devices;
 4005	u64 target;
 4006	u64 raid_type;
 4007	u64 allowed = 0;
 4008
 4009	/*
 4010	 * see if restripe for this chunk_type is in progress, if so
 4011	 * try to reduce to the target profile
 4012	 */
 4013	spin_lock(&fs_info->balance_lock);
 4014	target = get_restripe_target(fs_info, flags);
 4015	if (target) {
 4016		/* pick target profile only if it's already available */
 4017		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
 4018			spin_unlock(&fs_info->balance_lock);
 4019			return extended_to_chunk(target);
 4020		}
 4021	}
 4022	spin_unlock(&fs_info->balance_lock);
 4023
 4024	/* First, mask out the RAID levels which aren't possible */
 4025	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
 4026		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
 4027			allowed |= btrfs_raid_array[raid_type].bg_flag;
 4028	}
 4029	allowed &= flags;
 4030
 4031	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
 4032		allowed = BTRFS_BLOCK_GROUP_RAID6;
 4033	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
 4034		allowed = BTRFS_BLOCK_GROUP_RAID5;
 4035	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
 4036		allowed = BTRFS_BLOCK_GROUP_RAID10;
 4037	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
 4038		allowed = BTRFS_BLOCK_GROUP_RAID1;
 4039	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
 4040		allowed = BTRFS_BLOCK_GROUP_RAID0;
 4041
 4042	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
 4043
 4044	return extended_to_chunk(flags | allowed);
 4045}
 4046
 4047static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
 4048{
 4049	unsigned seq;
 4050	u64 flags;
 4051
 4052	do {
 4053		flags = orig_flags;
 4054		seq = read_seqbegin(&fs_info->profiles_lock);
 4055
 4056		if (flags & BTRFS_BLOCK_GROUP_DATA)
 4057			flags |= fs_info->avail_data_alloc_bits;
 4058		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 4059			flags |= fs_info->avail_system_alloc_bits;
 4060		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
 4061			flags |= fs_info->avail_metadata_alloc_bits;
 4062	} while (read_seqretry(&fs_info->profiles_lock, seq));
 4063
 4064	return btrfs_reduce_alloc_profile(fs_info, flags);
 4065}
 4066
 4067static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
 4068{
 4069	struct btrfs_fs_info *fs_info = root->fs_info;
 4070	u64 flags;
 4071	u64 ret;
 4072
 4073	if (data)
 4074		flags = BTRFS_BLOCK_GROUP_DATA;
 4075	else if (root == fs_info->chunk_root)
 4076		flags = BTRFS_BLOCK_GROUP_SYSTEM;
 4077	else
 4078		flags = BTRFS_BLOCK_GROUP_METADATA;
 4079
 4080	ret = get_alloc_profile(fs_info, flags);
 4081	return ret;
 4082}
 4083
 4084u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
 4085{
 4086	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
 4087}
 4088
 4089u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
 4090{
 4091	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 4092}
 4093
 4094u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
 4095{
 4096	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 4097}
 4098
 4099static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
 4100				 bool may_use_included)
 4101{
 4102	ASSERT(s_info);
 4103	return s_info->bytes_used + s_info->bytes_reserved +
 4104		s_info->bytes_pinned + s_info->bytes_readonly +
 4105		(may_use_included ? s_info->bytes_may_use : 0);
 4106}
 4107
/*
 * Reserve @bytes (rounded up to the sector size) of data space for @inode by
 * adding them to the data space_info's bytes_may_use.
 *
 * When the reservation does not fit, the function tries, in this order:
 *   1. allocating a new data chunk, as long as the space_info is not marked
 *      full;
 *   2. flushing delalloc, waiting for ordered extents and committing the
 *      transaction (first retry);
 *   3. committing the transaction again, but only if enough pinned bytes are
 *      accounted or the transaction has BTRFS_TRANS_HAVE_FREE_BGS set
 *      (second retry).
 * For a free space inode no commit is attempted at all (need_commit starts
 * at 0) and the caller must already be inside a transaction.
 *
 * Returns 0 on success, -ENOSPC when the space could not be found, or the
 * error returned by joining/committing a transaction or by the chunk
 * allocation.
 */
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
	u64 used;
	int ret = 0;
	/* Number of commit-based retries still allowed. */
	int need_commit = 2;
	int have_pinned_space;

	/* make sure bytes are sectorsize aligned */
	bytes = ALIGN(bytes, fs_info->sectorsize);

	if (btrfs_is_free_space_inode(inode)) {
		need_commit = 0;
		ASSERT(current->journal_info);
	}

again:
	/* make sure we have enough space to handle the data first */
	spin_lock(&data_sinfo->lock);
	used = btrfs_space_info_used(data_sinfo, true);

	if (used + bytes > data_sinfo->total_bytes) {
		struct btrfs_trans_handle *trans;

		/*
		 * if we don't have enough free bytes in this space then we need
		 * to alloc a new chunk.
		 */
		if (!data_sinfo->full) {
			u64 alloc_target;

			/*
			 * do_chunk_alloc() raises its force argument to the
			 * space_info's force_alloc, so the CHUNK_ALLOC_NO_FORCE
			 * passed below is upgraded by this assignment.
			 */
			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
			spin_unlock(&data_sinfo->lock);

			alloc_target = btrfs_data_alloc_profile(fs_info);
			/*
			 * It is ugly that we don't call nolock join
			 * transaction for the free space inode case here.
			 * But it is safe because we only do the data space
			 * reservation for the free space cache in the
			 * transaction context, the common join transaction
			 * just increase the counter of the current transaction
			 * handler, doesn't try to acquire the trans_lock of
			 * the fs.
			 */
			trans = btrfs_join_transaction(root);
			if (IS_ERR(trans))
				return PTR_ERR(trans);

			ret = do_chunk_alloc(trans, alloc_target,
					     CHUNK_ALLOC_NO_FORCE);
			btrfs_end_transaction(trans);
			if (ret < 0) {
				if (ret != -ENOSPC)
					return ret;
				else {
					/*
					 * No chunk could be allocated: go to
					 * the commit path with a value that
					 * allows the commit to happen.
					 */
					have_pinned_space = 1;
					goto commit_trans;
				}
			}

			/* Re-check the counters after the allocation attempt. */
			goto again;
		}

		/*
		 * If we don't have enough pinned space to deal with this
		 * allocation, and no removed chunk in current transaction,
		 * don't bother committing the transaction.
		 */
		have_pinned_space = __percpu_counter_compare(
			&data_sinfo->total_bytes_pinned,
			used + bytes - data_sinfo->total_bytes,
			BTRFS_TOTAL_BYTES_PINNED_BATCH);
		spin_unlock(&data_sinfo->lock);

		/* commit the current transaction and try again */
commit_trans:
		if (need_commit) {
			need_commit--;

			/* Only the first retry flushes delalloc and waits. */
			if (need_commit > 0) {
				btrfs_start_delalloc_roots(fs_info, -1);
				btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
							 (u64)-1);
			}

			trans = btrfs_join_transaction(root);
			if (IS_ERR(trans))
				return PTR_ERR(trans);
			if (have_pinned_space >= 0 ||
			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
				     &trans->transaction->flags) ||
			    need_commit > 0) {
				ret = btrfs_commit_transaction(trans);
				if (ret)
					return ret;
				/*
				 * The cleaner kthread might still be doing iput
				 * operations. Wait for it to finish so that
				 * more space is released.
				 */
				mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
				mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
				goto again;
			} else {
				btrfs_end_transaction(trans);
			}
		}

		/* Out of options: report the failed reservation. */
		trace_btrfs_space_reservation(fs_info,
					      "space_info:enospc",
					      data_sinfo->flags, bytes, 1);
		return -ENOSPC;
	}
	/* The request fits; account it while still holding the lock. */
	data_sinfo->bytes_may_use += bytes;
	trace_btrfs_space_reservation(fs_info, "space_info",
				      data_sinfo->flags, bytes, 1);
	spin_unlock(&data_sinfo->lock);

	return 0;
}
 4231
 4232int btrfs_check_data_free_space(struct inode *inode,
 4233			struct extent_changeset **reserved, u64 start, u64 len)
 4234{
 4235	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 4236	int ret;
 4237
 4238	/* align the range */
 4239	len = round_up(start + len, fs_info->sectorsize) -
 4240	      round_down(start, fs_info->sectorsize);
 4241	start = round_down(start, fs_info->sectorsize);
 4242
 4243	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
 4244	if (ret < 0)
 4245		return ret;
 4246
 4247	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
 4248	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
 4249	if (ret < 0)
 4250		btrfs_free_reserved_data_space_noquota(inode, start, len);
 4251	else
 4252		ret = 0;
 4253	return ret;
 4254}
 4255
 4256/*
 4257 * Called if we need to clear a data reservation for this inode
 4258 * Normally in a error case.
 4259 *
 4260 * This one will *NOT* use accurate qgroup reserved space API, just for case
 4261 * which we can't sleep and is sure it won't affect qgroup reserved space.
 4262 * Like clear_bit_hook().
 4263 */
 4264void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 4265					    u64 len)
 4266{
 4267	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 4268	struct btrfs_space_info *data_sinfo;
 4269
 4270	/* Make sure the range is aligned to sectorsize */
 4271	len = round_up(start + len, fs_info->sectorsize) -
 4272	      round_down(start, fs_info->sectorsize);
 4273	start = round_down(start, fs_info->sectorsize);
 4274
 4275	data_sinfo = fs_info->data_sinfo;
 4276	spin_lock(&data_sinfo->lock);
 4277	if (WARN_ON(data_sinfo->bytes_may_use < len))
 4278		data_sinfo->bytes_may_use = 0;
 4279	else
 4280		data_sinfo->bytes_may_use -= len;
 4281	trace_btrfs_space_reservation(fs_info, "space_info",
 4282				      data_sinfo->flags, len, 0);
 4283	spin_unlock(&data_sinfo->lock);
 4284}
 4285
 4286/*
 4287 * Called if we need to clear a data reservation for this inode
 4288 * Normally in a error case.
 4289 *
 4290 * This one will handle the per-inode data rsv map for accurate reserved
 4291 * space framework.
 4292 */
 4293void btrfs_free_reserved_data_space(struct inode *inode,
 4294			struct extent_changeset *reserved, u64 start, u64 len)
 4295{
 4296	struct btrfs_root *root = BTRFS_I(inode)->root;
 4297
 4298	/* Make sure the range is aligned to sectorsize */
 4299	len = round_up(start + len, root->fs_info->sectorsize) -
 4300	      round_down(start, root->fs_info->sectorsize);
 4301	start = round_down(start, root->fs_info->sectorsize);
 4302
 4303	btrfs_free_reserved_data_space_noquota(inode, start, len);
 4304	btrfs_qgroup_free_data(inode, reserved, start, len);
 4305}
 4306
 4307static void force_metadata_allocation(struct btrfs_fs_info *info)
 4308{
 4309	struct list_head *head = &info->space_info;
 4310	struct btrfs_space_info *found;
 4311
 4312	rcu_read_lock();
 4313	list_for_each_entry_rcu(found, head, list) {
 4314		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
 4315			found->force_alloc = CHUNK_ALLOC_FORCE;
 4316	}
 4317	rcu_read_unlock();
 4318}
 4319
 4320static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
 4321{
 4322	return (global->size << 1);
 4323}
 4324
 4325static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
 4326			      struct btrfs_space_info *sinfo, int force)
 4327{
 4328	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 4329	u64 bytes_used = btrfs_space_info_used(sinfo, false);
 4330	u64 thresh;
 4331
 4332	if (force == CHUNK_ALLOC_FORCE)
 4333		return 1;
 4334
 4335	/*
 4336	 * We need to take into account the global rsv because for all intents
 4337	 * and purposes it's used space.  Don't worry about locking the
 4338	 * global_rsv, it doesn't change except when the transaction commits.
 4339	 */
 4340	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
 4341		bytes_used += calc_global_rsv_need_space(global_rsv);
 4342
 4343	/*
 4344	 * in limited mode, we want to have some free space up to
 4345	 * about 1% of the FS size.
 4346	 */
 4347	if (force == CHUNK_ALLOC_LIMITED) {
 4348		thresh = btrfs_super_total_bytes(fs_info->super_copy);
 4349		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
 4350
 4351		if (sinfo->total_bytes - bytes_used < thresh)
 4352			return 1;
 4353	}
 4354
 4355	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
 4356		return 0;
 4357	return 1;
 4358}
 4359
 4360static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
 4361{
 4362	u64 num_dev;
 4363
 4364	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
 4365		    BTRFS_BLOCK_GROUP_RAID0 |
 4366		    BTRFS_BLOCK_GROUP_RAID5 |
 4367		    BTRFS_BLOCK_GROUP_RAID6))
 4368		num_dev = fs_info->fs_devices->rw_devices;
 4369	else if (type & BTRFS_BLOCK_GROUP_RAID1)
 4370		num_dev = 2;
 4371	else
 4372		num_dev = 1;	/* DUP or single */
 4373
 4374	return num_dev;
 4375}
 4376
 4377/*
 4378 * If @is_allocation is true, reserve space in the system space info necessary
 4379 * for allocating a chunk, otherwise if it's false, reserve space necessary for
 4380 * removing a chunk.
 4381 */
 4382void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
 4383{
 4384	struct btrfs_fs_info *fs_info = trans->fs_info;
 4385	struct btrfs_space_info *info;
 4386	u64 left;
 4387	u64 thresh;
 4388	int ret = 0;
 4389	u64 num_devs;
 4390
 4391	/*
 4392	 * Needed because we can end up allocating a system chunk and for an
 4393	 * atomic and race free space reservation in the chunk block reserve.
 4394	 */
 4395	lockdep_assert_held(&fs_info->chunk_mutex);
 4396
 4397	info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 4398	spin_lock(&info->lock);
 4399	left = info->total_bytes - btrfs_space_info_used(info, true);
 4400	spin_unlock(&info->lock);
 4401
 4402	num_devs = get_profile_num_devs(fs_info, type);
 4403
 4404	/* num_devs device items to update and 1 chunk item to add or remove */
 4405	thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
 4406		btrfs_calc_trans_metadata_size(fs_info, 1);
 4407
 4408	if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
 4409		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
 4410			   left, thresh, type);
 4411		dump_space_info(fs_info, info, 0, 0);
 4412	}
 4413
 4414	if (left < thresh) {
 4415		u64 flags = btrfs_system_alloc_profile(fs_info);
 4416
 4417		/*
 4418		 * Ignore failure to create system chunk. We might end up not
 4419		 * needing it, as we might not need to COW all nodes/leafs from
 4420		 * the paths we visit in the chunk tree (they were already COWed
 4421		 * or created in the current transaction for example).
 4422		 */
 4423		ret = btrfs_alloc_chunk(trans, flags);
 4424	}
 4425
 4426	if (!ret) {
 4427		ret = btrfs_block_rsv_add(fs_info->chunk_root,
 4428					  &fs_info->chunk_block_rsv,
 4429					  thresh, BTRFS_RESERVE_NO_FLUSH);
 4430		if (!ret)
 4431			trans->chunk_bytes_reserved += thresh;
 4432	}
 4433}
 4434
/*
 * Allocate a new chunk of type @flags if the matching space_info needs one.
 *
 * @force is one of the CHUNK_ALLOC_* values; it is raised to the
 * space_info's own force_alloc when that is higher.
 *
 * If force is CHUNK_ALLOC_FORCE:
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 * If force is NOT CHUNK_ALLOC_FORCE:
 *    - return 0 if it doesn't need to allocate a new chunk,
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 *
 * -ENOSPC is also returned, without doing anything, when @trans is already
 * in the middle of a chunk allocation.
 */
static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
			  int force)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_space_info *space_info;
	bool wait_for_alloc = false;
	bool should_alloc = false;
	int ret = 0;

	/* Don't re-enter if we're already allocating a chunk */
	if (trans->allocating_chunk)
		return -ENOSPC;

	space_info = __find_space_info(fs_info, flags);
	ASSERT(space_info);

	/*
	 * Loop until we either bail out or become the task that owns the
	 * allocation for this space_info (chunk_alloc set to 1 by us).
	 */
	do {
		spin_lock(&space_info->lock);
		if (force < space_info->force_alloc)
			force = space_info->force_alloc;
		should_alloc = should_alloc_chunk(fs_info, space_info, force);
		if (space_info->full) {
			/* No more free physical space */
			if (should_alloc)
				ret = -ENOSPC;
			else
				ret = 0;
			spin_unlock(&space_info->lock);
			return ret;
		} else if (!should_alloc) {
			spin_unlock(&space_info->lock);
			return 0;
		} else if (space_info->chunk_alloc) {
			/*
			 * Someone is already allocating, so we need to block
			 * until this someone is finished and then loop to
			 * recheck if we should continue with our allocation
			 * attempt.
			 */
			wait_for_alloc = true;
			spin_unlock(&space_info->lock);
			/* Taking and dropping the mutex is the wait itself. */
			mutex_lock(&fs_info->chunk_mutex);
			mutex_unlock(&fs_info->chunk_mutex);
		} else {
			/* Proceed with allocation */
			space_info->chunk_alloc = 1;
			wait_for_alloc = false;
			spin_unlock(&space_info->lock);
		}

		cond_resched();
	} while (wait_for_alloc);

	mutex_lock(&fs_info->chunk_mutex);
	/* Guards against re-entry through this transaction handle. */
	trans->allocating_chunk = true;

	/*
	 * If we have mixed data/metadata chunks we want to make sure we keep
	 * allocating mixed chunks instead of individual chunks.
	 */
	if (btrfs_mixed_space_info(space_info))
		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);

	/*
	 * if we're doing a data chunk, go ahead and make sure that
	 * we keep a reasonable number of metadata chunks allocated in the
	 * FS as well.
	 */
	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
		fs_info->data_chunk_allocations++;
		if (!(fs_info->data_chunk_allocations %
		      fs_info->metadata_ratio))
			force_metadata_allocation(fs_info);
	}

	/*
	 * Check if we have enough space in SYSTEM chunk because we may need
	 * to update devices.
	 */
	check_system_chunk(trans, flags);

	ret = btrfs_alloc_chunk(trans, flags);
	trans->allocating_chunk = false;

	spin_lock(&space_info->lock);
	if (ret < 0) {
		/*
		 * -ENOSPC marks the space_info full; any other error skips
		 * the force_alloc reset below.
		 */
		if (ret == -ENOSPC)
			space_info->full = 1;
		else
			goto out;
	} else {
		ret = 1;
	}

	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
	space_info->chunk_alloc = 0;
	spin_unlock(&space_info->lock);
	mutex_unlock(&fs_info->chunk_mutex);
	/*
	 * When we allocate a new chunk we reserve space in the chunk block
	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
	 * add new nodes/leafs to it if we end up needing to do it when
	 * inserting the chunk item and updating device items as part of the
	 * second phase of chunk allocation, performed by
	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
	 * large number of new block groups to create in our transaction
	 * handle's new_bgs list to avoid exhausting the chunk block reserve
	 * in extreme cases - like having a single transaction create many new
	 * block groups when starting to write out the free space caches of all
	 * the block groups that were made dirty during the lifetime of the
	 * transaction.
	 */
	if (trans->can_flush_pending_bgs &&
	    trans->chunk_bytes_reserved >= (u64)SZ_2M) {
		btrfs_create_pending_block_groups(trans);
		btrfs_trans_release_chunk_metadata(trans);
	}
	return ret;
}
 4564
 4565static int can_overcommit(struct btrfs_fs_info *fs_info,
 4566			  struct btrfs_space_info *space_info, u64 bytes,
 4567			  enum btrfs_reserve_flush_enum flush,
 4568			  bool system_chunk)
 4569{
 4570	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 4571	u64 profile;
 4572	u64 space_size;
 4573	u64 avail;
 4574	u64 used;
 4575	int factor;
 4576
 4577	/* Don't overcommit when in mixed mode. */
 4578	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
 4579		return 0;
 4580
 4581	if (system_chunk)
 4582		profile = btrfs_system_alloc_profile(fs_info);
 4583	else
 4584		profile = btrfs_metadata_alloc_profile(fs_info);
 4585
 4586	used = btrfs_space_info_used(space_info, false);
 4587
 4588	/*
 4589	 * We only want to allow over committing if we have lots of actual space
 4590	 * free, but if we don't have enough space to handle the global reserve
 4591	 * space then we could end up having a real enospc problem when trying
 4592	 * to allocate a chunk or some other such important allocation.
 4593	 */
 4594	spin_lock(&global_rsv->lock);
 4595	space_size = calc_global_rsv_need_space(global_rsv);
 4596	spin_unlock(&global_rsv->lock);
 4597	if (used + space_size >= space_info->total_bytes)
 4598		return 0;
 4599
 4600	used += space_info->bytes_may_use;
 4601
 4602	avail = atomic64_read(&fs_info->free_chunk_space);
 4603
 4604	/*
 4605	 * If we have dup, raid1 or raid10 then only half of the free
 4606	 * space is actually useable.  For raid56, the space info used
 4607	 * doesn't include the parity drive, so we don't have to
 4608	 * change the math
 4609	 */
 4610	factor = btrfs_bg_type_to_factor(profile);
 4611	avail = div_u64(avail, factor);
 4612
 4613	/*
 4614	 * If we aren't flushing all things, let us overcommit up to
 4615	 * 1/2th of the space. If we can flush, don't let us overcommit
 4616	 * too much, let it overcommit up to 1/8 of the space.
 4617	 */
 4618	if (flush == BTRFS_RESERVE_FLUSH_ALL)
 4619		avail >>= 3;
 4620	else
 4621		avail >>= 1;
 4622
 4623	if (used + bytes < space_info->total_bytes + avail)
 4624		return 1;
 4625	return 0;
 4626}
 4627
 4628static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
 4629					 unsigned long nr_pages, int nr_items)
 4630{
 4631	struct super_block *sb = fs_info->sb;
 4632
 4633	if (down_read_trylock(&sb->s_umount)) {
 4634		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
 4635		up_read(&sb->s_umount);
 4636	} else {
 4637		/*
 4638		 * We needn't worry the filesystem going from r/w to r/o though
 4639		 * we don't acquire ->s_umount mutex, because the filesystem
 4640		 * should guarantee the delalloc inodes list be empty after
 4641		 * the filesystem is readonly(all dirty pages are written to
 4642		 * the disk).
 4643		 */
 4644		btrfs_start_delalloc_roots(fs_info, nr_items);
 4645		if (!current->journal_info)
 4646			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
 4647	}
 4648}
 4649
 4650static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
 4651					u64 to_reclaim)
 4652{
 4653	u64 bytes;
 4654	u64 nr;
 4655
 4656	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 4657	nr = div64_u64(to_reclaim, bytes);
 4658	if (!nr)
 4659		nr = 1;
 4660	return nr;
 4661}
 4662
/* Bytes of delalloc to flush for each metadata item we want to reclaim. */
#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * shrink metadata reservation for delalloc
 *
 * Kick writeback of delalloc pages in up to three rounds, stopping early once
 * the metadata space_info has no tickets queued.
 *
 * @to_reclaim:   metadata bytes we would like to get back; converted to an
 *                item count and then to a delalloc byte target
 * @orig:         not referenced in this function
 * @wait_ordered: when true and the caller is not inside a transaction, wait
 *                for ordered extents between rounds instead of sleeping
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 max_reclaim;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calc the number of the pages we need flush for space reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	/* Non-NULL when the current task is inside a transaction. */
	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	if (delalloc_bytes == 0) {
		/* Nothing to write back; at most wait for ordered extents. */
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	loops = 0;
	while (delalloc_bytes && loops < 3) {
		max_reclaim = min(delalloc_bytes, to_reclaim);
		nr_pages = max_reclaim >> PAGE_SHIFT;
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
		/*
		 * We need to wait for the async pages to actually start before
		 * we do anything.
		 */
		max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
		if (!max_reclaim)
			goto skip_async;

		/*
		 * From here on max_reclaim is the async page count we wait
		 * for the counter to drop to: the current count minus the
		 * pages we just asked for, floored at 0.
		 */
		if (max_reclaim <= nr_pages)
			max_reclaim = 0;
		else
			max_reclaim -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)max_reclaim);
skip_async:
		/* Nobody is waiting for space any more: we are done. */
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			/*
			 * Sleep for one jiffy; a non-zero return means the
			 * sleep ended early, in which case we stop.
			 */
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	}
}
 4739
/*
 * A pending reservation request, queued on a space_info's tickets or
 * priority_tickets list.
 */
struct reserve_ticket {
	/* Bytes this ticket is asking for. */
	u64 bytes;
	/* Set to -ENOSPC by wake_all_tickets() when the ticket is failed. */
	int error;
	/* Link in space_info->tickets or space_info->priority_tickets. */
	struct list_head list;
	/* Wait queue woken when the ticket is taken off its list. */
	wait_queue_head_t wait;
};
 4746
 4747/**
 4748 * maybe_commit_transaction - possibly commit the transaction if its ok to
 4749 * @root - the root we're allocating for
 4750 * @bytes - the number of bytes we want to reserve
 4751 * @force - force the commit
 4752 *
 4753 * This will check to make sure that committing the transaction will actually
 4754 * get us somewhere and then commit the transaction if it does.  Otherwise it
 4755 * will return -ENOSPC.
 4756 */
 4757static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 4758				  struct btrfs_space_info *space_info)
 4759{
 4760	struct reserve_ticket *ticket = NULL;
 4761	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
 4762	struct btrfs_trans_handle *trans;
 4763	u64 bytes;
 4764
 4765	trans = (struct btrfs_trans_handle *)current->journal_info;
 4766	if (trans)
 4767		return -EAGAIN;
 4768
 4769	spin_lock(&space_info->lock);
 4770	if (!list_empty(&space_info->priority_tickets))
 4771		ticket = list_first_entry(&space_info->priority_tickets,
 4772					  struct reserve_ticket, list);
 4773	else if (!list_empty(&space_info->tickets))
 4774		ticket = list_first_entry(&space_info->tickets,
 4775					  struct reserve_ticket, list);
 4776	bytes = (ticket) ? ticket->bytes : 0;
 4777	spin_unlock(&space_info->lock);
 4778
 4779	if (!bytes)
 4780		return 0;
 4781
 4782	/* See if there is enough pinned space to make this reservation */
 4783	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
 4784				   bytes,
 4785				   BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
 4786		goto commit;
 4787
 4788	/*
 4789	 * See if there is some space in the delayed insertion reservation for
 4790	 * this reservation.
 4791	 */
 4792	if (space_info != delayed_rsv->space_info)
 4793		return -ENOSPC;
 4794
 4795	spin_lock(&delayed_rsv->lock);
 4796	if (delayed_rsv->size > bytes)
 4797		bytes = 0;
 4798	else
 4799		bytes -= delayed_rsv->size;
 4800	spin_unlock(&delayed_rsv->lock);
 4801
 4802	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
 4803				   bytes,
 4804				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
 4805		return -ENOSPC;
 4806	}
 4807
 4808commit:
 4809	trans = btrfs_join_transaction(fs_info->extent_root);
 4810	if (IS_ERR(trans))
 4811		return -ENOSPC;
 4812
 4813	return btrfs_commit_transaction(trans);
 4814}
 4815
 4816/*
 4817 * Try to flush some data based on policy set by @state. This is only advisory
 4818 * and may fail for various reasons. The caller is supposed to examine the
 4819 * state of @space_info to detect the outcome.
 4820 */
 4821static void flush_space(struct btrfs_fs_info *fs_info,
 4822		       struct btrfs_space_info *space_info, u64 num_bytes,
 4823		       int state)
 4824{
 4825	struct btrfs_root *root = fs_info->extent_root;
 4826	struct btrfs_trans_handle *trans;
 4827	int nr;
 4828	int ret = 0;
 4829
 4830	switch (state) {
 4831	case FLUSH_DELAYED_ITEMS_NR:
 4832	case FLUSH_DELAYED_ITEMS:
 4833		if (state == FLUSH_DELAYED_ITEMS_NR)
 4834			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
 4835		else
 4836			nr = -1;
 4837
 4838		trans = btrfs_join_transaction(root);
 4839		if (IS_ERR(trans)) {
 4840			ret = PTR_ERR(trans);
 4841			break;
 4842		}
 4843		ret = btrfs_run_delayed_items_nr(trans, nr);
 4844		btrfs_end_transaction(trans);
 4845		break;
 4846	case FLUSH_DELALLOC:
 4847	case FLUSH_DELALLOC_WAIT:
 4848		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
 4849				state == FLUSH_DELALLOC_WAIT);
 4850		break;
 4851	case ALLOC_CHUNK:
 4852		trans = btrfs_join_transaction(root);
 4853		if (IS_ERR(trans)) {
 4854			ret = PTR_ERR(trans);
 4855			break;
 4856		}
 4857		ret = do_chunk_alloc(trans,
 4858				     btrfs_metadata_alloc_profile(fs_info),
 4859				     CHUNK_ALLOC_NO_FORCE);
 4860		btrfs_end_transaction(trans);
 4861		if (ret > 0 || ret == -ENOSPC)
 4862			ret = 0;
 4863		break;
 4864	case COMMIT_TRANS:
 4865		ret = may_commit_transaction(fs_info, space_info);
 4866		break;
 4867	default:
 4868		ret = -ENOSPC;
 4869		break;
 4870	}
 4871
 4872	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
 4873				ret);
 4874	return;
 4875}
 4876
 4877static inline u64
 4878btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
 4879				 struct btrfs_space_info *space_info,
 4880				 bool system_chunk)
 4881{
 4882	struct reserve_ticket *ticket;
 4883	u64 used;
 4884	u64 expected;
 4885	u64 to_reclaim = 0;
 4886
 4887	list_for_each_entry(ticket, &space_info->tickets, list)
 4888		to_reclaim += ticket->bytes;
 4889	list_for_each_entry(ticket, &space_info->priority_tickets, list)
 4890		to_reclaim += ticket->bytes;
 4891	if (to_reclaim)
 4892		return to_reclaim;
 4893
 4894	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
 4895	if (can_overcommit(fs_info, space_info, to_reclaim,
 4896			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
 4897		return 0;
 4898
 4899	used = btrfs_space_info_used(space_info, true);
 4900
 4901	if (can_overcommit(fs_info, space_info, SZ_1M,
 4902			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
 4903		expected = div_factor_fine(space_info->total_bytes, 95);
 4904	else
 4905		expected = div_factor_fine(space_info->total_bytes, 90);
 4906
 4907	if (used > expected)
 4908		to_reclaim = used - expected;
 4909	else
 4910		to_reclaim = 0;
 4911	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
 4912				     space_info->bytes_reserved);
 4913	return to_reclaim;
 4914}
 4915
 4916static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
 4917					struct btrfs_space_info *space_info,
 4918					u64 used, bool system_chunk)
 4919{
 4920	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
 4921
 4922	/* If we're just plain full then async reclaim just slows us down. */
 4923	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
 4924		return 0;
 4925
 4926	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
 4927					      system_chunk))
 4928		return 0;
 4929
 4930	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
 4931		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 4932}
 4933
 4934static void wake_all_tickets(struct list_head *head)
 4935{
 4936	struct reserve_ticket *ticket;
 4937
 4938	while (!list_empty(head)) {
 4939		ticket = list_first_entry(head, struct reserve_ticket, list);
 4940		list_del_init(&ticket->list);
 4941		ticket->error = -ENOSPC;
 4942		wake_up(&ticket->wait);
 4943	}
 4944}
 4945
/*
 * Work function for the async metadata reclaim worker (normal flushers).
 *
 * This context is allowed to wait for as long as it takes.  It loops through
 * the flush states, starting over from FLUSH_DELAYED_ITEMS_NR, for as long as
 * progress is being made.  Progress is detected by space_info->tickets_id
 * changing between iterations, i.e. tickets being cleared off.
 *
 * The worker stops when the normal ticket list is empty, or when it has gone
 * past COMMIT_TRANS more than twice without progress, in which case every
 * remaining ticket is failed via wake_all_tickets().  space_info->flush is
 * cleared on every exit path.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		/* Nothing to reclaim: mark the flusher idle and bail out. */
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	/* Snapshot the ticket generation so progress can be detected later. */
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			/* All normal tickets are gone, we are done. */
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		if (last_tickets_id == space_info->tickets_id) {
			/* No ticket was satisfied: escalate to the next state. */
			flush_state++;
		} else {
			/* Progress was made: restart from the cheapest state. */
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		if (flush_state > COMMIT_TRANS) {
			/*
			 * Every state has been tried.  Allow two full passes
			 * before giving up and failing the remaining tickets.
			 */
			commit_cycles++;
			if (commit_cycles > 2) {
				wake_all_tickets(&space_info->tickets);
				space_info->flush = 0;
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}
 5007
/*
 * Initialize @work so that, when queued, it runs
 * btrfs_async_reclaim_metadata_space().
 */
void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}
 5012
 5013static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 5014					    struct btrfs_space_info *space_info,
 5015					    struct reserve_ticket *ticket)
 5016{
 5017	u64 to_reclaim;
 5018	int flush_state = FLUSH_DELAYED_ITEMS_NR;
 5019
 5020	spin_lock(&space_info->lock);
 5021	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
 5022						      false);
 5023	if (!to_reclaim) {
 5024		spin_unlock(&space_info->lock);
 5025		return;
 5026	}
 5027	spin_unlock(&space_info->lock);
 5028
 5029	do {
 5030		flush_space(fs_info, space_info, to_reclaim, flush_state);
 5031		flush_state++;
 5032		spin_lock(&space_info->lock);
 5033		if (ticket->bytes == 0) {
 5034			spin_unlock(&space_info->lock);
 5035			return;
 5036		}
 5037		spin_unlock(&space_info->lock);
 5038
 5039		/*
 5040		 * Priority flushers can't wait on delalloc without
 5041		 * deadlocking.
 5042		 */
 5043		if (flush_state == FLUSH_DELALLOC ||
 5044		    flush_state == FLUSH_DELALLOC_WAIT)
 5045			flush_state = ALLOC_CHUNK;
 5046	} while (flush_state < COMMIT_TRANS);
 5047}
 5048
/*
 * Sleep until @ticket has been satisfied or failed.
 *
 * The wait is TASK_KILLABLE and ends when ticket->bytes drops to 0 or
 * ticket->error is set.  On the way out the ticket is removed from its list if
 * it is still queued, and if it was only partially filled the part that was
 * granted (@orig_bytes - ticket->bytes) is subtracted from bytes_may_use
 * again.
 *
 * Returns ticket->error (0 on success), or -EINTR if prepare_to_wait_event()
 * reported that the wait must be aborted.
 */
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
			       struct btrfs_space_info *space_info,
			       struct reserve_ticket *ticket, u64 orig_bytes)

{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			ret = -EINTR;
			break;
		}
		/* Drop the lock while sleeping; it is retaken before re-checking. */
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	/* An interrupted wait keeps -EINTR, otherwise report the ticket state. */
	if (!ret)
		ret = ticket->error;
	if (!list_empty(&ticket->list))
		list_del_init(&ticket->list);
	/* Partially filled ticket: hand back the portion that was granted. */
	if (ticket->bytes && ticket->bytes < orig_bytes) {
		u64 num_bytes = orig_bytes - ticket->bytes;
		space_info->bytes_may_use -= num_bytes;
		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, num_bytes, 0);
	}
	spin_unlock(&space_info->lock);

	return ret;
}
 5085
/**
 * __reserve_metadata_bytes - try to reserve bytes from a space info
 * @fs_info - the filesystem we're reserving for
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 * @system_chunk - passed through to can_overcommit() and
 *		   need_do_async_reclaim()
 *
 * This will reserve orig_bytes number of bytes from the given space info.  If
 * there is not enough space it will make an attempt to flush out space to make
 * room: BTRFS_RESERVE_FLUSH_ALL callers queue a ticket and wait for the async
 * reclaim worker, other flushing callers queue a priority ticket and flush in
 * their own context.  If flush is BTRFS_RESERVE_NO_FLUSH then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 *
 * Returns 0 on success, -ENOSPC if the reservation could not be made, or the
 * result of wait_reserve_ticket() for BTRFS_RESERVE_FLUSH_ALL callers.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk)
{
	struct reserve_ticket ticket;
	u64 used;
	int ret = 0;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);

	/*
	 * If we have enough space then hooray, make our reservation and carry
	 * on.  If not see if we can overcommit, and if we can, hooray carry on.
	 * If not things get more complicated.
	 */
	if (used + orig_bytes <= space_info->total_bytes) {
		space_info->bytes_may_use += orig_bytes;
		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, orig_bytes, 1);
		ret = 0;
	} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
				  system_chunk)) {
		space_info->bytes_may_use += orig_bytes;
		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, orig_bytes, 1);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		/* The reservation succeeded; account for it in our local view. */
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info,
					  used, system_chunk) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	/* Done on success, or on failure when we are not allowed to flush. */
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		return wait_reserve_ticket(fs_info, space_info, &ticket,
					   orig_bytes);

	/* Priority flusher: flush ourselves, then see what the ticket got. */
	ret = 0;
	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
	spin_lock(&space_info->lock);
	if (ticket.bytes) {
		/* Not fully satisfied: give back any partial grant and fail. */
		if (ticket.bytes < orig_bytes) {
			u64 num_bytes = orig_bytes - ticket.bytes;
			space_info->bytes_may_use -= num_bytes;
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      num_bytes, 0);

		}
		list_del_init(&ticket.list);
		ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);
	ASSERT(list_empty(&ticket.list));
	return ret;
}
 5205
 5206/**
 5207 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 5208 * @root - the root we're allocating for
 5209 * @block_rsv - the block_rsv we're allocating for
 5210 * @orig_bytes - the number of bytes we want
 5211 * @flush - whether or not we can flush to make our reservation
 5212 *
 5213 * This will reserve orgi_bytes number of bytes from the space info associated
 5214 * with the block_rsv.  If there is not enough space it will make an attempt to
 5215 * flush out space to make room.  It will do this by flushing delalloc if
 5216 * possible or committing the transaction.  If flush is 0 then no attempts to
 5217 * regain reservations will be made and this will fail if there is not enough
 5218 * space already.
 5219 */
 5220static int reserve_metadata_bytes(struct btrfs_root *root,
 5221				  struct btrfs_block_rsv *block_rsv,
 5222				  u64 orig_bytes,
 5223				  enum btrfs_reserve_flush_enum flush)
 5224{
 5225	struct btrfs_fs_info *fs_info = root->fs_info;
 5226	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 5227	int ret;
 5228	bool system_chunk = (root == fs_info->chunk_root);
 5229
 5230	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
 5231				       orig_bytes, flush, system_chunk);
 5232	if (ret == -ENOSPC &&
 5233	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
 5234		if (block_rsv != global_rsv &&
 5235		    !block_rsv_use_bytes(global_rsv, orig_bytes))
 5236			ret = 0;
 5237	}
 5238	if (ret == -ENOSPC) {
 5239		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
 5240					      block_rsv->space_info->flags,
 5241					      orig_bytes, 1);
 5242
 5243		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
 5244			dump_space_info(fs_info, block_rsv->space_info,
 5245					orig_bytes, 0);
 5246	}
 5247	return ret;
 5248}
 5249
 5250static struct btrfs_block_rsv *get_block_rsv(
 5251					const struct btrfs_trans_handle *trans,
 5252					const struct btrfs_root *root)
 5253{
 5254	struct btrfs_fs_info *fs_info = root->fs_info;
 5255	struct btrfs_block_rsv *block_rsv = NULL;
 5256
 5257	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
 5258	    (root == fs_info->csum_root && trans->adding_csums) ||
 5259	    (root == fs_info->uuid_root))
 5260		block_rsv = trans->block_rsv;
 5261
 5262	if (!block_rsv)
 5263		block_rsv = root->block_rsv;
 5264
 5265	if (!block_rsv)
 5266		block_rsv = &fs_info->empty_block_rsv;
 5267
 5268	return block_rsv;
 5269}
 5270
 5271static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
 5272			       u64 num_bytes)
 5273{
 5274	int ret = -ENOSPC;
 5275	spin_lock(&block_rsv->lock);
 5276	if (block_rsv->reserved >= num_bytes) {
 5277		block_rsv->reserved -= num_bytes;
 5278		if (block_rsv->reserved < block_rsv->size)
 5279			block_rsv->full = 0;
 5280		ret = 0;
 5281	}
 5282	spin_unlock(&block_rsv->lock);
 5283	return ret;
 5284}
 5285
 5286static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
 5287				u64 num_bytes, int update_size)
 5288{
 5289	spin_lock(&block_rsv->lock);
 5290	block_rsv->reserved += num_bytes;
 5291	if (update_size)
 5292		block_rsv->size += num_bytes;
 5293	else if (block_rsv->reserved >= block_rsv->size)
 5294		block_rsv->full = 1;
 5295	spin_unlock(&block_rsv->lock);
 5296}
 5297
 5298int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
 5299			     struct btrfs_block_rsv *dest, u64 num_bytes,
 5300			     int min_factor)
 5301{
 5302	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 5303	u64 min_bytes;
 5304
 5305	if (global_rsv->space_info != dest->space_info)
 5306		return -ENOSPC;
 5307
 5308	spin_lock(&global_rsv->lock);
 5309	min_bytes = div_factor(global_rsv->size, min_factor);
 5310	if (global_rsv->reserved < min_bytes + num_bytes) {
 5311		spin_unlock(&global_rsv->lock);
 5312		return -ENOSPC;
 5313	}
 5314	global_rsv->reserved -= num_bytes;
 5315	if (global_rsv->reserved < global_rsv->size)
 5316		global_rsv->full = 0;
 5317	spin_unlock(&global_rsv->lock);
 5318
 5319	block_rsv_add_bytes(dest, num_bytes, 1);
 5320	return 0;
 5321}
 5322
/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 *
 * The returned bytes are first offered to waiting tickets, priority tickets
 * before normal ones.  Tickets that get completely filled are removed from
 * their list and woken, a ticket that is only partially filled stays queued
 * with its remaining byte count reduced.  Whatever is left over afterwards is
 * subtracted from bytes_may_use.
 */
static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head;
	u64 used;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
	bool check_overcommit = false;

	spin_lock(&space_info->lock);
	head = &space_info->priority_tickets;

	/*
	 * If we are over our limit then we need to check and see if we can
	 * overcommit, and if we can't then we just need to free up our space
	 * and not satisfy any requests.
	 */
	used = btrfs_space_info_used(space_info, true);
	if (used - num_bytes >= space_info->total_bytes)
		check_overcommit = true;
again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		/*
		 * We use 0 bytes because this space is already reserved, so
		 * adding the ticket space would be a double count.
		 */
		if (check_overcommit &&
		    !can_overcommit(fs_info, space_info, 0, flush, false))
			break;
		if (num_bytes >= ticket->bytes) {
			/* Ticket fully satisfied: unlink it and wake the waiter. */
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			/* Partial fill: the ticket stays at the head of the list. */
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	/* Second pass over the normal tickets, checked as FLUSH_ALL. */
	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
	/* Bytes nobody wanted are really released from the accounting. */
	space_info->bytes_may_use -= num_bytes;
	trace_btrfs_space_reservation(fs_info, "space_info",
				      space_info->flags, num_bytes, 0);
	spin_unlock(&space_info->lock);
}
 5381
 5382/*
 5383 * This is for newly allocated space that isn't accounted in
 5384 * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
 5385 * we use this helper.
 5386 */
 5387static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
 5388				     struct btrfs_space_info *space_info,
 5389				     u64 num_bytes)
 5390{
 5391	struct reserve_ticket *ticket;
 5392	struct list_head *head = &space_info->priority_tickets;
 5393
 5394again:
 5395	while (!list_empty(head) && num_bytes) {
 5396		ticket = list_first_entry(head, struct reserve_ticket,
 5397					  list);
 5398		if (num_bytes >= ticket->bytes) {
 5399			trace_btrfs_space_reservation(fs_info, "space_info",
 5400						      space_info->flags,
 5401						      ticket->bytes, 1);
 5402			list_del_init(&ticket->list);
 5403			num_bytes -= ticket->bytes;
 5404			space_info->bytes_may_use += ticket->bytes;
 5405			ticket->bytes = 0;
 5406			space_info->tickets_id++;
 5407			wake_up(&ticket->wait);
 5408		} else {
 5409			trace_btrfs_space_reservation(fs_info, "space_info",
 5410						      space_info->flags,
 5411						      num_bytes, 1);
 5412			space_info->bytes_may_use += num_bytes;
 5413			ticket->bytes -= num_bytes;
 5414			num_bytes = 0;
 5415		}
 5416	}
 5417
 5418	if (num_bytes && head == &space_info->priority_tickets) {
 5419		head = &space_info->tickets;
 5420		goto again;
 5421	}
 5422}
 5423
/*
 * Shrink @block_rsv by @num_bytes and give back whatever it then holds in
 * excess of its new size.
 *
 * @num_bytes == (u64)-1 means "the whole size".  The excess (reserved minus
 * the new size) is first used to top up @dest, if one is given and it is not
 * full, and the remainder is returned to the space info through
 * space_info_add_old_bytes().
 *
 * If @qgroup_to_release_ret is non-NULL it receives the amount by which
 * qgroup_rsv_reserved exceeded qgroup_rsv_size, which is also trimmed from
 * qgroup_rsv_reserved.
 *
 * Returns the number of excess bytes taken out of @block_rsv, including the
 * part that went to @dest.
 */
static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_block_rsv *block_rsv,
				    struct btrfs_block_rsv *dest, u64 num_bytes,
				    u64 *qgroup_to_release_ret)
{
	struct btrfs_space_info *space_info = block_rsv->space_info;
	u64 qgroup_to_release = 0;
	u64 ret;

	spin_lock(&block_rsv->lock);
	if (num_bytes == (u64)-1) {
		num_bytes = block_rsv->size;
		/*
		 * NOTE(review): this value is overwritten by the qgroup
		 * if/else below before it is ever read.
		 */
		qgroup_to_release = block_rsv->qgroup_rsv_size;
	}
	block_rsv->size -= num_bytes;
	if (block_rsv->reserved >= block_rsv->size) {
		/* From here on num_bytes is the excess being released. */
		num_bytes = block_rsv->reserved - block_rsv->size;
		block_rsv->reserved = block_rsv->size;
		block_rsv->full = 1;
	} else {
		num_bytes = 0;
	}
	if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
		qgroup_to_release = block_rsv->qgroup_rsv_reserved -
				    block_rsv->qgroup_rsv_size;
		block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
	} else {
		qgroup_to_release = 0;
	}
	spin_unlock(&block_rsv->lock);

	/* The return value is the total excess, before @dest takes its share. */
	ret = num_bytes;
	if (num_bytes > 0) {
		if (dest) {
			spin_lock(&dest->lock);
			if (!dest->full) {
				u64 bytes_to_add;

				/* Give @dest only as much as it is missing. */
				bytes_to_add = dest->size - dest->reserved;
				bytes_to_add = min(num_bytes, bytes_to_add);
				dest->reserved += bytes_to_add;
				if (dest->reserved >= dest->size)
					dest->full = 1;
				num_bytes -= bytes_to_add;
			}
			spin_unlock(&dest->lock);
		}
		if (num_bytes)
			space_info_add_old_bytes(fs_info, space_info,
						 num_bytes);
	}
	if (qgroup_to_release_ret)
		*qgroup_to_release_ret = qgroup_to_release;
	return ret;
}
 5479
 5480int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
 5481			    struct btrfs_block_rsv *dst, u64 num_bytes,
 5482			    int update_size)
 5483{
 5484	int ret;
 5485
 5486	ret = block_rsv_use_bytes(src, num_bytes);
 5487	if (ret)
 5488		return ret;
 5489
 5490	block_rsv_add_bytes(dst, num_bytes, update_size);
 5491	return 0;
 5492}
 5493
/*
 * Initialize @rsv: zero the whole structure, initialize its spinlock and
 * record @type.  The space_info pointer is left NULL by the memset.
 */
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
{
	memset(rsv, 0, sizeof(*rsv));
	spin_lock_init(&rsv->lock);
	rsv->type = type;
}
 5500
 5501void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
 5502				   struct btrfs_block_rsv *rsv,
 5503				   unsigned short type)
 5504{
 5505	btrfs_init_block_rsv(rsv, type);
 5506	rsv->space_info = __find_space_info(fs_info,
 5507					    BTRFS_BLOCK_GROUP_METADATA);
 5508}
 5509
 5510struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
 5511					      unsigned short type)
 5512{
 5513	struct btrfs_block_rsv *block_rsv;
 5514
 5515	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
 5516	if (!block_rsv)
 5517		return NULL;
 5518
 5519	btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
 5520	return block_rsv;
 5521}
 5522
 5523void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
 5524			  struct btrfs_block_rsv *rsv)
 5525{
 5526	if (!rsv)
 5527		return;
 5528	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
 5529	kfree(rsv);
 5530}
 5531
 5532int btrfs_block_rsv_add(struct btrfs_root *root,
 5533			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 5534			enum btrfs_reserve_flush_enum flush)
 5535{
 5536	int ret;
 5537
 5538	if (num_bytes == 0)
 5539		return 0;
 5540
 5541	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 5542	if (!ret) {
 5543		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 5544		return 0;
 5545	}
 5546
 5547	return ret;
 5548}
 5549
 5550int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
 5551{
 5552	u64 num_bytes = 0;
 5553	int ret = -ENOSPC;
 5554
 5555	if (!block_rsv)
 5556		return 0;
 5557
 5558	spin_lock(&block_rsv->lock);
 5559	num_bytes = div_factor(block_rsv->size, min_factor);
 5560	if (block_rsv->reserved >= num_bytes)
 5561		ret = 0;
 5562	spin_unlock(&block_rsv->lock);
 5563
 5564	return ret;
 5565}
 5566
 5567int btrfs_block_rsv_refill(struct btrfs_root *root,
 5568			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
 5569			   enum btrfs_reserve_flush_enum flush)
 5570{
 5571	u64 num_bytes = 0;
 5572	int ret = -ENOSPC;
 5573
 5574	if (!block_rsv)
 5575		return 0;
 5576
 5577	spin_lock(&block_rsv->lock);
 5578	num_bytes = min_reserved;
 5579	if (block_rsv->reserved >= num_bytes)
 5580		ret = 0;
 5581	else
 5582		num_bytes -= block_rsv->reserved;
 5583	spin_unlock(&block_rsv->lock);
 5584
 5585	if (!ret)
 5586		return 0;
 5587
 5588	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 5589	if (!ret) {
 5590		block_rsv_add_bytes(block_rsv, num_bytes, 0);
 5591		return 0;
 5592	}
 5593
 5594	return ret;
 5595}
 5596
 5597/**
 5598 * btrfs_inode_rsv_refill - refill the inode block rsv.
 5599 * @inode - the inode we are refilling.
 5600 * @flush - the flusing restriction.
 5601 *
 5602 * Essentially the same as btrfs_block_rsv_refill, except it uses the
 5603 * block_rsv->size as the minimum size.  We'll either refill the missing amount
 5604 * or return if we already have enough space.  This will also handle the resreve
 5605 * tracepoint for the reserved amount.
 5606 */
 5607static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 5608				  enum btrfs_reserve_flush_enum flush)
 5609{
 5610	struct btrfs_root *root = inode->root;
 5611	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 5612	u64 num_bytes = 0;
 5613	u64 qgroup_num_bytes = 0;
 5614	int ret = -ENOSPC;
 5615
 5616	spin_lock(&block_rsv->lock);
 5617	if (block_rsv->reserved < block_rsv->size)
 5618		num_bytes = block_rsv->size - block_rsv->reserved;
 5619	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
 5620		qgroup_num_bytes = block_rsv->qgroup_rsv_size -
 5621				   block_rsv->qgroup_rsv_reserved;
 5622	spin_unlock(&block_rsv->lock);
 5623
 5624	if (num_bytes == 0)
 5625		return 0;
 5626
 5627	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
 5628	if (ret)
 5629		return ret;
 5630	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 5631	if (!ret) {
 5632		block_rsv_add_bytes(block_rsv, num_bytes, 0);
 5633		trace_btrfs_space_reservation(root->fs_info, "delalloc",
 5634					      btrfs_ino(inode), num_bytes, 1);
 5635
 5636		/* Don't forget to increase qgroup_rsv_reserved */
 5637		spin_lock(&block_rsv->lock);
 5638		block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
 5639		spin_unlock(&block_rsv->lock);
 5640	} else
 5641		btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
 5642	return ret;
 5643}
 5644
 5645/**
 5646 * btrfs_inode_rsv_release - release any excessive reservation.
 5647 * @inode - the inode we need to release from.
 5648 * @qgroup_free - free or convert qgroup meta.
 5649 *   Unlike normal operation, qgroup meta reservation needs to know if we are
 5650 *   freeing qgroup reservation or just converting it into per-trans.  Normally
 5651 *   @qgroup_free is true for error handling, and false for normal release.
 5652 *
 5653 * This is the same as btrfs_block_rsv_release, except that it handles the
 5654 * tracepoint for the reservation.
 5655 */
 5656static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
 5657{
 5658	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 5659	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 5660	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 5661	u64 released = 0;
 5662	u64 qgroup_to_release = 0;
 5663
 5664	/*
 5665	 * Since we statically set the block_rsv->size we just want to say we
 5666	 * are releasing 0 bytes, and then we'll just get the reservation over
 5667	 * the size free'd.
 5668	 */
 5669	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
 5670					   &qgroup_to_release);
 5671	if (released > 0)
 5672		trace_btrfs_space_reservation(fs_info, "delalloc",
 5673					      btrfs_ino(inode), released, 0);
 5674	if (qgroup_free)
 5675		btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
 5676	else
 5677		btrfs_qgroup_convert_reserved_meta(inode->root,
 5678						   qgroup_to_release);
 5679}
 5680
 5681void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
 5682			     struct btrfs_block_rsv *block_rsv,
 5683			     u64 num_bytes)
 5684{
 5685	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 5686
 5687	if (global_rsv == block_rsv ||
 5688	    block_rsv->space_info != global_rsv->space_info)
 5689		global_rsv = NULL;
 5690	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
 5691}
 5692
/*
 * Recompute the size of the global block reserve and adjust its reserved
 * bytes to match, keeping the space info's bytes_may_use in sync.
 *
 * Takes the space info lock and, nested inside it, the block_rsv lock.
 */
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
	struct btrfs_space_info *sinfo = block_rsv->space_info;
	u64 num_bytes;

	/*
	 * The global block rsv is based on the size of the extent tree, the
	 * checksum tree and the root tree.  If the fs is empty we want to set
	 * it to a minimal amount for safety.
	 */
	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
		btrfs_root_used(&fs_info->csum_root->root_item) +
		btrfs_root_used(&fs_info->tree_root->root_item);
	num_bytes = max_t(u64, num_bytes, SZ_16M);

	spin_lock(&sinfo->lock);
	spin_lock(&block_rsv->lock);

	/* The size is clamped to the range [16MiB, 512MiB]. */
	block_rsv->size = min_t(u64, num_bytes, SZ_512M);

	if (block_rsv->reserved < block_rsv->size) {
		/*
		 * Under-reserved: top up from whatever is still free in the
		 * space info, never taking more than the shortfall.
		 */
		num_bytes = btrfs_space_info_used(sinfo, true);
		if (sinfo->total_bytes > num_bytes) {
			num_bytes = sinfo->total_bytes - num_bytes;
			num_bytes = min(num_bytes,
					block_rsv->size - block_rsv->reserved);
			block_rsv->reserved += num_bytes;
			sinfo->bytes_may_use += num_bytes;
			trace_btrfs_space_reservation(fs_info, "space_info",
						      sinfo->flags, num_bytes,
						      1);
		}
	} else if (block_rsv->reserved > block_rsv->size) {
		/* Over-reserved: give the surplus back to the space info. */
		num_bytes = block_rsv->reserved - block_rsv->size;
		sinfo->bytes_may_use -= num_bytes;
		trace_btrfs_space_reservation(fs_info, "space_info",
				      sinfo->flags, num_bytes, 0);
		block_rsv->reserved = block_rsv->size;
	}

	if (block_rsv->reserved == block_rsv->size)
		block_rsv->full = 1;
	else
		block_rsv->full = 0;

	spin_unlock(&block_rsv->lock);
	spin_unlock(&sinfo->lock);
}
 5742
 5743static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 5744{
 5745	struct btrfs_space_info *space_info;
 5746
 5747	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 5748	fs_info->chunk_block_rsv.space_info = space_info;
 5749
 5750	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 5751	fs_info->global_block_rsv.space_info = space_info;
 5752	fs_info->trans_block_rsv.space_info = space_info;
 5753	fs_info->empty_block_rsv.space_info = space_info;
 5754	fs_info->delayed_block_rsv.space_info = space_info;
 5755
 5756	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
 5757	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
 5758	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
 5759	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
 5760	if (fs_info->quota_root)
 5761		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
 5762	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
 5763
 5764	update_global_block_rsv(fs_info);
 5765}
 5766
/*
 * Release the entire global block reserve and warn if the trans, chunk or
 * delayed reserves still have a non-zero size or non-zero reserved bytes.
 */
static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
	/* (u64)-1 releases the whole reserve; there is no dest to refill. */
	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
				(u64)-1, NULL);
	WARN_ON(fs_info->trans_block_rsv.size > 0);
	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
	WARN_ON(fs_info->chunk_block_rsv.size > 0);
	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
	WARN_ON(fs_info->delayed_block_rsv.size > 0);
	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
}
 5778
 5779
 5780/*
 5781 * To be called after all the new block groups attached to the transaction
 5782 * handle have been created (btrfs_create_pending_block_groups()).
 5783 */
 5784void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
 5785{
 5786	struct btrfs_fs_info *fs_info = trans->fs_info;
 5787
 5788	if (!trans->chunk_bytes_reserved)
 5789		return;
 5790
 5791	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
 5792
 5793	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
 5794				trans->chunk_bytes_reserved, NULL);
 5795	trans->chunk_bytes_reserved = 0;
 5796}
 5797
 5798/*
 5799 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
 5800 * root: the root of the parent directory
 5801 * rsv: block reservation
 5802 * items: the number of items that we need do reservation
 5803 * qgroup_reserved: used to return the reserved size in qgroup
 5804 *
 5805 * This function is used to reserve the space for snapshot/subvolume
 5806 * creation and deletion. Those operations are different with the
 5807 * common file/directory operations, they change two fs/file trees
 5808 * and root tree, the number of items that the qgroup reserves is
 5809 * different with the free space reservation. So we can not use
 5810 * the space reservation mechanism in start_transaction().
 5811 */
 5812int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 5813				     struct btrfs_block_rsv *rsv,
 5814				     int items,
 5815				     bool use_global_rsv)
 5816{
 5817	u64 num_bytes;
 5818	int ret;
 5819	struct btrfs_fs_info *fs_info = root->fs_info;
 5820	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 5821
 5822	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
 5823		/* One for parent inode, two for dir entries */
 5824		num_bytes = 3 * fs_info->nodesize;
 5825		ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
 5826		if (ret)
 5827			return ret;
 5828	} else {
 5829		num_bytes = 0;
 5830	}
 5831
 5832	num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
 5833	rsv->space_info = __find_space_info(fs_info,
 5834					    BTRFS_BLOCK_GROUP_METADATA);
 5835	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
 5836				  BTRFS_RESERVE_FLUSH_ALL);
 5837
 5838	if (ret == -ENOSPC && use_global_rsv)
 5839		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
 5840
 5841	if (ret && num_bytes)
 5842		btrfs_qgroup_free_meta_prealloc(root, num_bytes);
 5843
 5844	return ret;
 5845}
 5846
/*
 * Release everything held in @rsv; (u64)-1 asks btrfs_block_rsv_release() to
 * release the reserve's whole size.
 */
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
				      struct btrfs_block_rsv *rsv)
{
	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
}
 5852
 5853static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 5854						 struct btrfs_inode *inode)
 5855{
 5856	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 5857	u64 reserve_size = 0;
 5858	u64 qgroup_rsv_size = 0;
 5859	u64 csum_leaves;
 5860	unsigned outstanding_extents;
 5861
 5862	lockdep_assert_held(&inode->lock);
 5863	outstanding_extents = inode->outstanding_extents;
 5864	if (outstanding_extents)
 5865		reserve_size = btrfs_calc_trans_metadata_size(fs_info,
 5866						outstanding_extents + 1);
 5867	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
 5868						 inode->csum_bytes);
 5869	reserve_size += btrfs_calc_trans_metadata_size(fs_info,
 5870						       csum_leaves);
 5871	/*
 5872	 * For qgroup rsv, the calculation is very simple:
 5873	 * account one nodesize for each outstanding extent
 5874	 *
 5875	 * This is overestimating in most cases.
 5876	 */
 5877	qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
 5878
 5879	spin_lock(&block_rsv->lock);
 5880	block_rsv->size = reserve_size;
 5881	block_rsv->qgroup_rsv_size = qgroup_rsv_size;
 5882	spin_unlock(&block_rsv->lock);
 5883}
 5884
 5885int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 5886{
 5887	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 5888	unsigned nr_extents;
 5889	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
 5890	int ret = 0;
 5891	bool delalloc_lock = true;
 5892
 5893	/* If we are a free space inode we need to not flush since we will be in
 5894	 * the middle of a transaction commit.  We also don't need the delalloc
 5895	 * mutex since we won't race with anybody.  We need this mostly to make
 5896	 * lockdep shut its filthy mouth.
 5897	 *
 5898	 * If we have a transaction open (can happen if we call truncate_block
 5899	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
 5900	 */
 5901	if (btrfs_is_free_space_inode(inode)) {
 5902		flush = BTRFS_RESERVE_NO_FLUSH;
 5903		delalloc_lock = false;
 5904	} else {
 5905		if (current->journal_info)
 5906			flush = BTRFS_RESERVE_FLUSH_LIMIT;
 5907
 5908		if (btrfs_transaction_in_commit(fs_info))
 5909			schedule_timeout(1);
 5910	}
 5911
 5912	if (delalloc_lock)
 5913		mutex_lock(&inode->delalloc_mutex);
 5914
 5915	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
 5916
 5917	/* Add our new extents and calculate the new rsv size. */
 5918	spin_lock(&inode->lock);
 5919	nr_extents = count_max_extents(num_bytes);
 5920	btrfs_mod_outstanding_extents(inode, nr_extents);
 5921	inode->csum_bytes += num_bytes;
 5922	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 5923	spin_unlock(&inode->lock);
 5924
 5925	ret = btrfs_inode_rsv_refill(inode, flush);
 5926	if (unlikely(ret))
 5927		goto out_fail;
 5928
 5929	if (delalloc_lock)
 5930		mutex_unlock(&inode->delalloc_mutex);
 5931	return 0;
 5932
 5933out_fail:
 5934	spin_lock(&inode->lock);
 5935	nr_extents = count_max_extents(num_bytes);
 5936	btrfs_mod_outstanding_extents(inode, -nr_extents);
 5937	inode->csum_bytes -= num_bytes;
 5938	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 5939	spin_unlock(&inode->lock);
 5940
 5941	btrfs_inode_rsv_release(inode, true);
 5942	if (delalloc_lock)
 5943		mutex_unlock(&inode->delalloc_mutex);
 5944	return ret;
 5945}
 5946
 5947/**
 5948 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
 5949 * @inode: the inode to release the reservation for.
 5950 * @num_bytes: the number of bytes we are releasing.
 5951 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
 5952 *
 5953 * This will release the metadata reservation for an inode.  This can be called
 5954 * once we complete IO for a given set of bytes to release their metadata
 5955 * reservations, or on error for the same reason.
 5956 */
 5957void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
 5958				     bool qgroup_free)
 5959{
 5960	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 5961
 5962	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
 5963	spin_lock(&inode->lock);
 5964	inode->csum_bytes -= num_bytes;
 5965	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 5966	spin_unlock(&inode->lock);
 5967
 5968	if (btrfs_is_testing(fs_info))
 5969		return;
 5970
 5971	btrfs_inode_rsv_release(inode, qgroup_free);
 5972}
 5973
 5974/**
 5975 * btrfs_delalloc_release_extents - release our outstanding_extents
 5976 * @inode: the inode to balance the reservation for.
 5977 * @num_bytes: the number of bytes we originally reserved with
 5978 * @qgroup_free: do we need to free qgroup meta reservation or convert them.
 5979 *
 5980 * When we reserve space we increase outstanding_extents for the extents we may
 5981 * add.  Once we've set the range as delalloc or created our ordered extents we
 5982 * have outstanding_extents to track the real usage, so we use this to free our
 5983 * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
 5984 * with btrfs_delalloc_reserve_metadata.
 5985 */
 5986void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
 5987				    bool qgroup_free)
 5988{
 5989	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 5990	unsigned num_extents;
 5991
 5992	spin_lock(&inode->lock);
 5993	num_extents = count_max_extents(num_bytes);
 5994	btrfs_mod_outstanding_extents(inode, -num_extents);
 5995	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 5996	spin_unlock(&inode->lock);
 5997
 5998	if (btrfs_is_testing(fs_info))
 5999		return;
 6000
 6001	btrfs_inode_rsv_release(inode, qgroup_free);
 6002}
 6003
 6004/**
 6005 * btrfs_delalloc_reserve_space - reserve data and metadata space for
 6006 * delalloc
 6007 * @inode: inode we're writing to
 6008 * @start: start range we are writing to
 6009 * @len: how long the range we are writing to
 6010 * @reserved: mandatory parameter, record actually reserved qgroup ranges of
 6011 * 	      current reservation.
 6012 *
 6013 * This will do the following things
 6014 *
 6015 * o reserve space in data space info for num bytes
 6016 *   and reserve precious corresponding qgroup space
 6017 *   (Done in check_data_free_space)
 6018 *
 6019 * o reserve space for metadata space, based on the number of outstanding
 6020 *   extents and how much csums will be needed
 6021 *   also reserve metadata space in a per root over-reserve method.
 6022 * o add to the inodes->delalloc_bytes
 6023 * o add it to the fs_info's delalloc inodes list.
 6024 *   (Above 3 all done in delalloc_reserve_metadata)
 6025 *
 6026 * Return 0 for success
 6027 * Return <0 for error(-ENOSPC or -EQUOT)
 6028 */
 6029int btrfs_delalloc_reserve_space(struct inode *inode,
 6030			struct extent_changeset **reserved, u64 start, u64 len)
 6031{
 6032	int ret;
 6033
 6034	ret = btrfs_check_data_free_space(inode, reserved, start, len);
 6035	if (ret < 0)
 6036		return ret;
 6037	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
 6038	if (ret < 0)
 6039		btrfs_free_reserved_data_space(inode, *reserved, start, len);
 6040	return ret;
 6041}
 6042
 6043/**
 6044 * btrfs_delalloc_release_space - release data and metadata space for delalloc
 6045 * @inode: inode we're releasing space for
 6046 * @start: start position of the space already reserved
 6047 * @len: the len of the space already reserved
 6048 * @release_bytes: the len of the space we consumed or didn't use
 6049 *
 6050 * This function will release the metadata space that was not used and will
 6051 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
 6052 * list if there are no delalloc bytes left.
 6053 * Also it will handle the qgroup reserved space.
 6054 */
 6055void btrfs_delalloc_release_space(struct inode *inode,
 6056				  struct extent_changeset *reserved,
 6057				  u64 start, u64 len, bool qgroup_free)
 6058{
 6059	btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
 6060	btrfs_free_reserved_data_space(inode, reserved, start, len);
 6061}
 6062
/*
 * Account @num_bytes starting at @bytenr as allocated (@alloc != 0) or freed
 * (@alloc == 0), both in the super block copy's bytes_used and in every block
 * group the range touches.
 *
 * The range may span several block groups; each loop iteration handles the
 * part that falls inside one group.  On allocation the bytes move from
 * "reserved" to "used"; on free they move from "used" to "pinned" and the
 * range is marked dirty in info->pinned_extents.  Every block group touched
 * is put on the transaction's dirty_bgs list, and a group whose used count
 * drops to zero on free is handed to btrfs_mark_bg_unused().
 *
 * Returns 0 on success, or -ENOENT if no block group is found for an address
 * in the range (accounting already done up to that point is not rolled back).
 */
static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *info, u64 bytenr,
			      u64 num_bytes, int alloc)
{
	struct btrfs_block_group_cache *cache = NULL;
	u64 total = num_bytes;
	u64 old_val;
	u64 byte_in_group;
	int factor;

	/* block accounting for super block */
	spin_lock(&info->delalloc_root_lock);
	old_val = btrfs_super_bytes_used(info->super_copy);
	if (alloc)
		old_val += num_bytes;
	else
		old_val -= num_bytes;
	btrfs_set_super_bytes_used(info->super_copy, old_val);
	spin_unlock(&info->delalloc_root_lock);

	/* From here on num_bytes is reused as the per-block-group chunk size. */
	while (total) {
		cache = btrfs_lookup_block_group(info, bytenr);
		if (!cache)
			return -ENOENT;
		/* Multiplier applied to the byte count for disk_used below. */
		factor = btrfs_bg_type_to_factor(cache->flags);

		/*
		 * If this block group has free space cache written out, we
		 * need to make sure to load it if we are removing space.  This
		 * is because we need the unpinning stage to actually add the
		 * space back to the block group, otherwise we will leak space.
		 */
		if (!alloc && cache->cached == BTRFS_CACHE_NO)
			cache_block_group(cache, 1);

		byte_in_group = bytenr - cache->key.objectid;
		WARN_ON(byte_in_group > cache->key.offset);

		/* Lock order: space_info->lock first, then the block group lock. */
		spin_lock(&cache->space_info->lock);
		spin_lock(&cache->lock);

		if (btrfs_test_opt(info, SPACE_CACHE) &&
		    cache->disk_cache_state < BTRFS_DC_CLEAR)
			cache->disk_cache_state = BTRFS_DC_CLEAR;

		old_val = btrfs_block_group_used(&cache->item);
		/* Clamp to what is left of this block group past bytenr. */
		num_bytes = min(total, cache->key.offset - byte_in_group);
		if (alloc) {
			/* reserved -> used */
			old_val += num_bytes;
			btrfs_set_block_group_used(&cache->item, old_val);
			cache->reserved -= num_bytes;
			cache->space_info->bytes_reserved -= num_bytes;
			cache->space_info->bytes_used += num_bytes;
			cache->space_info->disk_used += num_bytes * factor;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);
		} else {
			/* used -> pinned */
			old_val -= num_bytes;
			btrfs_set_block_group_used(&cache->item, old_val);
			cache->pinned += num_bytes;
			cache->space_info->bytes_pinned += num_bytes;
			cache->space_info->bytes_used -= num_bytes;
			cache->space_info->disk_used -= num_bytes * factor;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);

			trace_btrfs_space_reservation(info, "pinned",
						      cache->space_info->flags,
						      num_bytes, 1);
			percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
					   num_bytes,
					   BTRFS_TOTAL_BYTES_PINNED_BATCH);
			set_extent_dirty(info->pinned_extents,
					 bytenr, bytenr + num_bytes - 1,
					 GFP_NOFS | __GFP_NOFAIL);
		}

		/*
		 * Queue the block group on the transaction's dirty list once;
		 * the list holds its own reference on the block group.
		 */
		spin_lock(&trans->transaction->dirty_bgs_lock);
		if (list_empty(&cache->dirty_list)) {
			list_add_tail(&cache->dirty_list,
				      &trans->transaction->dirty_bgs);
			trans->transaction->num_dirty_bgs++;
			btrfs_get_block_group(cache);
		}
		spin_unlock(&trans->transaction->dirty_bgs_lock);

		/*
		 * No longer have used bytes in this block group, queue it for
		 * deletion. We do this after adding the block group to the
		 * dirty list to avoid races between cleaner kthread and space
		 * cache writeout.
		 */
		if (!alloc && old_val == 0)
			btrfs_mark_bg_unused(cache);

		/* Drop the lookup reference and advance to the next chunk. */
		btrfs_put_block_group(cache);
		total -= num_bytes;
		bytenr += num_bytes;
	}
	return 0;
}
 6164
 6165static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
 6166{
 6167	struct btrfs_block_group_cache *cache;
 6168	u64 bytenr;
 6169
 6170	spin_lock(&fs_info->block_group_cache_lock);
 6171	bytenr = fs_info->first_logical_byte;
 6172	spin_unlock(&fs_info->block_group_cache_lock);
 6173
 6174	if (bytenr < (u64)-1)
 6175		return bytenr;
 6176
 6177	cache = btrfs_lookup_first_block_group(fs_info, search_start);
 6178	if (!cache)
 6179		return 0;
 6180
 6181	bytenr = cache->key.objectid;
 6182	btrfs_put_block_group(cache);
 6183
 6184	return bytenr;
 6185}
 6186
 6187static int pin_down_extent(struct btrfs_fs_info *fs_info,
 6188			   struct btrfs_block_group_cache *cache,
 6189			   u64 bytenr, u64 num_bytes, int reserved)
 6190{
 6191	spin_lock(&cache->space_info->lock);
 6192	spin_lock(&cache->lock);
 6193	cache->pinned += num_bytes;
 6194	cache->space_info->bytes_pinned += num_bytes;
 6195	if (reserved) {
 6196		cache->reserved -= num_bytes;
 6197		cache->space_info->bytes_reserved -= num_bytes;
 6198	}
 6199	spin_unlock(&cache->lock);
 6200	spin_unlock(&cache->space_info->lock);
 6201
 6202	trace_btrfs_space_reservation(fs_info, "pinned",
 6203				      cache->space_info->flags, num_bytes, 1);
 6204	percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
 6205		    num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
 6206	set_extent_dirty(fs_info->pinned_extents, bytenr,
 6207			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
 6208	return 0;
 6209}
 6210
 6211/*
 6212 * this function must be called within transaction
 6213 */
 6214int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
 6215		     u64 bytenr, u64 num_bytes, int reserved)
 6216{
 6217	struct btrfs_block_group_cache *cache;
 6218
 6219	cache = btrfs_lookup_block_group(fs_info, bytenr);
 6220	BUG_ON(!cache); /* Logic error */
 6221
 6222	pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
 6223
 6224	btrfs_put_block_group(cache);
 6225	return 0;
 6226}
 6227
 6228/*
 6229 * this function must be called within transaction
 6230 */
 6231int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
 6232				    u64 bytenr, u64 num_bytes)
 6233{
 6234	struct btrfs_block_group_cache *cache;
 6235	int ret;
 6236
 6237	cache = btrfs_lookup_block_group(fs_info, bytenr);
 6238	if (!cache)
 6239		return -EINVAL;
 6240
 6241	/*
 6242	 * pull in the free space cache (if any) so that our pin
 6243	 * removes the free space from the cache.  We have load_only set
 6244	 * to one because the slow code to read in the free extents does check
 6245	 * the pinned extents.
 6246	 */
 6247	cache_block_group(cache, 1);
 6248
 6249	pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
 6250
 6251	/* remove us from the free space cache (if we're there at all) */
 6252	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
 6253	btrfs_put_block_group(cache);
 6254	return ret;
 6255}
 6256
 6257static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
 6258				   u64 start, u64 num_bytes)
 6259{
 6260	int ret;
 6261	struct btrfs_block_group_cache *block_group;
 6262	struct btrfs_caching_control *caching_ctl;
 6263
 6264	block_group = btrfs_lookup_block_group(fs_info, start);
 6265	if (!block_group)
 6266		return -EINVAL;
 6267
 6268	cache_block_group(block_group, 0);
 6269	caching_ctl = get_caching_control(block_group);
 6270
 6271	if (!caching_ctl) {
 6272		/* Logic error */
 6273		BUG_ON(!block_group_cache_done(block_group));
 6274		ret = btrfs_remove_free_space(block_group, start, num_bytes);
 6275	} else {
 6276		mutex_lock(&caching_ctl->mutex);
 6277
 6278		if (start >= caching_ctl->progress) {
 6279			ret = add_excluded_extent(fs_info, start, num_bytes);
 6280		} else if (start + num_bytes <= caching_ctl->progress) {
 6281			ret = btrfs_remove_free_space(block_group,
 6282						      start, num_bytes);
 6283		} else {
 6284			num_bytes = caching_ctl->progress - start;
 6285			ret = btrfs_remove_free_space(block_group,
 6286						      start, num_bytes);
 6287			if (ret)
 6288				goto out_lock;
 6289
 6290			num_bytes = (start + num_bytes) -
 6291				caching_ctl->progress;
 6292			start = caching_ctl->progress;
 6293			ret = add_excluded_extent(fs_info, start, num_bytes);
 6294		}
 6295out_lock:
 6296		mutex_unlock(&caching_ctl->mutex);
 6297		put_caching_control(caching_ctl);
 6298	}
 6299	btrfs_put_block_group(block_group);
 6300	return ret;
 6301}
 6302
 6303int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
 6304				 struct extent_buffer *eb)
 6305{
 6306	struct btrfs_file_extent_item *item;
 6307	struct btrfs_key key;
 6308	int found_type;
 6309	int i;
 6310	int ret = 0;
 6311
 6312	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
 6313		return 0;
 6314
 6315	for (i = 0; i < btrfs_header_nritems(eb); i++) {
 6316		btrfs_item_key_to_cpu(eb, &key, i);
 6317		if (key.type != BTRFS_EXTENT_DATA_KEY)
 6318			continue;
 6319		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
 6320		found_type = btrfs_file_extent_type(eb, item);
 6321		if (found_type == BTRFS_FILE_EXTENT_INLINE)
 6322			continue;
 6323		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 6324			continue;
 6325		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 6326		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 6327		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
 6328		if (ret)
 6329			break;
 6330	}
 6331
 6332	return ret;
 6333}
 6334
/*
 * Atomically increment the reservations counter of block group @bg.  The
 * counter is decremented again by btrfs_dec_block_group_reservations().
 */
static void
btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
{
	atomic_inc(&bg->reservations);
}
 6340
 6341void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 6342					const u64 start)
 6343{
 6344	struct btrfs_block_group_cache *bg;
 6345
 6346	bg = btrfs_lookup_block_group(fs_info, start);
 6347	ASSERT(bg);
 6348	if (atomic_dec_and_test(&bg->reservations))
 6349		wake_up_var(&bg->reservations);
 6350	btrfs_put_block_group(bg);
 6351}
 6352
/*
 * Wait until the reservations counter of @bg has dropped to zero.
 *
 * @bg is expected to be read only already (ASSERT).  Block groups without the
 * BTRFS_BLOCK_GROUP_DATA flag return immediately.
 */
void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
{
	struct btrfs_space_info *space_info = bg->space_info;

	ASSERT(bg->ro);

	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
		return;

	/*
	 * Our block group is read only but before we set it to read only,
	 * some task might have had allocated an extent from it already, but it
	 * has not yet created a respective ordered extent (and added it to a
	 * root's list of ordered extents).
	 * Therefore wait for any task currently allocating extents, since the
	 * block group's reservations counter is incremented while a read lock
	 * on the groups' semaphore is held and decremented after releasing
	 * the read access on that semaphore and creating the ordered extent.
	 */
	/*
	 * Acquire-and-release for writing: nothing is protected here, the
	 * pair only serves to wait for tasks that currently hold the
	 * semaphore for reading before the counter is sampled below.
	 */
	down_write(&space_info->groups_sem);
	up_write(&space_info->groups_sem);

	/* Sleep until the counter reads zero. */
	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
}
 6377
 6378/**
 6379 * btrfs_add_reserved_bytes - update the block_group and space info counters
 6380 * @cache:	The cache we are manipulating
 6381 * @ram_bytes:  The number of bytes of file content, and will be same to
 6382 *              @num_bytes except for the compress path.
 6383 * @num_bytes:	The number of bytes in question
 6384 * @delalloc:   The blocks are allocated for the delalloc write
 6385 *
 6386 * This is called by the allocator when it reserves space. If this is a
 6387 * reservation and the block group has become read only we cannot make the
 6388 * reservation and return -EAGAIN, otherwise this function always succeeds.
 6389 */
 6390static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
 6391				    u64 ram_bytes, u64 num_bytes, int delalloc)
 6392{
 6393	struct btrfs_space_info *space_info = cache->space_info;
 6394	int ret = 0;
 6395
 6396	spin_lock(&space_info->lock);
 6397	spin_lock(&cache->lock);
 6398	if (cache->ro) {
 6399		ret = -EAGAIN;
 6400	} else {
 6401		cache->reserved += num_bytes;
 6402		space_info->bytes_reserved += num_bytes;
 6403
 6404		trace_btrfs_space_reservation(cache->fs_info,
 6405				"space_info", space_info->flags,
 6406				ram_bytes, 0);
 6407		space_info->bytes_may_use -= ram_bytes;
 6408		if (delalloc)
 6409			cache->delalloc_bytes += num_bytes;
 6410	}
 6411	spin_unlock(&cache->lock);
 6412	spin_unlock(&space_info->lock);
 6413	return ret;
 6414}
 6415
 6416/**
 6417 * btrfs_free_reserved_bytes - update the block_group and space info counters
 6418 * @cache:      The cache we are manipulating
 6419 * @num_bytes:  The number of bytes in question
 6420 * @delalloc:   The blocks are allocated for the delalloc write
 6421 *
 6422 * This is called by somebody who is freeing space that was never actually used
 6423 * on disk.  For example if you reserve some space for a new leaf in transaction
 6424 * A and before transaction A commits you free that leaf, you call this with
 6425 * reserve set to 0 in order to clear the reservation.
 6426 */
 6427
 6428static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
 6429				     u64 num_bytes, int delalloc)
 6430{
 6431	struct btrfs_space_info *space_info = cache->space_info;
 6432	int ret = 0;
 6433
 6434	spin_lock(&space_info->lock);
 6435	spin_lock(&cache->lock);
 6436	if (cache->ro)
 6437		space_info->bytes_readonly += num_bytes;
 6438	cache->reserved -= num_bytes;
 6439	space_info->bytes_reserved -= num_bytes;
 6440
 6441	if (delalloc)
 6442		cache->delalloc_bytes -= num_bytes;
 6443	spin_unlock(&cache->lock);
 6444	spin_unlock(&space_info->lock);
 6445	return ret;
 6446}
 6447void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
 6448{
 6449	struct btrfs_caching_control *next;
 6450	struct btrfs_caching_control *caching_ctl;
 6451	struct btrfs_block_group_cache *cache;
 6452
 6453	down_write(&fs_info->commit_root_sem);
 6454
 6455	list_for_each_entry_safe(caching_ctl, next,
 6456				 &fs_info->caching_block_groups, list) {
 6457		cache = caching_ctl->block_group;
 6458		if (block_group_cache_done(cache)) {
 6459			cache->last_byte_to_unpin = (u64)-1;
 6460			list_del_init(&caching_ctl->list);
 6461			put_caching_control(caching_ctl);
 6462		} else {
 6463			cache->last_byte_to_unpin = caching_ctl->progress;
 6464		}
 6465	}
 6466
 6467	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
 6468		fs_info->pinned_extents = &fs_info->freed_extents[1];
 6469	else
 6470		fs_info->pinned_extents = &fs_info->freed_extents[0];
 6471
 6472	up_write(&fs_info->commit_root_sem);
 6473
 6474	update_global_block_rsv(fs_info);
 6475}
 6476
 6477/*
 6478 * Returns the free cluster for the given space info and sets empty_cluster to
 6479 * what it should be based on the mount options.
 6480 */
 6481static struct btrfs_free_cluster *
 6482fetch_cluster_info(struct btrfs_fs_info *fs_info,
 6483		   struct btrfs_space_info *space_info, u64 *empty_cluster)
 6484{
 6485	struct btrfs_free_cluster *ret = NULL;
 6486
 6487	*empty_cluster = 0;
 6488	if (btrfs_mixed_space_info(space_info))
 6489		return ret;
 6490
 6491	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
 6492		ret = &fs_info->meta_alloc_cluster;
 6493		if (btrfs_test_opt(fs_info, SSD))
 6494			*empty_cluster = SZ_2M;
 6495		else
 6496			*empty_cluster = SZ_64K;
 6497	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
 6498		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
 6499		*empty_cluster = SZ_2M;
 6500		ret = &fs_info->data_alloc_cluster;
 6501	}
 6502
 6503	return ret;
 6504}
 6505
/*
 * Drop the pinned accounting for the inclusive byte range [@start, @end],
 * walking every block group the range covers.
 *
 * For each piece of the range the pinned counters of the block group and its
 * space_info are reduced.  When @return_free_space is true, the part of the
 * piece below the block group's last_byte_to_unpin is added back to its free
 * space, and (for block groups that are not read only and share the global
 * reserve's space_info) the unpinned bytes first refill the global block
 * reserve, with any remainder passed to space_info_add_new_bytes().
 *
 * A missing block group is treated as a logic error (BUG_ON).  Always
 * returns 0.
 */
static int unpin_extent_range(struct btrfs_fs_info *fs_info,
			      u64 start, u64 end,
			      const bool return_free_space)
{
	struct btrfs_block_group_cache *cache = NULL;
	struct btrfs_space_info *space_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	struct btrfs_free_cluster *cluster = NULL;
	u64 len;
	u64 total_unpinned = 0;
	u64 empty_cluster = 0;
	bool readonly;

	while (start <= end) {
		readonly = false;
		/*
		 * First iteration, or start has moved past the current block
		 * group: switch to the block group containing start and
		 * restart the per-group unpinned total.
		 */
		if (!cache ||
		    start >= cache->key.objectid + cache->key.offset) {
			if (cache)
				btrfs_put_block_group(cache);
			total_unpinned = 0;
			cache = btrfs_lookup_block_group(fs_info, start);
			BUG_ON(!cache); /* Logic error */

			cluster = fetch_cluster_info(fs_info,
						     cache->space_info,
						     &empty_cluster);
			/* The threshold used below is twice the empty cluster size. */
			empty_cluster <<= 1;
		}

		/* Bytes left in this block group, capped by the end of the range. */
		len = cache->key.objectid + cache->key.offset - start;
		len = min(len, end + 1 - start);

		/*
		 * Below last_byte_to_unpin: clamp this step to that boundary
		 * and, if requested, hand the bytes back as free space.
		 */
		if (start < cache->last_byte_to_unpin) {
			len = min(len, cache->last_byte_to_unpin - start);
			if (return_free_space)
				btrfs_add_free_space(cache, start, len);
		}

		start += len;
		total_unpinned += len;
		space_info = cache->space_info;

		/*
		 * If this space cluster has been marked as fragmented and we've
		 * unpinned enough in this block group to potentially allow a
		 * cluster to be created inside of it go ahead and clear the
		 * fragmented check.
		 */
		if (cluster && cluster->fragmented &&
		    total_unpinned > empty_cluster) {
			spin_lock(&cluster->lock);
			cluster->fragmented = 0;
			spin_unlock(&cluster->lock);
		}

		/* Lock order: space_info->lock first, then the block group lock. */
		spin_lock(&space_info->lock);
		spin_lock(&cache->lock);
		cache->pinned -= len;
		space_info->bytes_pinned -= len;

		trace_btrfs_space_reservation(fs_info, "pinned",
					      space_info->flags, len, 0);
		space_info->max_extent_size = 0;
		percpu_counter_add_batch(&space_info->total_bytes_pinned,
			    -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
		/* Bytes unpinned in a read-only block group count as readonly. */
		if (cache->ro) {
			space_info->bytes_readonly += len;
			readonly = true;
		}
		spin_unlock(&cache->lock);
		/*
		 * Still holding space_info->lock.  From here on len is the
		 * number of bytes not yet handed to anybody.
		 */
		if (!readonly && return_free_space &&
		    global_rsv->space_info == space_info) {
			u64 to_add = len;

			spin_lock(&global_rsv->lock);
			/* Top up the global reserve first, up to its size. */
			if (!global_rsv->full) {
				to_add = min(len, global_rsv->size -
					     global_rsv->reserved);
				global_rsv->reserved += to_add;
				space_info->bytes_may_use += to_add;
				if (global_rsv->reserved >= global_rsv->size)
					global_rsv->full = 1;
				trace_btrfs_space_reservation(fs_info,
							      "space_info",
							      space_info->flags,
							      to_add, 1);
				len -= to_add;
			}
			spin_unlock(&global_rsv->lock);
			/* Add to any tickets we may have */
			if (len)
				space_info_add_new_bytes(fs_info, space_info,
							 len);
		}
		spin_unlock(&space_info->lock);
	}

	/* Drop the reference on the last block group we looked up. */
	if (cache)
		btrfs_put_block_group(cache);
	return 0;
}
 6607
/*
 * Finish extent accounting after a transaction commit.
 *
 * First, unless the transaction was aborted, every EXTENT_DIRTY range in the
 * freed_extents tree that fs_info->pinned_extents is NOT currently pointing
 * at is (optionally, with the DISCARD mount option) discarded, cleared and
 * unpinned with its space returned.
 *
 * Then every block group on the transaction's deleted_bgs list is discarded
 * (skipped if the transaction was aborted), taken off the list and released.
 *
 * Always returns 0: discard failures in the first loop are ignored, and
 * failures for deleted block groups are only logged as warnings.
 */
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group_cache *block_group, *tmp;
	struct list_head *deleted_bgs;
	struct extent_io_tree *unpin;
	u64 start;
	u64 end;
	int ret;

	/* Work on the tree that is not the current pinned_extents target. */
	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
		unpin = &fs_info->freed_extents[1];
	else
		unpin = &fs_info->freed_extents[0];

	while (!trans->aborted) {
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		/* A non-zero return ends the loop: no dirty range was found. */
		ret = find_first_extent_bit(unpin, 0, &start, &end,
					    EXTENT_DIRTY, NULL);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			break;
		}

		/* The discard result is not checked here. */
		if (btrfs_test_opt(fs_info, DISCARD))
			ret = btrfs_discard_extent(fs_info, start,
						   end + 1 - start, NULL);

		clear_extent_dirty(unpin, start, end);
		unpin_extent_range(fs_info, start, end, true);
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
		cond_resched();
	}

	/*
	 * Transaction is finished.  We don't need the lock anymore.  We
	 * do need to clean up the block groups in case of a transaction
	 * abort.
	 */
	deleted_bgs = &trans->transaction->deleted_bgs;
	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
		u64 trimmed = 0;

		/* Reported as the error if the aborted transaction skips discard. */
		ret = -EROFS;
		if (!trans->aborted)
			ret = btrfs_discard_extent(fs_info,
						   block_group->key.objectid,
						   block_group->key.offset,
						   &trimmed);

		list_del_init(&block_group->bg_list);
		btrfs_put_block_group_trimming(block_group);
		btrfs_put_block_group(block_group);

		if (ret) {
			const char *errstr = btrfs_decode_error(ret);
			btrfs_warn(fs_info,
			   "discard failed while removing blockgroup: errno=%d %s",
				   ret, errstr);
		}
	}

	return 0;
}
 6672
 6673static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 6674			       struct btrfs_delayed_ref_node *node, u64 parent,
 6675			       u64 root_objectid, u64 owner_objectid,
 6676			       u64 owner_offset, int refs_to_drop,
 6677			       struct btrfs_delayed_extent_op *extent_op)
 6678{
 6679	struct btrfs_fs_info *info = trans->fs_info;
 6680	struct btrfs_key key;
 6681	struct btrfs_path *path;
 6682	struct btrfs_root *extent_root = info->extent_root;
 6683	struct extent_buffer *leaf;
 6684	struct btrfs_extent_item *ei;
 6685	struct btrfs_extent_inline_ref *iref;
 6686	int ret;
 6687	int is_data;
 6688	int extent_slot = 0;
 6689	int found_extent = 0;
 6690	int num_to_del = 1;
 6691	u32 item_size;
 6692	u64 refs;
 6693	u64 bytenr = node->bytenr;
 6694	u64 num_bytes = node->num_bytes;
 6695	int last_ref = 0;
 6696	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
 6697
 6698	path = btrfs_alloc_path();
 6699	if (!path)
 6700		return -ENOMEM;
 6701
 6702	path->reada = READA_FORWARD;
 6703	path->leave_spinning = 1;
 6704
 6705	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
 6706	BUG_ON(!is_data && refs_to_drop != 1);
 6707
 6708	if (is_data)
 6709		skinny_metadata = false;
 6710
 6711	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
 6712				    parent, root_objectid, owner_objectid,
 6713				    owner_offset);
 6714	if (ret == 0) {
 6715		extent_slot = path->slots[0];
 6716		while (extent_slot >= 0) {
 6717			btrfs_item_key_to_cpu(path->nodes[0], &key,
 6718					      extent_slot);
 6719			if (key.objectid != bytenr)
 6720				break;
 6721			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
 6722			    key.offset == num_bytes) {
 6723				found_extent = 1;
 6724				break;
 6725			}
 6726			if (key.type == BTRFS_METADATA_ITEM_KEY &&
 6727			    key.offset == owner_objectid) {
 6728				found_extent = 1;
 6729				break;
 6730			}
 6731			if (path->slots[0] - extent_slot > 5)
 6732				break;
 6733			extent_slot--;
 6734		}
 6735
 6736		if (!found_extent) {
 6737			BUG_ON(iref);
 6738			ret = remove_extent_backref(trans, path, NULL,
 6739						    refs_to_drop,
 6740						    is_data, &last_ref);
 6741			if (ret) {
 6742				btrfs_abort_transaction(trans, ret);
 6743				goto out;
 6744			}
 6745			btrfs_release_path(path);
 6746			path->leave_spinning = 1;
 6747
 6748			key.objectid = bytenr;
 6749			key.type = BTRFS_EXTENT_ITEM_KEY;
 6750			key.offset = num_bytes;
 6751
 6752			if (!is_data && skinny_metadata) {
 6753				key.type = BTRFS_METADATA_ITEM_KEY;
 6754				key.offset = owner_objectid;
 6755			}
 6756
 6757			ret = btrfs_search_slot(trans, extent_root,
 6758						&key, path, -1, 1);
 6759			if (ret > 0 && skinny_metadata && path->slots[0]) {
 6760				/*
 6761				 * Couldn't find our skinny metadata item,
 6762				 * see if we have ye olde extent item.
 6763				 */
 6764				path->slots[0]--;
 6765				btrfs_item_key_to_cpu(path->nodes[0], &key,
 6766						      path->slots[0]);
 6767				if (key.objectid == bytenr &&
 6768				    key.type == BTRFS_EXTENT_ITEM_KEY &&
 6769				    key.offset == num_bytes)
 6770					ret = 0;
 6771			}
 6772
 6773			if (ret > 0 && skinny_metadata) {
 6774				skinny_metadata = false;
 6775				key.objectid = bytenr;
 6776				key.type = BTRFS_EXTENT_ITEM_KEY;
 6777				key.offset = num_bytes;
 6778				btrfs_release_path(path);
 6779				ret = btrfs_search_slot(trans, extent_root,
 6780							&key, path, -1, 1);
 6781			}
 6782
 6783			if (ret) {
 6784				btrfs_err(info,
 6785					  "umm, got %d back from search, was looking for %llu",
 6786					  ret, bytenr);
 6787				if (ret > 0)
 6788					btrfs_print_leaf(path->nodes[0]);
 6789			}
 6790			if (ret < 0) {
 6791				btrfs_abort_transaction(trans, ret);
 6792				goto out;
 6793			}
 6794			extent_slot = path->slots[0];
 6795		}
 6796	} else if (WARN_ON(ret == -ENOENT)) {
 6797		btrfs_print_leaf(path->nodes[0]);
 6798		btrfs_err(info,
 6799			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
 6800			bytenr, parent, root_objectid, owner_objectid,
 6801			owner_offset);
 6802		btrfs_abort_transaction(trans, ret);
 6803		goto out;
 6804	} else {
 6805		btrfs_abort_transaction(trans, ret);
 6806		goto out;
 6807	}
 6808
 6809	leaf = path->nodes[0];
 6810	item_size = btrfs_item_size_nr(leaf, extent_slot);
 6811	if (unlikely(item_size < sizeof(*ei))) {
 6812		ret = -EINVAL;
 6813		btrfs_print_v0_err(info);
 6814		btrfs_abort_transaction(trans, ret);
 6815		goto out;
 6816	}
 6817	ei = btrfs_item_ptr(leaf, extent_slot,
 6818			    struct btrfs_extent_item);
 6819	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
 6820	    key.type == BTRFS_EXTENT_ITEM_KEY) {
 6821		struct btrfs_tree_block_info *bi;
 6822		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
 6823		bi = (struct btrfs_tree_block_info *)(ei + 1);
 6824		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
 6825	}
 6826
 6827	refs = btrfs_extent_refs(leaf, ei);
 6828	if (refs < refs_to_drop) {
 6829		btrfs_err(info,
 6830			  "trying to drop %d refs but we only have %Lu for bytenr %Lu",
 6831			  refs_to_drop, refs, bytenr);
 6832		ret = -EINVAL;
 6833		btrfs_abort_transaction(trans, ret);
 6834		goto out;
 6835	}
 6836	refs -= refs_to_drop;
 6837
 6838	if (refs > 0) {
 6839		if (extent_op)
 6840			__run_delayed_extent_op(extent_op, leaf, ei);
 6841		/*
 6842		 * In the case of inline back ref, reference count will
 6843		 * be updated by remove_extent_backref
 6844		 */
 6845		if (iref) {
 6846			BUG_ON(!found_extent);
 6847		} else {
 6848			btrfs_set_extent_refs(leaf, ei, refs);
 6849			btrfs_mark_buffer_dirty(leaf);
 6850		}
 6851		if (found_extent) {
 6852			ret = remove_extent_backref(trans, path, iref,
 6853						    refs_to_drop, is_data,
 6854						    &last_ref);
 6855			if (ret) {
 6856				btrfs_abort_transaction(trans, ret);
 6857				goto out;
 6858			}
 6859		}
 6860	} else {
 6861		if (found_extent) {
 6862			BUG_ON(is_data && refs_to_drop !=
 6863			       extent_data_ref_count(path, iref));
 6864			if (iref) {
 6865				BUG_ON(path->slots[0] != extent_slot);
 6866			} else {
 6867				BUG_ON(path->slots[0] != extent_slot + 1);
 6868				path->slots[0] = extent_slot;
 6869				num_to_del = 2;
 6870			}
 6871		}
 6872
 6873		last_ref = 1;
 6874		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 6875				      num_to_del);
 6876		if (ret) {
 6877			btrfs_abort_transaction(trans, ret);
 6878			goto out;
 6879		}
 6880		btrfs_release_path(path);
 6881
 6882		if (is_data) {
 6883			ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
 6884			if (ret) {
 6885				btrfs_abort_transaction(trans, ret);
 6886				goto out;
 6887			}
 6888		}
 6889
 6890		ret = add_to_free_space_tree(trans, bytenr, num_bytes);
 6891		if (ret) {
 6892			btrfs_abort_transaction(trans, ret);
 6893			goto out;
 6894		}
 6895
 6896		ret = update_block_group(trans, info, bytenr, num_bytes, 0);
 6897		if (ret) {
 6898			btrfs_abort_transaction(trans, ret);
 6899			goto out;
 6900		}
 6901	}
 6902	btrfs_release_path(path);
 6903
 6904out:
 6905	btrfs_free_path(path);
 6906	return ret;
 6907}
 6908
 6909/*
 6910 * when we free an block, it is possible (and likely) that we free the last
 6911 * delayed ref for that extent as well.  This searches the delayed ref tree for
 6912 * a given extent, and if there are no other delayed refs to be processed, it
 6913 * removes it from the tree.
 6914 */
 6915static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 6916				      u64 bytenr)
 6917{
 6918	struct btrfs_delayed_ref_head *head;
 6919	struct btrfs_delayed_ref_root *delayed_refs;
 6920	int ret = 0;
 6921
 6922	delayed_refs = &trans->transaction->delayed_refs;
 6923	spin_lock(&delayed_refs->lock);
 6924	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
 6925	if (!head)
 6926		goto out_delayed_unlock;
 6927
 6928	spin_lock(&head->lock);
 6929	if (!RB_EMPTY_ROOT(&head->ref_tree))
 6930		goto out;
 6931
 6932	if (head->extent_op) {
 6933		if (!head->must_insert_reserved)
 6934			goto out;
 6935		btrfs_free_delayed_extent_op(head->extent_op);
 6936		head->extent_op = NULL;
 6937	}
 6938
 6939	/*
 6940	 * waiting for the lock here would deadlock.  If someone else has it
 6941	 * locked they are already in the process of dropping it anyway
 6942	 */
 6943	if (!mutex_trylock(&head->mutex))
 6944		goto out;
 6945
 6946	/*
 6947	 * at this point we have a head with no other entries.  Go
 6948	 * ahead and process it.
 6949	 */
 6950	rb_erase(&head->href_node, &delayed_refs->href_root);
 6951	RB_CLEAR_NODE(&head->href_node);
 6952	atomic_dec(&delayed_refs->num_entries);
 6953
 6954	/*
 6955	 * we don't take a ref on the node because we're removing it from the
 6956	 * tree, so we just steal the ref the tree was holding.
 6957	 */
 6958	delayed_refs->num_heads--;
 6959	if (head->processing == 0)
 6960		delayed_refs->num_heads_ready--;
 6961	head->processing = 0;
 6962	spin_unlock(&head->lock);
 6963	spin_unlock(&delayed_refs->lock);
 6964
 6965	BUG_ON(head->extent_op);
 6966	if (head->must_insert_reserved)
 6967		ret = 1;
 6968
 6969	mutex_unlock(&head->mutex);
 6970	btrfs_put_delayed_ref_head(head);
 6971	return ret;
 6972out:
 6973	spin_unlock(&head->lock);
 6974
 6975out_delayed_unlock:
 6976	spin_unlock(&delayed_refs->lock);
 6977	return 0;
 6978}
 6979
 6980void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 6981			   struct btrfs_root *root,
 6982			   struct extent_buffer *buf,
 6983			   u64 parent, int last_ref)
 6984{
 6985	struct btrfs_fs_info *fs_info = root->fs_info;
 6986	int pin = 1;
 6987	int ret;
 6988
 6989	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 6990		int old_ref_mod, new_ref_mod;
 6991
 6992		btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
 6993				   root->root_key.objectid,
 6994				   btrfs_header_level(buf), 0,
 6995				   BTRFS_DROP_DELAYED_REF);
 6996		ret = btrfs_add_delayed_tree_ref(trans, buf->start,
 6997						 buf->len, parent,
 6998						 root->root_key.objectid,
 6999						 btrfs_header_level(buf),
 7000						 BTRFS_DROP_DELAYED_REF, NULL,
 7001						 &old_ref_mod, &new_ref_mod);
 7002		BUG_ON(ret); /* -ENOMEM */
 7003		pin = old_ref_mod >= 0 && new_ref_mod < 0;
 7004	}
 7005
 7006	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
 7007		struct btrfs_block_group_cache *cache;
 7008
 7009		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 7010			ret = check_ref_cleanup(trans, buf->start);
 7011			if (!ret)
 7012				goto out;
 7013		}
 7014
 7015		pin = 0;
 7016		cache = btrfs_lookup_block_group(fs_info, buf->start);
 7017
 7018		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 7019			pin_down_extent(fs_info, cache, buf->start,
 7020					buf->len, 1);
 7021			btrfs_put_block_group(cache);
 7022			goto out;
 7023		}
 7024
 7025		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
 7026
 7027		btrfs_add_free_space(cache, buf->start, buf->len);
 7028		btrfs_free_reserved_bytes(cache, buf->len, 0);
 7029		btrfs_put_block_group(cache);
 7030		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
 7031	}
 7032out:
 7033	if (pin)
 7034		add_pinned_bytes(fs_info, buf->len, true,
 7035				 root->root_key.objectid);
 7036
 7037	if (last_ref) {
 7038		/*
 7039		 * Deleting the buffer, clear the corrupt flag since it doesn't
 7040		 * matter anymore.
 7041		 */
 7042		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
 7043	}
 7044}
 7045
 7046/* Can return -ENOMEM */
 7047int btrfs_free_extent(struct btrfs_trans_handle *trans,
 7048		      struct btrfs_root *root,
 7049		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
 7050		      u64 owner, u64 offset)
 7051{
 7052	struct btrfs_fs_info *fs_info = root->fs_info;
 7053	int old_ref_mod, new_ref_mod;
 7054	int ret;
 7055
 7056	if (btrfs_is_testing(fs_info))
 7057		return 0;
 7058
 7059	if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
 7060		btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
 7061				   root_objectid, owner, offset,
 7062				   BTRFS_DROP_DELAYED_REF);
 7063
 7064	/*
 7065	 * tree log blocks never actually go into the extent allocation
 7066	 * tree, just update pinning info and exit early.
 7067	 */
 7068	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
 7069		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
 7070		/* unlocks the pinned mutex */
 7071		btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
 7072		old_ref_mod = new_ref_mod = 0;
 7073		ret = 0;
 7074	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 7075		ret = btrfs_add_delayed_tree_ref(trans, bytenr,
 7076						 num_bytes, parent,
 7077						 root_objectid, (int)owner,
 7078						 BTRFS_DROP_DELAYED_REF, NULL,
 7079						 &old_ref_mod, &new_ref_mod);
 7080	} else {
 7081		ret = btrfs_add_delayed_data_ref(trans, bytenr,
 7082						 num_bytes, parent,
 7083						 root_objectid, owner, offset,
 7084						 0, BTRFS_DROP_DELAYED_REF,
 7085						 &old_ref_mod, &new_ref_mod);
 7086	}
 7087
 7088	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
 7089		bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
 7090
 7091		add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
 7092	}
 7093
 7094	return ret;
 7095}
 7096
 7097/*
 7098 * when we wait for progress in the block group caching, its because
 7099 * our allocation attempt failed at least once.  So, we must sleep
 7100 * and let some progress happen before we try again.
 7101 *
 7102 * This function will sleep at least once waiting for new free space to
 7103 * show up, and then it will check the block group free space numbers
 7104 * for our min num_bytes.  Another option is to have it go ahead
 7105 * and look in the rbtree for a free extent of a given size, but this
 7106 * is a good start.
 7107 *
 7108 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 7109 * any of the information in this block group.
 7110 */
 7111static noinline void
 7112wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
 7113				u64 num_bytes)
 7114{
 7115	struct btrfs_caching_control *caching_ctl;
 7116
 7117	caching_ctl = get_caching_control(cache);
 7118	if (!caching_ctl)
 7119		return;
 7120
 7121	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
 7122		   (cache->free_space_ctl->free_space >= num_bytes));
 7123
 7124	put_caching_control(caching_ctl);
 7125}
 7126
 7127static noinline int
 7128wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 7129{
 7130	struct btrfs_caching_control *caching_ctl;
 7131	int ret = 0;
 7132
 7133	caching_ctl = get_caching_control(cache);
 7134	if (!caching_ctl)
 7135		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
 7136
 7137	wait_event(caching_ctl->wait, block_group_cache_done(cache));
 7138	if (cache->cached == BTRFS_CACHE_ERROR)
 7139		ret = -EIO;
 7140	put_caching_control(caching_ctl);
 7141	return ret;
 7142}
 7143
 7144enum btrfs_loop_type {
 7145	LOOP_CACHING_NOWAIT = 0,
 7146	LOOP_CACHING_WAIT = 1,
 7147	LOOP_ALLOC_CHUNK = 2,
 7148	LOOP_NO_EMPTY_SIZE = 3,
 7149};
 7150
 7151static inline void
 7152btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
 7153		       int delalloc)
 7154{
 7155	if (delalloc)
 7156		down_read(&cache->data_rwsem);
 7157}
 7158
 7159static inline void
 7160btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
 7161		       int delalloc)
 7162{
 7163	btrfs_get_block_group(cache);
 7164	if (delalloc)
 7165		down_read(&cache->data_rwsem);
 7166}
 7167
 7168static struct btrfs_block_group_cache *
 7169btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
 7170		   struct btrfs_free_cluster *cluster,
 7171		   int delalloc)
 7172{
 7173	struct btrfs_block_group_cache *used_bg = NULL;
 7174
 7175	spin_lock(&cluster->refill_lock);
 7176	while (1) {
 7177		used_bg = cluster->block_group;
 7178		if (!used_bg)
 7179			return NULL;
 7180
 7181		if (used_bg == block_group)
 7182			return used_bg;
 7183
 7184		btrfs_get_block_group(used_bg);
 7185
 7186		if (!delalloc)
 7187			return used_bg;
 7188
 7189		if (down_read_trylock(&used_bg->data_rwsem))
 7190			return used_bg;
 7191
 7192		spin_unlock(&cluster->refill_lock);
 7193
 7194		/* We should only have one-level nested. */
 7195		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
 7196
 7197		spin_lock(&cluster->refill_lock);
 7198		if (used_bg == cluster->block_group)
 7199			return used_bg;
 7200
 7201		up_read(&used_bg->data_rwsem);
 7202		btrfs_put_block_group(used_bg);
 7203	}
 7204}
 7205
 7206static inline void
 7207btrfs_release_block_group(struct btrfs_block_group_cache *cache,
 7208			 int delalloc)
 7209{
 7210	if (delalloc)
 7211		up_read(&cache->data_rwsem);
 7212	btrfs_put_block_group(cache);
 7213}
 7214
 7215/*
 7216 * walks the btree of allocated extents and find a hole of a given size.
 7217 * The key ins is changed to record the hole:
 7218 * ins->objectid == start position
 7219 * ins->flags = BTRFS_EXTENT_ITEM_KEY
 7220 * ins->offset == the size of the hole.
 7221 * Any available blocks before search_start are skipped.
 7222 *
 7223 * If there is no suitable free space, we will record the max size of
 7224 * the free space extent currently.
 7225 */
 7226static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
 7227				u64 ram_bytes, u64 num_bytes, u64 empty_size,
 7228				u64 hint_byte, struct btrfs_key *ins,
 7229				u64 flags, int delalloc)
 7230{
 7231	int ret = 0;
 7232	struct btrfs_root *root = fs_info->extent_root;
 7233	struct btrfs_free_cluster *last_ptr = NULL;
 7234	struct btrfs_block_group_cache *block_group = NULL;
 7235	u64 search_start = 0;
 7236	u64 max_extent_size = 0;
 7237	u64 empty_cluster = 0;
 7238	struct btrfs_space_info *space_info;
 7239	int loop = 0;
 7240	int index = btrfs_bg_flags_to_raid_index(flags);
 7241	bool failed_cluster_refill = false;
 7242	bool failed_alloc = false;
 7243	bool use_cluster = true;
 7244	bool have_caching_bg = false;
 7245	bool orig_have_caching_bg = false;
 7246	bool full_search = false;
 7247
 7248	WARN_ON(num_bytes < fs_info->sectorsize);
 7249	ins->type = BTRFS_EXTENT_ITEM_KEY;
 7250	ins->objectid = 0;
 7251	ins->offset = 0;
 7252
 7253	trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
 7254
 7255	space_info = __find_space_info(fs_info, flags);
 7256	if (!space_info) {
 7257		btrfs_err(fs_info, "No space info for %llu", flags);
 7258		return -ENOSPC;
 7259	}
 7260
 7261	/*
 7262	 * If our free space is heavily fragmented we may not be able to make
 7263	 * big contiguous allocations, so instead of doing the expensive search
 7264	 * for free space, simply return ENOSPC with our max_extent_size so we
 7265	 * can go ahead and search for a more manageable chunk.
 7266	 *
 7267	 * If our max_extent_size is large enough for our allocation simply
 7268	 * disable clustering since we will likely not be able to find enough
 7269	 * space to create a cluster and induce latency trying.
 7270	 */
 7271	if (unlikely(space_info->max_extent_size)) {
 7272		spin_lock(&space_info->lock);
 7273		if (space_info->max_extent_size &&
 7274		    num_bytes > space_info->max_extent_size) {
 7275			ins->offset = space_info->max_extent_size;
 7276			spin_unlock(&space_info->lock);
 7277			return -ENOSPC;
 7278		} else if (space_info->max_extent_size) {
 7279			use_cluster = false;
 7280		}
 7281		spin_unlock(&space_info->lock);
 7282	}
 7283
 7284	last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
 7285	if (last_ptr) {
 7286		spin_lock(&last_ptr->lock);
 7287		if (last_ptr->block_group)
 7288			hint_byte = last_ptr->window_start;
 7289		if (last_ptr->fragmented) {
 7290			/*
 7291			 * We still set window_start so we can keep track of the
 7292			 * last place we found an allocation to try and save
 7293			 * some time.
 7294			 */
 7295			hint_byte = last_ptr->window_start;
 7296			use_cluster = false;
 7297		}
 7298		spin_unlock(&last_ptr->lock);
 7299	}
 7300
 7301	search_start = max(search_start, first_logical_byte(fs_info, 0));
 7302	search_start = max(search_start, hint_byte);
 7303	if (search_start == hint_byte) {
 7304		block_group = btrfs_lookup_block_group(fs_info, search_start);
 7305		/*
 7306		 * we don't want to use the block group if it doesn't match our
 7307		 * allocation bits, or if its not cached.
 7308		 *
 7309		 * However if we are re-searching with an ideal block group
 7310		 * picked out then we don't care that the block group is cached.
 7311		 */
 7312		if (block_group && block_group_bits(block_group, flags) &&
 7313		    block_group->cached != BTRFS_CACHE_NO) {
 7314			down_read(&space_info->groups_sem);
 7315			if (list_empty(&block_group->list) ||
 7316			    block_group->ro) {
 7317				/*
 7318				 * someone is removing this block group,
 7319				 * we can't jump into the have_block_group
 7320				 * target because our list pointers are not
 7321				 * valid
 7322				 */
 7323				btrfs_put_block_group(block_group);
 7324				up_read(&space_info->groups_sem);
 7325			} else {
 7326				index = btrfs_bg_flags_to_raid_index(
 7327						block_group->flags);
 7328				btrfs_lock_block_group(block_group, delalloc);
 7329				goto have_block_group;
 7330			}
 7331		} else if (block_group) {
 7332			btrfs_put_block_group(block_group);
 7333		}
 7334	}
 7335search:
 7336	have_caching_bg = false;
 7337	if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
 7338		full_search = true;
 7339	down_read(&space_info->groups_sem);
 7340	list_for_each_entry(block_group, &space_info->block_groups[index],
 7341			    list) {
 7342		u64 offset;
 7343		int cached;
 7344
 7345		/* If the block group is read-only, we can skip it entirely. */
 7346		if (unlikely(block_group->ro))
 7347			continue;
 7348
 7349		btrfs_grab_block_group(block_group, delalloc);
 7350		search_start = block_group->key.objectid;
 7351
 7352		/*
 7353		 * this can happen if we end up cycling through all the
 7354		 * raid types, but we want to make sure we only allocate
 7355		 * for the proper type.
 7356		 */
 7357		if (!block_group_bits(block_group, flags)) {
 7358			u64 extra = BTRFS_BLOCK_GROUP_DUP |
 7359				BTRFS_BLOCK_GROUP_RAID1 |
 7360				BTRFS_BLOCK_GROUP_RAID5 |
 7361				BTRFS_BLOCK_GROUP_RAID6 |
 7362				BTRFS_BLOCK_GROUP_RAID10;
 7363
 7364			/*
 7365			 * if they asked for extra copies and this block group
 7366			 * doesn't provide them, bail.  This does allow us to
 7367			 * fill raid0 from raid1.
 7368			 */
 7369			if ((flags & extra) && !(block_group->flags & extra))
 7370				goto loop;
 7371		}
 7372
 7373have_block_group:
 7374		cached = block_group_cache_done(block_group);
 7375		if (unlikely(!cached)) {
 7376			have_caching_bg = true;
 7377			ret = cache_block_group(block_group, 0);
 7378			BUG_ON(ret < 0);
 7379			ret = 0;
 7380		}
 7381
 7382		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
 7383			goto loop;
 7384
 7385		/*
 7386		 * Ok we want to try and use the cluster allocator, so
 7387		 * lets look there
 7388		 */
 7389		if (last_ptr && use_cluster) {
 7390			struct btrfs_block_group_cache *used_block_group;
 7391			unsigned long aligned_cluster;
 7392			/*
 7393			 * the refill lock keeps out other
 7394			 * people trying to start a new cluster
 7395			 */
 7396			used_block_group = btrfs_lock_cluster(block_group,
 7397							      last_ptr,
 7398							      delalloc);
 7399			if (!used_block_group)
 7400				goto refill_cluster;
 7401
 7402			if (used_block_group != block_group &&
 7403			    (used_block_group->ro ||
 7404			     !block_group_bits(used_block_group, flags)))
 7405				goto release_cluster;
 7406
 7407			offset = btrfs_alloc_from_cluster(used_block_group,
 7408						last_ptr,
 7409						num_bytes,
 7410						used_block_group->key.objectid,
 7411						&max_extent_size);
 7412			if (offset) {
 7413				/* we have a block, we're done */
 7414				spin_unlock(&last_ptr->refill_lock);
 7415				trace_btrfs_reserve_extent_cluster(
 7416						used_block_group,
 7417						search_start, num_bytes);
 7418				if (used_block_group != block_group) {
 7419					btrfs_release_block_group(block_group,
 7420								  delalloc);
 7421					block_group = used_block_group;
 7422				}
 7423				goto checks;
 7424			}
 7425
 7426			WARN_ON(last_ptr->block_group != used_block_group);
 7427release_cluster:
 7428			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
 7429			 * set up a new clusters, so lets just skip it
 7430			 * and let the allocator find whatever block
 7431			 * it can find.  If we reach this point, we
 7432			 * will have tried the cluster allocator
 7433			 * plenty of times and not have found
 7434			 * anything, so we are likely way too
 7435			 * fragmented for the clustering stuff to find
 7436			 * anything.
 7437			 *
 7438			 * However, if the cluster is taken from the
 7439			 * current block group, release the cluster
 7440			 * first, so that we stand a better chance of
 7441			 * succeeding in the unclustered
 7442			 * allocation.  */
 7443			if (loop >= LOOP_NO_EMPTY_SIZE &&
 7444			    used_block_group != block_group) {
 7445				spin_unlock(&last_ptr->refill_lock);
 7446				btrfs_release_block_group(used_block_group,
 7447							  delalloc);
 7448				goto unclustered_alloc;
 7449			}
 7450
 7451			/*
 7452			 * this cluster didn't work out, free it and
 7453			 * start over
 7454			 */
 7455			btrfs_return_cluster_to_free_space(NULL, last_ptr);
 7456
 7457			if (used_block_group != block_group)
 7458				btrfs_release_block_group(used_block_group,
 7459							  delalloc);
 7460refill_cluster:
 7461			if (loop >= LOOP_NO_EMPTY_SIZE) {
 7462				spin_unlock(&last_ptr->refill_lock);
 7463				goto unclustered_alloc;
 7464			}
 7465
 7466			aligned_cluster = max_t(unsigned long,
 7467						empty_cluster + empty_size,
 7468					      block_group->full_stripe_len);
 7469
 7470			/* allocate a cluster in this block group */
 7471			ret = btrfs_find_space_cluster(fs_info, block_group,
 7472						       last_ptr, search_start,
 7473						       num_bytes,
 7474						       aligned_cluster);
 7475			if (ret == 0) {
 7476				/*
 7477				 * now pull our allocation out of this
 7478				 * cluster
 7479				 */
 7480				offset = btrfs_alloc_from_cluster(block_group,
 7481							last_ptr,
 7482							num_bytes,
 7483							search_start,
 7484							&max_extent_size);
 7485				if (offset) {
 7486					/* we found one, proceed */
 7487					spin_unlock(&last_ptr->refill_lock);
 7488					trace_btrfs_reserve_extent_cluster(
 7489						block_group, search_start,
 7490						num_bytes);
 7491					goto checks;
 7492				}
 7493			} else if (!cached && loop > LOOP_CACHING_NOWAIT
 7494				   && !failed_cluster_refill) {
 7495				spin_unlock(&last_ptr->refill_lock);
 7496
 7497				failed_cluster_refill = true;
 7498				wait_block_group_cache_progress(block_group,
 7499				       num_bytes + empty_cluster + empty_size);
 7500				goto have_block_group;
 7501			}
 7502
 7503			/*
 7504			 * at this point we either didn't find a cluster
 7505			 * or we weren't able to allocate a block from our
 7506			 * cluster.  Free the cluster we've been trying
 7507			 * to use, and go to the next block group
 7508			 */
 7509			btrfs_return_cluster_to_free_space(NULL, last_ptr);
 7510			spin_unlock(&last_ptr->refill_lock);
 7511			goto loop;
 7512		}
 7513
 7514unclustered_alloc:
 7515		/*
 7516		 * We are doing an unclustered alloc, set the fragmented flag so
 7517		 * we don't bother trying to setup a cluster again until we get
 7518		 * more space.
 7519		 */
 7520		if (unlikely(last_ptr)) {
 7521			spin_lock(&last_ptr->lock);
 7522			last_ptr->fragmented = 1;
 7523			spin_unlock(&last_ptr->lock);
 7524		}
 7525		if (cached) {
 7526			struct btrfs_free_space_ctl *ctl =
 7527				block_group->free_space_ctl;
 7528
 7529			spin_lock(&ctl->tree_lock);
 7530			if (ctl->free_space <
 7531			    num_bytes + empty_cluster + empty_size) {
 7532				if (ctl->free_space > max_extent_size)
 7533					max_extent_size = ctl->free_space;
 7534				spin_unlock(&ctl->tree_lock);
 7535				goto loop;
 7536			}
 7537			spin_unlock(&ctl->tree_lock);
 7538		}
 7539
 7540		offset = btrfs_find_space_for_alloc(block_group, search_start,
 7541						    num_bytes, empty_size,
 7542						    &max_extent_size);
 7543		/*
 7544		 * If we didn't find a chunk, and we haven't failed on this
 7545		 * block group before, and this block group is in the middle of
 7546		 * caching and we are ok with waiting, then go ahead and wait
 7547		 * for progress to be made, and set failed_alloc to true.
 7548		 *
 7549		 * If failed_alloc is true then we've already waited on this
 7550		 * block group once and should move on to the next block group.
 7551		 */
 7552		if (!offset && !failed_alloc && !cached &&
 7553		    loop > LOOP_CACHING_NOWAIT) {
 7554			wait_block_group_cache_progress(block_group,
 7555						num_bytes + empty_size);
 7556			failed_alloc = true;
 7557			goto have_block_group;
 7558		} else if (!offset) {
 7559			goto loop;
 7560		}
 7561checks:
 7562		search_start = round_up(offset, fs_info->stripesize);
 7563
 7564		/* move on to the next group */
 7565		if (search_start + num_bytes >
 7566		    block_group->key.objectid + block_group->key.offset) {
 7567			btrfs_add_free_space(block_group, offset, num_bytes);
 7568			goto loop;
 7569		}
 7570
 7571		if (offset < search_start)
 7572			btrfs_add_free_space(block_group, offset,
 7573					     search_start - offset);
 7574
 7575		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
 7576				num_bytes, delalloc);
 7577		if (ret == -EAGAIN) {
 7578			btrfs_add_free_space(block_group, offset, num_bytes);
 7579			goto loop;
 7580		}
 7581		btrfs_inc_block_group_reservations(block_group);
 7582
 7583		/* we are all good, lets return */
 7584		ins->objectid = search_start;
 7585		ins->offset = num_bytes;
 7586
 7587		trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
 7588		btrfs_release_block_group(block_group, delalloc);
 7589		break;
 7590loop:
 7591		failed_cluster_refill = false;
 7592		failed_alloc = false;
 7593		BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
 7594		       index);
 7595		btrfs_release_block_group(block_group, delalloc);
 7596		cond_resched();
 7597	}
 7598	up_read(&space_info->groups_sem);
 7599
 7600	if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
 7601		&& !orig_have_caching_bg)
 7602		orig_have_caching_bg = true;
 7603
 7604	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
 7605		goto search;
 7606
 7607	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
 7608		goto search;
 7609
 7610	/*
 7611	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
 7612	 *			caching kthreads as we move along
 7613	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
 7614	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
 7615	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
 7616	 *			again
 7617	 */
 7618	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
 7619		index = 0;
 7620		if (loop == LOOP_CACHING_NOWAIT) {
 7621			/*
 7622			 * We want to skip the LOOP_CACHING_WAIT step if we
 7623			 * don't have any uncached bgs and we've already done a
 7624			 * full search through.
 7625			 */
 7626			if (orig_have_caching_bg || !full_search)
 7627				loop = LOOP_CACHING_WAIT;
 7628			else
 7629				loop = LOOP_ALLOC_CHUNK;
 7630		} else {
 7631			loop++;
 7632		}
 7633
 7634		if (loop == LOOP_ALLOC_CHUNK) {
 7635			struct btrfs_trans_handle *trans;
 7636			int exist = 0;
 7637
 7638			trans = current->journal_info;
 7639			if (trans)
 7640				exist = 1;
 7641			else
 7642				trans = btrfs_join_transaction(root);
 7643
 7644			if (IS_ERR(trans)) {
 7645				ret = PTR_ERR(trans);
 7646				goto out;
 7647			}
 7648
 7649			ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
 7650
 7651			/*
 7652			 * If we can't allocate a new chunk we've already looped
 7653			 * through at least once, move on to the NO_EMPTY_SIZE
 7654			 * case.
 7655			 */
 7656			if (ret == -ENOSPC)
 7657				loop = LOOP_NO_EMPTY_SIZE;
 7658
 7659			/*
 7660			 * Do not bail out on ENOSPC since we
 7661			 * can do more things.
 7662			 */
 7663			if (ret < 0 && ret != -ENOSPC)
 7664				btrfs_abort_transaction(trans, ret);
 7665			else
 7666				ret = 0;
 7667			if (!exist)
 7668				btrfs_end_transaction(trans);
 7669			if (ret)
 7670				goto out;
 7671		}
 7672
 7673		if (loop == LOOP_NO_EMPTY_SIZE) {
 7674			/*
 7675			 * Don't loop again if we already have no empty_size and
 7676			 * no empty_cluster.
 7677			 */
 7678			if (empty_size == 0 &&
 7679			    empty_cluster == 0) {
 7680				ret = -ENOSPC;
 7681				goto out;
 7682			}
 7683			empty_size = 0;
 7684			empty_cluster = 0;
 7685		}
 7686
 7687		goto search;
 7688	} else if (!ins->objectid) {
 7689		ret = -ENOSPC;
 7690	} else if (ins->objectid) {
 7691		if (!use_cluster && last_ptr) {
 7692			spin_lock(&last_ptr->lock);
 7693			last_ptr->window_start = ins->objectid;
 7694			spin_unlock(&last_ptr->lock);
 7695		}
 7696		ret = 0;
 7697	}
 7698out:
 7699	if (ret == -ENOSPC) {
 7700		spin_lock(&space_info->lock);
 7701		space_info->max_extent_size = max_extent_size;
 7702		spin_unlock(&space_info->lock);
 7703		ins->offset = max_extent_size;
 7704	}
 7705	return ret;
 7706}
 7707
 7708static void dump_space_info(struct btrfs_fs_info *fs_info,
 7709			    struct btrfs_space_info *info, u64 bytes,
 7710			    int dump_block_groups)
 7711{
 7712	struct btrfs_block_group_cache *cache;
 7713	int index = 0;
 7714
 7715	spin_lock(&info->lock);
 7716	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
 7717		   info->flags,
 7718		   info->total_bytes - btrfs_space_info_used(info, true),
 7719		   info->full ? "" : "not ");
 7720	btrfs_info(fs_info,
 7721		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
 7722		info->total_bytes, info->bytes_used, info->bytes_pinned,
 7723		info->bytes_reserved, info->bytes_may_use,
 7724		info->bytes_readonly);
 7725	spin_unlock(&info->lock);
 7726
 7727	if (!dump_block_groups)
 7728		return;
 7729
 7730	down_read(&info->groups_sem);
 7731again:
 7732	list_for_each_entry(cache, &info->block_groups[index], list) {
 7733		spin_lock(&cache->lock);
 7734		btrfs_info(fs_info,
 7735			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
 7736			cache->key.objectid, cache->key.offset,
 7737			btrfs_block_group_used(&cache->item), cache->pinned,
 7738			cache->reserved, cache->ro ? "[readonly]" : "");
 7739		btrfs_dump_free_space(cache, bytes);
 7740		spin_unlock(&cache->lock);
 7741	}
 7742	if (++index < BTRFS_NR_RAID_TYPES)
 7743		goto again;
 7744	up_read(&info->groups_sem);
 7745}
 7746
 7747/*
 7748 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
 7749 *			  hole that is at least as big as @num_bytes.
 7750 *
 7751 * @root           -	The root that will contain this extent
 7752 *
 7753 * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
 7754 *			is used for accounting purposes. This value differs
 7755 *			from @num_bytes only in the case of compressed extents.
 7756 *
 7757 * @num_bytes      -	Number of bytes to allocate on-disk.
 7758 *
 7759 * @min_alloc_size -	Indicates the minimum amount of space that the
 7760 *			allocator should try to satisfy. In some cases
 7761 *			@num_bytes may be larger than what is required and if
 7762 *			the filesystem is fragmented then allocation fails.
 7763 *			However, the presence of @min_alloc_size gives a
 7764 *			chance to try and satisfy the smaller allocation.
 7765 *
 7766 * @empty_size     -	A hint that you plan on doing more COW. This is the
 7767 *			size in bytes the allocator should try to find free
 7768 *			next to the block it returns.  This is just a hint and
 7769 *			may be ignored by the allocator.
 7770 *
 7771 * @hint_byte      -	Hint to the allocator to start searching above the byte
 7772 *			address passed. It might be ignored.
 7773 *
 7774 * @ins            -	This key is modified to record the found hole. It will
 7775 *			have the following values:
 7776 *			ins->objectid == start position
 *			ins->type == BTRFS_EXTENT_ITEM_KEY
 7778 *			ins->offset == the size of the hole.
 7779 *
 7780 * @is_data        -	Boolean flag indicating whether an extent is
 7781 *			allocated for data (true) or metadata (false)
 7782 *
 7783 * @delalloc       -	Boolean flag indicating whether this allocation is for
 7784 *			delalloc or not. If 'true' data_rwsem of block groups
 7785 *			is going to be acquired.
 7786 *
 7787 *
 7788 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
 7789 * case -ENOSPC is returned then @ins->offset will contain the size of the
 7790 * largest available hole the allocator managed to find.
 7791 */
 7792int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
 7793			 u64 num_bytes, u64 min_alloc_size,
 7794			 u64 empty_size, u64 hint_byte,
 7795			 struct btrfs_key *ins, int is_data, int delalloc)
 7796{
 7797	struct btrfs_fs_info *fs_info = root->fs_info;
 7798	bool final_tried = num_bytes == min_alloc_size;
 7799	u64 flags;
 7800	int ret;
 7801
 7802	flags = get_alloc_profile_by_root(root, is_data);
 7803again:
 7804	WARN_ON(num_bytes < fs_info->sectorsize);
 7805	ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
 7806			       hint_byte, ins, flags, delalloc);
 7807	if (!ret && !is_data) {
 7808		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
 7809	} else if (ret == -ENOSPC) {
 7810		if (!final_tried && ins->offset) {
 7811			num_bytes = min(num_bytes >> 1, ins->offset);
 7812			num_bytes = round_down(num_bytes,
 7813					       fs_info->sectorsize);
 7814			num_bytes = max(num_bytes, min_alloc_size);
 7815			ram_bytes = num_bytes;
 7816			if (num_bytes == min_alloc_size)
 7817				final_tried = true;
 7818			goto again;
 7819		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
 7820			struct btrfs_space_info *sinfo;
 7821
 7822			sinfo = __find_space_info(fs_info, flags);
 7823			btrfs_err(fs_info,
 7824				  "allocation failed flags %llu, wanted %llu",
 7825				  flags, num_bytes);
 7826			if (sinfo)
 7827				dump_space_info(fs_info, sinfo, num_bytes, 1);
 7828		}
 7829	}
 7830
 7831	return ret;
 7832}
 7833
 7834static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
 7835					u64 start, u64 len,
 7836					int pin, int delalloc)
 7837{
 7838	struct btrfs_block_group_cache *cache;
 7839	int ret = 0;
 7840
 7841	cache = btrfs_lookup_block_group(fs_info, start);
 7842	if (!cache) {
 7843		btrfs_err(fs_info, "Unable to find block group for %llu",
 7844			  start);
 7845		return -ENOSPC;
 7846	}
 7847
 7848	if (pin)
 7849		pin_down_extent(fs_info, cache, start, len, 1);
 7850	else {
 7851		if (btrfs_test_opt(fs_info, DISCARD))
 7852			ret = btrfs_discard_extent(fs_info, start, len, NULL);
 7853		btrfs_add_free_space(cache, start, len);
 7854		btrfs_free_reserved_bytes(cache, len, delalloc);
 7855		trace_btrfs_reserved_extent_free(fs_info, start, len);
 7856	}
 7857
 7858	btrfs_put_block_group(cache);
 7859	return ret;
 7860}
 7861
 7862int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
 7863			       u64 start, u64 len, int delalloc)
 7864{
 7865	return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
 7866}
 7867
 7868int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
 7869				       u64 start, u64 len)
 7870{
 7871	return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
 7872}
 7873
 7874static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 7875				      u64 parent, u64 root_objectid,
 7876				      u64 flags, u64 owner, u64 offset,
 7877				      struct btrfs_key *ins, int ref_mod)
 7878{
 7879	struct btrfs_fs_info *fs_info = trans->fs_info;
 7880	int ret;
 7881	struct btrfs_extent_item *extent_item;
 7882	struct btrfs_extent_inline_ref *iref;
 7883	struct btrfs_path *path;
 7884	struct extent_buffer *leaf;
 7885	int type;
 7886	u32 size;
 7887
 7888	if (parent > 0)
 7889		type = BTRFS_SHARED_DATA_REF_KEY;
 7890	else
 7891		type = BTRFS_EXTENT_DATA_REF_KEY;
 7892
 7893	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
 7894
 7895	path = btrfs_alloc_path();
 7896	if (!path)
 7897		return -ENOMEM;
 7898
 7899	path->leave_spinning = 1;
 7900	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
 7901				      ins, size);
 7902	if (ret) {
 7903		btrfs_free_path(path);
 7904		return ret;
 7905	}
 7906
 7907	leaf = path->nodes[0];
 7908	extent_item = btrfs_item_ptr(leaf, path->slots[0],
 7909				     struct btrfs_extent_item);
 7910	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
 7911	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
 7912	btrfs_set_extent_flags(leaf, extent_item,
 7913			       flags | BTRFS_EXTENT_FLAG_DATA);
 7914
 7915	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
 7916	btrfs_set_extent_inline_ref_type(leaf, iref, type);
 7917	if (parent > 0) {
 7918		struct btrfs_shared_data_ref *ref;
 7919		ref = (struct btrfs_shared_data_ref *)(iref + 1);
 7920		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
 7921		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
 7922	} else {
 7923		struct btrfs_extent_data_ref *ref;
 7924		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
 7925		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
 7926		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
 7927		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
 7928		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
 7929	}
 7930
 7931	btrfs_mark_buffer_dirty(path->nodes[0]);
 7932	btrfs_free_path(path);
 7933
 7934	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
 7935	if (ret)
 7936		return ret;
 7937
 7938	ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
 7939	if (ret) { /* -ENOENT, logic error */
 7940		btrfs_err(fs_info, "update block group failed for %llu %llu",
 7941			ins->objectid, ins->offset);
 7942		BUG();
 7943	}
 7944	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
 7945	return ret;
 7946}
 7947
/*
 * Insert the extent tree item for the tree block described by the delayed ref
 * @node, with one inline backref, then remove the range from the free space
 * tree and update the block group.
 *
 * The key and item layout depend on the SKINNY_METADATA incompat flag:
 *  - set:   METADATA_ITEM key with offset == level, no tree_block_info
 *  - clear: EXTENT_ITEM key with offset == node->num_bytes, and a
 *           btrfs_tree_block_info placed between the extent item and the
 *           inline ref
 *
 * If the path allocation or the item insertion fails, the reserved extent is
 * released via btrfs_free_and_pin_reserved_extent() before returning.
 *
 * Returns 0 on success or a negative errno.
 */
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_key extent_key;
	struct btrfs_tree_block_info *block_info;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_delayed_tree_ref *ref;
	/* Base item size; grows by sizeof(*block_info) without skinny metadata */
	u32 size = sizeof(*extent_item) + sizeof(*iref);
	u64 num_bytes;
	u64 flags = extent_op->flags_to_set;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	ref = btrfs_delayed_node_to_tree_ref(node);

	extent_key.objectid = node->bytenr;
	if (skinny_metadata) {
		extent_key.offset = ref->level;
		extent_key.type = BTRFS_METADATA_ITEM_KEY;
		num_bytes = fs_info->nodesize;
	} else {
		extent_key.offset = node->num_bytes;
		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
		size += sizeof(*block_info);
		num_bytes = node->num_bytes;
	}

	path = btrfs_alloc_path();
	if (!path) {
		btrfs_free_and_pin_reserved_extent(fs_info,
						   extent_key.objectid,
						   fs_info->nodesize);
		return -ENOMEM;
	}

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      &extent_key, size);
	if (ret) {
		btrfs_free_path(path);
		btrfs_free_and_pin_reserved_extent(fs_info,
						   extent_key.objectid,
						   fs_info->nodesize);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	/* A freshly allocated tree block starts with exactly one reference */
	btrfs_set_extent_refs(leaf, extent_item, 1);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);

	/* Locate the inline ref: right after the item, or after block_info */
	if (skinny_metadata) {
		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	} else {
		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
		btrfs_set_tree_block_level(leaf, block_info, ref->level);
		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
	}

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		/* Shared refs are keyed on the parent block and need FULL_BACKREF */
		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_SHARED_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
	} else {
		/* Normal refs are keyed on the owning root */
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_TREE_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
	}

	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, extent_key.objectid,
					  num_bytes);
	if (ret)
		return ret;

	ret = update_block_group(trans, fs_info, extent_key.objectid,
				 fs_info->nodesize, 1);
	if (ret) { /* -ENOENT, logic error */
		/*
		 * NOTE(review): with skinny metadata extent_key.offset holds
		 * the level, not a length, so the second value printed here is
		 * the level in that case.
		 */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			extent_key.objectid, extent_key.offset);
		BUG();
	}

	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
					  fs_info->nodesize);
	return ret;
}
 8047
 8048int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 8049				     struct btrfs_root *root, u64 owner,
 8050				     u64 offset, u64 ram_bytes,
 8051				     struct btrfs_key *ins)
 8052{
 8053	int ret;
 8054
 8055	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
 8056
 8057	btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
 8058			   root->root_key.objectid, owner, offset,
 8059			   BTRFS_ADD_DELAYED_EXTENT);
 8060
 8061	ret = btrfs_add_delayed_data_ref(trans, ins->objectid,
 8062					 ins->offset, 0,
 8063					 root->root_key.objectid, owner,
 8064					 offset, ram_bytes,
 8065					 BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
 8066	return ret;
 8067}
 8068
 8069/*
 8070 * this is used by the tree logging recovery code.  It records that
 8071 * an extent has been allocated and makes sure to clear the free
 8072 * space cache bits as well
 8073 */
 8074int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 8075				   u64 root_objectid, u64 owner, u64 offset,
 8076				   struct btrfs_key *ins)
 8077{
 8078	struct btrfs_fs_info *fs_info = trans->fs_info;
 8079	int ret;
 8080	struct btrfs_block_group_cache *block_group;
 8081	struct btrfs_space_info *space_info;
 8082
 8083	/*
 8084	 * Mixed block groups will exclude before processing the log so we only
 8085	 * need to do the exclude dance if this fs isn't mixed.
 8086	 */
 8087	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 8088		ret = __exclude_logged_extent(fs_info, ins->objectid,
 8089					      ins->offset);
 8090		if (ret)
 8091			return ret;
 8092	}
 8093
 8094	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
 8095	if (!block_group)
 8096		return -EINVAL;
 8097
 8098	space_info = block_group->space_info;
 8099	spin_lock(&space_info->lock);
 8100	spin_lock(&block_group->lock);
 8101	space_info->bytes_reserved += ins->offset;
 8102	block_group->reserved += ins->offset;
 8103	spin_unlock(&block_group->lock);
 8104	spin_unlock(&space_info->lock);
 8105
 8106	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
 8107					 offset, ins, 1);
 8108	btrfs_put_block_group(block_group);
 8109	return ret;
 8110}
 8111
 8112static struct extent_buffer *
 8113btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 8114		      u64 bytenr, int level, u64 owner)
 8115{
 8116	struct btrfs_fs_info *fs_info = root->fs_info;
 8117	struct extent_buffer *buf;
 8118
 8119	buf = btrfs_find_create_tree_block(fs_info, bytenr);
 8120	if (IS_ERR(buf))
 8121		return buf;
 8122
 8123	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
 8124	btrfs_tree_lock(buf);
 8125	clean_tree_block(fs_info, buf);
 8126	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 8127
 8128	btrfs_set_lock_blocking(buf);
 8129	set_extent_buffer_uptodate(buf);
 8130
 8131	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
 8132	btrfs_set_header_level(buf, level);
 8133	btrfs_set_header_bytenr(buf, buf->start);
 8134	btrfs_set_header_generation(buf, trans->transid);
 8135	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
 8136	btrfs_set_header_owner(buf, owner);
 8137	write_extent_buffer_fsid(buf, fs_info->fsid);
 8138	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
 8139	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
 8140		buf->log_index = root->log_transid % 2;
 8141		/*
 8142		 * we allow two log transactions at a time, use different
 8143		 * EXENT bit to differentiate dirty pages.
 8144		 */
 8145		if (buf->log_index == 0)
 8146			set_extent_dirty(&root->dirty_log_pages, buf->start,
 8147					buf->start + buf->len - 1, GFP_NOFS);
 8148		else
 8149			set_extent_new(&root->dirty_log_pages, buf->start,
 8150					buf->start + buf->len - 1);
 8151	} else {
 8152		buf->log_index = -1;
 8153		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 8154			 buf->start + buf->len - 1, GFP_NOFS);
 8155	}
 8156	trans->dirty = true;
 8157	/* this returns a buffer locked for blocking */
 8158	return buf;
 8159}
 8160
/*
 * Pick the block reserve that will pay for a new tree block of @blocksize
 * bytes and charge it.
 *
 * Order of attempts:
 *  1. take the bytes from the reserve returned by get_block_rsv() (skipped
 *     when that reserve has size 0);
 *  2. if that reserve is the global one, refresh it once with
 *     update_global_block_rsv() and retry;
 *  3. reserve fresh metadata bytes with BTRFS_RESERVE_NO_FLUSH;
 *  4. fall back to the global reserve when it shares the same space_info.
 *
 * A reserve marked failfast returns the error from step 1 immediately.
 *
 * Returns the reserve that was charged, or an ERR_PTR on failure.
 */
static struct btrfs_block_rsv *
use_block_rsv(struct btrfs_trans_handle *trans,
	      struct btrfs_root *root, u32 blocksize)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *block_rsv;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;
	/* Ensures the global reserve is refreshed at most once */
	bool global_updated = false;

	block_rsv = get_block_rsv(trans, root);

	/* An empty reserve cannot pay; go straight to reserving new bytes */
	if (unlikely(block_rsv->size == 0))
		goto try_reserve;
again:
	ret = block_rsv_use_bytes(block_rsv, blocksize);
	if (!ret)
		return block_rsv;

	if (block_rsv->failfast)
		return ERR_PTR(ret);

	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
		global_updated = true;
		update_global_block_rsv(fs_info);
		goto again;
	}

	/* Rate-limited warning, only with the enospc_debug mount option */
	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		static DEFINE_RATELIMIT_STATE(_rs,
				DEFAULT_RATELIMIT_INTERVAL * 10,
				/*DEFAULT_RATELIMIT_BURST*/ 1);
		if (__ratelimit(&_rs))
			WARN(1, KERN_DEBUG
				"BTRFS: block rsv returned %d\n", ret);
	}
try_reserve:
	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
				     BTRFS_RESERVE_NO_FLUSH);
	if (!ret)
		return block_rsv;
	/*
	 * If we couldn't reserve metadata bytes try and use some from
	 * the global reserve if its space type is the same as the global
	 * reservation.
	 */
	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
	    block_rsv->space_info == global_rsv->space_info) {
		ret = block_rsv_use_bytes(global_rsv, blocksize);
		if (!ret)
			return global_rsv;
	}
	return ERR_PTR(ret);
}
 8215
/*
 * Undo a successful use_block_rsv(): add @blocksize bytes back to @block_rsv
 * and then call block_rsv_release_bytes() on it.
 * NOTE(review): a release size of 0 presumably means "release only the
 * excess over the reserve's size" - confirm against block_rsv_release_bytes().
 */
static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
{
	block_rsv_add_bytes(block_rsv, blocksize, 0);
	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
}
 8222
 8223/*
 8224 * finds a free extent and does all the dirty work required for allocation
 8225 * returns the tree buffer or an ERR_PTR on error.
 8226 */
 8227struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 8228					     struct btrfs_root *root,
 8229					     u64 parent, u64 root_objectid,
 8230					     const struct btrfs_disk_key *key,
 8231					     int level, u64 hint,
 8232					     u64 empty_size)
 8233{
 8234	struct btrfs_fs_info *fs_info = root->fs_info;
 8235	struct btrfs_key ins;
 8236	struct btrfs_block_rsv *block_rsv;
 8237	struct extent_buffer *buf;
 8238	struct btrfs_delayed_extent_op *extent_op;
 8239	u64 flags = 0;
 8240	int ret;
 8241	u32 blocksize = fs_info->nodesize;
 8242	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
 8243
 8244#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 8245	if (btrfs_is_testing(fs_info)) {
 8246		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
 8247					    level, root_objectid);
 8248		if (!IS_ERR(buf))
 8249			root->alloc_bytenr += blocksize;
 8250		return buf;
 8251	}
 8252#endif
 8253
 8254	block_rsv = use_block_rsv(trans, root, blocksize);
 8255	if (IS_ERR(block_rsv))
 8256		return ERR_CAST(block_rsv);
 8257
 8258	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
 8259				   empty_size, hint, &ins, 0, 0);
 8260	if (ret)
 8261		goto out_unuse;
 8262
 8263	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
 8264				    root_objectid);
 8265	if (IS_ERR(buf)) {
 8266		ret = PTR_ERR(buf);
 8267		goto out_free_reserved;
 8268	}
 8269
 8270	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
 8271		if (parent == 0)
 8272			parent = ins.objectid;
 8273		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
 8274	} else
 8275		BUG_ON(parent > 0);
 8276
 8277	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
 8278		extent_op = btrfs_alloc_delayed_extent_op();
 8279		if (!extent_op) {
 8280			ret = -ENOMEM;
 8281			goto out_free_buf;
 8282		}
 8283		if (key)
 8284			memcpy(&extent_op->key, key, sizeof(extent_op->key));
 8285		else
 8286			memset(&extent_op->key, 0, sizeof(extent_op->key));
 8287		extent_op->flags_to_set = flags;
 8288		extent_op->update_key = skinny_metadata ? false : true;
 8289		extent_op->update_flags = true;
 8290		extent_op->is_data = false;
 8291		extent_op->level = level;
 8292
 8293		btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
 8294				   root_objectid, level, 0,
 8295				   BTRFS_ADD_DELAYED_EXTENT);
 8296		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
 8297						 ins.offset, parent,
 8298						 root_objectid, level,
 8299						 BTRFS_ADD_DELAYED_EXTENT,
 8300						 extent_op, NULL, NULL);
 8301		if (ret)
 8302			goto out_free_delayed;
 8303	}
 8304	return buf;
 8305
 8306out_free_delayed:
 8307	btrfs_free_delayed_extent_op(extent_op);
 8308out_free_buf:
 8309	free_extent_buffer(buf);
 8310out_free_reserved:
 8311	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
 8312out_unuse:
 8313	unuse_block_rsv(fs_info, block_rsv, blocksize);
 8314	return ERR_PTR(ret);
 8315}
 8316
/*
 * State carried across the walk_down_*() / walk_up_*() helpers while a tree
 * is being walked.
 */
struct walk_control {
	/* Reference count of the block at each level, from btrfs_lookup_extent_info() */
	u64 refs[BTRFS_MAX_LEVEL];
	/* Extent flags of the block at each level, from btrfs_lookup_extent_info() */
	u64 flags[BTRFS_MAX_LEVEL];
	/* Key compared against node keys to decide whether backrefs still need updating */
	struct btrfs_key update_progress;
	/* Current stage: DROP_REFERENCE or UPDATE_BACKREF */
	int stage;
	/* Level of the block currently being processed */
	int level;
	/* Level at which the stage was switched to UPDATE_BACKREF; -1 when unset */
	int shared_level;
	/* When zero, shared blocks are skipped instead of entering UPDATE_BACKREF */
	int update_ref;
	/* When set, walk_down_proc() keeps the block locked in DROP_REFERENCE stage */
	int keep_locks;
	/* Slot where the last readahead pass in reada_walk_down() stopped */
	int reada_slot;
	/* Number of blocks reada_walk_down() reads ahead per pass */
	int reada_count;
};

/* Values for walk_control::stage */
#define DROP_REFERENCE	1
#define UPDATE_BACKREF	2
 8332
/*
 * Issue readahead for child blocks of the node at wc->level, starting at the
 * current slot of @path.
 *
 * The readahead window (wc->reada_count) shrinks to 2/3 (minimum 2) when the
 * walk has not yet reached the slot where the previous pass stopped, and
 * grows by 3/2 (capped at the number of pointers per node) otherwise.
 * Children that the walk would skip anyway are not read. Errors are ignored.
 */
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct walk_control *wc,
				     struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 refs;
	u64 flags;
	u32 nritems;
	struct btrfs_key key;
	struct extent_buffer *eb;
	int ret;
	int slot;
	int nread = 0;

	if (path->slots[wc->level] < wc->reada_slot) {
		wc->reada_count = wc->reada_count * 2 / 3;
		wc->reada_count = max(wc->reada_count, 2);
	} else {
		wc->reada_count = wc->reada_count * 3 / 2;
		wc->reada_count = min_t(int, wc->reada_count,
					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
	}

	eb = path->nodes[wc->level];
	nritems = btrfs_header_nritems(eb);

	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
		if (nread >= wc->reada_count)
			break;

		cond_resched();
		bytenr = btrfs_node_blockptr(eb, slot);
		generation = btrfs_node_ptr_generation(eb, slot);

		/* The block at the current slot is always read */
		if (slot == path->slots[wc->level])
			goto reada;

		if (wc->stage == UPDATE_BACKREF &&
		    generation <= root->root_key.offset)
			continue;

		/* We don't lock the tree block, it's OK to be racy here */
		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
					       wc->level - 1, 1, &refs,
					       &flags);
		/* We don't care about errors in readahead. */
		if (ret < 0)
			continue;
		BUG_ON(refs == 0);

		if (wc->stage == DROP_REFERENCE) {
			/* Unshared block: the walk will descend into it */
			if (refs == 1)
				goto reada;

			/* Same skip conditions as do_walk_down() for shared blocks */
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				continue;
			btrfs_node_key_to_cpu(eb, &key, slot);
			ret = btrfs_comp_cpu_keys(&key,
						  &wc->update_progress);
			if (ret < 0)
				continue;
		} else {
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
		}
reada:
		readahead_tree_block(fs_info, bytenr);
		nread++;
	}
	/* Remember where this pass stopped for the next window adjustment */
	wc->reada_slot = slot;
}
 8412
 8413/*
 8414 * helper to process tree block while walking down the tree.
 8415 *
 8416 * when wc->stage == UPDATE_BACKREF, this function updates
 8417 * back refs for pointers in the block.
 8418 *
 8419 * NOTE: return value 1 means we should stop walking down.
 8420 */
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc, int lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	int ret;

	/* In UPDATE_BACKREF stage, blocks owned by another root are not walked */
	if (wc->stage == UPDATE_BACKREF &&
	    btrfs_header_owner(eb) != root->root_key.objectid)
		return 1;

	/*
	 * when reference count of tree block is 1, it won't increase
	 * again. once full backref flag is set, we never clear it.
	 */
	if (lookup_info &&
	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
		/* The cached refs/flags may be stale: re-read them under the lock */
		BUG_ON(!path->locks[level]);
		ret = btrfs_lookup_extent_info(trans, fs_info,
					       eb->start, level, 1,
					       &wc->refs[level],
					       &wc->flags[level]);
		BUG_ON(ret == -ENOMEM);
		if (ret)
			return ret;
		BUG_ON(wc->refs[level] == 0);
	}

	if (wc->stage == DROP_REFERENCE) {
		/* Shared block: stop walking down */
		if (wc->refs[level] > 1)
			return 1;

		if (path->locks[level] && !wc->keep_locks) {
			btrfs_tree_unlock_rw(eb, path->locks[level]);
			path->locks[level] = 0;
		}
		return 0;
	}

	/* wc->stage == UPDATE_BACKREF */
	if (!(wc->flags[level] & flag)) {
		/*
		 * Convert the block to full backrefs: add the full-backref
		 * references, drop the normal ones, then set the flag on the
		 * extent item.
		 */
		BUG_ON(!path->locks[level]);
		ret = btrfs_inc_ref(trans, root, eb, 1);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_dec_ref(trans, root, eb, 0);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
						  eb->len, flag,
						  btrfs_header_level(eb), 0);
		BUG_ON(ret); /* -ENOMEM */
		wc->flags[level] |= flag;
	}

	/*
	 * the block is shared by multiple trees, so it's not good to
	 * keep the tree lock
	 */
	if (path->locks[level] && level > 0) {
		btrfs_tree_unlock_rw(eb, path->locks[level]);
		path->locks[level] = 0;
	}
	return 0;
}
 8489
 8490/*
 8491 * helper to process tree block pointer.
 8492 *
 8493 * when wc->stage == DROP_REFERENCE, this function checks
 8494 * reference count of the block pointed to. if the block
 8495 * is shared and we need update back refs for the subtree
 8496 * rooted at the block, this function changes wc->stage to
 8497 * UPDATE_BACKREF. if the block is shared and there is no
 8498 * need to update back, this function drops the reference
 8499 * to the block.
 8500 *
 8501 * NOTE: return value 1 means we should stop walking down.
 8502 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int *lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 parent;
	u32 blocksize;
	struct btrfs_key key;
	struct btrfs_key first_key;
	struct extent_buffer *next;
	int level = wc->level;
	/* Set when the child buffer was not cached and had to be created */
	int reada = 0;
	int ret = 0;
	/* Set when a shared subtree is dropped and must be traced for qgroups */
	bool need_account = false;

	generation = btrfs_node_ptr_generation(path->nodes[level],
					       path->slots[level]);
	/*
	 * if the lower level block was created before the snapshot
	 * was created, we know there is no need to update back refs
	 * for the subtree
	 */
	if (wc->stage == UPDATE_BACKREF &&
	    generation <= root->root_key.offset) {
		*lookup_info = 1;
		return 1;
	}

	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
			      path->slots[level]);
	blocksize = fs_info->nodesize;

	next = find_extent_buffer(fs_info, bytenr);
	if (!next) {
		next = btrfs_find_create_tree_block(fs_info, bytenr);
		if (IS_ERR(next))
			return PTR_ERR(next);

		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
					       level - 1);
		reada = 1;
	}
	btrfs_tree_lock(next);
	btrfs_set_lock_blocking(next);

	/* Fetch refs/flags of the child while holding its lock */
	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
				       &wc->refs[level - 1],
				       &wc->flags[level - 1]);
	if (ret < 0)
		goto out_unlock;

	if (unlikely(wc->refs[level - 1] == 0)) {
		btrfs_err(fs_info, "Missing references.");
		ret = -EIO;
		goto out_unlock;
	}
	/* The child's info is now cached in wc; no lookup needed on entry */
	*lookup_info = 0;

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level - 1] > 1) {
			/* Shared child: either skip it or switch to UPDATE_BACKREF */
			need_account = true;
			if (level == 1 &&
			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				goto skip;

			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				goto skip;

			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
			if (ret < 0)
				goto skip;

			wc->stage = UPDATE_BACKREF;
			wc->shared_level = level - 1;
		}
	} else {
		if (level == 1 &&
		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
			goto skip;
	}

	/* Cached buffer is not usable: drop it and read the block from disk */
	if (!btrfs_buffer_uptodate(next, generation, 0)) {
		btrfs_tree_unlock(next);
		free_extent_buffer(next);
		next = NULL;
		*lookup_info = 1;
	}

	if (!next) {
		if (reada && level == 1)
			reada_walk_down(trans, root, wc, path);
		next = read_tree_block(fs_info, bytenr, generation, level - 1,
				       &first_key);
		if (IS_ERR(next)) {
			return PTR_ERR(next);
		} else if (!extent_buffer_uptodate(next)) {
			free_extent_buffer(next);
			return -EIO;
		}
		btrfs_tree_lock(next);
		btrfs_set_lock_blocking(next);
	}

	/* Descend: the child becomes the current block of the walk */
	level--;
	ASSERT(level == btrfs_header_level(next));
	if (level != btrfs_header_level(next)) {
		btrfs_err(root->fs_info, "mismatched level");
		ret = -EIO;
		goto out_unlock;
	}
	path->nodes[level] = next;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	wc->level = level;
	if (wc->level == 1)
		wc->reada_slot = 0;
	return 0;
skip:
	/* Not descending: forget the child's cached info */
	wc->refs[level - 1] = 0;
	wc->flags[level - 1] = 0;
	if (wc->stage == DROP_REFERENCE) {
		/* Pick the backref form used for the reference being dropped */
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			parent = path->nodes[level]->start;
		} else {
			ASSERT(root->root_key.objectid ==
			       btrfs_header_owner(path->nodes[level]));
			if (root->root_key.objectid !=
			    btrfs_header_owner(path->nodes[level])) {
				btrfs_err(root->fs_info,
						"mismatched block owner");
				ret = -EIO;
				goto out_unlock;
			}
			parent = 0;
		}

		/* A tracing failure is only logged; the drop still proceeds */
		if (need_account) {
			ret = btrfs_qgroup_trace_subtree(trans, next,
							 generation, level - 1);
			if (ret) {
				btrfs_err_rl(fs_info,
					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
					     ret);
			}
		}
		ret = btrfs_free_extent(trans, root, bytenr, blocksize,
					parent, root->root_key.objectid,
					level - 1, 0);
		if (ret)
			goto out_unlock;
	}

	/* Tell the caller to look up extent info again and stop walking down */
	*lookup_info = 1;
	ret = 1;

out_unlock:
	btrfs_tree_unlock(next);
	free_extent_buffer(next);

	return ret;
}
 8671
 8672/*
 8673 * helper to process tree block while walking up the tree.
 8674 *
 8675 * when wc->stage == DROP_REFERENCE, this function drops
 8676 * reference count on the block.
 8677 *
 8678 * when wc->stage == UPDATE_BACKREF, this function changes
 8679 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 8680 * to UPDATE_BACKREF previously while processing the block.
 8681 *
 8682 * NOTE: return value 1 means we should stop walking up.
 8683 */
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	/* Parent bytenr for a full backref, 0 for a root-keyed backref */
	u64 parent = 0;

	if (wc->stage == UPDATE_BACKREF) {
		BUG_ON(wc->shared_level < level);
		/* Still below the shared block: nothing to free on the way up */
		if (level < wc->shared_level)
			goto out;

		/* Back at the shared block: record progress and resume dropping */
		ret = find_next_key(path, level + 1, &wc->update_progress);
		if (ret > 0)
			wc->update_ref = 0;

		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
		path->slots[level] = 0;

		/*
		 * check reference count again if the block isn't locked.
		 * we should start walking down the tree again if reference
		 * count is one.
		 */
		if (!path->locks[level]) {
			BUG_ON(level == 0);
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking(eb);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						       eb->start, level, 1,
						       &wc->refs[level],
						       &wc->flags[level]);
			if (ret < 0) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return ret;
			}
			BUG_ON(wc->refs[level] == 0);
			if (wc->refs[level] == 1) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return 1;
			}
		}
	}

	/* wc->stage == DROP_REFERENCE */
	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);

	if (wc->refs[level] == 1) {
		if (level == 0) {
			/* Last reference to a leaf: drop the refs its items hold */
			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				ret = btrfs_dec_ref(trans, root, eb, 1);
			else
				ret = btrfs_dec_ref(trans, root, eb, 0);
			BUG_ON(ret); /* -ENOMEM */
			/* A tracing failure is only logged, not propagated */
			ret = btrfs_qgroup_trace_leaf_items(trans, eb);
			if (ret) {
				btrfs_err_rl(fs_info,
					     "error %d accounting leaf items. Quota is out of sync, rescan required.",
					     ret);
			}
		}
		/* make block locked assertion in clean_tree_block happy */
		if (!path->locks[level] &&
		    btrfs_header_generation(eb) == trans->transid) {
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking(eb);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
		}
		clean_tree_block(fs_info, eb);
	}

	/*
	 * Choose the backref form for freeing this block: the root node looks
	 * at its own flags, any other block at the flags of its parent level.
	 */
	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
		else
			BUG_ON(root->root_key.objectid !=
			       btrfs_header_owner(eb));
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
		else
			BUG_ON(root->root_key.objectid !=
			       btrfs_header_owner(path->nodes[level + 1]));
	}

	/* The last argument tells the helper whether this was the last reference */
	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
	return 0;
}
 8784
 8785static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 8786				   struct btrfs_root *root,
 8787				   struct btrfs_path *path,
 8788				   struct walk_control *wc)
 8789{
 8790	int level = wc->level;
 8791	int lookup_info = 1;
 8792	int ret;
 8793
 8794	while (level >= 0) {
 8795		ret = walk_down_proc(trans, root, path, wc, lookup_info);
 8796		if (ret > 0)
 8797			break;
 8798
 8799		if (level == 0)
 8800			break;
 8801
 8802		if (path->slots[level] >=
 8803		    btrfs_header_nritems(path->nodes[level]))
 8804			break;
 8805
 8806		ret = do_walk_down(trans, root, path, wc, &lookup_info);
 8807		if (ret > 0) {
 8808			path->slots[level]++;
 8809			continue;
 8810		} else if (ret < 0)
 8811			return ret;
 8812		level = wc->level;
 8813	}
 8814	return 0;
 8815}
 8816
 8817static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 8818				 struct btrfs_root *root,
 8819				 struct btrfs_path *path,
 8820				 struct walk_control *wc, int max_level)
 8821{
 8822	int level = wc->level;
 8823	int ret;
 8824
 8825	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
 8826	while (level < max_level && path->nodes[level]) {
 8827		wc->level = level;
 8828		if (path->slots[level] + 1 <
 8829		    btrfs_header_nritems(path->nodes[level])) {
 8830			path->slots[level]++;
 8831			return 0;
 8832		} else {
 8833			ret = walk_up_proc(trans, root, path, wc);
 8834			if (ret > 0)
 8835				return 0;
 8836
 8837			if (path->locks[level]) {
 8838				btrfs_tree_unlock_rw(path->nodes[level],
 8839						     path->locks[level]);
 8840				path->locks[level] = 0;
 8841			}
 8842			free_extent_buffer(path->nodes[level]);
 8843			path->nodes[level] = NULL;
 8844			level++;
 8845		}
 8846	}
 8847	return 1;
 8848}
 8849
/*
 * drop a subvolume tree.
 *
 * This function traverses the tree, freeing any blocks that are only
 * referenced by the tree.
 *
 * When a shared tree block is found, this function decreases its
 * reference count by one.  If update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN.
 *
 * Returns 0 on success or a negative errno.  Any error other than -EAGAIN
 * is also reported through btrfs_handle_fs_error().
 */
int btrfs_drop_snapshot(struct btrfs_root *root,
			 struct btrfs_block_rsv *block_rsv, int update_ref,
			 int for_reloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
	bool root_dropped = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		err = -ENOMEM;
		goto out;
	}

	trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	if (block_rsv)
		trans->block_rsv = block_rsv;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		/* no recorded progress: start from the locked root node */
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
		btrfs_set_lock_blocking(path->nodes[level]);
		path->slots[level] = 0;
		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
	} else {
		/* resume from the key and level saved in the root item */
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

		level = root_item->drop_level;
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out_end_trans;
		}
		WARN_ON(ret > 0);

		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		btrfs_unlock_up_safe(path, 0);

		/*
		 * Walk from the root level down to drop_level, loading the
		 * ref count and flags of every block on the path; only the
		 * block at drop_level stays locked.
		 */
		level = btrfs_header_level(root->node);
		while (1) {
			btrfs_tree_lock(path->nodes[level]);
			btrfs_set_lock_blocking(path->nodes[level]);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						path->nodes[level]->start,
						level, 1, &wc->refs[level],
						&wc->flags[level]);
			if (ret < 0) {
				err = ret;
				goto out_end_trans;
			}
			BUG_ON(wc->refs[level] == 0);

			if (level == root_item->drop_level)
				break;

			btrfs_tree_unlock(path->nodes[level]);
			path->locks[level] = 0;
			WARN_ON(wc->refs[level] != 1);
			level--;
		}
	}

	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = update_ref;
	wc->keep_locks = 0;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {

		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0) {
			err = ret;
			break;
		}

		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
		if (ret < 0) {
			err = ret;
			break;
		}

		/* walk_up_tree() ran past the top of the path: tree is done */
		if (ret > 0) {
			BUG_ON(wc->stage != DROP_REFERENCE);
			break;
		}

		/* remember where we are so an interrupted drop can resume */
		if (wc->stage == DROP_REFERENCE) {
			level = wc->level;
			btrfs_node_key(path->nodes[level],
				       &root_item->drop_progress,
				       path->slots[level]);
			root_item->drop_level = level;
		}

		BUG_ON(wc->level == 0);
		if (btrfs_should_end_transaction(trans) ||
		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
			/* persist the progress before ending the transaction */
			ret = btrfs_update_root(trans, tree_root,
						&root->root_key,
						root_item);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				err = ret;
				goto out_end_trans;
			}

			btrfs_end_transaction_throttle(trans);
			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				btrfs_debug(fs_info,
					    "drop snapshot early exit");
				err = -EAGAIN;
				goto out_free;
			}

			trans = btrfs_start_transaction(tree_root, 0);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto out_free;
			}
			if (block_rsv)
				trans->block_rsv = block_rsv;
		}
	}
	btrfs_release_path(path);
	if (err)
		goto out_end_trans;

	/* the whole tree has been dropped, remove its root item */
	ret = btrfs_del_root(trans, &root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		err = ret;
		goto out_end_trans;
	}

	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_find_root(tree_root, &root->root_key, path,
				      NULL, NULL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_end_trans;
		} else if (ret > 0) {
			/*
			 * If we fail to delete the orphan item this time
			 * around, it'll get picked up the next time.
			 *
			 * The most common failure here is just -ENOENT.
			 */
			btrfs_del_orphan_item(trans, tree_root,
					      root->root_key.objectid);
		}
	}

	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
		btrfs_add_dropped_root(trans, root);
	} else {
		/* not in the radix: release the root's buffers and the root */
		free_extent_buffer(root->node);
		free_extent_buffer(root->commit_root);
		btrfs_put_fs_root(root);
	}
	root_dropped = true;
out_end_trans:
	btrfs_end_transaction_throttle(trans);
out_free:
	kfree(wc);
	btrfs_free_path(path);
out:
	/*
	 * So if we need to stop dropping the snapshot for whatever reason we
	 * need to make sure to add it back to the dead root list so that we
	 * keep trying to do the work later.  This also cleans up roots if we
	 * don't have it in the radix (like when we recover after a power fail
	 * or unmount) so we don't leak memory.
	 */
	if (!for_reloc && !root_dropped)
		btrfs_add_dead_root(root);
	if (err && err != -EAGAIN)
		btrfs_handle_fs_error(fs_info, err, NULL);
	return err;
}
 9078
 9079/*
 9080 * drop subtree rooted at tree block 'node'.
 9081 *
 9082 * NOTE: this function will unlock and release tree block 'node'
 9083 * only used by relocation code
 9084 */
 9085int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 9086			struct btrfs_root *root,
 9087			struct extent_buffer *node,
 9088			struct extent_buffer *parent)
 9089{
 9090	struct btrfs_fs_info *fs_info = root->fs_info;
 9091	struct btrfs_path *path;
 9092	struct walk_control *wc;
 9093	int level;
 9094	int parent_level;
 9095	int ret = 0;
 9096	int wret;
 9097
 9098	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
 9099
 9100	path = btrfs_alloc_path();
 9101	if (!path)
 9102		return -ENOMEM;
 9103
 9104	wc = kzalloc(sizeof(*wc), GFP_NOFS);
 9105	if (!wc) {
 9106		btrfs_free_path(path);
 9107		return -ENOMEM;
 9108	}
 9109
 9110	btrfs_assert_tree_locked(parent);
 9111	parent_level = btrfs_header_level(parent);
 9112	extent_buffer_get(parent);
 9113	path->nodes[parent_level] = parent;
 9114	path->slots[parent_level] = btrfs_header_nritems(parent);
 9115
 9116	btrfs_assert_tree_locked(node);
 9117	level = btrfs_header_level(node);
 9118	path->nodes[level] = node;
 9119	path->slots[level] = 0;
 9120	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 9121
 9122	wc->refs[parent_level] = 1;
 9123	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 9124	wc->level = level;
 9125	wc->shared_level = -1;
 9126	wc->stage = DROP_REFERENCE;
 9127	wc->update_ref = 0;
 9128	wc->keep_locks = 1;
 9129	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
 9130
 9131	while (1) {
 9132		wret = walk_down_tree(trans, root, path, wc);
 9133		if (wret < 0) {
 9134			ret = wret;
 9135			break;
 9136		}
 9137
 9138		wret = walk_up_tree(trans, root, path, wc, parent_level);
 9139		if (wret < 0)
 9140			ret = wret;
 9141		if (wret != 0)
 9142			break;
 9143	}
 9144
 9145	kfree(wc);
 9146	btrfs_free_path(path);
 9147	return ret;
 9148}
 9149
 9150static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
 9151{
 9152	u64 num_devices;
 9153	u64 stripped;
 9154
 9155	/*
 9156	 * if restripe for this chunk_type is on pick target profile and
 9157	 * return, otherwise do the usual balance
 9158	 */
 9159	stripped = get_restripe_target(fs_info, flags);
 9160	if (stripped)
 9161		return extended_to_chunk(stripped);
 9162
 9163	num_devices = fs_info->fs_devices->rw_devices;
 9164
 9165	stripped = BTRFS_BLOCK_GROUP_RAID0 |
 9166		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
 9167		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 9168
 9169	if (num_devices == 1) {
 9170		stripped |= BTRFS_BLOCK_GROUP_DUP;
 9171		stripped = flags & ~stripped;
 9172
 9173		/* turn raid0 into single device chunks */
 9174		if (flags & BTRFS_BLOCK_GROUP_RAID0)
 9175			return stripped;
 9176
 9177		/* turn mirroring into duplication */
 9178		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
 9179			     BTRFS_BLOCK_GROUP_RAID10))
 9180			return stripped | BTRFS_BLOCK_GROUP_DUP;
 9181	} else {
 9182		/* they already had raid on here, just return */
 9183		if (flags & stripped)
 9184			return flags;
 9185
 9186		stripped |= BTRFS_BLOCK_GROUP_DUP;
 9187		stripped = flags & ~stripped;
 9188
 9189		/* switch duplicated blocks with raid1 */
 9190		if (flags & BTRFS_BLOCK_GROUP_DUP)
 9191			return stripped | BTRFS_BLOCK_GROUP_RAID1;
 9192
 9193		/* this is drive concat, leave it alone */
 9194	}
 9195
 9196	return flags;
 9197}
 9198
 9199static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 9200{
 9201	struct btrfs_space_info *sinfo = cache->space_info;
 9202	u64 num_bytes;
 9203	u64 min_allocable_bytes;
 9204	int ret = -ENOSPC;
 9205
 9206	/*
 9207	 * We need some metadata space and system metadata space for
 9208	 * allocating chunks in some corner cases until we force to set
 9209	 * it to be readonly.
 9210	 */
 9211	if ((sinfo->flags &
 9212	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
 9213	    !force)
 9214		min_allocable_bytes = SZ_1M;
 9215	else
 9216		min_allocable_bytes = 0;
 9217
 9218	spin_lock(&sinfo->lock);
 9219	spin_lock(&cache->lock);
 9220
 9221	if (cache->ro) {
 9222		cache->ro++;
 9223		ret = 0;
 9224		goto out;
 9225	}
 9226
 9227	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
 9228		    cache->bytes_super - btrfs_block_group_used(&cache->item);
 9229
 9230	if (btrfs_space_info_used(sinfo, true) + num_bytes +
 9231	    min_allocable_bytes <= sinfo->total_bytes) {
 9232		sinfo->bytes_readonly += num_bytes;
 9233		cache->ro++;
 9234		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
 9235		ret = 0;
 9236	}
 9237out:
 9238	spin_unlock(&cache->lock);
 9239	spin_unlock(&sinfo->lock);
 9240	return ret;
 9241}
 9242
 9243int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
 9244
 9245{
 9246	struct btrfs_fs_info *fs_info = cache->fs_info;
 9247	struct btrfs_trans_handle *trans;
 9248	u64 alloc_flags;
 9249	int ret;
 9250
 9251again:
 9252	trans = btrfs_join_transaction(fs_info->extent_root);
 9253	if (IS_ERR(trans))
 9254		return PTR_ERR(trans);
 9255
 9256	/*
 9257	 * we're not allowed to set block groups readonly after the dirty
 9258	 * block groups cache has started writing.  If it already started,
 9259	 * back off and let this transaction commit
 9260	 */
 9261	mutex_lock(&fs_info->ro_block_group_mutex);
 9262	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
 9263		u64 transid = trans->transid;
 9264
 9265		mutex_unlock(&fs_info->ro_block_group_mutex);
 9266		btrfs_end_transaction(trans);
 9267
 9268		ret = btrfs_wait_for_commit(fs_info, transid);
 9269		if (ret)
 9270			return ret;
 9271		goto again;
 9272	}
 9273
 9274	/*
 9275	 * if we are changing raid levels, try to allocate a corresponding
 9276	 * block group with the new raid level.
 9277	 */
 9278	alloc_flags = update_block_group_flags(fs_info, cache->flags);
 9279	if (alloc_flags != cache->flags) {
 9280		ret = do_chunk_alloc(trans, alloc_flags,
 9281				     CHUNK_ALLOC_FORCE);
 9282		/*
 9283		 * ENOSPC is allowed here, we may have enough space
 9284		 * already allocated at the new raid level to
 9285		 * carry on
 9286		 */
 9287		if (ret == -ENOSPC)
 9288			ret = 0;
 9289		if (ret < 0)
 9290			goto out;
 9291	}
 9292
 9293	ret = inc_block_group_ro(cache, 0);
 9294	if (!ret)
 9295		goto out;
 9296	alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
 9297	ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
 9298	if (ret < 0)
 9299		goto out;
 9300	ret = inc_block_group_ro(cache, 0);
 9301out:
 9302	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
 9303		alloc_flags = update_block_group_flags(fs_info, cache->flags);
 9304		mutex_lock(&fs_info->chunk_mutex);
 9305		check_system_chunk(trans, alloc_flags);
 9306		mutex_unlock(&fs_info->chunk_mutex);
 9307	}
 9308	mutex_unlock(&fs_info->ro_block_group_mutex);
 9309
 9310	btrfs_end_transaction(trans);
 9311	return ret;
 9312}
 9313
 9314int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
 9315{
 9316	u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
 9317
 9318	return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
 9319}
 9320
 9321/*
 9322 * helper to account the unused space of all the readonly block group in the
 9323 * space_info. takes mirrors into account.
 9324 */
 9325u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
 9326{
 9327	struct btrfs_block_group_cache *block_group;
 9328	u64 free_bytes = 0;
 9329	int factor;
 9330
 9331	/* It's df, we don't care if it's racy */
 9332	if (list_empty(&sinfo->ro_bgs))
 9333		return 0;
 9334
 9335	spin_lock(&sinfo->lock);
 9336	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
 9337		spin_lock(&block_group->lock);
 9338
 9339		if (!block_group->ro) {
 9340			spin_unlock(&block_group->lock);
 9341			continue;
 9342		}
 9343
 9344		factor = btrfs_bg_type_to_factor(block_group->flags);
 9345		free_bytes += (block_group->key.offset -
 9346			       btrfs_block_group_used(&block_group->item)) *
 9347			       factor;
 9348
 9349		spin_unlock(&block_group->lock);
 9350	}
 9351	spin_unlock(&sinfo->lock);
 9352
 9353	return free_bytes;
 9354}
 9355
 9356void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
 9357{
 9358	struct btrfs_space_info *sinfo = cache->space_info;
 9359	u64 num_bytes;
 9360
 9361	BUG_ON(!cache->ro);
 9362
 9363	spin_lock(&sinfo->lock);
 9364	spin_lock(&cache->lock);
 9365	if (!--cache->ro) {
 9366		num_bytes = cache->key.offset - cache->reserved -
 9367			    cache->pinned - cache->bytes_super -
 9368			    btrfs_block_group_used(&cache->item);
 9369		sinfo->bytes_readonly -= num_bytes;
 9370		list_del_init(&cache->ro_list);
 9371	}
 9372	spin_unlock(&cache->lock);
 9373	spin_unlock(&sinfo->lock);
 9374}
 9375
/*
 * Checks to see if it's even possible to relocate the block group that
 * contains bytenr.
 *
 * Returns 0 if it's ok to go ahead and try, -1 if it's not a good idea to
 * relocate this block group (including when the block group cannot be
 * found), or the negative errno from btrfs_join_transaction() if joining
 * a transaction fails.
 */
int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	struct btrfs_trans_handle *trans;
	u64 min_free;
	u64 dev_min = 1;
	u64 dev_nr = 0;
	u64 target;
	int debug;
	int index;
	int full = 0;
	int ret = 0;

	debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);

	block_group = btrfs_lookup_block_group(fs_info, bytenr);

	/* odd, couldn't find the block group, leave it alone */
	if (!block_group) {
		if (debug)
			btrfs_warn(fs_info,
				   "can't find block group for bytenr %llu",
				   bytenr);
		return -1;
	}

	/* bytes that would have to find a new home */
	min_free = btrfs_block_group_used(&block_group->item);

	/* no bytes used, we're good */
	if (!min_free)
		goto out;

	space_info = block_group->space_info;
	spin_lock(&space_info->lock);

	full = space_info->full;

	/*
	 * if this is the last block group we have in this space, we can't
	 * relocate it unless we're able to allocate a new chunk below.
	 *
	 * Otherwise, we need to make sure we have room in the space to handle
	 * all of the extents from this block group.  If we can, we're good
	 */
	if ((space_info->total_bytes != block_group->key.offset) &&
	    (btrfs_space_info_used(space_info, false) + min_free <
	     space_info->total_bytes)) {
		spin_unlock(&space_info->lock);
		goto out;
	}
	spin_unlock(&space_info->lock);

	/*
	 * ok we don't have enough space, but maybe we have free space on our
	 * devices to allocate new chunks for relocation, so loop through our
	 * alloc devices and guess if we have enough space.  if this block
	 * group is going to be restriped, run checks against the target
	 * profile instead of the current one.
	 */
	ret = -1;

	/*
	 * index:
	 *      0: raid10
	 *      1: raid1
	 *      2: dup
	 *      3: raid0
	 *      4: single
	 */
	target = get_restripe_target(fs_info, block_group->flags);
	if (target) {
		index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
	} else {
		/*
		 * this is just a balance, so if we were marked as full
		 * we know there is no space for a new chunk
		 */
		if (full) {
			if (debug)
				btrfs_warn(fs_info,
					   "no space to alloc new chunk for block group %llu",
					   block_group->key.objectid);
			goto out;
		}

		index = btrfs_bg_flags_to_raid_index(block_group->flags);
	}

	/*
	 * Per profile: the number of devices that must have room (dev_min)
	 * and the free extent size needed on each of them (min_free).
	 */
	if (index == BTRFS_RAID_RAID10) {
		dev_min = 4;
		/* Divide by 2 */
		min_free >>= 1;
	} else if (index == BTRFS_RAID_RAID1) {
		dev_min = 2;
	} else if (index == BTRFS_RAID_DUP) {
		/* Multiply by 2 */
		min_free <<= 1;
	} else if (index == BTRFS_RAID_RAID0) {
		dev_min = fs_devices->rw_devices;
		min_free = div64_u64(min_free, dev_min);
	}

	/* We need to do this so that we can look at pending chunks */
	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		u64 dev_offset;

		/*
		 * check to make sure we can actually find a chunk with enough
		 * space to fit our block group in.
		 */
		if (device->total_bytes > device->bytes_used + min_free &&
		    !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
			ret = find_free_dev_extent(trans, device, min_free,
						   &dev_offset, NULL);
			if (!ret)
				dev_nr++;

			/* enough devices found; ret is 0 at this point */
			if (dev_nr >= dev_min)
				break;

			/* not enough devices yet, keep the verdict negative */
			ret = -1;
		}
	}
	if (debug && ret == -1)
		btrfs_warn(fs_info,
			   "no space to allocate a new chunk for block group %llu",
			   block_group->key.objectid);
	mutex_unlock(&fs_info->chunk_mutex);
	btrfs_end_transaction(trans);
out:
	btrfs_put_block_group(block_group);
	return ret;
}
 9526
 9527static int find_first_block_group(struct btrfs_fs_info *fs_info,
 9528				  struct btrfs_path *path,
 9529				  struct btrfs_key *key)
 9530{
 9531	struct btrfs_root *root = fs_info->extent_root;
 9532	int ret = 0;
 9533	struct btrfs_key found_key;
 9534	struct extent_buffer *leaf;
 9535	struct btrfs_block_group_item bg;
 9536	u64 flags;
 9537	int slot;
 9538
 9539	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 9540	if (ret < 0)
 9541		goto out;
 9542
 9543	while (1) {
 9544		slot = path->slots[0];
 9545		leaf = path->nodes[0];
 9546		if (slot >= btrfs_header_nritems(leaf)) {
 9547			ret = btrfs_next_leaf(root, path);
 9548			if (ret == 0)
 9549				continue;
 9550			if (ret < 0)
 9551				goto out;
 9552			break;
 9553		}
 9554		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 9555
 9556		if (found_key.objectid >= key->objectid &&
 9557		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
 9558			struct extent_map_tree *em_tree;
 9559			struct extent_map *em;
 9560
 9561			em_tree = &root->fs_info->mapping_tree.map_tree;
 9562			read_lock(&em_tree->lock);
 9563			em = lookup_extent_mapping(em_tree, found_key.objectid,
 9564						   found_key.offset);
 9565			read_unlock(&em_tree->lock);
 9566			if (!em) {
 9567				btrfs_err(fs_info,
 9568			"logical %llu len %llu found bg but no related chunk",
 9569					  found_key.objectid, found_key.offset);
 9570				ret = -ENOENT;
 9571			} else if (em->start != found_key.objectid ||
 9572				   em->len != found_key.offset) {
 9573				btrfs_err(fs_info,
 9574		"block group %llu len %llu mismatch with chunk %llu len %llu",
 9575					  found_key.objectid, found_key.offset,
 9576					  em->start, em->len);
 9577				ret = -EUCLEAN;
 9578			} else {
 9579				read_extent_buffer(leaf, &bg,
 9580					btrfs_item_ptr_offset(leaf, slot),
 9581					sizeof(bg));
 9582				flags = btrfs_block_group_flags(&bg) &
 9583					BTRFS_BLOCK_GROUP_TYPE_MASK;
 9584
 9585				if (flags != (em->map_lookup->type &
 9586					      BTRFS_BLOCK_GROUP_TYPE_MASK)) {
 9587					btrfs_err(fs_info,
 9588"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
 9589						found_key.objectid,
 9590						found_key.offset, flags,
 9591						(BTRFS_BLOCK_GROUP_TYPE_MASK &
 9592						 em->map_lookup->type));
 9593					ret = -EUCLEAN;
 9594				} else {
 9595					ret = 0;
 9596				}
 9597			}
 9598			free_extent_map(em);
 9599			goto out;
 9600		}
 9601		path->slots[0]++;
 9602	}
 9603out:
 9604	return ret;
 9605}
 9606
 9607void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
 9608{
 9609	struct btrfs_block_group_cache *block_group;
 9610	u64 last = 0;
 9611
 9612	while (1) {
 9613		struct inode *inode;
 9614
 9615		block_group = btrfs_lookup_first_block_group(info, last);
 9616		while (block_group) {
 9617			spin_lock(&block_group->lock);
 9618			if (block_group->iref)
 9619				break;
 9620			spin_unlock(&block_group->lock);
 9621			block_group = next_block_group(info, block_group);
 9622		}
 9623		if (!block_group) {
 9624			if (last == 0)
 9625				break;
 9626			last = 0;
 9627			continue;
 9628		}
 9629
 9630		inode = block_group->inode;
 9631		block_group->iref = 0;
 9632		block_group->inode = NULL;
 9633		spin_unlock(&block_group->lock);
 9634		ASSERT(block_group->io_ctl.inode == NULL);
 9635		iput(inode);
 9636		last = block_group->key.objectid + block_group->key.offset;
 9637		btrfs_put_block_group(block_group);
 9638	}
 9639}
 9640
/*
 * Free every block group and space_info of the filesystem.
 *
 * Must be called only after stopping all workers, since we could have block
 * group caching kthreads running, and therefore they could race with us if we
 * freed the block groups before stopping them.
 *
 * Always returns 0.
 */
int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_caching_control *caching_ctl;
	struct rb_node *n;

	/* drop the list's reference on every pending caching control */
	down_write(&info->commit_root_sem);
	while (!list_empty(&info->caching_block_groups)) {
		caching_ctl = list_entry(info->caching_block_groups.next,
					 struct btrfs_caching_control, list);
		list_del(&caching_ctl->list);
		put_caching_control(caching_ctl);
	}
	up_write(&info->commit_root_sem);

	/* empty the unused block group list, dropping its references */
	spin_lock(&info->unused_bgs_lock);
	while (!list_empty(&info->unused_bgs)) {
		block_group = list_first_entry(&info->unused_bgs,
					       struct btrfs_block_group_cache,
					       bg_list);
		list_del_init(&block_group->bg_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&info->unused_bgs_lock);

	/*
	 * Tear down the block group rb-tree from the last node backwards.
	 * block_group_cache_lock is dropped while each group is cleaned up
	 * and re-taken before looking at the tree again.
	 */
	spin_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
		block_group = rb_entry(n, struct btrfs_block_group_cache,
				       cache_node);
		rb_erase(&block_group->cache_node,
			 &info->block_group_cache_tree);
		RB_CLEAR_NODE(&block_group->cache_node);
		spin_unlock(&info->block_group_cache_lock);

		down_write(&block_group->space_info->groups_sem);
		list_del(&block_group->list);
		up_write(&block_group->space_info->groups_sem);

		/*
		 * We haven't cached this block group, which means we could
		 * possibly have excluded extents on this block group.
		 */
		if (block_group->cached == BTRFS_CACHE_NO ||
		    block_group->cached == BTRFS_CACHE_ERROR)
			free_excluded_extents(block_group);

		btrfs_remove_free_space_cache(block_group);
		/* by now this must be the only remaining user of the group */
		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
		ASSERT(list_empty(&block_group->dirty_list));
		ASSERT(list_empty(&block_group->io_list));
		ASSERT(list_empty(&block_group->bg_list));
		ASSERT(atomic_read(&block_group->count) == 1);
		btrfs_put_block_group(block_group);

		spin_lock(&info->block_group_cache_lock);
	}
	spin_unlock(&info->block_group_cache_lock);

	/*
	 * Now that all the block groups are freed, go through and
	 * free all the space_info structs.  This is only called during
	 * the final stages of unmount, and so we know nobody is
	 * using them.  We call synchronize_rcu() once before we start,
	 * just to be on the safe side.
	 */
	synchronize_rcu();

	release_global_block_rsv(info);

	while (!list_empty(&info->space_info)) {
		int i;

		space_info = list_entry(info->space_info.next,
					struct btrfs_space_info,
					list);

		/*
		 * Do not hide this behind enospc_debug, this is actually
		 * important and indicates a real bug if this happens.
		 */
		if (WARN_ON(space_info->bytes_pinned > 0 ||
			    space_info->bytes_reserved > 0 ||
			    space_info->bytes_may_use > 0))
			dump_space_info(info, space_info, 0, 0);
		list_del(&space_info->list);
		/* remove the per-raid-type kobjects before the space_info's */
		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
			struct kobject *kobj;
			kobj = space_info->block_group_kobjs[i];
			space_info->block_group_kobjs[i] = NULL;
			if (kobj) {
				kobject_del(kobj);
				kobject_put(kobj);
			}
		}
		kobject_del(&space_info->kobj);
		kobject_put(&space_info->kobj);
	}
	return 0;
}
 9745
 9746/* link_block_group will queue up kobjects to add when we're reclaim-safe */
 9747void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
 9748{
 9749	struct btrfs_space_info *space_info;
 9750	struct raid_kobject *rkobj;
 9751	LIST_HEAD(list);
 9752	int index;
 9753	int ret = 0;
 9754
 9755	spin_lock(&fs_info->pending_raid_kobjs_lock);
 9756	list_splice_init(&fs_info->pending_raid_kobjs, &list);
 9757	spin_unlock(&fs_info->pending_raid_kobjs_lock);
 9758
 9759	list_for_each_entry(rkobj, &list, list) {
 9760		space_info = __find_space_info(fs_info, rkobj->flags);
 9761		index = btrfs_bg_flags_to_raid_index(rkobj->flags);
 9762
 9763		ret = kobject_add(&rkobj->kobj, &space_info->kobj,
 9764				  "%s", get_raid_name(index));
 9765		if (ret) {
 9766			kobject_put(&rkobj->kobj);
 9767			break;
 9768		}
 9769	}
 9770	if (ret)
 9771		btrfs_warn(fs_info,
 9772			   "failed to add kobject for block cache, ignoring");
 9773}
 9774
 9775static void link_block_group(struct btrfs_block_group_cache *cache)
 9776{
 9777	struct btrfs_space_info *space_info = cache->space_info;
 9778	struct btrfs_fs_info *fs_info = cache->fs_info;
 9779	int index = btrfs_bg_flags_to_raid_index(cache->flags);
 9780	bool first = false;
 9781
 9782	down_write(&space_info->groups_sem);
 9783	if (list_empty(&space_info->block_groups[index]))
 9784		first = true;
 9785	list_add_tail(&cache->list, &space_info->block_groups[index]);
 9786	up_write(&space_info->groups_sem);
 9787
 9788	if (first) {
 9789		struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
 9790		if (!rkobj) {
 9791			btrfs_warn(cache->fs_info,
 9792				"couldn't alloc memory for raid level kobject");
 9793			return;
 9794		}
 9795		rkobj->flags = cache->flags;
 9796		kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
 9797
 9798		spin_lock(&fs_info->pending_raid_kobjs_lock);
 9799		list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
 9800		spin_unlock(&fs_info->pending_raid_kobjs_lock);
 9801		space_info->block_group_kobjs[index] = &rkobj->kobj;
 9802	}
 9803}
 9804
 9805static struct btrfs_block_group_cache *
 9806btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
 9807			       u64 start, u64 size)
 9808{
 9809	struct btrfs_block_group_cache *cache;
 9810
 9811	cache = kzalloc(sizeof(*cache), GFP_NOFS);
 9812	if (!cache)
 9813		return NULL;
 9814
 9815	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
 9816					GFP_NOFS);
 9817	if (!cache->free_space_ctl) {
 9818		kfree(cache);
 9819		return NULL;
 9820	}
 9821
 9822	cache->key.objectid = start;
 9823	cache->key.offset = size;
 9824	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 9825
 9826	cache->fs_info = fs_info;
 9827	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
 9828	set_free_space_tree_thresholds(cache);
 9829
 9830	atomic_set(&cache->count, 1);
 9831	spin_lock_init(&cache->lock);
 9832	init_rwsem(&cache->data_rwsem);
 9833	INIT_LIST_HEAD(&cache->list);
 9834	INIT_LIST_HEAD(&cache->cluster_list);
 9835	INIT_LIST_HEAD(&cache->bg_list);
 9836	INIT_LIST_HEAD(&cache->ro_list);
 9837	INIT_LIST_HEAD(&cache->dirty_list);
 9838	INIT_LIST_HEAD(&cache->io_list);
 9839	btrfs_init_free_space_ctl(cache);
 9840	atomic_set(&cache->trimming, 0);
 9841	mutex_init(&cache->free_space_lock);
 9842	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
 9843
 9844	return cache;
 9845}
 9846
 9847
 9848/*
 9849 * Iterate all chunks and verify that each of them has the corresponding block
 9850 * group
 9851 */
 9852static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
 9853{
 9854	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 9855	struct extent_map *em;
 9856	struct btrfs_block_group_cache *bg;
 9857	u64 start = 0;
 9858	int ret = 0;
 9859
 9860	while (1) {
 9861		read_lock(&map_tree->map_tree.lock);
 9862		/*
 9863		 * lookup_extent_mapping will return the first extent map
 9864		 * intersecting the range, so setting @len to 1 is enough to
 9865		 * get the first chunk.
 9866		 */
 9867		em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
 9868		read_unlock(&map_tree->map_tree.lock);
 9869		if (!em)
 9870			break;
 9871
 9872		bg = btrfs_lookup_block_group(fs_info, em->start);
 9873		if (!bg) {
 9874			btrfs_err(fs_info,
 9875	"chunk start=%llu len=%llu doesn't have corresponding block group",
 9876				     em->start, em->len);
 9877			ret = -EUCLEAN;
 9878			free_extent_map(em);
 9879			break;
 9880		}
 9881		if (bg->key.objectid != em->start ||
 9882		    bg->key.offset != em->len ||
 9883		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
 9884		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
 9885			btrfs_err(fs_info,
 9886"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
 9887				em->start, em->len,
 9888				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
 9889				bg->key.objectid, bg->key.offset,
 9890				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
 9891			ret = -EUCLEAN;
 9892			free_extent_map(em);
 9893			btrfs_put_block_group(bg);
 9894			break;
 9895		}
 9896		start = em->start + em->len;
 9897		free_extent_map(em);
 9898		btrfs_put_block_group(bg);
 9899	}
 9900	return ret;
 9901}
 9902
 9903int btrfs_read_block_groups(struct btrfs_fs_info *info)
 9904{
 9905	struct btrfs_path *path;
 9906	int ret;
 9907	struct btrfs_block_group_cache *cache;
 9908	struct btrfs_space_info *space_info;
 9909	struct btrfs_key key;
 9910	struct btrfs_key found_key;
 9911	struct extent_buffer *leaf;
 9912	int need_clear = 0;
 9913	u64 cache_gen;
 9914	u64 feature;
 9915	int mixed;
 9916
 9917	feature = btrfs_super_incompat_flags(info->super_copy);
 9918	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
 9919
 9920	key.objectid = 0;
 9921	key.offset = 0;
 9922	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 9923	path = btrfs_alloc_path();
 9924	if (!path)
 9925		return -ENOMEM;
 9926	path->reada = READA_FORWARD;
 9927
 9928	cache_gen = btrfs_super_cache_generation(info->super_copy);
 9929	if (btrfs_test_opt(info, SPACE_CACHE) &&
 9930	    btrfs_super_generation(info->super_copy) != cache_gen)
 9931		need_clear = 1;
 9932	if (btrfs_test_opt(info, CLEAR_CACHE))
 9933		need_clear = 1;
 9934
 9935	while (1) {
 9936		ret = find_first_block_group(info, path, &key);
 9937		if (ret > 0)
 9938			break;
 9939		if (ret != 0)
 9940			goto error;
 9941
 9942		leaf = path->nodes[0];
 9943		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 9944
 9945		cache = btrfs_create_block_group_cache(info, found_key.objectid,
 9946						       found_key.offset);
 9947		if (!cache) {
 9948			ret = -ENOMEM;
 9949			goto error;
 9950		}
 9951
 9952		if (need_clear) {
 9953			/*
 9954			 * When we mount with old space cache, we need to
 9955			 * set BTRFS_DC_CLEAR and set dirty flag.
 9956			 *
 9957			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
 9958			 *    truncate the old free space cache inode and
 9959			 *    setup a new one.
 9960			 * b) Setting 'dirty flag' makes sure that we flush
 9961			 *    the new space cache info onto disk.
 9962			 */
 9963			if (btrfs_test_opt(info, SPACE_CACHE))
 9964				cache->disk_cache_state = BTRFS_DC_CLEAR;
 9965		}
 9966
 9967		read_extent_buffer(leaf, &cache->item,
 9968				   btrfs_item_ptr_offset(leaf, path->slots[0]),
 9969				   sizeof(cache->item));
 9970		cache->flags = btrfs_block_group_flags(&cache->item);
 9971		if (!mixed &&
 9972		    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
 9973		    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
 9974			btrfs_err(info,
 9975"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
 9976				  cache->key.objectid);
 9977			ret = -EINVAL;
 9978			goto error;
 9979		}
 9980
 9981		key.objectid = found_key.objectid + found_key.offset;
 9982		btrfs_release_path(path);
 9983
 9984		/*
 9985		 * We need to exclude the super stripes now so that the space
 9986		 * info has super bytes accounted for, otherwise we'll think
 9987		 * we have more space than we actually do.
 9988		 */
 9989		ret = exclude_super_stripes(cache);
 9990		if (ret) {
 9991			/*
 9992			 * We may have excluded something, so call this just in
 9993			 * case.
 9994			 */
 9995			free_excluded_extents(cache);
 9996			btrfs_put_block_group(cache);
 9997			goto error;
 9998		}
 9999
10000		/*
10001		 * check for two cases, either we are full, and therefore
10002		 * don't need to bother with the caching work since we won't
10003		 * find any space, or we are empty, and we can just add all
10004		 * the space in and be done with it.  This saves us _alot_ of
10005		 * time, particularly in the full case.
10006		 */
10007		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10008			cache->last_byte_to_unpin = (u64)-1;
10009			cache->cached = BTRFS_CACHE_FINISHED;
10010			free_excluded_extents(cache);
10011		} else if (btrfs_block_group_used(&cache->item) == 0) {
10012			cache->last_byte_to_unpin = (u64)-1;
10013			cache->cached = BTRFS_CACHE_FINISHED;
10014			add_new_free_space(cache, found_key.objectid,
10015					   found_key.objectid +
10016					   found_key.offset);
10017			free_excluded_extents(cache);
10018		}
10019
10020		ret = btrfs_add_block_group_cache(info, cache);
10021		if (ret) {
10022			btrfs_remove_free_space_cache(cache);
10023			btrfs_put_block_group(cache);
10024			goto error;
10025		}
10026
10027		trace_btrfs_add_block_group(info, cache, 0);
10028		update_space_info(info, cache->flags, found_key.offset,
10029				  btrfs_block_group_used(&cache->item),
10030				  cache->bytes_super, &space_info);
10031
10032		cache->space_info = space_info;
10033
10034		link_block_group(cache);
10035
10036		set_avail_alloc_bits(info, cache->flags);
10037		if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10038			inc_block_group_ro(cache, 1);
10039		} else if (btrfs_block_group_used(&cache->item) == 0) {
10040			ASSERT(list_empty(&cache->bg_list));
10041			btrfs_mark_bg_unused(cache);
10042		}
10043	}
10044
10045	list_for_each_entry_rcu(space_info, &info->space_info, list) {
10046		if (!(get_alloc_profile(info, space_info->flags) &
10047		      (BTRFS_BLOCK_GROUP_RAID10 |
10048		       BTRFS_BLOCK_GROUP_RAID1 |
10049		       BTRFS_BLOCK_GROUP_RAID5 |
10050		       BTRFS_BLOCK_GROUP_RAID6 |
10051		       BTRFS_BLOCK_GROUP_DUP)))
10052			continue;
10053		/*
10054		 * avoid allocating from un-mirrored block group if there are
10055		 * mirrored block groups.
10056		 */
10057		list_for_each_entry(cache,
10058				&space_info->block_groups[BTRFS_RAID_RAID0],
10059				list)
10060			inc_block_group_ro(cache, 1);
10061		list_for_each_entry(cache,
10062				&space_info->block_groups[BTRFS_RAID_SINGLE],
10063				list)
10064			inc_block_group_ro(cache, 1);
10065	}
10066
10067	btrfs_add_raid_kobjects(info);
10068	init_global_block_rsv(info);
10069	ret = check_chunk_block_group_mappings(info);
10070error:
10071	btrfs_free_path(path);
10072	return ret;
10073}
10074
10075void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10076{
10077	struct btrfs_fs_info *fs_info = trans->fs_info;
10078	struct btrfs_block_group_cache *block_group, *tmp;
10079	struct btrfs_root *extent_root = fs_info->extent_root;
10080	struct btrfs_block_group_item item;
10081	struct btrfs_key key;
10082	int ret = 0;
10083	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
10084
10085	trans->can_flush_pending_bgs = false;
10086	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
10087		if (ret)
10088			goto next;
10089
10090		spin_lock(&block_group->lock);
10091		memcpy(&item, &block_group->item, sizeof(item));
10092		memcpy(&key, &block_group->key, sizeof(key));
10093		spin_unlock(&block_group->lock);
10094
10095		ret = btrfs_insert_item(trans, extent_root, &key, &item,
10096					sizeof(item));
10097		if (ret)
10098			btrfs_abort_transaction(trans, ret);
10099		ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
10100		if (ret)
10101			btrfs_abort_transaction(trans, ret);
10102		add_block_group_free_space(trans, block_group);
10103		/* already aborted the transaction if it failed. */
10104next:
10105		list_del_init(&block_group->bg_list);
10106	}
10107	trans->can_flush_pending_bgs = can_flush_pending_bgs;
10108}
10109
10110int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10111			   u64 type, u64 chunk_offset, u64 size)
10112{
10113	struct btrfs_fs_info *fs_info = trans->fs_info;
10114	struct btrfs_block_group_cache *cache;
10115	int ret;
10116
10117	btrfs_set_log_full_commit(fs_info, trans);
10118
10119	cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10120	if (!cache)
10121		return -ENOMEM;
10122
10123	btrfs_set_block_group_used(&cache->item, bytes_used);
10124	btrfs_set_block_group_chunk_objectid(&cache->item,
10125					     BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10126	btrfs_set_block_group_flags(&cache->item, type);
10127
10128	cache->flags = type;
10129	cache->last_byte_to_unpin = (u64)-1;
10130	cache->cached = BTRFS_CACHE_FINISHED;
10131	cache->needs_free_space = 1;
10132	ret = exclude_super_stripes(cache);
10133	if (ret) {
10134		/*
10135		 * We may have excluded something, so call this just in
10136		 * case.
10137		 */
10138		free_excluded_extents(cache);
10139		btrfs_put_block_group(cache);
10140		return ret;
10141	}
10142
10143	add_new_free_space(cache, chunk_offset, chunk_offset + size);
10144
10145	free_excluded_extents(cache);
10146
10147#ifdef CONFIG_BTRFS_DEBUG
10148	if (btrfs_should_fragment_free_space(cache)) {
10149		u64 new_bytes_used = size - bytes_used;
10150
10151		bytes_used += new_bytes_used >> 1;
10152		fragment_free_space(cache);
10153	}
10154#endif
10155	/*
10156	 * Ensure the corresponding space_info object is created and
10157	 * assigned to our block group. We want our bg to be added to the rbtree
10158	 * with its ->space_info set.
10159	 */
10160	cache->space_info = __find_space_info(fs_info, cache->flags);
10161	ASSERT(cache->space_info);
10162
10163	ret = btrfs_add_block_group_cache(fs_info, cache);
10164	if (ret) {
10165		btrfs_remove_free_space_cache(cache);
10166		btrfs_put_block_group(cache);
10167		return ret;
10168	}
10169
10170	/*
10171	 * Now that our block group has its ->space_info set and is inserted in
10172	 * the rbtree, update the space info's counters.
10173	 */
10174	trace_btrfs_add_block_group(fs_info, cache, 1);
10175	update_space_info(fs_info, cache->flags, size, bytes_used,
10176				cache->bytes_super, &cache->space_info);
10177	update_global_block_rsv(fs_info);
10178
10179	link_block_group(cache);
10180
10181	list_add_tail(&cache->bg_list, &trans->new_bgs);
10182
10183	set_avail_alloc_bits(fs_info, type);
10184	return 0;
10185}
10186
10187static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10188{
10189	u64 extra_flags = chunk_to_extended(flags) &
10190				BTRFS_EXTENDED_PROFILE_MASK;
10191
10192	write_seqlock(&fs_info->profiles_lock);
10193	if (flags & BTRFS_BLOCK_GROUP_DATA)
10194		fs_info->avail_data_alloc_bits &= ~extra_flags;
10195	if (flags & BTRFS_BLOCK_GROUP_METADATA)
10196		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10197	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10198		fs_info->avail_system_alloc_bits &= ~extra_flags;
10199	write_sequnlock(&fs_info->profiles_lock);
10200}
10201
10202int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10203			     u64 group_start, struct extent_map *em)
10204{
10205	struct btrfs_fs_info *fs_info = trans->fs_info;
10206	struct btrfs_root *root = fs_info->extent_root;
10207	struct btrfs_path *path;
10208	struct btrfs_block_group_cache *block_group;
10209	struct btrfs_free_cluster *cluster;
10210	struct btrfs_root *tree_root = fs_info->tree_root;
10211	struct btrfs_key key;
10212	struct inode *inode;
10213	struct kobject *kobj = NULL;
10214	int ret;
10215	int index;
10216	int factor;
10217	struct btrfs_caching_control *caching_ctl = NULL;
10218	bool remove_em;
10219
10220	block_group = btrfs_lookup_block_group(fs_info, group_start);
10221	BUG_ON(!block_group);
10222	BUG_ON(!block_group->ro);
10223
10224	trace_btrfs_remove_block_group(block_group);
10225	/*
10226	 * Free the reserved super bytes from this block group before
10227	 * remove it.
10228	 */
10229	free_excluded_extents(block_group);
10230	btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10231				  block_group->key.offset);
10232
10233	memcpy(&key, &block_group->key, sizeof(key));
10234	index = btrfs_bg_flags_to_raid_index(block_group->flags);
10235	factor = btrfs_bg_type_to_factor(block_group->flags);
10236
10237	/* make sure this block group isn't part of an allocation cluster */
10238	cluster = &fs_info->data_alloc_cluster;
10239	spin_lock(&cluster->refill_lock);
10240	btrfs_return_cluster_to_free_space(block_group, cluster);
10241	spin_unlock(&cluster->refill_lock);
10242
10243	/*
10244	 * make sure this block group isn't part of a metadata
10245	 * allocation cluster
10246	 */
10247	cluster = &fs_info->meta_alloc_cluster;
10248	spin_lock(&cluster->refill_lock);
10249	btrfs_return_cluster_to_free_space(block_group, cluster);
10250	spin_unlock(&cluster->refill_lock);
10251
10252	path = btrfs_alloc_path();
10253	if (!path) {
10254		ret = -ENOMEM;
10255		goto out;
10256	}
10257
10258	/*
10259	 * get the inode first so any iput calls done for the io_list
10260	 * aren't the final iput (no unlinks allowed now)
10261	 */
10262	inode = lookup_free_space_inode(fs_info, block_group, path);
10263
10264	mutex_lock(&trans->transaction->cache_write_mutex);
10265	/*
10266	 * make sure our free spache cache IO is done before remove the
10267	 * free space inode
10268	 */
10269	spin_lock(&trans->transaction->dirty_bgs_lock);
10270	if (!list_empty(&block_group->io_list)) {
10271		list_del_init(&block_group->io_list);
10272
10273		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10274
10275		spin_unlock(&trans->transaction->dirty_bgs_lock);
10276		btrfs_wait_cache_io(trans, block_group, path);
10277		btrfs_put_block_group(block_group);
10278		spin_lock(&trans->transaction->dirty_bgs_lock);
10279	}
10280
10281	if (!list_empty(&block_group->dirty_list)) {
10282		list_del_init(&block_group->dirty_list);
10283		btrfs_put_block_group(block_group);
10284	}
10285	spin_unlock(&trans->transaction->dirty_bgs_lock);
10286	mutex_unlock(&trans->transaction->cache_write_mutex);
10287
10288	if (!IS_ERR(inode)) {
10289		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10290		if (ret) {
10291			btrfs_add_delayed_iput(inode);
10292			goto out;
10293		}
10294		clear_nlink(inode);
10295		/* One for the block groups ref */
10296		spin_lock(&block_group->lock);
10297		if (block_group->iref) {
10298			block_group->iref = 0;
10299			block_group->inode = NULL;
10300			spin_unlock(&block_group->lock);
10301			iput(inode);
10302		} else {
10303			spin_unlock(&block_group->lock);
10304		}
10305		/* One for our lookup ref */
10306		btrfs_add_delayed_iput(inode);
10307	}
10308
10309	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10310	key.offset = block_group->key.objectid;
10311	key.type = 0;
10312
10313	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10314	if (ret < 0)
10315		goto out;
10316	if (ret > 0)
10317		btrfs_release_path(path);
10318	if (ret == 0) {
10319		ret = btrfs_del_item(trans, tree_root, path);
10320		if (ret)
10321			goto out;
10322		btrfs_release_path(path);
10323	}
10324
10325	spin_lock(&fs_info->block_group_cache_lock);
10326	rb_erase(&block_group->cache_node,
10327		 &fs_info->block_group_cache_tree);
10328	RB_CLEAR_NODE(&block_group->cache_node);
10329
10330	if (fs_info->first_logical_byte == block_group->key.objectid)
10331		fs_info->first_logical_byte = (u64)-1;
10332	spin_unlock(&fs_info->block_group_cache_lock);
10333
10334	down_write(&block_group->space_info->groups_sem);
10335	/*
10336	 * we must use list_del_init so people can check to see if they
10337	 * are still on the list after taking the semaphore
10338	 */
10339	list_del_init(&block_group->list);
10340	if (list_empty(&block_group->space_info->block_groups[index])) {
10341		kobj = block_group->space_info->block_group_kobjs[index];
10342		block_group->space_info->block_group_kobjs[index] = NULL;
10343		clear_avail_alloc_bits(fs_info, block_group->flags);
10344	}
10345	up_write(&block_group->space_info->groups_sem);
10346	if (kobj) {
10347		kobject_del(kobj);
10348		kobject_put(kobj);
10349	}
10350
10351	if (block_group->has_caching_ctl)
10352		caching_ctl = get_caching_control(block_group);
10353	if (block_group->cached == BTRFS_CACHE_STARTED)
10354		wait_block_group_cache_done(block_group);
10355	if (block_group->has_caching_ctl) {
10356		down_write(&fs_info->commit_root_sem);
10357		if (!caching_ctl) {
10358			struct btrfs_caching_control *ctl;
10359
10360			list_for_each_entry(ctl,
10361				    &fs_info->caching_block_groups, list)
10362				if (ctl->block_group == block_group) {
10363					caching_ctl = ctl;
10364					refcount_inc(&caching_ctl->count);
10365					break;
10366				}
10367		}
10368		if (caching_ctl)
10369			list_del_init(&caching_ctl->list);
10370		up_write(&fs_info->commit_root_sem);
10371		if (caching_ctl) {
10372			/* Once for the caching bgs list and once for us. */
10373			put_caching_control(caching_ctl);
10374			put_caching_control(caching_ctl);
10375		}
10376	}
10377
10378	spin_lock(&trans->transaction->dirty_bgs_lock);
10379	if (!list_empty(&block_group->dirty_list)) {
10380		WARN_ON(1);
10381	}
10382	if (!list_empty(&block_group->io_list)) {
10383		WARN_ON(1);
10384	}
10385	spin_unlock(&trans->transaction->dirty_bgs_lock);
10386	btrfs_remove_free_space_cache(block_group);
10387
10388	spin_lock(&block_group->space_info->lock);
10389	list_del_init(&block_group->ro_list);
10390
10391	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10392		WARN_ON(block_group->space_info->total_bytes
10393			< block_group->key.offset);
10394		WARN_ON(block_group->space_info->bytes_readonly
10395			< block_group->key.offset);
10396		WARN_ON(block_group->space_info->disk_total
10397			< block_group->key.offset * factor);
10398	}
10399	block_group->space_info->total_bytes -= block_group->key.offset;
10400	block_group->space_info->bytes_readonly -= block_group->key.offset;
10401	block_group->space_info->disk_total -= block_group->key.offset * factor;
10402
10403	spin_unlock(&block_group->space_info->lock);
10404
10405	memcpy(&key, &block_group->key, sizeof(key));
10406
10407	mutex_lock(&fs_info->chunk_mutex);
10408	if (!list_empty(&em->list)) {
10409		/* We're in the transaction->pending_chunks list. */
10410		free_extent_map(em);
10411	}
10412	spin_lock(&block_group->lock);
10413	block_group->removed = 1;
10414	/*
10415	 * At this point trimming can't start on this block group, because we
10416	 * removed the block group from the tree fs_info->block_group_cache_tree
10417	 * so no one can't find it anymore and even if someone already got this
10418	 * block group before we removed it from the rbtree, they have already
10419	 * incremented block_group->trimming - if they didn't, they won't find
10420	 * any free space entries because we already removed them all when we
10421	 * called btrfs_remove_free_space_cache().
10422	 *
10423	 * And we must not remove the extent map from the fs_info->mapping_tree
10424	 * to prevent the same logical address range and physical device space
10425	 * ranges from being reused for a new block group. This is because our
10426	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10427	 * completely transactionless, so while it is trimming a range the
10428	 * currently running transaction might finish and a new one start,
10429	 * allowing for new block groups to be created that can reuse the same
10430	 * physical device locations unless we take this special care.
10431	 *
10432	 * There may also be an implicit trim operation if the file system
10433	 * is mounted with -odiscard. The same protections must remain
10434	 * in place until the extents have been discarded completely when
10435	 * the transaction commit has completed.
10436	 */
10437	remove_em = (atomic_read(&block_group->trimming) == 0);
10438	/*
10439	 * Make sure a trimmer task always sees the em in the pinned_chunks list
10440	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
10441	 * before checking block_group->removed).
10442	 */
10443	if (!remove_em) {
10444		/*
10445		 * Our em might be in trans->transaction->pending_chunks which
10446		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10447		 * and so is the fs_info->pinned_chunks list.
10448		 *
10449		 * So at this point we must be holding the chunk_mutex to avoid
10450		 * any races with chunk allocation (more specifically at
10451		 * volumes.c:contains_pending_extent()), to ensure it always
10452		 * sees the em, either in the pending_chunks list or in the
10453		 * pinned_chunks list.
10454		 */
10455		list_move_tail(&em->list, &fs_info->pinned_chunks);
10456	}
10457	spin_unlock(&block_group->lock);
10458
10459	if (remove_em) {
10460		struct extent_map_tree *em_tree;
10461
10462		em_tree = &fs_info->mapping_tree.map_tree;
10463		write_lock(&em_tree->lock);
10464		/*
10465		 * The em might be in the pending_chunks list, so make sure the
10466		 * chunk mutex is locked, since remove_extent_mapping() will
10467		 * delete us from that list.
10468		 */
10469		remove_extent_mapping(em_tree, em);
10470		write_unlock(&em_tree->lock);
10471		/* once for the tree */
10472		free_extent_map(em);
10473	}
10474
10475	mutex_unlock(&fs_info->chunk_mutex);
10476
10477	ret = remove_block_group_free_space(trans, block_group);
10478	if (ret)
10479		goto out;
10480
10481	btrfs_put_block_group(block_group);
10482	btrfs_put_block_group(block_group);
10483
10484	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10485	if (ret > 0)
10486		ret = -EIO;
10487	if (ret < 0)
10488		goto out;
10489
10490	ret = btrfs_del_item(trans, root, path);
10491out:
10492	btrfs_free_path(path);
10493	return ret;
10494}
10495
10496struct btrfs_trans_handle *
10497btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10498				     const u64 chunk_offset)
10499{
10500	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10501	struct extent_map *em;
10502	struct map_lookup *map;
10503	unsigned int num_items;
10504
10505	read_lock(&em_tree->lock);
10506	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10507	read_unlock(&em_tree->lock);
10508	ASSERT(em && em->start == chunk_offset);
10509
10510	/*
10511	 * We need to reserve 3 + N units from the metadata space info in order
10512	 * to remove a block group (done at btrfs_remove_chunk() and at
10513	 * btrfs_remove_block_group()), which are used for:
10514	 *
10515	 * 1 unit for adding the free space inode's orphan (located in the tree
10516	 * of tree roots).
10517	 * 1 unit for deleting the block group item (located in the extent
10518	 * tree).
10519	 * 1 unit for deleting the free space item (located in tree of tree
10520	 * roots).
10521	 * N units for deleting N device extent items corresponding to each
10522	 * stripe (located in the device tree).
10523	 *
10524	 * In order to remove a block group we also need to reserve units in the
10525	 * system space info in order to update the chunk tree (update one or
10526	 * more device items and remove one chunk item), but this is done at
10527	 * btrfs_remove_chunk() through a call to check_system_chunk().
10528	 */
10529	map = em->map_lookup;
10530	num_items = 3 + map->num_stripes;
10531	free_extent_map(em);
10532
10533	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10534							   num_items, 1);
10535}
10536
/*
 * Process the unused_bgs list and remove any that don't have any allocated
 * space inside of them.
 *
 * Does nothing unless BTRFS_FS_OPEN is set in fs_info->flags.  Every entry
 * is taken off fs_info->unused_bgs and has one reference dropped once it
 * has been handled.  After an iteration leaves a non-zero result in ret,
 * the remaining entries are only dropped from the list.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	int ret = 0;

	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->unused_bgs)) {
		u64 start, end;
		int trimming;

		block_group = list_first_entry(&fs_info->unused_bgs,
					       struct btrfs_block_group_cache,
					       bg_list);
		list_del_init(&block_group->bg_list);

		space_info = block_group->space_info;

		/*
		 * Skip (but still release) the entry after an earlier failure
		 * or when it belongs to a mixed space_info.
		 */
		if (ret || btrfs_mixed_space_info(space_info)) {
			btrfs_put_block_group(block_group);
			continue;
		}
		/* The rest of the iteration runs without unused_bgs_lock. */
		spin_unlock(&fs_info->unused_bgs_lock);

		mutex_lock(&fs_info->delete_unused_bgs_mutex);

		/* Don't want to race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);
		spin_lock(&block_group->lock);
		if (block_group->reserved || block_group->pinned ||
		    btrfs_block_group_used(&block_group->item) ||
		    block_group->ro ||
		    list_is_singular(&block_group->list)) {
			/*
			 * We want to bail if we made new allocations or have
			 * outstanding allocations in this block group.  We do
			 * the ro check in case balance is currently acting on
			 * this block group.
			 */
			trace_btrfs_skip_unused_block_group(block_group);
			spin_unlock(&block_group->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
		spin_unlock(&block_group->lock);

		/* We don't want to force the issue, only flip if it's ok. */
		ret = inc_block_group_ro(block_group, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0) {
			/* Not an error for the loop, just skip this one. */
			ret = 0;
			goto next;
		}

		/*
		 * Want to do this before we do anything else so we can recover
		 * properly if we fail to join the transaction.
		 */
		trans = btrfs_start_trans_remove_block_group(fs_info,
						     block_group->key.objectid);
		if (IS_ERR(trans)) {
			btrfs_dec_block_group_ro(block_group);
			ret = PTR_ERR(trans);
			goto next;
		}

		/*
		 * We could have pending pinned extents for this block group,
		 * just delete them, we don't care about them anymore.
		 */
		start = block_group->key.objectid;
		end = start + block_group->key.offset - 1;
		/*
		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
		 * btrfs_finish_extent_commit(). If we are at transaction N,
		 * another task might be running finish_extent_commit() for the
		 * previous transaction N - 1, and have seen a range belonging
		 * to the block group in freed_extents[] before we were able to
		 * clear the whole block group range from freed_extents[]. This
		 * means that task can lookup for the block group after we
		 * unpinned it from freed_extents[] and removed it, leading to
		 * a BUG_ON() at btrfs_unpin_extent_range().
		 */
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
				  EXTENT_DIRTY);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			btrfs_dec_block_group_ro(block_group);
			goto end_trans;
		}
		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
				  EXTENT_DIRTY);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			btrfs_dec_block_group_ro(block_group);
			goto end_trans;
		}
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);

		/* Reset pinned so btrfs_put_block_group doesn't complain */
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);

		/* Move the pinned bytes over to the read-only accounting. */
		space_info->bytes_pinned -= block_group->pinned;
		space_info->bytes_readonly += block_group->pinned;
		percpu_counter_add_batch(&space_info->total_bytes_pinned,
				   -block_group->pinned,
				   BTRFS_TOTAL_BYTES_PINNED_BATCH);
		block_group->pinned = 0;

		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

		/* DISCARD can flip during remount */
		trimming = btrfs_test_opt(fs_info, DISCARD);

		/* Implicit trim during transaction commit. */
		if (trimming)
			btrfs_get_block_group_trimming(block_group);

		/*
		 * Btrfs_remove_chunk will abort the transaction if things go
		 * horribly wrong.
		 */
		ret = btrfs_remove_chunk(trans, block_group->key.objectid);

		if (ret) {
			/* Undo the trimming count taken just above. */
			if (trimming)
				btrfs_put_block_group_trimming(block_group);
			goto end_trans;
		}

		/*
		 * If we're not mounted with -odiscard, we can just forget
		 * about this block group. Otherwise we'll need to wait
		 * until transaction commit to do the actual discard.
		 */
		if (trimming) {
			spin_lock(&fs_info->unused_bgs_lock);
			/*
			 * A concurrent scrub might have added us to the list
			 * fs_info->unused_bgs, so use a list_move operation
			 * to add the block group to the deleted_bgs list.
			 */
			list_move(&block_group->bg_list,
				  &trans->transaction->deleted_bgs);
			spin_unlock(&fs_info->unused_bgs_lock);
			/* Extra reference for the deleted_bgs list entry. */
			btrfs_get_block_group(block_group);
		}
end_trans:
		btrfs_end_transaction(trans);
next:
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		btrfs_put_block_group(block_group);
		/* Re-take the list lock for the loop condition. */
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}
10704
10705int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10706{
10707	struct btrfs_super_block *disk_super;
10708	u64 features;
10709	u64 flags;
10710	int mixed = 0;
10711	int ret;
10712
10713	disk_super = fs_info->super_copy;
10714	if (!btrfs_super_root(disk_super))
10715		return -EINVAL;
10716
10717	features = btrfs_super_incompat_flags(disk_super);
10718	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10719		mixed = 1;
10720
10721	flags = BTRFS_BLOCK_GROUP_SYSTEM;
10722	ret = create_space_info(fs_info, flags);
10723	if (ret)
10724		goto out;
10725
10726	if (mixed) {
10727		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10728		ret = create_space_info(fs_info, flags);
10729	} else {
10730		flags = BTRFS_BLOCK_GROUP_METADATA;
10731		ret = create_space_info(fs_info, flags);
10732		if (ret)
10733			goto out;
10734
10735		flags = BTRFS_BLOCK_GROUP_DATA;
10736		ret = create_space_info(fs_info, flags);
10737	}
10738out:
10739	return ret;
10740}
10741
/*
 * Exported wrapper around unpin_extent_range() for the range
 * [@start, @end] of @fs_info.
 *
 * Always passes false as the last argument of unpin_extent_range().
 * NOTE(review): that flag presumably selects whether the unpinned space
 * is returned to the free space cache; confirm against
 * unpin_extent_range() before relying on it.
 *
 * Returns whatever unpin_extent_range() returns.
 */
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				   u64 start, u64 end)
{
	return unpin_extent_range(fs_info, start, end, false);
}
10747
10748/*
10749 * It used to be that old block groups would be left around forever.
10750 * Iterating over them would be enough to trim unused space.  Since we
10751 * now automatically remove them, we also need to iterate over unallocated
10752 * space.
10753 *
10754 * We don't want a transaction for this since the discard may take a
10755 * substantial amount of time.  We don't require that a transaction be
10756 * running, but we do need to take a running transaction into account
10757 * to ensure that we're not discarding chunks that were released in
10758 * the current transaction.
10759 *
10760 * Holding the chunks lock will prevent other threads from allocating
10761 * or releasing chunks, but it won't prevent a running transaction
10762 * from committing and releasing the memory that the pending chunks
10763 * list head uses.  For that, we need to take a reference to the
10764 * transaction.
10765 */
10766static int btrfs_trim_free_extents(struct btrfs_device *device,
10767				   u64 minlen, u64 *trimmed)
10768{
10769	u64 start = 0, len = 0;
10770	int ret;
10771
10772	*trimmed = 0;
10773
10774	/* Not writeable = nothing to do. */
10775	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
10776		return 0;
10777
10778	/* No free space = nothing to do. */
10779	if (device->total_bytes <= device->bytes_used)
10780		return 0;
10781
10782	ret = 0;
10783
10784	while (1) {
10785		struct btrfs_fs_info *fs_info = device->fs_info;
10786		struct btrfs_transaction *trans;
10787		u64 bytes;
10788
10789		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
10790		if (ret)
10791			return ret;
10792
10793		down_read(&fs_info->commit_root_sem);
10794
10795		spin_lock(&fs_info->trans_lock);
10796		trans = fs_info->running_transaction;
10797		if (trans)
10798			refcount_inc(&trans->use_count);
10799		spin_unlock(&fs_info->trans_lock);
10800
10801		ret = find_free_dev_extent_start(trans, device, minlen, start,
10802						 &start, &len);
10803		if (trans)
10804			btrfs_put_transaction(trans);
10805
10806		if (ret) {
10807			up_read(&fs_info->commit_root_sem);
10808			mutex_unlock(&fs_info->chunk_mutex);
10809			if (ret == -ENOSPC)
10810				ret = 0;
10811			break;
10812		}
10813
10814		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
10815		up_read(&fs_info->commit_root_sem);
10816		mutex_unlock(&fs_info->chunk_mutex);
10817
10818		if (ret)
10819			break;
10820
10821		start += len;
10822		*trimmed += bytes;
10823
10824		if (fatal_signal_pending(current)) {
10825			ret = -ERESTARTSYS;
10826			break;
10827		}
10828
10829		cond_resched();
10830	}
10831
10832	return ret;
10833}
10834
10835int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
10836{
10837	struct btrfs_block_group_cache *cache = NULL;
10838	struct btrfs_device *device;
10839	struct list_head *devices;
10840	u64 group_trimmed;
10841	u64 start;
10842	u64 end;
10843	u64 trimmed = 0;
10844	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
10845	int ret = 0;
10846
10847	/*
10848	 * try to trim all FS space, our block group may start from non-zero.
10849	 */
10850	if (range->len == total_bytes)
10851		cache = btrfs_lookup_first_block_group(fs_info, range->start);
10852	else
10853		cache = btrfs_lookup_block_group(fs_info, range->start);
10854
10855	while (cache) {
10856		if (cache->key.objectid >= (range->start + range->len)) {
10857			btrfs_put_block_group(cache);
10858			break;
10859		}
10860
10861		start = max(range->start, cache->key.objectid);
10862		end = min(range->start + range->len,
10863				cache->key.objectid + cache->key.offset);
10864
10865		if (end - start >= range->minlen) {
10866			if (!block_group_cache_done(cache)) {
10867				ret = cache_block_group(cache, 0);
10868				if (ret) {
10869					btrfs_put_block_group(cache);
10870					break;
10871				}
10872				ret = wait_block_group_cache_done(cache);
10873				if (ret) {
10874					btrfs_put_block_group(cache);
10875					break;
10876				}
10877			}
10878			ret = btrfs_trim_block_group(cache,
10879						     &group_trimmed,
10880						     start,
10881						     end,
10882						     range->minlen);
10883
10884			trimmed += group_trimmed;
10885			if (ret) {
10886				btrfs_put_block_group(cache);
10887				break;
10888			}
10889		}
10890
10891		cache = next_block_group(fs_info, cache);
10892	}
10893
10894	mutex_lock(&fs_info->fs_devices->device_list_mutex);
10895	devices = &fs_info->fs_devices->alloc_list;
10896	list_for_each_entry(device, devices, dev_alloc_list) {
10897		ret = btrfs_trim_free_extents(device, range->minlen,
10898					      &group_trimmed);
10899		if (ret)
10900			break;
10901
10902		trimmed += group_trimmed;
10903	}
10904	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
10905
10906	range->len = trimmed;
10907	return ret;
10908}
10909
/*
 * btrfs_{start,end}_write_no_snapshotting() are similar to
 * mnt_{want,drop}_write().  They are used to prevent tasks from writing
 * data into the page cache through nocow before the subvolume is
 * snapshotted but flushing that data to disk only after the snapshot has
 * been created, and to prevent operations that would make the snapshot
 * inconsistent while snapshotting is ongoing (writes followed by expanding
 * truncates, for example).
 */

/*
 * Drop a writer registration on @root: decrement the subv_writers counter
 * and then call cond_wake_up() on the subv_writers wait queue.
 * NOTE(review): presumably the waiter is the snapshot creation path
 * waiting for the writer count to drain; confirm against the users of
 * subv_writers->wait.
 */
void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
{
	percpu_counter_dec(&root->subv_writers->counter);
	cond_wake_up(&root->subv_writers->wait);
}
10923
/*
 * Try to register the caller as a writer on @root that must not race
 * with snapshot creation.
 *
 * Returns 1 when the writer was registered (the subv_writers counter was
 * incremented and stays incremented), in which case the caller is
 * expected to undo it with btrfs_end_write_no_snapshotting().  Returns 0,
 * with the counter left unchanged on balance, when root->will_be_snapshotted
 * is set.
 */
int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
{
	/* Fast path: bail out without touching the counter. */
	if (atomic_read(&root->will_be_snapshotted))
		return 0;

	percpu_counter_inc(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we check for snapshot creation.
	 */
	smp_mb();
	/*
	 * Re-check after the increment: will_be_snapshotted may have been
	 * set since the first check, in which case the registration is
	 * rolled back.
	 */
	if (atomic_read(&root->will_be_snapshotted)) {
		btrfs_end_write_no_snapshotting(root);
		return 0;
	}
	return 1;
}
10940
10941void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
10942{
10943	while (true) {
10944		int ret;
10945
10946		ret = btrfs_start_write_no_snapshotting(root);
10947		if (ret)
10948			break;
10949		wait_var_event(&root->will_be_snapshotted,
10950			       !atomic_read(&root->will_be_snapshotted));
10951	}
10952}
10953
10954void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
10955{
10956	struct btrfs_fs_info *fs_info = bg->fs_info;
10957
10958	spin_lock(&fs_info->unused_bgs_lock);
10959	if (list_empty(&bg->bg_list)) {
10960		btrfs_get_block_group(bg);
10961		trace_btrfs_add_unused_block_group(bg);
10962		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
10963	}
10964	spin_unlock(&fs_info->unused_bgs_lock);
10965}
Configure Feed

Configure Feed