// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
#include <linux/sched/mm.h>

#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"
#include "tree-mod-log.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "tree-checker.h"

enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info)
{
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return BTRFS_QGROUP_MODE_DISABLED;
	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
		return BTRFS_QGROUP_MODE_SIMPLE;
	return BTRFS_QGROUP_MODE_FULL;
}

bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info)
{
	return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED;
}

bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info)
{
	return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL;
}

/*
 * Helpers to access qgroup reservation
 *
 * Callers should ensure the lock context and type are valid
 */

static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
{
	u64 ret = 0;
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		ret += qgroup->rsv.values[i];

	return ret;
}

#ifdef CONFIG_BTRFS_DEBUG
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
	if (type == BTRFS_QGROUP_RSV_DATA)
		return "data";
	if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
		return "meta_pertrans";
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
		return "meta_prealloc";
	return NULL;
}
#endif

static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
			   struct btrfs_qgroup *qgroup, u64 num_bytes,
			   enum btrfs_qgroup_rsv_type type)
{
	trace_btrfs_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
	qgroup->rsv.values[type] += num_bytes;
}

static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
			       struct btrfs_qgroup *qgroup, u64 num_bytes,
			       enum btrfs_qgroup_rsv_type type)
{
	trace_btrfs_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
	if (qgroup->rsv.values[type] >= num_bytes) {
		qgroup->rsv.values[type] -= num_bytes;
		return;
	}
#ifdef CONFIG_BTRFS_DEBUG
	WARN_RATELIMIT(1,
		"qgroup %llu %s reserved space underflow, have %llu to free %llu",
		qgroup->qgroupid, qgroup_rsv_type_str(type),
		qgroup->rsv.values[type], num_bytes);
#endif
	qgroup->rsv.values[type] = 0;
}

static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
				     struct btrfs_qgroup *dest,
				     const struct btrfs_qgroup *src)
{
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
}

static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
					 struct btrfs_qgroup *dest,
					 const struct btrfs_qgroup *src)
{
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
}
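
/*
 * The old/new refcnt helpers below implement lazily-reset counters: @seq is
 * the sequence number of the current accounting pass, and any refcnt still
 * below @seq is treated as zero for that pass.  For illustration only: with
 * seq == 100 and a stale old_refcnt of 97 left over from an earlier pass,
 * btrfs_qgroup_update_old_refcnt(qg, 100, 1) first snaps the counter up to
 * 100 and then adds 1, so btrfs_qgroup_get_old_refcnt(qg, 100) reports 1.
 * This avoids walking all qgroups to reset their counters between passes.
 */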

static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (qg->old_refcnt < seq)
		qg->old_refcnt = seq;
	qg->old_refcnt += mod;
}

static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (qg->new_refcnt < seq)
		qg->new_refcnt = seq;
	qg->new_refcnt += mod;
}

static inline u64 btrfs_qgroup_get_old_refcnt(const struct btrfs_qgroup *qg, u64 seq)
{
	if (qg->old_refcnt < seq)
		return 0;
	return qg->old_refcnt - seq;
}

static inline u64 btrfs_qgroup_get_new_refcnt(const struct btrfs_qgroup *qg, u64 seq)
{
	if (qg->new_refcnt < seq)
		return 0;
	return qg->new_refcnt - seq;
}

static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
		   int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);

static int btrfs_qgroup_qgroupid_key_cmp(const void *key, const struct rb_node *node)
{
	const u64 *qgroupid = key;
	const struct btrfs_qgroup *qgroup = rb_entry(node, struct btrfs_qgroup, node);

	if (qgroup->qgroupid < *qgroupid)
		return -1;
	else if (qgroup->qgroupid > *qgroupid)
		return 1;

	return 0;
}

/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info,
					   u64 qgroupid)
{
	struct rb_node *node;

	node = rb_find(&qgroupid, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_key_cmp);
	return rb_entry_safe(node, struct btrfs_qgroup, node);
}

static int btrfs_qgroup_qgroupid_cmp(struct rb_node *new, const struct rb_node *existing)
{
	const struct btrfs_qgroup *new_qgroup = rb_entry(new, struct btrfs_qgroup, node);

	return btrfs_qgroup_qgroupid_key_cmp(&new_qgroup->qgroupid, existing);
}
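
/*
 * Lookups and inserts go through the generic rbtree helpers with the two
 * comparators above.  A typical lookup, under the appropriate lock, is
 * simply (illustrative sketch, not a real caller):
 *
 *	struct btrfs_qgroup *qg = find_qgroup_rb(fs_info, qgroupid);
 *
 *	if (!qg)
 *		return -ENOENT;
 *
 * Insertion uses rb_find_add() so that an already existing qgroup is
 * detected in the same tree walk, see add_qgroup_rb() below.
 */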

/*
 * Add qgroup to the filesystem's qgroup tree.
 *
 * Must be called with qgroup_lock held and @prealloc preallocated.
 *
 * Ownership of @prealloc is transferred to this function, so the caller
 * must no longer touch @prealloc.
 */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
					  struct btrfs_qgroup *prealloc,
					  u64 qgroupid)
{
	struct rb_node *node;

	/* Caller must have pre-allocated @prealloc. */
	ASSERT(prealloc);

	prealloc->qgroupid = qgroupid;
	node = rb_find_add(&prealloc->node, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_cmp);
	if (node) {
		kfree(prealloc);
		return rb_entry(node, struct btrfs_qgroup, node);
	}

	INIT_LIST_HEAD(&prealloc->groups);
	INIT_LIST_HEAD(&prealloc->members);
	INIT_LIST_HEAD(&prealloc->dirty);
	INIT_LIST_HEAD(&prealloc->iterator);
	INIT_LIST_HEAD(&prealloc->nested_iterator);

	return prealloc;
}

static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
{
	struct btrfs_qgroup_list *list;

	list_del(&qgroup->dirty);
	while (!list_empty(&qgroup->groups)) {
		list = list_first_entry(&qgroup->groups,
					struct btrfs_qgroup_list, next_group);
		list_del(&list->next_group);
		list_del(&list->next_member);
		kfree(list);
	}

	while (!list_empty(&qgroup->members)) {
		list = list_first_entry(&qgroup->members,
					struct btrfs_qgroup_list, next_member);
		list_del(&list->next_group);
		list_del(&list->next_member);
		kfree(list);
	}
}

/* must be called with qgroup_lock held */
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{
	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);

	if (!qgroup)
		return -ENOENT;

	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
	__del_qgroup_rb(qgroup);
	return 0;
}

/*
 * Add relation specified by two qgroups.
 *
 * Must be called with qgroup_lock held; ownership of @prealloc is
 * transferred to this function and the caller must not touch it anymore.
 *
 * Return: 0 on success
 *         -ENOENT if one of the qgroups is NULL
 *         <0 other errors
 */
static int __add_relation_rb(struct btrfs_qgroup_list *prealloc,
			     struct btrfs_qgroup *member,
			     struct btrfs_qgroup *parent)
{
	if (!member || !parent) {
		kfree(prealloc);
		return -ENOENT;
	}

	prealloc->group = parent;
	prealloc->member = member;
	list_add_tail(&prealloc->next_group, &member->groups);
	list_add_tail(&prealloc->next_member, &parent->members);

	return 0;
}
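
/*
 * For illustration: a relation is a single btrfs_qgroup_list node linked
 * into two lists at once.  For member M in parent P:
 *
 *	M->groups  -> list->next_group  (list->group  == P)
 *	P->members -> list->next_member (list->member == M)
 *
 * Deleting the relation unlinks the node from both lists, as done in
 * del_relation_rb() below.
 */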

/*
 * Add relation specified by two qgroup ids.
 *
 * Must be called with qgroup_lock held.
 *
 * Return: 0 on success
 *         -ENOENT if one of the ids does not exist
 *         <0 other errors
 */
static int add_relation_rb(struct btrfs_fs_info *fs_info,
			   struct btrfs_qgroup_list *prealloc,
			   u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member;
	struct btrfs_qgroup *parent;

	member = find_qgroup_rb(fs_info, memberid);
	parent = find_qgroup_rb(fs_info, parentid);

	return __add_relation_rb(prealloc, member, parent);
}

/* Must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
			   u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup_list *list;

	member = find_qgroup_rb(fs_info, memberid);
	parent = find_qgroup_rb(fs_info, parentid);
	if (!member || !parent)
		return -ENOENT;

	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			list_del(&list->next_group);
			list_del(&list->next_member);
			kfree(list);
			return 0;
		}
	}
	return -ENOENT;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid,
			       u64 rfer, u64 excl)
{
	struct btrfs_qgroup *qgroup;

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup)
		return -EINVAL;
	if (qgroup->rfer != rfer || qgroup->excl != excl)
		return -EINVAL;
	return 0;
}
#endif

__printf(2, 3)
static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
	const u64 old_flags = fs_info->qgroup_flags;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
		return;
	fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
				  BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
				  BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
	if (!(old_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;

		btrfs_warn_rl(fs_info, "qgroup marked inconsistent, %pV", &vaf);
		va_end(args);
	}
}

static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info,
				   struct extent_buffer *leaf, int slot,
				   struct btrfs_qgroup_status_item *ptr)
{
	ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
	ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr));
	fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr);
}
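
/*
 * Quota tree items read by btrfs_read_qgroup_config() below, keyed as
 * (objectid, type, offset):
 *
 *	(0, BTRFS_QGROUP_STATUS_KEY, 0)		global status/flags/rescan
 *	(0, BTRFS_QGROUP_INFO_KEY, qgroupid)	rfer/excl usage counters
 *	(0, BTRFS_QGROUP_LIMIT_KEY, qgroupid)	limit flags and values
 *	(a, BTRFS_QGROUP_RELATION_KEY, b)	one relation, stored in both
 *						directions (a->b and b->a)
 */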

/*
 * The full config is read in one go; this is only called from open_ctree().
 * It doesn't use any locking, as at this point we're still single-threaded.
 */
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path = NULL;
	struct extent_buffer *l;
	int slot;
	int ret = 0;
	u64 flags = 0;
	u64 rescan_progress = 0;

	if (!fs_info->quota_root)
		return 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_sysfs_add_qgroups(fs_info);
	if (ret < 0)
		goto out;
	/* default this to quota off, in case no status key is found */
	fs_info->qgroup_flags = 0;

	/*
	 * pass 1: read status, all qgroup infos and limits
	 */
	key.objectid = 0;
	key.type = 0;
	key.offset = 0;
	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
	if (ret)
		goto out;

	while (1) {
		struct btrfs_qgroup *qgroup;

		slot = path->slots[0];
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
			struct btrfs_qgroup_status_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_status_item);

			if (btrfs_qgroup_status_version(l, ptr) !=
			    BTRFS_QGROUP_STATUS_VERSION) {
				btrfs_err(fs_info,
					  "old qgroup version, quota disabled");
				goto out;
			}
			fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr);
			if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
				qgroup_read_enable_gen(fs_info, l, slot, ptr);
			else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation)
				qgroup_mark_inconsistent(fs_info, "qgroup generation mismatch");
			rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
			goto next1;
		}

		if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
		    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
			goto next1;

		qgroup = find_qgroup_rb(fs_info, found_key.offset);
		if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
		    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY))
			qgroup_mark_inconsistent(fs_info, "inconsistent qgroup config");
		if (!qgroup) {
			struct btrfs_qgroup *prealloc;
			struct btrfs_root *tree_root = fs_info->tree_root;

			prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
			if (!prealloc) {
				ret = -ENOMEM;
				goto out;
			}
			qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
			/*
			 * If a qgroup exists for a subvolume ID, it is possible
			 * that subvolume has been deleted, in which case
			 * reusing that ID would lead to incorrect accounting.
			 *
			 * Ensure that we skip any such subvol ids.
			 *
			 * We don't need to lock because this is only called
			 * during mount before we start doing things like creating
			 * subvolumes.
			 */
			if (btrfs_is_fstree(qgroup->qgroupid) &&
			    qgroup->qgroupid > tree_root->free_objectid)
				/*
				 * Don't need to check against BTRFS_LAST_FREE_OBJECTID,
				 * as it will get checked on the next call to
				 * btrfs_get_free_objectid.
				 */
				tree_root->free_objectid = qgroup->qgroupid + 1;
		}
		ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
		if (ret < 0)
			goto out;

		switch (found_key.type) {
		case BTRFS_QGROUP_INFO_KEY: {
			struct btrfs_qgroup_info_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_info_item);
			qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
			qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
			qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
			qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
			/* generation currently unused */
			break;
		}
		case BTRFS_QGROUP_LIMIT_KEY: {
			struct btrfs_qgroup_limit_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_limit_item);
			qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
			qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
			qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
			qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
			qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
			break;
		}
		}
next1:
		ret = btrfs_next_item(quota_root, path);
		if (ret < 0)
			goto out;
		if (ret)
			break;
	}
	btrfs_release_path(path);

	/*
	 * pass 2: read all qgroup relations
	 */
	key.objectid = 0;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = 0;
	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
	if (ret)
		goto out;
	while (1) {
		struct btrfs_qgroup_list *list = NULL;

		slot = path->slots[0];
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
			goto next2;

		if (found_key.objectid > found_key.offset) {
			/* parent <- member, not needed to build config */
			/* FIXME should we omit the key completely? */
			goto next2;
		}

		list = kzalloc(sizeof(*list), GFP_KERNEL);
		if (!list) {
			ret = -ENOMEM;
			goto out;
		}
		ret = add_relation_rb(fs_info, list, found_key.objectid,
				      found_key.offset);
		list = NULL;
		if (ret == -ENOENT) {
			btrfs_warn(fs_info,
				   "orphan qgroup relation 0x%llx->0x%llx",
				   found_key.objectid, found_key.offset);
			ret = 0;	/* ignore the error */
		}
		if (ret)
			goto out;
next2:
		ret = btrfs_next_item(quota_root, path);
		if (ret < 0)
			goto out;
		if (ret)
			break;
	}
out:
	btrfs_free_path(path);
	fs_info->qgroup_flags |= flags;
	if (ret >= 0) {
		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)
			set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
			ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
	} else {
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
		btrfs_sysfs_del_qgroups(fs_info);
	}

	return ret < 0 ? ret : 0;
}

/*
 * Called in close_ctree() when quota is still enabled. This verifies that we
 * don't leak any reserved space.
 *
 * Return false if no reserved space is left.
 * Return true if some reserved space is leaked.
 */
bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	bool ret = false;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
		return ret;
	/*
	 * Since we're unmounting, there is no race and no need to grab the
	 * qgroup lock. And here we don't go post-order, to provide a more
	 * user friendly sorted result.
	 */
	for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
		struct btrfs_qgroup *qgroup;
		int i;

		qgroup = rb_entry(node, struct btrfs_qgroup, node);
		for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
			if (qgroup->rsv.values[i]) {
				ret = true;
				btrfs_warn(fs_info,
		"qgroup %hu/%llu has unreleased space, type %d rsv %llu",
					   btrfs_qgroup_level(qgroup->qgroupid),
					   btrfs_qgroup_subvolid(qgroup->qgroupid),
					   i, qgroup->rsv.values[i]);
			}
		}
	}
	return ret;
}

/*
 * This is called from close_ctree(), open_ctree() or btrfs_quota_disable();
 * the first two are single-threaded paths.
 */
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
	struct rb_node *n;
	struct btrfs_qgroup *qgroup;

	/*
	 * btrfs_quota_disable() can be called concurrently with
	 * btrfs_qgroup_rescan() -> qgroup_rescan_zero_tracking(), so take the
	 * lock.
	 */
	spin_lock(&fs_info->qgroup_lock);
	while ((n = rb_first(&fs_info->qgroup_tree))) {
		qgroup = rb_entry(n, struct btrfs_qgroup, node);
		rb_erase(n, &fs_info->qgroup_tree);
		__del_qgroup_rb(qgroup);
		spin_unlock(&fs_info->qgroup_lock);
		btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
		kfree(qgroup);
		spin_lock(&fs_info->qgroup_lock);
	}
	spin_unlock(&fs_info->qgroup_lock);

	btrfs_sysfs_del_qgroups(fs_info);
}

static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
	return ret;
}

static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		return ret;

	if (ret > 0)
		return -ENOENT;

	return btrfs_del_item(trans, quota_root, path);
}

static int add_qgroup_item(struct btrfs_trans_handle *trans,
			   struct btrfs_root *quota_root, u64 qgroupid)
{
	int ret;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_qgroup_info_item *qgroup_info;
	struct btrfs_qgroup_limit_item *qgroup_limit;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	if (btrfs_is_testing(quota_root->fs_info))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;

	/*
	 * Avoid a transaction abort by catching -EEXIST here. In that
	 * case, we proceed by re-initializing the existing structure
	 * on disk.
	 */

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*qgroup_info));
	if (ret && ret != -EEXIST)
		return ret;

	leaf = path->nodes[0];
	qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
	btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);

	btrfs_release_path(path);

	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*qgroup_limit));
	if (ret && ret != -EEXIST)
		return ret;

	leaf = path->nodes[0];
	qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);

	return 0;
}

static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		return ret;

	if (ret > 0)
		return -ENOENT;

	ret = btrfs_del_item(trans, quota_root, path);
	if (ret)
		return ret;

	btrfs_release_path(path);

	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		return ret;

	if (ret > 0)
		return -ENOENT;

	ret = btrfs_del_item(trans, quota_root, path);

	return ret;
}

static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
				    struct btrfs_qgroup *qgroup)
{
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_limit_item *qgroup_limit;
	int ret;
	int slot;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_LIMIT_KEY;
	key.offset = qgroup->qgroupid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		return ret;

	l = path->nodes[0];
	slot = path->slots[0];
	qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
	btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
	btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);

	return ret;
}

static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
				   struct btrfs_qgroup *qgroup)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root = fs_info->quota_root;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_info_item *qgroup_info;
	int ret;
	int slot;

	if (btrfs_is_testing(fs_info))
		return 0;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroup->qgroupid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		return ret;

	l = path->nodes[0];
	slot = path->slots[0];
	qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
	btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
	btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
	btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
	btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);

	return ret;
}

static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root = fs_info->quota_root;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_status_item *ptr;
	int ret;
	int slot;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_STATUS_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		return ret;

	l = path->nodes[0];
	slot = path->slots[0];
	ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
	btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags &
				      BTRFS_QGROUP_STATUS_FLAGS_MASK);
	btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
	btrfs_set_qgroup_status_rescan(l, ptr,
				       fs_info->qgroup_rescan_progress.objectid);

	return ret;
}

/*
 * called with qgroup_lock held
 */
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;
	struct extent_buffer *leaf = NULL;
	int ret;
	int nr = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = 0;
	key.offset = 0;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			return ret;
		leaf = path->nodes[0];
		nr = btrfs_header_nritems(leaf);
		if (!nr)
			break;
		/*
		 * Delete the leaf one by one, since the whole tree is
		 * going to be deleted.
		 */
		path->slots[0] = 0;
		ret = btrfs_del_items(trans, root, path, 0, nr);
		if (ret)
			return ret;

		btrfs_release_path(path);
	}

	return 0;
}

int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
		       struct btrfs_ioctl_quota_ctl_args *quota_ctl_args)
{
	struct btrfs_root *quota_root;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_path *path = NULL;
	struct btrfs_qgroup_status_item *ptr;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_qgroup *qgroup = NULL;
	struct btrfs_qgroup *prealloc = NULL;
	struct btrfs_trans_handle *trans = NULL;
	const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA);
	int ret = 0;
	int slot;

	/*
	 * We need to have subvol_sem write locked, to prevent races between
	 * concurrent tasks trying to enable quotas, because we will unlock
	 * and relock qgroup_ioctl_lock before setting fs_info->quota_root
	 * and before setting BTRFS_FS_QUOTA_ENABLED.
	 */
	lockdep_assert_held_write(&fs_info->subvol_sem);

	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
		btrfs_err(fs_info,
			  "qgroups are currently unsupported in extent tree v2");
		return -EINVAL;
	}

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (fs_info->quota_root)
		goto out;

	ret = btrfs_sysfs_add_qgroups(fs_info);
	if (ret < 0)
		goto out;

	/*
	 * Unlock qgroup_ioctl_lock before starting the transaction. This is to
	 * avoid lock acquisition inversion problems (reported by lockdep) between
	 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
	 * start a transaction.
	 * After we started the transaction lock qgroup_ioctl_lock again and
	 * check if someone else created the quota root in the meanwhile. If so,
	 * just return success and release the transaction handle.
	 *
	 * Also we don't need to worry about someone else calling
	 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
	 * that function returns 0 (success) when the sysfs entries already exist.
	 */
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	/*
	 * 1 for the quota root item
	 * 1 for the BTRFS_QGROUP_STATUS item
	 *
	 * We also need 2*n items for the QGROUP_INFO/QGROUP_LIMIT items
	 * per subvolume. However those are not currently reserved since
	 * that would be a lot of overkill.
	 */
	trans = btrfs_start_transaction(tree_root, 2);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}

	if (fs_info->quota_root)
		goto out;

	/*
	 * initially create the quota tree
	 */
	quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
	if (IS_ERR(quota_root)) {
		ret = PTR_ERR(quota_root);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	path = btrfs_alloc_path();
	if (unlikely(!path)) {
		ret = -ENOMEM;
		btrfs_abort_transaction(trans, ret);
		goto out_free_root;
	}

	key.objectid = 0;
	key.type = BTRFS_QGROUP_STATUS_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*ptr));
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	leaf = path->nodes[0];
	ptr = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_qgroup_status_item);
	btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
	btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON;
	if (simple) {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
		btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
		btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid);
	} else {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	}
	btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
				      BTRFS_QGROUP_STATUS_FLAGS_MASK);
	btrfs_set_qgroup_status_rescan(leaf, ptr, 0);

	key.objectid = 0;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = 0;

	btrfs_release_path(path);
	ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
	if (ret > 0)
		goto out_add_root;
	if (unlikely(ret < 0)) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	while (1) {
		slot = path->slots[0];
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.type == BTRFS_ROOT_REF_KEY) {

			/* Release locks on tree_root before we access quota_root */
			btrfs_release_path(path);

			/* We should not have a stray @prealloc pointer. */
			ASSERT(prealloc == NULL);
			prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
			if (unlikely(!prealloc)) {
				ret = -ENOMEM;
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}

			ret = add_qgroup_item(trans, quota_root,
					      found_key.offset);
			if (unlikely(ret)) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}

			qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
			prealloc = NULL;
			ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
			if (unlikely(ret < 0)) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			ret = btrfs_search_slot_for_read(tree_root, &found_key,
							 path, 1, 0);
			if (unlikely(ret < 0)) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			if (ret > 0) {
				/*
				 * Shouldn't happen, but in case it does we
				 * don't need to do the btrfs_next_item, just
				 * continue.
				 */
				continue;
			}
		}
		ret = btrfs_next_item(tree_root, path);
		if (unlikely(ret < 0)) {
			btrfs_abort_transaction(trans, ret);
			goto out_free_path;
		}
		if (ret)
			break;
	}

out_add_root:
	btrfs_release_path(path);
	ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	ASSERT(prealloc == NULL);
	prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
	if (!prealloc) {
		ret = -ENOMEM;
		goto out_free_path;
	}
	qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
	prealloc = NULL;
	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
	if (unlikely(ret < 0)) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	fs_info->qgroup_enable_gen = trans->transid;

	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	/*
	 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
	 * a deadlock with tasks concurrently doing other qgroup operations, such
	 * as adding/removing qgroups or adding/deleting qgroup relations for
	 * example, because all qgroup operations first start or join a transaction
	 * and then lock the qgroup_ioctl_lock mutex.
	 * We are safe from a concurrent task trying to enable quotas, by calling
	 * this function, since we are serialized by fs_info->subvol_sem.
	 */
	ret = btrfs_commit_transaction(trans);
	trans = NULL;
	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (ret)
		goto out_free_path;

	/*
	 * Set quota enabled flag after committing the transaction, to avoid
	 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
	 * creation.
	 */
	spin_lock(&fs_info->qgroup_lock);
	fs_info->quota_root = quota_root;
	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	spin_unlock(&fs_info->qgroup_lock);

	/* Skip rescan for simple qgroups. */
	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
		goto out_free_path;

	ret = qgroup_rescan_init(fs_info, 0, 1);
	if (!ret) {
		qgroup_rescan_zero_tracking(fs_info);
		fs_info->qgroup_rescan_running = true;
		btrfs_queue_work(fs_info->qgroup_rescan_workers,
				 &fs_info->qgroup_rescan_work);
	} else {
		/*
		 * We have set both BTRFS_FS_QUOTA_ENABLED and
		 * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
		 * -EINPROGRESS. That can happen because someone started the
		 * rescan worker by calling quota rescan ioctl before we
		 * attempted to initialize the rescan worker. Failure due to
		 * quotas disabled in the meanwhile is not possible, because
		 * we are holding a write lock on fs_info->subvol_sem, which
		 * is also acquired when disabling quotas.
		 * Ignore such an error; any other error would need to undo
		 * everything we did in the transaction we just committed.
		 */
		ASSERT(ret == -EINPROGRESS);
		ret = 0;
	}

out_free_path:
	btrfs_free_path(path);
out_free_root:
	if (ret)
		btrfs_put_root(quota_root);
out:
	if (ret)
		btrfs_sysfs_del_qgroups(fs_info);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (ret && trans)
		btrfs_end_transaction(trans);
	else if (trans)
		ret = btrfs_end_transaction(trans);
	kfree(prealloc);
	return ret;
}

/*
 * It is possible to have outstanding ordered extents which reserved bytes
 * before we disabled. We need to fully flush delalloc, ordered extents, and a
 * commit to ensure that we don't leak such reservations, only to have them
 * come back if we re-enable.
 *
 * - enable simple quotas
 * - reserve space
 * - release it, store rsv_bytes in OE
 * - disable quotas
 * - enable simple quotas (qgroup rsv are all 0)
 * - OE finishes
 * - run delayed refs
 * - free rsv_bytes, resulting in miscounting or even underflow
 */
static int flush_reservations(struct btrfs_fs_info *fs_info)
{
	int ret;

	ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
	if (ret)
		return ret;
	btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);

	return btrfs_commit_current_transaction(fs_info->tree_root);
}

int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *quota_root = NULL;
	struct btrfs_trans_handle *trans = NULL;
	int ret = 0;

	/*
	 * We need to have subvol_sem write locked to prevent races with
	 * snapshot creation.
	 */
	lockdep_assert_held_write(&fs_info->subvol_sem);

	/*
	 * Relocation will mess with backrefs, so make sure we have the
	 * cleaner_mutex held to protect us from relocation.
	 */
	lockdep_assert_held(&fs_info->cleaner_mutex);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root)
		goto out;

	/*
	 * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
	 * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
	 * to lock that mutex while holding a transaction handle and the rescan
	 * worker needs to commit a transaction.
	 */
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	/*
	 * Request the qgroup rescan worker to complete and wait for it. This wait
	 * must be done before the transaction start for quota disable, since it
	 * may deadlock with the transaction used by the qgroup rescan worker.
	 */
	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	btrfs_qgroup_wait_for_completion(fs_info, false);

	/*
	 * We have nothing held here and no trans handle, just return the error
	 * if there is one and set back the quota enabled bit since we didn't
	 * actually disable quotas.
	 */
	ret = flush_reservations(fs_info);
	if (ret) {
		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
		return ret;
	}

	/*
	 * 1 for the root item
	 *
	 * We should also reserve enough items for the quota tree deletion in
	 * btrfs_clean_quota_tree but this is not done.
	 *
	 * Also, we must always start a transaction without holding the mutex
	 * qgroup_ioctl_lock, see btrfs_quota_enable().
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 1);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
		goto out;
	}

	if (!fs_info->quota_root)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	quota_root = fs_info->quota_root;
	fs_info->quota_root = NULL;
	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
	fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
	spin_unlock(&fs_info->qgroup_lock);

	btrfs_free_qgroup_config(fs_info);

	ret = btrfs_clean_quota_tree(trans, quota_root);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = btrfs_del_root(trans, &quota_root->root_key);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	spin_lock(&fs_info->trans_lock);
	list_del(&quota_root->dirty_list);
	spin_unlock(&fs_info->trans_lock);

	btrfs_tree_lock(quota_root->node);
	btrfs_clear_buffer_dirty(trans, quota_root->node);
	btrfs_tree_unlock(quota_root->node);
	ret = btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
				    quota_root->node, 0, 1);

	if (ret < 0)
		btrfs_abort_transaction(trans, ret);

out:
	btrfs_put_root(quota_root);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (ret && trans)
		btrfs_end_transaction(trans);
	else if (trans)
		ret = btrfs_commit_transaction(trans);
	return ret;
}

static void qgroup_dirty(struct btrfs_fs_info *fs_info,
			 struct btrfs_qgroup *qgroup)
{
	if (list_empty(&qgroup->dirty))
		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}

static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup)
{
	if (!list_empty(&qgroup->iterator))
		return;

	list_add_tail(&qgroup->iterator, head);
}

static void qgroup_iterator_clean(struct list_head *head)
{
	while (!list_empty(head)) {
		struct btrfs_qgroup *qgroup;

		qgroup = list_first_entry(head, struct btrfs_qgroup, iterator);
		list_del_init(&qgroup->iterator);
	}
}

/*
 * The easy accounting: we're updating a qgroup relationship whose child
 * qgroup only has exclusive extents.
 *
 * In this case, all exclusive extents will also be exclusive for the parent,
 * so excl/rfer just get added/removed.
 *
 * The same applies to qgroup reservation space, which must also be added
 * to/removed from the parent.  Otherwise, when the child releases
 * reservation space, the parent would underflow its reservation (for the
 * relation-adding case).
 *
 * Caller should hold fs_info->qgroup_lock.
 */
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
				    struct btrfs_qgroup *src, int sign)
{
	struct btrfs_qgroup *qgroup;
	LIST_HEAD(qgroup_list);
	u64 num_bytes = src->excl;
	u64 num_bytes_cmpr = src->excl_cmpr;
	int ret = 0;

	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(qgroup, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		qgroup->rfer += sign * num_bytes;
		qgroup->rfer_cmpr += sign * num_bytes_cmpr;

		WARN_ON(sign < 0 && qgroup->excl < num_bytes);
		WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr);
		qgroup->excl += sign * num_bytes;
		qgroup->excl_cmpr += sign * num_bytes_cmpr;

		if (sign > 0)
			qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
		else
			qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
		qgroup_dirty(fs_info, qgroup);

		/* Append parent qgroups to @qgroup_list. */
		list_for_each_entry(glist, &qgroup->groups, next_group)
			qgroup_iterator_add(&qgroup_list, glist->group);
	}
	ret = 0;
out:
	qgroup_iterator_clean(&qgroup_list);
	return ret;
}

/*
 * Quick path for updating qgroups with only excl refs.
 *
 * In that case, updating all parents is enough.  Otherwise we need a full
 * rescan.
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 for a quick update; return >0 when a full rescan is needed,
 * marking the INCONSISTENT flag.
 * Return <0 for other errors.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
				   u64 src, u64 dst, int sign)
{
	struct btrfs_qgroup *qgroup;
	int ret = 1;

	qgroup = find_qgroup_rb(fs_info, src);
	if (!qgroup)
		goto out;
	if (qgroup->excl == qgroup->rfer) {
		ret = __qgroup_excl_accounting(fs_info, dst, qgroup, sign);
		if (ret < 0)
			goto out;
		ret = 0;
	}
out:
	if (ret)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}

/*
 * Add relation between @src and @dst qgroup. The @prealloc is allocated by the
 * callers and transferred here (either used or freed on error).
 */
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst,
			      struct btrfs_qgroup_list *prealloc)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	int ret = 0;

	ASSERT(prealloc);

	/* Check the level of src and dst first */
	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) {
		kfree(prealloc);
		return -EINVAL;
	}

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	if (!member || !parent) {
		ret = -EINVAL;
		goto out;
	}

	/* First check whether the relation already exists. */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			ret = -EEXIST;
			goto out;
		}
	}

	ret = add_qgroup_relation_item(trans, src, dst);
	if (ret)
		goto out;

	ret = add_qgroup_relation_item(trans, dst, src);
	if (ret) {
		del_qgroup_relation_item(trans, src, dst);
		goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	ret = __add_relation_rb(prealloc, member, parent);
	prealloc = NULL;
	if (ret < 0) {
		spin_unlock(&fs_info->qgroup_lock);
		goto out;
	}
	ret = quick_update_accounting(fs_info, src, dst, 1);
	spin_unlock(&fs_info->qgroup_lock);
out:
	kfree(prealloc);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}

static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
				 u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	bool found = false;
	int ret = 0;
	int ret2;

	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	/*
	 * If the parent/member pair doesn't exist, just try to delete the
	 * dead relation items.
	 */
	if (!member || !parent)
		goto delete_item;

	/* First check whether the relation exists. */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			found = true;
			break;
		}
	}

delete_item:
	ret = del_qgroup_relation_item(trans, src, dst);
	if (ret < 0 && ret != -ENOENT)
		goto out;
	ret2 = del_qgroup_relation_item(trans, dst, src);
	if (ret2 < 0 && ret2 != -ENOENT)
		goto out;

	/* At least one deletion succeeded, return 0 */
	if (!ret || !ret2)
		ret = 0;

	if (found) {
		spin_lock(&fs_info->qgroup_lock);
		del_relation_rb(fs_info, src, dst);
		ret = quick_update_accounting(fs_info, src, dst, -1);
		spin_unlock(&fs_info->qgroup_lock);
	}
out:
	return ret;
}

int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	ret = __del_qgroup_relation(trans, src, dst);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	return ret;
}

int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup *prealloc = NULL;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	quota_root = fs_info->quota_root;
	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (qgroup) {
		ret = -EEXIST;
		goto out;
	}

	prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
	if (!prealloc) {
		ret = -ENOMEM;
		goto out;
	}

	ret = add_qgroup_item(trans, quota_root, qgroupid);
	if (ret)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);
	prealloc = NULL;

	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	kfree(prealloc);
	return ret;
}

/*
 * Return 0 if we cannot delete the qgroup (not empty or has children etc).
 * Return >0 if we can delete the qgroup.
 * Return <0 for other errors during tree search.
 */
static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup)
{
	struct btrfs_key key;
	BTRFS_PATH_AUTO_FREE(path);

	/*
	 * Squota would never be inconsistent, but there can still be cases
	 * where a dropped subvolume still has qgroup numbers, and squota
	 * relies on such qgroups for future accounting.
	 *
	 * So for squota, do not allow dropping any non-zero qgroup.
	 */
	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
	    (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr || qgroup->rfer_cmpr))
		return 0;

	/* For a higher level qgroup, we can only delete it if it has no child. */
	if (btrfs_qgroup_level(qgroup->qgroupid)) {
		if (!list_empty(&qgroup->members))
			return 0;
		return 1;
	}

	/*
	 * For a level-0 qgroup, we can only delete it if there is no subvolume
	 * for it.
	 * This means that even if a subvolume is unlinked but not yet fully
	 * dropped, we cannot delete its qgroup.
	 */
	key.objectid = qgroup->qgroupid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = -1ULL;
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * The @ret from btrfs_find_root() exactly matches our definition for
	 * the return value, thus can be returned directly.
	 */
	return btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
}

int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *list;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	ret = can_delete_qgroup(fs_info, qgroup);
	if (ret < 0)
		goto out;
	if (ret == 0) {
		ret = -EBUSY;
		goto out;
	}

	/* Check if there are no children of this qgroup */
	if (!list_empty(&qgroup->members)) {
		ret = -EBUSY;
		goto out;
	}

	ret = del_qgroup_item(trans, qgroupid);
	if (ret && ret != -ENOENT)
		goto out;

	while (!list_empty(&qgroup->groups)) {
		list = list_first_entry(&qgroup->groups,
					struct btrfs_qgroup_list, next_group);
		ret = __del_qgroup_relation(trans, qgroupid,
					    list->group->qgroupid);
		if (ret)
			goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	/*
	 * Warn on reserved space. The qgroup should have no child and no
	 * corresponding subvolume at this point; thus its reserved space
	 * should all be zero, no matter whether the qgroup is consistent or
	 * which mode is in use.
	 */
	if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
	    qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
	    qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) {
		DEBUG_WARN();
		btrfs_warn_rl(fs_info,
"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
			      btrfs_qgroup_level(qgroup->qgroupid),
			      btrfs_qgroup_subvolid(qgroup->qgroupid),
			      qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA],
			      qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC],
			      qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
	}
	/*
	 * The same for rfer/excl numbers, but that's only if our qgroup is
	 * consistent and if it's in regular qgroup mode.
	 * For simple mode it's not as accurate, thus we can hit non-zero
	 * values very frequently.
	 */
	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
	    !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
		if (qgroup->rfer || qgroup->excl ||
		    qgroup->rfer_cmpr || qgroup->excl_cmpr) {
			DEBUG_WARN();
			qgroup_mark_inconsistent(fs_info,
"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
					btrfs_qgroup_level(qgroup->qgroupid),
					btrfs_qgroup_subvolid(qgroup->qgroupid),
					qgroup->rfer, qgroup->rfer_cmpr,
					qgroup->excl, qgroup->excl_cmpr);
		}
	}
	del_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	/*
	 * Remove the qgroup from sysfs now without holding the qgroup_lock
	 * spinlock, since the sysfs_remove_group() function needs to take
	 * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
	 */
	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
	kfree(qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}

int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid)
{
	struct btrfs_trans_handle *trans;
	int ret;

	if (!btrfs_is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) ||
	    !fs_info->quota_root)
		return 0;

	/*
	 * Commit the current transaction to make sure all the rfer/excl
	 * numbers get updated.
	 */
	ret = btrfs_commit_current_transaction(fs_info->quota_root);
	if (ret < 0)
		return ret;

	/* Start a new trans to delete the qgroup info and limit items. */
	trans = btrfs_start_transaction(fs_info->quota_root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
	ret = btrfs_remove_qgroup(trans, subvolid);
	btrfs_end_transaction(trans);
	/*
	 * Either it's squota and the subvolume still has numbers needed for
	 * future accounting, in which case we cannot delete it and just skip
	 * it, or the qgroup was already removed by a qgroup rescan. In both
	 * cases we're safe to ignore the error.
	 */
	if (ret == -EBUSY || ret == -ENOENT)
		ret = 0;
	return ret;
}

int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
		       struct btrfs_qgroup_limit *limit)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	int ret = 0;
	/*
	 * Sometimes we would want to clear the limit on this qgroup.
	 * To meet this requirement, we treat -1 as a special value
	 * which tells the kernel to clear the limit on this qgroup.
	 */
	const u64 CLEAR_VALUE = -1;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
		if (limit->max_rfer == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			qgroup->max_rfer = 0;
		} else {
			qgroup->max_rfer = limit->max_rfer;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
		if (limit->max_excl == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			qgroup->max_excl = 0;
		} else {
			qgroup->max_excl = limit->max_excl;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
		if (limit->rsv_rfer == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			qgroup->rsv_rfer = 0;
		} else {
			qgroup->rsv_rfer = limit->rsv_rfer;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
		if (limit->rsv_excl == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			qgroup->rsv_excl = 0;
		} else {
			qgroup->rsv_excl = limit->rsv_excl;
		}
	}
	qgroup->lim_flags |= limit->flags;

	spin_unlock(&fs_info->qgroup_lock);

	ret = update_qgroup_limit_item(trans, qgroup);
	if (ret)
		qgroup_mark_inconsistent(fs_info, "qgroup item update error %d", ret);

out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}
/*
 * Inform qgroup to trace one dirty extent, its info is recorded in @record,
 * so qgroup can account for it at transaction commit time.
 *
 * No-lock version: the caller must hold the delayed ref lock and have
 * allocated the memory, then call btrfs_qgroup_trace_extent_post() after
 * exiting the lock context.
 *
 * Return 0 for a successful insertion.
 * Return >0 for an existing record, the caller can free @record safely.
 * Return <0 for an insertion failure, the caller can free @record safely.
 */
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
				     struct btrfs_delayed_ref_root *delayed_refs,
				     struct btrfs_qgroup_extent_record *record,
				     u64 bytenr)
{
	struct btrfs_qgroup_extent_record *existing, *ret;
	const unsigned long index = (bytenr >> fs_info->sectorsize_bits);

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 1;

#if BITS_PER_LONG == 32
	if (bytenr >= MAX_LFS_FILESIZE) {
		btrfs_err_rl(fs_info,
"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit",
			     bytenr);
		btrfs_err_32bit_limit(fs_info);
		return -EOVERFLOW;
	}
#endif

	trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr);

	xa_lock(&delayed_refs->dirty_extents);
	existing = xa_load(&delayed_refs->dirty_extents, index);
	if (existing) {
		if (record->data_rsv && !existing->data_rsv) {
			existing->data_rsv = record->data_rsv;
			existing->data_rsv_refroot = record->data_rsv_refroot;
		}
		xa_unlock(&delayed_refs->dirty_extents);
		return 1;
	}

	ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC);
	xa_unlock(&delayed_refs->dirty_extents);
	if (xa_is_err(ret)) {
		qgroup_mark_inconsistent(fs_info, "xarray insert error: %d", xa_err(ret));
		return xa_err(ret);
	}

	return 0;
}

/*
 * Post handler after qgroup_trace_extent_nolock().
 *
 * NOTE: The current qgroup code does the expensive backref walk at
 * transaction commit time with TRANS_STATE_COMMIT_DOING, which blocks
 * incoming new transactions.
 * This is designed to allow btrfs_find_all_roots() to get a correct
 * new_roots result.
 *
 * However for old_roots there is no need to do the backref walk at that
 * time, since we search commit roots for that walk, so the result will
 * always be correct.
 *
 * Due to the lockless nature of the _nolock version, we can't do the
 * backref walk there, so we must call btrfs_qgroup_trace_extent_post()
 * after exiting the spinlock context.
 *
 * TODO: If we can fix and prove btrfs_find_all_roots() can get a correct
 * result using the current root, then we can move all the expensive backref
 * walks out of transaction commit, but not now as qgroup accounting would be
 * wrong again.
 */
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
				   struct btrfs_qgroup_extent_record *qrecord,
				   u64 bytenr)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_backref_walk_ctx ctx = {
		.bytenr = bytenr,
		.fs_info = fs_info,
	};
	int ret;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;
	/*
	 * We are always called in a context where we are already holding a
	 * transaction handle.
Often we are called when adding a data delayed 2031 * reference from btrfs_truncate_inode_items() (truncating or unlinking), 2032 * in which case we will be holding a write lock on extent buffer from a 2033 * subvolume tree. In this case we can't allow btrfs_find_all_roots() to 2034 * acquire fs_info->commit_root_sem, because that is a higher level lock 2035 * that must be acquired before locking any extent buffers. 2036 * 2037 * So we want btrfs_find_all_roots() to not acquire the commit_root_sem 2038 * but we can't pass it a non-NULL transaction handle, because otherwise 2039 * it would not use commit roots and would lock extent buffers, causing 2040 * a deadlock if it ends up trying to read lock the same extent buffer 2041 * that was previously write locked at btrfs_truncate_inode_items(). 2042 * 2043 * So pass a NULL transaction handle to btrfs_find_all_roots() and 2044 * explicitly tell it to not acquire the commit_root_sem - if we are 2045 * holding a transaction handle we don't need its protection. 2046 */ 2047 ASSERT(trans != NULL); 2048 2049 if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) 2050 return 0; 2051 2052 ret = btrfs_find_all_roots(&ctx, true); 2053 if (ret < 0) { 2054 qgroup_mark_inconsistent(fs_info, 2055 "error accounting new delayed refs extent: %d", ret); 2056 return 0; 2057 } 2058 2059 /* 2060 * Here we don't need to get the lock of 2061 * trans->transaction->delayed_refs, since inserted qrecord won't 2062 * be deleted, only qrecord->node may be modified (new qrecord insert) 2063 * 2064 * So modifying qrecord->old_roots is safe here 2065 */ 2066 qrecord->old_roots = ctx.roots; 2067 return 0; 2068} 2069 2070/* 2071 * Inform qgroup to trace one dirty extent, specified by @bytenr and 2072 * @num_bytes. 2073 * So qgroup can account it at commit trans time. 2074 * 2075 * Better encapsulated version, with memory allocation and backref walk for 2076 * commit roots. 2077 * So this can sleep. 2078 * 2079 * Return 0 if the operation is done. 2080 * Return <0 for error, like memory allocation failure or invalid parameter 2081 * (NULL trans) 2082 */ 2083int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, 2084 u64 num_bytes) 2085{ 2086 struct btrfs_fs_info *fs_info = trans->fs_info; 2087 struct btrfs_qgroup_extent_record *record; 2088 struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; 2089 const unsigned long index = (bytenr >> fs_info->sectorsize_bits); 2090 int ret; 2091 2092 if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) 2093 return 0; 2094 record = kzalloc(sizeof(*record), GFP_NOFS); 2095 if (!record) 2096 return -ENOMEM; 2097 2098 if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { 2099 kfree(record); 2100 return -ENOMEM; 2101 } 2102 2103 record->num_bytes = num_bytes; 2104 2105 ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr); 2106 if (ret) { 2107 /* Clean up if insertion fails or item exists. 
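		 * A return value > 0 means the record already existed, < 0
		 * means the xarray insertion failed; either way the caller
		 * still owns @record and the reserved slot, so release both.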
*/ 2108 xa_release(&delayed_refs->dirty_extents, index); 2109 kfree(record); 2110 return 0; 2111 } 2112 return btrfs_qgroup_trace_extent_post(trans, record, bytenr); 2113} 2114 2115/* 2116 * Inform qgroup to trace all leaf items of data 2117 * 2118 * Return 0 for success 2119 * Return <0 for error(ENOMEM) 2120 */ 2121int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, 2122 struct extent_buffer *eb) 2123{ 2124 struct btrfs_fs_info *fs_info = trans->fs_info; 2125 int nr = btrfs_header_nritems(eb); 2126 int i, extent_type, ret; 2127 struct btrfs_key key; 2128 struct btrfs_file_extent_item *fi; 2129 u64 bytenr, num_bytes; 2130 2131 /* We can be called directly from walk_up_proc() */ 2132 if (!btrfs_qgroup_full_accounting(fs_info)) 2133 return 0; 2134 2135 for (i = 0; i < nr; i++) { 2136 btrfs_item_key_to_cpu(eb, &key, i); 2137 2138 if (key.type != BTRFS_EXTENT_DATA_KEY) 2139 continue; 2140 2141 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 2142 /* filter out non qgroup-accountable extents */ 2143 extent_type = btrfs_file_extent_type(eb, fi); 2144 2145 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 2146 continue; 2147 2148 bytenr = btrfs_file_extent_disk_bytenr(eb, fi); 2149 if (!bytenr) 2150 continue; 2151 2152 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 2153 2154 ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); 2155 if (ret) 2156 return ret; 2157 } 2158 cond_resched(); 2159 return 0; 2160} 2161 2162/* 2163 * Walk up the tree from the bottom, freeing leaves and any interior 2164 * nodes which have had all slots visited. If a node (leaf or 2165 * interior) is freed, the node above it will have it's slot 2166 * incremented. The root node will never be freed. 2167 * 2168 * At the end of this function, we should have a path which has all 2169 * slots incremented to the next position for a search. If we need to 2170 * read a new node it will be NULL and the node above it will have the 2171 * correct slot selected for a later read. 2172 * 2173 * If we increment the root nodes slot counter past the number of 2174 * elements, 1 is returned to signal completion of the search. 2175 */ 2176static int adjust_slots_upwards(struct btrfs_path *path, int root_level) 2177{ 2178 int level = 0; 2179 int nr, slot; 2180 struct extent_buffer *eb; 2181 2182 if (root_level == 0) 2183 return 1; 2184 2185 while (level <= root_level) { 2186 eb = path->nodes[level]; 2187 nr = btrfs_header_nritems(eb); 2188 path->slots[level]++; 2189 slot = path->slots[level]; 2190 if (slot >= nr || level == 0) { 2191 /* 2192 * Don't free the root - we will detect this 2193 * condition after our loop and return a 2194 * positive value for caller to stop walking the tree. 2195 */ 2196 if (level != root_level) { 2197 btrfs_tree_unlock_rw(eb, path->locks[level]); 2198 path->locks[level] = 0; 2199 2200 free_extent_buffer(eb); 2201 path->nodes[level] = NULL; 2202 path->slots[level] = 0; 2203 } 2204 } else { 2205 /* 2206 * We have a valid slot to walk back down 2207 * from. Stop here so caller can process these 2208 * new nodes. 2209 */ 2210 break; 2211 } 2212 2213 level++; 2214 } 2215 2216 eb = path->nodes[root_level]; 2217 if (path->slots[root_level] >= btrfs_header_nritems(eb)) 2218 return 1; 2219 2220 return 0; 2221} 2222 2223/* 2224 * Helper function to trace a subtree tree block swap. 2225 * 2226 * The swap will happen in highest tree block, but there may be a lot of 2227 * tree blocks involved. 
 *
 * For example:
 * OO = Old tree blocks
 * NN = New tree blocks allocated during balance
 *
 *           File tree (257)                  Reloc tree for 257
 * L2              OO                                NN
 *               /    \                            /    \
 * L1          OO      OO (a)                    OO      NN (a)
 *            / \     / \                       / \     / \
 * L0       OO   OO OO   OO                   OO   OO NN   NN
 *                  (b)  (c)                          (b)  (c)
 *
 * When calling qgroup_trace_extent_swap(), we will pass:
 * @src_eb = OO(a)
 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
 * @dst_level = 0
 * @root_level = 1
 *
 * In that case, qgroup_trace_extent_swap() will search from OO(a) to
 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
 *
 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
 *
 * 1) Tree search from @src_eb
 *    It should act as a simplified btrfs_search_slot().
 *    The key for the search can be extracted from @dst_path->nodes[dst_level]
 *    (first key).
 *
 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
 *    NOTE: In the above case, OO(a) and NN(a) won't be marked qgroup dirty.
 *    They should be marked during the previous (@dst_level = 1) iteration.
 *
 * 3) Mark file extents in leaves dirty
 *    We don't have a good way to pick out new file extents only.
 *    So we still follow the old method by scanning all file extents in
 *    the leaf.
 *
 * This function can free us from keeping two paths, thus later we only need
 * to care about how to iterate all new tree blocks in the reloc tree.
 */
static int qgroup_trace_extent_swap(struct btrfs_trans_handle *trans,
				    struct extent_buffer *src_eb,
				    struct btrfs_path *dst_path,
				    int dst_level, int root_level,
				    bool trace_leaf)
{
	struct btrfs_key key;
	BTRFS_PATH_AUTO_FREE(src_path);
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u32 nodesize = fs_info->nodesize;
	int cur_level = root_level;
	int ret;

	BUG_ON(dst_level > root_level);
	/* Level mismatch */
	if (btrfs_header_level(src_eb) != root_level)
		return -EINVAL;

	src_path = btrfs_alloc_path();
	if (!src_path)
		return -ENOMEM;

	if (dst_level)
		btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
	else
		btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);

	/* For src_path */
	refcount_inc(&src_eb->refs);
	src_path->nodes[root_level] = src_eb;
	src_path->slots[root_level] = dst_path->slots[root_level];
	src_path->locks[root_level] = 0;

	/* A simplified version of btrfs_search_slot() */
	while (cur_level >= dst_level) {
		struct btrfs_key src_key;
		struct btrfs_key dst_key;

		if (src_path->nodes[cur_level] == NULL) {
			struct extent_buffer *eb;
			int parent_slot;

			eb = src_path->nodes[cur_level + 1];
			parent_slot = src_path->slots[cur_level + 1];

			eb = btrfs_read_node_slot(eb, parent_slot);
			if (IS_ERR(eb))
				return PTR_ERR(eb);

			src_path->nodes[cur_level] = eb;

			btrfs_tree_read_lock(eb);
			src_path->locks[cur_level] = BTRFS_READ_LOCK;
		}

		src_path->slots[cur_level] = dst_path->slots[cur_level];
		if (cur_level) {
			btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
					      &dst_key, dst_path->slots[cur_level]);
			btrfs_node_key_to_cpu(src_path->nodes[cur_level],
					      &src_key, src_path->slots[cur_level]);
		} else {
			btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
					      &dst_key, dst_path->slots[cur_level]);
			btrfs_item_key_to_cpu(src_path->nodes[cur_level],
					      &src_key, src_path->slots[cur_level]);
		}
		/* Content mismatch, something went wrong */
		if (btrfs_comp_cpu_keys(&dst_key, &src_key))
			return -ENOENT;
		cur_level--;
	}

	/*
	 * Now both @dst_path and @src_path have been populated, record the tree
	 * blocks for qgroup accounting.
	 */
	ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
					nodesize);
	if (ret < 0)
		return ret;
	ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
					nodesize);
	if (ret < 0)
		return ret;

	/* Record leaf file extents */
	if (dst_level == 0 && trace_leaf) {
		ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
		if (ret < 0)
			return ret;
		ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
	}

	return ret;
}

/*
 * Helper function to do a recursive generation-aware depth-first search, to
 * locate all new tree blocks in a subtree of a reloc tree.
 *
 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
 *
 *        reloc tree
 * L2        NN (a)
 *          /    \
 * L1    OO      NN (b)
 *      /  \    /  \
 * L0  OO  OO  OO  NN
 *             (c) (d)
 * If we pass:
 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
 * @cur_level = 1
 * @root_level = 1
 *
 * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
 * those tree blocks along with their counterparts in the file tree.
 * During the search, old tree blocks like OO(c) will be skipped as the tree
 * block swap won't affect OO(c).
 */
static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle *trans,
					   struct extent_buffer *src_eb,
					   struct btrfs_path *dst_path,
					   int cur_level, int root_level,
					   u64 last_snapshot, bool trace_leaf)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_buffer *eb;
	bool need_cleanup = false;
	int ret = 0;
	int i;

	/* Level sanity check */
	if (unlikely(cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
		     root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
		     root_level < cur_level)) {
		btrfs_err_rl(fs_info,
			     "%s: bad levels, cur_level=%d root_level=%d",
			     __func__, cur_level, root_level);
		return -EUCLEAN;
	}

	/* Read the tree block if needed */
	if (dst_path->nodes[cur_level] == NULL) {
		int parent_slot;
		u64 child_gen;

		/*
		 * dst_path->nodes[root_level] must be initialized before
		 * calling this function.
		 */
		if (unlikely(cur_level == root_level)) {
			btrfs_err_rl(fs_info,
	"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
				     __func__, root_level, root_level, cur_level);
			return -EUCLEAN;
		}

		/*
		 * We need to get the child blockptr/gen from the parent before
		 * we can read it.
		 */
		eb = dst_path->nodes[cur_level + 1];
		parent_slot = dst_path->slots[cur_level + 1];
		child_gen = btrfs_node_ptr_generation(eb, parent_slot);

		/* This node is old, no need to trace */
		if (child_gen < last_snapshot)
			goto out;

		eb = btrfs_read_node_slot(eb, parent_slot);
		if (IS_ERR(eb)) {
			ret = PTR_ERR(eb);
			goto out;
		}

		dst_path->nodes[cur_level] = eb;
		dst_path->slots[cur_level] = 0;

		btrfs_tree_read_lock(eb);
		dst_path->locks[cur_level] = BTRFS_READ_LOCK;
		need_cleanup = true;
	}

	/* Now record this tree block and its counterpart for qgroups */
	ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
				       root_level, trace_leaf);
	if (ret < 0)
		goto cleanup;

	eb = dst_path->nodes[cur_level];

	if (cur_level > 0) {
		/* Iterate all child tree blocks */
		for (i = 0; i < btrfs_header_nritems(eb); i++) {
			/* Skip old tree blocks as they won't be swapped */
			if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
				continue;
			dst_path->slots[cur_level] = i;

			/* Recursive call (at most 7 times) */
			ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
					dst_path, cur_level - 1, root_level,
					last_snapshot, trace_leaf);
			if (ret < 0)
				goto cleanup;
		}
	}

cleanup:
	if (need_cleanup) {
		/* Clean up */
		btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
				     dst_path->locks[cur_level]);
		free_extent_buffer(dst_path->nodes[cur_level]);
		dst_path->nodes[cur_level] = NULL;
		dst_path->slots[cur_level] = 0;
		dst_path->locks[cur_level] = 0;
	}
out:
	return ret;
}

static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
				     struct extent_buffer *src_eb,
				     struct extent_buffer *dst_eb,
				     u64 last_snapshot, bool trace_leaf)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_path *dst_path = NULL;
	int level;
	int ret;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;

	/* Wrong parameter order */
	if (unlikely(btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb))) {
		btrfs_err_rl(fs_info,
		"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
			     btrfs_header_generation(src_eb),
			     btrfs_header_generation(dst_eb));
		return -EUCLEAN;
	}

	if (unlikely(!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb))) {
		ret = -EIO;
		goto out;
	}

	level = btrfs_header_level(dst_eb);
	dst_path = btrfs_alloc_path();
	if (!dst_path) {
		ret = -ENOMEM;
		goto out;
	}
	/* For dst_path */
	refcount_inc(&dst_eb->refs);
	dst_path->nodes[level] = dst_eb;
	dst_path->slots[level] = 0;
	dst_path->locks[level] = 0;

	/* Do the generation-aware depth-first search */
	ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
					      level, last_snapshot, trace_leaf);
	if (ret < 0)
		goto out;
	ret = 0;

out:
	btrfs_free_path(dst_path);
	if (ret < 0)
		qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret);
	return ret;
}

/*
 * Inform qgroup to trace a whole subtree, including all its child tree
 * blocks and data.
 * The root tree block is specified by @root_eb.
 *
 * Normally used by relocation (tree block swap) and subvolume deletion.
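 *
 * A minimal call sketch (hypothetical caller, for illustration only):
 *
 *	ret = btrfs_qgroup_trace_subtree(trans, root_eb,
 *					 btrfs_header_generation(root_eb),
 *					 btrfs_header_level(root_eb));
 *	if (ret < 0)
 *		goto abort;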
2550 * 2551 * Return 0 for success 2552 * Return <0 for error(ENOMEM or tree search error) 2553 */ 2554int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, 2555 struct extent_buffer *root_eb, 2556 u64 root_gen, int root_level) 2557{ 2558 struct btrfs_fs_info *fs_info = trans->fs_info; 2559 int ret = 0; 2560 int level; 2561 u8 drop_subptree_thres; 2562 struct extent_buffer *eb = root_eb; 2563 BTRFS_PATH_AUTO_FREE(path); 2564 2565 ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL); 2566 ASSERT(root_eb != NULL); 2567 2568 if (!btrfs_qgroup_full_accounting(fs_info)) 2569 return 0; 2570 2571 spin_lock(&fs_info->qgroup_lock); 2572 drop_subptree_thres = fs_info->qgroup_drop_subtree_thres; 2573 spin_unlock(&fs_info->qgroup_lock); 2574 2575 /* 2576 * This function only gets called for snapshot drop, if we hit a high 2577 * node here, it means we are going to change ownership for quite a lot 2578 * of extents, which will greatly slow down btrfs_commit_transaction(). 2579 * 2580 * So here if we find a high tree here, we just skip the accounting and 2581 * mark qgroup inconsistent. 2582 */ 2583 if (root_level >= drop_subptree_thres) { 2584 qgroup_mark_inconsistent(fs_info, "subtree level reached threshold"); 2585 return 0; 2586 } 2587 2588 if (!extent_buffer_uptodate(root_eb)) { 2589 struct btrfs_tree_parent_check check = { 2590 .transid = root_gen, 2591 .level = root_level 2592 }; 2593 2594 ret = btrfs_read_extent_buffer(root_eb, &check); 2595 if (ret) 2596 return ret; 2597 } 2598 2599 if (root_level == 0) { 2600 ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); 2601 return ret; 2602 } 2603 2604 path = btrfs_alloc_path(); 2605 if (!path) 2606 return -ENOMEM; 2607 2608 /* 2609 * Walk down the tree. Missing extent blocks are filled in as 2610 * we go. Metadata is accounted every time we read a new 2611 * extent block. 2612 * 2613 * When we reach a leaf, we account for file extent items in it, 2614 * walk back up the tree (adjusting slot pointers as we go) 2615 * and restart the search process. 2616 */ 2617 refcount_inc(&root_eb->refs); /* For path */ 2618 path->nodes[root_level] = root_eb; 2619 path->slots[root_level] = 0; 2620 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ 2621walk_down: 2622 level = root_level; 2623 while (level >= 0) { 2624 if (path->nodes[level] == NULL) { 2625 int parent_slot; 2626 u64 child_bytenr; 2627 2628 /* 2629 * We need to get child blockptr from parent before we 2630 * can read it. 
2631 */ 2632 eb = path->nodes[level + 1]; 2633 parent_slot = path->slots[level + 1]; 2634 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 2635 2636 eb = btrfs_read_node_slot(eb, parent_slot); 2637 if (IS_ERR(eb)) 2638 return PTR_ERR(eb); 2639 2640 path->nodes[level] = eb; 2641 path->slots[level] = 0; 2642 2643 btrfs_tree_read_lock(eb); 2644 path->locks[level] = BTRFS_READ_LOCK; 2645 2646 ret = btrfs_qgroup_trace_extent(trans, child_bytenr, 2647 fs_info->nodesize); 2648 if (ret) 2649 return ret; 2650 } 2651 2652 if (level == 0) { 2653 ret = btrfs_qgroup_trace_leaf_items(trans, 2654 path->nodes[level]); 2655 if (ret) 2656 return ret; 2657 2658 /* Nonzero return here means we completed our search */ 2659 ret = adjust_slots_upwards(path, root_level); 2660 if (ret) 2661 break; 2662 2663 /* Restart search with new slots */ 2664 goto walk_down; 2665 } 2666 2667 level--; 2668 } 2669 2670 return 0; 2671} 2672 2673static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup) 2674{ 2675 if (!list_empty(&qgroup->nested_iterator)) 2676 return; 2677 2678 list_add_tail(&qgroup->nested_iterator, head); 2679} 2680 2681static void qgroup_iterator_nested_clean(struct list_head *head) 2682{ 2683 while (!list_empty(head)) { 2684 struct btrfs_qgroup *qgroup; 2685 2686 qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator); 2687 list_del_init(&qgroup->nested_iterator); 2688 } 2689} 2690 2691#define UPDATE_NEW 0 2692#define UPDATE_OLD 1 2693/* 2694 * Walk all of the roots that points to the bytenr and adjust their refcnts. 2695 */ 2696static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info, 2697 struct ulist *roots, struct list_head *qgroups, 2698 u64 seq, bool update_old) 2699{ 2700 struct ulist_node *unode; 2701 struct ulist_iterator uiter; 2702 struct btrfs_qgroup *qg; 2703 2704 if (!roots) 2705 return; 2706 ULIST_ITER_INIT(&uiter); 2707 while ((unode = ulist_next(roots, &uiter))) { 2708 LIST_HEAD(tmp); 2709 2710 qg = find_qgroup_rb(fs_info, unode->val); 2711 if (!qg) 2712 continue; 2713 2714 qgroup_iterator_nested_add(qgroups, qg); 2715 qgroup_iterator_add(&tmp, qg); 2716 list_for_each_entry(qg, &tmp, iterator) { 2717 struct btrfs_qgroup_list *glist; 2718 2719 if (update_old) 2720 btrfs_qgroup_update_old_refcnt(qg, seq, 1); 2721 else 2722 btrfs_qgroup_update_new_refcnt(qg, seq, 1); 2723 2724 list_for_each_entry(glist, &qg->groups, next_group) { 2725 qgroup_iterator_nested_add(qgroups, glist->group); 2726 qgroup_iterator_add(&tmp, glist->group); 2727 } 2728 } 2729 qgroup_iterator_clean(&tmp); 2730 } 2731} 2732 2733/* 2734 * Update qgroup rfer/excl counters. 2735 * Rfer update is easy, codes can explain themselves. 2736 * 2737 * Excl update is tricky, the update is split into 2 parts. 2738 * Part 1: Possible exclusive <-> sharing detect: 2739 * | A | !A | 2740 * ------------------------------------- 2741 * B | * | - | 2742 * ------------------------------------- 2743 * !B | + | ** | 2744 * ------------------------------------- 2745 * 2746 * Conditions: 2747 * A: cur_old_roots < nr_old_roots (not exclusive before) 2748 * !A: cur_old_roots == nr_old_roots (possible exclusive before) 2749 * B: cur_new_roots < nr_new_roots (not exclusive now) 2750 * !B: cur_new_roots == nr_new_roots (possible exclusive now) 2751 * 2752 * Results: 2753 * +: Possible sharing -> exclusive -: Possible exclusive -> sharing 2754 * *: Definitely not changed. **: Possible unchanged. 2755 * 2756 * For !A and !B condition, the exception is cur_old/new_roots == 0 case. 
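 *
 * Worked example (for illustration): an extent referenced only by subvolume
 * S becomes shared with a new snapshot T. Then nr_old_roots = 1,
 * nr_new_roots = 2, and for S's qgroup cur_old_count = 1 and
 * cur_new_count = 1, i.e. condition !A with B, result "-": the extent went
 * from exclusive to shared, so S's excl drops by num_bytes while its rfer
 * stays the same.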
 *
 * To make the logic clear, we first use condition A and B to split the
 * combination into 4 results.
 *
 * Then, for results "+" and "-", check the old/new_roots == 0 case, as in
 * them only one variant may be 0.
 *
 * Lastly, check result **: since both variants may be 0 there, split it
 * again (2x2).
 * But this time we don't need to consider other things; the code and logic
 * are easy to understand now.
 */
static void qgroup_update_counters(struct btrfs_fs_info *fs_info,
				   struct list_head *qgroups, u64 nr_old_roots,
				   u64 nr_new_roots, u64 num_bytes, u64 seq)
{
	struct btrfs_qgroup *qg;

	list_for_each_entry(qg, qgroups, nested_iterator) {
		u64 cur_new_count, cur_old_count;
		bool dirty = false;

		cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
		cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);

		trace_btrfs_qgroup_update_counters(fs_info, qg, cur_old_count,
						   cur_new_count);

		/* Rfer update part */
		if (cur_old_count == 0 && cur_new_count > 0) {
			qg->rfer += num_bytes;
			qg->rfer_cmpr += num_bytes;
			dirty = true;
		}
		if (cur_old_count > 0 && cur_new_count == 0) {
			qg->rfer -= num_bytes;
			qg->rfer_cmpr -= num_bytes;
			dirty = true;
		}

		/* Excl update part */
		/* Exclusive/none -> shared case */
		if (cur_old_count == nr_old_roots &&
		    cur_new_count < nr_new_roots) {
			/* Exclusive -> shared */
			if (cur_old_count != 0) {
				qg->excl -= num_bytes;
				qg->excl_cmpr -= num_bytes;
				dirty = true;
			}
		}

		/* Shared -> exclusive/none case */
		if (cur_old_count < nr_old_roots &&
		    cur_new_count == nr_new_roots) {
			/* Shared->exclusive */
			if (cur_new_count != 0) {
				qg->excl += num_bytes;
				qg->excl_cmpr += num_bytes;
				dirty = true;
			}
		}

		/* Exclusive/none -> exclusive/none case */
		if (cur_old_count == nr_old_roots &&
		    cur_new_count == nr_new_roots) {
			if (cur_old_count == 0) {
				/* None -> exclusive/none */

				if (cur_new_count != 0) {
					/* None -> exclusive */
					qg->excl += num_bytes;
					qg->excl_cmpr += num_bytes;
					dirty = true;
				}
				/* None -> none, nothing changed */
			} else {
				/* Exclusive -> exclusive/none */

				if (cur_new_count == 0) {
					/* Exclusive -> none */
					qg->excl -= num_bytes;
					qg->excl_cmpr -= num_bytes;
					dirty = true;
				}
				/* Exclusive -> exclusive, nothing changed */
			}
		}

		if (dirty)
			qgroup_dirty(fs_info, qg);
	}
}

/*
 * Check if @roots is potentially a list of fs tree roots
 *
 * Return 0 for definitely not a fs/subvol tree roots ulist
 * Return 1 for possible fs/subvol tree roots in the list (considering an empty
 * one as well)
 */
static int maybe_fs_roots(struct ulist *roots)
{
	struct ulist_node *unode;
	struct ulist_iterator uiter;

	/* Empty one, still possible for fs roots */
	if (!roots || roots->nnodes == 0)
		return 1;

	ULIST_ITER_INIT(&uiter);
	unode = ulist_next(roots, &uiter);
	if (!unode)
		return 1;

	/*
	 * If it contains fs tree roots, then it must belong to fs/subvol
	 * trees.
	 * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
2876 */ 2877 return btrfs_is_fstree(unode->val); 2878} 2879 2880int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, 2881 u64 num_bytes, struct ulist *old_roots, 2882 struct ulist *new_roots) 2883{ 2884 struct btrfs_fs_info *fs_info = trans->fs_info; 2885 LIST_HEAD(qgroups); 2886 u64 seq; 2887 u64 nr_new_roots = 0; 2888 u64 nr_old_roots = 0; 2889 int ret = 0; 2890 2891 /* 2892 * If quotas get disabled meanwhile, the resources need to be freed and 2893 * we can't just exit here. 2894 */ 2895 if (!btrfs_qgroup_full_accounting(fs_info) || 2896 fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) 2897 goto out_free; 2898 2899 if (new_roots) { 2900 if (!maybe_fs_roots(new_roots)) 2901 goto out_free; 2902 nr_new_roots = new_roots->nnodes; 2903 } 2904 if (old_roots) { 2905 if (!maybe_fs_roots(old_roots)) 2906 goto out_free; 2907 nr_old_roots = old_roots->nnodes; 2908 } 2909 2910 /* Quick exit, either not fs tree roots, or won't affect any qgroup */ 2911 if (nr_old_roots == 0 && nr_new_roots == 0) 2912 goto out_free; 2913 2914 trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, 2915 num_bytes, nr_old_roots, nr_new_roots); 2916 2917 mutex_lock(&fs_info->qgroup_rescan_lock); 2918 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 2919 if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { 2920 mutex_unlock(&fs_info->qgroup_rescan_lock); 2921 ret = 0; 2922 goto out_free; 2923 } 2924 } 2925 mutex_unlock(&fs_info->qgroup_rescan_lock); 2926 2927 spin_lock(&fs_info->qgroup_lock); 2928 seq = fs_info->qgroup_seq; 2929 2930 /* Update old refcnts using old_roots */ 2931 qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD); 2932 2933 /* Update new refcnts using new_roots */ 2934 qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW); 2935 2936 qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, 2937 num_bytes, seq); 2938 2939 /* 2940 * We're done using the iterator, release all its qgroups while holding 2941 * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup() 2942 * and trigger use-after-free accesses to qgroups. 
2943 */ 2944 qgroup_iterator_nested_clean(&qgroups); 2945 2946 /* 2947 * Bump qgroup_seq to avoid seq overlap 2948 */ 2949 fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; 2950 spin_unlock(&fs_info->qgroup_lock); 2951out_free: 2952 ulist_free(old_roots); 2953 ulist_free(new_roots); 2954 return ret; 2955} 2956 2957int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) 2958{ 2959 struct btrfs_fs_info *fs_info = trans->fs_info; 2960 struct btrfs_qgroup_extent_record *record; 2961 struct btrfs_delayed_ref_root *delayed_refs; 2962 struct ulist *new_roots = NULL; 2963 unsigned long index; 2964 u64 num_dirty_extents = 0; 2965 u64 qgroup_to_skip; 2966 int ret = 0; 2967 2968 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) 2969 return 0; 2970 2971 delayed_refs = &trans->transaction->delayed_refs; 2972 qgroup_to_skip = delayed_refs->qgroup_to_skip; 2973 xa_for_each(&delayed_refs->dirty_extents, index, record) { 2974 const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits); 2975 2976 num_dirty_extents++; 2977 trace_btrfs_qgroup_account_extents(fs_info, record, bytenr); 2978 2979 if (!ret && !(fs_info->qgroup_flags & 2980 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { 2981 struct btrfs_backref_walk_ctx ctx = { 0 }; 2982 2983 ctx.bytenr = bytenr; 2984 ctx.fs_info = fs_info; 2985 2986 /* 2987 * Old roots should be searched when inserting qgroup 2988 * extent record. 2989 * 2990 * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case, 2991 * we may have some record inserted during 2992 * NO_ACCOUNTING (thus no old_roots populated), but 2993 * later we start rescan, which clears NO_ACCOUNTING, 2994 * leaving some inserted records without old_roots 2995 * populated. 2996 * 2997 * Those cases are rare and should not cause too much 2998 * time spent during commit_transaction(). 2999 */ 3000 if (!record->old_roots) { 3001 /* Search commit root to find old_roots */ 3002 ret = btrfs_find_all_roots(&ctx, false); 3003 if (ret < 0) 3004 goto cleanup; 3005 record->old_roots = ctx.roots; 3006 ctx.roots = NULL; 3007 } 3008 3009 /* 3010 * Use BTRFS_SEQ_LAST as time_seq to do special search, 3011 * which doesn't lock tree or delayed_refs and search 3012 * current root. It's safe inside commit_transaction(). 3013 */ 3014 ctx.trans = trans; 3015 ctx.time_seq = BTRFS_SEQ_LAST; 3016 ret = btrfs_find_all_roots(&ctx, false); 3017 if (ret < 0) 3018 goto cleanup; 3019 new_roots = ctx.roots; 3020 if (qgroup_to_skip) { 3021 ulist_del(new_roots, qgroup_to_skip, 0); 3022 ulist_del(record->old_roots, qgroup_to_skip, 3023 0); 3024 } 3025 ret = btrfs_qgroup_account_extent(trans, bytenr, 3026 record->num_bytes, 3027 record->old_roots, 3028 new_roots); 3029 record->old_roots = NULL; 3030 new_roots = NULL; 3031 } 3032 /* Free the reserved data space */ 3033 btrfs_qgroup_free_refroot(fs_info, 3034 record->data_rsv_refroot, 3035 record->data_rsv, 3036 BTRFS_QGROUP_RSV_DATA); 3037cleanup: 3038 ulist_free(record->old_roots); 3039 ulist_free(new_roots); 3040 new_roots = NULL; 3041 xa_erase(&delayed_refs->dirty_extents, index); 3042 kfree(record); 3043 3044 } 3045 trace_btrfs_qgroup_num_dirty_extents(fs_info, trans->transid, num_dirty_extents); 3046 return ret; 3047} 3048 3049/* 3050 * Writes all changed qgroups to disk. 3051 * Called by the transaction commit path and the qgroup assign ioctl. 
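 *
 * Both callers are already serialized against quota disable: the commit
 * path through the running transaction, and the assign ioctl by holding
 * fs_info->qgroup_ioctl_lock (see the lockdep assertion below).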
3052 */ 3053int btrfs_run_qgroups(struct btrfs_trans_handle *trans) 3054{ 3055 struct btrfs_fs_info *fs_info = trans->fs_info; 3056 int ret = 0; 3057 3058 /* 3059 * In case we are called from the qgroup assign ioctl, assert that we 3060 * are holding the qgroup_ioctl_lock, otherwise we can race with a quota 3061 * disable operation (ioctl) and access a freed quota root. 3062 */ 3063 if (trans->transaction->state != TRANS_STATE_COMMIT_DOING) 3064 lockdep_assert_held(&fs_info->qgroup_ioctl_lock); 3065 3066 if (!fs_info->quota_root) 3067 return ret; 3068 3069 spin_lock(&fs_info->qgroup_lock); 3070 while (!list_empty(&fs_info->dirty_qgroups)) { 3071 struct btrfs_qgroup *qgroup; 3072 qgroup = list_first_entry(&fs_info->dirty_qgroups, 3073 struct btrfs_qgroup, dirty); 3074 list_del_init(&qgroup->dirty); 3075 spin_unlock(&fs_info->qgroup_lock); 3076 ret = update_qgroup_info_item(trans, qgroup); 3077 if (ret) 3078 qgroup_mark_inconsistent(fs_info, 3079 "qgroup info item update error %d", ret); 3080 ret = update_qgroup_limit_item(trans, qgroup); 3081 if (ret) 3082 qgroup_mark_inconsistent(fs_info, 3083 "qgroup limit item update error %d", ret); 3084 spin_lock(&fs_info->qgroup_lock); 3085 } 3086 if (btrfs_qgroup_enabled(fs_info)) 3087 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; 3088 else 3089 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; 3090 spin_unlock(&fs_info->qgroup_lock); 3091 3092 ret = update_qgroup_status_item(trans); 3093 if (ret) 3094 qgroup_mark_inconsistent(fs_info, 3095 "qgroup status item update error %d", ret); 3096 3097 return ret; 3098} 3099 3100int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, 3101 struct btrfs_qgroup_inherit *inherit, 3102 size_t size) 3103{ 3104 if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) 3105 return -EOPNOTSUPP; 3106 if (size < sizeof(*inherit) || size > PAGE_SIZE) 3107 return -EINVAL; 3108 3109 /* 3110 * In the past we allowed btrfs_qgroup_inherit to specify to copy 3111 * rfer/excl numbers directly from other qgroups. This behavior has 3112 * been disabled in userspace for a very long time, but here we should 3113 * also disable it in kernel, as this behavior is known to mark qgroup 3114 * inconsistent, and a rescan would wipe out the changes anyway. 3115 * 3116 * Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies. 3117 */ 3118 if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0) 3119 return -EINVAL; 3120 3121 if (size != struct_size(inherit, qgroups, inherit->num_qgroups)) 3122 return -EINVAL; 3123 3124 /* 3125 * Skip the inherit source qgroups check if qgroup is not enabled. 3126 * Qgroup can still be later enabled causing problems, but in that case 3127 * btrfs_qgroup_inherit() would just ignore those invalid ones. 3128 */ 3129 if (!btrfs_qgroup_enabled(fs_info)) 3130 return 0; 3131 3132 /* 3133 * Now check all the remaining qgroups, they should all: 3134 * 3135 * - Exist 3136 * - Be higher level qgroups. 
3137 */ 3138 for (int i = 0; i < inherit->num_qgroups; i++) { 3139 struct btrfs_qgroup *qgroup; 3140 u64 qgroupid = inherit->qgroups[i]; 3141 3142 if (btrfs_qgroup_level(qgroupid) == 0) 3143 return -EINVAL; 3144 3145 spin_lock(&fs_info->qgroup_lock); 3146 qgroup = find_qgroup_rb(fs_info, qgroupid); 3147 if (!qgroup) { 3148 spin_unlock(&fs_info->qgroup_lock); 3149 return -ENOENT; 3150 } 3151 spin_unlock(&fs_info->qgroup_lock); 3152 } 3153 return 0; 3154} 3155 3156static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, 3157 u64 inode_rootid, 3158 struct btrfs_qgroup_inherit **inherit) 3159{ 3160 int i = 0; 3161 u64 num_qgroups = 0; 3162 struct btrfs_qgroup *inode_qg; 3163 struct btrfs_qgroup_list *qg_list; 3164 struct btrfs_qgroup_inherit *res; 3165 size_t struct_sz; 3166 u64 *qgids; 3167 3168 if (*inherit) 3169 return -EEXIST; 3170 3171 inode_qg = find_qgroup_rb(fs_info, inode_rootid); 3172 if (!inode_qg) 3173 return -ENOENT; 3174 3175 num_qgroups = list_count_nodes(&inode_qg->groups); 3176 3177 if (!num_qgroups) 3178 return 0; 3179 3180 struct_sz = struct_size(res, qgroups, num_qgroups); 3181 if (struct_sz == SIZE_MAX) 3182 return -ERANGE; 3183 3184 res = kzalloc(struct_sz, GFP_NOFS); 3185 if (!res) 3186 return -ENOMEM; 3187 res->num_qgroups = num_qgroups; 3188 qgids = res->qgroups; 3189 3190 list_for_each_entry(qg_list, &inode_qg->groups, next_group) 3191 qgids[i++] = qg_list->group->qgroupid; 3192 3193 *inherit = res; 3194 return 0; 3195} 3196 3197/* 3198 * Check if we can skip rescan when inheriting qgroups. If @src has a single 3199 * @parent, and that @parent is owning all its bytes exclusively, we can skip 3200 * the full rescan, by just adding nodesize to the @parent's excl/rfer. 3201 * 3202 * Return <0 for fatal errors (like srcid/parentid has no qgroup). 3203 * Return 0 if a quick inherit is done. 3204 * Return >0 if a quick inherit is not possible, and a full rescan is needed. 3205 */ 3206static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info, 3207 u64 srcid, u64 parentid) 3208{ 3209 struct btrfs_qgroup *src; 3210 struct btrfs_qgroup *parent; 3211 struct btrfs_qgroup *qgroup; 3212 struct btrfs_qgroup_list *list; 3213 LIST_HEAD(qgroup_list); 3214 const u32 nodesize = fs_info->nodesize; 3215 int nr_parents = 0; 3216 3217 if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_FULL) 3218 return 0; 3219 3220 src = find_qgroup_rb(fs_info, srcid); 3221 if (!src) 3222 return -ENOENT; 3223 parent = find_qgroup_rb(fs_info, parentid); 3224 if (!parent) 3225 return -ENOENT; 3226 3227 /* 3228 * Source has no parent qgroup, but our new qgroup would have one. 3229 * Qgroup numbers would become inconsistent. 3230 */ 3231 if (list_empty(&src->groups)) 3232 return 1; 3233 3234 list_for_each_entry(list, &src->groups, next_group) { 3235 /* The parent is not the same, quick update is not possible. */ 3236 if (list->group->qgroupid != parentid) 3237 return 1; 3238 nr_parents++; 3239 /* 3240 * More than one parent qgroup, we can't be sure about accounting 3241 * consistency. 3242 */ 3243 if (nr_parents > 1) 3244 return 1; 3245 } 3246 3247 /* 3248 * The parent is not exclusively owning all its bytes. We're not sure 3249 * if the source has any bytes not fully owned by the parent. 
3250 */ 3251 if (parent->excl != parent->rfer) 3252 return 1; 3253 3254 qgroup_iterator_add(&qgroup_list, parent); 3255 list_for_each_entry(qgroup, &qgroup_list, iterator) { 3256 qgroup->rfer += nodesize; 3257 qgroup->rfer_cmpr += nodesize; 3258 qgroup->excl += nodesize; 3259 qgroup->excl_cmpr += nodesize; 3260 qgroup_dirty(fs_info, qgroup); 3261 3262 /* Append parent qgroups to @qgroup_list. */ 3263 list_for_each_entry(list, &qgroup->groups, next_group) 3264 qgroup_iterator_add(&qgroup_list, list->group); 3265 } 3266 qgroup_iterator_clean(&qgroup_list); 3267 return 0; 3268} 3269 3270/* 3271 * Copy the accounting information between qgroups. This is necessary 3272 * when a snapshot or a subvolume is created. Throwing an error will 3273 * cause a transaction abort so we take extra care here to only error 3274 * when a readonly fs is a reasonable outcome. 3275 */ 3276int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, 3277 u64 objectid, u64 inode_rootid, 3278 struct btrfs_qgroup_inherit *inherit) 3279{ 3280 int ret = 0; 3281 u64 *i_qgroups; 3282 bool committing = false; 3283 struct btrfs_fs_info *fs_info = trans->fs_info; 3284 struct btrfs_root *quota_root; 3285 struct btrfs_qgroup *srcgroup; 3286 struct btrfs_qgroup *dstgroup; 3287 struct btrfs_qgroup *prealloc; 3288 struct btrfs_qgroup_list **qlist_prealloc = NULL; 3289 bool free_inherit = false; 3290 bool need_rescan = false; 3291 u32 level_size = 0; 3292 u64 nums; 3293 3294 if (!btrfs_qgroup_enabled(fs_info)) 3295 return 0; 3296 3297 prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); 3298 if (!prealloc) 3299 return -ENOMEM; 3300 3301 /* 3302 * There are only two callers of this function. 3303 * 3304 * One in create_subvol() in the ioctl context, which needs to hold 3305 * the qgroup_ioctl_lock. 3306 * 3307 * The other one in create_pending_snapshot() where no other qgroup 3308 * code can modify the fs as they all need to either start a new trans 3309 * or hold a trans handler, thus we don't need to hold 3310 * qgroup_ioctl_lock. 3311 * This would avoid long and complex lock chain and make lockdep happy. 3312 */ 3313 spin_lock(&fs_info->trans_lock); 3314 if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) 3315 committing = true; 3316 spin_unlock(&fs_info->trans_lock); 3317 3318 if (!committing) 3319 mutex_lock(&fs_info->qgroup_ioctl_lock); 3320 3321 quota_root = fs_info->quota_root; 3322 if (!quota_root) { 3323 ret = -EINVAL; 3324 goto out; 3325 } 3326 3327 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) { 3328 ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit); 3329 if (ret) 3330 goto out; 3331 free_inherit = true; 3332 } 3333 3334 if (inherit) { 3335 i_qgroups = (u64 *)(inherit + 1); 3336 nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + 3337 2 * inherit->num_excl_copies; 3338 for (int i = 0; i < nums; i++) { 3339 srcgroup = find_qgroup_rb(fs_info, *i_qgroups); 3340 3341 /* 3342 * Zero out invalid groups so we can ignore 3343 * them later. 
3344 */ 3345 if (!srcgroup || 3346 ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) 3347 *i_qgroups = 0ULL; 3348 3349 ++i_qgroups; 3350 } 3351 } 3352 3353 /* 3354 * create a tracking group for the subvol itself 3355 */ 3356 ret = add_qgroup_item(trans, quota_root, objectid); 3357 if (ret) 3358 goto out; 3359 3360 /* 3361 * add qgroup to all inherited groups 3362 */ 3363 if (inherit) { 3364 i_qgroups = (u64 *)(inherit + 1); 3365 for (int i = 0; i < inherit->num_qgroups; i++, i_qgroups++) { 3366 if (*i_qgroups == 0) 3367 continue; 3368 ret = add_qgroup_relation_item(trans, objectid, 3369 *i_qgroups); 3370 if (ret && ret != -EEXIST) 3371 goto out; 3372 ret = add_qgroup_relation_item(trans, *i_qgroups, 3373 objectid); 3374 if (ret && ret != -EEXIST) 3375 goto out; 3376 } 3377 ret = 0; 3378 3379 qlist_prealloc = kcalloc(inherit->num_qgroups, 3380 sizeof(struct btrfs_qgroup_list *), 3381 GFP_NOFS); 3382 if (!qlist_prealloc) { 3383 ret = -ENOMEM; 3384 goto out; 3385 } 3386 for (int i = 0; i < inherit->num_qgroups; i++) { 3387 qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list), 3388 GFP_NOFS); 3389 if (!qlist_prealloc[i]) { 3390 ret = -ENOMEM; 3391 goto out; 3392 } 3393 } 3394 } 3395 3396 spin_lock(&fs_info->qgroup_lock); 3397 3398 dstgroup = add_qgroup_rb(fs_info, prealloc, objectid); 3399 prealloc = NULL; 3400 3401 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { 3402 dstgroup->lim_flags = inherit->lim.flags; 3403 dstgroup->max_rfer = inherit->lim.max_rfer; 3404 dstgroup->max_excl = inherit->lim.max_excl; 3405 dstgroup->rsv_rfer = inherit->lim.rsv_rfer; 3406 dstgroup->rsv_excl = inherit->lim.rsv_excl; 3407 3408 qgroup_dirty(fs_info, dstgroup); 3409 } 3410 3411 if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) { 3412 srcgroup = find_qgroup_rb(fs_info, srcid); 3413 if (!srcgroup) 3414 goto unlock; 3415 3416 /* 3417 * We call inherit after we clone the root in order to make sure 3418 * our counts don't go crazy, so at this point the only 3419 * difference between the two roots should be the root node. 3420 */ 3421 level_size = fs_info->nodesize; 3422 dstgroup->rfer = srcgroup->rfer; 3423 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; 3424 dstgroup->excl = level_size; 3425 dstgroup->excl_cmpr = level_size; 3426 srcgroup->excl = level_size; 3427 srcgroup->excl_cmpr = level_size; 3428 3429 /* inherit the limit info */ 3430 dstgroup->lim_flags = srcgroup->lim_flags; 3431 dstgroup->max_rfer = srcgroup->max_rfer; 3432 dstgroup->max_excl = srcgroup->max_excl; 3433 dstgroup->rsv_rfer = srcgroup->rsv_rfer; 3434 dstgroup->rsv_excl = srcgroup->rsv_excl; 3435 3436 qgroup_dirty(fs_info, dstgroup); 3437 qgroup_dirty(fs_info, srcgroup); 3438 3439 /* 3440 * If the source qgroup has parent but the new one doesn't, 3441 * we need a full rescan. 3442 */ 3443 if (!inherit && !list_empty(&srcgroup->groups)) 3444 need_rescan = true; 3445 } 3446 3447 if (!inherit) 3448 goto unlock; 3449 3450 i_qgroups = (u64 *)(inherit + 1); 3451 for (int i = 0; i < inherit->num_qgroups; i++) { 3452 if (*i_qgroups) { 3453 ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid, 3454 *i_qgroups); 3455 qlist_prealloc[i] = NULL; 3456 if (ret) 3457 goto unlock; 3458 } 3459 if (srcid) { 3460 /* Check if we can do a quick inherit. 
*/ 3461 ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups); 3462 if (ret < 0) 3463 goto unlock; 3464 if (ret > 0) 3465 need_rescan = true; 3466 ret = 0; 3467 } 3468 ++i_qgroups; 3469 } 3470 3471 for (int i = 0; i < inherit->num_ref_copies; i++, i_qgroups += 2) { 3472 struct btrfs_qgroup *src; 3473 struct btrfs_qgroup *dst; 3474 3475 if (!i_qgroups[0] || !i_qgroups[1]) 3476 continue; 3477 3478 src = find_qgroup_rb(fs_info, i_qgroups[0]); 3479 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 3480 3481 if (!src || !dst) { 3482 ret = -EINVAL; 3483 goto unlock; 3484 } 3485 3486 dst->rfer = src->rfer - level_size; 3487 dst->rfer_cmpr = src->rfer_cmpr - level_size; 3488 3489 /* Manually tweaking numbers certainly needs a rescan */ 3490 need_rescan = true; 3491 } 3492 for (int i = 0; i < inherit->num_excl_copies; i++, i_qgroups += 2) { 3493 struct btrfs_qgroup *src; 3494 struct btrfs_qgroup *dst; 3495 3496 if (!i_qgroups[0] || !i_qgroups[1]) 3497 continue; 3498 3499 src = find_qgroup_rb(fs_info, i_qgroups[0]); 3500 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 3501 3502 if (!src || !dst) { 3503 ret = -EINVAL; 3504 goto unlock; 3505 } 3506 3507 dst->excl = src->excl + level_size; 3508 dst->excl_cmpr = src->excl_cmpr + level_size; 3509 need_rescan = true; 3510 } 3511 3512unlock: 3513 spin_unlock(&fs_info->qgroup_lock); 3514 if (!ret) 3515 ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); 3516out: 3517 if (!committing) 3518 mutex_unlock(&fs_info->qgroup_ioctl_lock); 3519 if (need_rescan) 3520 qgroup_mark_inconsistent(fs_info, "qgroup inherit needs a rescan"); 3521 if (qlist_prealloc) { 3522 for (int i = 0; i < inherit->num_qgroups; i++) 3523 kfree(qlist_prealloc[i]); 3524 kfree(qlist_prealloc); 3525 } 3526 if (free_inherit) 3527 kfree(inherit); 3528 kfree(prealloc); 3529 return ret; 3530} 3531 3532static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) 3533{ 3534 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 3535 qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) 3536 return false; 3537 3538 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && 3539 qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) 3540 return false; 3541 3542 return true; 3543} 3544 3545static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, 3546 enum btrfs_qgroup_rsv_type type) 3547{ 3548 struct btrfs_qgroup *qgroup; 3549 struct btrfs_fs_info *fs_info = root->fs_info; 3550 u64 ref_root = btrfs_root_id(root); 3551 int ret = 0; 3552 LIST_HEAD(qgroup_list); 3553 3554 if (!btrfs_is_fstree(ref_root)) 3555 return 0; 3556 3557 if (num_bytes == 0) 3558 return 0; 3559 3560 if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && 3561 capable(CAP_SYS_RESOURCE)) 3562 enforce = false; 3563 3564 spin_lock(&fs_info->qgroup_lock); 3565 if (!fs_info->quota_root) 3566 goto out; 3567 3568 qgroup = find_qgroup_rb(fs_info, ref_root); 3569 if (!qgroup) 3570 goto out; 3571 3572 qgroup_iterator_add(&qgroup_list, qgroup); 3573 list_for_each_entry(qgroup, &qgroup_list, iterator) { 3574 struct btrfs_qgroup_list *glist; 3575 3576 if (enforce && !qgroup_check_limits(qgroup, num_bytes)) { 3577 ret = -EDQUOT; 3578 goto out; 3579 } 3580 3581 list_for_each_entry(glist, &qgroup->groups, next_group) 3582 qgroup_iterator_add(&qgroup_list, glist->group); 3583 } 3584 3585 ret = 0; 3586 /* 3587 * no limits exceeded, now record the reservation into all qgroups 3588 */ 3589 list_for_each_entry(qgroup, &qgroup_list, iterator) 3590 qgroup_rsv_add(fs_info, qgroup, num_bytes, type); 
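	/*
	 * At this point the subvolume's own qgroup and every ancestor qgroup
	 * collected in @qgroup_list carry the new reservation.
	 */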

out:
	qgroup_iterator_clean(&qgroup_list);
	spin_unlock(&fs_info->qgroup_lock);
	return ret;
}

/*
 * Free @num_bytes of reserved space with @type for the qgroup (normally a
 * level 0 qgroup).
 *
 * Will handle all higher level qgroups too.
 *
 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
 * This special case is only used for the META_PERTRANS type.
 */
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
			       u64 ref_root, u64 num_bytes,
			       enum btrfs_qgroup_rsv_type type)
{
	struct btrfs_qgroup *qgroup;
	LIST_HEAD(qgroup_list);

	if (!btrfs_is_fstree(ref_root))
		return;

	if (num_bytes == 0)
		return;

	if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
		WARN(1, "%s: Invalid type to free", __func__);
		return;
	}
	spin_lock(&fs_info->qgroup_lock);

	if (!fs_info->quota_root)
		goto out;

	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	if (num_bytes == (u64)-1)
		/*
		 * We're freeing all pertrans rsv, get the reserved value from
		 * the level 0 qgroup as the real num_bytes to free.
		 */
		num_bytes = qgroup->rsv.values[type];

	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(qgroup, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		qgroup_rsv_release(fs_info, qgroup, num_bytes, type);
		list_for_each_entry(glist, &qgroup->groups, next_group) {
			qgroup_iterator_add(&qgroup_list, glist->group);
		}
	}
out:
	qgroup_iterator_clean(&qgroup_list);
	spin_unlock(&fs_info->qgroup_lock);
}

/*
 * Check if the leaf is the last leaf, which means all node pointers
 * are at their last position.
 */
static bool is_last_leaf(struct btrfs_path *path)
{
	int i;

	for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
			return false;
	}
	return true;
}

/*
 * Returns < 0 on error, 0 when more leaves are to be scanned.
 * Returns 1 when done.
 */
static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
			      struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *extent_root;
	struct btrfs_key found;
	struct extent_buffer *scratch_leaf = NULL;
	u64 num_bytes;
	bool done;
	int slot;
	int ret;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 1;

	mutex_lock(&fs_info->qgroup_rescan_lock);
	extent_root = btrfs_extent_root(fs_info,
					fs_info->qgroup_rescan_progress.objectid);
	ret = btrfs_search_slot_for_read(extent_root,
					 &fs_info->qgroup_rescan_progress,
					 path, 1, 0);

	btrfs_debug(fs_info,
		    "current progress key " BTRFS_KEY_FMT ", search_slot ret %d",
		    BTRFS_KEY_FMT_VALUE(&fs_info->qgroup_rescan_progress), ret);

	if (ret) {
		/*
		 * The rescan is about to end, we will not be scanning any
		 * further blocks. We cannot unset the RESCAN flag here, because
		 * we want to commit the transaction if everything went well.
		 * To make the live accounting work in this phase, we set our
		 * scan progress pointer such that every real extent objectid
		 * will be smaller.
3707 */ 3708 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3709 btrfs_release_path(path); 3710 mutex_unlock(&fs_info->qgroup_rescan_lock); 3711 return ret; 3712 } 3713 done = is_last_leaf(path); 3714 3715 btrfs_item_key_to_cpu(path->nodes[0], &found, 3716 btrfs_header_nritems(path->nodes[0]) - 1); 3717 fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; 3718 3719 scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); 3720 if (!scratch_leaf) { 3721 ret = -ENOMEM; 3722 mutex_unlock(&fs_info->qgroup_rescan_lock); 3723 goto out; 3724 } 3725 slot = path->slots[0]; 3726 btrfs_release_path(path); 3727 mutex_unlock(&fs_info->qgroup_rescan_lock); 3728 3729 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { 3730 struct btrfs_backref_walk_ctx ctx = { 0 }; 3731 3732 btrfs_item_key_to_cpu(scratch_leaf, &found, slot); 3733 if (found.type != BTRFS_EXTENT_ITEM_KEY && 3734 found.type != BTRFS_METADATA_ITEM_KEY) 3735 continue; 3736 if (found.type == BTRFS_METADATA_ITEM_KEY) 3737 num_bytes = fs_info->nodesize; 3738 else 3739 num_bytes = found.offset; 3740 3741 ctx.bytenr = found.objectid; 3742 ctx.fs_info = fs_info; 3743 3744 ret = btrfs_find_all_roots(&ctx, false); 3745 if (ret < 0) 3746 goto out; 3747 /* For rescan, just pass old_roots as NULL */ 3748 ret = btrfs_qgroup_account_extent(trans, found.objectid, 3749 num_bytes, NULL, ctx.roots); 3750 if (ret < 0) 3751 goto out; 3752 } 3753out: 3754 if (scratch_leaf) 3755 free_extent_buffer(scratch_leaf); 3756 3757 if (done && !ret) { 3758 ret = 1; 3759 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3760 } 3761 return ret; 3762} 3763 3764static bool rescan_should_stop(struct btrfs_fs_info *fs_info) 3765{ 3766 if (btrfs_fs_closing(fs_info)) 3767 return true; 3768 if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) 3769 return true; 3770 if (!btrfs_qgroup_enabled(fs_info)) 3771 return true; 3772 if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) 3773 return true; 3774 return false; 3775} 3776 3777static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 3778{ 3779 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, 3780 qgroup_rescan_work); 3781 struct btrfs_path *path; 3782 struct btrfs_trans_handle *trans = NULL; 3783 int ret = 0; 3784 bool stopped = false; 3785 bool did_leaf_rescans = false; 3786 3787 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) 3788 return; 3789 3790 path = btrfs_alloc_path(); 3791 if (!path) { 3792 ret = -ENOMEM; 3793 goto out; 3794 } 3795 /* 3796 * Rescan should only search for commit root, and any later difference 3797 * should be recorded by qgroup 3798 */ 3799 path->search_commit_root = true; 3800 path->skip_locking = true; 3801 3802 while (!ret && !(stopped = rescan_should_stop(fs_info))) { 3803 trans = btrfs_start_transaction(fs_info->fs_root, 0); 3804 if (IS_ERR(trans)) { 3805 ret = PTR_ERR(trans); 3806 break; 3807 } 3808 3809 ret = qgroup_rescan_leaf(trans, path); 3810 did_leaf_rescans = true; 3811 3812 if (ret > 0) 3813 btrfs_commit_transaction(trans); 3814 else 3815 btrfs_end_transaction(trans); 3816 } 3817 3818out: 3819 btrfs_free_path(path); 3820 3821 mutex_lock(&fs_info->qgroup_rescan_lock); 3822 if (ret > 0 && 3823 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { 3824 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3825 } else if (ret < 0 || stopped) { 3826 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3827 } 3828 mutex_unlock(&fs_info->qgroup_rescan_lock); 3829 3830 /* 3831 * 
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
{
	struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
						     qgroup_rescan_work);
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans = NULL;
	int ret = 0;
	bool stopped = false;
	bool did_leaf_rescans = false;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
		return;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	/*
	 * Rescan should only search for commit root, and any later difference
	 * should be recorded by qgroup
	 */
	path->search_commit_root = true;
	path->skip_locking = true;

	while (!ret && !(stopped = rescan_should_stop(fs_info))) {
		trans = btrfs_start_transaction(fs_info->fs_root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}

		ret = qgroup_rescan_leaf(trans, path);
		did_leaf_rescans = true;

		if (ret > 0)
			btrfs_commit_transaction(trans);
		else
			btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);

	mutex_lock(&fs_info->qgroup_rescan_lock);
	if (ret > 0 &&
	    fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	} else if (ret < 0 || stopped) {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	}
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	/*
	 * Only update status, since the previous part has already updated the
	 * qgroup info, and only if we did any actual work. This also prevents
	 * race with a concurrent quota disable, which has already set
	 * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
	 * btrfs_quota_disable().
	 */
	if (did_leaf_rescans) {
		trans = btrfs_start_transaction(fs_info->quota_root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			btrfs_err(fs_info,
				  "fail to start transaction for status update: %d",
				  ret);
		}
	} else {
		trans = NULL;
	}

	mutex_lock(&fs_info->qgroup_rescan_lock);
	if (!stopped ||
	    fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
	if (trans) {
		int ret2 = update_qgroup_status_item(trans);

		if (ret2 < 0) {
			ret = ret2;
			btrfs_err(fs_info, "fail to update qgroup status: %d", ret);
		}
	}
	fs_info->qgroup_rescan_running = false;
	fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
	complete_all(&fs_info->qgroup_rescan_completion);
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	if (!trans)
		return;

	btrfs_end_transaction(trans);

	if (stopped) {
		btrfs_info(fs_info, "qgroup scan paused");
	} else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
		btrfs_info(fs_info, "qgroup scan cancelled");
	} else if (ret >= 0) {
		btrfs_info(fs_info, "qgroup scan completed%s",
			   ret > 0 ? " (inconsistency flag cleared)" : "");
	} else {
		btrfs_err(fs_info, "qgroup scan failed with %d", ret);
	}
}
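/*
 * Typical rescan lifecycle, summarized (the ioctl entry point itself lives
 * in ioctl.c):
 *
 *	btrfs_qgroup_rescan()
 *	    qgroup_rescan_init()            set RESCAN flag, reset progress
 *	    qgroup_rescan_zero_tracking()   zero all qgroup counters
 *	    btrfs_queue_work()              kick btrfs_qgroup_rescan_worker()
 *	btrfs_qgroup_wait_for_completion()  optionally wait for the worker
 *
 * The worker runs asynchronously; btrfs_qgroup_rescan() returns before it
 * finishes.
 */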
/*
 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
 * memory required for the rescan context.
 */
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
		   int init_flags)
{
	int ret = 0;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
		btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode");
		return -EINVAL;
	}

	if (!init_flags) {
		/* we're resuming qgroup rescan at mount time */
		if (!(fs_info->qgroup_flags &
		      BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
			btrfs_debug(fs_info,
			"qgroup rescan init failed, qgroup rescan is not queued");
			ret = -EINVAL;
		} else if (!(fs_info->qgroup_flags &
			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
			btrfs_debug(fs_info,
			"qgroup rescan init failed, qgroup is not enabled");
			ret = -ENOTCONN;
		}

		if (ret)
			return ret;
	}

	mutex_lock(&fs_info->qgroup_rescan_lock);

	if (init_flags) {
		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
			ret = -EINPROGRESS;
		} else if (!(fs_info->qgroup_flags &
			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
			btrfs_debug(fs_info,
			"qgroup rescan init failed, qgroup is not enabled");
			ret = -ENOTCONN;
		} else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
			/* Quota disable is in progress */
			ret = -EBUSY;
		}

		if (ret) {
			mutex_unlock(&fs_info->qgroup_rescan_lock);
			return ret;
		}
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
	}

	memset(&fs_info->qgroup_rescan_progress, 0,
	       sizeof(fs_info->qgroup_rescan_progress));
	fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
				   BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
	fs_info->qgroup_rescan_progress.objectid = progress_objectid;
	init_completion(&fs_info->qgroup_rescan_completion);
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	btrfs_init_work(&fs_info->qgroup_rescan_work,
			btrfs_qgroup_rescan_worker, NULL);
	return 0;
}
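/*
 * Reset the referenced/exclusive counters (and their compressed variants)
 * of every qgroup to zero, so the rescan worker can rebuild them from
 * scratch.
 */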
static void
qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
{
	struct rb_node *n;
	struct btrfs_qgroup *qgroup;

	spin_lock(&fs_info->qgroup_lock);
	/* clear all current qgroup tracking information */
	for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
		qgroup = rb_entry(n, struct btrfs_qgroup, node);
		qgroup->rfer = 0;
		qgroup->rfer_cmpr = 0;
		qgroup->excl = 0;
		qgroup->excl_cmpr = 0;
		qgroup_dirty(fs_info, qgroup);
	}
	spin_unlock(&fs_info->qgroup_lock);
}

int
btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
{
	int ret = 0;

	ret = qgroup_rescan_init(fs_info, 0, 1);
	if (ret)
		return ret;

	/*
	 * We have set the rescan_progress to 0, which means no more
	 * delayed refs will be accounted by btrfs_qgroup_account_ref.
	 * However, btrfs_qgroup_account_ref may be right after its call
	 * to btrfs_find_all_roots, in which case it would still do the
	 * accounting.
	 * To solve this, we're committing the transaction, which will
	 * ensure we run all delayed refs and only after that, we are
	 * going to clear all tracking information for a clean start.
	 */

	ret = btrfs_commit_current_transaction(fs_info->fs_root);
	if (ret) {
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
		return ret;
	}

	qgroup_rescan_zero_tracking(fs_info);

	mutex_lock(&fs_info->qgroup_rescan_lock);
	/*
	 * The rescan worker is only for full accounting qgroups, check if it's
	 * enabled as it is pointless to queue it otherwise. A concurrent quota
	 * disable may also have just cleared BTRFS_FS_QUOTA_ENABLED.
	 */
	if (btrfs_qgroup_full_accounting(fs_info)) {
		fs_info->qgroup_rescan_running = true;
		btrfs_queue_work(fs_info->qgroup_rescan_workers,
				 &fs_info->qgroup_rescan_work);
	} else {
		ret = -ENOTCONN;
	}
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	return ret;
}

int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
				     bool interruptible)
{
	int running;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_rescan_lock);
	running = fs_info->qgroup_rescan_running;
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	if (!running)
		return 0;

	if (interruptible)
		ret = wait_for_completion_interruptible(
					&fs_info->qgroup_rescan_completion);
	else
		wait_for_completion(&fs_info->qgroup_rescan_completion);

	return ret;
}

/*
 * this is only called from open_ctree where we're still single threaded, thus
 * locking is omitted here.
 */
void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
		mutex_lock(&fs_info->qgroup_rescan_lock);
		fs_info->qgroup_rescan_running = true;
		btrfs_queue_work(fs_info->qgroup_rescan_workers,
				 &fs_info->qgroup_rescan_work);
		mutex_unlock(&fs_info->qgroup_rescan_lock);
	}
}

#define rbtree_iterate_from_safe(node, next, start)				\
	for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
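/*
 * Clear EXTENT_QGROUP_RESERVED on, and drop from @reserved, every range
 * that overlaps [@start, @start + @len), so a failed reservation does not
 * leave stale reserved bits behind.  Uses rbtree_iterate_from_safe() above,
 * which caches the successor before the loop body runs and thus allows the
 * body to remove the current node (via ulist_del()).
 */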
static int qgroup_unreserve_range(struct btrfs_inode *inode,
				  struct extent_changeset *reserved, u64 start,
				  u64 len)
{
	struct rb_node *node;
	struct rb_node *next;
	struct ulist_node *entry;
	int ret = 0;

	node = reserved->range_changed.root.rb_node;
	if (!node)
		return 0;
	while (node) {
		entry = rb_entry(node, struct ulist_node, rb_node);
		if (entry->val < start)
			node = node->rb_right;
		else
			node = node->rb_left;
	}
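	/*
	 * The descent above ends with @entry at the last node visited, the
	 * closest node to @start; step back once if it begins past @start so
	 * that a range straddling @start is not skipped.
	 */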
	if (entry->val > start && rb_prev(&entry->rb_node))
		entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
				 rb_node);

	rbtree_iterate_from_safe(node, next, &entry->rb_node) {
		u64 entry_start;
		u64 entry_end;
		u64 entry_len;
		int clear_ret;

		entry = rb_entry(node, struct ulist_node, rb_node);
		entry_start = entry->val;
		entry_end = entry->aux;
		entry_len = entry_end - entry_start + 1;

		if (entry_start >= start + len)
			break;
		if (entry_start + entry_len <= start)
			continue;
		/*
		 * Now the entry is in [start, start + len), revert the
		 * EXTENT_QGROUP_RESERVED bit.
		 */
		clear_ret = btrfs_clear_extent_bit(&inode->io_tree, entry_start,
						   entry_end, EXTENT_QGROUP_RESERVED, NULL);
		if (!ret && clear_ret < 0)
			ret = clear_ret;

		ulist_del(&reserved->range_changed, entry->val, entry->aux);
		if (likely(reserved->bytes_changed >= entry_len)) {
			reserved->bytes_changed -= entry_len;
		} else {
			WARN_ON(1);
			reserved->bytes_changed = 0;
		}
	}

	return ret;
}

/*
 * Try to free some space for qgroup.
 *
 * For qgroup, there are only 3 ways to free qgroup space:
 * - Flush nodatacow write
 *   Any nodatacow write will free its reserved data space at run_delalloc_range().
 *   In theory, we should only flush nodatacow inodes, but it's not yet
 *   possible, so we need to flush the whole root.
 *
 * - Wait for ordered extents
 *   When ordered extents are finished, their reserved metadata is finally
 *   converted to per_trans status, which can be freed by later commit
 *   transaction.
 *
 * - Commit transaction
 *   This would free the meta_per_trans space.
 *   In theory this shouldn't provide much space, but any more qgroup space
 *   is better than nothing.
 */
static int try_flush_qgroup(struct btrfs_root *root)
{
	int ret;

	/* Can't hold an open transaction or we run the risk of deadlocking. */
	ASSERT(current->journal_info == NULL);
	if (WARN_ON(current->journal_info))
		return 0;

	/*
	 * We don't want to run flush again and again, so if there is a running
	 * one, we won't try to start a new flush, but exit directly.
	 */
	if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
		wait_event(root->qgroup_flush_wait,
			   !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
		return 0;
	}

	ret = btrfs_start_delalloc_snapshot(root, true);
	if (ret < 0)
		goto out;
	btrfs_wait_ordered_extents(root, U64_MAX, NULL);

	/*
	 * After waiting for ordered extents run delayed iputs in order to free
	 * space from unlinked files before committing the current transaction,
	 * as ordered extents may have been holding the last reference of an
	 * inode and they add a delayed iput when they complete.
	 */
	btrfs_run_delayed_iputs(root->fs_info);
	btrfs_wait_on_delayed_iputs(root->fs_info);

	ret = btrfs_commit_current_transaction(root);
out:
	clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
	wake_up(&root->qgroup_flush_wait);
	return ret;
}
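/*
 * Record [@start, @start + @len) as EXTENT_QGROUP_RESERVED in the inode's
 * io_tree and charge only the newly covered bytes against the qgroups;
 * already reserved ranges are counted once.
 */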
static int qgroup_reserve_data(struct btrfs_inode *inode,
			       struct extent_changeset **reserved_ret, u64 start,
			       u64 len)
{
	struct btrfs_root *root = inode->root;
	struct extent_changeset *reserved;
	bool new_reserved = false;
	u64 orig_reserved;
	u64 to_reserve;
	int ret;

	if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !btrfs_is_fstree(btrfs_root_id(root)) || len == 0)
		return 0;

	/* @reserved parameter is mandatory for qgroup */
	if (WARN_ON(!reserved_ret))
		return -EINVAL;
	if (!*reserved_ret) {
		new_reserved = true;
		*reserved_ret = extent_changeset_alloc();
		if (!*reserved_ret)
			return -ENOMEM;
	}
	reserved = *reserved_ret;
	/* Record already reserved space */
	orig_reserved = reserved->bytes_changed;
	ret = btrfs_set_record_extent_bits(&inode->io_tree, start,
					   start + len - 1, EXTENT_QGROUP_RESERVED,
					   reserved);

	/* Newly reserved space */
	to_reserve = reserved->bytes_changed - orig_reserved;
	trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
					to_reserve, QGROUP_RESERVE);
	if (ret < 0)
		goto out;
	ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
	if (ret < 0)
		goto cleanup;

	return ret;

cleanup:
	qgroup_unreserve_range(inode, reserved, start, len);
out:
	if (new_reserved) {
		extent_changeset_free(reserved);
		*reserved_ret = NULL;
	}
	return ret;
}

/*
 * Reserve qgroup space for range [start, start + len).
 *
 * This function will either reserve space from related qgroups or do nothing
 * if the range is already reserved.
 *
 * Return 0 for successful reservation
 * Return <0 for error (including -EDQUOT)
 *
 * NOTE: This function may sleep for memory allocation, dirty page flushing and
 * commit transaction. So caller should not hold any dirty page locked.
 */
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
			      struct extent_changeset **reserved_ret, u64 start,
			      u64 len)
{
	int ret;

	ret = qgroup_reserve_data(inode, reserved_ret, start, len);
	if (ret <= 0 && ret != -EDQUOT)
		return ret;

	ret = try_flush_qgroup(inode->root);
	if (ret < 0)
		return ret;
	return qgroup_reserve_data(inode, reserved_ret, start, len);
}
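/*
 * Illustrative call pattern (a sketch, not code from this file): a write
 * path would reserve before dirtying pages and undo on failure:
 *
 *	struct extent_changeset *reserved = NULL;
 *	int ret;
 *
 *	ret = btrfs_qgroup_reserve_data(inode, &reserved, start, len);
 *	if (ret < 0)
 *		return ret;
 *	ret = do_the_write(...);	(hypothetical helper)
 *	if (ret < 0)
 *		btrfs_qgroup_free_data(inode, reserved, start, len, NULL);
 *	extent_changeset_free(reserved);
 */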
/* Free ranges specified by @reserved, normally in error path */
static int qgroup_free_reserved_data(struct btrfs_inode *inode,
				     struct extent_changeset *reserved,
				     u64 start, u64 len, u64 *freed_ret)
{
	struct btrfs_root *root = inode->root;
	struct ulist_node *unode;
	struct ulist_iterator uiter;
	struct extent_changeset changeset;
	u64 freed = 0;
	int ret;

	extent_changeset_init(&changeset);
	len = round_up(start + len, root->fs_info->sectorsize);
	start = round_down(start, root->fs_info->sectorsize);

	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
		u64 range_start = unode->val;
		/* unode->aux is the inclusive end */
		u64 range_len = unode->aux - range_start + 1;
		u64 free_start;
		u64 free_len;

		extent_changeset_release(&changeset);

		/* Only free range in range [start, start + len) */
		if (range_start >= start + len ||
		    range_start + range_len <= start)
			continue;
		free_start = max(range_start, start);
		free_len = min(start + len, range_start + range_len) -
			   free_start;
		/*
		 * TODO: Also modify reserved->ranges_reserved to reflect the
		 * modification.
		 *
		 * However, as long as we free qgroup reserved space according
		 * to EXTENT_QGROUP_RESERVED, we won't double free, so there is
		 * no need to rush.
		 */
		ret = btrfs_clear_record_extent_bits(&inode->io_tree, free_start,
						     free_start + free_len - 1,
						     EXTENT_QGROUP_RESERVED,
						     &changeset);
		if (ret < 0)
			goto out;
		freed += changeset.bytes_changed;
	}
	btrfs_qgroup_free_refroot(root->fs_info, btrfs_root_id(root), freed,
				  BTRFS_QGROUP_RSV_DATA);
	if (freed_ret)
		*freed_ret = freed;
	ret = 0;
out:
	extent_changeset_release(&changeset);
	return ret;
}

static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len,
			u64 *released, int free)
{
	struct extent_changeset changeset;
	int trace_op = QGROUP_RELEASE;
	int ret;

	if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
		return btrfs_clear_record_extent_bits(&inode->io_tree, start,
						      start + len - 1,
						      EXTENT_QGROUP_RESERVED, NULL);
	}

	/* In release case, we shouldn't have @reserved */
	WARN_ON(!free && reserved);
	if (free && reserved)
		return qgroup_free_reserved_data(inode, reserved, start, len, released);
	extent_changeset_init(&changeset);
	ret = btrfs_clear_record_extent_bits(&inode->io_tree, start, start + len - 1,
					     EXTENT_QGROUP_RESERVED, &changeset);
	if (ret < 0)
		goto out;

	if (free)
		trace_op = QGROUP_FREE;
	trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
					changeset.bytes_changed, trace_op);
	if (free)
		btrfs_qgroup_free_refroot(inode->root->fs_info,
					  btrfs_root_id(inode->root),
					  changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
	if (released)
		*released = changeset.bytes_changed;
out:
	extent_changeset_release(&changeset);
	return ret;
}

/*
 * Free a reserved space range from io_tree and related qgroups.
 *
 * Should be called when a range of pages get invalidated before reaching disk,
 * or for the error cleanup case.
 * If @reserved is given, only reserved ranges in [@start, @start + @len) will
 * be freed.
 *
 * For data written to disk, use btrfs_qgroup_release_data().
 *
 * NOTE: This function may sleep for memory allocation.
 */
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
			   struct extent_changeset *reserved,
			   u64 start, u64 len, u64 *freed)
{
	return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1);
}

/*
 * Release a reserved space range from io_tree only.
 *
 * Should be called when a range of pages get written to disk and the
 * corresponding FILE_EXTENT is inserted into the corresponding root.
 *
 * Since the new qgroup accounting framework will only update qgroup numbers at
 * commit_transaction() time, its reserved space shouldn't be freed from
 * related qgroups.
 *
 * But we should release the range from io_tree, to allow further writes to be
 * COWed.
 *
 * NOTE: This function may sleep for memory allocation.
 */
int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released)
{
	return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0);
}
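/*
 * The two helpers below maintain root->qgroup_meta_rsv_prealloc/pertrans,
 * which track what this root has actually charged to the qgroups.  Frees
 * are clamped to these counters (see sub_root_meta_rsv()), so releasing a
 * reservation that was taken while quota was disabled cannot underflow.
 */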
static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type)
{
	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
	    type != BTRFS_QGROUP_RSV_META_PERTRANS)
		return;
	if (num_bytes == 0)
		return;

	spin_lock(&root->qgroup_meta_rsv_lock);
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
		root->qgroup_meta_rsv_prealloc += num_bytes;
	else
		root->qgroup_meta_rsv_pertrans += num_bytes;
	spin_unlock(&root->qgroup_meta_rsv_lock);
}

static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
			     enum btrfs_qgroup_rsv_type type)
{
	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
	    type != BTRFS_QGROUP_RSV_META_PERTRANS)
		return 0;
	if (num_bytes == 0)
		return 0;

	spin_lock(&root->qgroup_meta_rsv_lock);
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
		num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
				  num_bytes);
		root->qgroup_meta_rsv_prealloc -= num_bytes;
	} else {
		num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
				  num_bytes);
		root->qgroup_meta_rsv_pertrans -= num_bytes;
	}
	spin_unlock(&root->qgroup_meta_rsv_lock);
	return num_bytes;
}

int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type, bool enforce)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !btrfs_is_fstree(btrfs_root_id(root)) || num_bytes == 0)
		return 0;

	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
	trace_btrfs_qgroup_meta_reserve(root, (s64)num_bytes, type);
	ret = qgroup_reserve(root, num_bytes, enforce, type);
	if (ret < 0)
		return ret;
	/*
	 * Record what we have reserved into root.
	 *
	 * To avoid a quota disabled->enabled underflow: in that case we may
	 * try to free space we haven't reserved (since quota was disabled),
	 * so record what we reserved into the root and ensure a later release
	 * won't underflow this number.
	 */
	add_root_meta_rsv(root, num_bytes, type);
	return ret;
}
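/*
 * Same retry pattern as btrfs_qgroup_reserve_data(): on -EDQUOT, flush
 * (unless @noflush is set) and then retry the reservation exactly once.
 */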
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
				enum btrfs_qgroup_rsv_type type, bool enforce,
				bool noflush)
{
	int ret;

	ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
	if ((ret <= 0 && ret != -EDQUOT) || noflush)
		return ret;

	ret = try_flush_qgroup(root);
	if (ret < 0)
		return ret;
	return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
}

/*
 * Per-transaction meta reservation should be all freed at transaction commit
 * time
 */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !btrfs_is_fstree(btrfs_root_id(root)))
		return;

	/* TODO: Update trace point to handle such free */
	trace_btrfs_qgroup_meta_free_all_pertrans(root);
	/* Special value -1 means to free all reserved space */
	btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1,
				  BTRFS_QGROUP_RSV_META_PERTRANS);
}

void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !btrfs_is_fstree(btrfs_root_id(root)))
		return;

	/*
	 * Reservation for META_PREALLOC can happen before quota is enabled,
	 * which can lead to underflow.
	 * Here ensure we will only free what we really have reserved.
	 */
	num_bytes = sub_root_meta_rsv(root, num_bytes, type);
	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
	trace_btrfs_qgroup_meta_reserve(root, -(s64)num_bytes, type);
	btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type);
}
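/*
 * Walk the qgroup of @ref_root and all of its ancestors, moving @num_bytes
 * of reservation from META_PREALLOC to META_PERTRANS.  On a read-only
 * filesystem only the prealloc side is released, since there is no
 * transaction to convert it into.
 */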
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
				int num_bytes)
{
	struct btrfs_qgroup *qgroup;
	LIST_HEAD(qgroup_list);

	if (num_bytes == 0)
		return;
	if (!fs_info->quota_root)
		return;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(qgroup, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		qgroup_rsv_release(fs_info, qgroup, num_bytes,
				   BTRFS_QGROUP_RSV_META_PREALLOC);
		if (!sb_rdonly(fs_info->sb))
			qgroup_rsv_add(fs_info, qgroup, num_bytes,
				       BTRFS_QGROUP_RSV_META_PERTRANS);

		list_for_each_entry(glist, &qgroup->groups, next_group)
			qgroup_iterator_add(&qgroup_list, glist->group);
	}
out:
	qgroup_iterator_clean(&qgroup_list);
	spin_unlock(&fs_info->qgroup_lock);
}

/*
 * Convert @num_bytes of META_PREALLOC reservation to META_PERTRANS.
 *
 * This is called when preallocated meta reservation needs to be used.
 * Normally after btrfs_join_transaction() call.
 */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
	    !btrfs_is_fstree(btrfs_root_id(root)))
		return;
	/* Same as btrfs_qgroup_free_meta_prealloc() */
	num_bytes = sub_root_meta_rsv(root, num_bytes,
				      BTRFS_QGROUP_RSV_META_PREALLOC);
	trace_btrfs_qgroup_meta_convert(root, num_bytes);
	qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes);
	if (!sb_rdonly(fs_info->sb))
		add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
}

/*
 * Check for leaked qgroup reserved space, normally at inode destruction time.
 */
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
	struct extent_changeset changeset;
	struct ulist_node *unode;
	struct ulist_iterator iter;
	int ret;

	extent_changeset_init(&changeset);
	ret = btrfs_clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
					     EXTENT_QGROUP_RESERVED, &changeset);

	WARN_ON(ret < 0);
	if (WARN_ON(changeset.bytes_changed)) {
		ULIST_ITER_INIT(&iter);
		while ((unode = ulist_next(&changeset.range_changed, &iter))) {
			btrfs_warn(inode->root->fs_info,
		"leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
				   btrfs_ino(inode), unode->val, unode->aux);
		}
		btrfs_qgroup_free_refroot(inode->root->fs_info,
					  btrfs_root_id(inode->root),
					  changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);

	}
	extent_changeset_release(&changeset);
}

void btrfs_qgroup_init_swapped_blocks(
	struct btrfs_qgroup_swapped_blocks *swapped_blocks)
{
	int i;

	spin_lock_init(&swapped_blocks->lock);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		swapped_blocks->blocks[i] = RB_ROOT;
	swapped_blocks->swapped = false;
}
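/*
 * Swapped-blocks overview: during relocation, subtree roots are swapped
 * between the subvolume tree and the reloc tree.  Rather than tracing both
 * subtrees at swap time, the swap is recorded here and the expensive qgroup
 * subtree trace is delayed until the block is COWed again, see
 * btrfs_qgroup_trace_subtree_after_cow().
 */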
/*
 * Delete all swapped block records of @root.
 * Every record here means we skipped a full subtree scan for qgroup.
 *
 * Gets called when committing one transaction.
 */
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
{
	struct btrfs_qgroup_swapped_blocks *swapped_blocks;
	int i;

	swapped_blocks = &root->swapped_blocks;

	spin_lock(&swapped_blocks->lock);
	if (!swapped_blocks->swapped)
		goto out;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		struct rb_root *cur_root = &swapped_blocks->blocks[i];
		struct btrfs_qgroup_swapped_block *entry;
		struct btrfs_qgroup_swapped_block *next;

		rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
						     node)
			kfree(entry);
		swapped_blocks->blocks[i] = RB_ROOT;
	}
	swapped_blocks->swapped = false;
out:
	spin_unlock(&swapped_blocks->lock);
}
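/*
 * rb-tree comparators for the swapped block trees, keyed by subvol_bytenr
 * (the bytenr the block has after the swap).
 */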
static int qgroup_swapped_block_bytenr_key_cmp(const void *key, const struct rb_node *node)
{
	const u64 *bytenr = key;
	const struct btrfs_qgroup_swapped_block *block = rb_entry(node,
			struct btrfs_qgroup_swapped_block, node);

	if (block->subvol_bytenr < *bytenr)
		return -1;
	else if (block->subvol_bytenr > *bytenr)
		return 1;

	return 0;
}

static int qgroup_swapped_block_bytenr_cmp(struct rb_node *new, const struct rb_node *existing)
{
	const struct btrfs_qgroup_swapped_block *new_block = rb_entry(new,
			struct btrfs_qgroup_swapped_block, node);

	return qgroup_swapped_block_bytenr_key_cmp(&new_block->subvol_bytenr, existing);
}

/*
 * Add subtree roots record into @subvol_root.
 *
 * @subvol_root:	tree root of the subvolume tree that gets swapped
 * @bg:			block group under balance
 * @subvol_parent/slot:	pointer to the subtree root in subvolume tree
 * @reloc_parent/slot:	pointer to the subtree root in reloc tree
 *			BOTH POINTERS ARE BEFORE TREE SWAP
 * @last_snapshot:	last snapshot generation of the subvolume tree
 */
int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
		struct btrfs_block_group *bg,
		struct extent_buffer *subvol_parent, int subvol_slot,
		struct extent_buffer *reloc_parent, int reloc_slot,
		u64 last_snapshot)
{
	struct btrfs_fs_info *fs_info = subvol_root->fs_info;
	struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct rb_node *node;
	int level = btrfs_header_level(subvol_parent) - 1;
	int ret = 0;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;

	if (unlikely(btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
		     btrfs_node_ptr_generation(reloc_parent, reloc_slot))) {
		btrfs_err_rl(fs_info,
		"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
			__func__,
			btrfs_node_ptr_generation(subvol_parent, subvol_slot),
			btrfs_node_ptr_generation(reloc_parent, reloc_slot));
		return -EUCLEAN;
	}

	block = kmalloc(sizeof(*block), GFP_NOFS);
	if (!block) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * @reloc_parent/slot is still before swap, while @block is going to
	 * record the bytenr after swap, so we do the swap here.
	 */
	block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
	block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
							     reloc_slot);
	block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
	block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
							    subvol_slot);
	block->last_snapshot = last_snapshot;
	block->level = level;

	/*
	 * If we have bg == NULL, we're called from btrfs_recover_relocation(),
	 * no one else can modify tree blocks thus the qgroup numbers will not
	 * change no matter the value of trace_leaf.
	 */
	if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
		block->trace_leaf = true;
	else
		block->trace_leaf = false;
	btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);

	/* Insert @block into @blocks */
	spin_lock(&blocks->lock);
	node = rb_find_add(&block->node, &blocks->blocks[level], qgroup_swapped_block_bytenr_cmp);
	if (node) {
		struct btrfs_qgroup_swapped_block *entry;

		entry = rb_entry(node, struct btrfs_qgroup_swapped_block, node);

		if (entry->subvol_generation != block->subvol_generation ||
		    entry->reloc_bytenr != block->reloc_bytenr ||
		    entry->reloc_generation != block->reloc_generation) {
			/*
			 * Duplicated but mismatched entry found. Shouldn't happen.
			 * Marking qgroup inconsistent should be enough for end
			 * users.
			 */
			DEBUG_WARN("duplicated but mismatched entry found");
			ret = -EEXIST;
		}
		kfree(block);
		goto out_unlock;
	}
	blocks->swapped = true;
out_unlock:
	spin_unlock(&blocks->lock);
out:
	if (ret < 0)
		qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret);
	return ret;
}
/*
 * Check if the tree block is a subtree root, and if so do the needed
 * delayed subtree trace for qgroup.
 *
 * This is called during btrfs_cow_block().
 */
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct extent_buffer *subvol_eb)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
	struct btrfs_qgroup_swapped_block AUTO_KFREE(block);
	struct extent_buffer *reloc_eb = NULL;
	struct rb_node *node;
	bool swapped = false;
	int level = btrfs_header_level(subvol_eb);
	int ret = 0;
	int i;

	if (!btrfs_qgroup_full_accounting(fs_info))
		return 0;
	if (!btrfs_is_fstree(btrfs_root_id(root)) || !root->reloc_root)
		return 0;

	spin_lock(&blocks->lock);
	if (!blocks->swapped) {
		spin_unlock(&blocks->lock);
		return 0;
	}
	node = rb_find(&subvol_eb->start, &blocks->blocks[level],
		       qgroup_swapped_block_bytenr_key_cmp);
	if (!node) {
		spin_unlock(&blocks->lock);
		goto out;
	}
	block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);

	/* Found one, remove it from @blocks first and update blocks->swapped */
	rb_erase(&block->node, &blocks->blocks[level]);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!RB_EMPTY_ROOT(&blocks->blocks[i])) {
			swapped = true;
			break;
		}
	}
	blocks->swapped = swapped;
	spin_unlock(&blocks->lock);

	check.level = block->level;
	check.transid = block->reloc_generation;
	check.has_first_key = true;
	memcpy(&check.first_key, &block->first_key, sizeof(check.first_key));

	/* Read out reloc subtree root */
	reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check);
	if (IS_ERR(reloc_eb)) {
		ret = PTR_ERR(reloc_eb);
		reloc_eb = NULL;
		goto free_out;
	}
	if (unlikely(!extent_buffer_uptodate(reloc_eb))) {
		ret = -EIO;
		goto free_out;
	}

	ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
					block->last_snapshot, block->trace_leaf);
free_out:
	free_extent_buffer(reloc_eb);
out:
	if (ret < 0) {
		qgroup_mark_inconsistent(fs_info,
				"failed to account subtree at bytenr %llu: %d",
				subvol_eb->start, ret);
	}
	return ret;
}

void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
{
	struct btrfs_qgroup_extent_record *entry;
	unsigned long index;

	xa_for_each(&trans->delayed_refs.dirty_extents, index, entry) {
		ulist_free(entry->old_roots);
		kfree(entry);
	}
	xa_destroy(&trans->delayed_refs.dirty_extents);
}
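/*
 * Apply a simple-quota delta: adjust the rfer/excl counters of the owning
 * qgroup and all of its ancestors by +/- @delta->num_bytes, skipping
 * extents that predate quota enablement.
 */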
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
			      const struct btrfs_squota_delta *delta)
{
	int ret;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup *qg;
	LIST_HEAD(qgroup_list);
	u64 root = delta->root;
	u64 num_bytes = delta->num_bytes;
	const int sign = (delta->is_inc ? 1 : -1);

	if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
		return 0;

	if (!btrfs_is_fstree(root))
		return 0;

	/* If the extent predates enabling quotas, don't count it. */
	if (delta->generation < fs_info->qgroup_enable_gen)
		return 0;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = find_qgroup_rb(fs_info, root);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	ret = 0;
	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(qg, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		qg->excl += num_bytes * sign;
		qg->rfer += num_bytes * sign;
		qgroup_dirty(fs_info, qg);

		list_for_each_entry(glist, &qg->groups, next_group)
			qgroup_iterator_add(&qgroup_list, glist->group);
	}
	qgroup_iterator_clean(&qgroup_list);

out:
	spin_unlock(&fs_info->qgroup_lock);
	return ret;
}