Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0
2
3#include "misc.h"
4#include "ctree.h"
5#include "block-rsv.h"
6#include "space-info.h"
7#include "transaction.h"
8#include "block-group.h"
9#include "fs.h"
10#include "accessors.h"
11
12/*
13 * HOW DO BLOCK RESERVES WORK
14 *
15 * Think of block_rsv's as buckets for logically grouped metadata
16 * reservations. Each block_rsv has a ->size and a ->reserved. ->size is
17 * how large we want our block rsv to be, ->reserved is how much space is
18 * currently reserved for this block reserve.
19 *
20 * ->failfast exists for the truncate case, and is described below.
21 *
22 * NORMAL OPERATION
23 *
24 * -> Reserve
25 * Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill
26 *
27 * We call into btrfs_reserve_metadata_bytes() with our bytes, which is
28 * accounted for in space_info->bytes_may_use, and then add the bytes to
29 * ->reserved, and ->size in the case of btrfs_block_rsv_add.
30 *
31 * ->size is an over-estimation of how much we may use for a particular
32 * operation.
33 *
34 * -> Use
35 * Entrance: btrfs_use_block_rsv
36 *
37 * When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv()
38 * to determine the appropriate block_rsv to use, and then verify that
39 * ->reserved has enough space for our tree block allocation. Once
40 * successful we subtract fs_info->nodesize from ->reserved.
41 *
42 * -> Finish
43 * Entrance: btrfs_block_rsv_release
44 *
45 * We are finished with our operation, subtract our individual reservation
46 * from ->size, and then subtract ->size from ->reserved and free up the
47 * excess if there is any.
48 *
49 * There is some logic here to refill the delayed refs rsv or the global rsv
50 * as needed, otherwise the excess is subtracted from
51 * space_info->bytes_may_use.
52 *
53 * TYPES OF BLOCK RESERVES
54 *
55 * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK
56 * These behave normally, as described above, just within the confines of the
57 * lifetime of their particular operation (transaction for the whole trans
58 * handle lifetime, for example).
59 *
60 * BLOCK_RSV_GLOBAL
61 * It is impossible to properly account for all the space that may be required
62 * to make our extent tree updates. This block reserve acts as an overflow
63 * buffer in case our delayed refs reserve does not reserve enough space to
64 * update the extent tree.
65 *
66 * We can steal from this in some cases as well, notably on evict() or
67 * truncate() in order to help users recover from ENOSPC conditions.
68 *
69 * BLOCK_RSV_DELALLOC
70 * The individual item sizes are determined by the per-inode size
71 * calculations, which are described with the delalloc code. This is pretty
72 * straightforward, it's just the calculation of ->size encodes a lot of
73 * different items, and thus it gets used when updating inodes, inserting file
74 * extents, and inserting checksums.
75 *
76 * BLOCK_RSV_DELREFS
77 * We keep a running tally of how many delayed refs we have on the system.
78 * We assume each one of these delayed refs is going to use a full
79 * reservation. We use the transaction items and pre-reserve space for every
80 * operation, and use this reservation to refill any gap between ->size and
81 * ->reserved that may exist.
82 *
83 * From there it's straightforward, removing a delayed ref means we remove its
84 * count from ->size and free up reservations as necessary. Since this is
85 * the most dynamic block reserve in the system, we will try to refill this
86 * block reserve first with any excess returned by any other block reserve.
87 *
88 * BLOCK_RSV_EMPTY
89 * This is the fallback block reserve to make us try to reserve space if we
90 * don't have a specific bucket for this allocation. It is mostly used for
91 * updating the device tree and such, since that is a separate pool we're
92 * content to just reserve space from the space_info on demand.
93 *
94 * BLOCK_RSV_TEMP
95 * This is used by things like truncate and iput. We will temporarily
96 * allocate a block reserve, set it to some size, and then truncate bytes
97 * until we have no space left. With ->failfast set we'll simply return
98 * ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try
99 * to make a new reservation. This is because these operations are
100 * unbounded, so we want to do as much work as we can, and then back off and
101 * re-reserve.
102 */
103
104static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
105 struct btrfs_block_rsv *block_rsv,
106 struct btrfs_block_rsv *dest, u64 num_bytes,
107 u64 *qgroup_to_release_ret)
108{
109 struct btrfs_space_info *space_info = block_rsv->space_info;
110 u64 qgroup_to_release = 0;
111 u64 ret;
112
113 spin_lock(&block_rsv->lock);
114 if (num_bytes == (u64)-1) {
115 num_bytes = block_rsv->size;
116 qgroup_to_release = block_rsv->qgroup_rsv_size;
117 }
118 block_rsv->size -= num_bytes;
119 if (block_rsv->reserved >= block_rsv->size) {
120 num_bytes = block_rsv->reserved - block_rsv->size;
121 block_rsv->reserved = block_rsv->size;
122 block_rsv->full = true;
123 } else {
124 num_bytes = 0;
125 }
126 if (qgroup_to_release_ret &&
127 block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
128 qgroup_to_release = block_rsv->qgroup_rsv_reserved -
129 block_rsv->qgroup_rsv_size;
130 block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
131 } else {
132 qgroup_to_release = 0;
133 }
134 spin_unlock(&block_rsv->lock);
135
136 ret = num_bytes;
137 if (num_bytes > 0) {
138 if (dest) {
139 spin_lock(&dest->lock);
140 if (!dest->full) {
141 u64 bytes_to_add;
142
143 bytes_to_add = dest->size - dest->reserved;
144 bytes_to_add = min(num_bytes, bytes_to_add);
145 dest->reserved += bytes_to_add;
146 if (dest->reserved >= dest->size)
147 dest->full = true;
148 num_bytes -= bytes_to_add;
149 }
150 spin_unlock(&dest->lock);
151 }
152 if (num_bytes)
153 btrfs_space_info_free_bytes_may_use(space_info, num_bytes);
154 }
155 if (qgroup_to_release_ret)
156 *qgroup_to_release_ret = qgroup_to_release;
157 return ret;
158}
159
160int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
161 struct btrfs_block_rsv *dst, u64 num_bytes,
162 bool update_size)
163{
164 int ret;
165
166 ret = btrfs_block_rsv_use_bytes(src, num_bytes);
167 if (ret)
168 return ret;
169
170 btrfs_block_rsv_add_bytes(dst, num_bytes, update_size);
171 return 0;
172}
173
174void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type)
175{
176 memset(rsv, 0, sizeof(*rsv));
177 spin_lock_init(&rsv->lock);
178 rsv->type = type;
179}
180
181void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
182 struct btrfs_block_rsv *rsv,
183 enum btrfs_rsv_type type)
184{
185 btrfs_init_block_rsv(rsv, type);
186 rsv->space_info = btrfs_find_space_info(fs_info,
187 BTRFS_BLOCK_GROUP_METADATA);
188}
189
190struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
191 enum btrfs_rsv_type type)
192{
193 struct btrfs_block_rsv *block_rsv;
194
195 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
196 if (!block_rsv)
197 return NULL;
198
199 btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
200 return block_rsv;
201}
202
203void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
204 struct btrfs_block_rsv *rsv)
205{
206 if (!rsv)
207 return;
208 btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
209 kfree(rsv);
210}
211
212int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
213 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
214 enum btrfs_reserve_flush_enum flush)
215{
216 int ret;
217
218 if (num_bytes == 0)
219 return 0;
220
221 ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, num_bytes, flush);
222 if (!ret)
223 btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
224
225 return ret;
226}
227
228int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent)
229{
230 u64 num_bytes = 0;
231 int ret = -ENOSPC;
232
233 spin_lock(&block_rsv->lock);
234 num_bytes = mult_perc(block_rsv->size, min_percent);
235 if (block_rsv->reserved >= num_bytes)
236 ret = 0;
237 spin_unlock(&block_rsv->lock);
238
239 return ret;
240}
241
242int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
243 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
244 enum btrfs_reserve_flush_enum flush)
245{
246 int ret = -ENOSPC;
247
248 if (!block_rsv)
249 return 0;
250
251 spin_lock(&block_rsv->lock);
252 if (block_rsv->reserved >= num_bytes)
253 ret = 0;
254 else
255 num_bytes -= block_rsv->reserved;
256 spin_unlock(&block_rsv->lock);
257
258 if (!ret)
259 return 0;
260
261 ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, num_bytes, flush);
262 if (!ret) {
263 btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
264 return 0;
265 }
266
267 return ret;
268}
269
270u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
271 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
272 u64 *qgroup_to_release)
273{
274 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
275 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
276 struct btrfs_block_rsv *target = NULL;
277
278 /*
279 * If we are a delayed block reserve then push to the global rsv,
280 * otherwise dump into the global delayed reserve if it is not full.
281 */
282 if (block_rsv->type == BTRFS_BLOCK_RSV_DELOPS)
283 target = global_rsv;
284 else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
285 target = delayed_rsv;
286
287 if (target && block_rsv->space_info != target->space_info)
288 target = NULL;
289
290 return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
291 qgroup_to_release);
292}
293
294int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
295{
296 int ret = -ENOSPC;
297
298 spin_lock(&block_rsv->lock);
299 if (block_rsv->reserved >= num_bytes) {
300 block_rsv->reserved -= num_bytes;
301 if (block_rsv->reserved < block_rsv->size)
302 block_rsv->full = false;
303 ret = 0;
304 }
305 spin_unlock(&block_rsv->lock);
306 return ret;
307}
308
309void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
310 u64 num_bytes, bool update_size)
311{
312 spin_lock(&block_rsv->lock);
313 block_rsv->reserved += num_bytes;
314 if (update_size)
315 block_rsv->size += num_bytes;
316 else if (block_rsv->reserved >= block_rsv->size)
317 block_rsv->full = true;
318 spin_unlock(&block_rsv->lock);
319}
320
321void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
322{
323 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
324 struct btrfs_space_info *sinfo = block_rsv->space_info;
325 struct btrfs_root *root, *tmp;
326 u64 num_bytes = btrfs_root_used(&fs_info->tree_root->root_item);
327 unsigned int min_items = 1;
328
329 /*
330 * The global block rsv is based on the size of the extent tree, the
331 * checksum tree and the root tree. If the fs is empty we want to set
332 * it to a minimal amount for safety.
333 *
334 * We also are going to need to modify the minimum of the tree root and
335 * any global roots we could touch.
336 */
337 read_lock(&fs_info->global_root_lock);
338 rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree,
339 rb_node) {
340 if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID ||
341 btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID ||
342 btrfs_root_id(root) == BTRFS_FREE_SPACE_TREE_OBJECTID) {
343 num_bytes += btrfs_root_used(&root->root_item);
344 min_items++;
345 }
346 }
347 read_unlock(&fs_info->global_root_lock);
348
349 if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
350 num_bytes += btrfs_root_used(&fs_info->block_group_root->root_item);
351 min_items++;
352 }
353
354 if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
355 num_bytes += btrfs_root_used(&fs_info->stripe_root->root_item);
356 min_items++;
357 }
358
359 /*
360 * But we also want to reserve enough space so we can do the fallback
361 * global reserve for an unlink, which is an additional
362 * BTRFS_UNLINK_METADATA_UNITS items.
363 *
364 * But we also need space for the delayed ref updates from the unlink,
365 * so add BTRFS_UNLINK_METADATA_UNITS units for delayed refs, one for
366 * each unlink metadata item.
367 */
368 min_items += BTRFS_UNLINK_METADATA_UNITS;
369
370 num_bytes = max_t(u64, num_bytes,
371 btrfs_calc_insert_metadata_size(fs_info, min_items) +
372 btrfs_calc_delayed_ref_bytes(fs_info,
373 BTRFS_UNLINK_METADATA_UNITS));
374
375 spin_lock(&sinfo->lock);
376 spin_lock(&block_rsv->lock);
377
378 block_rsv->size = min_t(u64, num_bytes, SZ_512M);
379
380 if (block_rsv->reserved < block_rsv->size) {
381 num_bytes = block_rsv->size - block_rsv->reserved;
382 btrfs_space_info_update_bytes_may_use(sinfo, num_bytes);
383 block_rsv->reserved = block_rsv->size;
384 } else if (block_rsv->reserved > block_rsv->size) {
385 num_bytes = block_rsv->reserved - block_rsv->size;
386 btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes);
387 block_rsv->reserved = block_rsv->size;
388 btrfs_try_granting_tickets(sinfo);
389 }
390
391 block_rsv->full = (block_rsv->reserved == block_rsv->size);
392
393 if (block_rsv->size >= sinfo->total_bytes)
394 sinfo->force_alloc = CHUNK_ALLOC_FORCE;
395 spin_unlock(&block_rsv->lock);
396 spin_unlock(&sinfo->lock);
397}
398
399void btrfs_init_root_block_rsv(struct btrfs_root *root)
400{
401 struct btrfs_fs_info *fs_info = root->fs_info;
402
403 switch (btrfs_root_id(root)) {
404 case BTRFS_CSUM_TREE_OBJECTID:
405 case BTRFS_EXTENT_TREE_OBJECTID:
406 case BTRFS_FREE_SPACE_TREE_OBJECTID:
407 case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
408 case BTRFS_RAID_STRIPE_TREE_OBJECTID:
409 root->block_rsv = &fs_info->delayed_refs_rsv;
410 break;
411 case BTRFS_ROOT_TREE_OBJECTID:
412 case BTRFS_DEV_TREE_OBJECTID:
413 case BTRFS_QUOTA_TREE_OBJECTID:
414 root->block_rsv = &fs_info->global_block_rsv;
415 break;
416 case BTRFS_CHUNK_TREE_OBJECTID:
417 root->block_rsv = &fs_info->chunk_block_rsv;
418 break;
419 case BTRFS_TREE_LOG_OBJECTID:
420 root->block_rsv = &fs_info->treelog_rsv;
421 break;
422 default:
423 root->block_rsv = NULL;
424 break;
425 }
426}
427
428void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
429{
430 struct btrfs_space_info *space_info;
431
432 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
433 fs_info->chunk_block_rsv.space_info = space_info;
434
435 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
436 fs_info->global_block_rsv.space_info = space_info;
437 fs_info->trans_block_rsv.space_info = space_info;
438 fs_info->empty_block_rsv.space_info = space_info;
439 fs_info->delayed_block_rsv.space_info = space_info;
440 fs_info->delayed_refs_rsv.space_info = space_info;
441
442 /* The treelog_rsv uses a dedicated space_info on the zoned mode. */
443 if (!btrfs_is_zoned(fs_info)) {
444 fs_info->treelog_rsv.space_info = space_info;
445 } else {
446 ASSERT(space_info->sub_group[0]->subgroup_id == BTRFS_SUB_GROUP_TREELOG);
447 fs_info->treelog_rsv.space_info = space_info->sub_group[0];
448 }
449
450 btrfs_update_global_block_rsv(fs_info);
451}
452
453void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
454{
455 btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1,
456 NULL);
457 WARN_ON(fs_info->trans_block_rsv.size > 0);
458 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
459 WARN_ON(fs_info->chunk_block_rsv.size > 0);
460 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
461 WARN_ON(fs_info->delayed_block_rsv.size > 0);
462 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
463 WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
464 WARN_ON(fs_info->delayed_refs_rsv.size > 0);
465}
466
467static struct btrfs_block_rsv *get_block_rsv(
468 const struct btrfs_trans_handle *trans,
469 const struct btrfs_root *root)
470{
471 struct btrfs_fs_info *fs_info = root->fs_info;
472 struct btrfs_block_rsv *block_rsv = NULL;
473
474 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
475 (root == fs_info->uuid_root) ||
476 (trans->adding_csums && btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID))
477 block_rsv = trans->block_rsv;
478
479 if (!block_rsv)
480 block_rsv = root->block_rsv;
481
482 if (!block_rsv)
483 block_rsv = &fs_info->empty_block_rsv;
484
485 return block_rsv;
486}
487
488struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
489 struct btrfs_root *root,
490 u32 blocksize)
491{
492 struct btrfs_fs_info *fs_info = root->fs_info;
493 struct btrfs_block_rsv *block_rsv;
494 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
495 int ret;
496 bool global_updated = false;
497
498 block_rsv = get_block_rsv(trans, root);
499
500 if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
501 goto try_reserve;
502again:
503 ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
504 if (!ret)
505 return block_rsv;
506
507 if (block_rsv->failfast)
508 return ERR_PTR(ret);
509
510 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
511 global_updated = true;
512 btrfs_update_global_block_rsv(fs_info);
513 goto again;
514 }
515
516 /*
517 * The global reserve still exists to save us from ourselves, so don't
518 * warn_on if we are short on our delayed refs reserve.
519 */
520 if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
521 btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
522 static DEFINE_RATELIMIT_STATE(_rs,
523 DEFAULT_RATELIMIT_INTERVAL * 10,
524 /*DEFAULT_RATELIMIT_BURST*/ 1);
525 if (__ratelimit(&_rs))
526 WARN(1, KERN_DEBUG
527 "BTRFS: block rsv %d returned %d\n",
528 block_rsv->type, ret);
529 }
530try_reserve:
531 ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, blocksize,
532 BTRFS_RESERVE_NO_FLUSH);
533 if (!ret)
534 return block_rsv;
535 /*
536 * If we couldn't reserve metadata bytes try and use some from
537 * the global reserve if its space type is the same as the global
538 * reservation.
539 */
540 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
541 block_rsv->space_info == global_rsv->space_info) {
542 ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
543 if (!ret)
544 return global_rsv;
545 }
546
547 /*
548 * All hope is lost, but of course our reservations are overly
549 * pessimistic, so instead of possibly having an ENOSPC abort here, try
550 * one last time to force a reservation if there's enough actual space
551 * on disk to make the reservation.
552 */
553 ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, blocksize,
554 BTRFS_RESERVE_FLUSH_EMERGENCY);
555 if (!ret)
556 return block_rsv;
557
558 return ERR_PTR(ret);
559}
560
561int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info,
562 struct btrfs_block_rsv *rsv)
563{
564 u64 needed_bytes;
565 int ret;
566
567 /* 1 for slack space, 1 for updating the inode */
568 needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
569 btrfs_calc_metadata_size(fs_info, 1);
570
571 spin_lock(&rsv->lock);
572 if (rsv->reserved < needed_bytes)
573 ret = -ENOSPC;
574 else
575 ret = 0;
576 spin_unlock(&rsv->lock);
577 return ret;
578}