Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2014 Red Hat, Inc.
4 * All Rights Reserved.
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_log_format.h"
11#include "xfs_trans_resv.h"
12#include "xfs_sb.h"
13#include "xfs_mount.h"
14#include "xfs_trans.h"
15#include "xfs_alloc.h"
16#include "xfs_btree.h"
17#include "xfs_btree_staging.h"
18#include "xfs_rmap.h"
19#include "xfs_rmap_btree.h"
20#include "xfs_trace.h"
21#include "xfs_error.h"
22#include "xfs_extent_busy.h"
23#include "xfs_ag_resv.h"
24
25/*
26 * Reverse map btree.
27 *
28 * This is a per-ag tree used to track the owner(s) of a given extent. With
29 * reflink it is possible for there to be multiple owners, which is a departure
30 * from classic XFS. Owner records for data extents are inserted when the
31 * extent is mapped and removed when an extent is unmapped. Owner records for
32 * all other block types (i.e. metadata) are inserted when an extent is
33 * allocated and removed when an extent is freed. There can only be one owner
34 * of a metadata extent, usually an inode or some other metadata structure like
35 * an AG btree.
36 *
37 * The rmap btree is part of the free space management, so blocks for the tree
38 * are sourced from the agfl. Hence we need transaction reservation support for
39 * this tree so that the freelist is always large enough. This also impacts on
40 * the minimum space we need to leave free in the AG.
41 *
42 * The tree is ordered by [ag block, owner, offset]. This is a large key size,
43 * but it is the only way to enforce unique keys when a block can be owned by
44 * multiple files at any offset. There's no need to order/search by extent
45 * size for online updating/management of the tree. It is intended that most
46 * reverse lookups will be to find the owner(s) of a particular block, or to
47 * try to recover tree and file data from corrupt primary metadata.
48 */
49
50static struct xfs_btree_cur *
51xfs_rmapbt_dup_cursor(
52 struct xfs_btree_cur *cur)
53{
54 return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
55 cur->bc_ag.agbp, cur->bc_ag.agno);
56}
57
58STATIC void
59xfs_rmapbt_set_root(
60 struct xfs_btree_cur *cur,
61 union xfs_btree_ptr *ptr,
62 int inc)
63{
64 struct xfs_buf *agbp = cur->bc_ag.agbp;
65 struct xfs_agf *agf = agbp->b_addr;
66 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
67 int btnum = cur->bc_btnum;
68 struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
69
70 ASSERT(ptr->s != 0);
71
72 agf->agf_roots[btnum] = ptr->s;
73 be32_add_cpu(&agf->agf_levels[btnum], inc);
74 pag->pagf_levels[btnum] += inc;
75 xfs_perag_put(pag);
76
77 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
78}
79
80STATIC int
81xfs_rmapbt_alloc_block(
82 struct xfs_btree_cur *cur,
83 union xfs_btree_ptr *start,
84 union xfs_btree_ptr *new,
85 int *stat)
86{
87 struct xfs_buf *agbp = cur->bc_ag.agbp;
88 struct xfs_agf *agf = agbp->b_addr;
89 int error;
90 xfs_agblock_t bno;
91
92 /* Allocate the new block from the freelist. If we can't, give up. */
93 error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp,
94 &bno, 1);
95 if (error)
96 return error;
97
98 trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_ag.agno,
99 bno, 1);
100 if (bno == NULLAGBLOCK) {
101 *stat = 0;
102 return 0;
103 }
104
105 xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1,
106 false);
107
108 xfs_trans_agbtree_delta(cur->bc_tp, 1);
109 new->s = cpu_to_be32(bno);
110 be32_add_cpu(&agf->agf_rmap_blocks, 1);
111 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
112
113 xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_ag.agno);
114
115 *stat = 1;
116 return 0;
117}
118
119STATIC int
120xfs_rmapbt_free_block(
121 struct xfs_btree_cur *cur,
122 struct xfs_buf *bp)
123{
124 struct xfs_buf *agbp = cur->bc_ag.agbp;
125 struct xfs_agf *agf = agbp->b_addr;
126 xfs_agblock_t bno;
127 int error;
128
129 bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
130 trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno,
131 bno, 1);
132 be32_add_cpu(&agf->agf_rmap_blocks, -1);
133 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
134 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
135 if (error)
136 return error;
137
138 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
139 XFS_EXTENT_BUSY_SKIP_DISCARD);
140 xfs_trans_agbtree_delta(cur->bc_tp, -1);
141
142 xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_ag.agno);
143
144 return 0;
145}
146
147STATIC int
148xfs_rmapbt_get_minrecs(
149 struct xfs_btree_cur *cur,
150 int level)
151{
152 return cur->bc_mp->m_rmap_mnr[level != 0];
153}
154
155STATIC int
156xfs_rmapbt_get_maxrecs(
157 struct xfs_btree_cur *cur,
158 int level)
159{
160 return cur->bc_mp->m_rmap_mxr[level != 0];
161}
162
163STATIC void
164xfs_rmapbt_init_key_from_rec(
165 union xfs_btree_key *key,
166 union xfs_btree_rec *rec)
167{
168 key->rmap.rm_startblock = rec->rmap.rm_startblock;
169 key->rmap.rm_owner = rec->rmap.rm_owner;
170 key->rmap.rm_offset = rec->rmap.rm_offset;
171}
172
173/*
174 * The high key for a reverse mapping record can be computed by shifting
175 * the startblock and offset to the highest value that would still map
176 * to that record. In practice this means that we add blockcount-1 to
177 * the startblock for all records, and if the record is for a data/attr
178 * fork mapping, we add blockcount-1 to the offset too.
179 */
180STATIC void
181xfs_rmapbt_init_high_key_from_rec(
182 union xfs_btree_key *key,
183 union xfs_btree_rec *rec)
184{
185 uint64_t off;
186 int adj;
187
188 adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
189
190 key->rmap.rm_startblock = rec->rmap.rm_startblock;
191 be32_add_cpu(&key->rmap.rm_startblock, adj);
192 key->rmap.rm_owner = rec->rmap.rm_owner;
193 key->rmap.rm_offset = rec->rmap.rm_offset;
194 if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
195 XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
196 return;
197 off = be64_to_cpu(key->rmap.rm_offset);
198 off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
199 key->rmap.rm_offset = cpu_to_be64(off);
200}
201
202STATIC void
203xfs_rmapbt_init_rec_from_cur(
204 struct xfs_btree_cur *cur,
205 union xfs_btree_rec *rec)
206{
207 rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
208 rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
209 rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
210 rec->rmap.rm_offset = cpu_to_be64(
211 xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
212}
213
214STATIC void
215xfs_rmapbt_init_ptr_from_cur(
216 struct xfs_btree_cur *cur,
217 union xfs_btree_ptr *ptr)
218{
219 struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
220
221 ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno));
222
223 ptr->s = agf->agf_roots[cur->bc_btnum];
224}
225
226STATIC int64_t
227xfs_rmapbt_key_diff(
228 struct xfs_btree_cur *cur,
229 union xfs_btree_key *key)
230{
231 struct xfs_rmap_irec *rec = &cur->bc_rec.r;
232 struct xfs_rmap_key *kp = &key->rmap;
233 __u64 x, y;
234 int64_t d;
235
236 d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
237 if (d)
238 return d;
239
240 x = be64_to_cpu(kp->rm_owner);
241 y = rec->rm_owner;
242 if (x > y)
243 return 1;
244 else if (y > x)
245 return -1;
246
247 x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
248 y = rec->rm_offset;
249 if (x > y)
250 return 1;
251 else if (y > x)
252 return -1;
253 return 0;
254}
255
256STATIC int64_t
257xfs_rmapbt_diff_two_keys(
258 struct xfs_btree_cur *cur,
259 union xfs_btree_key *k1,
260 union xfs_btree_key *k2)
261{
262 struct xfs_rmap_key *kp1 = &k1->rmap;
263 struct xfs_rmap_key *kp2 = &k2->rmap;
264 int64_t d;
265 __u64 x, y;
266
267 d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
268 be32_to_cpu(kp2->rm_startblock);
269 if (d)
270 return d;
271
272 x = be64_to_cpu(kp1->rm_owner);
273 y = be64_to_cpu(kp2->rm_owner);
274 if (x > y)
275 return 1;
276 else if (y > x)
277 return -1;
278
279 x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
280 y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
281 if (x > y)
282 return 1;
283 else if (y > x)
284 return -1;
285 return 0;
286}
287
288static xfs_failaddr_t
289xfs_rmapbt_verify(
290 struct xfs_buf *bp)
291{
292 struct xfs_mount *mp = bp->b_mount;
293 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
294 struct xfs_perag *pag = bp->b_pag;
295 xfs_failaddr_t fa;
296 unsigned int level;
297
298 /*
299 * magic number and level verification
300 *
301 * During growfs operations, we can't verify the exact level or owner as
302 * the perag is not fully initialised and hence not attached to the
303 * buffer. In this case, check against the maximum tree depth.
304 *
305 * Similarly, during log recovery we will have a perag structure
306 * attached, but the agf information will not yet have been initialised
307 * from the on disk AGF. Again, we can only check against maximum limits
308 * in this case.
309 */
310 if (!xfs_verify_magic(bp, block->bb_magic))
311 return __this_address;
312
313 if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
314 return __this_address;
315 fa = xfs_btree_sblock_v5hdr_verify(bp);
316 if (fa)
317 return fa;
318
319 level = be16_to_cpu(block->bb_level);
320 if (pag && pag->pagf_init) {
321 if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
322 return __this_address;
323 } else if (level >= mp->m_rmap_maxlevels)
324 return __this_address;
325
326 return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
327}
328
329static void
330xfs_rmapbt_read_verify(
331 struct xfs_buf *bp)
332{
333 xfs_failaddr_t fa;
334
335 if (!xfs_btree_sblock_verify_crc(bp))
336 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
337 else {
338 fa = xfs_rmapbt_verify(bp);
339 if (fa)
340 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
341 }
342
343 if (bp->b_error)
344 trace_xfs_btree_corrupt(bp, _RET_IP_);
345}
346
347static void
348xfs_rmapbt_write_verify(
349 struct xfs_buf *bp)
350{
351 xfs_failaddr_t fa;
352
353 fa = xfs_rmapbt_verify(bp);
354 if (fa) {
355 trace_xfs_btree_corrupt(bp, _RET_IP_);
356 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
357 return;
358 }
359 xfs_btree_sblock_calc_crc(bp);
360
361}
362
363const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
364 .name = "xfs_rmapbt",
365 .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
366 .verify_read = xfs_rmapbt_read_verify,
367 .verify_write = xfs_rmapbt_write_verify,
368 .verify_struct = xfs_rmapbt_verify,
369};
370
371STATIC int
372xfs_rmapbt_keys_inorder(
373 struct xfs_btree_cur *cur,
374 union xfs_btree_key *k1,
375 union xfs_btree_key *k2)
376{
377 uint32_t x;
378 uint32_t y;
379 uint64_t a;
380 uint64_t b;
381
382 x = be32_to_cpu(k1->rmap.rm_startblock);
383 y = be32_to_cpu(k2->rmap.rm_startblock);
384 if (x < y)
385 return 1;
386 else if (x > y)
387 return 0;
388 a = be64_to_cpu(k1->rmap.rm_owner);
389 b = be64_to_cpu(k2->rmap.rm_owner);
390 if (a < b)
391 return 1;
392 else if (a > b)
393 return 0;
394 a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
395 b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
396 if (a <= b)
397 return 1;
398 return 0;
399}
400
401STATIC int
402xfs_rmapbt_recs_inorder(
403 struct xfs_btree_cur *cur,
404 union xfs_btree_rec *r1,
405 union xfs_btree_rec *r2)
406{
407 uint32_t x;
408 uint32_t y;
409 uint64_t a;
410 uint64_t b;
411
412 x = be32_to_cpu(r1->rmap.rm_startblock);
413 y = be32_to_cpu(r2->rmap.rm_startblock);
414 if (x < y)
415 return 1;
416 else if (x > y)
417 return 0;
418 a = be64_to_cpu(r1->rmap.rm_owner);
419 b = be64_to_cpu(r2->rmap.rm_owner);
420 if (a < b)
421 return 1;
422 else if (a > b)
423 return 0;
424 a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
425 b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
426 if (a <= b)
427 return 1;
428 return 0;
429}
430
431static const struct xfs_btree_ops xfs_rmapbt_ops = {
432 .rec_len = sizeof(struct xfs_rmap_rec),
433 .key_len = 2 * sizeof(struct xfs_rmap_key),
434
435 .dup_cursor = xfs_rmapbt_dup_cursor,
436 .set_root = xfs_rmapbt_set_root,
437 .alloc_block = xfs_rmapbt_alloc_block,
438 .free_block = xfs_rmapbt_free_block,
439 .get_minrecs = xfs_rmapbt_get_minrecs,
440 .get_maxrecs = xfs_rmapbt_get_maxrecs,
441 .init_key_from_rec = xfs_rmapbt_init_key_from_rec,
442 .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
443 .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
444 .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur,
445 .key_diff = xfs_rmapbt_key_diff,
446 .buf_ops = &xfs_rmapbt_buf_ops,
447 .diff_two_keys = xfs_rmapbt_diff_two_keys,
448 .keys_inorder = xfs_rmapbt_keys_inorder,
449 .recs_inorder = xfs_rmapbt_recs_inorder,
450};
451
452static struct xfs_btree_cur *
453xfs_rmapbt_init_common(
454 struct xfs_mount *mp,
455 struct xfs_trans *tp,
456 xfs_agnumber_t agno)
457{
458 struct xfs_btree_cur *cur;
459
460 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
461 cur->bc_tp = tp;
462 cur->bc_mp = mp;
463 /* Overlapping btree; 2 keys per pointer. */
464 cur->bc_btnum = XFS_BTNUM_RMAP;
465 cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
466 cur->bc_blocklog = mp->m_sb.sb_blocklog;
467 cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
468 cur->bc_ag.agno = agno;
469 cur->bc_ops = &xfs_rmapbt_ops;
470
471 return cur;
472}
473
474/* Create a new reverse mapping btree cursor. */
475struct xfs_btree_cur *
476xfs_rmapbt_init_cursor(
477 struct xfs_mount *mp,
478 struct xfs_trans *tp,
479 struct xfs_buf *agbp,
480 xfs_agnumber_t agno)
481{
482 struct xfs_agf *agf = agbp->b_addr;
483 struct xfs_btree_cur *cur;
484
485 cur = xfs_rmapbt_init_common(mp, tp, agno);
486 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
487 cur->bc_ag.agbp = agbp;
488 return cur;
489}
490
491/* Create a new reverse mapping btree cursor with a fake root for staging. */
492struct xfs_btree_cur *
493xfs_rmapbt_stage_cursor(
494 struct xfs_mount *mp,
495 struct xbtree_afakeroot *afake,
496 xfs_agnumber_t agno)
497{
498 struct xfs_btree_cur *cur;
499
500 cur = xfs_rmapbt_init_common(mp, NULL, agno);
501 xfs_btree_stage_afakeroot(cur, afake);
502 return cur;
503}
504
505/*
506 * Install a new reverse mapping btree root. Caller is responsible for
507 * invalidating and freeing the old btree blocks.
508 */
509void
510xfs_rmapbt_commit_staged_btree(
511 struct xfs_btree_cur *cur,
512 struct xfs_trans *tp,
513 struct xfs_buf *agbp)
514{
515 struct xfs_agf *agf = agbp->b_addr;
516 struct xbtree_afakeroot *afake = cur->bc_ag.afake;
517
518 ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
519
520 agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
521 agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
522 agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
523 xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
524 XFS_AGF_RMAP_BLOCKS);
525 xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
526}
527
528/*
529 * Calculate number of records in an rmap btree block.
530 */
531int
532xfs_rmapbt_maxrecs(
533 int blocklen,
534 int leaf)
535{
536 blocklen -= XFS_RMAP_BLOCK_LEN;
537
538 if (leaf)
539 return blocklen / sizeof(struct xfs_rmap_rec);
540 return blocklen /
541 (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
542}
543
544/* Compute the maximum height of an rmap btree. */
545void
546xfs_rmapbt_compute_maxlevels(
547 struct xfs_mount *mp)
548{
549 /*
550 * On a non-reflink filesystem, the maximum number of rmap
551 * records is the number of blocks in the AG, hence the max
552 * rmapbt height is log_$maxrecs($agblocks). However, with
553 * reflink each AG block can have up to 2^32 (per the refcount
554 * record format) owners, which means that theoretically we
555 * could face up to 2^64 rmap records.
556 *
557 * That effectively means that the max rmapbt height must be
558 * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
559 * blocks to feed the rmapbt long before the rmapbt reaches
560 * maximum height. The reflink code uses ag_resv_critical to
561 * disallow reflinking when less than 10% of the per-AG metadata
562 * block reservation since the fallback is a regular file copy.
563 */
564 if (xfs_sb_version_hasreflink(&mp->m_sb))
565 mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
566 else
567 mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
568 mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
569}
570
571/* Calculate the refcount btree size for some records. */
572xfs_extlen_t
573xfs_rmapbt_calc_size(
574 struct xfs_mount *mp,
575 unsigned long long len)
576{
577 return xfs_btree_calc_size(mp->m_rmap_mnr, len);
578}
579
580/*
581 * Calculate the maximum refcount btree size.
582 */
583xfs_extlen_t
584xfs_rmapbt_max_size(
585 struct xfs_mount *mp,
586 xfs_agblock_t agblocks)
587{
588 /* Bail out if we're uninitialized, which can happen in mkfs. */
589 if (mp->m_rmap_mxr[0] == 0)
590 return 0;
591
592 return xfs_rmapbt_calc_size(mp, agblocks);
593}
594
595/*
596 * Figure out how many blocks to reserve and how many are used by this btree.
597 */
598int
599xfs_rmapbt_calc_reserves(
600 struct xfs_mount *mp,
601 struct xfs_trans *tp,
602 xfs_agnumber_t agno,
603 xfs_extlen_t *ask,
604 xfs_extlen_t *used)
605{
606 struct xfs_buf *agbp;
607 struct xfs_agf *agf;
608 xfs_agblock_t agblocks;
609 xfs_extlen_t tree_len;
610 int error;
611
612 if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
613 return 0;
614
615 error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
616 if (error)
617 return error;
618
619 agf = agbp->b_addr;
620 agblocks = be32_to_cpu(agf->agf_length);
621 tree_len = be32_to_cpu(agf->agf_rmap_blocks);
622 xfs_trans_brelse(tp, agbp);
623
624 /*
625 * The log is permanently allocated, so the space it occupies will
626 * never be available for the kinds of things that would require btree
627 * expansion. We therefore can pretend the space isn't there.
628 */
629 if (mp->m_sb.sb_logstart &&
630 XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
631 agblocks -= mp->m_sb.sb_logblocks;
632
633 /* Reserve 1% of the AG or enough for 1 block per record. */
634 *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
635 *used += tree_len;
636
637 return error;
638}