Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v4.11-rc5 596 lines 16 kB view raw
1/* 2 * Copyright (c) 2014 Red Hat, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18#include "xfs.h" 19#include "xfs_fs.h" 20#include "xfs_shared.h" 21#include "xfs_format.h" 22#include "xfs_log_format.h" 23#include "xfs_trans_resv.h" 24#include "xfs_bit.h" 25#include "xfs_sb.h" 26#include "xfs_mount.h" 27#include "xfs_defer.h" 28#include "xfs_inode.h" 29#include "xfs_trans.h" 30#include "xfs_alloc.h" 31#include "xfs_btree.h" 32#include "xfs_rmap.h" 33#include "xfs_rmap_btree.h" 34#include "xfs_trace.h" 35#include "xfs_cksum.h" 36#include "xfs_error.h" 37#include "xfs_extent_busy.h" 38#include "xfs_ag_resv.h" 39 40/* 41 * Reverse map btree. 42 * 43 * This is a per-ag tree used to track the owner(s) of a given extent. With 44 * reflink it is possible for there to be multiple owners, which is a departure 45 * from classic XFS. Owner records for data extents are inserted when the 46 * extent is mapped and removed when an extent is unmapped. Owner records for 47 * all other block types (i.e. metadata) are inserted when an extent is 48 * allocated and removed when an extent is freed. There can only be one owner 49 * of a metadata extent, usually an inode or some other metadata structure like 50 * an AG btree. 51 * 52 * The rmap btree is part of the free space management, so blocks for the tree 53 * are sourced from the agfl. Hence we need transaction reservation support for 54 * this tree so that the freelist is always large enough. This also impacts on 55 * the minimum space we need to leave free in the AG. 56 * 57 * The tree is ordered by [ag block, owner, offset]. This is a large key size, 58 * but it is the only way to enforce unique keys when a block can be owned by 59 * multiple files at any offset. There's no need to order/search by extent 60 * size for online updating/management of the tree. It is intended that most 61 * reverse lookups will be to find the owner(s) of a particular block, or to 62 * try to recover tree and file data from corrupt primary metadata. 63 */ 64 65static struct xfs_btree_cur * 66xfs_rmapbt_dup_cursor( 67 struct xfs_btree_cur *cur) 68{ 69 return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, 70 cur->bc_private.a.agbp, cur->bc_private.a.agno); 71} 72 73STATIC void 74xfs_rmapbt_set_root( 75 struct xfs_btree_cur *cur, 76 union xfs_btree_ptr *ptr, 77 int inc) 78{ 79 struct xfs_buf *agbp = cur->bc_private.a.agbp; 80 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 81 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); 82 int btnum = cur->bc_btnum; 83 struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); 84 85 ASSERT(ptr->s != 0); 86 87 agf->agf_roots[btnum] = ptr->s; 88 be32_add_cpu(&agf->agf_levels[btnum], inc); 89 pag->pagf_levels[btnum] += inc; 90 xfs_perag_put(pag); 91 92 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); 93} 94 95STATIC int 96xfs_rmapbt_alloc_block( 97 struct xfs_btree_cur *cur, 98 union xfs_btree_ptr *start, 99 union xfs_btree_ptr *new, 100 int *stat) 101{ 102 struct xfs_buf *agbp = cur->bc_private.a.agbp; 103 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 104 int error; 105 xfs_agblock_t bno; 106 107 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); 108 109 /* Allocate the new block from the freelist. If we can't, give up. */ 110 error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, 111 &bno, 1); 112 if (error) { 113 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); 114 return error; 115 } 116 117 trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, 118 bno, 1); 119 if (bno == NULLAGBLOCK) { 120 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 121 *stat = 0; 122 return 0; 123 } 124 125 xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, 126 false); 127 128 xfs_trans_agbtree_delta(cur->bc_tp, 1); 129 new->s = cpu_to_be32(bno); 130 be32_add_cpu(&agf->agf_rmap_blocks, 1); 131 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); 132 133 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 134 *stat = 1; 135 return 0; 136} 137 138STATIC int 139xfs_rmapbt_free_block( 140 struct xfs_btree_cur *cur, 141 struct xfs_buf *bp) 142{ 143 struct xfs_buf *agbp = cur->bc_private.a.agbp; 144 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 145 xfs_agblock_t bno; 146 int error; 147 148 bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); 149 trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno, 150 bno, 1); 151 be32_add_cpu(&agf->agf_rmap_blocks, -1); 152 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); 153 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); 154 if (error) 155 return error; 156 157 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, 158 XFS_EXTENT_BUSY_SKIP_DISCARD); 159 xfs_trans_agbtree_delta(cur->bc_tp, -1); 160 161 return 0; 162} 163 164STATIC int 165xfs_rmapbt_get_minrecs( 166 struct xfs_btree_cur *cur, 167 int level) 168{ 169 return cur->bc_mp->m_rmap_mnr[level != 0]; 170} 171 172STATIC int 173xfs_rmapbt_get_maxrecs( 174 struct xfs_btree_cur *cur, 175 int level) 176{ 177 return cur->bc_mp->m_rmap_mxr[level != 0]; 178} 179 180STATIC void 181xfs_rmapbt_init_key_from_rec( 182 union xfs_btree_key *key, 183 union xfs_btree_rec *rec) 184{ 185 key->rmap.rm_startblock = rec->rmap.rm_startblock; 186 key->rmap.rm_owner = rec->rmap.rm_owner; 187 key->rmap.rm_offset = rec->rmap.rm_offset; 188} 189 190/* 191 * The high key for a reverse mapping record can be computed by shifting 192 * the startblock and offset to the highest value that would still map 193 * to that record. In practice this means that we add blockcount-1 to 194 * the startblock for all records, and if the record is for a data/attr 195 * fork mapping, we add blockcount-1 to the offset too. 196 */ 197STATIC void 198xfs_rmapbt_init_high_key_from_rec( 199 union xfs_btree_key *key, 200 union xfs_btree_rec *rec) 201{ 202 __uint64_t off; 203 int adj; 204 205 adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; 206 207 key->rmap.rm_startblock = rec->rmap.rm_startblock; 208 be32_add_cpu(&key->rmap.rm_startblock, adj); 209 key->rmap.rm_owner = rec->rmap.rm_owner; 210 key->rmap.rm_offset = rec->rmap.rm_offset; 211 if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || 212 XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) 213 return; 214 off = be64_to_cpu(key->rmap.rm_offset); 215 off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK); 216 key->rmap.rm_offset = cpu_to_be64(off); 217} 218 219STATIC void 220xfs_rmapbt_init_rec_from_cur( 221 struct xfs_btree_cur *cur, 222 union xfs_btree_rec *rec) 223{ 224 rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock); 225 rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount); 226 rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner); 227 rec->rmap.rm_offset = cpu_to_be64( 228 xfs_rmap_irec_offset_pack(&cur->bc_rec.r)); 229} 230 231STATIC void 232xfs_rmapbt_init_ptr_from_cur( 233 struct xfs_btree_cur *cur, 234 union xfs_btree_ptr *ptr) 235{ 236 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); 237 238 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); 239 ASSERT(agf->agf_roots[cur->bc_btnum] != 0); 240 241 ptr->s = agf->agf_roots[cur->bc_btnum]; 242} 243 244STATIC __int64_t 245xfs_rmapbt_key_diff( 246 struct xfs_btree_cur *cur, 247 union xfs_btree_key *key) 248{ 249 struct xfs_rmap_irec *rec = &cur->bc_rec.r; 250 struct xfs_rmap_key *kp = &key->rmap; 251 __u64 x, y; 252 __int64_t d; 253 254 d = (__int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; 255 if (d) 256 return d; 257 258 x = be64_to_cpu(kp->rm_owner); 259 y = rec->rm_owner; 260 if (x > y) 261 return 1; 262 else if (y > x) 263 return -1; 264 265 x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset)); 266 y = rec->rm_offset; 267 if (x > y) 268 return 1; 269 else if (y > x) 270 return -1; 271 return 0; 272} 273 274STATIC __int64_t 275xfs_rmapbt_diff_two_keys( 276 struct xfs_btree_cur *cur, 277 union xfs_btree_key *k1, 278 union xfs_btree_key *k2) 279{ 280 struct xfs_rmap_key *kp1 = &k1->rmap; 281 struct xfs_rmap_key *kp2 = &k2->rmap; 282 __int64_t d; 283 __u64 x, y; 284 285 d = (__int64_t)be32_to_cpu(kp1->rm_startblock) - 286 be32_to_cpu(kp2->rm_startblock); 287 if (d) 288 return d; 289 290 x = be64_to_cpu(kp1->rm_owner); 291 y = be64_to_cpu(kp2->rm_owner); 292 if (x > y) 293 return 1; 294 else if (y > x) 295 return -1; 296 297 x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset)); 298 y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset)); 299 if (x > y) 300 return 1; 301 else if (y > x) 302 return -1; 303 return 0; 304} 305 306static bool 307xfs_rmapbt_verify( 308 struct xfs_buf *bp) 309{ 310 struct xfs_mount *mp = bp->b_target->bt_mount; 311 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 312 struct xfs_perag *pag = bp->b_pag; 313 unsigned int level; 314 315 /* 316 * magic number and level verification 317 * 318 * During growfs operations, we can't verify the exact level or owner as 319 * the perag is not fully initialised and hence not attached to the 320 * buffer. In this case, check against the maximum tree depth. 321 * 322 * Similarly, during log recovery we will have a perag structure 323 * attached, but the agf information will not yet have been initialised 324 * from the on disk AGF. Again, we can only check against maximum limits 325 * in this case. 326 */ 327 if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC)) 328 return false; 329 330 if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) 331 return false; 332 if (!xfs_btree_sblock_v5hdr_verify(bp)) 333 return false; 334 335 level = be16_to_cpu(block->bb_level); 336 if (pag && pag->pagf_init) { 337 if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) 338 return false; 339 } else if (level >= mp->m_rmap_maxlevels) 340 return false; 341 342 return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]); 343} 344 345static void 346xfs_rmapbt_read_verify( 347 struct xfs_buf *bp) 348{ 349 if (!xfs_btree_sblock_verify_crc(bp)) 350 xfs_buf_ioerror(bp, -EFSBADCRC); 351 else if (!xfs_rmapbt_verify(bp)) 352 xfs_buf_ioerror(bp, -EFSCORRUPTED); 353 354 if (bp->b_error) { 355 trace_xfs_btree_corrupt(bp, _RET_IP_); 356 xfs_verifier_error(bp); 357 } 358} 359 360static void 361xfs_rmapbt_write_verify( 362 struct xfs_buf *bp) 363{ 364 if (!xfs_rmapbt_verify(bp)) { 365 trace_xfs_btree_corrupt(bp, _RET_IP_); 366 xfs_buf_ioerror(bp, -EFSCORRUPTED); 367 xfs_verifier_error(bp); 368 return; 369 } 370 xfs_btree_sblock_calc_crc(bp); 371 372} 373 374const struct xfs_buf_ops xfs_rmapbt_buf_ops = { 375 .name = "xfs_rmapbt", 376 .verify_read = xfs_rmapbt_read_verify, 377 .verify_write = xfs_rmapbt_write_verify, 378}; 379 380#if defined(DEBUG) || defined(XFS_WARN) 381STATIC int 382xfs_rmapbt_keys_inorder( 383 struct xfs_btree_cur *cur, 384 union xfs_btree_key *k1, 385 union xfs_btree_key *k2) 386{ 387 __uint32_t x; 388 __uint32_t y; 389 __uint64_t a; 390 __uint64_t b; 391 392 x = be32_to_cpu(k1->rmap.rm_startblock); 393 y = be32_to_cpu(k2->rmap.rm_startblock); 394 if (x < y) 395 return 1; 396 else if (x > y) 397 return 0; 398 a = be64_to_cpu(k1->rmap.rm_owner); 399 b = be64_to_cpu(k2->rmap.rm_owner); 400 if (a < b) 401 return 1; 402 else if (a > b) 403 return 0; 404 a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset)); 405 b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset)); 406 if (a <= b) 407 return 1; 408 return 0; 409} 410 411STATIC int 412xfs_rmapbt_recs_inorder( 413 struct xfs_btree_cur *cur, 414 union xfs_btree_rec *r1, 415 union xfs_btree_rec *r2) 416{ 417 __uint32_t x; 418 __uint32_t y; 419 __uint64_t a; 420 __uint64_t b; 421 422 x = be32_to_cpu(r1->rmap.rm_startblock); 423 y = be32_to_cpu(r2->rmap.rm_startblock); 424 if (x < y) 425 return 1; 426 else if (x > y) 427 return 0; 428 a = be64_to_cpu(r1->rmap.rm_owner); 429 b = be64_to_cpu(r2->rmap.rm_owner); 430 if (a < b) 431 return 1; 432 else if (a > b) 433 return 0; 434 a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset)); 435 b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset)); 436 if (a <= b) 437 return 1; 438 return 0; 439} 440#endif /* DEBUG */ 441 442static const struct xfs_btree_ops xfs_rmapbt_ops = { 443 .rec_len = sizeof(struct xfs_rmap_rec), 444 .key_len = 2 * sizeof(struct xfs_rmap_key), 445 446 .dup_cursor = xfs_rmapbt_dup_cursor, 447 .set_root = xfs_rmapbt_set_root, 448 .alloc_block = xfs_rmapbt_alloc_block, 449 .free_block = xfs_rmapbt_free_block, 450 .get_minrecs = xfs_rmapbt_get_minrecs, 451 .get_maxrecs = xfs_rmapbt_get_maxrecs, 452 .init_key_from_rec = xfs_rmapbt_init_key_from_rec, 453 .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, 454 .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, 455 .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur, 456 .key_diff = xfs_rmapbt_key_diff, 457 .buf_ops = &xfs_rmapbt_buf_ops, 458 .diff_two_keys = xfs_rmapbt_diff_two_keys, 459#if defined(DEBUG) || defined(XFS_WARN) 460 .keys_inorder = xfs_rmapbt_keys_inorder, 461 .recs_inorder = xfs_rmapbt_recs_inorder, 462#endif 463}; 464 465/* 466 * Allocate a new allocation btree cursor. 467 */ 468struct xfs_btree_cur * 469xfs_rmapbt_init_cursor( 470 struct xfs_mount *mp, 471 struct xfs_trans *tp, 472 struct xfs_buf *agbp, 473 xfs_agnumber_t agno) 474{ 475 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 476 struct xfs_btree_cur *cur; 477 478 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); 479 cur->bc_tp = tp; 480 cur->bc_mp = mp; 481 /* Overlapping btree; 2 keys per pointer. */ 482 cur->bc_btnum = XFS_BTNUM_RMAP; 483 cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; 484 cur->bc_blocklog = mp->m_sb.sb_blocklog; 485 cur->bc_ops = &xfs_rmapbt_ops; 486 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); 487 cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); 488 489 cur->bc_private.a.agbp = agbp; 490 cur->bc_private.a.agno = agno; 491 492 return cur; 493} 494 495/* 496 * Calculate number of records in an rmap btree block. 497 */ 498int 499xfs_rmapbt_maxrecs( 500 struct xfs_mount *mp, 501 int blocklen, 502 int leaf) 503{ 504 blocklen -= XFS_RMAP_BLOCK_LEN; 505 506 if (leaf) 507 return blocklen / sizeof(struct xfs_rmap_rec); 508 return blocklen / 509 (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t)); 510} 511 512/* Compute the maximum height of an rmap btree. */ 513void 514xfs_rmapbt_compute_maxlevels( 515 struct xfs_mount *mp) 516{ 517 /* 518 * On a non-reflink filesystem, the maximum number of rmap 519 * records is the number of blocks in the AG, hence the max 520 * rmapbt height is log_$maxrecs($agblocks). However, with 521 * reflink each AG block can have up to 2^32 (per the refcount 522 * record format) owners, which means that theoretically we 523 * could face up to 2^64 rmap records. 524 * 525 * That effectively means that the max rmapbt height must be 526 * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG 527 * blocks to feed the rmapbt long before the rmapbt reaches 528 * maximum height. The reflink code uses ag_resv_critical to 529 * disallow reflinking when less than 10% of the per-AG metadata 530 * block reservation since the fallback is a regular file copy. 531 */ 532 if (xfs_sb_version_hasreflink(&mp->m_sb)) 533 mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; 534 else 535 mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, 536 mp->m_rmap_mnr, mp->m_sb.sb_agblocks); 537} 538 539/* Calculate the refcount btree size for some records. */ 540xfs_extlen_t 541xfs_rmapbt_calc_size( 542 struct xfs_mount *mp, 543 unsigned long long len) 544{ 545 return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len); 546} 547 548/* 549 * Calculate the maximum refcount btree size. 550 */ 551xfs_extlen_t 552xfs_rmapbt_max_size( 553 struct xfs_mount *mp, 554 xfs_agblock_t agblocks) 555{ 556 /* Bail out if we're uninitialized, which can happen in mkfs. */ 557 if (mp->m_rmap_mxr[0] == 0) 558 return 0; 559 560 return xfs_rmapbt_calc_size(mp, agblocks); 561} 562 563/* 564 * Figure out how many blocks to reserve and how many are used by this btree. 565 */ 566int 567xfs_rmapbt_calc_reserves( 568 struct xfs_mount *mp, 569 xfs_agnumber_t agno, 570 xfs_extlen_t *ask, 571 xfs_extlen_t *used) 572{ 573 struct xfs_buf *agbp; 574 struct xfs_agf *agf; 575 xfs_agblock_t agblocks; 576 xfs_extlen_t tree_len; 577 int error; 578 579 if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) 580 return 0; 581 582 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); 583 if (error) 584 return error; 585 586 agf = XFS_BUF_TO_AGF(agbp); 587 agblocks = be32_to_cpu(agf->agf_length); 588 tree_len = be32_to_cpu(agf->agf_rmap_blocks); 589 xfs_buf_relse(agbp); 590 591 /* Reserve 1% of the AG or enough for 1 block per record. */ 592 *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks)); 593 *used += tree_len; 594 595 return error; 596}