Merge branch 'xfs-sparse-inode' into for-next

+37 -5

fs/xfs/libxfs/xfs_alloc.c

··· 149 149 { 150 150 xfs_agblock_t bno; 151 151 xfs_extlen_t len; 152 + xfs_extlen_t diff; 152 153 153 154 /* Trim busy sections out of found extent */ 154 155 xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); 155 156 157 + /* 158 + * If we have a largish extent that happens to start before min_agbno, 159 + * see if we can shift it into range... 160 + */ 161 + if (bno < args->min_agbno && bno + len > args->min_agbno) { 162 + diff = args->min_agbno - bno; 163 + if (len > diff) { 164 + bno += diff; 165 + len -= diff; 166 + } 167 + } 168 + 156 169 if (args->alignment > 1 && len >= args->minlen) { 157 170 xfs_agblock_t aligned_bno = roundup(bno, args->alignment); 158 - xfs_extlen_t diff = aligned_bno - bno; 171 + 172 + diff = aligned_bno - bno; 159 173 160 174 *resbno = aligned_bno; 161 175 *reslen = diff >= len ? 0 : len - diff; ··· 809 795 * The good extent is closer than this one. 810 796 */ 811 797 if (!dir) { 798 + if (*sbnoa > args->max_agbno) 799 + goto out_use_good; 812 800 if (*sbnoa >= args->agbno + gdiff) 813 801 goto out_use_good; 814 802 } else { 803 + if (*sbnoa < args->min_agbno) 804 + goto out_use_good; 815 805 if (*sbnoa <= args->agbno - gdiff) 816 806 goto out_use_good; 817 807 } ··· 901 883 902 884 dofirst = prandom_u32() & 1; 903 885 #endif 886 + 887 + /* handle uninitialized agbno range so caller doesn't have to */ 888 + if (!args->min_agbno && !args->max_agbno) 889 + args->max_agbno = args->mp->m_sb.sb_agblocks - 1; 890 + ASSERT(args->min_agbno <= args->max_agbno); 891 + 892 + /* clamp agbno to the range if it's outside */ 893 + if (args->agbno < args->min_agbno) 894 + args->agbno = args->min_agbno; 895 + if (args->agbno > args->max_agbno) 896 + args->agbno = args->max_agbno; 904 897 905 898 restart: 906 899 bno_cur_lt = NULL; ··· 1004 975 xfs_alloc_compute_aligned(args, ltbno, ltlen, 1005 976 &ltbnoa, &ltlena); 1006 977 if (ltlena < args->minlen) 978 + continue; 979 + if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno) 1007 980 
continue; 1008 981 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1009 982 xfs_alloc_fix_len(args); ··· 1127 1096 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); 1128 1097 xfs_alloc_compute_aligned(args, ltbno, ltlen, 1129 1098 &ltbnoa, &ltlena); 1130 - if (ltlena >= args->minlen) 1099 + if (ltlena >= args->minlen && ltbnoa >= args->min_agbno) 1131 1100 break; 1132 1101 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) 1133 1102 goto error0; 1134 - if (!i) { 1103 + if (!i || ltbnoa < args->min_agbno) { 1135 1104 xfs_btree_del_cursor(bno_cur_lt, 1136 1105 XFS_BTREE_NOERROR); 1137 1106 bno_cur_lt = NULL; ··· 1143 1112 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); 1144 1113 xfs_alloc_compute_aligned(args, gtbno, gtlen, 1145 1114 &gtbnoa, &gtlena); 1146 - if (gtlena >= args->minlen) 1115 + if (gtlena >= args->minlen && gtbnoa <= args->max_agbno) 1147 1116 break; 1148 1117 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) 1149 1118 goto error0; 1150 - if (!i) { 1119 + if (!i || gtbnoa > args->max_agbno) { 1151 1120 xfs_btree_del_cursor(bno_cur_gt, 1152 1121 XFS_BTREE_NOERROR); 1153 1122 bno_cur_gt = NULL; ··· 1247 1216 ASSERT(ltnew >= ltbno); 1248 1217 ASSERT(ltnew + rlen <= ltbnoa + ltlena); 1249 1218 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1219 + ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno); 1250 1220 args->agbno = ltnew; 1251 1221 1252 1222 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,

+2

fs/xfs/libxfs/xfs_alloc.h

··· 112 112 xfs_extlen_t total; /* total blocks needed in xaction */ 113 113 xfs_extlen_t alignment; /* align answer to multiple of this */ 114 114 xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ 115 + xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */ 116 + xfs_agblock_t max_agbno; /* ... */ 115 117 xfs_extlen_t len; /* output: actual size of extent */ 116 118 xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ 117 119 xfs_alloctype_t otype; /* original allocation type */

+42 -6

fs/xfs/libxfs/xfs_format.h

··· 170 170 __uint32_t sb_features_log_incompat; 171 171 172 172 __uint32_t sb_crc; /* superblock crc */ 173 - __uint32_t sb_pad; 173 + xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */ 174 174 175 175 xfs_ino_t sb_pquotino; /* project quota inode */ 176 176 xfs_lsn_t sb_lsn; /* last write sequence */ ··· 256 256 __be32 sb_features_log_incompat; 257 257 258 258 __le32 sb_crc; /* superblock crc */ 259 - __be32 sb_pad; 259 + __be32 sb_spino_align; /* sparse inode chunk alignment */ 260 260 261 261 __be64 sb_pquotino; /* project quota inode */ 262 262 __be64 sb_lsn; /* last write sequence */ ··· 457 457 } 458 458 459 459 #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ 460 + #define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ 460 461 #define XFS_SB_FEAT_INCOMPAT_ALL \ 461 - (XFS_SB_FEAT_INCOMPAT_FTYPE) 462 + (XFS_SB_FEAT_INCOMPAT_FTYPE| \ 463 + XFS_SB_FEAT_INCOMPAT_SPINODES) 462 464 463 465 #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL 464 466 static inline bool ··· 506 504 { 507 505 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && 508 506 (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); 507 + } 508 + 509 + static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp) 510 + { 511 + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && 512 + xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES); 509 513 } 510 514 511 515 /* ··· 1224 1216 #define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) 1225 1217 #define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) 1226 1218 1219 + #define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */ 1220 + #define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t)) 1221 + #define XFS_INODES_PER_HOLEMASK_BIT \ 1222 + (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t))) 1223 + 1227 1224 static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) 1228 1225 { 1229 1226 return ((n >= XFS_INODES_PER_CHUNK ? 
0 : XFS_INOBT_MASK(n)) - 1) << i; 1230 1227 } 1231 1228 1232 1229 /* 1233 - * Data record structure 1230 + * The on-disk inode record structure has two formats. The original "full" 1231 + * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount 1232 + * and replaces the 3 high-order freecount bytes with the holemask and inode 1233 + * count. 1234 + * 1235 + * The holemask of the sparse record format allows an inode chunk to have holes 1236 + * that refer to blocks not owned by the inode record. This facilitates inode 1237 + * allocation in the event of severe free space fragmentation. 1234 1238 */ 1235 1239 typedef struct xfs_inobt_rec { 1236 1240 __be32 ir_startino; /* starting inode number */ 1237 - __be32 ir_freecount; /* count of free inodes (set bits) */ 1241 + union { 1242 + struct { 1243 + __be32 ir_freecount; /* count of free inodes */ 1244 + } f; 1245 + struct { 1246 + __be16 ir_holemask;/* hole mask for sparse chunks */ 1247 + __u8 ir_count; /* total inode count */ 1248 + __u8 ir_freecount; /* count of free inodes */ 1249 + } sp; 1250 + } ir_u; 1238 1251 __be64 ir_free; /* free inode mask */ 1239 1252 } xfs_inobt_rec_t; 1240 1253 1241 1254 typedef struct xfs_inobt_rec_incore { 1242 1255 xfs_agino_t ir_startino; /* starting inode number */ 1243 - __int32_t ir_freecount; /* count of free inodes (set bits) */ 1256 + __uint16_t ir_holemask; /* hole mask for sparse chunks */ 1257 + __uint8_t ir_count; /* total inode count */ 1258 + __uint8_t ir_freecount; /* count of free inodes (set bits) */ 1244 1259 xfs_inofree_t ir_free; /* free inode mask */ 1245 1260 } xfs_inobt_rec_incore_t; 1246 1261 1262 + static inline bool xfs_inobt_issparse(uint16_t holemask) 1263 + { 1264 + /* non-zero holemask represents a sparse rec. */ 1265 + return holemask; 1266 + } 1247 1267 1248 1268 /* 1249 1269 * Key structure

+1

fs/xfs/libxfs/xfs_fs.h

··· 239 239 #define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ 240 240 #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ 241 241 #define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ 242 + #define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */ 242 243 243 244 /* 244 245 * Minimum and maximum sizes needed for growth checks.

+494 -47

fs/xfs/libxfs/xfs_ialloc.c

··· 65 65 int *stat) /* success/failure */ 66 66 { 67 67 cur->bc_rec.i.ir_startino = ino; 68 + cur->bc_rec.i.ir_holemask = 0; 69 + cur->bc_rec.i.ir_count = 0; 68 70 cur->bc_rec.i.ir_freecount = 0; 69 71 cur->bc_rec.i.ir_free = 0; 70 72 return xfs_btree_lookup(cur, dir, stat); ··· 84 82 union xfs_btree_rec rec; 85 83 86 84 rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); 87 - rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); 85 + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { 86 + rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask); 87 + rec.inobt.ir_u.sp.ir_count = irec->ir_count; 88 + rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount; 89 + } else { 90 + /* ir_holemask/ir_count not supported on-disk */ 91 + rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount); 92 + } 88 93 rec.inobt.ir_free = cpu_to_be64(irec->ir_free); 89 94 return xfs_btree_update(cur, &rec); 90 95 } ··· 109 100 int error; 110 101 111 102 error = xfs_btree_get_rec(cur, &rec, stat); 112 - if (!error && *stat == 1) { 113 - irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); 114 - irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); 115 - irec->ir_free = be64_to_cpu(rec->inobt.ir_free); 103 + if (error || *stat == 0) 104 + return error; 105 + 106 + irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); 107 + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { 108 + irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); 109 + irec->ir_count = rec->inobt.ir_u.sp.ir_count; 110 + irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; 111 + } else { 112 + /* 113 + * ir_holemask/ir_count not supported on-disk. Fill in hardcoded 114 + * values for full inode chunks. 
115 + */ 116 + irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL; 117 + irec->ir_count = XFS_INODES_PER_CHUNK; 118 + irec->ir_freecount = 119 + be32_to_cpu(rec->inobt.ir_u.f.ir_freecount); 116 120 } 117 - return error; 121 + irec->ir_free = be64_to_cpu(rec->inobt.ir_free); 122 + 123 + return 0; 118 124 } 119 125 120 126 /* ··· 138 114 STATIC int 139 115 xfs_inobt_insert_rec( 140 116 struct xfs_btree_cur *cur, 117 + __uint16_t holemask, 118 + __uint8_t count, 141 119 __int32_t freecount, 142 120 xfs_inofree_t free, 143 121 int *stat) 144 122 { 123 + cur->bc_rec.i.ir_holemask = holemask; 124 + cur->bc_rec.i.ir_count = count; 145 125 cur->bc_rec.i.ir_freecount = freecount; 146 126 cur->bc_rec.i.ir_free = free; 147 127 return xfs_btree_insert(cur, stat); ··· 182 154 } 183 155 ASSERT(i == 0); 184 156 185 - error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, 157 + error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL, 158 + XFS_INODES_PER_CHUNK, 159 + XFS_INODES_PER_CHUNK, 186 160 XFS_INOBT_ALL_FREE, &i); 187 161 if (error) { 188 162 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); ··· 250 220 struct xfs_mount *mp, 251 221 struct xfs_trans *tp, 252 222 struct list_head *buffer_list, 223 + int icount, 253 224 xfs_agnumber_t agno, 254 225 xfs_agblock_t agbno, 255 226 xfs_agblock_t length, ··· 306 275 * they track in the AIL as if they were physically logged. 307 276 */ 308 277 if (tp) 309 - xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, 278 + xfs_icreate_log(tp, agno, agbno, icount, 310 279 mp->m_sb.sb_inodesize, length, gen); 311 280 } else 312 281 version = 2; ··· 378 347 } 379 348 380 349 /* 350 + * Align startino and allocmask for a recently allocated sparse chunk such that 351 + * they are fit for insertion (or merge) into the on-disk inode btrees. 352 + * 353 + * Background: 354 + * 355 + * When enabled, sparse inode support increases the inode alignment from cluster 356 + * size to inode chunk size. 
This means that the minimum range between two 357 + * non-adjacent inode records in the inobt is large enough for a full inode 358 + * record. This allows for cluster sized, cluster aligned block allocation 359 + * without need to worry about whether the resulting inode record overlaps with 360 + * another record in the tree. Without this basic rule, we would have to deal 361 + * with the consequences of overlap by potentially undoing recent allocations in 362 + * the inode allocation codepath. 363 + * 364 + * Because of this alignment rule (which is enforced on mount), there are two 365 + * inobt possibilities for newly allocated sparse chunks. One is that the 366 + * aligned inode record for the chunk covers a range of inodes not already 367 + * covered in the inobt (i.e., it is safe to insert a new sparse record). The 368 + * other is that a record already exists at the aligned startino that considers 369 + * the newly allocated range as sparse. In the latter case, record content is 370 + * merged in hope that sparse inode chunks fill to full chunks over time. 371 + */ 372 + STATIC void 373 + xfs_align_sparse_ino( 374 + struct xfs_mount *mp, 375 + xfs_agino_t *startino, 376 + uint16_t *allocmask) 377 + { 378 + xfs_agblock_t agbno; 379 + xfs_agblock_t mod; 380 + int offset; 381 + 382 + agbno = XFS_AGINO_TO_AGBNO(mp, *startino); 383 + mod = agbno % mp->m_sb.sb_inoalignmt; 384 + if (!mod) 385 + return; 386 + 387 + /* calculate the inode offset and align startino */ 388 + offset = mod << mp->m_sb.sb_inopblog; 389 + *startino -= offset; 390 + 391 + /* 392 + * Since startino has been aligned down, left shift allocmask such that 393 + * it continues to represent the same physical inodes relative to the 394 + * new startino. 395 + */ 396 + *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT; 397 + } 398 + 399 + /* 400 + * Determine whether the source inode record can merge into the target. 
Both 401 + * records must be sparse, the inode ranges must match and there must be no 402 + * allocation overlap between the records. 403 + */ 404 + STATIC bool 405 + __xfs_inobt_can_merge( 406 + struct xfs_inobt_rec_incore *trec, /* tgt record */ 407 + struct xfs_inobt_rec_incore *srec) /* src record */ 408 + { 409 + uint64_t talloc; 410 + uint64_t salloc; 411 + 412 + /* records must cover the same inode range */ 413 + if (trec->ir_startino != srec->ir_startino) 414 + return false; 415 + 416 + /* both records must be sparse */ 417 + if (!xfs_inobt_issparse(trec->ir_holemask) || 418 + !xfs_inobt_issparse(srec->ir_holemask)) 419 + return false; 420 + 421 + /* both records must track some inodes */ 422 + if (!trec->ir_count || !srec->ir_count) 423 + return false; 424 + 425 + /* can't exceed capacity of a full record */ 426 + if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK) 427 + return false; 428 + 429 + /* verify there is no allocation overlap */ 430 + talloc = xfs_inobt_irec_to_allocmask(trec); 431 + salloc = xfs_inobt_irec_to_allocmask(srec); 432 + if (talloc & salloc) 433 + return false; 434 + 435 + return true; 436 + } 437 + 438 + /* 439 + * Merge the source inode record into the target. The caller must call 440 + * __xfs_inobt_can_merge() to ensure the merge is valid. 441 + */ 442 + STATIC void 443 + __xfs_inobt_rec_merge( 444 + struct xfs_inobt_rec_incore *trec, /* target */ 445 + struct xfs_inobt_rec_incore *srec) /* src */ 446 + { 447 + ASSERT(trec->ir_startino == srec->ir_startino); 448 + 449 + /* combine the counts */ 450 + trec->ir_count += srec->ir_count; 451 + trec->ir_freecount += srec->ir_freecount; 452 + 453 + /* 454 + * Merge the holemask and free mask. For both fields, 0 bits refer to 455 + * allocated inodes. We combine the allocated ranges with bitwise AND. 
456 + */ 457 + trec->ir_holemask &= srec->ir_holemask; 458 + trec->ir_free &= srec->ir_free; 459 + } 460 + 461 + /* 462 + * Insert a new sparse inode chunk into the associated inode btree. The inode 463 + * record for the sparse chunk is pre-aligned to a startino that should match 464 + * any pre-existing sparse inode record in the tree. This allows sparse chunks 465 + * to fill over time. 466 + * 467 + * This function supports two modes of handling preexisting records depending on 468 + * the merge flag. If merge is true, the provided record is merged with the 469 + * existing record and updated in place. The merged record is returned in nrec. 470 + * If merge is false, an existing record is replaced with the provided record. 471 + * If no preexisting record exists, the provided record is always inserted. 472 + * 473 + * It is considered corruption if a merge is requested and not possible. Given 474 + * the sparse inode alignment constraints, this should never happen. 475 + */ 476 + STATIC int 477 + xfs_inobt_insert_sprec( 478 + struct xfs_mount *mp, 479 + struct xfs_trans *tp, 480 + struct xfs_buf *agbp, 481 + int btnum, 482 + struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. 
*/ 483 + bool merge) /* merge or replace */ 484 + { 485 + struct xfs_btree_cur *cur; 486 + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); 487 + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); 488 + int error; 489 + int i; 490 + struct xfs_inobt_rec_incore rec; 491 + 492 + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); 493 + 494 + /* the new record is pre-aligned so we know where to look */ 495 + error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); 496 + if (error) 497 + goto error; 498 + /* if nothing there, insert a new record and return */ 499 + if (i == 0) { 500 + error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, 501 + nrec->ir_count, nrec->ir_freecount, 502 + nrec->ir_free, &i); 503 + if (error) 504 + goto error; 505 + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); 506 + 507 + goto out; 508 + } 509 + 510 + /* 511 + * A record exists at this startino. Merge or replace the record 512 + * depending on what we've been asked to do. 513 + */ 514 + if (merge) { 515 + error = xfs_inobt_get_rec(cur, &rec, &i); 516 + if (error) 517 + goto error; 518 + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); 519 + XFS_WANT_CORRUPTED_GOTO(mp, 520 + rec.ir_startino == nrec->ir_startino, 521 + error); 522 + 523 + /* 524 + * This should never fail. If we have coexisting records that 525 + * cannot merge, something is seriously wrong. 
526 + */ 527 + XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec), 528 + error); 529 + 530 + trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, 531 + rec.ir_holemask, nrec->ir_startino, 532 + nrec->ir_holemask); 533 + 534 + /* merge to nrec to output the updated record */ 535 + __xfs_inobt_rec_merge(nrec, &rec); 536 + 537 + trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino, 538 + nrec->ir_holemask); 539 + 540 + error = xfs_inobt_rec_check_count(mp, nrec); 541 + if (error) 542 + goto error; 543 + } 544 + 545 + error = xfs_inobt_update(cur, nrec); 546 + if (error) 547 + goto error; 548 + 549 + out: 550 + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 551 + return 0; 552 + error: 553 + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 554 + return error; 555 + } 556 + 557 + /* 381 558 * Allocate new inodes in the allocation group specified by agbp. 382 559 * Return 0 for success, else error code. 383 560 */ ··· 603 364 xfs_agino_t newlen; /* new number of inodes */ 604 365 int isaligned = 0; /* inode allocation at stripe unit */ 605 366 /* boundary */ 367 + uint16_t allocmask = (uint16_t) -1; /* init. 
to full chunk */ 368 + struct xfs_inobt_rec_incore rec; 606 369 struct xfs_perag *pag; 370 + 371 + int do_sparse = 0; 372 + 373 + #ifdef DEBUG 374 + /* randomly do sparse inode allocations */ 375 + if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb)) 376 + do_sparse = prandom_u32() & 1; 377 + #endif 607 378 608 379 memset(&args, 0, sizeof(args)); 609 380 args.tp = tp; 610 381 args.mp = tp->t_mountp; 382 + args.fsbno = NULLFSBLOCK; 611 383 612 384 /* 613 385 * Locking will ensure that we don't have two callers in here ··· 640 390 agno = be32_to_cpu(agi->agi_seqno); 641 391 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + 642 392 args.mp->m_ialloc_blks; 393 + if (do_sparse) 394 + goto sparse_alloc; 643 395 if (likely(newino != NULLAGINO && 644 396 (args.agbno < be32_to_cpu(agi->agi_length)))) { 645 397 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); ··· 680 428 * subsequent requests. 681 429 */ 682 430 args.minalignslop = 0; 683 - } else 684 - args.fsbno = NULLFSBLOCK; 431 + } 685 432 686 433 if (unlikely(args.fsbno == NULLFSBLOCK)) { 687 434 /* ··· 731 480 return error; 732 481 } 733 482 483 + /* 484 + * Finally, try a sparse allocation if the filesystem supports it and 485 + * the sparse allocation length is smaller than a full chunk. 486 + */ 487 + if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) && 488 + args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks && 489 + args.fsbno == NULLFSBLOCK) { 490 + sparse_alloc: 491 + args.type = XFS_ALLOCTYPE_NEAR_BNO; 492 + args.agbno = be32_to_cpu(agi->agi_root); 493 + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); 494 + args.alignment = args.mp->m_sb.sb_spino_align; 495 + args.prod = 1; 496 + 497 + args.minlen = args.mp->m_ialloc_min_blks; 498 + args.maxlen = args.minlen; 499 + 500 + /* 501 + * The inode record will be aligned to full chunk size. 
We must 502 + * prevent sparse allocation from AG boundaries that result in 503 + * invalid inode records, such as records that start at agbno 0 504 + * or extend beyond the AG. 505 + * 506 + * Set min agbno to the first aligned, non-zero agbno and max to 507 + * the last aligned agbno that is at least one full chunk from 508 + * the end of the AG. 509 + */ 510 + args.min_agbno = args.mp->m_sb.sb_inoalignmt; 511 + args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, 512 + args.mp->m_sb.sb_inoalignmt) - 513 + args.mp->m_ialloc_blks; 514 + 515 + error = xfs_alloc_vextent(&args); 516 + if (error) 517 + return error; 518 + 519 + newlen = args.len << args.mp->m_sb.sb_inopblog; 520 + allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1; 521 + } 522 + 734 523 if (args.fsbno == NULLFSBLOCK) { 735 524 *alloc = 0; 736 525 return 0; ··· 786 495 * rather than a linear progression to prevent the next generation 787 496 * number from being easily guessable. 788 497 */ 789 - error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, 790 - args.len, prandom_u32()); 498 + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno, 499 + args.agbno, args.len, prandom_u32()); 791 500 792 501 if (error) 793 502 return error; ··· 795 504 * Convert the results. 796 505 */ 797 506 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); 507 + 508 + if (xfs_inobt_issparse(~allocmask)) { 509 + /* 510 + * We've allocated a sparse chunk. Align the startino and mask. 511 + */ 512 + xfs_align_sparse_ino(args.mp, &newino, &allocmask); 513 + 514 + rec.ir_startino = newino; 515 + rec.ir_holemask = ~allocmask; 516 + rec.ir_count = newlen; 517 + rec.ir_freecount = newlen; 518 + rec.ir_free = XFS_INOBT_ALL_FREE; 519 + 520 + /* 521 + * Insert the sparse record into the inobt and allow for a merge 522 + * if necessary. If a merge does occur, rec is updated to the 523 + * merged record. 
524 + */ 525 + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO, 526 + &rec, true); 527 + if (error == -EFSCORRUPTED) { 528 + xfs_alert(args.mp, 529 + "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", 530 + XFS_AGINO_TO_INO(args.mp, agno, 531 + rec.ir_startino), 532 + rec.ir_holemask, rec.ir_count); 533 + xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); 534 + } 535 + if (error) 536 + return error; 537 + 538 + /* 539 + * We can't merge the part we've just allocated as for the inobt 540 + * due to finobt semantics. The original record may or may not 541 + * exist independent of whether physical inodes exist in this 542 + * sparse chunk. 543 + * 544 + * We must update the finobt record based on the inobt record. 545 + * rec contains the fully merged and up to date inobt record 546 + * from the previous call. Set merge false to replace any 547 + * existing record with this one. 548 + */ 549 + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { 550 + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, 551 + XFS_BTNUM_FINO, &rec, 552 + false); 553 + if (error) 554 + return error; 555 + } 556 + } else { 557 + /* full chunk - insert new records to both btrees */ 558 + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, 559 + XFS_BTNUM_INO); 560 + if (error) 561 + return error; 562 + 563 + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { 564 + error = xfs_inobt_insert(args.mp, tp, agbp, newino, 565 + newlen, XFS_BTNUM_FINO); 566 + if (error) 567 + return error; 568 + } 569 + } 570 + 571 + /* 572 + * Update AGI counts and newino. 573 + */ 798 574 be32_add_cpu(&agi->agi_count, newlen); 799 575 be32_add_cpu(&agi->agi_freecount, newlen); 800 576 pag = xfs_perag_get(args.mp, agno); ··· 869 511 xfs_perag_put(pag); 870 512 agi->agi_newino = cpu_to_be32(newino); 871 513 872 - /* 873 - * Insert records describing the new inode chunk into the btrees. 
874 - */ 875 - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, 876 - XFS_BTNUM_INO); 877 - if (error) 878 - return error; 879 - 880 - if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { 881 - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, 882 - XFS_BTNUM_FINO); 883 - if (error) 884 - return error; 885 - } 886 514 /* 887 515 * Log allocation group header fields 888 516 */ ··· 989 645 * if we fail allocation due to alignment issues then it is most 990 646 * likely a real ENOSPC condition. 991 647 */ 992 - ineed = mp->m_ialloc_blks; 648 + ineed = mp->m_ialloc_min_blks; 993 649 if (flags && ineed > 1) 994 650 ineed += xfs_ialloc_cluster_alignment(mp); 995 651 longest = pag->pagf_longest; ··· 1073 729 } 1074 730 1075 731 return 0; 732 + } 733 + 734 + /* 735 + * Return the offset of the first free inode in the record. If the inode chunk 736 + * is sparsely allocated, we convert the record holemask to inode granularity 737 + * and mask off the unallocated regions from the inode free mask. 
738 + */ 739 + STATIC int 740 + xfs_inobt_first_free_inode( 741 + struct xfs_inobt_rec_incore *rec) 742 + { 743 + xfs_inofree_t realfree; 744 + 745 + /* if there are no holes, return the first available offset */ 746 + if (!xfs_inobt_issparse(rec->ir_holemask)) 747 + return xfs_lowbit64(rec->ir_free); 748 + 749 + realfree = xfs_inobt_irec_to_allocmask(rec); 750 + realfree &= rec->ir_free; 751 + 752 + return xfs_lowbit64(realfree); 1076 753 } 1077 754 1078 755 /* ··· 1326 961 } 1327 962 1328 963 alloc_inode: 1329 - offset = xfs_lowbit64(rec.ir_free); 964 + offset = xfs_inobt_first_free_inode(&rec); 1330 965 ASSERT(offset >= 0); 1331 966 ASSERT(offset < XFS_INODES_PER_CHUNK); 1332 967 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % ··· 1575 1210 if (error) 1576 1211 goto error_cur; 1577 1212 1578 - offset = xfs_lowbit64(rec.ir_free); 1213 + offset = xfs_inobt_first_free_inode(&rec); 1579 1214 ASSERT(offset >= 0); 1580 1215 ASSERT(offset < XFS_INODES_PER_CHUNK); 1581 1216 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % ··· 1804 1439 return error; 1805 1440 } 1806 1441 1442 + /* 1443 + * Free the blocks of an inode chunk. We must consider that the inode chunk 1444 + * might be sparse and only free the regions that are allocated as part of the 1445 + * chunk. 
1446 + */ 1447 + STATIC void 1448 + xfs_difree_inode_chunk( 1449 + struct xfs_mount *mp, 1450 + xfs_agnumber_t agno, 1451 + struct xfs_inobt_rec_incore *rec, 1452 + struct xfs_bmap_free *flist) 1453 + { 1454 + xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino); 1455 + int startidx, endidx; 1456 + int nextbit; 1457 + xfs_agblock_t agbno; 1458 + int contigblk; 1459 + DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS); 1460 + 1461 + if (!xfs_inobt_issparse(rec->ir_holemask)) { 1462 + /* not sparse, calculate extent info directly */ 1463 + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, 1464 + XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), 1465 + mp->m_ialloc_blks, flist, mp); 1466 + return; 1467 + } 1468 + 1469 + /* holemask is only 16-bits (fits in an unsigned long) */ 1470 + ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0])); 1471 + holemask[0] = rec->ir_holemask; 1472 + 1473 + /* 1474 + * Find contiguous ranges of zeroes (i.e., allocated regions) in the 1475 + * holemask and convert the start/end index of each range to an extent. 1476 + * We start with the start and end index both pointing at the first 0 in 1477 + * the mask. 1478 + */ 1479 + startidx = endidx = find_first_zero_bit(holemask, 1480 + XFS_INOBT_HOLEMASK_BITS); 1481 + nextbit = startidx + 1; 1482 + while (startidx < XFS_INOBT_HOLEMASK_BITS) { 1483 + nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS, 1484 + nextbit); 1485 + /* 1486 + * If the next zero bit is contiguous, update the end index of 1487 + * the current range and continue. 1488 + */ 1489 + if (nextbit != XFS_INOBT_HOLEMASK_BITS && 1490 + nextbit == endidx + 1) { 1491 + endidx = nextbit; 1492 + goto next; 1493 + } 1494 + 1495 + /* 1496 + * nextbit is not contiguous with the current end index. Convert 1497 + * the current start/end to an extent and add it to the free 1498 + * list. 
1499 + */ 1500 + agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) / 1501 + mp->m_sb.sb_inopblock; 1502 + contigblk = ((endidx - startidx + 1) * 1503 + XFS_INODES_PER_HOLEMASK_BIT) / 1504 + mp->m_sb.sb_inopblock; 1505 + 1506 + ASSERT(agbno % mp->m_sb.sb_spino_align == 0); 1507 + ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); 1508 + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, 1509 + flist, mp); 1510 + 1511 + /* reset range to current bit and carry on... */ 1512 + startidx = endidx = nextbit; 1513 + 1514 + next: 1515 + nextbit++; 1516 + } 1517 + } 1518 + 1807 1519 STATIC int 1808 1520 xfs_difree_inobt( 1809 1521 struct xfs_mount *mp, ··· 1888 1446 struct xfs_buf *agbp, 1889 1447 xfs_agino_t agino, 1890 1448 struct xfs_bmap_free *flist, 1891 - int *deleted, 1892 - xfs_ino_t *first_ino, 1449 + struct xfs_icluster *xic, 1893 1450 struct xfs_inobt_rec_incore *orec) 1894 1451 { 1895 1452 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); ··· 1942 1501 rec.ir_freecount++; 1943 1502 1944 1503 /* 1945 - * When an inode cluster is free, it becomes eligible for removal 1504 + * When an inode chunk is free, it becomes eligible for removal. Don't 1505 + * remove the chunk if the block size is large enough for multiple inode 1506 + * chunks (that might not be free). 1946 1507 */ 1947 1508 if (!(mp->m_flags & XFS_MOUNT_IKEEP) && 1948 - (rec.ir_freecount == mp->m_ialloc_inos)) { 1949 - 1950 - *deleted = 1; 1951 - *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); 1509 + rec.ir_free == XFS_INOBT_ALL_FREE && 1510 + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { 1511 + xic->deleted = 1; 1512 + xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); 1513 + xic->alloc = xfs_inobt_irec_to_allocmask(&rec); 1952 1514 1953 1515 /* 1954 1516 * Remove the inode cluster from the AGI B+Tree, adjust the 1955 1517 * AGI and Superblock inode counts, and mark the disk space 1956 1518 * to be freed when the transaction is committed. 
1957 1519 */ 1958 - ilen = mp->m_ialloc_inos; 1520 + ilen = rec.ir_freecount; 1959 1521 be32_add_cpu(&agi->agi_count, -ilen); 1960 1522 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 1961 1523 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); ··· 1974 1530 goto error0; 1975 1531 } 1976 1532 1977 - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, 1978 - XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), 1979 - mp->m_ialloc_blks, flist, mp); 1533 + xfs_difree_inode_chunk(mp, agno, &rec, flist); 1980 1534 } else { 1981 - *deleted = 0; 1535 + xic->deleted = 0; 1982 1536 1983 1537 error = xfs_inobt_update(cur, &rec); 1984 1538 if (error) { ··· 2041 1599 */ 2042 1600 XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); 2043 1601 2044 - error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, 1602 + error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask, 1603 + ibtrec->ir_count, 1604 + ibtrec->ir_freecount, 2045 1605 ibtrec->ir_free, &i); 2046 1606 if (error) 2047 1607 goto error; ··· 2078 1634 * free inode. Hence, if all of the inodes are free and we aren't 2079 1635 * keeping inode chunks permanently on disk, remove the record. 2080 1636 * Otherwise, update the record with the new information. 1637 + * 1638 + * Note that we currently can't free chunks when the block size is large 1639 + * enough for multiple chunks. Leave the finobt record to remain in sync 1640 + * with the inobt. 
2081 1641 */ 2082 - if (rec.ir_freecount == mp->m_ialloc_inos && 1642 + if (rec.ir_free == XFS_INOBT_ALL_FREE && 1643 + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK && 2083 1644 !(mp->m_flags & XFS_MOUNT_IKEEP)) { 2084 1645 error = xfs_btree_delete(cur, &i); 2085 1646 if (error) ··· 2120 1671 struct xfs_trans *tp, /* transaction pointer */ 2121 1672 xfs_ino_t inode, /* inode to be freed */ 2122 1673 struct xfs_bmap_free *flist, /* extents to free */ 2123 - int *deleted,/* set if inode cluster was deleted */ 2124 - xfs_ino_t *first_ino)/* first inode in deleted cluster */ 1674 + struct xfs_icluster *xic) /* cluster info if deleted */ 2125 1675 { 2126 1676 /* REFERENCED */ 2127 1677 xfs_agblock_t agbno; /* block number containing inode */ ··· 2171 1723 /* 2172 1724 * Fix up the inode allocation btree. 2173 1725 */ 2174 - error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, 2175 - &rec); 1726 + error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec); 2176 1727 if (error) 2177 1728 goto error0; 2178 1729

+9 -3

fs/xfs/libxfs/xfs_ialloc.h

··· 28 28 /* Move inodes in clusters of this size */ 29 29 #define XFS_INODE_BIG_CLUSTER_SIZE 8192 30 30 31 + struct xfs_icluster { 32 + bool deleted; /* record is deleted */ 33 + xfs_ino_t first_ino; /* first inode number */ 34 + uint64_t alloc; /* inode phys. allocation bitmap for 35 + * sparse chunks */ 36 + }; 37 + 31 38 /* Calculate and return the number of filesystem blocks per inode cluster */ 32 39 static inline int 33 40 xfs_icluster_size_fsb( ··· 97 90 struct xfs_trans *tp, /* transaction pointer */ 98 91 xfs_ino_t inode, /* inode to be freed */ 99 92 struct xfs_bmap_free *flist, /* extents to free */ 100 - int *deleted, /* set if inode cluster was deleted */ 101 - xfs_ino_t *first_ino); /* first inode in deleted cluster */ 93 + struct xfs_icluster *ifree); /* cluster info if deleted */ 102 94 103 95 /* 104 96 * Return the location of the inode in imap, for mapping it into a buffer. ··· 162 156 * Inode chunk initialisation routine 163 157 */ 164 158 int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, 165 - struct list_head *buffer_list, 159 + struct list_head *buffer_list, int icount, 166 160 xfs_agnumber_t agno, xfs_agblock_t agbno, 167 161 xfs_agblock_t length, unsigned int gen); 168 162

+92 -1

fs/xfs/libxfs/xfs_ialloc_btree.c

··· 167 167 union xfs_btree_rec *rec) 168 168 { 169 169 rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); 170 - rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); 170 + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { 171 + rec->inobt.ir_u.sp.ir_holemask = 172 + cpu_to_be16(cur->bc_rec.i.ir_holemask); 173 + rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count; 174 + rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount; 175 + } else { 176 + /* ir_holemask/ir_count not supported on-disk */ 177 + rec->inobt.ir_u.f.ir_freecount = 178 + cpu_to_be32(cur->bc_rec.i.ir_freecount); 179 + } 171 180 rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); 172 181 } 173 182 ··· 427 418 return blocklen / sizeof(xfs_inobt_rec_t); 428 419 return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); 429 420 } 421 + 422 + /* 423 + * Convert the inode record holemask to an inode allocation bitmap. The inode 424 + * allocation bitmap is inode granularity and specifies whether an inode is 425 + * physically allocated on disk (not whether the inode is considered allocated 426 + * or free by the fs). 427 + * 428 + * A bit value of 1 means the inode is allocated, a value of 0 means it is free. 429 + */ 430 + uint64_t 431 + xfs_inobt_irec_to_allocmask( 432 + struct xfs_inobt_rec_incore *rec) 433 + { 434 + uint64_t bitmap = 0; 435 + uint64_t inodespbit; 436 + int nextbit; 437 + uint allocbitmap; 438 + 439 + /* 440 + * The holemask has 16-bits for a 64 inode record. Therefore each 441 + * holemask bit represents multiple inodes. Create a mask of bits to set 442 + * in the allocmask for each holemask bit. 443 + */ 444 + inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1; 445 + 446 + /* 447 + * Allocated inodes are represented by 0 bits in holemask. Invert the 0 448 + * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask 449 + * anything beyond the 16 holemask bits since this casts to a larger 450 + * type. 
451 + */ 452 + allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1); 453 + 454 + /* 455 + * allocbitmap is the inverted holemask so every set bit represents 456 + * allocated inodes. To expand from 16-bit holemask granularity to 457 + * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target 458 + * bitmap for every holemask bit. 459 + */ 460 + nextbit = xfs_next_bit(&allocbitmap, 1, 0); 461 + while (nextbit != -1) { 462 + ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY)); 463 + 464 + bitmap |= (inodespbit << 465 + (nextbit * XFS_INODES_PER_HOLEMASK_BIT)); 466 + 467 + nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1); 468 + } 469 + 470 + return bitmap; 471 + } 472 + 473 + #if defined(DEBUG) || defined(XFS_WARN) 474 + /* 475 + * Verify that an in-core inode record has a valid inode count. 476 + */ 477 + int 478 + xfs_inobt_rec_check_count( 479 + struct xfs_mount *mp, 480 + struct xfs_inobt_rec_incore *rec) 481 + { 482 + int inocount = 0; 483 + int nextbit = 0; 484 + uint64_t allocbmap; 485 + int wordsz; 486 + 487 + wordsz = sizeof(allocbmap) / sizeof(unsigned int); 488 + allocbmap = xfs_inobt_irec_to_allocmask(rec); 489 + 490 + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit); 491 + while (nextbit != -1) { 492 + inocount++; 493 + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, 494 + nextbit + 1); 495 + } 496 + 497 + if (inocount != rec->ir_count) 498 + return -EFSCORRUPTED; 499 + 500 + return 0; 501 + } 502 + #endif /* DEBUG */

+10

fs/xfs/libxfs/xfs_ialloc_btree.h

··· 62 62 xfs_btnum_t); 63 63 extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); 64 64 65 + /* ir_holemask to inode allocation bitmap conversion */ 66 + uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *); 67 + 68 + #if defined(DEBUG) || defined(XFS_WARN) 69 + int xfs_inobt_rec_check_count(struct xfs_mount *, 70 + struct xfs_inobt_rec_incore *); 71 + #else 72 + #define xfs_inobt_rec_check_count(mp, rec) 0 73 + #endif /* DEBUG */ 74 + 65 75 #endif /* __XFS_IALLOC_BTREE_H__ */

+28 -2

fs/xfs/libxfs/xfs_sb.c

··· 174 174 return -EFSCORRUPTED; 175 175 } 176 176 177 + /* 178 + * Full inode chunks must be aligned to inode chunk size when 179 + * sparse inodes are enabled to support the sparse chunk 180 + * allocation algorithm and prevent overlapping inode records. 181 + */ 182 + if (xfs_sb_version_hassparseinodes(sbp)) { 183 + uint32_t align; 184 + 185 + xfs_alert(mp, 186 + "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!"); 187 + 188 + align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize 189 + >> sbp->sb_blocklog; 190 + if (sbp->sb_inoalignmt != align) { 191 + xfs_warn(mp, 192 + "Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", 193 + sbp->sb_inoalignmt, align); 194 + return -EINVAL; 195 + } 196 + } 197 + 177 198 if (unlikely( 178 199 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 179 200 xfs_warn(mp, ··· 395 374 be32_to_cpu(from->sb_features_log_incompat); 396 375 /* crc is only used on disk, not in memory; just init to 0 here. */ 397 376 to->sb_crc = 0; 398 - to->sb_pad = 0; 377 + to->sb_spino_align = be32_to_cpu(from->sb_spino_align); 399 378 to->sb_pquotino = be64_to_cpu(from->sb_pquotino); 400 379 to->sb_lsn = be64_to_cpu(from->sb_lsn); 401 380 /* Convert on-disk flags to in-memory flags? */ ··· 537 516 cpu_to_be32(from->sb_features_incompat); 538 517 to->sb_features_log_incompat = 539 518 cpu_to_be32(from->sb_features_log_incompat); 540 - to->sb_pad = 0; 519 + to->sb_spino_align = cpu_to_be32(from->sb_spino_align); 541 520 to->sb_lsn = cpu_to_be64(from->sb_lsn); 542 521 } 543 522 } ··· 710 689 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, 711 690 sbp->sb_inopblock); 712 691 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; 692 + 693 + if (sbp->sb_spino_align) 694 + mp->m_ialloc_min_blks = sbp->sb_spino_align; 695 + else 696 + mp->m_ialloc_min_blks = mp->m_ialloc_blks; 713 697 } 714 698 715 699 /*

+3 -1

fs/xfs/xfs_fsops.c

··· 101 101 (xfs_sb_version_hasftype(&mp->m_sb) ? 102 102 XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | 103 103 (xfs_sb_version_hasfinobt(&mp->m_sb) ? 104 - XFS_FSOP_GEOM_FLAGS_FINOBT : 0); 104 + XFS_FSOP_GEOM_FLAGS_FINOBT : 0) | 105 + (xfs_sb_version_hassparseinodes(&mp->m_sb) ? 106 + XFS_FSOP_GEOM_FLAGS_SPINODES : 0); 105 107 geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? 106 108 mp->m_sb.sb_logsectsize : BBSIZE; 107 109 geo->rtsectsize = mp->m_sb.sb_blocksize;

+20 -8

fs/xfs/xfs_inode.c

··· 2235 2235 */ 2236 2236 STATIC int 2237 2237 xfs_ifree_cluster( 2238 - xfs_inode_t *free_ip, 2239 - xfs_trans_t *tp, 2240 - xfs_ino_t inum) 2238 + xfs_inode_t *free_ip, 2239 + xfs_trans_t *tp, 2240 + struct xfs_icluster *xic) 2241 2241 { 2242 2242 xfs_mount_t *mp = free_ip->i_mount; 2243 2243 int blks_per_cluster; ··· 2250 2250 xfs_inode_log_item_t *iip; 2251 2251 xfs_log_item_t *lip; 2252 2252 struct xfs_perag *pag; 2253 + xfs_ino_t inum; 2253 2254 2255 + inum = xic->first_ino; 2254 2256 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); 2255 2257 blks_per_cluster = xfs_icluster_size_fsb(mp); 2256 2258 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; 2257 2259 nbufs = mp->m_ialloc_blks / blks_per_cluster; 2258 2260 2259 2261 for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { 2262 + /* 2263 + * The allocation bitmap tells us which inodes of the chunk were 2264 + * physically allocated. Skip the cluster if an inode falls into 2265 + * a sparse region. 2266 + */ 2267 + if ((xic->alloc & XFS_INOBT_MASK(inum - xic->first_ino)) == 0) { 2268 + ASSERT(((inum - xic->first_ino) % 2269 + inodes_per_cluster) == 0); 2270 + continue; 2271 + } 2272 + 2260 2273 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2261 2274 XFS_INO_TO_AGBNO(mp, inum)); 2262 2275 ··· 2427 2414 xfs_bmap_free_t *flist) 2428 2415 { 2429 2416 int error; 2430 - int delete; 2431 - xfs_ino_t first_ino; 2417 + struct xfs_icluster xic = { 0 }; 2432 2418 2433 2419 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2434 2420 ASSERT(ip->i_d.di_nlink == 0); ··· 2443 2431 if (error) 2444 2432 return error; 2445 2433 2446 - error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); 2434 + error = xfs_difree(tp, ip->i_ino, flist, &xic); 2447 2435 if (error) 2448 2436 return error; 2449 2437 ··· 2460 2448 ip->i_d.di_gen++; 2461 2449 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2462 2450 2463 - if (delete) 2464 - error = xfs_ifree_cluster(ip, tp, first_ino); 2451 + if (xic.deleted) 2452 + 
error = xfs_ifree_cluster(ip, tp, &xic); 2465 2453 2466 2454 return error; 2467 2455 }

+8 -5

fs/xfs/xfs_itable.c

··· 252 252 } 253 253 254 254 irec->ir_free |= xfs_inobt_maskn(0, idx); 255 - *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount; 255 + *icount = irec->ir_count - irec->ir_freecount; 256 256 } 257 257 258 258 return 0; ··· 415 415 goto del_cursor; 416 416 if (icount) { 417 417 irbp->ir_startino = r.ir_startino; 418 + irbp->ir_holemask = r.ir_holemask; 419 + irbp->ir_count = r.ir_count; 418 420 irbp->ir_freecount = r.ir_freecount; 419 421 irbp->ir_free = r.ir_free; 420 422 irbp++; ··· 449 447 * If this chunk has any allocated inodes, save it. 450 448 * Also start read-ahead now for this chunk. 451 449 */ 452 - if (r.ir_freecount < XFS_INODES_PER_CHUNK) { 450 + if (r.ir_freecount < r.ir_count) { 453 451 xfs_bulkstat_ichunk_ra(mp, agno, &r); 454 452 irbp->ir_startino = r.ir_startino; 453 + irbp->ir_holemask = r.ir_holemask; 454 + irbp->ir_count = r.ir_count; 455 455 irbp->ir_freecount = r.ir_freecount; 456 456 irbp->ir_free = r.ir_free; 457 457 irbp++; 458 - icount += XFS_INODES_PER_CHUNK - r.ir_freecount; 458 + icount += r.ir_count - r.ir_freecount; 459 459 } 460 460 error = xfs_btree_increment(cur, 0, &stat); 461 461 if (error || stat == 0) { ··· 603 599 agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; 604 600 buffer[bufidx].xi_startino = 605 601 XFS_AGINO_TO_INO(mp, agno, r.ir_startino); 606 - buffer[bufidx].xi_alloccount = 607 - XFS_INODES_PER_CHUNK - r.ir_freecount; 602 + buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount; 608 603 buffer[bufidx].xi_allocmask = ~r.ir_free; 609 604 if (++bufidx == bcount) { 610 605 long written;

+18 -8

fs/xfs/xfs_log_recover.c

··· 3068 3068 return -EINVAL; 3069 3069 } 3070 3070 3071 - /* existing allocation is fixed value */ 3072 - ASSERT(count == mp->m_ialloc_inos); 3073 - ASSERT(length == mp->m_ialloc_blks); 3074 - if (count != mp->m_ialloc_inos || 3075 - length != mp->m_ialloc_blks) { 3076 - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); 3071 + /* 3072 + * The inode chunk is either full or sparse and we only support 3073 + * m_ialloc_min_blks sized sparse allocations at this time. 3074 + */ 3075 + if (length != mp->m_ialloc_blks && 3076 + length != mp->m_ialloc_min_blks) { 3077 + xfs_warn(log->l_mp, 3078 + "%s: unsupported chunk length", __FUNCTION__); 3079 + return -EINVAL; 3080 + } 3081 + 3082 + /* verify inode count is consistent with extent length */ 3083 + if ((count >> mp->m_sb.sb_inopblog) != length) { 3084 + xfs_warn(log->l_mp, 3085 + "%s: inconsistent inode count and chunk length", 3086 + __FUNCTION__); 3077 3087 return -EINVAL; 3078 3088 } 3079 3089 ··· 3101 3091 XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) 3102 3092 return 0; 3103 3093 3104 - xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, 3105 - be32_to_cpu(icl->icl_gen)); 3094 + xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length, 3095 + be32_to_cpu(icl->icl_gen)); 3106 3096 return 0; 3107 3097 } 3108 3098

+16

fs/xfs/xfs_mount.c

··· 725 725 } 726 726 727 727 /* 728 + * If enabled, sparse inode chunk alignment is expected to match the 729 + * cluster size. Full inode chunk alignment must match the chunk size, 730 + * but that is checked on sb read verification... 731 + */ 732 + if (xfs_sb_version_hassparseinodes(&mp->m_sb) && 733 + mp->m_sb.sb_spino_align != 734 + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) { 735 + xfs_warn(mp, 736 + "Sparse inode block alignment (%u) must match cluster size (%llu).", 737 + mp->m_sb.sb_spino_align, 738 + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)); 739 + error = -EINVAL; 740 + goto out_remove_uuid; 741 + } 742 + 743 + /* 728 744 * Set inode alignment fields 729 745 */ 730 746 xfs_set_inoalignment(mp);

+2

fs/xfs/xfs_mount.h

··· 101 101 __uint64_t m_flags; /* global mount flags */ 102 102 int m_ialloc_inos; /* inodes in inode allocation */ 103 103 int m_ialloc_blks; /* blocks in inode allocation */ 104 + int m_ialloc_min_blks;/* min blocks in sparse inode 105 + * allocation */ 104 106 int m_inoalign_mask;/* mask sb_inoalignmt if used */ 105 107 uint m_qflags; /* quota status flags */ 106 108 struct xfs_trans_resv m_resv; /* precomputed res values */

+47

fs/xfs/xfs_trace.h

··· 738 738 __entry->blocks, __entry->shift, __entry->writeio_blocks) 739 739 ) 740 740 741 + TRACE_EVENT(xfs_irec_merge_pre, 742 + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, 743 + uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask), 744 + TP_ARGS(mp, agno, agino, holemask, nagino, nholemask), 745 + TP_STRUCT__entry( 746 + __field(dev_t, dev) 747 + __field(xfs_agnumber_t, agno) 748 + __field(xfs_agino_t, agino) 749 + __field(uint16_t, holemask) 750 + __field(xfs_agino_t, nagino) 751 + __field(uint16_t, nholemask) 752 + ), 753 + TP_fast_assign( 754 + __entry->dev = mp->m_super->s_dev; 755 + __entry->agno = agno; 756 + __entry->agino = agino; 757 + __entry->holemask = holemask; 758 + __entry->nagino = nagino; 759 + __entry->nholemask = nholemask; 760 + ), 761 + TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)", 762 + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, 763 + __entry->agino, __entry->holemask, __entry->nagino, 764 + __entry->nholemask) 765 + ) 766 + 767 + TRACE_EVENT(xfs_irec_merge_post, 768 + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, 769 + uint16_t holemask), 770 + TP_ARGS(mp, agno, agino, holemask), 771 + TP_STRUCT__entry( 772 + __field(dev_t, dev) 773 + __field(xfs_agnumber_t, agno) 774 + __field(xfs_agino_t, agino) 775 + __field(uint16_t, holemask) 776 + ), 777 + TP_fast_assign( 778 + __entry->dev = mp->m_super->s_dev; 779 + __entry->agno = agno; 780 + __entry->agino = agino; 781 + __entry->holemask = holemask; 782 + ), 783 + TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev), 784 + MINOR(__entry->dev), __entry->agno, __entry->agino, 785 + __entry->holemask) 786 + ) 787 + 741 788 #define DEFINE_IREF_EVENT(name) \ 742 789 DEFINE_EVENT(xfs_iref_class, name, \ 743 790 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \