Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: Speedup ext4 orphan inode handling

Ext4 orphan inode handling is a bottleneck for workloads which heavily
truncate / unlink small files, since it contends on the global
s_orphan_lock mutex (and it is generally difficult to improve the
scalability of the on-disk linked list of orphaned inodes).

This patch implements a new way of handling orphan inodes. Instead of
linking an orphaned inode into a linked list, we store its inode number in
a new special file which we call the "orphan file". Only if there's no more
space in the orphan file (too many inodes are currently orphaned) do we
fall back to using the old-style linked list. Currently we protect
operations in the orphan file with a spinlock for simplicity, but even in
this setting we can substantially reduce the length of the critical
section and thus speed up some workloads. In the next patch we improve
this by making orphan handling lockless.

Note that the change is backwards compatible when the filesystem is
clean: the existence of the orphan file is a compat feature, and when
mounting the filesystem read-write we set an additional ro-compat feature
indicating that the orphan file needs scanning for orphaned inodes. This
ro-compat feature gets cleared on unmount / remount read-only.

Some performance data from an 80-CPU Xeon server with 512 GB of RAM,
with the filesystem located on an SSD, average of 5 runs:

stress-orphan (microbenchmark truncating files byte-by-byte from N
processes in parallel)

Threads   Time (Vanilla)   Time (Patched)
      1         1.057200         0.945600
      2         1.680400         1.331800
      4         2.547000         1.995000
      8         7.049400         6.424200
     16        14.827800        14.937600
     32        40.948200        33.038200
     64        87.787400        60.823600
    128       206.504000       122.941400

So we can see significant wins nearly across the board (the 16-thread
result is on par with vanilla).

Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210816095713.16537-3-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

authored by

Jan Kara and committed by
Theodore Ts'o
02f310fc 25c6d98f

+394 -52
+62 -7
fs/ext4/ext4.h
··· 1034 1034 */ 1035 1035 struct rw_semaphore xattr_sem; 1036 1036 1037 - struct list_head i_orphan; /* unlinked but open inodes */ 1037 + /* 1038 + * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise 1039 + * i_orphan is used. 1040 + */ 1041 + union { 1042 + struct list_head i_orphan; /* unlinked but open inodes */ 1043 + unsigned int i_orphan_idx; /* Index in orphan file */ 1044 + }; 1038 1045 1039 1046 /* Fast commit related info */ 1040 1047 ··· 1435 1428 __u8 s_last_error_errcode; 1436 1429 __le16 s_encoding; /* Filename charset encoding */ 1437 1430 __le16 s_encoding_flags; /* Filename charset encoding flags */ 1438 - __le32 s_reserved[95]; /* Padding to the end of the block */ 1431 + __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ 1432 + __le32 s_reserved[94]; /* Padding to the end of the block */ 1439 1433 __le32 s_checksum; /* crc32c(superblock) */ 1440 1434 }; 1441 1435 ··· 1457 1449 1458 1450 /* Types of ext4 journal triggers */ 1459 1451 enum ext4_journal_trigger_type { 1452 + EXT4_JTR_ORPHAN_FILE, 1460 1453 EXT4_JTR_NONE /* This must be the last entry for indexing to work! */ 1461 1454 }; 1462 1455 ··· 1473 1464 { 1474 1465 return container_of(trigger, struct ext4_journal_trigger, tr_triggers); 1475 1466 } 1467 + 1468 + #define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04 1469 + 1470 + /* Structure at the tail of orphan block */ 1471 + struct ext4_orphan_block_tail { 1472 + __le32 ob_magic; 1473 + __le32 ob_checksum; 1474 + }; 1475 + 1476 + static inline int ext4_inodes_per_orphan_block(struct super_block *sb) 1477 + { 1478 + return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) / 1479 + sizeof(u32); 1480 + } 1481 + 1482 + struct ext4_orphan_block { 1483 + int ob_free_entries; /* Number of free orphan entries in block */ 1484 + struct buffer_head *ob_bh; /* Buffer for orphan block */ 1485 + }; 1486 + 1487 + /* 1488 + * Info about orphan file. 
1489 + */ 1490 + struct ext4_orphan_info { 1491 + spinlock_t of_lock; 1492 + int of_blocks; /* Number of orphan blocks in a file */ 1493 + __u32 of_csum_seed; /* Checksum seed for orphan file */ 1494 + struct ext4_orphan_block *of_binfo; /* Array with info about orphan 1495 + * file blocks */ 1496 + }; 1476 1497 1477 1498 /* 1478 1499 * fourth extended-fs super-block data in memory ··· 1558 1519 1559 1520 /* Journaling */ 1560 1521 struct journal_s *s_journal; 1561 - struct list_head s_orphan; 1562 - struct mutex s_orphan_lock; 1563 1522 unsigned long s_ext4_flags; /* Ext4 superblock flags */ 1523 + struct mutex s_orphan_lock; /* Protects on disk list changes */ 1524 + struct list_head s_orphan; /* List of orphaned inodes in on disk 1525 + list */ 1526 + struct ext4_orphan_info s_orphan_info; 1564 1527 unsigned long s_commit_interval; 1565 1528 u32 s_max_batch_time; 1566 1529 u32 s_min_batch_time; ··· 1900 1859 EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ 1901 1860 EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ 1902 1861 EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ 1862 + EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ 1903 1863 }; 1904 1864 1905 1865 #define EXT4_INODE_BIT_FNS(name, field, offset) \ ··· 2002 1960 */ 2003 1961 #define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 2004 1962 #define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 1963 + #define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */ 2005 1964 2006 1965 #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 2007 1966 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 ··· 2023 1980 #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 2024 1981 #define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 2025 1982 #define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 1983 + #define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be 1984 + non-empty */ 2026 1985 2027 1986 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 2028 1987 #define 
EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 ··· 2108 2063 EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) 2109 2064 EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT) 2110 2065 EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) 2066 + EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE) 2111 2067 2112 2068 EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) 2113 2069 EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) ··· 2123 2077 EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) 2124 2078 EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) 2125 2079 EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) 2080 + EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT) 2126 2081 2127 2082 EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) 2128 2083 EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) ··· 2157 2110 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ 2158 2111 EXT4_FEATURE_RO_COMPAT_BTREE_DIR) 2159 2112 2160 - #define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR 2113 + #define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \ 2114 + EXT4_FEATURE_COMPAT_ORPHAN_FILE) 2161 2115 #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ 2162 2116 EXT4_FEATURE_INCOMPAT_RECOVER| \ 2163 2117 EXT4_FEATURE_INCOMPAT_META_BG| \ ··· 2183 2135 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ 2184 2136 EXT4_FEATURE_RO_COMPAT_QUOTA |\ 2185 2137 EXT4_FEATURE_RO_COMPAT_PROJECT |\ 2186 - EXT4_FEATURE_RO_COMPAT_VERITY) 2138 + EXT4_FEATURE_RO_COMPAT_VERITY |\ 2139 + EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT) 2187 2140 2188 2141 #define EXTN_FEATURE_FUNCS(ver) \ 2189 2142 static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ ··· 2233 2184 { 2234 2185 return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); 2235 2186 } 2236 - 2237 2187 2238 2188 /* 2239 2189 * Default values for user and/or group using reserved blocks ··· 3816 3768 extern int ext4_orphan_del(handle_t *, struct inode *); 3817 3769 extern void ext4_orphan_cleanup(struct 
super_block *sb, 3818 3770 struct ext4_super_block *es); 3771 + extern void ext4_release_orphan_info(struct super_block *sb); 3772 + extern int ext4_init_orphan_info(struct super_block *sb); 3773 + extern int ext4_orphan_file_empty(struct super_block *sb); 3774 + extern void ext4_orphan_file_block_trigger( 3775 + struct jbd2_buffer_trigger_type *triggers, 3776 + struct buffer_head *bh, 3777 + void *data, size_t size); 3819 3778 3820 3779 /* 3821 3780 * Add new method to test whether block and inode bitmaps are properly
+2 -1
fs/ext4/inode.c
··· 4624 4624 ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || 4625 4625 ino == le32_to_cpu(es->s_usr_quota_inum) || 4626 4626 ino == le32_to_cpu(es->s_grp_quota_inum) || 4627 - ino == le32_to_cpu(es->s_prj_quota_inum))) || 4627 + ino == le32_to_cpu(es->s_prj_quota_inum) || 4628 + ino == le32_to_cpu(es->s_orphan_file_inum))) || 4628 4629 (ino < EXT4_ROOT_INO) || 4629 4630 (ino > le32_to_cpu(es->s_inodes_count))) { 4630 4631 if (flags & EXT4_IGET_HANDLE)
+301 -39
fs/ext4/orphan.c
··· 8 8 #include "ext4.h" 9 9 #include "ext4_jbd2.h" 10 10 11 + static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) 12 + { 13 + int i, j; 14 + struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; 15 + int ret = 0; 16 + __le32 *bdata; 17 + int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); 18 + 19 + spin_lock(&oi->of_lock); 20 + for (i = 0; i < oi->of_blocks && !oi->of_binfo[i].ob_free_entries; i++); 21 + if (i == oi->of_blocks) { 22 + spin_unlock(&oi->of_lock); 23 + /* 24 + * For now we don't grow or shrink orphan file. We just use 25 + * whatever was allocated at mke2fs time. The additional 26 + * credits we would have to reserve for each orphan inode 27 + * operation just don't seem worth it. 28 + */ 29 + return -ENOSPC; 30 + } 31 + oi->of_binfo[i].ob_free_entries--; 32 + spin_unlock(&oi->of_lock); 33 + 34 + /* 35 + * Get access to orphan block. We have dropped of_lock but since we 36 + * have decremented number of free entries we are guaranteed free entry 37 + * in our block. 
38 + */ 39 + ret = ext4_journal_get_write_access(handle, inode->i_sb, 40 + oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE); 41 + if (ret) 42 + return ret; 43 + 44 + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); 45 + spin_lock(&oi->of_lock); 46 + /* Find empty slot in a block */ 47 + for (j = 0; j < inodes_per_ob && bdata[j]; j++); 48 + BUG_ON(j == inodes_per_ob); 49 + bdata[j] = cpu_to_le32(inode->i_ino); 50 + EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; 51 + ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); 52 + spin_unlock(&oi->of_lock); 53 + 54 + return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh); 55 + } 56 + 11 57 /* 12 58 * ext4_orphan_add() links an unlinked or truncated inode into a list of 13 59 * such inodes, starting at the superblock, in case we crash before the ··· 80 34 WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && 81 35 !inode_is_locked(inode)); 82 36 /* 83 - * Exit early if inode already is on orphan list. This is a big speedup 84 - * since we don't have to contend on the global s_orphan_lock. 37 + * Inode orphaned in orphan file or in orphan list? 
85 38 */ 86 - if (!list_empty(&EXT4_I(inode)->i_orphan)) 39 + if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || 40 + !list_empty(&EXT4_I(inode)->i_orphan)) 87 41 return 0; 88 42 89 43 /* ··· 94 48 */ 95 49 ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 96 50 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); 51 + 52 + if (sbi->s_orphan_info.of_blocks) { 53 + err = ext4_orphan_file_add(handle, inode); 54 + /* 55 + * Fallback to normal orphan list of orphan file is 56 + * out of space 57 + */ 58 + if (err != -ENOSPC) 59 + return err; 60 + } 97 61 98 62 BUFFER_TRACE(sbi->s_sbh, "get_write_access"); 99 63 err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, ··· 159 103 return err; 160 104 } 161 105 106 + static int ext4_orphan_file_del(handle_t *handle, struct inode *inode) 107 + { 108 + struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; 109 + __le32 *bdata; 110 + int blk, off; 111 + int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); 112 + int ret = 0; 113 + 114 + if (!handle) 115 + goto out; 116 + blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob; 117 + off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob; 118 + if (WARN_ON_ONCE(blk >= oi->of_blocks)) 119 + goto out; 120 + 121 + ret = ext4_journal_get_write_access(handle, inode->i_sb, 122 + oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE); 123 + if (ret) 124 + goto out; 125 + 126 + bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data); 127 + spin_lock(&oi->of_lock); 128 + bdata[off] = 0; 129 + oi->of_binfo[blk].ob_free_entries++; 130 + spin_unlock(&oi->of_lock); 131 + ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh); 132 + out: 133 + ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE); 134 + INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan); 135 + 136 + return ret; 137 + } 138 + 162 139 /* 163 140 * ext4_orphan_del() removes an unlinked or truncated inode from the list 164 141 * of such inodes stored on disk, because it is finally being cleaned up. 
··· 210 121 211 122 WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && 212 123 !inode_is_locked(inode)); 124 + if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) 125 + return ext4_orphan_file_del(handle, inode); 126 + 213 127 /* Do this quick check before taking global s_orphan_lock. */ 214 128 if (list_empty(&ei->i_orphan)) 215 129 return 0; ··· 292 200 } 293 201 #endif 294 202 203 + static void ext4_process_orphan(struct inode *inode, 204 + int *nr_truncates, int *nr_orphans) 205 + { 206 + struct super_block *sb = inode->i_sb; 207 + int ret; 208 + 209 + dquot_initialize(inode); 210 + if (inode->i_nlink) { 211 + if (test_opt(sb, DEBUG)) 212 + ext4_msg(sb, KERN_DEBUG, 213 + "%s: truncating inode %lu to %lld bytes", 214 + __func__, inode->i_ino, inode->i_size); 215 + jbd_debug(2, "truncating inode %lu to %lld bytes\n", 216 + inode->i_ino, inode->i_size); 217 + inode_lock(inode); 218 + truncate_inode_pages(inode->i_mapping, inode->i_size); 219 + ret = ext4_truncate(inode); 220 + if (ret) { 221 + /* 222 + * We need to clean up the in-core orphan list 223 + * manually if ext4_truncate() failed to get a 224 + * transaction handle. 225 + */ 226 + ext4_orphan_del(NULL, inode); 227 + ext4_std_error(inode->i_sb, ret); 228 + } 229 + inode_unlock(inode); 230 + (*nr_truncates)++; 231 + } else { 232 + if (test_opt(sb, DEBUG)) 233 + ext4_msg(sb, KERN_DEBUG, 234 + "%s: deleting unreferenced inode %lu", 235 + __func__, inode->i_ino); 236 + jbd_debug(2, "deleting unreferenced inode %lu\n", 237 + inode->i_ino); 238 + (*nr_orphans)++; 239 + } 240 + iput(inode); /* The delete magic happens here! */ 241 + } 242 + 295 243 /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at 296 244 * the superblock) which were deleted from all directories, but held open by 297 245 * a process at the time of a crash. 
We walk the list and try to delete these ··· 352 220 void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) 353 221 { 354 222 unsigned int s_flags = sb->s_flags; 355 - int ret, nr_orphans = 0, nr_truncates = 0; 223 + int nr_orphans = 0, nr_truncates = 0; 224 + struct inode *inode; 225 + int i, j; 356 226 #ifdef CONFIG_QUOTA 357 227 int quota_update = 0; 358 - int i; 359 228 #endif 360 - if (!es->s_last_orphan) { 229 + __le32 *bdata; 230 + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; 231 + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); 232 + 233 + if (!es->s_last_orphan && !oi->of_blocks) { 361 234 jbd_debug(4, "no orphan inodes to clean up\n"); 362 235 return; 363 236 } ··· 426 289 #endif 427 290 428 291 while (es->s_last_orphan) { 429 - struct inode *inode; 430 - 431 292 /* 432 293 * We may have encountered an error during cleanup; if 433 294 * so, skip the rest. ··· 443 308 } 444 309 445 310 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 446 - dquot_initialize(inode); 447 - if (inode->i_nlink) { 448 - if (test_opt(sb, DEBUG)) 449 - ext4_msg(sb, KERN_DEBUG, 450 - "%s: truncating inode %lu to %lld bytes", 451 - __func__, inode->i_ino, inode->i_size); 452 - jbd_debug(2, "truncating inode %lu to %lld bytes\n", 453 - inode->i_ino, inode->i_size); 454 - inode_lock(inode); 455 - truncate_inode_pages(inode->i_mapping, inode->i_size); 456 - ret = ext4_truncate(inode); 457 - if (ret) { 458 - /* 459 - * We need to clean up the in-core orphan list 460 - * manually if ext4_truncate() failed to get a 461 - * transaction handle. 
462 - */ 463 - ext4_orphan_del(NULL, inode); 464 - ext4_std_error(inode->i_sb, ret); 465 - } 466 - inode_unlock(inode); 467 - nr_truncates++; 468 - } else { 469 - if (test_opt(sb, DEBUG)) 470 - ext4_msg(sb, KERN_DEBUG, 471 - "%s: deleting unreferenced inode %lu", 472 - __func__, inode->i_ino); 473 - jbd_debug(2, "deleting unreferenced inode %lu\n", 474 - inode->i_ino); 475 - nr_orphans++; 311 + ext4_process_orphan(inode, &nr_truncates, &nr_orphans); 312 + } 313 + 314 + for (i = 0; i < oi->of_blocks; i++) { 315 + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); 316 + for (j = 0; j < inodes_per_ob; j++) { 317 + if (!bdata[j]) 318 + continue; 319 + inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j])); 320 + if (IS_ERR(inode)) 321 + continue; 322 + ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); 323 + EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; 324 + ext4_process_orphan(inode, &nr_truncates, &nr_orphans); 476 325 } 477 - iput(inode); /* The delete magic happens here! */ 478 326 } 479 327 480 328 #define PLURAL(x) (x), ((x) == 1) ? 
"" : "s" ··· 478 360 } 479 361 #endif 480 362 sb->s_flags = s_flags; /* Restore SB_RDONLY status */ 363 + } 364 + 365 + void ext4_release_orphan_info(struct super_block *sb) 366 + { 367 + int i; 368 + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; 369 + 370 + if (!oi->of_blocks) 371 + return; 372 + for (i = 0; i < oi->of_blocks; i++) 373 + brelse(oi->of_binfo[i].ob_bh); 374 + kfree(oi->of_binfo); 375 + } 376 + 377 + static struct ext4_orphan_block_tail *ext4_orphan_block_tail( 378 + struct super_block *sb, 379 + struct buffer_head *bh) 380 + { 381 + return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize - 382 + sizeof(struct ext4_orphan_block_tail)); 383 + } 384 + 385 + static int ext4_orphan_file_block_csum_verify(struct super_block *sb, 386 + struct buffer_head *bh) 387 + { 388 + __u32 calculated; 389 + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); 390 + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; 391 + struct ext4_orphan_block_tail *ot; 392 + __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); 393 + 394 + if (!ext4_has_metadata_csum(sb)) 395 + return 1; 396 + 397 + ot = ext4_orphan_block_tail(sb, bh); 398 + calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, 399 + (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); 400 + calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, 401 + inodes_per_ob * sizeof(__u32)); 402 + return le32_to_cpu(ot->ob_checksum) == calculated; 403 + } 404 + 405 + /* This gets called only when checksumming is enabled */ 406 + void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, 407 + struct buffer_head *bh, 408 + void *data, size_t size) 409 + { 410 + struct super_block *sb = EXT4_TRIGGER(triggers)->sb; 411 + __u32 csum; 412 + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); 413 + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; 414 + struct ext4_orphan_block_tail *ot; 415 + __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); 416 + 417 + 
csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, 418 + (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); 419 + csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, 420 + inodes_per_ob * sizeof(__u32)); 421 + ot = ext4_orphan_block_tail(sb, bh); 422 + ot->ob_checksum = cpu_to_le32(csum); 423 + } 424 + 425 + int ext4_init_orphan_info(struct super_block *sb) 426 + { 427 + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; 428 + struct inode *inode; 429 + int i, j; 430 + int ret; 431 + int free; 432 + __le32 *bdata; 433 + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); 434 + struct ext4_orphan_block_tail *ot; 435 + ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum); 436 + 437 + spin_lock_init(&oi->of_lock); 438 + 439 + if (!ext4_has_feature_orphan_file(sb)) 440 + return 0; 441 + 442 + inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL); 443 + if (IS_ERR(inode)) { 444 + ext4_msg(sb, KERN_ERR, "get orphan inode failed"); 445 + return PTR_ERR(inode); 446 + } 447 + oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; 448 + oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; 449 + oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block), 450 + GFP_KERNEL); 451 + if (!oi->of_binfo) { 452 + ret = -ENOMEM; 453 + goto out_put; 454 + } 455 + for (i = 0; i < oi->of_blocks; i++) { 456 + oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0); 457 + if (IS_ERR(oi->of_binfo[i].ob_bh)) { 458 + ret = PTR_ERR(oi->of_binfo[i].ob_bh); 459 + goto out_free; 460 + } 461 + if (!oi->of_binfo[i].ob_bh) { 462 + ret = -EIO; 463 + goto out_free; 464 + } 465 + ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh); 466 + if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) { 467 + ext4_error(sb, "orphan file block %d: bad magic", i); 468 + ret = -EIO; 469 + goto out_free; 470 + } 471 + if (!ext4_orphan_file_block_csum_verify(sb, 472 + oi->of_binfo[i].ob_bh)) { 473 + ext4_error(sb, "orphan file block %d: bad checksum", i); 474 + ret = -EIO; 475 + goto 
out_free; 476 + } 477 + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); 478 + free = 0; 479 + for (j = 0; j < inodes_per_ob; j++) 480 + if (bdata[j] == 0) 481 + free++; 482 + oi->of_binfo[i].ob_free_entries = free; 483 + } 484 + iput(inode); 485 + return 0; 486 + out_free: 487 + for (i--; i >= 0; i--) 488 + brelse(oi->of_binfo[i].ob_bh); 489 + kfree(oi->of_binfo); 490 + out_put: 491 + iput(inode); 492 + return ret; 493 + } 494 + 495 + int ext4_orphan_file_empty(struct super_block *sb) 496 + { 497 + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; 498 + int i; 499 + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); 500 + 501 + if (!ext4_has_feature_orphan_file(sb)) 502 + return 1; 503 + for (i = 0; i < oi->of_blocks; i++) 504 + if (oi->of_binfo[i].ob_free_entries != inodes_per_ob) 505 + return 0; 506 + return 1; 481 507 }
+29 -5
fs/ext4/super.c
··· 1174 1174 1175 1175 flush_work(&sbi->s_error_work); 1176 1176 destroy_workqueue(sbi->rsv_conversion_wq); 1177 + ext4_release_orphan_info(sb); 1177 1178 1178 1179 /* 1179 1180 * Unregister sysfs before destroying jbd2 journal. ··· 1200 1199 1201 1200 if (!sb_rdonly(sb) && !aborted) { 1202 1201 ext4_clear_feature_journal_needs_recovery(sb); 1202 + ext4_clear_feature_orphan_present(sb); 1203 1203 es->s_state = cpu_to_le16(sbi->s_mount_state); 1204 1204 } 1205 1205 if (!sb_rdonly(sb)) ··· 2686 2684 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); 2687 2685 le16_add_cpu(&es->s_mnt_count, 1); 2688 2686 ext4_update_tstamp(es, s_mtime); 2689 - if (sbi->s_journal) 2687 + if (sbi->s_journal) { 2690 2688 ext4_set_feature_journal_needs_recovery(sb); 2689 + if (ext4_has_feature_orphan_file(sb)) 2690 + ext4_set_feature_orphan_present(sb); 2691 + } 2691 2692 2692 2693 err = ext4_commit_super(sb); 2693 2694 done: ··· 3965 3960 silent = 1; 3966 3961 goto cantfind_ext4; 3967 3962 } 3963 + ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, 3964 + ext4_orphan_file_block_trigger); 3968 3965 3969 3966 /* Load the checksum driver */ 3970 3967 sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); ··· 4631 4624 sb->s_root = NULL; 4632 4625 4633 4626 needs_recovery = (es->s_last_orphan != 0 || 4627 + ext4_has_feature_orphan_present(sb) || 4634 4628 ext4_has_feature_journal_needs_recovery(sb)); 4635 4629 4636 4630 if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) ··· 4930 4922 if (err) 4931 4923 goto failed_mount7; 4932 4924 4925 + err = ext4_init_orphan_info(sb); 4926 + if (err) 4927 + goto failed_mount8; 4933 4928 #ifdef CONFIG_QUOTA 4934 4929 /* Enable quota usage during mount. 
*/ 4935 4930 if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { 4936 4931 err = ext4_enable_quotas(sb); 4937 4932 if (err) 4938 - goto failed_mount8; 4933 + goto failed_mount9; 4939 4934 } 4940 4935 #endif /* CONFIG_QUOTA */ 4941 4936 ··· 4957 4946 ext4_msg(sb, KERN_INFO, "recovery complete"); 4958 4947 err = ext4_mark_recovery_complete(sb, es); 4959 4948 if (err) 4960 - goto failed_mount8; 4949 + goto failed_mount9; 4961 4950 } 4962 4951 if (EXT4_SB(sb)->s_journal) { 4963 4952 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) ··· 5003 4992 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); 5004 4993 goto failed_mount; 5005 4994 4995 + failed_mount9: 4996 + ext4_release_orphan_info(sb); 5006 4997 failed_mount8: 5007 4998 ext4_unregister_sysfs(sb); 5008 4999 kobject_put(&sbi->s_kobj); ··· 5515 5502 if (err < 0) 5516 5503 goto out; 5517 5504 5518 - if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) { 5505 + if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) || 5506 + ext4_has_feature_orphan_present(sb))) { 5507 + if (!ext4_orphan_file_empty(sb)) { 5508 + ext4_error(sb, "Orphan file not empty on read-only fs."); 5509 + err = -EFSCORRUPTED; 5510 + goto out; 5511 + } 5519 5512 ext4_clear_feature_journal_needs_recovery(sb); 5513 + ext4_clear_feature_orphan_present(sb); 5520 5514 ext4_commit_super(sb); 5521 5515 } 5522 5516 out: ··· 5666 5646 5667 5647 /* Journal blocked and flushed, clear needs_recovery flag. */ 5668 5648 ext4_clear_feature_journal_needs_recovery(sb); 5649 + if (ext4_orphan_file_empty(sb)) 5650 + ext4_clear_feature_orphan_present(sb); 5669 5651 } 5670 5652 5671 5653 error = ext4_commit_super(sb); ··· 5690 5668 if (EXT4_SB(sb)->s_journal) { 5691 5669 /* Reset the needs_recovery flag before the fs is unlocked. 
*/ 5692 5670 ext4_set_feature_journal_needs_recovery(sb); 5671 + if (ext4_has_feature_orphan_file(sb)) 5672 + ext4_set_feature_orphan_present(sb); 5693 5673 } 5694 5674 5695 5675 ext4_commit_super(sb); ··· 5895 5871 * around from a previously readonly bdev mount, 5896 5872 * require a full umount/remount for now. 5897 5873 */ 5898 - if (es->s_last_orphan) { 5874 + if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) { 5899 5875 ext4_msg(sb, KERN_WARNING, "Couldn't " 5900 5876 "remount RDWR because of unprocessed " 5901 5877 "orphan inode list. Please "