Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
"Various bug fixes and cleanups for ext4; no new features this cycle"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (29 commits)
ext4: remove unnecessary wbc parameter from ext4_bio_write_page
ext4: avoid s_mb_prefetch to be zero in individual scenarios
ext4: defer saving error info from atomic context
ext4: simplify ext4 error translation
ext4: move functions in super.c
ext4: make ext4_abort() use __ext4_error()
ext4: standardize error message in ext4_protect_reserved_inode()
ext4: remove redundant sb checksum recomputation
ext4: don't remount read-only with errors=continue on reboot
ext4: fix deadlock with fs freezing and EA inodes
jbd2: add a helper to find out number of fast commit blocks
ext4: make fast_commit.h byte identical with e2fsprogs/fast_commit.h
ext4: fix fall-through warnings for Clang
ext4: add docs about fast commit idempotence
ext4: remove the unused EXT4_CURRENT_REV macro
ext4: fix an IS_ERR() vs NULL check
ext4: check for invalid block size early when mounting a file system
ext4: fix a memory leak of ext4_free_data
ext4: delete nonsensical (commented-out) code inside ext4_xattr_block_set()
ext4: update ext4_data_block_valid related comments
...

+504 -378
+50
Documentation/filesystems/ext4/journal.rst
··· 681 - Stores the TID of the commit, CRC of the fast commit of which this tag 682 represents the end of 683
··· 681 - Stores the TID of the commit, CRC of the fast commit of which this tag 682 represents the end of 683 684 + Fast Commit Replay Idempotence 685 + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 686 + 687 + Fast commit tags are idempotent in nature, provided the recovery code follows 688 + certain rules. The guiding principle that the commit path follows while 689 + committing is that it stores the result of a particular operation instead of 690 + storing the procedure. 691 + 692 + Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' 693 + was associated with inode 10. During fast commit, instead of storing this 694 + operation as a procedure "rename a to b", we store the resulting file system 695 + state as a "series" of outcomes: 696 + 697 + - Link dirent b to inode 10 698 + - Unlink dirent a 699 + - Inode 10 with valid refcount 700 + 701 + Now, when the recovery code runs, it needs to "enforce" this state on the file 702 + system. This is what guarantees idempotence of fast commit replay. 703 + 704 + Let's take an example of a procedure that is not idempotent and see how fast 705 + commits make it idempotent. Consider the following sequence of operations: 706 + 707 + 1) rm A 708 + 2) mv B A 709 + 3) read A 710 + 711 + If we store this sequence of operations as is, then the replay is not idempotent. 712 + Let's say that, while in replay, we crash after (2). During the second replay, 713 + file A (which was actually created as a result of the "mv B A" operation) would get 714 + deleted. Thus, a file named A would be absent when we try to read A. So, this 715 + sequence of operations is not idempotent. However, as mentioned above, instead 716 + of storing the procedure, fast commits store the outcome of each procedure. Thus 717 + the fast commit log for the above procedure would be as follows: 718 + 719 + (Let's assume dirent A was linked to inode 10 and dirent B was linked to 720 + inode 11 before the replay) 721 + 722 + 1) Unlink A 723 + 2) Link A to inode 11 724 + 3) Unlink B 725 + 4) Inode 11 726 + 727 + If we crash after (3), we will have file A linked to inode 11. During the second 728 + replay, we will remove file A (inode 11). But we will create it back and make 729 + it point to inode 11. We won't find B, so we'll just skip that step. At this 730 + point, the refcount for inode 11 is not reliable, but that gets fixed by the 731 + replay of the last inode 11 tag. Thus, by converting a non-idempotent procedure 732 + into a series of idempotent outcomes, fast commits ensure idempotence during 733 + the replay.
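The outcome-based design is easy to demonstrate outside the kernel. Below is a minimal userspace C sketch, not ext4 code, with every name and the toy two-entry "filesystem" invented for illustration: it models dirents as name/inode pairs, applies the four-outcome log from the example above, and converges to the same state no matter how many times it is replayed.

/* Toy model of outcome-based replay; illustrative only, not ext4 code. */
#include <stdio.h>

#define MAX_DIRENTS 8

static struct { char name; int ino; } fs[MAX_DIRENTS];
static int nents;

static void unlink_name(char name)		/* "Unlink A" */
{
	for (int i = 0; i < nents; i++)
		if (fs[i].name == name) {
			fs[i] = fs[--nents];	/* swap-remove */
			return;
		}
}

static void link_name(char name, int ino)	/* "Link A to inode 11" */
{
	unlink_name(name);			/* enforce the outcome... */
	fs[nents].name = name;			/* ...not the procedure */
	fs[nents++].ino = ino;
}

static void replay(void)			/* the four tags from above */
{
	unlink_name('A');			/* 1) Unlink A */
	link_name('A', 11);			/* 2) Link A to inode 11 */
	unlink_name('B');			/* 3) Unlink B */
	/* 4) Inode 11: the real replay recomputes the refcount here */
}

int main(void)
{
	fs[0].name = 'A'; fs[0].ino = 10;	/* dirent A -> inode 10 */
	fs[1].name = 'B'; fs[1].ino = 11;	/* dirent B -> inode 11 */
	nents = 2;

	replay();		/* first replay "crashes" at the end... */
	replay();		/* ...and the whole log is replayed again */

	for (int i = 0; i < nents; i++)		/* prints: A -> inode 11 */
		printf("%c -> inode %d\n", fs[i].name, fs[i].ino);
	return 0;
}

Running replay() once or twice leaves the same single entry, A -> inode 11, which is exactly the idempotence argument the documentation makes.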
+1 -1
fs/ext4/balloc.c
··· 185 struct ext4_sb_info *sbi = EXT4_SB(sb); 186 ext4_fsblk_t start, tmp; 187 188 - J_ASSERT_BH(bh, buffer_locked(bh)); 189 190 /* If checksum is bad mark all blocks used to prevent allocation 191 * essentially implementing a per-group read-only flag. */
··· 185 struct ext4_sb_info *sbi = EXT4_SB(sb); 186 ext4_fsblk_t start, tmp; 187 188 + ASSERT(buffer_locked(bh)); 189 190 /* If checksum is bad mark all blocks used to prevent allocation 191 * essentially implementing a per-group read-only flag. */
+7 -9
fs/ext4/block_validity.c
··· 176 err = add_system_zone(system_blks, map.m_pblk, n, ino); 177 if (err < 0) { 178 if (err == -EFSCORRUPTED) { 179 - __ext4_error(sb, __func__, __LINE__, 180 - -err, map.m_pblk, 181 - "blocks %llu-%llu from inode %u overlap system zone", 182 - map.m_pblk, 183 - map.m_pblk + map.m_len - 1, 184 - ino); 185 } 186 break; 187 } ··· 204 * 205 * The update of system_blks pointer in this function is protected by 206 * sb->s_umount semaphore. However we have to be careful as we can be 207 - * racing with ext4_data_block_valid() calls reading system_blks rbtree 208 * protected only by RCU. That's why we first build the rbtree and then 209 * swap it in place. 210 */ ··· 256 257 /* 258 * System blks rbtree complete, announce it once to prevent racing 259 - * with ext4_data_block_valid() accessing the rbtree at the same 260 * time. 261 */ 262 rcu_assign_pointer(sbi->s_system_blks, system_blks); ··· 276 * 277 * The update of system_blks pointer in this function is protected by 278 * sb->s_umount semaphore. However we have to be careful as we can be 279 - * racing with ext4_data_block_valid() calls reading system_blks rbtree 280 * protected only by RCU. So we first clear the system_blks pointer and 281 * then free the rbtree only after RCU grace period expires. 282 */
··· 176 err = add_system_zone(system_blks, map.m_pblk, n, ino); 177 if (err < 0) { 178 if (err == -EFSCORRUPTED) { 179 + EXT4_ERROR_INODE_ERR(inode, -err, 180 + "blocks %llu-%llu from inode overlap system zone", 181 + map.m_pblk, 182 + map.m_pblk + map.m_len - 1); 183 } 184 break; 185 } ··· 206 * 207 * The update of system_blks pointer in this function is protected by 208 * sb->s_umount semaphore. However we have to be careful as we can be 209 + * racing with ext4_inode_block_valid() calls reading system_blks rbtree 210 * protected only by RCU. That's why we first build the rbtree and then 211 * swap it in place. 212 */ ··· 258 259 /* 260 * System blks rbtree complete, announce it once to prevent racing 261 + * with ext4_inode_block_valid() accessing the rbtree at the same 262 * time. 263 */ 264 rcu_assign_pointer(sbi->s_system_blks, system_blks); ··· 278 * 279 * The update of system_blks pointer in this function is protected by 280 * sb->s_umount semaphore. However we have to be careful as we can be 281 + * racing with ext4_inode_block_valid() calls reading system_blks rbtree 282 * protected only by RCU. So we first clear the system_blks pointer and 283 * then free the rbtree only after RCU grace period expires. 284 */
+58 -19
fs/ext4/ext4.h
··· 98 #define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__) 99 #endif 100 101 /* data type for block offset of block group */ 102 typedef int ext4_grpblk_t; 103 ··· 1629 errseq_t s_bdev_wb_err; 1630 spinlock_t s_bdev_wb_lock; 1631 1632 /* Ext4 fast commit stuff */ 1633 atomic_t s_fc_subtid; 1634 atomic_t s_fc_ineligible_updates; ··· 1889 #define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ 1890 #define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ 1891 1892 - #define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV 1893 #define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV 1894 1895 #define EXT4_GOOD_OLD_INODE_SIZE 128 ··· 2982 ext4_group_t block_group, 2983 unsigned int flags); 2984 2985 - extern __printf(6, 7) 2986 - void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64, 2987 - const char *, ...); 2988 extern __printf(6, 7) 2989 void __ext4_error_inode(struct inode *, const char *, unsigned int, 2990 ext4_fsblk_t, int, const char *, ...); ··· 2993 const char *, ...); 2994 extern void __ext4_std_error(struct super_block *, const char *, 2995 unsigned int, int); 2996 - extern __printf(5, 6) 2997 - void __ext4_abort(struct super_block *, const char *, unsigned int, int, 2998 - const char *, ...); 2999 extern __printf(4, 5) 3000 void __ext4_warning(struct super_block *, const char *, unsigned int, 3001 const char *, ...); ··· 3022 #define EXT4_ERROR_FILE(file, block, fmt, a...) \ 3023 ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) 3024 3025 #ifdef CONFIG_PRINTK 3026 3027 #define ext4_error_inode(inode, func, line, block, fmt, ...) \ ··· 3035 #define ext4_error_file(file, func, line, block, fmt, ...) \ 3036 __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) 3037 #define ext4_error(sb, fmt, ...) \ 3038 - __ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__) 3039 #define ext4_error_err(sb, err, fmt, ...) \ 3040 - __ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__) 3041 - #define ext4_abort(sb, err, fmt, ...) \ 3042 - __ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__) 3043 #define ext4_warning(sb, fmt, ...) \ 3044 __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) 3045 #define ext4_warning_inode(inode, fmt, ...) \ ··· 3072 #define ext4_error(sb, fmt, ...) \ 3073 do { \ 3074 no_printk(fmt, ##__VA_ARGS__); \ 3075 - __ext4_error(sb, "", 0, 0, 0, " "); \ 3076 } while (0) 3077 #define ext4_error_err(sb, err, fmt, ...) \ 3078 do { \ 3079 no_printk(fmt, ##__VA_ARGS__); \ 3080 - __ext4_error(sb, "", 0, err, 0, " "); \ 3081 - } while (0) 3082 - #define ext4_abort(sb, err, fmt, ...) \ 3083 - do { \ 3084 - no_printk(fmt, ##__VA_ARGS__); \ 3085 - __ext4_abort(sb, "", 0, err, " "); \ 3086 } while (0) 3087 #define ext4_warning(sb, fmt, ...) \ 3088 do { \ ··· 3386 spin_unlock(ext4_group_lock_ptr(sb, group)); 3387 } 3388 3389 /* 3390 * Block validity checking 3391 */ ··· 3649 extern int ext4_bio_write_page(struct ext4_io_submit *io, 3650 struct page *page, 3651 int len, 3652 - struct writeback_control *wbc, 3653 bool keep_towrite); 3654 extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); 3655 extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
··· 98 #define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__) 99 #endif 100 101 + #define ASSERT(assert) \ 102 + do { \ 103 + if (unlikely(!(assert))) { \ 104 + printk(KERN_EMERG \ 105 + "Assertion failure in %s() at %s:%d: '%s'\n", \ 106 + __func__, __FILE__, __LINE__, #assert); \ 107 + BUG(); \ 108 + } \ 109 + } while (0) 110 + 111 /* data type for block offset of block group */ 112 typedef int ext4_grpblk_t; 113 ··· 1619 errseq_t s_bdev_wb_err; 1620 spinlock_t s_bdev_wb_lock; 1621 1622 + /* Information about errors that happened during this mount */ 1623 + spinlock_t s_error_lock; 1624 + int s_add_error_count; 1625 + int s_first_error_code; 1626 + __u32 s_first_error_line; 1627 + __u32 s_first_error_ino; 1628 + __u64 s_first_error_block; 1629 + const char *s_first_error_func; 1630 + time64_t s_first_error_time; 1631 + int s_last_error_code; 1632 + __u32 s_last_error_line; 1633 + __u32 s_last_error_ino; 1634 + __u64 s_last_error_block; 1635 + const char *s_last_error_func; 1636 + time64_t s_last_error_time; 1637 + /* 1638 + * If we are in a context where we cannot update error information in 1639 + * the on-disk superblock, we queue this work to do it. 1640 + */ 1641 + struct work_struct s_error_work; 1642 + 1643 /* Ext4 fast commit stuff */ 1644 atomic_t s_fc_subtid; 1645 atomic_t s_fc_ineligible_updates; ··· 1858 #define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ 1859 #define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ 1860 1861 #define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV 1862 1863 #define EXT4_GOOD_OLD_INODE_SIZE 128 ··· 2952 ext4_group_t block_group, 2953 unsigned int flags); 2954 2955 + extern __printf(7, 8) 2956 + void __ext4_error(struct super_block *, const char *, unsigned int, bool, 2957 + int, __u64, const char *, ...); 2958 extern __printf(6, 7) 2959 void __ext4_error_inode(struct inode *, const char *, unsigned int, 2960 ext4_fsblk_t, int, const char *, ...); ··· 2963 const char *, ...); 2964 extern void __ext4_std_error(struct super_block *, const char *, 2965 unsigned int, int); 2966 extern __printf(4, 5) 2967 void __ext4_warning(struct super_block *, const char *, unsigned int, 2968 const char *, ...); ··· 2995 #define EXT4_ERROR_FILE(file, block, fmt, a...) \ 2996 ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) 2997 2998 + #define ext4_abort(sb, err, fmt, a...) \ 2999 + __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a) 3000 + 3001 #ifdef CONFIG_PRINTK 3002 3003 #define ext4_error_inode(inode, func, line, block, fmt, ...) \ ··· 3005 #define ext4_error_file(file, func, line, block, fmt, ...) \ 3006 __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) 3007 #define ext4_error(sb, fmt, ...) \ 3008 + __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \ 3009 + ##__VA_ARGS__) 3010 #define ext4_error_err(sb, err, fmt, ...) \ 3011 + __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \ 3012 + ##__VA_ARGS__) 3013 #define ext4_warning(sb, fmt, ...) \ 3014 __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) 3015 #define ext4_warning_inode(inode, fmt, ...) \ ··· 3042 #define ext4_error(sb, fmt, ...) \ 3043 do { \ 3044 no_printk(fmt, ##__VA_ARGS__); \ 3045 + __ext4_error(sb, "", 0, false, 0, 0, " "); \ 3046 } while (0) 3047 #define ext4_error_err(sb, err, fmt, ...) \ 3048 do { \ 3049 no_printk(fmt, ##__VA_ARGS__); \ 3050 + __ext4_error(sb, "", 0, false, err, 0, " "); \ 3051 } while (0) 3052 #define ext4_warning(sb, fmt, ...) 
\ 3053 do { \ ··· 3361 spin_unlock(ext4_group_lock_ptr(sb, group)); 3362 } 3363 3364 + #ifdef CONFIG_QUOTA 3365 + static inline bool ext4_quota_capable(struct super_block *sb) 3366 + { 3367 + return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb)); 3368 + } 3369 + 3370 + static inline bool ext4_is_quota_journalled(struct super_block *sb) 3371 + { 3372 + struct ext4_sb_info *sbi = EXT4_SB(sb); 3373 + 3374 + return (ext4_has_feature_quota(sb) || 3375 + sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); 3376 + } 3377 + #endif 3378 + 3379 /* 3380 * Block validity checking 3381 */ ··· 3609 extern int ext4_bio_write_page(struct ext4_io_submit *io, 3610 struct page *page, 3611 int len, 3612 bool keep_towrite); 3613 extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); 3614 extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
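That new ASSERT() definition is what lets later hunks in this merge (balloc.c, fsync.c, indirect.c, namei.c, super.c) drop the jbd2-flavoured J_ASSERT()/J_ASSERT_BH() wrappers. A compilable userspace analogue of the same macro, with abort() standing in for BUG() and the unlikely() hint dropped, behaves like this:

/* Userspace analogue of the new ext4 ASSERT(); abort() stands in for BUG(). */
#include <stdio.h>
#include <stdlib.h>

#define ASSERT(assert)						\
do {								\
	if (!(assert)) {					\
		fprintf(stderr,					\
			"Assertion failure in %s() at %s:%d: '%s'\n",	\
			__func__, __FILE__, __LINE__, #assert);	\
		abort();					\
	}							\
} while (0)

int main(void)
{
	int nlink = 1;

	ASSERT(nlink > 0);	/* holds: no output, execution continues */
	ASSERT(nlink == 0);	/* fails: prints the message and aborts */
	return 0;
}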
+2 -2
fs/ext4/ext4_jbd2.c
··· 296 if (err) { 297 ext4_journal_abort_handle(where, line, __func__, 298 bh, handle, err); 299 - __ext4_abort(inode->i_sb, where, line, -err, 300 - "error %d when attempting revoke", err); 301 } 302 BUFFER_TRACE(bh, "exit"); 303 return err;
··· 296 if (err) { 297 ext4_journal_abort_handle(where, line, __func__, 298 bh, handle, err); 299 + __ext4_error(inode->i_sb, where, line, true, -err, 0, 300 + "error %d when attempting revoke", err); 301 } 302 BUFFER_TRACE(bh, "exit"); 303 return err;
+3 -6
fs/ext4/ext4_jbd2.h
··· 86 #ifdef CONFIG_QUOTA 87 /* Amount of blocks needed for quota update - we know that the structure was 88 * allocated so we need to update only data block */ 89 - #define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ 90 - ext4_has_feature_quota(sb)) ? 1 : 0) 91 /* Amount of blocks needed for quota insert/delete - we do some block writes 92 * but inode, sb and group updates are done only once */ 93 - #define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ 94 - ext4_has_feature_quota(sb)) ?\ 95 (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ 96 +3+DQUOT_INIT_REWRITE) : 0) 97 98 - #define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ 99 - ext4_has_feature_quota(sb)) ?\ 100 (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ 101 +3+DQUOT_DEL_REWRITE) : 0) 102 #else
··· 86 #ifdef CONFIG_QUOTA 87 /* Amount of blocks needed for quota update - we know that the structure was 88 * allocated so we need to update only data block */ 89 + #define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0) 90 /* Amount of blocks needed for quota insert/delete - we do some block writes 91 * but inode, sb and group updates are done only once */ 92 + #define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ 93 (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ 94 +3+DQUOT_INIT_REWRITE) : 0) 95 96 + #define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ 97 (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ 98 +3+DQUOT_DEL_REWRITE) : 0) 99 #else
+2 -3
fs/ext4/extents.c
··· 5815 int ret; 5816 5817 path = ext4_find_extent(inode, start, NULL, 0); 5818 - if (!path) 5819 - return -EINVAL; 5820 ex = path[path->p_depth].p_ext; 5821 if (!ex) { 5822 ret = -EFSCORRUPTED; ··· 5988 kfree(path); 5989 break; 5990 } 5991 - ex = path2[path2->p_depth].p_ext; 5992 for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { 5993 cmp1 = cmp2 = 0; 5994 if (i <= path->p_depth)
··· 5815 int ret; 5816 5817 path = ext4_find_extent(inode, start, NULL, 0); 5818 + if (IS_ERR(path)) 5819 + return PTR_ERR(path); 5820 ex = path[path->p_depth].p_ext; 5821 if (!ex) { 5822 ret = -EFSCORRUPTED; ··· 5988 kfree(path); 5989 break; 5990 } 5991 for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { 5992 cmp1 = cmp2 = 0; 5993 if (i <= path->p_depth)
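The extents.c hunk is the "fix an IS_ERR() vs NULL check" commit from the shortlog: ext4_find_extent() reports failure through an error pointer, never through NULL, so the old check could not fire. The kernel reserves the top 4095 addresses to encode negative errnos in pointers; a simplified, self-contained re-creation of those helpers (the real ones live in include/linux/err.h) shows why the fixed check works:

/* Simplified userspace re-creation of the kernel's ERR_PTR() helpers. */
#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)		/* encode -errno as a pointer */
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)	/* decode it again */
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)	/* top 4095 addresses = errors */
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

/* Hypothetical stand-in for ext4_find_extent(): returns ERR_PTR(), never NULL. */
static void *find_extent(int fail)
{
	static int extent;

	return fail ? ERR_PTR(-EIO) : &extent;
}

int main(void)
{
	void *path = find_extent(1);

	if (!path)			/* the old check: never taken */
		return 1;
	if (IS_ERR(path)) {		/* the fixed check: taken */
		printf("find_extent failed: %ld\n", PTR_ERR(path));
		return 1;
	}
	return 0;
}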
+61 -38
fs/ext4/fast_commit.c
··· 103 * 104 * Replay code should thus check for all the valid tails in the FC area. 105 * 106 * TODOs 107 * ----- 108 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit 109 * eligible update must be protected within ext4_fc_start_update() and 110 * ext4_fc_stop_update(). These routines are called at much higher ··· 1281 1282 /* Ext4 Replay Path Routines */ 1283 1284 - /* Get length of a particular tlv */ 1285 - static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl) 1286 - { 1287 - return le16_to_cpu(tl->fc_len); 1288 - } 1289 - 1290 - /* Get a pointer to "value" of a tlv */ 1291 - static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl) 1292 - { 1293 - return (u8 *)tl + sizeof(*tl); 1294 - } 1295 - 1296 /* Helper struct for dentry replay routines */ 1297 struct dentry_info_args { 1298 int parent_ino, dname_len, ino, inode_len; ··· 1817 iput(inode); 1818 1819 return 0; 1820 - } 1821 - 1822 - static inline const char *tag2str(u16 tag) 1823 - { 1824 - switch (tag) { 1825 - case EXT4_FC_TAG_LINK: 1826 - return "TAG_ADD_ENTRY"; 1827 - case EXT4_FC_TAG_UNLINK: 1828 - return "TAG_DEL_ENTRY"; 1829 - case EXT4_FC_TAG_ADD_RANGE: 1830 - return "TAG_ADD_RANGE"; 1831 - case EXT4_FC_TAG_CREAT: 1832 - return "TAG_CREAT_DENTRY"; 1833 - case EXT4_FC_TAG_DEL_RANGE: 1834 - return "TAG_DEL_RANGE"; 1835 - case EXT4_FC_TAG_INODE: 1836 - return "TAG_INODE"; 1837 - case EXT4_FC_TAG_PAD: 1838 - return "TAG_PAD"; 1839 - case EXT4_FC_TAG_TAIL: 1840 - return "TAG_TAIL"; 1841 - case EXT4_FC_TAG_HEAD: 1842 - return "TAG_HEAD"; 1843 - default: 1844 - return "TAG_ERROR"; 1845 - } 1846 } 1847 1848 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
··· 103 * 104 * Replay code should thus check for all the valid tails in the FC area. 105 * 106 + * Fast Commit Replay Idempotence 107 + * ------------------------------ 108 + * 109 + * Fast commit tags are idempotent in nature, provided the recovery code follows 110 + * certain rules. The guiding principle that the commit path follows while 111 + * committing is that it stores the result of a particular operation instead of 112 + * storing the procedure. 113 + * 114 + * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' 115 + * was associated with inode 10. During fast commit, instead of storing this 116 + * operation as a procedure "rename a to b", we store the resulting file system 117 + * state as a "series" of outcomes: 118 + * 119 + * - Link dirent b to inode 10 120 + * - Unlink dirent a 121 + * - Inode <10> with valid refcount 122 + * 123 + * Now, when the recovery code runs, it needs to "enforce" this state on the file 124 + * system. This is what guarantees idempotence of fast commit replay. 125 + * 126 + * Let's take an example of a procedure that is not idempotent and see how fast 127 + * commits make it idempotent. Consider the following sequence of operations: 128 + * 129 + * rm A; mv B A; read A 130 + * (x) (y) (z) 131 + * 132 + * (x), (y) and (z) are the points at which we can crash. If we store this 133 + * sequence of operations as is, then the replay is not idempotent. Let's say 134 + * that, while in replay, we crash at (z). During the second replay, file A (which was 135 + * actually created as a result of the "mv B A" operation) would get deleted. Thus, 136 + * a file named A would be absent when we try to read A. So, this sequence of 137 + * operations is not idempotent. However, as mentioned above, instead of storing 138 + * the procedure, fast commits store the outcome of each procedure. Thus the fast 139 + * commit log for the above procedure would be as follows: 140 + * 141 + * (Let's assume dirent A was linked to inode 10 and dirent B was linked to 142 + * inode 11 before the replay) 143 + * 144 + * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] 145 + * (w) (x) (y) (z) 146 + * 147 + * If we crash at (z), we will have file A linked to inode 11. During the second 148 + * replay, we will remove file A (inode 11). But we will create it back and make 149 + * it point to inode 11. We won't find B, so we'll just skip that step. At this 150 + * point, the refcount for inode 11 is not reliable, but that gets fixed by the 151 + * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get handled 152 + * similarly. Thus, by converting a non-idempotent procedure into a series of 153 + * idempotent outcomes, fast commits ensure idempotence during the replay. 154 + * 155 + * TODOs 156 + * ----- 157 + * 158 + * 0) Fast commit replay path hardening: Fast commit replay code should use 159 + * journal handles to make sure all the updates it does during the replay 160 + * path are atomic. With that, if we crash during fast commit replay and then 161 + * try recovery again, we will find a file system where the fast commit 162 + * area is invalid (because a new full commit would be found). In order to deal 163 + * with that, fast commit replay code should ensure that the "FC_REPLAY" 164 + * superblock state is persisted before starting the replay, so that after 165 + * the crash, fast commit recovery code can look at that flag and perform 166 + * fast commit recovery even if that area is invalidated by later full 167 + * commits. 
168 + * 169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit 170 * eligible update must be protected within ext4_fc_start_update() and 171 * ext4_fc_stop_update(). These routines are called at much higher ··· 1220 1221 /* Ext4 Replay Path Routines */ 1222 1223 /* Helper struct for dentry replay routines */ 1224 struct dentry_info_args { 1225 int parent_ino, dname_len, ino, inode_len; ··· 1768 iput(inode); 1769 1770 return 0; 1771 } 1772 1773 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
+61 -17
fs/ext4/fast_commit.h
··· 3 #ifndef __FAST_COMMIT_H__ 4 #define __FAST_COMMIT_H__ 5 6 /* Fast commit tags */ 7 #define EXT4_FC_TAG_ADD_RANGE 0x0001 8 #define EXT4_FC_TAG_DEL_RANGE 0x0002 ··· 55 struct ext4_fc_dentry_info { 56 __le32 fc_parent_ino; 57 __le32 fc_ino; 58 - u8 fc_dname[0]; 59 }; 60 61 /* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */ ··· 68 struct ext4_fc_tail { 69 __le32 fc_tid; 70 __le32 fc_crc; 71 - }; 72 - 73 - /* 74 - * In memory list of dentry updates that are performed on the file 75 - * system used by fast commit code. 76 - */ 77 - struct ext4_fc_dentry_update { 78 - int fcd_op; /* Type of update create / unlink / link */ 79 - int fcd_parent; /* Parent inode number */ 80 - int fcd_ino; /* Inode number */ 81 - struct qstr fcd_name; /* Dirent name */ 82 - unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */ 83 - struct list_head fcd_list; 84 }; 85 86 /* ··· 97 EXT4_FC_REASON_INODE_JOURNAL_DATA, 98 EXT4_FC_COMMIT_FAILED, 99 EXT4_FC_REASON_MAX 100 }; 101 102 struct ext4_fc_stats { ··· 151 }; 152 153 #define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1) 154 155 #define fc_for_each_tl(__start, __end, __tl) \ 156 - for (tl = (struct ext4_fc_tl *)start; \ 157 - (u8 *)tl < (u8 *)end; \ 158 - tl = (struct ext4_fc_tl *)((u8 *)tl + \ 159 sizeof(struct ext4_fc_tl) + \ 160 + le16_to_cpu(tl->fc_len))) 161 162 163 #endif /* __FAST_COMMIT_H__ */
··· 3 #ifndef __FAST_COMMIT_H__ 4 #define __FAST_COMMIT_H__ 5 6 + /* 7 + * Note this file is present in e2fsprogs/lib/ext2fs/fast_commit.h and 8 + * linux/fs/ext4/fast_commit.h. These files should always be byte identical. 9 + */ 10 + 11 /* Fast commit tags */ 12 #define EXT4_FC_TAG_ADD_RANGE 0x0001 13 #define EXT4_FC_TAG_DEL_RANGE 0x0002 ··· 50 struct ext4_fc_dentry_info { 51 __le32 fc_parent_ino; 52 __le32 fc_ino; 53 + __u8 fc_dname[0]; 54 }; 55 56 /* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */ ··· 63 struct ext4_fc_tail { 64 __le32 fc_tid; 65 __le32 fc_crc; 66 }; 67 68 /* ··· 105 EXT4_FC_REASON_INODE_JOURNAL_DATA, 106 EXT4_FC_COMMIT_FAILED, 107 EXT4_FC_REASON_MAX 108 + }; 109 + 110 + #ifdef __KERNEL__ 111 + /* 112 + * In memory list of dentry updates that are performed on the file 113 + * system used by fast commit code. 114 + */ 115 + struct ext4_fc_dentry_update { 116 + int fcd_op; /* Type of update create / unlink / link */ 117 + int fcd_parent; /* Parent inode number */ 118 + int fcd_ino; /* Inode number */ 119 + struct qstr fcd_name; /* Dirent name */ 120 + unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */ 121 + struct list_head fcd_list; 122 }; 123 124 struct ext4_fc_stats { ··· 145 }; 146 147 #define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1) 148 + #endif 149 150 #define fc_for_each_tl(__start, __end, __tl) \ 151 + for (tl = (struct ext4_fc_tl *)(__start); \ 152 + (__u8 *)tl < (__u8 *)(__end); \ 153 + tl = (struct ext4_fc_tl *)((__u8 *)tl + \ 154 sizeof(struct ext4_fc_tl) + \ 155 + le16_to_cpu(tl->fc_len))) 156 157 + static inline const char *tag2str(__u16 tag) 158 + { 159 + switch (tag) { 160 + case EXT4_FC_TAG_LINK: 161 + return "ADD_ENTRY"; 162 + case EXT4_FC_TAG_UNLINK: 163 + return "DEL_ENTRY"; 164 + case EXT4_FC_TAG_ADD_RANGE: 165 + return "ADD_RANGE"; 166 + case EXT4_FC_TAG_CREAT: 167 + return "CREAT_DENTRY"; 168 + case EXT4_FC_TAG_DEL_RANGE: 169 + return "DEL_RANGE"; 170 + case EXT4_FC_TAG_INODE: 171 + return "INODE"; 172 + case EXT4_FC_TAG_PAD: 173 + return "PAD"; 174 + case EXT4_FC_TAG_TAIL: 175 + return "TAIL"; 176 + case EXT4_FC_TAG_HEAD: 177 + return "HEAD"; 178 + default: 179 + return "ERROR"; 180 + } 181 + } 182 + 183 + /* Get length of a particular tlv */ 184 + static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl) 185 + { 186 + return le16_to_cpu(tl->fc_len); 187 + } 188 + 189 + /* Get a pointer to "value" of a tlv */ 190 + static inline __u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl) 191 + { 192 + return (__u8 *)tl + sizeof(*tl); 193 + } 194 195 #endif /* __FAST_COMMIT_H__ */
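The corrected fc_for_each_tl() above walks a type-length-value stream: each ext4_fc_tl header is followed by fc_len bytes of value, so stepping by the header size plus fc_len lands on the next tag. A host-endian userspace sketch of the same walk (le16_to_cpu() omitted, all values invented; unlike the kernel macro, this sketch also parenthesizes __tl):

/* Host-endian sketch of a fast-commit-style TLV walk; not ext4 code. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct fc_tl {				/* same shape as struct ext4_fc_tl */
	uint16_t fc_tag;
	uint16_t fc_len;
};

#define fc_for_each_tl(__start, __end, __tl)				\
	for ((__tl) = (struct fc_tl *)(__start);			\
	     (uint8_t *)(__tl) < (uint8_t *)(__end);			\
	     (__tl) = (struct fc_tl *)((uint8_t *)(__tl) +		\
			sizeof(struct fc_tl) + (__tl)->fc_len))

int main(void)
{
	uint8_t buf[64], *p = buf;
	struct fc_tl *tl, hdr;

	hdr = (struct fc_tl){ .fc_tag = 1, .fc_len = 4 };	/* tag + value */
	memcpy(p, &hdr, sizeof(hdr));
	memcpy(p + sizeof(hdr), "abcd", 4);
	p += sizeof(hdr) + 4;

	hdr = (struct fc_tl){ .fc_tag = 5, .fc_len = 0 };	/* empty tag */
	memcpy(p, &hdr, sizeof(hdr));
	p += sizeof(hdr);

	fc_for_each_tl(buf, p, tl)
		printf("tag %u, len %u\n", (unsigned)tl->fc_tag,
		       (unsigned)tl->fc_len);
	return 0;
}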
+1 -1
fs/ext4/fsync.c
··· 136 if (unlikely(ext4_forced_shutdown(sbi))) 137 return -EIO; 138 139 - J_ASSERT(ext4_journal_current_handle() == NULL); 140 141 trace_ext4_sync_file_enter(file, datasync); 142
··· 136 if (unlikely(ext4_forced_shutdown(sbi))) 137 return -EIO; 138 139 + ASSERT(ext4_journal_current_handle() == NULL); 140 141 trace_ext4_sync_file_enter(file, datasync); 142
+2 -2
fs/ext4/indirect.c
··· 534 ext4_fsblk_t first_block = 0; 535 536 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 537 - J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 538 - J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 539 depth = ext4_block_to_path(inode, map->m_lblk, offsets, 540 &blocks_to_boundary); 541
··· 534 ext4_fsblk_t first_block = 0; 535 536 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 537 + ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 538 + ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 539 depth = ext4_block_to_path(inode, map->m_lblk, offsets, 540 &blocks_to_boundary); 541
+22 -13
fs/ext4/inode.c
··· 175 */ 176 int extra_credits = 6; 177 struct ext4_xattr_inode_array *ea_inode_array = NULL; 178 179 trace_ext4_evict_inode(inode); 180 ··· 233 234 /* 235 * Protect us against freezing - iput() caller didn't have to have any 236 - * protection against it 237 */ 238 - sb_start_intwrite(inode->i_sb); 239 240 if (!IS_NOQUOTA(inode)) 241 extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); ··· 259 * cleaned up. 260 */ 261 ext4_orphan_del(NULL, inode); 262 - sb_end_intwrite(inode->i_sb); 263 goto no_delete; 264 } 265 ··· 301 stop_handle: 302 ext4_journal_stop(handle); 303 ext4_orphan_del(NULL, inode); 304 - sb_end_intwrite(inode->i_sb); 305 ext4_xattr_inode_array_free(ea_inode_array); 306 goto no_delete; 307 } ··· 331 else 332 ext4_free_inode(handle, inode); 333 ext4_journal_stop(handle); 334 - sb_end_intwrite(inode->i_sb); 335 ext4_xattr_inode_array_free(ea_inode_array); 336 return; 337 no_delete: ··· 839 int create = map_flags & EXT4_GET_BLOCKS_CREATE; 840 int err; 841 842 - J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 843 - || handle != NULL || create == 0); 844 845 map.m_lblk = block; 846 map.m_len = 1; ··· 855 if (unlikely(!bh)) 856 return ERR_PTR(-ENOMEM); 857 if (map.m_flags & EXT4_MAP_NEW) { 858 - J_ASSERT(create != 0); 859 - J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 860 - || (handle != NULL)); 861 862 /* 863 * Now that we do not always journal data, we should ··· 2064 unlock_page(page); 2065 return -ENOMEM; 2066 } 2067 - ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); 2068 ext4_io_submit(&io_submit); 2069 /* Drop io_end reference we got from init */ 2070 ext4_put_io_end_defer(io_submit.io_end); ··· 2098 len = size & ~PAGE_MASK; 2099 else 2100 len = PAGE_SIZE; 2101 - err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); 2102 if (!err) 2103 mpd->wbc->nr_to_write--; 2104 mpd->first_page++; ··· 4619 (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { 4620 if (flags & EXT4_IGET_HANDLE) 4621 return ERR_PTR(-ESTALE); 4622 - __ext4_error(sb, function, line, EFSCORRUPTED, 0, 4623 "inode #%lu: comm %s: iget: illegal inode #", 4624 ino, current->comm); 4625 return ERR_PTR(-EFSCORRUPTED);
··· 175 */ 176 int extra_credits = 6; 177 struct ext4_xattr_inode_array *ea_inode_array = NULL; 178 + bool freeze_protected = false; 179 180 trace_ext4_evict_inode(inode); 181 ··· 232 233 /* 234 * Protect us against freezing - iput() caller didn't have to have any 235 + * protection against it. When we are in a running transaction though, 236 + * we are already protected against freezing and we cannot grab further 237 + * protection due to lock ordering constraints. 238 */ 239 + if (!ext4_journal_current_handle()) { 240 + sb_start_intwrite(inode->i_sb); 241 + freeze_protected = true; 242 + } 243 244 if (!IS_NOQUOTA(inode)) 245 extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); ··· 253 * cleaned up. 254 */ 255 ext4_orphan_del(NULL, inode); 256 + if (freeze_protected) 257 + sb_end_intwrite(inode->i_sb); 258 goto no_delete; 259 } 260 ··· 294 stop_handle: 295 ext4_journal_stop(handle); 296 ext4_orphan_del(NULL, inode); 297 + if (freeze_protected) 298 + sb_end_intwrite(inode->i_sb); 299 ext4_xattr_inode_array_free(ea_inode_array); 300 goto no_delete; 301 } ··· 323 else 324 ext4_free_inode(handle, inode); 325 ext4_journal_stop(handle); 326 + if (freeze_protected) 327 + sb_end_intwrite(inode->i_sb); 328 ext4_xattr_inode_array_free(ea_inode_array); 329 return; 330 no_delete: ··· 830 int create = map_flags & EXT4_GET_BLOCKS_CREATE; 831 int err; 832 833 + ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 834 + || handle != NULL || create == 0); 835 836 map.m_lblk = block; 837 map.m_len = 1; ··· 846 if (unlikely(!bh)) 847 return ERR_PTR(-ENOMEM); 848 if (map.m_flags & EXT4_MAP_NEW) { 849 + ASSERT(create != 0); 850 + ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 851 + || (handle != NULL)); 852 853 /* 854 * Now that we do not always journal data, we should ··· 2055 unlock_page(page); 2056 return -ENOMEM; 2057 } 2058 + ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite); 2059 ext4_io_submit(&io_submit); 2060 /* Drop io_end reference we got from init */ 2061 ext4_put_io_end_defer(io_submit.io_end); ··· 2089 len = size & ~PAGE_MASK; 2090 else 2091 len = PAGE_SIZE; 2092 + err = ext4_bio_write_page(&mpd->io_submit, page, len, false); 2093 if (!err) 2094 mpd->wbc->nr_to_write--; 2095 mpd->first_page++; ··· 4610 (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { 4611 if (flags & EXT4_IGET_HANDLE) 4612 return ERR_PTR(-ESTALE); 4613 + __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, 4614 "inode #%lu: comm %s: iget: illegal inode #", 4615 ino, current->comm); 4616 return ERR_PTR(-EFSCORRUPTED);
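The inode.c hunks are the "fix deadlock with fs freezing and EA inodes" commit: evict takes freeze protection only when no transaction handle is already running, records that choice in freeze_protected, and mirrors it on every exit path. The conditional-cleanup pattern in isolation, as a pthreads sketch with invented names (a rwlock standing in for sb_start_intwrite(), not the actual kernel locking):

/* Conditional take/release pattern from the evict fix; names invented. */
#include <stdbool.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t freeze_lock = PTHREAD_RWLOCK_INITIALIZER;

static bool in_transaction;	/* stands in for ext4_journal_current_handle() */

static void evict(void)
{
	bool freeze_protected = false;

	if (!in_transaction) {			/* not already protected */
		pthread_rwlock_rdlock(&freeze_lock);
		freeze_protected = true;
	}

	/* ... inode teardown, with several early-exit paths ... */

	if (freeze_protected)			/* mirrored on each of them */
		pthread_rwlock_unlock(&freeze_lock);
}

int main(void)
{
	evict();			/* takes and drops the protection */
	in_transaction = true;
	evict();			/* relies on the caller's handle */
	printf("both evictions completed without double-locking\n");
	return 0;
}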
+9 -30
fs/ext4/mballoc.c
··· 822 spin_unlock(&sbi->s_bal_lock); 823 } 824 825 - static void mb_regenerate_buddy(struct ext4_buddy *e4b) 826 - { 827 - int count; 828 - int order = 1; 829 - void *buddy; 830 - 831 - while ((buddy = mb_find_buddy(e4b, order++, &count))) { 832 - ext4_set_bits(buddy, 0, count); 833 - } 834 - e4b->bd_info->bb_fragments = 0; 835 - memset(e4b->bd_info->bb_counters, 0, 836 - sizeof(*e4b->bd_info->bb_counters) * 837 - (e4b->bd_sb->s_blocksize_bits + 2)); 838 - 839 - ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, 840 - e4b->bd_bitmap, e4b->bd_group); 841 - } 842 - 843 /* The buddy information is attached the buddy cache inode 844 * for convenience. The information regarding each group 845 * is loaded via ext4_mb_load_buddy. The information involve ··· 1289 1290 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) 1291 { 1292 - int order = 1; 1293 - int bb_incr = 1 << (e4b->bd_blkbits - 1); 1294 void *bb; 1295 1296 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); 1297 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1298 1299 - bb = e4b->bd_buddy; 1300 while (order <= e4b->bd_blkbits + 1) { 1301 - block = block >> 1; 1302 - if (!mb_test_bit(block, bb)) { 1303 /* this block is part of buddy of order 'order' */ 1304 return order; 1305 } 1306 - bb += bb_incr; 1307 - bb_incr >>= 1; 1308 order++; 1309 } 1310 return 0; ··· 1490 sb, e4b->bd_group, 1491 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 1492 } 1493 - mb_regenerate_buddy(e4b); 1494 goto done; 1495 } 1496 ··· 2372 2373 nr = sbi->s_mb_prefetch; 2374 if (ext4_has_feature_flex_bg(sb)) { 2375 - nr = (group / sbi->s_mb_prefetch) * 2376 - sbi->s_mb_prefetch; 2377 - nr = nr + sbi->s_mb_prefetch - group; 2378 } 2379 prefetch_grp = ext4_mb_prefetch(sb, group, 2380 nr, &prefetch_ios); ··· 2710 2711 if (ext4_has_feature_flex_bg(sb)) { 2712 /* a single flex group is supposed to be read by a single IO */ 2713 - sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex; 2714 sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ 2715 } else { 2716 sbi->s_mb_prefetch = 32; ··· 5104 ext4_group_first_block_no(sb, group) + 5105 EXT4_C2B(sbi, cluster), 5106 "Block already on to-be-freed list"); 5107 return 0; 5108 } 5109 }
··· 822 spin_unlock(&sbi->s_bal_lock); 823 } 824 825 /* The buddy information is attached the buddy cache inode 826 * for convenience. The information regarding each group 827 * is loaded via ext4_mb_load_buddy. The information involve ··· 1307 1308 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) 1309 { 1310 + int order = 1, max; 1311 void *bb; 1312 1313 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); 1314 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1315 1316 while (order <= e4b->bd_blkbits + 1) { 1317 + bb = mb_find_buddy(e4b, order, &max); 1318 + if (!mb_test_bit(block >> order, bb)) { 1319 /* this block is part of buddy of order 'order' */ 1320 return order; 1321 } 1322 order++; 1323 } 1324 return 0; ··· 1512 sb, e4b->bd_group, 1513 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 1514 } 1515 goto done; 1516 } 1517 ··· 2395 2396 nr = sbi->s_mb_prefetch; 2397 if (ext4_has_feature_flex_bg(sb)) { 2398 + nr = 1 << sbi->s_log_groups_per_flex; 2399 + nr -= group & (nr - 1); 2400 + nr = min(nr, sbi->s_mb_prefetch); 2401 } 2402 prefetch_grp = ext4_mb_prefetch(sb, group, 2403 nr, &prefetch_ios); ··· 2733 2734 if (ext4_has_feature_flex_bg(sb)) { 2735 /* a single flex group is supposed to be read by a single IO */ 2736 + sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex, 2737 + BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); 2738 sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ 2739 } else { 2740 sbi->s_mb_prefetch = 32; ··· 5126 ext4_group_first_block_no(sb, group) + 5127 EXT4_C2B(sbi, cluster), 5128 "Block already on to-be-freed list"); 5129 + kmem_cache_free(ext4_free_data_cachep, new_entry); 5130 return 0; 5131 } 5132 }
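Two of the mballoc hunks are the "avoid s_mb_prefetch to be zero in individual scenarios" commit. Because the number of groups per flex group is a power of two, group & (nr - 1) is the group's offset within its flex group, so the new window runs exactly to the next flex-group boundary, capped by s_mb_prefetch and never zero. A worked example with invented values:

/* Worked example of the new flex_bg prefetch-window arithmetic. */
#include <stdio.h>

int main(void)
{
	unsigned int log_groups_per_flex = 4;	/* 16 groups per flex group */
	unsigned int s_mb_prefetch = 8;		/* the global cap */
	unsigned int group;

	for (group = 12; group <= 17; group++) {
		unsigned int nr = 1U << log_groups_per_flex;

		nr -= group & (nr - 1);	/* groups left in this flex group */
		if (nr > s_mb_prefetch)	/* i.e. min(nr, sbi->s_mb_prefetch) */
			nr = s_mb_prefetch;
		printf("group %2u: prefetch %2u groups\n", group, nr);
	}
	return 0;	/* 12:4, 13:3, 14:2, 15:1, 16:8, 17:8; never 0 */
}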
+4 -8
fs/ext4/namei.c
··· 182 return bh; 183 } 184 185 - #ifndef assert 186 - #define assert(test) J_ASSERT(test) 187 - #endif 188 - 189 #ifdef DX_DEBUG 190 #define dxtrace(command) command 191 #else ··· 839 break; 840 } 841 } 842 - assert (at == p - 1); 843 } 844 845 at = p - 1; ··· 1255 struct dx_entry *old = frame->at, *new = old + 1; 1256 int count = dx_get_count(entries); 1257 1258 - assert(count < dx_get_limit(entries)); 1259 - assert(old < entries + count); 1260 memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); 1261 dx_set_hash(new, hash); 1262 dx_set_block(new, block); ··· 2955 * hold i_mutex, or the inode can not be referenced from outside, 2956 * so i_nlink should not be bumped due to race 2957 */ 2958 - J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 2959 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); 2960 2961 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
··· 182 return bh; 183 } 184 185 #ifdef DX_DEBUG 186 #define dxtrace(command) command 187 #else ··· 843 break; 844 } 845 } 846 + ASSERT(at == p - 1); 847 } 848 849 at = p - 1; ··· 1259 struct dx_entry *old = frame->at, *new = old + 1; 1260 int count = dx_get_count(entries); 1261 1262 + ASSERT(count < dx_get_limit(entries)); 1263 + ASSERT(old < entries + count); 1264 memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); 1265 dx_set_hash(new, hash); 1266 dx_set_block(new, block); ··· 2959 * hold i_mutex, or the inode can not be referenced from outside, 2960 * so i_nlink should not be bumped due to race 2961 */ 2962 + ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 2963 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); 2964 2965 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+1 -4
fs/ext4/page-io.c
··· 111 unsigned under_io = 0; 112 unsigned long flags; 113 114 - if (!page) 115 - continue; 116 - 117 if (fscrypt_is_bounce_page(page)) { 118 bounce_page = page; 119 page = fscrypt_pagecache_page(bounce_page); ··· 435 int ext4_bio_write_page(struct ext4_io_submit *io, 436 struct page *page, 437 int len, 438 - struct writeback_control *wbc, 439 bool keep_towrite) 440 { 441 struct page *bounce_page = NULL; ··· 444 int ret = 0; 445 int nr_submitted = 0; 446 int nr_to_submit = 0; 447 448 BUG_ON(!PageLocked(page)); 449 BUG_ON(PageWriteback(page));
··· 111 unsigned under_io = 0; 112 unsigned long flags; 113 114 if (fscrypt_is_bounce_page(page)) { 115 bounce_page = page; 116 page = fscrypt_pagecache_page(bounce_page); ··· 438 int ext4_bio_write_page(struct ext4_io_submit *io, 439 struct page *page, 440 int len, 441 bool keep_towrite) 442 { 443 struct page *bounce_page = NULL; ··· 448 int ret = 0; 449 int nr_submitted = 0; 450 int nr_to_submit = 0; 451 + struct writeback_control *wbc = io->io_wbc; 452 453 BUG_ON(!PageLocked(page)); 454 BUG_ON(PageWriteback(page));
+207 -215
fs/ext4/super.c
··· 404 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 405 } 406 407 - static void __ext4_update_tstamp(__le32 *lo, __u8 *hi) 408 { 409 - time64_t now = ktime_get_real_seconds(); 410 - 411 now = clamp_val(now, 0, (1ull << 40) - 1); 412 413 *lo = cpu_to_le32(lower_32_bits(now)); ··· 417 return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); 418 } 419 #define ext4_update_tstamp(es, tstamp) \ 420 - __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) 421 #define ext4_get_tstamp(es, tstamp) \ 422 __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) 423 - 424 - static void __save_error_info(struct super_block *sb, int error, 425 - __u32 ino, __u64 block, 426 - const char *func, unsigned int line) 427 - { 428 - struct ext4_super_block *es = EXT4_SB(sb)->s_es; 429 - int err; 430 - 431 - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 432 - if (bdev_read_only(sb->s_bdev)) 433 - return; 434 - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 435 - ext4_update_tstamp(es, s_last_error_time); 436 - strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); 437 - es->s_last_error_line = cpu_to_le32(line); 438 - es->s_last_error_ino = cpu_to_le32(ino); 439 - es->s_last_error_block = cpu_to_le64(block); 440 - switch (error) { 441 - case EIO: 442 - err = EXT4_ERR_EIO; 443 - break; 444 - case ENOMEM: 445 - err = EXT4_ERR_ENOMEM; 446 - break; 447 - case EFSBADCRC: 448 - err = EXT4_ERR_EFSBADCRC; 449 - break; 450 - case 0: 451 - case EFSCORRUPTED: 452 - err = EXT4_ERR_EFSCORRUPTED; 453 - break; 454 - case ENOSPC: 455 - err = EXT4_ERR_ENOSPC; 456 - break; 457 - case ENOKEY: 458 - err = EXT4_ERR_ENOKEY; 459 - break; 460 - case EROFS: 461 - err = EXT4_ERR_EROFS; 462 - break; 463 - case EFBIG: 464 - err = EXT4_ERR_EFBIG; 465 - break; 466 - case EEXIST: 467 - err = EXT4_ERR_EEXIST; 468 - break; 469 - case ERANGE: 470 - err = EXT4_ERR_ERANGE; 471 - break; 472 - case EOVERFLOW: 473 - err = EXT4_ERR_EOVERFLOW; 474 - break; 475 - case EBUSY: 476 - err = EXT4_ERR_EBUSY; 477 - break; 478 - case ENOTDIR: 479 - err = EXT4_ERR_ENOTDIR; 480 - break; 481 - case ENOTEMPTY: 482 - err = EXT4_ERR_ENOTEMPTY; 483 - break; 484 - case ESHUTDOWN: 485 - err = EXT4_ERR_ESHUTDOWN; 486 - break; 487 - case EFAULT: 488 - err = EXT4_ERR_EFAULT; 489 - break; 490 - default: 491 - err = EXT4_ERR_UNKNOWN; 492 - } 493 - es->s_last_error_errcode = err; 494 - if (!es->s_first_error_time) { 495 - es->s_first_error_time = es->s_last_error_time; 496 - es->s_first_error_time_hi = es->s_last_error_time_hi; 497 - strncpy(es->s_first_error_func, func, 498 - sizeof(es->s_first_error_func)); 499 - es->s_first_error_line = cpu_to_le32(line); 500 - es->s_first_error_ino = es->s_last_error_ino; 501 - es->s_first_error_block = es->s_last_error_block; 502 - es->s_first_error_errcode = es->s_last_error_errcode; 503 - } 504 - /* 505 - * Start the daily error reporting function if it hasn't been 506 - * started already 507 - */ 508 - if (!es->s_error_count) 509 - mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); 510 - le32_add_cpu(&es->s_error_count, 1); 511 - } 512 - 513 - static void save_error_info(struct super_block *sb, int error, 514 - __u32 ino, __u64 block, 515 - const char *func, unsigned int line) 516 - { 517 - __save_error_info(sb, error, ino, block, func, line); 518 - if (!bdev_read_only(sb->s_bdev)) 519 - ext4_commit_super(sb, 1); 520 - } 521 522 /* 523 * The del_gendisk() function uninitializes the disk-specific data ··· 550 || system_state == SYSTEM_RESTART; 551 } 552 553 /* Deal with the reporting of failure conditions on a 
filesystem such as 554 * inconsistencies detected or read IO failures. 555 * ··· 640 * We'll just use the jbd2_journal_abort() error code to record an error in 641 * the journal instead. On recovery, the journal will complain about 642 * that error until we've noted it down and cleared it. 643 */ 644 - 645 - static void ext4_handle_error(struct super_block *sb) 646 { 647 if (test_opt(sb, WARN_ON_ERROR)) 648 WARN_ON_ONCE(1); 649 650 - if (sb_rdonly(sb)) 651 return; 652 653 - if (!test_opt(sb, ERRORS_CONT)) { 654 - journal_t *journal = EXT4_SB(sb)->s_journal; 655 - 656 - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); 657 - if (journal) 658 - jbd2_journal_abort(journal, -EIO); 659 - } 660 /* 661 * We force ERRORS_RO behavior when system is rebooting. Otherwise we 662 * could panic during 'reboot -f' as the underlying device got already 663 * disabled. 664 */ 665 - if (test_opt(sb, ERRORS_RO) || system_going_down()) { 666 - ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 667 - /* 668 - * Make sure updated value of ->s_mount_flags will be visible 669 - * before ->s_flags update 670 - */ 671 - smp_wmb(); 672 - sb->s_flags |= SB_RDONLY; 673 - } else if (test_opt(sb, ERRORS_PANIC)) { 674 panic("EXT4-fs (device %s): panic forced after error\n", 675 sb->s_id); 676 } 677 } 678 679 #define ext4_error_ratelimit(sb) \ ··· 691 "EXT4-fs error") 692 693 void __ext4_error(struct super_block *sb, const char *function, 694 - unsigned int line, int error, __u64 block, 695 const char *fmt, ...) 696 { 697 struct va_format vaf; ··· 711 va_end(args); 712 } 713 save_error_info(sb, error, 0, block, function, line); 714 - ext4_handle_error(sb); 715 } 716 717 void __ext4_error_inode(struct inode *inode, const char *function, ··· 743 } 744 save_error_info(inode->i_sb, error, inode->i_ino, block, 745 function, line); 746 - ext4_handle_error(inode->i_sb); 747 } 748 749 void __ext4_error_file(struct file *file, const char *function, ··· 782 } 783 save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block, 784 function, line); 785 - ext4_handle_error(inode->i_sb); 786 } 787 788 const char *ext4_decode_error(struct super_block *sb, int errno, ··· 850 } 851 852 save_error_info(sb, -errno, 0, 0, function, line); 853 - ext4_handle_error(sb); 854 - } 855 - 856 - /* 857 - * ext4_abort is a much stronger failure handler than ext4_error. The 858 - * abort function may be used to deal with unrecoverable failures such 859 - * as journal IO errors or ENOMEM at a critical moment in log management. 860 - * 861 - * We unconditionally force the filesystem into an ABORT|READONLY state, 862 - * unless the error response on the fs has been set to panic in which 863 - * case we take the easy way out and panic immediately. 864 - */ 865 - 866 - void __ext4_abort(struct super_block *sb, const char *function, 867 - unsigned int line, int error, const char *fmt, ...) 
868 - { 869 - struct va_format vaf; 870 - va_list args; 871 - 872 - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) 873 - return; 874 - 875 - save_error_info(sb, error, 0, 0, function, line); 876 - va_start(args, fmt); 877 - vaf.fmt = fmt; 878 - vaf.va = &args; 879 - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n", 880 - sb->s_id, function, line, &vaf); 881 - va_end(args); 882 - 883 - if (sb_rdonly(sb) == 0) { 884 - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); 885 - if (EXT4_SB(sb)->s_journal) 886 - jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); 887 - 888 - ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 889 - /* 890 - * Make sure updated value of ->s_mount_flags will be visible 891 - * before ->s_flags update 892 - */ 893 - smp_wmb(); 894 - sb->s_flags |= SB_RDONLY; 895 - } 896 - if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) 897 - panic("EXT4-fs panic from previous error\n"); 898 } 899 900 void __ext4_msg(struct super_block *sb, ··· 926 return; 927 928 trace_ext4_error(sb, function, line); 929 - __save_error_info(sb, EFSCORRUPTED, ino, block, function, line); 930 - 931 if (ext4_error_ratelimit(sb)) { 932 va_start(args, fmt); 933 vaf.fmt = fmt; ··· 941 va_end(args); 942 } 943 944 - if (test_opt(sb, WARN_ON_ERROR)) 945 - WARN_ON_ONCE(1); 946 - 947 if (test_opt(sb, ERRORS_CONT)) { 948 - ext4_commit_super(sb, 0); 949 return; 950 } 951 - 952 ext4_unlock_group(sb, grp); 953 - ext4_commit_super(sb, 1); 954 - ext4_handle_error(sb); 955 /* 956 * We only get here in the ERRORS_RO case; relocking the group 957 * may be dangerous, but nothing bad will happen since the ··· 1122 ext4_unregister_li_request(sb); 1123 ext4_quota_off_umount(sb); 1124 1125 destroy_workqueue(sbi->rsv_conversion_wq); 1126 1127 /* ··· 1182 * in-memory list had better be clean by this point. 
*/ 1183 if (!list_empty(&sbi->s_orphan)) 1184 dump_orphan_list(sb, sbi); 1185 - J_ASSERT(list_empty(&sbi->s_orphan)); 1186 1187 sync_blockdev(sb->s_bdev); 1188 invalidate_bdev(sb->s_bdev); ··· 3947 atomic64_set(&sbi->s_resv_clusters, resv_clusters); 3948 } 3949 3950 static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3951 { 3952 struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); ··· 4030 if (IS_ERR(bh)) { 4031 ext4_msg(sb, KERN_ERR, "unable to read superblock"); 4032 ret = PTR_ERR(bh); 4033 - bh = NULL; 4034 goto out_fail; 4035 } 4036 /* ··· 4143 */ 4144 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; 4145 4146 - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 4147 4148 if (blocksize == PAGE_SIZE) 4149 set_opt(sb, DIOREAD_NOLOCK); 4150 - 4151 - if (blocksize < EXT4_MIN_BLOCK_SIZE || 4152 - blocksize > EXT4_MAX_BLOCK_SIZE) { 4153 - ext4_msg(sb, KERN_ERR, 4154 - "Unsupported filesystem blocksize %d (%d log_block_size)", 4155 - blocksize, le32_to_cpu(es->s_log_block_size)); 4156 - goto failed_mount; 4157 - } 4158 4159 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { 4160 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; ··· 4379 */ 4380 if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) 4381 goto failed_mount; 4382 - 4383 - if (le32_to_cpu(es->s_log_block_size) > 4384 - (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { 4385 - ext4_msg(sb, KERN_ERR, 4386 - "Invalid log block size: %u", 4387 - le32_to_cpu(es->s_log_block_size)); 4388 - goto failed_mount; 4389 - } 4390 - if (le32_to_cpu(es->s_log_cluster_size) > 4391 - (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { 4392 - ext4_msg(sb, KERN_ERR, 4393 - "Invalid log cluster size: %u", 4394 - le32_to_cpu(es->s_log_cluster_size)); 4395 - goto failed_mount; 4396 - } 4397 4398 if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { 4399 ext4_msg(sb, KERN_ERR, ··· 4650 "can't read group descriptor %d", i); 4651 db_count = i; 4652 ret = PTR_ERR(bh); 4653 - bh = NULL; 4654 goto failed_mount2; 4655 } 4656 rcu_read_lock(); ··· 4664 } 4665 4666 timer_setup(&sbi->s_err_report, print_daily_error_info, 0); 4667 4668 /* Register extent status tree shrinker */ 4669 if (ext4_es_register_shrinker(sbi)) ··· 4821 "requested data journaling mode"); 4822 goto failed_mount_wq; 4823 } 4824 default: 4825 break; 4826 } ··· 4950 block = ext4_count_free_clusters(sb); 4951 ext4_free_blocks_count_set(sbi->s_es, 4952 EXT4_C2B(sbi, block)); 4953 - ext4_superblock_csum_set(sb); 4954 err = percpu_counter_init(&sbi->s_freeclusters_counter, block, 4955 GFP_KERNEL); 4956 if (!err) { 4957 unsigned long freei = ext4_count_free_inodes(sb); 4958 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); 4959 - ext4_superblock_csum_set(sb); 4960 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, 4961 GFP_KERNEL); 4962 } ··· 5034 5035 if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) 5036 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " 5037 - "Opts: %.*s%s%s", descr, 5038 (int) sizeof(sbi->s_es->s_mount_opts), 5039 sbi->s_es->s_mount_opts, 5040 - *sbi->s_es->s_mount_opts ? 
"; " : "", orig_data); 5041 5042 if (es->s_error_count) 5043 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ ··· 5103 ext4_es_unregister_shrinker(sbi); 5104 failed_mount3: 5105 del_timer_sync(&sbi->s_err_report); 5106 if (sbi->s_mmp_tsk) 5107 kthread_stop(sbi->s_mmp_tsk); 5108 failed_mount2: ··· 5430 5431 static int ext4_commit_super(struct super_block *sb, int sync) 5432 { 5433 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 5434 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 5435 int error = 0; ··· 5462 es->s_free_inodes_count = 5463 cpu_to_le32(percpu_counter_sum_positive( 5464 &EXT4_SB(sb)->s_freeinodes_counter)); 5465 BUFFER_TRACE(sbh, "marking dirty"); 5466 ext4_superblock_csum_set(sb); 5467 if (sync) ··· 5855 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 5856 } 5857 5858 if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { 5859 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { 5860 err = -EROFS; ··· 6016 */ 6017 *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags); 6018 6019 - ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); 6020 kfree(orig_data); 6021 return 0; 6022 ··· 6196 static int ext4_mark_dquot_dirty(struct dquot *dquot) 6197 { 6198 struct super_block *sb = dquot->dq_sb; 6199 - struct ext4_sb_info *sbi = EXT4_SB(sb); 6200 6201 - /* Are we journaling quotas? */ 6202 - if (ext4_has_feature_quota(sb) || 6203 - sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 6204 dquot_mark_dquot_dirty(dquot); 6205 return ext4_write_dquot(dquot); 6206 } else {
··· 404 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 405 } 406 407 + static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now) 408 { 409 now = clamp_val(now, 0, (1ull << 40) - 1); 410 411 *lo = cpu_to_le32(lower_32_bits(now)); ··· 419 return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); 420 } 421 #define ext4_update_tstamp(es, tstamp) \ 422 + __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \ 423 + ktime_get_real_seconds()) 424 #define ext4_get_tstamp(es, tstamp) \ 425 __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) 426 427 /* 428 * The del_gendisk() function uninitializes the disk-specific data ··· 649 || system_state == SYSTEM_RESTART; 650 } 651 652 + struct ext4_err_translation { 653 + int code; 654 + int errno; 655 + }; 656 + 657 + #define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err } 658 + 659 + static struct ext4_err_translation err_translation[] = { 660 + EXT4_ERR_TRANSLATE(EIO), 661 + EXT4_ERR_TRANSLATE(ENOMEM), 662 + EXT4_ERR_TRANSLATE(EFSBADCRC), 663 + EXT4_ERR_TRANSLATE(EFSCORRUPTED), 664 + EXT4_ERR_TRANSLATE(ENOSPC), 665 + EXT4_ERR_TRANSLATE(ENOKEY), 666 + EXT4_ERR_TRANSLATE(EROFS), 667 + EXT4_ERR_TRANSLATE(EFBIG), 668 + EXT4_ERR_TRANSLATE(EEXIST), 669 + EXT4_ERR_TRANSLATE(ERANGE), 670 + EXT4_ERR_TRANSLATE(EOVERFLOW), 671 + EXT4_ERR_TRANSLATE(EBUSY), 672 + EXT4_ERR_TRANSLATE(ENOTDIR), 673 + EXT4_ERR_TRANSLATE(ENOTEMPTY), 674 + EXT4_ERR_TRANSLATE(ESHUTDOWN), 675 + EXT4_ERR_TRANSLATE(EFAULT), 676 + }; 677 + 678 + static int ext4_errno_to_code(int errno) 679 + { 680 + int i; 681 + 682 + for (i = 0; i < ARRAY_SIZE(err_translation); i++) 683 + if (err_translation[i].errno == errno) 684 + return err_translation[i].code; 685 + return EXT4_ERR_UNKNOWN; 686 + } 687 + 688 + static void __save_error_info(struct super_block *sb, int error, 689 + __u32 ino, __u64 block, 690 + const char *func, unsigned int line) 691 + { 692 + struct ext4_sb_info *sbi = EXT4_SB(sb); 693 + 694 + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 695 + if (bdev_read_only(sb->s_bdev)) 696 + return; 697 + /* We default to EFSCORRUPTED error... */ 698 + if (error == 0) 699 + error = EFSCORRUPTED; 700 + 701 + spin_lock(&sbi->s_error_lock); 702 + sbi->s_add_error_count++; 703 + sbi->s_last_error_code = error; 704 + sbi->s_last_error_line = line; 705 + sbi->s_last_error_ino = ino; 706 + sbi->s_last_error_block = block; 707 + sbi->s_last_error_func = func; 708 + sbi->s_last_error_time = ktime_get_real_seconds(); 709 + if (!sbi->s_first_error_time) { 710 + sbi->s_first_error_code = error; 711 + sbi->s_first_error_line = line; 712 + sbi->s_first_error_ino = ino; 713 + sbi->s_first_error_block = block; 714 + sbi->s_first_error_func = func; 715 + sbi->s_first_error_time = sbi->s_last_error_time; 716 + } 717 + spin_unlock(&sbi->s_error_lock); 718 + } 719 + 720 + static void save_error_info(struct super_block *sb, int error, 721 + __u32 ino, __u64 block, 722 + const char *func, unsigned int line) 723 + { 724 + __save_error_info(sb, error, ino, block, func, line); 725 + if (!bdev_read_only(sb->s_bdev)) 726 + ext4_commit_super(sb, 1); 727 + } 728 + 729 /* Deal with the reporting of failure conditions on a filesystem such as 730 * inconsistencies detected or read IO failures. 731 * ··· 662 * We'll just use the jbd2_journal_abort() error code to record an error in 663 * the journal instead. On recovery, the journal will complain about 664 * that error until we've noted it down and cleared it. 
665 + * 666 + * If force_ro is set, we unconditionally force the filesystem into an 667 + * ABORT|READONLY state, unless the error response on the fs has been set to 668 + * panic in which case we take the easy way out and panic immediately. This is 669 + * used to deal with unrecoverable failures such as journal IO errors or ENOMEM 670 + * at a critical moment in log management. 671 */ 672 + static void ext4_handle_error(struct super_block *sb, bool force_ro) 673 { 674 + journal_t *journal = EXT4_SB(sb)->s_journal; 675 + 676 if (test_opt(sb, WARN_ON_ERROR)) 677 WARN_ON_ONCE(1); 678 679 + if (sb_rdonly(sb) || (!force_ro && test_opt(sb, ERRORS_CONT))) 680 return; 681 682 + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); 683 + if (journal) 684 + jbd2_journal_abort(journal, -EIO); 685 /* 686 * We force ERRORS_RO behavior when system is rebooting. Otherwise we 687 * could panic during 'reboot -f' as the underlying device got already 688 * disabled. 689 */ 690 + if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { 691 panic("EXT4-fs (device %s): panic forced after error\n", 692 sb->s_id); 693 } 694 + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 695 + /* 696 + * Make sure updated value of ->s_mount_flags will be visible before 697 + * ->s_flags update 698 + */ 699 + smp_wmb(); 700 + sb->s_flags |= SB_RDONLY; 701 + } 702 + 703 + static void flush_stashed_error_work(struct work_struct *work) 704 + { 705 + struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, 706 + s_error_work); 707 + 708 + ext4_commit_super(sbi->s_sb, 1); 709 } 710 711 #define ext4_error_ratelimit(sb) \ ··· 703 "EXT4-fs error") 704 705 void __ext4_error(struct super_block *sb, const char *function, 706 + unsigned int line, bool force_ro, int error, __u64 block, 707 const char *fmt, ...) 
708 { 709 struct va_format vaf; ··· 723 va_end(args); 724 } 725 save_error_info(sb, error, 0, block, function, line); 726 + ext4_handle_error(sb, force_ro); 727 } 728 729 void __ext4_error_inode(struct inode *inode, const char *function, ··· 755 } 756 save_error_info(inode->i_sb, error, inode->i_ino, block, 757 function, line); 758 + ext4_handle_error(inode->i_sb, false); 759 } 760 761 void __ext4_error_file(struct file *file, const char *function, ··· 794 } 795 save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block, 796 function, line); 797 + ext4_handle_error(inode->i_sb, false); 798 } 799 800 const char *ext4_decode_error(struct super_block *sb, int errno, ··· 862 } 863 864 save_error_info(sb, -errno, 0, 0, function, line); 865 + ext4_handle_error(sb, false); 866 } 867 868 void __ext4_msg(struct super_block *sb, ··· 982 return; 983 984 trace_ext4_error(sb, function, line); 985 if (ext4_error_ratelimit(sb)) { 986 va_start(args, fmt); 987 vaf.fmt = fmt; ··· 999 va_end(args); 1000 } 1001 1002 if (test_opt(sb, ERRORS_CONT)) { 1003 + if (test_opt(sb, WARN_ON_ERROR)) 1004 + WARN_ON_ONCE(1); 1005 + __save_error_info(sb, EFSCORRUPTED, ino, block, function, line); 1006 + schedule_work(&EXT4_SB(sb)->s_error_work); 1007 return; 1008 } 1009 ext4_unlock_group(sb, grp); 1010 + save_error_info(sb, EFSCORRUPTED, ino, block, function, line); 1011 + ext4_handle_error(sb, false); 1012 /* 1013 * We only get here in the ERRORS_RO case; relocking the group 1014 * may be dangerous, but nothing bad will happen since the ··· 1181 ext4_unregister_li_request(sb); 1182 ext4_quota_off_umount(sb); 1183 1184 + flush_work(&sbi->s_error_work); 1185 destroy_workqueue(sbi->rsv_conversion_wq); 1186 1187 /* ··· 1240 * in-memory list had better be clean by this point. */ 1241 if (!list_empty(&sbi->s_orphan)) 1242 dump_orphan_list(sb, sbi); 1243 + ASSERT(list_empty(&sbi->s_orphan)); 1244 1245 sync_blockdev(sb->s_bdev); 1246 invalidate_bdev(sb->s_bdev); ··· 4005 atomic64_set(&sbi->s_resv_clusters, resv_clusters); 4006 } 4007 4008 + static const char *ext4_quota_mode(struct super_block *sb) 4009 + { 4010 + #ifdef CONFIG_QUOTA 4011 + if (!ext4_quota_capable(sb)) 4012 + return "none"; 4013 + 4014 + if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) 4015 + return "journalled"; 4016 + else 4017 + return "writeback"; 4018 + #else 4019 + return "disabled"; 4020 + #endif 4021 + } 4022 + 4023 static int ext4_fill_super(struct super_block *sb, void *data, int silent) 4024 { 4025 struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); ··· 4073 if (IS_ERR(bh)) { 4074 ext4_msg(sb, KERN_ERR, "unable to read superblock"); 4075 ret = PTR_ERR(bh); 4076 goto out_fail; 4077 } 4078 /* ··· 4187 */ 4188 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; 4189 4190 + if (le32_to_cpu(es->s_log_block_size) > 4191 + (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { 4192 + ext4_msg(sb, KERN_ERR, 4193 + "Invalid log block size: %u", 4194 + le32_to_cpu(es->s_log_block_size)); 4195 + goto failed_mount; 4196 + } 4197 + if (le32_to_cpu(es->s_log_cluster_size) > 4198 + (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { 4199 + ext4_msg(sb, KERN_ERR, 4200 + "Invalid log cluster size: %u", 4201 + le32_to_cpu(es->s_log_cluster_size)); 4202 + goto failed_mount; 4203 + } 4204 + 4205 + blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 4206 4207 if (blocksize == PAGE_SIZE) 4208 set_opt(sb, DIOREAD_NOLOCK); 4209 4210 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { 4211 sbi->s_inode_size = 
EXT4_GOOD_OLD_INODE_SIZE; ··· 4416 */ 4417 if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) 4418 goto failed_mount; 4419 4420 if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { 4421 ext4_msg(sb, KERN_ERR, ··· 4702 "can't read group descriptor %d", i); 4703 db_count = i; 4704 ret = PTR_ERR(bh); 4705 goto failed_mount2; 4706 } 4707 rcu_read_lock(); ··· 4717 } 4718 4719 timer_setup(&sbi->s_err_report, print_daily_error_info, 0); 4720 + spin_lock_init(&sbi->s_error_lock); 4721 + INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); 4722 4723 /* Register extent status tree shrinker */ 4724 if (ext4_es_register_shrinker(sbi)) ··· 4872 "requested data journaling mode"); 4873 goto failed_mount_wq; 4874 } 4875 + break; 4876 default: 4877 break; 4878 } ··· 5000 block = ext4_count_free_clusters(sb); 5001 ext4_free_blocks_count_set(sbi->s_es, 5002 EXT4_C2B(sbi, block)); 5003 err = percpu_counter_init(&sbi->s_freeclusters_counter, block, 5004 GFP_KERNEL); 5005 if (!err) { 5006 unsigned long freei = ext4_count_free_inodes(sb); 5007 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); 5008 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, 5009 GFP_KERNEL); 5010 } ··· 5086 5087 if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) 5088 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " 5089 + "Opts: %.*s%s%s. Quota mode: %s.", descr, 5090 (int) sizeof(sbi->s_es->s_mount_opts), 5091 sbi->s_es->s_mount_opts, 5092 + *sbi->s_es->s_mount_opts ? "; " : "", orig_data, 5093 + ext4_quota_mode(sb)); 5094 5095 if (es->s_error_count) 5096 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ ··· 5154 ext4_es_unregister_shrinker(sbi); 5155 failed_mount3: 5156 del_timer_sync(&sbi->s_err_report); 5157 + flush_work(&sbi->s_error_work); 5158 if (sbi->s_mmp_tsk) 5159 kthread_stop(sbi->s_mmp_tsk); 5160 failed_mount2: ··· 5480 5481 static int ext4_commit_super(struct super_block *sb, int sync) 5482 { 5483 + struct ext4_sb_info *sbi = EXT4_SB(sb); 5484 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 5485 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 5486 int error = 0; ··· 5511 es->s_free_inodes_count = 5512 cpu_to_le32(percpu_counter_sum_positive( 5513 &EXT4_SB(sb)->s_freeinodes_counter)); 5514 + /* Copy error information to the on-disk superblock */ 5515 + spin_lock(&sbi->s_error_lock); 5516 + if (sbi->s_add_error_count > 0) { 5517 + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 5518 + if (!es->s_first_error_time && !es->s_first_error_time_hi) { 5519 + __ext4_update_tstamp(&es->s_first_error_time, 5520 + &es->s_first_error_time_hi, 5521 + sbi->s_first_error_time); 5522 + strncpy(es->s_first_error_func, sbi->s_first_error_func, 5523 + sizeof(es->s_first_error_func)); 5524 + es->s_first_error_line = 5525 + cpu_to_le32(sbi->s_first_error_line); 5526 + es->s_first_error_ino = 5527 + cpu_to_le32(sbi->s_first_error_ino); 5528 + es->s_first_error_block = 5529 + cpu_to_le64(sbi->s_first_error_block); 5530 + es->s_first_error_errcode = 5531 + ext4_errno_to_code(sbi->s_first_error_code); 5532 + } 5533 + __ext4_update_tstamp(&es->s_last_error_time, 5534 + &es->s_last_error_time_hi, 5535 + sbi->s_last_error_time); 5536 + strncpy(es->s_last_error_func, sbi->s_last_error_func, 5537 + sizeof(es->s_last_error_func)); 5538 + es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); 5539 + es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); 5540 + es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); 5541 + es->s_last_error_errcode = 5542 + 
ext4_errno_to_code(sbi->s_last_error_code); 5543 + /* 5544 + * Start the daily error reporting function if it hasn't been 5545 + * started already 5546 + */ 5547 + if (!es->s_error_count) 5548 + mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); 5549 + le32_add_cpu(&es->s_error_count, sbi->s_add_error_count); 5550 + sbi->s_add_error_count = 0; 5551 + } 5552 + spin_unlock(&sbi->s_error_lock); 5553 + 5554 BUFFER_TRACE(sbh, "marking dirty"); 5555 ext4_superblock_csum_set(sb); 5556 if (sync) ··· 5864 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 5865 } 5866 5867 + /* Flush outstanding errors before changing fs state */ 5868 + flush_work(&sbi->s_error_work); 5869 + 5870 if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { 5871 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { 5872 err = -EROFS; ··· 6022 */ 6023 *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags); 6024 6025 + ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s. Quota mode: %s.", 6026 + orig_data, ext4_quota_mode(sb)); 6027 kfree(orig_data); 6028 return 0; 6029 ··· 6201 static int ext4_mark_dquot_dirty(struct dquot *dquot) 6202 { 6203 struct super_block *sb = dquot->dq_sb; 6204 6205 + if (ext4_is_quota_journalled(sb)) { 6206 dquot_mark_dquot_dirty(dquot); 6207 return ext4_write_dquot(dquot); 6208 } else {
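
The super.c hunks above combine two techniques worth calling out: an errno-to-on-disk-code table generated by a token-pasting macro (EXT4_ERR_TRANSLATE), and error details stashed under a spinlock so that callers in atomic context never issue superblock I/O themselves; a workqueue item (flush_stashed_error_work) performs the slow write-out later. The following standalone userspace sketch mirrors both patterns. Every name in it (the DEMO_* identifiers, save_error, flush_errors) is illustrative rather than the kernel API, and a pthread mutex stands in for s_error_lock:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>

/* Hypothetical stable on-disk codes; the kernel's are EXT4_ERR_*. */
enum { DEMO_ERR_UNKNOWN = 0, DEMO_ERR_EIO, DEMO_ERR_ENOMEM, DEMO_ERR_ENOSPC };

struct demo_err_translation {
	int code;		/* stable value written to disk */
	int errnum;		/* in-memory errno */
};

/* Mirrors the EXT4_ERR_TRANSLATE() trick: token pasting builds the on-disk
 * name while the bare argument supplies the errno, keeping pairs in sync. */
#define DEMO_TRANSLATE(err) { .code = DEMO_ERR_##err, .errnum = err }

static const struct demo_err_translation demo_translations[] = {
	DEMO_TRANSLATE(EIO),
	DEMO_TRANSLATE(ENOMEM),
	DEMO_TRANSLATE(ENOSPC),
};

static int demo_errno_to_code(int errnum)
{
	size_t i;

	for (i = 0; i < sizeof(demo_translations) / sizeof(demo_translations[0]); i++)
		if (demo_translations[i].errnum == errnum)
			return demo_translations[i].code;
	return DEMO_ERR_UNKNOWN;
}

/* Error stash: cheap to update from contexts that must not sleep. */
static struct {
	pthread_mutex_t lock;	/* plays the role of sbi->s_error_lock */
	int pending;		/* like sbi->s_add_error_count */
	int last_errnum;
	time_t last_time;
} stash = { .lock = PTHREAD_MUTEX_INITIALIZER };

static void save_error(int errnum)
{
	pthread_mutex_lock(&stash.lock);
	stash.pending++;
	stash.last_errnum = errnum;
	stash.last_time = time(NULL);
	pthread_mutex_unlock(&stash.lock);
}

/* Deferred worker: the only place that does the slow write-out. */
static void flush_errors(void)
{
	pthread_mutex_lock(&stash.lock);
	if (stash.pending)
		printf("flushing %d error(s): on-disk code %d, last seen at %ld\n",
		       stash.pending, demo_errno_to_code(stash.last_errnum),
		       (long)stash.last_time);
	stash.pending = 0;
	pthread_mutex_unlock(&stash.lock);
}

int main(void)
{
	save_error(EIO);
	save_error(ENOSPC);
	flush_errors();		/* ext4 runs this from a workqueue instead */
	return 0;
}

The division of labour is the point: save_error() only takes a lock and stores a few words, so it is safe from any context, while everything slow is confined to the one deferred flush path.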
-1
fs/ext4/xattr.c
··· 1927 } else { 1928 /* Allocate a buffer where we construct the new block. */ 1929 s->base = kzalloc(sb->s_blocksize, GFP_NOFS); 1930 - /* assert(header == s->base) */ 1931 error = -ENOMEM; 1932 if (s->base == NULL) 1933 goto cleanup;
··· 1927 } else { 1928 /* Allocate a buffer where we construct the new block. */ 1929 s->base = kzalloc(sb->s_blocksize, GFP_NOFS); 1930 error = -ENOMEM; 1931 if (s->base == NULL) 1932 goto cleanup;
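
The surviving xattr.c lines also show a common kernel error-handling shape: assign the pessimistic error code before the failure check, then unwind through a single cleanup label. A minimal userspace rendering of that idiom, with purely illustrative names:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int build_block(size_t blocksize, char **out)
{
	char *base;
	int error;

	/* Zeroed buffer where the new block is constructed, as in the hunk. */
	base = calloc(1, blocksize);
	error = -ENOMEM;	/* set before the check; still correct at cleanup */
	if (base == NULL)
		goto cleanup;

	snprintf(base, blocksize, "xattr block");	/* stand-in for the real work */
	*out = base;
	return 0;

cleanup:
	return error;
}

int main(void)
{
	char *block = NULL;

	if (build_block(4096, &block) == 0) {
		printf("%s\n", block);
		free(block);
	}
	return 0;
}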
+2 -6
fs/jbd2/journal.c
··· 1869 1870 if (jbd2_has_feature_fast_commit(journal)) { 1871 journal->j_fc_last = be32_to_cpu(sb->s_maxlen); 1872 - num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks); 1873 - if (!num_fc_blocks) 1874 - num_fc_blocks = JBD2_MIN_FC_BLOCKS; 1875 if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) 1876 journal->j_last = journal->j_fc_last - num_fc_blocks; 1877 journal->j_fc_first = journal->j_last + 1; ··· 2100 journal_superblock_t *sb = journal->j_superblock; 2101 unsigned long long num_fc_blks; 2102 2103 - num_fc_blks = be32_to_cpu(sb->s_num_fc_blks); 2104 - if (num_fc_blks == 0) 2105 - num_fc_blks = JBD2_MIN_FC_BLOCKS; 2106 if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) 2107 return -ENOSPC; 2108
··· 1869 1870 if (jbd2_has_feature_fast_commit(journal)) { 1871 journal->j_fc_last = be32_to_cpu(sb->s_maxlen); 1872 + num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); 1873 if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) 1874 journal->j_last = journal->j_fc_last - num_fc_blocks; 1875 journal->j_fc_first = journal->j_last + 1; ··· 2102 journal_superblock_t *sb = journal->j_superblock; 2103 unsigned long long num_fc_blks; 2104 2105 + num_fc_blks = jbd2_journal_get_num_fc_blks(sb); 2106 if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) 2107 return -ENOSPC; 2108
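
Both journal.c call sites used to open-code "read s_num_fc_blks, fall back to a default when unset"; they now share one helper. A self-contained sketch of the same accessor pattern, including the big-endian decode that be32_to_cpu performs in the kernel (names and constants here are stand-ins):

#include <stdint.h>
#include <stdio.h>

#define DEMO_DEFAULT_FC_BLOCKS 256	/* stands in for JBD2_DEFAULT_FAST_COMMIT_BLOCKS */

/* On-disk journal fields are big-endian; decode by hand so this runs anywhere. */
static uint32_t demo_be32_to_host(const uint8_t b[4])
{
	return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) |
	       ((uint32_t)b[2] << 8)  |  (uint32_t)b[3];
}

/* One helper replaces the duplicated "read the field, fall back" logic. */
static int demo_get_num_fc_blks(const uint8_t s_num_fc_blks[4])
{
	uint32_t n = demo_be32_to_host(s_num_fc_blks);

	return n ? (int)n : DEMO_DEFAULT_FC_BLOCKS;
}

int main(void)
{
	uint8_t unset[4] = { 0, 0, 0, 0 };	/* zero on disk -> use the default */
	uint8_t set[4]   = { 0, 0, 2, 0 };	/* big-endian 512 */

	/* prints "256 512" */
	printf("%d %d\n", demo_get_num_fc_blks(unset), demo_get_num_fc_blks(set));
	return 0;
}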
+11 -3
include/linux/jbd2.h
··· 68 extern void jbd2_free(void *ptr, size_t size); 69 70 #define JBD2_MIN_JOURNAL_BLOCKS 1024 71 - #define JBD2_MIN_FC_BLOCKS 256 72 73 #ifdef __KERNEL__ 74 ··· 538 * The transaction keeps track of all of the buffers modified by a 539 * running transaction, and all of the buffers committed but not yet 540 * flushed to home for finished transactions. 541 */ 542 543 /* ··· 659 unsigned long t_start; 660 661 /* 662 - * When commit was requested 663 */ 664 unsigned long t_requested; 665 666 /* 667 - * Checkpointing stats [j_checkpoint_sem] 668 */ 669 struct transaction_chp_stats_s t_chp_stats; 670 ··· 1690 journal->j_chksum_driver == NULL); 1691 1692 return journal->j_chksum_driver != NULL; 1693 } 1694 1695 /*
··· 68 extern void jbd2_free(void *ptr, size_t size); 69 70 #define JBD2_MIN_JOURNAL_BLOCKS 1024 71 + #define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256 72 73 #ifdef __KERNEL__ 74 ··· 538 * The transaction keeps track of all of the buffers modified by a 539 * running transaction, and all of the buffers committed but not yet 540 * flushed to home for finished transactions. 541 + * (Locking Documentation improved by LockDoc) 542 */ 543 544 /* ··· 658 unsigned long t_start; 659 660 /* 661 + * When commit was requested [j_state_lock] 662 */ 663 unsigned long t_requested; 664 665 /* 666 + * Checkpointing stats [j_list_lock] 667 */ 668 struct transaction_chp_stats_s t_chp_stats; 669 ··· 1689 journal->j_chksum_driver == NULL); 1690 1691 return journal->j_chksum_driver != NULL; 1692 + } 1693 + 1694 + static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) 1695 + { 1696 + int num_fc_blocks = be32_to_cpu(jsb->s_num_fc_blks); 1697 + 1698 + return num_fc_blocks ? num_fc_blocks : JBD2_DEFAULT_FAST_COMMIT_BLOCKS; 1699 } 1700 1701 /*
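
With the helper available in the header, the space check in journal.c reads directly as "does carving the fast commit area off the end of the journal still leave JBD2_MIN_JOURNAL_BLOCKS?". A hedged standalone sketch of that check, using stand-in constants rather than the jbd2 types:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_MIN_JOURNAL_BLOCKS 1024	/* mirrors JBD2_MIN_JOURNAL_BLOCKS */
#define DEMO_DEFAULT_FC_BLOCKS	256	/* mirrors JBD2_DEFAULT_FAST_COMMIT_BLOCKS */

/* Zero in the on-disk field means "use the default", as in the new helper. */
static int demo_num_fc_blks(unsigned int sb_field)
{
	return sb_field ? (int)sb_field : DEMO_DEFAULT_FC_BLOCKS;
}

/*
 * The fast commit area is carved off the end of the journal, so enabling it
 * is only valid if the remaining space still meets the journal minimum.
 */
static bool demo_fc_area_fits(unsigned long j_last, unsigned int sb_field)
{
	return j_last - (unsigned long)demo_num_fc_blks(sb_field) >=
	       DEMO_MIN_JOURNAL_BLOCKS;
}

int main(void)
{
	printf("%d\n", demo_fc_area_fits(4096, 0));	/* 3840 >= 1024 -> 1 */
	printf("%d\n", demo_fc_area_fits(1200, 0));	/* 944 < 1024 -> 0 */
	return 0;
}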