Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
"Various bug fixes and cleanups for ext4; no new features this cycle"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (29 commits)
ext4: remove unnecessary wbc parameter from ext4_bio_write_page
ext4: avoid s_mb_prefetch to be zero in individual scenarios
ext4: defer saving error info from atomic context
ext4: simplify ext4 error translation
ext4: move functions in super.c
ext4: make ext4_abort() use __ext4_error()
ext4: standardize error message in ext4_protect_reserved_inode()
ext4: remove redundant sb checksum recomputation
ext4: don't remount read-only with errors=continue on reboot
ext4: fix deadlock with fs freezing and EA inodes
jbd2: add a helper to find out number of fast commit blocks
ext4: make fast_commit.h byte identical with e2fsprogs/fast_commit.h
ext4: fix fall-through warnings for Clang
ext4: add docs about fast commit idempotence
ext4: remove the unused EXT4_CURRENT_REV macro
ext4: fix an IS_ERR() vs NULL check
ext4: check for invalid block size early when mounting a file system
ext4: fix a memory leak of ext4_free_data
ext4: delete nonsensical (commented-out) code inside ext4_xattr_block_set()
ext4: update ext4_data_block_valid related comments
...

+504 -378
+50
Documentation/filesystems/ext4/journal.rst
··· 681 681 - Stores the TID of the commit, CRC of the fast commit of which this tag 682 682 represents the end of 683 683 684 + Fast Commit Replay Idempotence 685 + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 686 + 687 + Fast commit tags are idempotent in nature provided the recovery code follows 688 + certain rules. The guiding principle that the commit path follows while 689 + committing is that it stores the result of a particular operation instead of 690 + storing the procedure. 691 + 692 + Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' 693 + was associated with inode 10. During fast commit, instead of storing this 694 + operation as a procedure "rename a to b", we store the resulting file system 695 + state as a "series" of outcomes: 696 + 697 + - Link dirent b to inode 10 698 + - Unlink dirent a 699 + - Inode 10 with valid refcount 700 + 701 + Now when recovery code runs, it needs to "enforce" this state on the file 702 + system. This is what guarantees idempotence of fast commit replay. 703 + 704 + Let's take an example of a procedure that is not idempotent and see how fast 705 + commits make it idempotent. Consider the following sequence of operations: 706 + 707 + 1) rm A 708 + 2) mv B A 709 + 3) read A 710 + 711 + If we store this sequence of operations as is, then the replay is not idempotent. 712 + Let's say that during replay, we crash after (2). During the second replay, 713 + file A (which was actually created as a result of the "mv B A" operation) would get 714 + deleted. Thus, the file named A would be absent when we try to read A. So, this 715 + sequence of operations is not idempotent. However, as mentioned above, instead 716 + of storing the procedure, fast commits store the outcome of each procedure. Thus 717 + the fast commit log for the above procedure would be as follows: 718 + 719 + (Let's assume dirent A was linked to inode 10 and dirent B was linked to 720 + inode 11 before the replay) 721 + 722 + 1) Unlink A 723 + 2) Link A to inode 11 724 + 3) Unlink B 725 + 4) Inode 11 726 + 727 + If we crash after (3), we will have file A linked to inode 11. During the second 728 + replay, we will remove file A (inode 11). But we will create it back and make 729 + it point to inode 11. We won't find B, so we'll just skip that step. At this 730 + point, the refcount for inode 11 is not reliable, but that gets fixed by the 731 + replay of the last inode 11 tag. Thus, by converting a non-idempotent procedure 732 + into a series of idempotent outcomes, fast commits ensure idempotence during 733 + the replay.
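The outcome-based replay the new documentation describes can be made concrete in a few lines. Below is a minimal userspace sketch — a toy directory table with hypothetical replay_link()/replay_unlink() helpers, not the real ext4 replay routines — showing why enforcing outcomes stays safe when replay is interrupted and rerun:

#include <stdio.h>
#include <string.h>

/* Toy model: a directory of (name -> inode) entries. The helper names
 * below are hypothetical; real ext4 replay operates on dirents and the
 * inode table, but the idempotence argument is the same. */
struct dent { char name[8]; int ino; };
static struct dent dir[8];
static int ndents;

static void replay_unlink(const char *name)
{
	/* Enforce "name is absent": removing a name that is already
	 * gone is a no-op, so replaying this tag twice is safe. */
	for (int i = 0; i < ndents; i++)
		if (!strcmp(dir[i].name, name)) {
			dir[i] = dir[--ndents];
			return;
		}
}

static void replay_link(const char *name, int ino)
{
	/* Enforce "name points at ino": drop any stale entry first,
	 * then (re)create it. Running this twice leaves the same state. */
	replay_unlink(name);
	snprintf(dir[ndents].name, sizeof(dir[ndents].name), "%s", name);
	dir[ndents++].ino = ino;
}

int main(void)
{
	replay_link("A", 10);
	replay_link("B", 11);
	/* The fast commit log for "rm A; mv B A" as outcomes, replayed
	 * twice to simulate a crash-and-rerun during recovery: */
	for (int pass = 0; pass < 2; pass++) {
		replay_unlink("A");
		replay_link("A", 11);
		replay_unlink("B");
	}
	for (int i = 0; i < ndents; i++)
		printf("%s -> inode %d\n", dir[i].name, dir[i].ino);
	return 0;
}

Both passes leave the directory in the same state (A pointing at inode 11), which is exactly the property the recovery path depends on.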
+1 -1
fs/ext4/balloc.c
··· 185 185 struct ext4_sb_info *sbi = EXT4_SB(sb); 186 186 ext4_fsblk_t start, tmp; 187 187 188 - J_ASSERT_BH(bh, buffer_locked(bh)); 188 + ASSERT(buffer_locked(bh)); 189 189 190 190 /* If checksum is bad mark all blocks used to prevent allocation 191 191 * essentially implementing a per-group read-only flag. */
+7 -9
fs/ext4/block_validity.c
··· 176 176 err = add_system_zone(system_blks, map.m_pblk, n, ino); 177 177 if (err < 0) { 178 178 if (err == -EFSCORRUPTED) { 179 - __ext4_error(sb, __func__, __LINE__, 180 - -err, map.m_pblk, 181 - "blocks %llu-%llu from inode %u overlap system zone", 182 - map.m_pblk, 183 - map.m_pblk + map.m_len - 1, 184 - ino); 179 + EXT4_ERROR_INODE_ERR(inode, -err, 180 + "blocks %llu-%llu from inode overlap system zone", 181 + map.m_pblk, 182 + map.m_pblk + map.m_len - 1); 185 183 } 186 184 break; 187 185 } ··· 204 206 * 205 207 * The update of system_blks pointer in this function is protected by 206 208 * sb->s_umount semaphore. However we have to be careful as we can be 207 - * racing with ext4_data_block_valid() calls reading system_blks rbtree 209 + * racing with ext4_inode_block_valid() calls reading system_blks rbtree 208 210 * protected only by RCU. That's why we first build the rbtree and then 209 211 * swap it in place. 210 212 */ ··· 256 258 257 259 /* 258 260 * System blks rbtree complete, announce it once to prevent racing 259 - * with ext4_data_block_valid() accessing the rbtree at the same 261 + * with ext4_inode_block_valid() accessing the rbtree at the same 260 262 * time. 261 263 */ 262 264 rcu_assign_pointer(sbi->s_system_blks, system_blks); ··· 276 278 * 277 279 * The update of system_blks pointer in this function is protected by 278 280 * sb->s_umount semaphore. However we have to be careful as we can be 279 - * racing with ext4_data_block_valid() calls reading system_blks rbtree 281 + * racing with ext4_inode_block_valid() calls reading system_blks rbtree 280 282 * protected only by RCU. So we first clear the system_blks pointer and 281 283 * then free the rbtree only after RCU grace period expires. 282 284 */
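The comment updates above all point at the same publication discipline: build the replacement rbtree completely, then swap the system_blks pointer so RCU readers only ever observe NULL or a finished tree. As a rough self-contained illustration, the C11 release/acquire atomics below stand in for rcu_assign_pointer()/rcu_dereference(); the kernel additionally waits for an RCU grace period before freeing the old tree, which this sketch omits:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Sketch of "build fully, then publish". Types and names are invented. */
struct zone_tree { int nzones; /* stands in for the rbtree of ranges */ };

static _Atomic(struct zone_tree *) system_blks;	/* starts out NULL */

static void setup_system_zone(int nzones)
{
	struct zone_tree *t = malloc(sizeof(*t));

	t->nzones = nzones;	/* build the whole structure first... */
	atomic_store_explicit(&system_blks, t, memory_order_release);
}				/* ...and only then make it visible */

static int inode_block_valid(void)
{
	struct zone_tree *t =
		atomic_load_explicit(&system_blks, memory_order_acquire);

	/* Readers see either NULL or a fully constructed tree,
	 * never a half-built one. */
	return t ? t->nzones : 0;
}

int main(void)
{
	printf("zones before setup: %d\n", inode_block_valid());
	setup_system_zone(3);
	printf("zones after setup:  %d\n", inode_block_valid());
	return 0;
}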
+58 -19
fs/ext4/ext4.h
··· 98 98 #define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__) 99 99 #endif 100 100 101 + #define ASSERT(assert) \ 102 + do { \ 103 + if (unlikely(!(assert))) { \ 104 + printk(KERN_EMERG \ 105 + "Assertion failure in %s() at %s:%d: '%s'\n", \ 106 + __func__, __FILE__, __LINE__, #assert); \ 107 + BUG(); \ 108 + } \ 109 + } while (0) 110 + 101 111 /* data type for block offset of block group */ 102 112 typedef int ext4_grpblk_t; 103 113 ··· 1629 1619 errseq_t s_bdev_wb_err; 1630 1620 spinlock_t s_bdev_wb_lock; 1631 1621 1622 + /* Information about errors that happened during this mount */ 1623 + spinlock_t s_error_lock; 1624 + int s_add_error_count; 1625 + int s_first_error_code; 1626 + __u32 s_first_error_line; 1627 + __u32 s_first_error_ino; 1628 + __u64 s_first_error_block; 1629 + const char *s_first_error_func; 1630 + time64_t s_first_error_time; 1631 + int s_last_error_code; 1632 + __u32 s_last_error_line; 1633 + __u32 s_last_error_ino; 1634 + __u64 s_last_error_block; 1635 + const char *s_last_error_func; 1636 + time64_t s_last_error_time; 1637 + /* 1638 + * If we are in a context where we cannot update error information in 1639 + * the on-disk superblock, we queue this work to do it. 1640 + */ 1641 + struct work_struct s_error_work; 1642 + 1632 1643 /* Ext4 fast commit stuff */ 1633 1644 atomic_t s_fc_subtid; 1634 1645 atomic_t s_fc_ineligible_updates; ··· 1889 1858 #define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ 1890 1859 #define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ 1891 1860 1892 - #define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV 1893 1861 #define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV 1894 1862 1895 1863 #define EXT4_GOOD_OLD_INODE_SIZE 128 ··· 2982 2952 ext4_group_t block_group, 2983 2953 unsigned int flags); 2984 2954 2985 - extern __printf(6, 7) 2986 - void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64, 2987 - const char *, ...); 2955 + extern __printf(7, 8) 2956 + void __ext4_error(struct super_block *, const char *, unsigned int, bool, 2957 + int, __u64, const char *, ...); 2988 2958 extern __printf(6, 7) 2989 2959 void __ext4_error_inode(struct inode *, const char *, unsigned int, 2990 2960 ext4_fsblk_t, int, const char *, ...); ··· 2993 2963 const char *, ...); 2994 2964 extern void __ext4_std_error(struct super_block *, const char *, 2995 2965 unsigned int, int); 2996 - extern __printf(5, 6) 2997 - void __ext4_abort(struct super_block *, const char *, unsigned int, int, 2998 - const char *, ...); 2999 2966 extern __printf(4, 5) 3000 2967 void __ext4_warning(struct super_block *, const char *, unsigned int, 3001 2968 const char *, ...); ··· 3022 2995 #define EXT4_ERROR_FILE(file, block, fmt, a...) \ 3023 2996 ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) 3024 2997 2998 + #define ext4_abort(sb, err, fmt, a...) \ 2999 + __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a) 3000 + 3025 3001 #ifdef CONFIG_PRINTK 3026 3002 3027 3003 #define ext4_error_inode(inode, func, line, block, fmt, ...) \ ··· 3035 3005 #define ext4_error_file(file, func, line, block, fmt, ...) \ 3036 3006 __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) 3037 3007 #define ext4_error(sb, fmt, ...) \ 3038 - __ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__) 3008 + __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \ 3009 + ##__VA_ARGS__) 3039 3010 #define ext4_error_err(sb, err, fmt, ...) 
\ 3040 - __ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__) 3041 - #define ext4_abort(sb, err, fmt, ...) \ 3042 - __ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__) 3011 + __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \ 3012 + ##__VA_ARGS__) 3043 3013 #define ext4_warning(sb, fmt, ...) \ 3044 3014 __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) 3045 3015 #define ext4_warning_inode(inode, fmt, ...) \ ··· 3072 3042 #define ext4_error(sb, fmt, ...) \ 3073 3043 do { \ 3074 3044 no_printk(fmt, ##__VA_ARGS__); \ 3075 - __ext4_error(sb, "", 0, 0, 0, " "); \ 3045 + __ext4_error(sb, "", 0, false, 0, 0, " "); \ 3076 3046 } while (0) 3077 3047 #define ext4_error_err(sb, err, fmt, ...) \ 3078 3048 do { \ 3079 3049 no_printk(fmt, ##__VA_ARGS__); \ 3080 - __ext4_error(sb, "", 0, err, 0, " "); \ 3081 - } while (0) 3082 - #define ext4_abort(sb, err, fmt, ...) \ 3083 - do { \ 3084 - no_printk(fmt, ##__VA_ARGS__); \ 3085 - __ext4_abort(sb, "", 0, err, " "); \ 3050 + __ext4_error(sb, "", 0, false, err, 0, " "); \ 3086 3051 } while (0) 3087 3052 #define ext4_warning(sb, fmt, ...) \ 3088 3053 do { \ ··· 3386 3361 spin_unlock(ext4_group_lock_ptr(sb, group)); 3387 3362 } 3388 3363 3364 + #ifdef CONFIG_QUOTA 3365 + static inline bool ext4_quota_capable(struct super_block *sb) 3366 + { 3367 + return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb)); 3368 + } 3369 + 3370 + static inline bool ext4_is_quota_journalled(struct super_block *sb) 3371 + { 3372 + struct ext4_sb_info *sbi = EXT4_SB(sb); 3373 + 3374 + return (ext4_has_feature_quota(sb) || 3375 + sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); 3376 + } 3377 + #endif 3378 + 3389 3379 /* 3390 3380 * Block validity checking 3391 3381 */ ··· 3649 3609 extern int ext4_bio_write_page(struct ext4_io_submit *io, 3650 3610 struct page *page, 3651 3611 int len, 3652 - struct writeback_control *wbc, 3653 3612 bool keep_towrite); 3654 3613 extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); 3655 3614 extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
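The new ASSERT() macro above replaces the J_ASSERT()/assert() variants removed throughout the later hunks. A userspace analogue, with abort() standing in for BUG(), shows the behavior it codifies:

#include <stdio.h>
#include <stdlib.h>

/* Userspace analogue of the new ext4 ASSERT(): print the failing
 * expression with its location, then stop hard. In the kernel the
 * final call is BUG(); abort() stands in for it here. */
#define ASSERT(assert)						\
do {								\
	if (!(assert)) {					\
		fprintf(stderr,					\
			"Assertion failure in %s() at %s:%d: '%s'\n", \
			__func__, __FILE__, __LINE__, #assert);	\
		abort();					\
	}							\
} while (0)

int main(void)
{
	int refcount = 1;

	ASSERT(refcount > 0);	/* passes silently */
	ASSERT(refcount == 0);	/* prints the message above and aborts */
	return 0;
}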
+2 -2
fs/ext4/ext4_jbd2.c
··· 296 296 if (err) { 297 297 ext4_journal_abort_handle(where, line, __func__, 298 298 bh, handle, err); 299 - __ext4_abort(inode->i_sb, where, line, -err, 300 - "error %d when attempting revoke", err); 299 + __ext4_error(inode->i_sb, where, line, true, -err, 0, 300 + "error %d when attempting revoke", err); 301 301 } 302 302 BUFFER_TRACE(bh, "exit"); 303 303 return err;
+3 -6
fs/ext4/ext4_jbd2.h
··· 86 86 #ifdef CONFIG_QUOTA 87 87 /* Amount of blocks needed for quota update - we know that the structure was 88 88 * allocated so we need to update only data block */ 89 - #define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ 90 - ext4_has_feature_quota(sb)) ? 1 : 0) 89 + #define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0) 91 90 /* Amount of blocks needed for quota insert/delete - we do some block writes 92 91 * but inode, sb and group updates are done only once */ 93 - #define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ 94 - ext4_has_feature_quota(sb)) ?\ 92 + #define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ 95 93 (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ 96 94 +3+DQUOT_INIT_REWRITE) : 0) 97 95 98 - #define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ 99 - ext4_has_feature_quota(sb)) ?\ 96 + #define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ 100 97 (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ 101 98 +3+DQUOT_DEL_REWRITE) : 0) 102 99 #else
+2 -3
fs/ext4/extents.c
··· 5815 5815 int ret; 5816 5816 5817 5817 path = ext4_find_extent(inode, start, NULL, 0); 5818 - if (!path) 5819 - return -EINVAL; 5818 + if (IS_ERR(path)) 5819 + return PTR_ERR(path); 5820 5820 ex = path[path->p_depth].p_ext; 5821 5821 if (!ex) { 5822 5822 ret = -EFSCORRUPTED; ··· 5988 5988 kfree(path); 5989 5989 break; 5990 5990 } 5991 - ex = path2[path2->p_depth].p_ext; 5992 5991 for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { 5993 5992 cmp1 = cmp2 = 0; 5994 5993 if (i <= path->p_depth)
+61 -38
fs/ext4/fast_commit.c
··· 103 103 * 104 104 * Replay code should thus check for all the valid tails in the FC area. 105 105 * 106 + * Fast Commit Replay Idempotence 107 + * ------------------------------ 108 + * 109 + * Fast commit tags are idempotent in nature provided the recovery code follows 110 + * certain rules. The guiding principle that the commit path follows while 111 + * committing is that it stores the result of a particular operation instead of 112 + * storing the procedure. 113 + * 114 + * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' 115 + * was associated with inode 10. During fast commit, instead of storing this 116 + * operation as a procedure "rename a to b", we store the resulting file system 117 + * state as a "series" of outcomes: 118 + * 119 + * - Link dirent b to inode 10 120 + * - Unlink dirent a 121 + * - Inode <10> with valid refcount 122 + * 123 + * Now when recovery code runs, it needs to "enforce" this state on the file 124 + * system. This is what guarantees idempotence of fast commit replay. 125 + * 126 + * Let's take an example of a procedure that is not idempotent and see how fast 127 + * commits make it idempotent. Consider the following sequence of operations: 128 + * 129 + * rm A; mv B A; read A 130 + * (x) (y) (z) 131 + * 132 + * (x), (y) and (z) are the points at which we can crash. If we store this 133 + * sequence of operations as is, then the replay is not idempotent. Let's say 134 + * that during replay, we crash at (z). During the second replay, file A (which was 135 + * actually created as a result of the "mv B A" operation) would get deleted. Thus, 136 + * the file named A would be absent when we try to read A. So, this sequence of 137 + * operations is not idempotent. However, as mentioned above, instead of storing 138 + * the procedure, fast commits store the outcome of each procedure. Thus the fast 139 + * commit log for the above procedure would be as follows: 140 + * 141 + * (Let's assume dirent A was linked to inode 10 and dirent B was linked to 142 + * inode 11 before the replay) 143 + * 144 + * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] 145 + * (w) (x) (y) (z) 146 + * 147 + * If we crash at (z), we will have file A linked to inode 11. During the second 148 + * replay, we will remove file A (inode 11). But we will create it back and make 149 + * it point to inode 11. We won't find B, so we'll just skip that step. At this 150 + * point, the refcount for inode 11 is not reliable, but that gets fixed by the 151 + * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get handled 152 + * similarly. Thus, by converting a non-idempotent procedure into a series of 153 + * idempotent outcomes, fast commits ensure idempotence during the replay. 154 + * 106 155 * TODOs 107 156 * ----- 157 + * 158 + * 0) Fast commit replay path hardening: Fast commit replay code should use 159 + * journal handles to make sure all the updates it does during the replay 160 + * path are atomic. With that, if we crash during fast commit replay, after 161 + * trying to do recovery again, we will find a file system where the fast commit 162 + * area is invalid (because a new full commit would be found). In order to deal 163 + * with that, fast commit replay code should ensure that the "FC_REPLAY" 164 + * superblock state is persisted before starting the replay, so that after 165 + * the crash, fast commit recovery code can look at that flag and perform 166 + * fast commit recovery even if that area is invalidated by later full 167 + * commits.
168 + * 108 169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit 109 170 * eligible update must be protected within ext4_fc_start_update() and 110 171 * ext4_fc_stop_update(). These routines are called at much higher ··· 1281 1220 1282 1221 /* Ext4 Replay Path Routines */ 1283 1222 1284 - /* Get length of a particular tlv */ 1285 - static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl) 1286 - { 1287 - return le16_to_cpu(tl->fc_len); 1288 - } 1289 - 1290 - /* Get a pointer to "value" of a tlv */ 1291 - static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl) 1292 - { 1293 - return (u8 *)tl + sizeof(*tl); 1294 - } 1295 - 1296 1223 /* Helper struct for dentry replay routines */ 1297 1224 struct dentry_info_args { 1298 1225 int parent_ino, dname_len, ino, inode_len; ··· 1817 1768 iput(inode); 1818 1769 1819 1770 return 0; 1820 - } 1821 - 1822 - static inline const char *tag2str(u16 tag) 1823 - { 1824 - switch (tag) { 1825 - case EXT4_FC_TAG_LINK: 1826 - return "TAG_ADD_ENTRY"; 1827 - case EXT4_FC_TAG_UNLINK: 1828 - return "TAG_DEL_ENTRY"; 1829 - case EXT4_FC_TAG_ADD_RANGE: 1830 - return "TAG_ADD_RANGE"; 1831 - case EXT4_FC_TAG_CREAT: 1832 - return "TAG_CREAT_DENTRY"; 1833 - case EXT4_FC_TAG_DEL_RANGE: 1834 - return "TAG_DEL_RANGE"; 1835 - case EXT4_FC_TAG_INODE: 1836 - return "TAG_INODE"; 1837 - case EXT4_FC_TAG_PAD: 1838 - return "TAG_PAD"; 1839 - case EXT4_FC_TAG_TAIL: 1840 - return "TAG_TAIL"; 1841 - case EXT4_FC_TAG_HEAD: 1842 - return "TAG_HEAD"; 1843 - default: 1844 - return "TAG_ERROR"; 1845 - } 1846 1771 } 1847 1772 1848 1773 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
+61 -17
fs/ext4/fast_commit.h
··· 3 3 #ifndef __FAST_COMMIT_H__ 4 4 #define __FAST_COMMIT_H__ 5 5 6 + /* 7 + * Note this file is present in e2fsprogs/lib/ext2fs/fast_commit.h and 8 + * linux/fs/ext4/fast_commit.h. These files should always be byte identical. 9 + */ 10 + 6 11 /* Fast commit tags */ 7 12 #define EXT4_FC_TAG_ADD_RANGE 0x0001 8 13 #define EXT4_FC_TAG_DEL_RANGE 0x0002 ··· 55 50 struct ext4_fc_dentry_info { 56 51 __le32 fc_parent_ino; 57 52 __le32 fc_ino; 58 - u8 fc_dname[0]; 53 + __u8 fc_dname[0]; 59 54 }; 60 55 61 56 /* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */ ··· 68 63 struct ext4_fc_tail { 69 64 __le32 fc_tid; 70 65 __le32 fc_crc; 71 - }; 72 - 73 - /* 74 - * In memory list of dentry updates that are performed on the file 75 - * system used by fast commit code. 76 - */ 77 - struct ext4_fc_dentry_update { 78 - int fcd_op; /* Type of update create / unlink / link */ 79 - int fcd_parent; /* Parent inode number */ 80 - int fcd_ino; /* Inode number */ 81 - struct qstr fcd_name; /* Dirent name */ 82 - unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */ 83 - struct list_head fcd_list; 84 66 }; 85 67 86 68 /* ··· 97 105 EXT4_FC_REASON_INODE_JOURNAL_DATA, 98 106 EXT4_FC_COMMIT_FAILED, 99 107 EXT4_FC_REASON_MAX 108 + }; 109 + 110 + #ifdef __KERNEL__ 111 + /* 112 + * In memory list of dentry updates that are performed on the file 113 + * system used by fast commit code. 114 + */ 115 + struct ext4_fc_dentry_update { 116 + int fcd_op; /* Type of update create / unlink / link */ 117 + int fcd_parent; /* Parent inode number */ 118 + int fcd_ino; /* Inode number */ 119 + struct qstr fcd_name; /* Dirent name */ 120 + unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */ 121 + struct list_head fcd_list; 100 122 }; 101 124 struct ext4_fc_stats { ··· 151 145 }; 152 146 153 147 #define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1) 148 + #endif 154 149 155 150 #define fc_for_each_tl(__start, __end, __tl) \ 156 - for (tl = (struct ext4_fc_tl *)start; \ 157 - (u8 *)tl < (u8 *)end; \ 158 - tl = (struct ext4_fc_tl *)((u8 *)tl + \ 151 + for (tl = (struct ext4_fc_tl *)(__start); \ 152 + (__u8 *)tl < (__u8 *)(__end); \ 153 + tl = (struct ext4_fc_tl *)((__u8 *)tl + \ 154 sizeof(struct ext4_fc_tl) + \ 155 + le16_to_cpu(tl->fc_len))) 156 157 + static inline const char *tag2str(__u16 tag) 158 + { 159 + switch (tag) { 160 + case EXT4_FC_TAG_LINK: 161 + return "ADD_ENTRY"; 162 + case EXT4_FC_TAG_UNLINK: 163 + return "DEL_ENTRY"; 164 + case EXT4_FC_TAG_ADD_RANGE: 165 + return "ADD_RANGE"; 166 + case EXT4_FC_TAG_CREAT: 167 + return "CREAT_DENTRY"; 168 + case EXT4_FC_TAG_DEL_RANGE: 169 + return "DEL_RANGE"; 170 + case EXT4_FC_TAG_INODE: 171 + return "INODE"; 172 + case EXT4_FC_TAG_PAD: 173 + return "PAD"; 174 + case EXT4_FC_TAG_TAIL: 175 + return "TAIL"; 176 + case EXT4_FC_TAG_HEAD: 177 + return "HEAD"; 178 + default: 179 + return "ERROR"; 180 + } 181 + } 182 + 183 + /* Get length of a particular tlv */ 184 + static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl) 185 + { 186 + return le16_to_cpu(tl->fc_len); 187 + } 188 + 189 + /* Get a pointer to "value" of a tlv */ 190 + static inline __u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl) 191 + { 192 + return (__u8 *)tl + sizeof(*tl); 193 + } 162 194 163 195 #endif /* __FAST_COMMIT_H__ */
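One detail worth calling out in this hunk: the old fc_for_each_tl() body referenced start/end instead of its __start/__end parameters, so it only worked when callers happened to use those exact variable names. A standalone sketch of the corrected iteration follows; it assumes a little-endian host so le16_to_cpu() can be a no-op, and the tag numbers and buffer contents are invented for the demo (like the kernel macro, the iterator name tl is hardcoded):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Minimal TLV header, same shape as struct ext4_fc_tl. */
struct fc_tl { uint16_t tag; uint16_t len; };
#define le16_to_cpu(x) (x)	/* little-endian host assumed */

#define fc_for_each_tl(__start, __end, __tl)			\
	for (tl = (struct fc_tl *)(__start);			\
	     (uint8_t *)tl < (uint8_t *)(__end);		\
	     tl = (struct fc_tl *)((uint8_t *)tl +		\
				   sizeof(struct fc_tl) +	\
				   le16_to_cpu(tl->len)))

int main(void)
{
	_Alignas(uint32_t) uint8_t buf[64];
	uint8_t *p = buf;
	struct fc_tl *tl;
	struct fc_tl a = { .tag = 5, .len = 4 };	/* PAD-like: 4 value bytes */
	struct fc_tl b = { .tag = 8, .len = 0 };	/* TAIL-like: no value */

	memcpy(p, &a, sizeof(a));
	memset(p + sizeof(a), 0, a.len);
	p += sizeof(a) + a.len;
	memcpy(p, &b, sizeof(b));
	p += sizeof(b);

	fc_for_each_tl(buf, p, tl)
		printf("tag %u, value length %u\n",
		       tl->tag, le16_to_cpu(tl->len));
	return 0;
}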
+1 -1
fs/ext4/fsync.c
··· 136 136 if (unlikely(ext4_forced_shutdown(sbi))) 137 137 return -EIO; 138 138 139 - J_ASSERT(ext4_journal_current_handle() == NULL); 139 + ASSERT(ext4_journal_current_handle() == NULL); 140 140 141 141 trace_ext4_sync_file_enter(file, datasync); 142 142
+2 -2
fs/ext4/indirect.c
··· 534 534 ext4_fsblk_t first_block = 0; 535 535 536 536 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 537 - J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 538 - J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 537 + ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 538 + ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 539 539 depth = ext4_block_to_path(inode, map->m_lblk, offsets, 540 540 &blocks_to_boundary); 541 541
+22 -13
fs/ext4/inode.c
··· 175 175 */ 176 176 int extra_credits = 6; 177 177 struct ext4_xattr_inode_array *ea_inode_array = NULL; 178 + bool freeze_protected = false; 178 179 179 180 trace_ext4_evict_inode(inode); 180 181 ··· 233 232 234 233 /* 235 234 * Protect us against freezing - iput() caller didn't have to have any 236 - * protection against it 235 + * protection against it. When we are in a running transaction though, 236 + * we are already protected against freezing and we cannot grab further 237 + * protection due to lock ordering constraints. 237 238 */ 238 - sb_start_intwrite(inode->i_sb); 239 + if (!ext4_journal_current_handle()) { 240 + sb_start_intwrite(inode->i_sb); 241 + freeze_protected = true; 242 + } 239 243 240 244 if (!IS_NOQUOTA(inode)) 241 245 extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); ··· 259 253 * cleaned up. 260 254 */ 261 255 ext4_orphan_del(NULL, inode); 262 - sb_end_intwrite(inode->i_sb); 256 + if (freeze_protected) 257 + sb_end_intwrite(inode->i_sb); 263 258 goto no_delete; 264 259 } 265 260 ··· 301 294 stop_handle: 302 295 ext4_journal_stop(handle); 303 296 ext4_orphan_del(NULL, inode); 304 - sb_end_intwrite(inode->i_sb); 297 + if (freeze_protected) 298 + sb_end_intwrite(inode->i_sb); 305 299 ext4_xattr_inode_array_free(ea_inode_array); 306 300 goto no_delete; 307 301 } ··· 331 323 else 332 324 ext4_free_inode(handle, inode); 333 325 ext4_journal_stop(handle); 334 - sb_end_intwrite(inode->i_sb); 326 + if (freeze_protected) 327 + sb_end_intwrite(inode->i_sb); 335 328 ext4_xattr_inode_array_free(ea_inode_array); 336 329 return; 337 330 no_delete: ··· 839 830 int create = map_flags & EXT4_GET_BLOCKS_CREATE; 840 831 int err; 841 832 842 - J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 843 - || handle != NULL || create == 0); 833 + ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 834 + || handle != NULL || create == 0); 844 835 845 836 map.m_lblk = block; 846 837 map.m_len = 1; ··· 855 846 if (unlikely(!bh)) 856 847 return ERR_PTR(-ENOMEM); 857 848 if (map.m_flags & EXT4_MAP_NEW) { 858 - J_ASSERT(create != 0); 859 - J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 860 - || (handle != NULL)); 849 + ASSERT(create != 0); 850 + ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 851 + || (handle != NULL)); 861 852 862 853 /* 863 854 * Now that we do not always journal data, we should ··· 2064 2055 unlock_page(page); 2065 2056 return -ENOMEM; 2066 2057 } 2067 - ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); 2058 + ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite); 2068 2059 ext4_io_submit(&io_submit); 2069 2060 /* Drop io_end reference we got from init */ 2070 2061 ext4_put_io_end_defer(io_submit.io_end); ··· 2098 2089 len = size & ~PAGE_MASK; 2099 2090 else 2100 2091 len = PAGE_SIZE; 2101 - err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); 2092 + err = ext4_bio_write_page(&mpd->io_submit, page, len, false); 2102 2093 if (!err) 2103 2094 mpd->wbc->nr_to_write--; 2104 2095 mpd->first_page++; ··· 4619 4610 (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { 4620 4611 if (flags & EXT4_IGET_HANDLE) 4621 4612 return ERR_PTR(-ESTALE); 4622 - __ext4_error(sb, function, line, EFSCORRUPTED, 0, 4613 + __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, 4623 4614 "inode #%lu: comm %s: iget: illegal inode #", 4624 4615 ino, current->comm); 4625 4616 return ERR_PTR(-EFSCORRUPTED);
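The evict path above follows a conditional-protection pattern: take freeze protection only when no running transaction already implies it, and record that decision in freeze_protected so every exit path releases exactly what it took. A hedged userspace sketch of the same shape, with a pthread rwlock and a stub in_transaction() standing in for sb_start_intwrite() and ext4_journal_current_handle():

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t freeze_lock = PTHREAD_RWLOCK_INITIALIZER;

static bool in_transaction(void)
{
	return false;	/* pretend: no running handle, so we must protect */
}

static void evict_inode(void)
{
	bool freeze_protected = false;

	/* Grab protection only if the caller does not already hold it
	 * implicitly; taking it while inside a transaction would invert
	 * the freeze-then-transaction ordering, which is the deadlock
	 * the patch avoids. */
	if (!in_transaction()) {
		pthread_rwlock_rdlock(&freeze_lock);	/* sb_start_intwrite() */
		freeze_protected = true;
	}

	/* ... free blocks, EA inodes, the inode itself ... */

	if (freeze_protected)
		pthread_rwlock_unlock(&freeze_lock);	/* sb_end_intwrite() */
	printf("evicted (took protection: %s)\n",
	       freeze_protected ? "yes" : "no");
}

int main(void)
{
	evict_inode();
	return 0;
}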
+9 -30
fs/ext4/mballoc.c
··· 822 822 spin_unlock(&sbi->s_bal_lock); 823 823 } 824 824 825 - static void mb_regenerate_buddy(struct ext4_buddy *e4b) 826 - { 827 - int count; 828 - int order = 1; 829 - void *buddy; 830 - 831 - while ((buddy = mb_find_buddy(e4b, order++, &count))) { 832 - ext4_set_bits(buddy, 0, count); 833 - } 834 - e4b->bd_info->bb_fragments = 0; 835 - memset(e4b->bd_info->bb_counters, 0, 836 - sizeof(*e4b->bd_info->bb_counters) * 837 - (e4b->bd_sb->s_blocksize_bits + 2)); 838 - 839 - ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, 840 - e4b->bd_bitmap, e4b->bd_group); 841 - } 842 - 843 825 /* The buddy information is attached the buddy cache inode 844 826 * for convenience. The information regarding each group 845 827 * is loaded via ext4_mb_load_buddy. The information involve ··· 1289 1307 1290 1308 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) 1291 1309 { 1292 - int order = 1; 1293 - int bb_incr = 1 << (e4b->bd_blkbits - 1); 1310 + int order = 1, max; 1294 1311 void *bb; 1295 1312 1296 1313 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); 1297 1314 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1298 1315 1299 - bb = e4b->bd_buddy; 1300 1316 while (order <= e4b->bd_blkbits + 1) { 1301 - block = block >> 1; 1302 - if (!mb_test_bit(block, bb)) { 1317 + bb = mb_find_buddy(e4b, order, &max); 1318 + if (!mb_test_bit(block >> order, bb)) { 1303 1319 /* this block is part of buddy of order 'order' */ 1304 1320 return order; 1305 1321 } 1306 - bb += bb_incr; 1307 - bb_incr >>= 1; 1308 1322 order++; 1309 1323 } 1310 1324 return 0; ··· 1490 1512 sb, e4b->bd_group, 1491 1513 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 1492 1514 } 1493 - mb_regenerate_buddy(e4b); 1494 1515 goto done; 1495 1516 } 1496 1517 ··· 2372 2395 2373 2396 nr = sbi->s_mb_prefetch; 2374 2397 if (ext4_has_feature_flex_bg(sb)) { 2375 - nr = (group / sbi->s_mb_prefetch) * 2376 - sbi->s_mb_prefetch; 2377 - nr = nr + sbi->s_mb_prefetch - group; 2398 + nr = 1 << sbi->s_log_groups_per_flex; 2399 + nr -= group & (nr - 1); 2400 + nr = min(nr, sbi->s_mb_prefetch); 2378 2401 } 2379 2402 prefetch_grp = ext4_mb_prefetch(sb, group, 2380 2403 nr, &prefetch_ios); ··· 2710 2733 2711 2734 if (ext4_has_feature_flex_bg(sb)) { 2712 2735 /* a single flex group is supposed to be read by a single IO */ 2713 - sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex; 2736 + sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex, 2737 + BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); 2714 2738 sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ 2715 2739 } else { 2716 2740 sbi->s_mb_prefetch = 32; ··· 5104 5126 ext4_group_first_block_no(sb, group) + 5105 5127 EXT4_C2B(sbi, cluster), 5106 5128 "Block already on to-be-freed list"); 5129 + kmem_cache_free(ext4_free_data_cachep, new_entry); 5107 5130 return 0; 5108 5131 } 5109 5132 }
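The rewritten prefetch sizing in this hunk is compact enough to deserve a worked example. Since 1 << s_log_groups_per_flex is a power of two, group & (nr - 1) is the group's offset inside its flex group, so nr becomes the number of groups left before the flex-group boundary, capped at s_mb_prefetch. A small runnable demo with illustrative values (flex group of 16 groups, cap of 8):

#include <stdio.h>

/* Same arithmetic as the new mballoc prefetch sizing; the parameter
 * values below are illustrative, not taken from a real filesystem. */
static unsigned prefetch_nr(unsigned group, unsigned log_groups_per_flex,
			    unsigned s_mb_prefetch)
{
	unsigned nr = 1U << log_groups_per_flex;

	nr -= group & (nr - 1);		/* groups left in this flex group */
	return nr < s_mb_prefetch ? nr : s_mb_prefetch;
}

int main(void)
{
	for (unsigned group = 12; group <= 17; group++)
		printf("group %2u -> prefetch %u groups\n",
		       group, prefetch_nr(group, 4, 8));
	return 0;
}

Note the result is always at least 1 and never crosses into the next flex group.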
+4 -8
fs/ext4/namei.c
··· 182 182 return bh; 183 183 } 184 184 185 - #ifndef assert 186 - #define assert(test) J_ASSERT(test) 187 - #endif 188 - 189 185 #ifdef DX_DEBUG 190 186 #define dxtrace(command) command 191 187 #else ··· 839 843 break; 840 844 } 841 845 } 842 - assert (at == p - 1); 846 + ASSERT(at == p - 1); 843 847 } 844 848 845 849 at = p - 1; ··· 1255 1259 struct dx_entry *old = frame->at, *new = old + 1; 1256 1260 int count = dx_get_count(entries); 1257 1261 1258 - assert(count < dx_get_limit(entries)); 1259 - assert(old < entries + count); 1262 + ASSERT(count < dx_get_limit(entries)); 1263 + ASSERT(old < entries + count); 1260 1264 memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); 1261 1265 dx_set_hash(new, hash); 1262 1266 dx_set_block(new, block); ··· 2955 2959 * hold i_mutex, or the inode can not be referenced from outside, 2956 2960 * so i_nlink should not be bumped due to race 2957 2961 */ 2958 - J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 2962 + ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 2959 2963 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); 2960 2964 2961 2965 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+1 -4
fs/ext4/page-io.c
··· 111 111 unsigned under_io = 0; 112 112 unsigned long flags; 113 113 114 - if (!page) 115 - continue; 116 - 117 114 if (fscrypt_is_bounce_page(page)) { 118 115 bounce_page = page; 119 116 page = fscrypt_pagecache_page(bounce_page); ··· 435 438 int ext4_bio_write_page(struct ext4_io_submit *io, 436 439 struct page *page, 437 440 int len, 438 - struct writeback_control *wbc, 439 441 bool keep_towrite) 440 442 { 441 443 struct page *bounce_page = NULL; ··· 444 448 int ret = 0; 445 449 int nr_submitted = 0; 446 450 int nr_to_submit = 0; 451 + struct writeback_control *wbc = io->io_wbc; 447 452 448 453 BUG_ON(!PageLocked(page)); 449 454 BUG_ON(PageWriteback(page));
+207 -215
fs/ext4/super.c
··· 404 404 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 405 405 } 406 406 407 - static void __ext4_update_tstamp(__le32 *lo, __u8 *hi) 407 + static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now) 408 408 { 409 - time64_t now = ktime_get_real_seconds(); 410 - 411 409 now = clamp_val(now, 0, (1ull << 40) - 1); 412 410 413 411 *lo = cpu_to_le32(lower_32_bits(now)); ··· 417 419 return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); 418 420 } 419 421 #define ext4_update_tstamp(es, tstamp) \ 420 - __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) 422 + __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \ 423 + ktime_get_real_seconds()) 421 424 #define ext4_get_tstamp(es, tstamp) \ 422 425 __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) 423 - 424 - static void __save_error_info(struct super_block *sb, int error, 425 - __u32 ino, __u64 block, 426 - const char *func, unsigned int line) 427 - { 428 - struct ext4_super_block *es = EXT4_SB(sb)->s_es; 429 - int err; 430 - 431 - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 432 - if (bdev_read_only(sb->s_bdev)) 433 - return; 434 - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 435 - ext4_update_tstamp(es, s_last_error_time); 436 - strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); 437 - es->s_last_error_line = cpu_to_le32(line); 438 - es->s_last_error_ino = cpu_to_le32(ino); 439 - es->s_last_error_block = cpu_to_le64(block); 440 - switch (error) { 441 - case EIO: 442 - err = EXT4_ERR_EIO; 443 - break; 444 - case ENOMEM: 445 - err = EXT4_ERR_ENOMEM; 446 - break; 447 - case EFSBADCRC: 448 - err = EXT4_ERR_EFSBADCRC; 449 - break; 450 - case 0: 451 - case EFSCORRUPTED: 452 - err = EXT4_ERR_EFSCORRUPTED; 453 - break; 454 - case ENOSPC: 455 - err = EXT4_ERR_ENOSPC; 456 - break; 457 - case ENOKEY: 458 - err = EXT4_ERR_ENOKEY; 459 - break; 460 - case EROFS: 461 - err = EXT4_ERR_EROFS; 462 - break; 463 - case EFBIG: 464 - err = EXT4_ERR_EFBIG; 465 - break; 466 - case EEXIST: 467 - err = EXT4_ERR_EEXIST; 468 - break; 469 - case ERANGE: 470 - err = EXT4_ERR_ERANGE; 471 - break; 472 - case EOVERFLOW: 473 - err = EXT4_ERR_EOVERFLOW; 474 - break; 475 - case EBUSY: 476 - err = EXT4_ERR_EBUSY; 477 - break; 478 - case ENOTDIR: 479 - err = EXT4_ERR_ENOTDIR; 480 - break; 481 - case ENOTEMPTY: 482 - err = EXT4_ERR_ENOTEMPTY; 483 - break; 484 - case ESHUTDOWN: 485 - err = EXT4_ERR_ESHUTDOWN; 486 - break; 487 - case EFAULT: 488 - err = EXT4_ERR_EFAULT; 489 - break; 490 - default: 491 - err = EXT4_ERR_UNKNOWN; 492 - } 493 - es->s_last_error_errcode = err; 494 - if (!es->s_first_error_time) { 495 - es->s_first_error_time = es->s_last_error_time; 496 - es->s_first_error_time_hi = es->s_last_error_time_hi; 497 - strncpy(es->s_first_error_func, func, 498 - sizeof(es->s_first_error_func)); 499 - es->s_first_error_line = cpu_to_le32(line); 500 - es->s_first_error_ino = es->s_last_error_ino; 501 - es->s_first_error_block = es->s_last_error_block; 502 - es->s_first_error_errcode = es->s_last_error_errcode; 503 - } 504 - /* 505 - * Start the daily error reporting function if it hasn't been 506 - * started already 507 - */ 508 - if (!es->s_error_count) 509 - mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); 510 - le32_add_cpu(&es->s_error_count, 1); 511 - } 512 - 513 - static void save_error_info(struct super_block *sb, int error, 514 - __u32 ino, __u64 block, 515 - const char *func, unsigned int line) 516 - { 517 - __save_error_info(sb, error, ino, block, func, line); 518 - if (!bdev_read_only(sb->s_bdev)) 519 - 
ext4_commit_super(sb, 1); 520 - } 521 426 522 427 /* 523 428 * The del_gendisk() function uninitializes the disk-specific data ··· 550 649 || system_state == SYSTEM_RESTART; 551 650 } 552 651 652 + struct ext4_err_translation { 653 + int code; 654 + int errno; 655 + }; 656 + 657 + #define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err } 658 + 659 + static struct ext4_err_translation err_translation[] = { 660 + EXT4_ERR_TRANSLATE(EIO), 661 + EXT4_ERR_TRANSLATE(ENOMEM), 662 + EXT4_ERR_TRANSLATE(EFSBADCRC), 663 + EXT4_ERR_TRANSLATE(EFSCORRUPTED), 664 + EXT4_ERR_TRANSLATE(ENOSPC), 665 + EXT4_ERR_TRANSLATE(ENOKEY), 666 + EXT4_ERR_TRANSLATE(EROFS), 667 + EXT4_ERR_TRANSLATE(EFBIG), 668 + EXT4_ERR_TRANSLATE(EEXIST), 669 + EXT4_ERR_TRANSLATE(ERANGE), 670 + EXT4_ERR_TRANSLATE(EOVERFLOW), 671 + EXT4_ERR_TRANSLATE(EBUSY), 672 + EXT4_ERR_TRANSLATE(ENOTDIR), 673 + EXT4_ERR_TRANSLATE(ENOTEMPTY), 674 + EXT4_ERR_TRANSLATE(ESHUTDOWN), 675 + EXT4_ERR_TRANSLATE(EFAULT), 676 + }; 677 + 678 + static int ext4_errno_to_code(int errno) 679 + { 680 + int i; 681 + 682 + for (i = 0; i < ARRAY_SIZE(err_translation); i++) 683 + if (err_translation[i].errno == errno) 684 + return err_translation[i].code; 685 + return EXT4_ERR_UNKNOWN; 686 + } 687 + 688 + static void __save_error_info(struct super_block *sb, int error, 689 + __u32 ino, __u64 block, 690 + const char *func, unsigned int line) 691 + { 692 + struct ext4_sb_info *sbi = EXT4_SB(sb); 693 + 694 + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 695 + if (bdev_read_only(sb->s_bdev)) 696 + return; 697 + /* We default to EFSCORRUPTED error... */ 698 + if (error == 0) 699 + error = EFSCORRUPTED; 700 + 701 + spin_lock(&sbi->s_error_lock); 702 + sbi->s_add_error_count++; 703 + sbi->s_last_error_code = error; 704 + sbi->s_last_error_line = line; 705 + sbi->s_last_error_ino = ino; 706 + sbi->s_last_error_block = block; 707 + sbi->s_last_error_func = func; 708 + sbi->s_last_error_time = ktime_get_real_seconds(); 709 + if (!sbi->s_first_error_time) { 710 + sbi->s_first_error_code = error; 711 + sbi->s_first_error_line = line; 712 + sbi->s_first_error_ino = ino; 713 + sbi->s_first_error_block = block; 714 + sbi->s_first_error_func = func; 715 + sbi->s_first_error_time = sbi->s_last_error_time; 716 + } 717 + spin_unlock(&sbi->s_error_lock); 718 + } 719 + 720 + static void save_error_info(struct super_block *sb, int error, 721 + __u32 ino, __u64 block, 722 + const char *func, unsigned int line) 723 + { 724 + __save_error_info(sb, error, ino, block, func, line); 725 + if (!bdev_read_only(sb->s_bdev)) 726 + ext4_commit_super(sb, 1); 727 + } 728 + 553 729 /* Deal with the reporting of failure conditions on a filesystem such as 554 730 * inconsistencies detected or read IO failures. 555 731 * ··· 640 662 * We'll just use the jbd2_journal_abort() error code to record an error in 641 663 * the journal instead. On recovery, the journal will complain about 642 664 * that error until we've noted it down and cleared it. 665 + * 666 + * If force_ro is set, we unconditionally force the filesystem into an 667 + * ABORT|READONLY state, unless the error response on the fs has been set to 668 + * panic in which case we take the easy way out and panic immediately. This is 669 + * used to deal with unrecoverable failures such as journal IO errors or ENOMEM 670 + * at a critical moment in log management. 
643 671 */ 644 - 645 - static void ext4_handle_error(struct super_block *sb) 672 + static void ext4_handle_error(struct super_block *sb, bool force_ro) 646 673 { 674 + journal_t *journal = EXT4_SB(sb)->s_journal; 675 + 647 676 if (test_opt(sb, WARN_ON_ERROR)) 648 677 WARN_ON_ONCE(1); 649 678 650 - if (sb_rdonly(sb)) 679 + if (sb_rdonly(sb) || (!force_ro && test_opt(sb, ERRORS_CONT))) 651 680 return; 652 681 653 - if (!test_opt(sb, ERRORS_CONT)) { 654 - journal_t *journal = EXT4_SB(sb)->s_journal; 655 - 656 - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); 657 - if (journal) 658 - jbd2_journal_abort(journal, -EIO); 659 - } 682 + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); 683 + if (journal) 684 + jbd2_journal_abort(journal, -EIO); 660 685 /* 661 686 * We force ERRORS_RO behavior when system is rebooting. Otherwise we 662 687 * could panic during 'reboot -f' as the underlying device got already 663 688 * disabled. 664 689 */ 665 - if (test_opt(sb, ERRORS_RO) || system_going_down()) { 666 - ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 667 - /* 668 - * Make sure updated value of ->s_mount_flags will be visible 669 - * before ->s_flags update 670 - */ 671 - smp_wmb(); 672 - sb->s_flags |= SB_RDONLY; 673 - } else if (test_opt(sb, ERRORS_PANIC)) { 690 + if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { 674 691 panic("EXT4-fs (device %s): panic forced after error\n", 675 692 sb->s_id); 676 693 } 694 + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 695 + /* 696 + * Make sure updated value of ->s_mount_flags will be visible before 697 + * ->s_flags update 698 + */ 699 + smp_wmb(); 700 + sb->s_flags |= SB_RDONLY; 701 + } 702 + 703 + static void flush_stashed_error_work(struct work_struct *work) 704 + { 705 + struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, 706 + s_error_work); 707 + 708 + ext4_commit_super(sbi->s_sb, 1); 677 709 } 678 710 679 711 #define ext4_error_ratelimit(sb) \ ··· 691 703 "EXT4-fs error") 692 704 693 705 void __ext4_error(struct super_block *sb, const char *function, 694 - unsigned int line, int error, __u64 block, 706 + unsigned int line, bool force_ro, int error, __u64 block, 695 707 const char *fmt, ...) 696 708 { 697 709 struct va_format vaf; ··· 711 723 va_end(args); 712 724 } 713 725 save_error_info(sb, error, 0, block, function, line); 714 - ext4_handle_error(sb); 726 + ext4_handle_error(sb, force_ro); 715 727 } 716 728 717 729 void __ext4_error_inode(struct inode *inode, const char *function, ··· 743 755 } 744 756 save_error_info(inode->i_sb, error, inode->i_ino, block, 745 757 function, line); 746 - ext4_handle_error(inode->i_sb); 758 + ext4_handle_error(inode->i_sb, false); 747 759 } 748 760 749 761 void __ext4_error_file(struct file *file, const char *function, ··· 782 794 } 783 795 save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block, 784 796 function, line); 785 - ext4_handle_error(inode->i_sb); 797 + ext4_handle_error(inode->i_sb, false); 786 798 } 787 799 788 800 const char *ext4_decode_error(struct super_block *sb, int errno, ··· 850 862 } 851 863 852 864 save_error_info(sb, -errno, 0, 0, function, line); 853 - ext4_handle_error(sb); 854 - } 855 - 856 - /* 857 - * ext4_abort is a much stronger failure handler than ext4_error. The 858 - * abort function may be used to deal with unrecoverable failures such 859 - * as journal IO errors or ENOMEM at a critical moment in log management. 
860 - * 861 - * We unconditionally force the filesystem into an ABORT|READONLY state, 862 - * unless the error response on the fs has been set to panic in which 863 - * case we take the easy way out and panic immediately. 864 - */ 865 - 866 - void __ext4_abort(struct super_block *sb, const char *function, 867 - unsigned int line, int error, const char *fmt, ...) 868 - { 869 - struct va_format vaf; 870 - va_list args; 871 - 872 - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) 873 - return; 874 - 875 - save_error_info(sb, error, 0, 0, function, line); 876 - va_start(args, fmt); 877 - vaf.fmt = fmt; 878 - vaf.va = &args; 879 - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n", 880 - sb->s_id, function, line, &vaf); 881 - va_end(args); 882 - 883 - if (sb_rdonly(sb) == 0) { 884 - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); 885 - if (EXT4_SB(sb)->s_journal) 886 - jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); 887 - 888 - ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 889 - /* 890 - * Make sure updated value of ->s_mount_flags will be visible 891 - * before ->s_flags update 892 - */ 893 - smp_wmb(); 894 - sb->s_flags |= SB_RDONLY; 895 - } 896 - if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) 897 - panic("EXT4-fs panic from previous error\n"); 865 + ext4_handle_error(sb, false); 898 866 } 899 867 900 868 void __ext4_msg(struct super_block *sb, ··· 926 982 return; 927 983 928 984 trace_ext4_error(sb, function, line); 929 - __save_error_info(sb, EFSCORRUPTED, ino, block, function, line); 930 - 931 985 if (ext4_error_ratelimit(sb)) { 932 986 va_start(args, fmt); 933 987 vaf.fmt = fmt; ··· 941 999 va_end(args); 942 1000 } 943 1001 944 - if (test_opt(sb, WARN_ON_ERROR)) 945 - WARN_ON_ONCE(1); 946 - 947 1002 if (test_opt(sb, ERRORS_CONT)) { 948 - ext4_commit_super(sb, 0); 1003 + if (test_opt(sb, WARN_ON_ERROR)) 1004 + WARN_ON_ONCE(1); 1005 + __save_error_info(sb, EFSCORRUPTED, ino, block, function, line); 1006 + schedule_work(&EXT4_SB(sb)->s_error_work); 949 1007 return; 950 1008 } 951 - 952 1009 ext4_unlock_group(sb, grp); 953 - ext4_commit_super(sb, 1); 954 - ext4_handle_error(sb); 1010 + save_error_info(sb, EFSCORRUPTED, ino, block, function, line); 1011 + ext4_handle_error(sb, false); 955 1012 /* 956 1013 * We only get here in the ERRORS_RO case; relocking the group 957 1014 * may be dangerous, but nothing bad will happen since the ··· 1122 1181 ext4_unregister_li_request(sb); 1123 1182 ext4_quota_off_umount(sb); 1124 1183 1184 + flush_work(&sbi->s_error_work); 1125 1185 destroy_workqueue(sbi->rsv_conversion_wq); 1126 1186 1127 1187 /* ··· 1182 1240 * in-memory list had better be clean by this point. 
*/ 1183 1241 if (!list_empty(&sbi->s_orphan)) 1184 1242 dump_orphan_list(sb, sbi); 1185 - J_ASSERT(list_empty(&sbi->s_orphan)); 1243 + ASSERT(list_empty(&sbi->s_orphan)); 1186 1244 1187 1245 sync_blockdev(sb->s_bdev); 1188 1246 invalidate_bdev(sb->s_bdev); ··· 3947 4005 atomic64_set(&sbi->s_resv_clusters, resv_clusters); 3948 4006 } 3949 4007 4008 + static const char *ext4_quota_mode(struct super_block *sb) 4009 + { 4010 + #ifdef CONFIG_QUOTA 4011 + if (!ext4_quota_capable(sb)) 4012 + return "none"; 4013 + 4014 + if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) 4015 + return "journalled"; 4016 + else 4017 + return "writeback"; 4018 + #else 4019 + return "disabled"; 4020 + #endif 4021 + } 4022 + 3950 4023 static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3951 4024 { 3952 4025 struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); ··· 4030 4073 if (IS_ERR(bh)) { 4031 4074 ext4_msg(sb, KERN_ERR, "unable to read superblock"); 4032 4075 ret = PTR_ERR(bh); 4033 - bh = NULL; 4034 4076 goto out_fail; 4035 4077 } 4036 4078 /* ··· 4143 4187 */ 4144 4188 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; 4145 4189 4146 - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 4190 + if (le32_to_cpu(es->s_log_block_size) > 4191 + (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { 4192 + ext4_msg(sb, KERN_ERR, 4193 + "Invalid log block size: %u", 4194 + le32_to_cpu(es->s_log_block_size)); 4195 + goto failed_mount; 4196 + } 4197 + if (le32_to_cpu(es->s_log_cluster_size) > 4198 + (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { 4199 + ext4_msg(sb, KERN_ERR, 4200 + "Invalid log cluster size: %u", 4201 + le32_to_cpu(es->s_log_cluster_size)); 4202 + goto failed_mount; 4203 + } 4204 + 4205 + blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 4147 4206 4148 4207 if (blocksize == PAGE_SIZE) 4149 4208 set_opt(sb, DIOREAD_NOLOCK); 4150 - 4151 - if (blocksize < EXT4_MIN_BLOCK_SIZE || 4152 - blocksize > EXT4_MAX_BLOCK_SIZE) { 4153 - ext4_msg(sb, KERN_ERR, 4154 - "Unsupported filesystem blocksize %d (%d log_block_size)", 4155 - blocksize, le32_to_cpu(es->s_log_block_size)); 4156 - goto failed_mount; 4157 - } 4158 4209 4159 4210 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { 4160 4211 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; ··· 4379 4416 */ 4380 4417 if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) 4381 4418 goto failed_mount; 4382 - 4383 - if (le32_to_cpu(es->s_log_block_size) > 4384 - (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { 4385 - ext4_msg(sb, KERN_ERR, 4386 - "Invalid log block size: %u", 4387 - le32_to_cpu(es->s_log_block_size)); 4388 - goto failed_mount; 4389 - } 4390 - if (le32_to_cpu(es->s_log_cluster_size) > 4391 - (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { 4392 - ext4_msg(sb, KERN_ERR, 4393 - "Invalid log cluster size: %u", 4394 - le32_to_cpu(es->s_log_cluster_size)); 4395 - goto failed_mount; 4396 - } 4397 4419 4398 4420 if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { 4399 4421 ext4_msg(sb, KERN_ERR, ··· 4650 4702 "can't read group descriptor %d", i); 4651 4703 db_count = i; 4652 4704 ret = PTR_ERR(bh); 4653 - bh = NULL; 4654 4705 goto failed_mount2; 4655 4706 } 4656 4707 rcu_read_lock(); ··· 4664 4717 } 4665 4718 4666 4719 timer_setup(&sbi->s_err_report, print_daily_error_info, 0); 4720 + spin_lock_init(&sbi->s_error_lock); 4721 + INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); 4667 4722 4668 4723 /* Register extent status tree shrinker */ 4669 4724 if 
(ext4_es_register_shrinker(sbi)) ··· 4821 4872 "requested data journaling mode"); 4822 4873 goto failed_mount_wq; 4823 4874 } 4875 + break; 4824 4876 default: 4825 4877 break; 4826 4878 } ··· 4950 5000 block = ext4_count_free_clusters(sb); 4951 5001 ext4_free_blocks_count_set(sbi->s_es, 4952 5002 EXT4_C2B(sbi, block)); 4953 - ext4_superblock_csum_set(sb); 4954 5003 err = percpu_counter_init(&sbi->s_freeclusters_counter, block, 4955 5004 GFP_KERNEL); 4956 5005 if (!err) { 4957 5006 unsigned long freei = ext4_count_free_inodes(sb); 4958 5007 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); 4959 - ext4_superblock_csum_set(sb); 4960 5008 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, 4961 5009 GFP_KERNEL); 4962 5010 } ··· 5034 5086 5035 5087 if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) 5036 5088 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " 5037 - "Opts: %.*s%s%s", descr, 5089 + "Opts: %.*s%s%s. Quota mode: %s.", descr, 5038 5090 (int) sizeof(sbi->s_es->s_mount_opts), 5039 5091 sbi->s_es->s_mount_opts, 5040 - *sbi->s_es->s_mount_opts ? "; " : "", orig_data); 5092 + *sbi->s_es->s_mount_opts ? "; " : "", orig_data, 5093 + ext4_quota_mode(sb)); 5041 5094 5042 5095 if (es->s_error_count) 5043 5096 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ ··· 5103 5154 ext4_es_unregister_shrinker(sbi); 5104 5155 failed_mount3: 5105 5156 del_timer_sync(&sbi->s_err_report); 5157 + flush_work(&sbi->s_error_work); 5106 5158 if (sbi->s_mmp_tsk) 5107 5159 kthread_stop(sbi->s_mmp_tsk); 5108 5160 failed_mount2: ··· 5430 5480 5431 5481 static int ext4_commit_super(struct super_block *sb, int sync) 5432 5482 { 5483 + struct ext4_sb_info *sbi = EXT4_SB(sb); 5433 5484 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 5434 5485 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 5435 5486 int error = 0; ··· 5462 5511 es->s_free_inodes_count = 5463 5512 cpu_to_le32(percpu_counter_sum_positive( 5464 5513 &EXT4_SB(sb)->s_freeinodes_counter)); 5514 + /* Copy error information to the on-disk superblock */ 5515 + spin_lock(&sbi->s_error_lock); 5516 + if (sbi->s_add_error_count > 0) { 5517 + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 5518 + if (!es->s_first_error_time && !es->s_first_error_time_hi) { 5519 + __ext4_update_tstamp(&es->s_first_error_time, 5520 + &es->s_first_error_time_hi, 5521 + sbi->s_first_error_time); 5522 + strncpy(es->s_first_error_func, sbi->s_first_error_func, 5523 + sizeof(es->s_first_error_func)); 5524 + es->s_first_error_line = 5525 + cpu_to_le32(sbi->s_first_error_line); 5526 + es->s_first_error_ino = 5527 + cpu_to_le32(sbi->s_first_error_ino); 5528 + es->s_first_error_block = 5529 + cpu_to_le64(sbi->s_first_error_block); 5530 + es->s_first_error_errcode = 5531 + ext4_errno_to_code(sbi->s_first_error_code); 5532 + } 5533 + __ext4_update_tstamp(&es->s_last_error_time, 5534 + &es->s_last_error_time_hi, 5535 + sbi->s_last_error_time); 5536 + strncpy(es->s_last_error_func, sbi->s_last_error_func, 5537 + sizeof(es->s_last_error_func)); 5538 + es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); 5539 + es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); 5540 + es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); 5541 + es->s_last_error_errcode = 5542 + ext4_errno_to_code(sbi->s_last_error_code); 5543 + /* 5544 + * Start the daily error reporting function if it hasn't been 5545 + * started already 5546 + */ 5547 + if (!es->s_error_count) 5548 + mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); 5549 + 
le32_add_cpu(&es->s_error_count, sbi->s_add_error_count); 5550 + sbi->s_add_error_count = 0; 5551 + } 5552 + spin_unlock(&sbi->s_error_lock); 5553 + 5465 5554 BUFFER_TRACE(sbh, "marking dirty"); 5466 5555 ext4_superblock_csum_set(sb); 5467 5556 if (sync) ··· 5855 5864 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 5856 5865 } 5857 5866 5867 + /* Flush outstanding errors before changing fs state */ 5868 + flush_work(&sbi->s_error_work); 5869 + 5858 5870 if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { 5859 5871 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { 5860 5872 err = -EROFS; ··· 6016 6022 */ 6017 6023 *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags); 6018 6024 6019 - ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); 6025 + ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s. Quota mode: %s.", 6026 + orig_data, ext4_quota_mode(sb)); 6020 6027 kfree(orig_data); 6021 6028 return 0; 6022 6029 ··· 6196 6201 static int ext4_mark_dquot_dirty(struct dquot *dquot) 6197 6202 { 6198 6203 struct super_block *sb = dquot->dq_sb; 6199 - struct ext4_sb_info *sbi = EXT4_SB(sb); 6200 6204 6201 - /* Are we journaling quotas? */ 6202 - if (ext4_has_feature_quota(sb) || 6203 - sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 6205 + if (ext4_is_quota_journalled(sb)) { 6204 6206 dquot_mark_dquot_dirty(dquot); 6205 6207 return ext4_write_dquot(dquot); 6206 6208 } else {
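Among the super.c changes, ext4_errno_to_code() replaces a long switch with a table built from one macro. Below is the same shape in standalone form; the on-disk code values are invented for the demo, and the field is named err rather than errno because userspace reserves that identifier:

#include <errno.h>
#include <stdio.h>

/* Hypothetical on-disk error codes standing in for EXT4_ERR_*. */
enum { CODE_UNKNOWN = 0, CODE_EIO, CODE_ENOMEM, CODE_ENOSPC };

struct err_translation { int code; int err; };

#define ERR_TRANSLATE(e) { .code = CODE_##e, .err = e }

static const struct err_translation err_table[] = {
	ERR_TRANSLATE(EIO),
	ERR_TRANSLATE(ENOMEM),
	ERR_TRANSLATE(ENOSPC),
};

static int errno_to_code(int err)
{
	/* Linear scan with an UNKNOWN fallback, as in the patch. */
	for (unsigned i = 0; i < sizeof(err_table) / sizeof(err_table[0]); i++)
		if (err_table[i].err == err)
			return err_table[i].code;
	return CODE_UNKNOWN;
}

int main(void)
{
	printf("EIO -> %d, EPERM -> %d\n",
	       errno_to_code(EIO), errno_to_code(EPERM));
	return 0;
}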
-1
fs/ext4/xattr.c
··· 1927 1927 } else { 1928 1928 /* Allocate a buffer where we construct the new block. */ 1929 1929 s->base = kzalloc(sb->s_blocksize, GFP_NOFS); 1930 - /* assert(header == s->base) */ 1931 1930 error = -ENOMEM; 1932 1931 if (s->base == NULL) 1933 1932 goto cleanup;
+2 -6
fs/jbd2/journal.c
··· 1869 1869 1870 1870 if (jbd2_has_feature_fast_commit(journal)) { 1871 1871 journal->j_fc_last = be32_to_cpu(sb->s_maxlen); 1872 - num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks); 1873 - if (!num_fc_blocks) 1874 - num_fc_blocks = JBD2_MIN_FC_BLOCKS; 1872 + num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); 1875 1873 if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) 1876 1874 journal->j_last = journal->j_fc_last - num_fc_blocks; 1877 1875 journal->j_fc_first = journal->j_last + 1; ··· 2100 2102 journal_superblock_t *sb = journal->j_superblock; 2101 2103 unsigned long long num_fc_blks; 2102 2104 2103 - num_fc_blks = be32_to_cpu(sb->s_num_fc_blks); 2104 - if (num_fc_blks == 0) 2105 - num_fc_blks = JBD2_MIN_FC_BLOCKS; 2105 + num_fc_blks = jbd2_journal_get_num_fc_blks(sb); 2106 2106 if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) 2107 2107 return -ENOSPC; 2108 2108
+11 -3
include/linux/jbd2.h
··· 68 68 extern void jbd2_free(void *ptr, size_t size); 69 69 70 70 #define JBD2_MIN_JOURNAL_BLOCKS 1024 71 - #define JBD2_MIN_FC_BLOCKS 256 71 + #define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256 72 72 73 73 #ifdef __KERNEL__ 74 74 ··· 538 538 * The transaction keeps track of all of the buffers modified by a 539 539 * running transaction, and all of the buffers committed but not yet 540 540 * flushed to home for finished transactions. 541 + * (Locking Documentation improved by LockDoc) 541 542 */ 542 543 543 544 /* ··· 659 658 unsigned long t_start; 660 659 661 660 /* 662 - * When commit was requested 661 + * When commit was requested [j_state_lock] 663 662 */ 664 663 unsigned long t_requested; 665 664 666 665 /* 667 - * Checkpointing stats [j_checkpoint_sem] 666 + * Checkpointing stats [j_list_lock] 668 667 */ 669 668 struct transaction_chp_stats_s t_chp_stats; 670 669 ··· 1690 1689 journal->j_chksum_driver == NULL); 1691 1690 1692 1691 return journal->j_chksum_driver != NULL; 1692 + } 1693 + 1694 + static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) 1695 + { 1696 + int num_fc_blocks = be32_to_cpu(jsb->s_num_fc_blks); 1697 + 1698 + return num_fc_blocks ? num_fc_blocks : JBD2_DEFAULT_FAST_COMMIT_BLOCKS; 1693 1699 } 1694 1700 1695 1701 /*