Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 bugfixes from Ted Ts'o:
"Ext4 bug fixes for 3.17, to provide better handling of memory
allocation failures, and to fix some journaling bugs involving
journal checksums and FALLOC_FL_ZERO_RANGE"

* tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
ext4: fix same-dir rename when inline data directory overflows
jbd2: fix descriptor block size handling errors with journal_csum
jbd2: fix infinite loop when recovering corrupt journal blocks
ext4: update i_disksize coherently with block allocation on error path
ext4: fix transaction issues for ext4_fallocate and ext_zero_range
ext4: fix incorrect journal credits reservation in ext4_zero_range
ext4: move i_size,i_disksize update routines to helper function
ext4: fix BUG_ON in mb_free_blocks()
ext4: propagate errors up to ext4_find_entry()'s callers

+233 -129
+17 -1
fs/ext4/ext4.h
··· 1825 1825 /* 1826 1826 * Special error return code only used by dx_probe() and its callers. 1827 1827 */ 1828 - #define ERR_BAD_DX_DIR -75000 1828 + #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) 1829 1829 1830 1830 /* 1831 1831 * Timeout and state flag for lazy initialization inode thread. ··· 2452 2452 if (newsize > EXT4_I(inode)->i_disksize) 2453 2453 EXT4_I(inode)->i_disksize = newsize; 2454 2454 up_write(&EXT4_I(inode)->i_data_sem); 2455 + } 2456 + 2457 + /* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */ 2458 + static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) 2459 + { 2460 + int changed = 0; 2461 + 2462 + if (newsize > inode->i_size) { 2463 + i_size_write(inode, newsize); 2464 + changed = 1; 2465 + } 2466 + if (newsize > EXT4_I(inode)->i_disksize) { 2467 + ext4_update_i_disksize(inode, newsize); 2468 + changed |= 2; 2469 + } 2470 + return changed; 2455 2471 } 2456 2472 2457 2473 struct ext4_group_info {
+44 -44
fs/ext4/extents.c
··· 4665 4665 } 4666 4666 4667 4667 static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, 4668 - ext4_lblk_t len, int flags, int mode) 4668 + ext4_lblk_t len, loff_t new_size, 4669 + int flags, int mode) 4669 4670 { 4670 4671 struct inode *inode = file_inode(file); 4671 4672 handle_t *handle; ··· 4675 4674 int retries = 0; 4676 4675 struct ext4_map_blocks map; 4677 4676 unsigned int credits; 4677 + loff_t epos; 4678 4678 4679 4679 map.m_lblk = offset; 4680 + map.m_len = len; 4680 4681 /* 4681 4682 * Don't normalize the request if it can fit in one extent so 4682 4683 * that it doesn't get unnecessarily split into multiple ··· 4693 4690 credits = ext4_chunk_trans_blocks(inode, len); 4694 4691 4695 4692 retry: 4696 - while (ret >= 0 && ret < len) { 4697 - map.m_lblk = map.m_lblk + ret; 4698 - map.m_len = len = len - ret; 4693 + while (ret >= 0 && len) { 4699 4694 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, 4700 4695 credits); 4701 4696 if (IS_ERR(handle)) { ··· 4710 4709 ret2 = ext4_journal_stop(handle); 4711 4710 break; 4712 4711 } 4712 + map.m_lblk += ret; 4713 + map.m_len = len = len - ret; 4714 + epos = (loff_t)map.m_lblk << inode->i_blkbits; 4715 + inode->i_ctime = ext4_current_time(inode); 4716 + if (new_size) { 4717 + if (epos > new_size) 4718 + epos = new_size; 4719 + if (ext4_update_inode_size(inode, epos) & 0x1) 4720 + inode->i_mtime = inode->i_ctime; 4721 + } else { 4722 + if (epos > inode->i_size) 4723 + ext4_set_inode_flag(inode, 4724 + EXT4_INODE_EOFBLOCKS); 4725 + } 4726 + ext4_mark_inode_dirty(handle, inode); 4713 4727 ret2 = ext4_journal_stop(handle); 4714 4728 if (ret2) 4715 4729 break; ··· 4747 4731 loff_t new_size = 0; 4748 4732 int ret = 0; 4749 4733 int flags; 4750 - int partial; 4734 + int credits; 4735 + int partial_begin, partial_end; 4751 4736 loff_t start, end; 4752 4737 ext4_lblk_t lblk; 4753 4738 struct address_space *mapping = inode->i_mapping; ··· 4788 4771 4789 4772 if (start < offset || end > offset + 
len) 4790 4773 return -EINVAL; 4791 - partial = (offset + len) & ((1 << blkbits) - 1); 4774 + partial_begin = offset & ((1 << blkbits) - 1); 4775 + partial_end = (offset + len) & ((1 << blkbits) - 1); 4792 4776 4793 4777 lblk = start >> blkbits; 4794 4778 max_blocks = (end >> blkbits); ··· 4823 4805 * If we have a partial block after EOF we have to allocate 4824 4806 * the entire block. 4825 4807 */ 4826 - if (partial) 4808 + if (partial_end) 4827 4809 max_blocks += 1; 4828 4810 } 4829 4811 ··· 4831 4813 4832 4814 /* Now release the pages and zero block aligned part of pages*/ 4833 4815 truncate_pagecache_range(inode, start, end - 1); 4816 + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4834 4817 4835 4818 /* Wait all existing dio workers, newcomers will block on i_mutex */ 4836 4819 ext4_inode_block_unlocked_dio(inode); ··· 4844 4825 if (ret) 4845 4826 goto out_dio; 4846 4827 4847 - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, 4848 - mode); 4828 + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, 4829 + flags, mode); 4849 4830 if (ret) 4850 4831 goto out_dio; 4851 4832 } 4833 + if (!partial_begin && !partial_end) 4834 + goto out_dio; 4852 4835 4853 - handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); 4836 + /* 4837 + * In worst case we have to writeout two nonadjacent unwritten 4838 + * blocks and update the inode 4839 + */ 4840 + credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; 4841 + if (ext4_should_journal_data(inode)) 4842 + credits += 2; 4843 + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); 4854 4844 if (IS_ERR(handle)) { 4855 4845 ret = PTR_ERR(handle); 4856 4846 ext4_std_error(inode->i_sb, ret); ··· 4867 4839 } 4868 4840 4869 4841 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4870 - 4871 4842 if (new_size) { 4872 - if (new_size > i_size_read(inode)) 4873 - i_size_write(inode, new_size); 4874 - if (new_size > EXT4_I(inode)->i_disksize) 4875 - ext4_update_i_disksize(inode, 
new_size); 4843 + ext4_update_inode_size(inode, new_size); 4876 4844 } else { 4877 4845 /* 4878 4846 * Mark that we allocate beyond EOF so the subsequent truncate ··· 4877 4853 if ((offset + len) > i_size_read(inode)) 4878 4854 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4879 4855 } 4880 - 4881 4856 ext4_mark_inode_dirty(handle, inode); 4882 4857 4883 4858 /* Zero out partial block at the edges of the range */ ··· 4903 4880 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) 4904 4881 { 4905 4882 struct inode *inode = file_inode(file); 4906 - handle_t *handle; 4907 4883 loff_t new_size = 0; 4908 4884 unsigned int max_blocks; 4909 4885 int ret = 0; 4910 4886 int flags; 4911 4887 ext4_lblk_t lblk; 4912 - struct timespec tv; 4913 4888 unsigned int blkbits = inode->i_blkbits; 4914 4889 4915 4890 /* Return error if mode is not supported */ ··· 4958 4937 goto out; 4959 4938 } 4960 4939 4961 - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); 4940 + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, 4941 + flags, mode); 4962 4942 if (ret) 4963 4943 goto out; 4964 4944 4965 - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 4966 - if (IS_ERR(handle)) 4967 - goto out; 4968 - 4969 - tv = inode->i_ctime = ext4_current_time(inode); 4970 - 4971 - if (new_size) { 4972 - if (new_size > i_size_read(inode)) { 4973 - i_size_write(inode, new_size); 4974 - inode->i_mtime = tv; 4975 - } 4976 - if (new_size > EXT4_I(inode)->i_disksize) 4977 - ext4_update_i_disksize(inode, new_size); 4978 - } else { 4979 - /* 4980 - * Mark that we allocate beyond EOF so the subsequent truncate 4981 - * can proceed even if the new size is the same as i_size. 
4982 - */ 4983 - if ((offset + len) > i_size_read(inode)) 4984 - ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4945 + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { 4946 + ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, 4947 + EXT4_I(inode)->i_sync_tid); 4985 4948 } 4986 - ext4_mark_inode_dirty(handle, inode); 4987 - if (file->f_flags & O_SYNC) 4988 - ext4_handle_sync(handle); 4989 - 4990 - ext4_journal_stop(handle); 4991 4949 out: 4992 4950 mutex_unlock(&inode->i_mutex); 4993 4951 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+16 -28
fs/ext4/inode.c
··· 1055 1055 } else 1056 1056 copied = block_write_end(file, mapping, pos, 1057 1057 len, copied, page, fsdata); 1058 - 1059 1058 /* 1060 - * No need to use i_size_read() here, the i_size 1061 - * cannot change under us because we hole i_mutex. 1062 - * 1063 - * But it's important to update i_size while still holding page lock: 1059 + * it's important to update i_size while still holding page lock: 1064 1060 * page writeout could otherwise come in and zero beyond i_size. 1065 1061 */ 1066 - if (pos + copied > inode->i_size) { 1067 - i_size_write(inode, pos + copied); 1068 - i_size_changed = 1; 1069 - } 1070 - 1071 - if (pos + copied > EXT4_I(inode)->i_disksize) { 1072 - /* We need to mark inode dirty even if 1073 - * new_i_size is less that inode->i_size 1074 - * but greater than i_disksize. (hint delalloc) 1075 - */ 1076 - ext4_update_i_disksize(inode, (pos + copied)); 1077 - i_size_changed = 1; 1078 - } 1062 + i_size_changed = ext4_update_inode_size(inode, pos + copied); 1079 1063 unlock_page(page); 1080 1064 page_cache_release(page); 1081 1065 ··· 1107 1123 int ret = 0, ret2; 1108 1124 int partial = 0; 1109 1125 unsigned from, to; 1110 - loff_t new_i_size; 1126 + int size_changed = 0; 1111 1127 1112 1128 trace_ext4_journalled_write_end(inode, pos, len, copied); 1113 1129 from = pos & (PAGE_CACHE_SIZE - 1); ··· 1130 1146 if (!partial) 1131 1147 SetPageUptodate(page); 1132 1148 } 1133 - new_i_size = pos + copied; 1134 - if (new_i_size > inode->i_size) 1135 - i_size_write(inode, pos+copied); 1149 + size_changed = ext4_update_inode_size(inode, pos + copied); 1136 1150 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1137 1151 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1138 - if (new_i_size > EXT4_I(inode)->i_disksize) { 1139 - ext4_update_i_disksize(inode, new_i_size); 1152 + unlock_page(page); 1153 + page_cache_release(page); 1154 + 1155 + if (size_changed) { 1140 1156 ret2 = ext4_mark_inode_dirty(handle, inode); 1141 1157 if (!ret) 1142 1158 ret 
= ret2; 1143 1159 } 1144 1160 1145 - unlock_page(page); 1146 - page_cache_release(page); 1147 1161 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1148 1162 /* if we have allocated more blocks and copied 1149 1163 * less. We will have blocks allocated outside ··· 2077 2095 struct ext4_map_blocks *map = &mpd->map; 2078 2096 int err; 2079 2097 loff_t disksize; 2098 + int progress = 0; 2080 2099 2081 2100 mpd->io_submit.io_end->offset = 2082 2101 ((loff_t)map->m_lblk) << inode->i_blkbits; ··· 2094 2111 * is non-zero, a commit should free up blocks. 2095 2112 */ 2096 2113 if ((err == -ENOMEM) || 2097 - (err == -ENOSPC && ext4_count_free_clusters(sb))) 2114 + (err == -ENOSPC && ext4_count_free_clusters(sb))) { 2115 + if (progress) 2116 + goto update_disksize; 2098 2117 return err; 2118 + } 2099 2119 ext4_msg(sb, KERN_CRIT, 2100 2120 "Delayed block allocation failed for " 2101 2121 "inode %lu at logical offset %llu with" ··· 2115 2129 *give_up_on_write = true; 2116 2130 return err; 2117 2131 } 2132 + progress = 1; 2118 2133 /* 2119 2134 * Update buffer state, submit mapped pages, and get us new 2120 2135 * extent to map 2121 2136 */ 2122 2137 err = mpage_map_and_submit_buffers(mpd); 2123 2138 if (err < 0) 2124 - return err; 2139 + goto update_disksize; 2125 2140 } while (map->m_len); 2126 2141 2142 + update_disksize: 2127 2143 /* 2128 2144 * Update on-disk size after IO is submitted. Races with 2129 2145 * truncate are avoided by checking i_size under i_data_sem.
+5
fs/ext4/mballoc.c
··· 1412 1412 int last = first + count - 1; 1413 1413 struct super_block *sb = e4b->bd_sb; 1414 1414 1415 + if (WARN_ON(count == 0)) 1416 + return; 1415 1417 BUG_ON(last >= (sb->s_blocksize << 3)); 1416 1418 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1417 1419 /* Don't bother if the block group is corrupt. */ ··· 3223 3221 int err; 3224 3222 3225 3223 if (pa == NULL) { 3224 + if (ac->ac_f_ex.fe_len == 0) 3225 + return; 3226 3226 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); 3227 3227 if (err) { 3228 3228 /* ··· 3239 3235 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, 3240 3236 ac->ac_f_ex.fe_len); 3241 3237 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); 3238 + ext4_mb_unload_buddy(&e4b); 3242 3239 return; 3243 3240 } 3244 3241 if (pa->pa_type == MB_INODE_PA)
+51 -5
fs/ext4/namei.c
··· 1227 1227 buffer */ 1228 1228 int num = 0; 1229 1229 ext4_lblk_t nblocks; 1230 - int i, err; 1230 + int i, err = 0; 1231 1231 int namelen; 1232 1232 1233 1233 *res_dir = NULL; ··· 1264 1264 * return. Otherwise, fall back to doing a search the 1265 1265 * old fashioned way. 1266 1266 */ 1267 - if (bh || (err != ERR_BAD_DX_DIR)) 1267 + if (err == -ENOENT) 1268 + return NULL; 1269 + if (err && err != ERR_BAD_DX_DIR) 1270 + return ERR_PTR(err); 1271 + if (bh) 1268 1272 return bh; 1269 1273 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " 1270 1274 "falling back\n")); ··· 1299 1295 } 1300 1296 num++; 1301 1297 bh = ext4_getblk(NULL, dir, b++, 0, &err); 1298 + if (unlikely(err)) { 1299 + if (ra_max == 0) 1300 + return ERR_PTR(err); 1301 + break; 1302 + } 1302 1303 bh_use[ra_max] = bh; 1303 1304 if (bh) 1304 1305 ll_rw_block(READ | REQ_META | REQ_PRIO, ··· 1426 1417 return ERR_PTR(-ENAMETOOLONG); 1427 1418 1428 1419 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 1420 + if (IS_ERR(bh)) 1421 + return (struct dentry *) bh; 1429 1422 inode = NULL; 1430 1423 if (bh) { 1431 1424 __u32 ino = le32_to_cpu(de->inode); ··· 1461 1450 struct buffer_head *bh; 1462 1451 1463 1452 bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); 1453 + if (IS_ERR(bh)) 1454 + return (struct dentry *) bh; 1464 1455 if (!bh) 1465 1456 return ERR_PTR(-ENOENT); 1466 1457 ino = le32_to_cpu(de->inode); ··· 2740 2727 2741 2728 retval = -ENOENT; 2742 2729 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 2730 + if (IS_ERR(bh)) 2731 + return PTR_ERR(bh); 2743 2732 if (!bh) 2744 2733 goto end_rmdir; 2745 2734 ··· 2809 2794 2810 2795 retval = -ENOENT; 2811 2796 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 2797 + if (IS_ERR(bh)) 2798 + return PTR_ERR(bh); 2812 2799 if (!bh) 2813 2800 goto end_unlink; 2814 2801 ··· 3138 3121 struct ext4_dir_entry_2 *de; 3139 3122 3140 3123 bh = ext4_find_entry(dir, d_name, &de, NULL); 3124 + if (IS_ERR(bh)) 3125 + return PTR_ERR(bh); 3141 
3126 if (bh) { 3142 3127 retval = ext4_delete_entry(handle, dir, de, bh); 3143 3128 brelse(bh); ··· 3147 3128 return retval; 3148 3129 } 3149 3130 3150 - static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) 3131 + static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, 3132 + int force_reread) 3151 3133 { 3152 3134 int retval; 3153 3135 /* ··· 3160 3140 if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || 3161 3141 ent->de->name_len != ent->dentry->d_name.len || 3162 3142 strncmp(ent->de->name, ent->dentry->d_name.name, 3163 - ent->de->name_len)) { 3143 + ent->de->name_len) || 3144 + force_reread) { 3164 3145 retval = ext4_find_delete_entry(handle, ent->dir, 3165 3146 &ent->dentry->d_name); 3166 3147 } else { ··· 3212 3191 .dentry = new_dentry, 3213 3192 .inode = new_dentry->d_inode, 3214 3193 }; 3194 + int force_reread; 3215 3195 int retval; 3216 3196 3217 3197 dquot_initialize(old.dir); ··· 3224 3202 dquot_initialize(new.inode); 3225 3203 3226 3204 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); 3205 + if (IS_ERR(old.bh)) 3206 + return PTR_ERR(old.bh); 3227 3207 /* 3228 3208 * Check for inode number is _not_ due to possible IO errors. 
3229 3209 * We might rmdir the source, keep it as pwd of some process ··· 3238 3214 3239 3215 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, 3240 3216 &new.de, &new.inlined); 3217 + if (IS_ERR(new.bh)) { 3218 + retval = PTR_ERR(new.bh); 3219 + goto end_rename; 3220 + } 3241 3221 if (new.bh) { 3242 3222 if (!new.inode) { 3243 3223 brelse(new.bh); ··· 3274 3246 if (retval) 3275 3247 goto end_rename; 3276 3248 } 3249 + /* 3250 + * If we're renaming a file within an inline_data dir and adding or 3251 + * setting the new dirent causes a conversion from inline_data to 3252 + * extents/blockmap, we need to force the dirent delete code to 3253 + * re-read the directory, or else we end up trying to delete a dirent 3254 + * from what is now the extent tree root (or a block map). 3255 + */ 3256 + force_reread = (new.dir->i_ino == old.dir->i_ino && 3257 + ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); 3277 3258 if (!new.bh) { 3278 3259 retval = ext4_add_entry(handle, new.dentry, old.inode); 3279 3260 if (retval) ··· 3293 3256 if (retval) 3294 3257 goto end_rename; 3295 3258 } 3259 + if (force_reread) 3260 + force_reread = !ext4_test_inode_flag(new.dir, 3261 + EXT4_INODE_INLINE_DATA); 3296 3262 3297 3263 /* 3298 3264 * Like most other Unix systems, set the ctime for inodes on a ··· 3307 3267 /* 3308 3268 * ok, that's it 3309 3269 */ 3310 - ext4_rename_delete(handle, &old); 3270 + ext4_rename_delete(handle, &old, force_reread); 3311 3271 3312 3272 if (new.inode) { 3313 3273 ext4_dec_count(handle, new.inode); ··· 3370 3330 3371 3331 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, 3372 3332 &old.de, &old.inlined); 3333 + if (IS_ERR(old.bh)) 3334 + return PTR_ERR(old.bh); 3373 3335 /* 3374 3336 * Check for inode number is _not_ due to possible IO errors. 
3375 3337 * We might rmdir the source, keep it as pwd of some process ··· 3384 3342 3385 3343 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, 3386 3344 &new.de, &new.inlined); 3345 + if (IS_ERR(new.bh)) { 3346 + retval = PTR_ERR(new.bh); 3347 + goto end_rename; 3348 + } 3387 3349 3388 3350 /* RENAME_EXCHANGE case: old *and* new must both exist */ 3389 3351 if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
+3 -2
fs/ext4/super.c
··· 3181 3181 3182 3182 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3183 3183 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { 3184 - /* journal checksum v2 */ 3184 + /* journal checksum v3 */ 3185 3185 compat = 0; 3186 - incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2; 3186 + incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; 3187 3187 } else { 3188 3188 /* journal checksum v1 */ 3189 3189 compat = JBD2_FEATURE_COMPAT_CHECKSUM; ··· 3205 3205 jbd2_journal_clear_features(sbi->s_journal, 3206 3206 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 3207 3207 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | 3208 + JBD2_FEATURE_INCOMPAT_CSUM_V3 | 3208 3209 JBD2_FEATURE_INCOMPAT_CSUM_V2); 3209 3210 } 3210 3211
+12 -9
fs/jbd2/commit.c
··· 97 97 struct commit_header *h; 98 98 __u32 csum; 99 99 100 - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 100 + if (!jbd2_journal_has_csum_v2or3(j)) 101 101 return; 102 102 103 103 h = (struct commit_header *)(bh->b_data); ··· 313 313 return checksum; 314 314 } 315 315 316 - static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, 316 + static void write_tag_block(journal_t *j, journal_block_tag_t *tag, 317 317 unsigned long long block) 318 318 { 319 319 tag->t_blocknr = cpu_to_be32(block & (u32)~0); 320 - if (tag_bytes > JBD2_TAG_SIZE32) 320 + if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT)) 321 321 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); 322 322 } 323 323 ··· 327 327 struct jbd2_journal_block_tail *tail; 328 328 __u32 csum; 329 329 330 - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 330 + if (!jbd2_journal_has_csum_v2or3(j)) 331 331 return; 332 332 333 333 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - ··· 340 340 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, 341 341 struct buffer_head *bh, __u32 sequence) 342 342 { 343 + journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; 343 344 struct page *page = bh->b_page; 344 345 __u8 *addr; 345 346 __u32 csum32; 346 347 __be32 seq; 347 348 348 - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 349 + if (!jbd2_journal_has_csum_v2or3(j)) 349 350 return; 350 351 351 352 seq = cpu_to_be32(sequence); ··· 356 355 bh->b_size); 357 356 kunmap_atomic(addr); 358 357 359 - /* We only have space to store the lower 16 bits of the crc32c. 
*/ 360 - tag->t_checksum = cpu_to_be16(csum32); 358 + if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) 359 + tag3->t_checksum = cpu_to_be32(csum32); 360 + else 361 + tag->t_checksum = cpu_to_be16(csum32); 361 362 } 362 363 /* 363 364 * jbd2_journal_commit_transaction ··· 399 396 LIST_HEAD(io_bufs); 400 397 LIST_HEAD(log_bufs); 401 398 402 - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 399 + if (jbd2_journal_has_csum_v2or3(journal)) 403 400 csum_size = sizeof(struct jbd2_journal_block_tail); 404 401 405 402 /* ··· 693 690 tag_flag |= JBD2_FLAG_SAME_UUID; 694 691 695 692 tag = (journal_block_tag_t *) tagp; 696 - write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); 693 + write_tag_block(journal, tag, jh2bh(jh)->b_blocknr); 697 694 tag->t_flags = cpu_to_be16(tag_flag); 698 695 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], 699 696 commit_transaction->t_tid);
+37 -19
fs/jbd2/journal.c
··· 124 124 /* Checksumming functions */ 125 125 static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 126 126 { 127 - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 127 + if (!jbd2_journal_has_csum_v2or3(j)) 128 128 return 1; 129 129 130 130 return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; ··· 145 145 146 146 static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) 147 147 { 148 - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 148 + if (!jbd2_journal_has_csum_v2or3(j)) 149 149 return 1; 150 150 151 151 return sb->s_checksum == jbd2_superblock_csum(j, sb); ··· 153 153 154 154 static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) 155 155 { 156 - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 156 + if (!jbd2_journal_has_csum_v2or3(j)) 157 157 return; 158 158 159 159 sb->s_checksum = jbd2_superblock_csum(j, sb); ··· 1522 1522 goto out; 1523 1523 } 1524 1524 1525 - if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && 1526 - JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1525 + if (jbd2_journal_has_csum_v2or3(journal) && 1526 + JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { 1527 1527 /* Can't have checksum v1 and v2 on at the same time! */ 1528 1528 printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " 1529 + "at the same time!\n"); 1530 + goto out; 1531 + } 1532 + 1533 + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) && 1534 + JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) { 1535 + /* Can't have checksum v2 and v3 at the same time! 
*/ 1536 + printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " 1529 1537 "at the same time!\n"); 1530 1538 goto out; 1531 1539 } ··· 1544 1536 } 1545 1537 1546 1538 /* Load the checksum driver */ 1547 - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1539 + if (jbd2_journal_has_csum_v2or3(journal)) { 1548 1540 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); 1549 1541 if (IS_ERR(journal->j_chksum_driver)) { 1550 1542 printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); ··· 1561 1553 } 1562 1554 1563 1555 /* Precompute checksum seed for all metadata */ 1564 - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 1556 + if (jbd2_journal_has_csum_v2or3(journal)) 1565 1557 journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, 1566 1558 sizeof(sb->s_uuid)); 1567 1559 ··· 1821 1813 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) 1822 1814 return 0; 1823 1815 1824 - /* Asking for checksumming v2 and v1? Only give them v2. */ 1825 - if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 && 1816 + /* If enabling v2 checksums, turn on v3 instead */ 1817 + if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) { 1818 + incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2; 1819 + incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3; 1820 + } 1821 + 1822 + /* Asking for checksumming v3 and v1? Only give them v3. 
*/ 1823 + if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 && 1826 1824 compat & JBD2_FEATURE_COMPAT_CHECKSUM) 1827 1825 compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; 1828 1826 ··· 1837 1823 1838 1824 sb = journal->j_superblock; 1839 1825 1840 - /* If enabling v2 checksums, update superblock */ 1841 - if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1826 + /* If enabling v3 checksums, update superblock */ 1827 + if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { 1842 1828 sb->s_checksum_type = JBD2_CRC32C_CHKSUM; 1843 1829 sb->s_feature_compat &= 1844 1830 ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); ··· 1856 1842 } 1857 1843 1858 1844 /* Precompute checksum seed for all metadata */ 1859 - if (JBD2_HAS_INCOMPAT_FEATURE(journal, 1860 - JBD2_FEATURE_INCOMPAT_CSUM_V2)) 1845 + if (jbd2_journal_has_csum_v2or3(journal)) 1861 1846 journal->j_csum_seed = jbd2_chksum(journal, ~0, 1862 1847 sb->s_uuid, 1863 1848 sizeof(sb->s_uuid)); ··· 1865 1852 /* If enabling v1 checksums, downgrade superblock */ 1866 1853 if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) 1867 1854 sb->s_feature_incompat &= 1868 - ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2); 1855 + ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 | 1856 + JBD2_FEATURE_INCOMPAT_CSUM_V3); 1869 1857 1870 1858 sb->s_feature_compat |= cpu_to_be32(compat); 1871 1859 sb->s_feature_ro_compat |= cpu_to_be32(ro); ··· 2179 2165 */ 2180 2166 size_t journal_tag_bytes(journal_t *journal) 2181 2167 { 2182 - journal_block_tag_t tag; 2183 - size_t x = 0; 2168 + size_t sz; 2169 + 2170 + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) 2171 + return sizeof(journal_block_tag3_t); 2172 + 2173 + sz = sizeof(journal_block_tag_t); 2184 2174 2185 2175 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 2186 - x += sizeof(tag.t_checksum); 2176 + sz += sizeof(__u16); 2187 2177 2188 2178 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) 2189 - return x + JBD2_TAG_SIZE64; 2179 + return sz; 
2190 2180 else 2191 - return x + JBD2_TAG_SIZE32; 2181 + return sz - sizeof(__u32); 2192 2182 } 2193 2183 2194 2184 /*
+20 -13
fs/jbd2/recovery.c
··· 181 181 __be32 provided; 182 182 __u32 calculated; 183 183 184 - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 184 + if (!jbd2_journal_has_csum_v2or3(j)) 185 185 return 1; 186 186 187 187 tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - ··· 205 205 int nr = 0, size = journal->j_blocksize; 206 206 int tag_bytes = journal_tag_bytes(journal); 207 207 208 - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 208 + if (jbd2_journal_has_csum_v2or3(journal)) 209 209 size -= sizeof(struct jbd2_journal_block_tail); 210 210 211 211 tagp = &bh->b_data[sizeof(journal_header_t)]; ··· 338 338 return err; 339 339 } 340 340 341 - static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag) 341 + static inline unsigned long long read_tag_block(journal_t *journal, 342 + journal_block_tag_t *tag) 342 343 { 343 344 unsigned long long block = be32_to_cpu(tag->t_blocknr); 344 - if (tag_bytes > JBD2_TAG_SIZE32) 345 + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) 345 346 block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; 346 347 return block; 347 348 } ··· 385 384 __be32 provided; 386 385 __u32 calculated; 387 386 388 - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 387 + if (!jbd2_journal_has_csum_v2or3(j)) 389 388 return 1; 390 389 391 390 h = buf; ··· 400 399 static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, 401 400 void *buf, __u32 sequence) 402 401 { 402 + journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; 403 403 __u32 csum32; 404 404 __be32 seq; 405 405 406 - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 406 + if (!jbd2_journal_has_csum_v2or3(j)) 407 407 return 1; 408 408 409 409 seq = cpu_to_be32(sequence); 410 410 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); 411 411 csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); 412 412 413 - return tag->t_checksum == 
cpu_to_be16(csum32); 413 + if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) 414 + return tag3->t_checksum == cpu_to_be32(csum32); 415 + else 416 + return tag->t_checksum == cpu_to_be16(csum32); 414 417 } 415 418 416 419 static int do_one_pass(journal_t *journal, ··· 431 426 int tag_bytes = journal_tag_bytes(journal); 432 427 __u32 crc32_sum = ~0; /* Transactional Checksums */ 433 428 int descr_csum_size = 0; 429 + int block_error = 0; 434 430 435 431 /* 436 432 * First thing is to establish what we expect to find in the log ··· 518 512 switch(blocktype) { 519 513 case JBD2_DESCRIPTOR_BLOCK: 520 514 /* Verify checksum first */ 521 - if (JBD2_HAS_INCOMPAT_FEATURE(journal, 522 - JBD2_FEATURE_INCOMPAT_CSUM_V2)) 515 + if (jbd2_journal_has_csum_v2or3(journal)) 523 516 descr_csum_size = 524 517 sizeof(struct jbd2_journal_block_tail); 525 518 if (descr_csum_size > 0 && ··· 579 574 unsigned long long blocknr; 580 575 581 576 J_ASSERT(obh != NULL); 582 - blocknr = read_tag_block(tag_bytes, 577 + blocknr = read_tag_block(journal, 583 578 tag); 584 579 585 580 /* If the block has been ··· 603 598 "checksum recovering " 604 599 "block %llu in log\n", 605 600 blocknr); 606 - continue; 601 + block_error = 1; 602 + goto skip_write; 607 603 } 608 604 609 605 /* Find a buffer for the new ··· 803 797 success = -EIO; 804 798 } 805 799 } 806 - 800 + if (block_error && success == 0) 801 + success = -EIO; 807 802 return success; 808 803 809 804 failed: ··· 818 811 __be32 provided; 819 812 __u32 calculated; 820 813 821 - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 814 + if (!jbd2_journal_has_csum_v2or3(j)) 822 815 return 1; 823 816 824 817 tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
+3 -3
fs/jbd2/revoke.c
··· 91 91 #include <linux/list.h> 92 92 #include <linux/init.h> 93 93 #include <linux/bio.h> 94 - #endif 95 94 #include <linux/log2.h> 95 + #endif 96 96 97 97 static struct kmem_cache *jbd2_revoke_record_cache; 98 98 static struct kmem_cache *jbd2_revoke_table_cache; ··· 597 597 offset = *offsetp; 598 598 599 599 /* Do we need to leave space at the end for a checksum? */ 600 - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 600 + if (jbd2_journal_has_csum_v2or3(journal)) 601 601 csum_size = sizeof(struct jbd2_journal_revoke_tail); 602 602 603 603 /* Make sure we have a descriptor with space left for the record */ ··· 644 644 struct jbd2_journal_revoke_tail *tail; 645 645 __u32 csum; 646 646 647 - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 647 + if (!jbd2_journal_has_csum_v2or3(j)) 648 648 return; 649 649 650 650 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
+25 -5
include/linux/jbd2.h
··· 159 159 * journal_block_tag (in the descriptor). The other h_chksum* fields are 160 160 * not used. 161 161 * 162 - * Checksum v1 and v2 are mutually exclusive features. 162 + * If FEATURE_INCOMPAT_CSUM_V3 is set, the descriptor block uses 163 + * journal_block_tag3_t to store a full 32-bit checksum. Everything else 164 + * is the same as v2. 165 + * 166 + * Checksum v1, v2, and v3 are mutually exclusive features. 163 167 */ 164 168 struct commit_header { 165 169 __be32 h_magic; ··· 183 179 * raw struct shouldn't be used for pointer math or sizeof() - use 184 180 * journal_tag_bytes(journal) instead to compute this. 185 181 */ 182 + typedef struct journal_block_tag3_s 183 + { 184 + __be32 t_blocknr; /* The on-disk block number */ 185 + __be32 t_flags; /* See below */ 186 + __be32 t_blocknr_high; /* most-significant high 32bits. */ 187 + __be32 t_checksum; /* crc32c(uuid+seq+block) */ 188 + } journal_block_tag3_t; 189 + 186 190 typedef struct journal_block_tag_s 187 191 { 188 192 __be32 t_blocknr; /* The on-disk block number */ ··· 198 186 __be16 t_flags; /* See below */ 199 187 __be32 t_blocknr_high; /* most-significant high 32bits. 
*/ 200 188 } journal_block_tag_t; 201 - 202 - #define JBD2_TAG_SIZE32 (offsetof(journal_block_tag_t, t_blocknr_high)) 203 - #define JBD2_TAG_SIZE64 (sizeof(journal_block_tag_t)) 204 189 205 190 /* Tail of descriptor block, for checksumming */ 206 191 struct jbd2_journal_block_tail { ··· 293 284 #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 294 285 #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 295 286 #define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008 287 + #define JBD2_FEATURE_INCOMPAT_CSUM_V3 0x00000010 296 288 297 289 /* Features known to this kernel version: */ 298 290 #define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM ··· 301 291 #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ 302 292 JBD2_FEATURE_INCOMPAT_64BIT | \ 303 293 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \ 304 - JBD2_FEATURE_INCOMPAT_CSUM_V2) 294 + JBD2_FEATURE_INCOMPAT_CSUM_V2 | \ 295 + JBD2_FEATURE_INCOMPAT_CSUM_V3) 305 296 306 297 #ifdef __KERNEL__ 307 298 ··· 1306 1295 1307 1296 extern int jbd2_journal_blocks_per_page(struct inode *inode); 1308 1297 extern size_t journal_tag_bytes(journal_t *journal); 1298 + 1299 + static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) 1300 + { 1301 + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) || 1302 + JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) 1303 + return 1; 1304 + 1305 + return 0; 1306 + } 1309 1307 1310 1308 /* 1311 1309 * We reserve t_outstanding_credits >> JBD2_CONTROL_BLOCKS_SHIFT for