Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: make the zero-out chunk size tunable

Currently in ext4 the length of the zero-out chunk is set to 7 file system
blocks. But if an inode has uninitialized extents from using
fallocate to preallocate space, and the workload issues many random
writes, the extents become fragmented and the extent tree grows
unnecessarily.

So create a new sysfs tunable, extent_max_zeroout_kb, which controls
the maximum size where blocks will be zeroed out instead of creating a
new uninitialized extent. The default of this has been set to 32kb.

CC: Zach Brown <zab@zabbo.net>
CC: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

authored by

Zheng Liu and committed by
Theodore Ts'o
67a5da56 81370291

+32 -12
+13
Documentation/ABI/testing/sysfs-fs-ext4
··· 96 96 Description: 97 97 The maximum number of megabytes the writeback code will 98 98 try to write out before move on to another inode. 99 + 100 + What: /sys/fs/ext4/<disk>/extent_max_zeroout_kb 101 + Date: August 2012 102 + Contact: "Theodore Ts'o" <tytso@mit.edu> 103 + Description: 104 + The maximum number of kilobytes which will be zeroed 105 + out in preference to creating a new uninitialized 106 + extent when manipulating an inode's extent tree. Note 107 + that using a larger value will increase the 108 + variability of time necessary to complete a random 109 + write operation (since a 4k random write might turn 110 + into a much larger write due to the zeroout 111 + operation).
+3
fs/ext4/ext4.h
··· 1271 1271 unsigned long s_sectors_written_start; 1272 1272 u64 s_kbytes_written; 1273 1273 1274 + /* the size of zero-out chunk */ 1275 + unsigned int s_extent_max_zeroout_kb; 1276 + 1274 1277 unsigned int s_log_groups_per_flex; 1275 1278 struct flex_groups *s_flex_groups; 1276 1279
+13 -12
fs/ext4/extents.c
··· 3085 3085 return err ? err : map->m_len; 3086 3086 } 3087 3087 3088 - #define EXT4_EXT_ZERO_LEN 7 3089 3088 /* 3090 3089 * This function is called by ext4_ext_map_blocks() if someone tries to write 3091 3090 * to an uninitialized extent. It may result in splitting the uninitialized ··· 3110 3111 struct ext4_map_blocks *map, 3111 3112 struct ext4_ext_path *path) 3112 3113 { 3114 + struct ext4_sb_info *sbi; 3113 3115 struct ext4_extent_header *eh; 3114 3116 struct ext4_map_blocks split_map; 3115 3117 struct ext4_extent zero_ex; 3116 3118 struct ext4_extent *ex; 3117 3119 ext4_lblk_t ee_block, eof_block; 3118 3120 unsigned int ee_len, depth; 3119 - int allocated; 3121 + int allocated, max_zeroout = 0; 3120 3122 int err = 0; 3121 3123 int split_flag = 0; 3122 3124 ··· 3125 3125 "block %llu, max_blocks %u\n", inode->i_ino, 3126 3126 (unsigned long long)map->m_lblk, map->m_len); 3127 3127 3128 + sbi = EXT4_SB(inode->i_sb); 3128 3129 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3129 3130 inode->i_sb->s_blocksize_bits; 3130 3131 if (eof_block < map->m_lblk + map->m_len) ··· 3225 3224 */ 3226 3225 split_flag |= ee_block + ee_len <= eof_block ? 
EXT4_EXT_MAY_ZEROOUT : 0; 3227 3226 3228 - /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ 3229 - if (ee_len <= 2*EXT4_EXT_ZERO_LEN && 3230 - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 3227 + if (EXT4_EXT_MAY_ZEROOUT & split_flag) 3228 + max_zeroout = sbi->s_extent_max_zeroout_kb >> 3229 + inode->i_sb->s_blocksize_bits; 3230 + 3231 + /* If extent is less than s_max_zeroout_kb, zeroout directly */ 3232 + if (max_zeroout && (ee_len <= max_zeroout)) { 3231 3233 err = ext4_ext_zeroout(inode, ex); 3232 3234 if (err) 3233 3235 goto out; ··· 3254 3250 split_map.m_lblk = map->m_lblk; 3255 3251 split_map.m_len = map->m_len; 3256 3252 3257 - if (allocated > map->m_len) { 3258 - if (allocated <= EXT4_EXT_ZERO_LEN && 3259 - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 3253 + if (max_zeroout && (allocated > map->m_len)) { 3254 + if (allocated <= max_zeroout) { 3260 3255 /* case 3 */ 3261 3256 zero_ex.ee_block = 3262 3257 cpu_to_le32(map->m_lblk); ··· 3267 3264 goto out; 3268 3265 split_map.m_lblk = map->m_lblk; 3269 3266 split_map.m_len = allocated; 3270 - } else if ((map->m_lblk - ee_block + map->m_len < 3271 - EXT4_EXT_ZERO_LEN) && 3272 - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 3267 + } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { 3273 3268 /* case 2 */ 3274 3269 if (map->m_lblk != ee_block) { 3275 3270 zero_ex.ee_block = ex->ee_block; ··· 3287 3286 } 3288 3287 3289 3288 allocated = ext4_split_extent(handle, inode, path, 3290 - &split_map, split_flag, 0); 3289 + &split_map, split_flag, 0); 3291 3290 if (allocated < 0) 3292 3291 err = allocated; 3293 3292
+3
fs/ext4/super.c
··· 2541 2541 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2542 2542 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2543 2543 EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); 2544 + EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); 2544 2545 EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); 2545 2546 2546 2547 static struct attribute *ext4_attrs[] = { ··· 2557 2556 ATTR_LIST(mb_stream_req), 2558 2557 ATTR_LIST(mb_group_prealloc), 2559 2558 ATTR_LIST(max_writeback_mb_bump), 2559 + ATTR_LIST(extent_max_zeroout_kb), 2560 2560 ATTR_LIST(trigger_fs_error), 2561 2561 NULL, 2562 2562 }; ··· 3758 3756 3759 3757 sbi->s_stripe = ext4_get_stripe_size(sbi); 3760 3758 sbi->s_max_writeback_mb_bump = 128; 3759 + sbi->s_extent_max_zeroout_kb = 32; 3761 3760 3762 3761 /* 3763 3762 * set up enough so that it can read an inode