Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: limit the length of per-inode prealloc list

When writing sparse files, the per-inode prealloc list can grow very
long, resulting in high overhead in ext4_mb_use_preallocated(). To
avoid this problem, limit the maximum length of the per-inode prealloc
list to 512 by default, and allow users to change the limit through the
mb_max_inode_prealloc sysfs attribute.

After applying the patch, we observed that the share of CPU time spent
in system mode dropped and that system throughput increased
significantly. To measure this, we ran a process that writes a sparse
file; its running time on the fixed kernel was significantly shorter,
as follows:

Running time on the unfixed kernel:
[root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
real 0m2.051s
user 0m0.008s
sys 0m2.026s

Running time on the fixed kernel:
[root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
real 0m0.471s
user 0m0.004s
sys 0m0.395s

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Link: https://lore.kernel.org/r/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

authored by

brookxu and committed by
Theodore Ts'o
27bc446e 66d5e027

+104 -29
+3
Documentation/admin-guide/ext4.rst
··· 482 482 multiple of this tuning parameter if the stripe size is not set in the 483 483 ext4 superblock 484 484 485 + mb_max_inode_prealloc 486 + The maximum length of per-inode ext4_prealloc_space list. 487 + 485 488 mb_max_to_scan 486 489 The maximum number of extents the multiblock allocator will search to 487 490 find the best extent.
+3 -1
fs/ext4/ext4.h
··· 1070 1070 struct timespec64 i_crtime; 1071 1071 1072 1072 /* mballoc */ 1073 + atomic_t i_prealloc_active; 1073 1074 struct list_head i_prealloc_list; 1074 1075 spinlock_t i_prealloc_lock; 1075 1076 ··· 1519 1518 unsigned int s_mb_stats; 1520 1519 unsigned int s_mb_order2_reqs; 1521 1520 unsigned int s_mb_group_prealloc; 1521 + unsigned int s_mb_max_inode_prealloc; 1522 1522 unsigned int s_max_dir_size_kb; 1523 1523 /* where last allocation was done - for stream allocation */ 1524 1524 unsigned long s_mb_last_group; ··· 2684 2682 extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, 2685 2683 struct ext4_allocation_request *, int *); 2686 2684 extern int ext4_mb_reserve_blocks(struct super_block *, int); 2687 - extern void ext4_discard_preallocations(struct inode *); 2685 + extern void ext4_discard_preallocations(struct inode *, unsigned int); 2688 2686 extern int __init ext4_init_mballoc(void); 2689 2687 extern void ext4_exit_mballoc(void); 2690 2688 extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
+5 -5
fs/ext4/extents.c
··· 100 100 * i_mutex. So we can safely drop the i_data_sem here. 101 101 */ 102 102 BUG_ON(EXT4_JOURNAL(inode) == NULL); 103 - ext4_discard_preallocations(inode); 103 + ext4_discard_preallocations(inode, 0); 104 104 up_write(&EXT4_I(inode)->i_data_sem); 105 105 *dropped = 1; 106 106 return 0; ··· 4266 4266 * not a good idea to call discard here directly, 4267 4267 * but otherwise we'd need to call it every free(). 4268 4268 */ 4269 - ext4_discard_preallocations(inode); 4269 + ext4_discard_preallocations(inode, 0); 4270 4270 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 4271 4271 fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; 4272 4272 ext4_free_blocks(handle, inode, NULL, newblock, ··· 5293 5293 } 5294 5294 5295 5295 down_write(&EXT4_I(inode)->i_data_sem); 5296 - ext4_discard_preallocations(inode); 5296 + ext4_discard_preallocations(inode, 0); 5297 5297 5298 5298 ret = ext4_es_remove_extent(inode, punch_start, 5299 5299 EXT_MAX_BLOCKS - punch_start); ··· 5307 5307 up_write(&EXT4_I(inode)->i_data_sem); 5308 5308 goto out_stop; 5309 5309 } 5310 - ext4_discard_preallocations(inode); 5310 + ext4_discard_preallocations(inode, 0); 5311 5311 5312 5312 ret = ext4_ext_shift_extents(inode, handle, punch_stop, 5313 5313 punch_stop - punch_start, SHIFT_LEFT); ··· 5439 5439 goto out_stop; 5440 5440 5441 5441 down_write(&EXT4_I(inode)->i_data_sem); 5442 - ext4_discard_preallocations(inode); 5442 + ext4_discard_preallocations(inode, 0); 5443 5443 5444 5444 path = ext4_find_extent(inode, offset_lblk, NULL, 0); 5445 5445 if (IS_ERR(path)) {
+1 -1
fs/ext4/file.c
··· 147 147 (atomic_read(&inode->i_writecount) == 1) && 148 148 !EXT4_I(inode)->i_reserved_data_blocks) { 149 149 down_write(&EXT4_I(inode)->i_data_sem); 150 - ext4_discard_preallocations(inode); 150 + ext4_discard_preallocations(inode, 0); 151 151 up_write(&EXT4_I(inode)->i_data_sem); 152 152 } 153 153 if (is_dx(inode) && filp->private_data)
+1 -1
fs/ext4/indirect.c
··· 696 696 * i_mutex. So we can safely drop the i_data_sem here. 697 697 */ 698 698 BUG_ON(EXT4_JOURNAL(inode) == NULL); 699 - ext4_discard_preallocations(inode); 699 + ext4_discard_preallocations(inode, 0); 700 700 up_write(&EXT4_I(inode)->i_data_sem); 701 701 *dropped = 1; 702 702 return 0;
+3 -3
fs/ext4/inode.c
··· 383 383 */ 384 384 if ((ei->i_reserved_data_blocks == 0) && 385 385 !inode_is_open_for_write(inode)) 386 - ext4_discard_preallocations(inode); 386 + ext4_discard_preallocations(inode, 0); 387 387 } 388 388 389 389 static int __check_block_validity(struct inode *inode, const char *func, ··· 4055 4055 if (stop_block > first_block) { 4056 4056 4057 4057 down_write(&EXT4_I(inode)->i_data_sem); 4058 - ext4_discard_preallocations(inode); 4058 + ext4_discard_preallocations(inode, 0); 4059 4059 4060 4060 ret = ext4_es_remove_extent(inode, first_block, 4061 4061 stop_block - first_block); ··· 4210 4210 4211 4211 down_write(&EXT4_I(inode)->i_data_sem); 4212 4212 4213 - ext4_discard_preallocations(inode); 4213 + ext4_discard_preallocations(inode, 0); 4214 4214 4215 4215 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4216 4216 err = ext4_ext_truncate(handle, inode);
+1 -1
fs/ext4/ioctl.c
··· 202 202 reset_inode_seed(inode); 203 203 reset_inode_seed(inode_bl); 204 204 205 - ext4_discard_preallocations(inode); 205 + ext4_discard_preallocations(inode, 0); 206 206 207 207 err = ext4_mark_inode_dirty(handle, inode); 208 208 if (err < 0) {
+66 -8
fs/ext4/mballoc.c
··· 2878 2878 sbi->s_mb_stats = MB_DEFAULT_STATS; 2879 2879 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2880 2880 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2881 + sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; 2881 2882 /* 2882 2883 * The default group preallocation is 512, which for 4k block 2883 2884 * sizes translates to 2 megabytes. However for bigalloc file ··· 3817 3816 mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); 3818 3817 } 3819 3818 3819 + static void ext4_mb_mark_pa_deleted(struct super_block *sb, 3820 + struct ext4_prealloc_space *pa) 3821 + { 3822 + struct ext4_inode_info *ei; 3823 + 3824 + if (pa->pa_deleted) { 3825 + ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", 3826 + pa->pa_type, pa->pa_pstart, pa->pa_lstart, 3827 + pa->pa_len); 3828 + return; 3829 + } 3830 + 3831 + pa->pa_deleted = 1; 3832 + 3833 + if (pa->pa_type == MB_INODE_PA) { 3834 + ei = EXT4_I(pa->pa_inode); 3835 + atomic_dec(&ei->i_prealloc_active); 3836 + } 3837 + } 3838 + 3820 3839 static void ext4_mb_pa_callback(struct rcu_head *head) 3821 3840 { 3822 3841 struct ext4_prealloc_space *pa; ··· 3869 3848 return; 3870 3849 } 3871 3850 3872 - pa->pa_deleted = 1; 3851 + ext4_mb_mark_pa_deleted(sb, pa); 3873 3852 spin_unlock(&pa->pa_lock); 3874 3853 3875 3854 grp_blk = pa->pa_pstart; ··· 3993 3972 spin_lock(pa->pa_obj_lock); 3994 3973 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); 3995 3974 spin_unlock(pa->pa_obj_lock); 3975 + atomic_inc(&ei->i_prealloc_active); 3996 3976 } 3997 3977 3998 3978 /* ··· 4204 4182 } 4205 4183 4206 4184 /* seems this one can be freed ... */ 4207 - pa->pa_deleted = 1; 4185 + ext4_mb_mark_pa_deleted(sb, pa); 4208 4186 4209 4187 /* we can trust pa_free ... */ 4210 4188 free += pa->pa_free; ··· 4267 4245 * 4268 4246 * FIXME!! 
Make sure it is valid at all the call sites 4269 4247 */ 4270 - void ext4_discard_preallocations(struct inode *inode) 4248 + void ext4_discard_preallocations(struct inode *inode, unsigned int needed) 4271 4249 { 4272 4250 struct ext4_inode_info *ei = EXT4_I(inode); 4273 4251 struct super_block *sb = inode->i_sb; ··· 4285 4263 4286 4264 mb_debug(sb, "discard preallocation for inode %lu\n", 4287 4265 inode->i_ino); 4288 - trace_ext4_discard_preallocations(inode); 4266 + trace_ext4_discard_preallocations(inode, 4267 + atomic_read(&ei->i_prealloc_active), needed); 4289 4268 4290 4269 INIT_LIST_HEAD(&list); 4270 + 4271 + if (needed == 0) 4272 + needed = UINT_MAX; 4291 4273 4292 4274 repeat: 4293 4275 /* first, collect all pa's in the inode */ 4294 4276 spin_lock(&ei->i_prealloc_lock); 4295 - while (!list_empty(&ei->i_prealloc_list)) { 4296 - pa = list_entry(ei->i_prealloc_list.next, 4277 + while (!list_empty(&ei->i_prealloc_list) && needed) { 4278 + pa = list_entry(ei->i_prealloc_list.prev, 4297 4279 struct ext4_prealloc_space, pa_inode_list); 4298 4280 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); 4299 4281 spin_lock(&pa->pa_lock); ··· 4314 4288 4315 4289 } 4316 4290 if (pa->pa_deleted == 0) { 4317 - pa->pa_deleted = 1; 4291 + ext4_mb_mark_pa_deleted(sb, pa); 4318 4292 spin_unlock(&pa->pa_lock); 4319 4293 list_del_rcu(&pa->pa_inode_list); 4320 4294 list_add(&pa->u.pa_tmp_list, &list); 4295 + needed--; 4321 4296 continue; 4322 4297 } 4323 4298 ··· 4619 4592 BUG_ON(pa->pa_type != MB_GROUP_PA); 4620 4593 4621 4594 /* seems this one can be freed ... 
*/ 4622 - pa->pa_deleted = 1; 4595 + ext4_mb_mark_pa_deleted(sb, pa); 4623 4596 spin_unlock(&pa->pa_lock); 4624 4597 4625 4598 list_del_rcu(&pa->pa_inode_list); ··· 4718 4691 } 4719 4692 4720 4693 /* 4694 + * if per-inode prealloc list is too long, trim some PA 4695 + */ 4696 + static void ext4_mb_trim_inode_pa(struct inode *inode) 4697 + { 4698 + struct ext4_inode_info *ei = EXT4_I(inode); 4699 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4700 + int count, delta; 4701 + 4702 + count = atomic_read(&ei->i_prealloc_active); 4703 + delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1; 4704 + if (count > sbi->s_mb_max_inode_prealloc + delta) { 4705 + count -= sbi->s_mb_max_inode_prealloc; 4706 + ext4_discard_preallocations(inode, count); 4707 + } 4708 + } 4709 + 4710 + /* 4721 4711 * release all resource we used in allocation 4722 4712 */ 4723 4713 static int ext4_mb_release_context(struct ext4_allocation_context *ac) 4724 4714 { 4715 + struct inode *inode = ac->ac_inode; 4716 + struct ext4_inode_info *ei = EXT4_I(inode); 4725 4717 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 4726 4718 struct ext4_prealloc_space *pa = ac->ac_pa; 4727 4719 if (pa) { ··· 4766 4720 ext4_mb_add_n_trim(ac); 4767 4721 } 4768 4722 } 4723 + 4724 + if (pa->pa_type == MB_INODE_PA) { 4725 + /* 4726 + * treat per-inode prealloc list as a lru list, then try 4727 + * to trim the least recently used PA. 4728 + */ 4729 + spin_lock(pa->pa_obj_lock); 4730 + list_move(&pa->pa_inode_list, &ei->i_prealloc_list); 4731 + spin_unlock(pa->pa_obj_lock); 4732 + } 4733 + 4769 4734 ext4_mb_put_pa(ac, ac->ac_sb, pa); 4770 4735 } 4771 4736 if (ac->ac_bitmap_page) ··· 4786 4729 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 4787 4730 mutex_unlock(&ac->ac_lg->lg_mutex); 4788 4731 ext4_mb_collect_stats(ac); 4732 + ext4_mb_trim_inode_pa(inode); 4789 4733 return 0; 4790 4734 } 4791 4735
+4
fs/ext4/mballoc.h
··· 73 73 */ 74 74 #define MB_DEFAULT_GROUP_PREALLOC 512 75 75 76 + /* 77 + * maximum length of inode prealloc list 78 + */ 79 + #define MB_DEFAULT_MAX_INODE_PREALLOC 512 76 80 77 81 struct ext4_free_data { 78 82 /* this links the free block information from sb_info */
+2 -2
fs/ext4/move_extent.c
··· 686 686 687 687 out: 688 688 if (*moved_len) { 689 - ext4_discard_preallocations(orig_inode); 690 - ext4_discard_preallocations(donor_inode); 689 + ext4_discard_preallocations(orig_inode, 0); 690 + ext4_discard_preallocations(donor_inode, 0); 691 691 } 692 692 693 693 ext4_ext_drop_refs(path);
+2 -1
fs/ext4/super.c
··· 1127 1127 inode_set_iversion(&ei->vfs_inode, 1); 1128 1128 spin_lock_init(&ei->i_raw_lock); 1129 1129 INIT_LIST_HEAD(&ei->i_prealloc_list); 1130 + atomic_set(&ei->i_prealloc_active, 0); 1130 1131 spin_lock_init(&ei->i_prealloc_lock); 1131 1132 ext4_es_init_tree(&ei->i_es_tree); 1132 1133 rwlock_init(&ei->i_es_lock); ··· 1221 1220 { 1222 1221 invalidate_inode_buffers(inode); 1223 1222 clear_inode(inode); 1224 - ext4_discard_preallocations(inode); 1223 + ext4_discard_preallocations(inode, 0); 1225 1224 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); 1226 1225 dquot_drop(inode); 1227 1226 if (EXT4_I(inode)->jinode) {
+2
fs/ext4/sysfs.c
··· 218 218 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 219 219 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 220 220 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 221 + EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); 221 222 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); 222 223 EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); 223 224 EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); ··· 265 264 ATTR_LIST(mb_order2_req), 266 265 ATTR_LIST(mb_stream_req), 267 266 ATTR_LIST(mb_group_prealloc), 267 + ATTR_LIST(mb_max_inode_prealloc), 268 268 ATTR_LIST(max_writeback_mb_bump), 269 269 ATTR_LIST(extent_max_zeroout_kb), 270 270 ATTR_LIST(trigger_fs_error),
+11 -6
include/trace/events/ext4.h
··· 746 746 ); 747 747 748 748 TRACE_EVENT(ext4_discard_preallocations, 749 - TP_PROTO(struct inode *inode), 749 + TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed), 750 750 751 - TP_ARGS(inode), 751 + TP_ARGS(inode, len, needed), 752 752 753 753 TP_STRUCT__entry( 754 - __field( dev_t, dev ) 755 - __field( ino_t, ino ) 754 + __field( dev_t, dev ) 755 + __field( ino_t, ino ) 756 + __field( unsigned int, len ) 757 + __field( unsigned int, needed ) 756 758 757 759 ), 758 760 759 761 TP_fast_assign( 760 762 __entry->dev = inode->i_sb->s_dev; 761 763 __entry->ino = inode->i_ino; 764 + __entry->len = len; 765 + __entry->needed = needed; 762 766 ), 763 767 764 - TP_printk("dev %d,%d ino %lu", 768 + TP_printk("dev %d,%d ino %lu len: %u needed %u", 765 769 MAJOR(__entry->dev), MINOR(__entry->dev), 766 - (unsigned long) __entry->ino) 770 + (unsigned long) __entry->ino, __entry->len, 771 + __entry->needed) 767 772 ); 768 773 769 774 TRACE_EVENT(ext4_mb_discard_preallocations,