Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: Add allocation criteria 1.5 (CR1_5)

CR1_5 aims to optimize allocations which can't be satisfied in CR1. The
fact that we couldn't find a group in CR1 suggests that it would be
difficult to find a continuous extent to completely satisfy our
allocations. So before falling to the slower CR2, in CR1.5 we
proactively trim the preallocations so we can find a group with
(free / fragments) big enough. This speeds up our allocation at the
cost of slightly reduced preallocation.

The patch also adds a new sysfs tunable:

* /sys/fs/ext4/<partition>/mb_cr1_5_max_trim_order

This controls how much CR1.5 can trim a request before falling to CR2.
For example, for a request of order 7 and max trim order 2, CR1.5 can
trim this up to order 5.

Suggested-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Link: https://lore.kernel.org/r/150fdf65c8e4cc4dba71e020ce0859bcf636a5ff.1685449706.git.ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

authored by

Ojaswin Mujoo and committed by
Theodore Ts'o
7e170922 856d865c

+150 -10
+7 -1
fs/ext4/ext4.h
··· 133 133 * criteria the slower the allocation. We start at lower criterias and keep 134 134 * falling back to higher ones if we are not able to find any blocks. 135 135 */ 136 - #define EXT4_MB_NUM_CRS 4 136 + #define EXT4_MB_NUM_CRS 5 137 137 /* 138 138 * All possible allocation criterias for mballoc 139 139 */ 140 140 enum criteria { 141 141 CR0, 142 142 CR1, 143 + CR1_5, 143 144 CR2, 144 145 CR3, 145 146 }; ··· 186 185 #define EXT4_MB_CR0_OPTIMIZED 0x8000 187 186 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ 188 187 #define EXT4_MB_CR1_OPTIMIZED 0x00010000 188 + /* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ 189 + #define EXT4_MB_CR1_5_OPTIMIZED 0x00020000 190 + 189 191 struct ext4_allocation_request { 190 192 /* target inode for block we're allocating */ 191 193 struct inode *inode; ··· 1553 1549 unsigned long s_mb_last_start; 1554 1550 unsigned int s_mb_prefetch; 1555 1551 unsigned int s_mb_prefetch_limit; 1552 + unsigned int s_mb_cr1_5_max_trim_order; 1556 1553 1557 1554 /* stats for buddy allocator */ 1558 1555 atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ··· 1568 1563 atomic_t s_bal_2orders; /* 2^order hits */ 1569 1564 atomic_t s_bal_cr0_bad_suggestions; 1570 1565 atomic_t s_bal_cr1_bad_suggestions; 1566 + atomic_t s_bal_cr1_5_bad_suggestions; 1571 1567 atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; 1572 1568 atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; 1573 1569 atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */
+126 -9
fs/ext4/mballoc.c
··· 165 165 * equal to request size using our average fragment size group lists (data 166 166 * structure 2) in O(1) time. 167 167 * 168 + * At CR1.5 (aka CR1_5), we aim to optimize allocations which can't be satisfied 169 + * in CR1. The fact that we couldn't find a group in CR1 suggests that there is 170 + * no BG that has average fragment size > goal length. So before falling to the 171 + * slower CR2, in CR1.5 we proactively trim goal length and then use the same 172 + * fragment lists as CR1 to find a BG with a big enough average fragment size. 173 + * This increases the chances of finding a suitable block group in O(1) time and 174 + * results * in faster allocation at the cost of reduced size of allocation. 175 + * 168 176 * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in 169 177 * linear order which requires O(N) search time for each CR0 and CR1 phase. 170 178 * ··· 970 962 *group = grp->bb_group; 971 963 ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; 972 964 } else { 965 + *new_cr = CR1_5; 966 + } 967 + } 968 + 969 + /* 970 + * We couldn't find a group in CR1 so try to find the highest free fragment 971 + * order we have and proactively trim the goal request length to that order to 972 + * find a suitable group faster. 973 + * 974 + * This optimizes allocation speed at the cost of slightly reduced 975 + * preallocations. However, we make sure that we don't trim the request too 976 + * much and fall to CR2 in that case. 
977 + */ 978 + static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac, 979 + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) 980 + { 981 + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 982 + struct ext4_group_info *grp = NULL; 983 + int i, order, min_order; 984 + unsigned long num_stripe_clusters = 0; 985 + 986 + if (unlikely(ac->ac_flags & EXT4_MB_CR1_5_OPTIMIZED)) { 987 + if (sbi->s_mb_stats) 988 + atomic_inc(&sbi->s_bal_cr1_5_bad_suggestions); 989 + } 990 + 991 + /* 992 + * mb_avg_fragment_size_order() returns order in a way that makes 993 + * retrieving back the length using (1 << order) inaccurate. Hence, use 994 + * fls() instead since we need to know the actual length while modifying 995 + * goal length. 996 + */ 997 + order = fls(ac->ac_g_ex.fe_len); 998 + min_order = order - sbi->s_mb_cr1_5_max_trim_order; 999 + if (min_order < 0) 1000 + min_order = 0; 1001 + 1002 + if (1 << min_order < ac->ac_o_ex.fe_len) 1003 + min_order = fls(ac->ac_o_ex.fe_len) + 1; 1004 + 1005 + if (sbi->s_stripe > 0) { 1006 + /* 1007 + * We are assuming that stripe size is always a multiple of 1008 + * cluster ratio otherwise __ext4_fill_super exists early. 1009 + */ 1010 + num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); 1011 + if (1 << min_order < num_stripe_clusters) 1012 + min_order = fls(num_stripe_clusters); 1013 + } 1014 + 1015 + for (i = order; i >= min_order; i--) { 1016 + int frag_order; 1017 + /* 1018 + * Scale down goal len to make sure we find something 1019 + * in the free fragments list. Basically, reduce 1020 + * preallocations. 1021 + */ 1022 + ac->ac_g_ex.fe_len = 1 << i; 1023 + 1024 + if (num_stripe_clusters > 0) { 1025 + /* 1026 + * Try to round up the adjusted goal to stripe size 1027 + * (in cluster units) multiple for efficiency. 1028 + * 1029 + * XXX: Is s->stripe always a power of 2? In that case 1030 + * we can use the faster round_up() variant. 
1031 + */ 1032 + ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len, 1033 + num_stripe_clusters); 1034 + } 1035 + 1036 + frag_order = mb_avg_fragment_size_order(ac->ac_sb, 1037 + ac->ac_g_ex.fe_len); 1038 + 1039 + grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); 1040 + if (grp) 1041 + break; 1042 + } 1043 + 1044 + if (grp) { 1045 + *group = grp->bb_group; 1046 + ac->ac_flags |= EXT4_MB_CR1_5_OPTIMIZED; 1047 + } else { 1048 + /* Reset goal length to original goal length before falling into CR2 */ 1049 + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; 973 1050 *new_cr = CR2; 974 1051 } 975 1052 } ··· 1121 1028 ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); 1122 1029 } else if (*new_cr == CR1) { 1123 1030 ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); 1031 + } else if (*new_cr == CR1_5) { 1032 + ext4_mb_choose_next_group_cr1_5(ac, new_cr, group, ngroups); 1124 1033 } else { 1125 1034 /* 1126 1035 * TODO: For CR=2, we can arrange groups in an rb tree sorted by ··· 2446 2351 2447 2352 if (ac->ac_criteria < CR2) { 2448 2353 /* 2449 - * In CR1, we are sure that this group will 2354 + * In CR1 and CR1_5, we are sure that this group will 2450 2355 * have a large enough continuous free extent, so skip 2451 2356 * over the smaller free extents 2452 2357 */ ··· 2578 2483 2579 2484 return true; 2580 2485 case CR1: 2486 + case CR1_5: 2581 2487 if ((free / fragments) >= ac->ac_g_ex.fe_len) 2582 2488 return true; 2583 2489 break; ··· 2843 2747 * spend a lot of time loading imperfect groups 2844 2748 */ 2845 2749 if ((prefetch_grp == group) && 2846 - (cr > CR1 || 2750 + (cr > CR1_5 || 2847 2751 prefetch_ios < sbi->s_mb_prefetch_limit)) { 2848 2752 nr = sbi->s_mb_prefetch; 2849 2753 if (ext4_has_feature_flex_bg(sb)) { ··· 2883 2787 ac->ac_groups_scanned++; 2884 2788 if (cr == CR0) 2885 2789 ext4_mb_simple_scan_group(ac, &e4b); 2886 - else if (cr == CR1 && sbi->s_stripe && 2790 + else if ((cr == CR1 || cr == CR1_5) && sbi->s_stripe && 2887 2791 
!(ac->ac_g_ex.fe_len % 2888 2792 EXT4_B2C(sbi, sbi->s_stripe))) 2889 2793 ext4_mb_scan_aligned(ac, &e4b); ··· 2899 2803 /* Processed all groups and haven't found blocks */ 2900 2804 if (sbi->s_mb_stats && i == ngroups) 2901 2805 atomic64_inc(&sbi->s_bal_cX_failed[cr]); 2806 + 2807 + if (i == ngroups && ac->ac_criteria == CR1_5) 2808 + /* Reset goal length to original goal length before 2809 + * falling into CR2 */ 2810 + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; 2902 2811 } 2903 2812 2904 2813 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && ··· 3072 2971 atomic64_read(&sbi->s_bal_cX_failed[CR1])); 3073 2972 seq_printf(seq, "\t\tbad_suggestions: %u\n", 3074 2973 atomic_read(&sbi->s_bal_cr1_bad_suggestions)); 2974 + 2975 + seq_puts(seq, "\tcr1.5_stats:\n"); 2976 + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1_5])); 2977 + seq_printf(seq, "\t\tgroups_considered: %llu\n", 2978 + atomic64_read(&sbi->s_bal_cX_groups_considered[CR1_5])); 2979 + seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR1_5])); 2980 + seq_printf(seq, "\t\tuseless_loops: %llu\n", 2981 + atomic64_read(&sbi->s_bal_cX_failed[CR1_5])); 2982 + seq_printf(seq, "\t\tbad_suggestions: %u\n", 2983 + atomic_read(&sbi->s_bal_cr1_5_bad_suggestions)); 3075 2984 3076 2985 seq_puts(seq, "\tcr2_stats:\n"); 3077 2986 seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR2])); ··· 3600 3489 sbi->s_mb_stats = MB_DEFAULT_STATS; 3601 3490 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 3602 3491 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 3492 + sbi->s_mb_cr1_5_max_trim_order = MB_DEFAULT_CR1_5_TRIM_ORDER; 3493 + 3603 3494 /* 3604 3495 * The default group preallocation is 512, which for 4k block 3605 3496 * sizes translates to 2 megabytes. 
However for bigalloc file ··· 4505 4392 * placement or satisfy big request as is */ 4506 4393 ac->ac_g_ex.fe_logical = start; 4507 4394 ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); 4395 + ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; 4508 4396 4509 4397 /* define goal start in order to merge */ 4510 4398 if (ar->pright && (ar->lright == (start + size)) && ··· 4549 4435 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 4550 4436 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 4551 4437 atomic_inc(&sbi->s_bal_goals); 4552 - if (ac->ac_f_ex.fe_len == ac->ac_g_ex.fe_len) 4438 + /* did we allocate as much as normalizer originally wanted? */ 4439 + if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len) 4553 4440 atomic_inc(&sbi->s_bal_len_goals); 4441 + 4554 4442 if (ac->ac_found > sbi->s_mb_max_to_scan) 4555 4443 atomic_inc(&sbi->s_bal_breaks); 4556 4444 } ··· 5037 4921 5038 4922 pa = ac->ac_pa; 5039 4923 5040 - if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { 4924 + if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { 5041 4925 int new_bex_start; 5042 4926 int new_bex_end; 5043 4927 ··· 5052 4936 * fragmentation in check while ensuring logical range of best 5053 4937 * extent doesn't overflow out of goal extent: 5054 4938 * 5055 - * 1. Check if best ex can be kept at end of goal and still 5056 - * cover original start 4939 + * 1. Check if best ex can be kept at end of goal (before 4940 + * cr_best_avail trimmed it) and still cover original start 5057 4941 * 2. Else, check if best ex can be kept at start of goal and 5058 4942 * still cover original start 5059 4943 * 3. Else, keep the best ex at start of original request. 
5060 4944 */ 5061 4945 new_bex_end = ac->ac_g_ex.fe_logical + 5062 - EXT4_C2B(sbi, ac->ac_g_ex.fe_len); 4946 + EXT4_C2B(sbi, ac->ac_orig_goal_len); 5063 4947 new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 5064 4948 if (ac->ac_o_ex.fe_logical >= new_bex_start) 5065 4949 goto adjust_bex; ··· 5080 4964 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 5081 4965 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 5082 4966 BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + 5083 - EXT4_C2B(sbi, ac->ac_g_ex.fe_len))); 4967 + EXT4_C2B(sbi, ac->ac_orig_goal_len))); 5084 4968 } 5085 4969 5086 4970 pa->pa_lstart = ac->ac_b_ex.fe_logical; ··· 5700 5584 ac->ac_o_ex.fe_start = block; 5701 5585 ac->ac_o_ex.fe_len = len; 5702 5586 ac->ac_g_ex = ac->ac_o_ex; 5587 + ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; 5703 5588 ac->ac_flags = ar->flags; 5704 5589 5705 5590 /* we have to define context: we'll work with a file or
+13
fs/ext4/mballoc.h
··· 86 86 #define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 87 87 88 88 /* 89 + * The maximum order upto which CR1.5 can trim a particular allocation request. 90 + * Example, if we have an order 7 request and max trim order of 3, CR1.5 can 91 + * trim this upto order 4. 92 + */ 93 + #define MB_DEFAULT_CR1_5_TRIM_ORDER 3 94 + 95 + /* 89 96 * Number of valid buddy orders 90 97 */ 91 98 #define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) ··· 185 178 186 179 /* copy of the best found extent taken before preallocation efforts */ 187 180 struct ext4_free_extent ac_f_ex; 181 + 182 + /* 183 + * goal len can change in CR1.5, so save the original len. This is 184 + * used while adjusting the PA window and for accounting. 185 + */ 186 + ext4_grpblk_t ac_orig_goal_len; 188 187 189 188 __u32 ac_groups_considered; 190 189 __u32 ac_flags; /* allocation hints */
+2
fs/ext4/sysfs.c
··· 223 223 EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); 224 224 EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); 225 225 EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); 226 + EXT4_RW_ATTR_SBI_UI(mb_cr1_5_max_trim_order, s_mb_cr1_5_max_trim_order); 226 227 #ifdef CONFIG_EXT4_DEBUG 227 228 EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); 228 229 #endif ··· 274 273 ATTR_LIST(warning_ratelimit_burst), 275 274 ATTR_LIST(msg_ratelimit_interval_ms), 276 275 ATTR_LIST(msg_ratelimit_burst), 276 + ATTR_LIST(mb_cr1_5_max_trim_order), 277 277 ATTR_LIST(errors_count), 278 278 ATTR_LIST(warning_count), 279 279 ATTR_LIST(msg_count),
+2
include/trace/events/ext4.h
··· 122 122 123 123 TRACE_DEFINE_ENUM(CR0); 124 124 TRACE_DEFINE_ENUM(CR1); 125 + TRACE_DEFINE_ENUM(CR1_5); 125 126 TRACE_DEFINE_ENUM(CR2); 126 127 TRACE_DEFINE_ENUM(CR3); 127 128 ··· 130 129 __print_symbolic(cr, \ 131 130 { CR0, "CR0" }, \ 132 131 { CR1, "CR1" }, \ 132 { CR1_5, "CR1.5" }, \ 133 133 { CR2, "CR2" }, \ 134 134 { CR3, "CR3" }) 135 135