Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: track extent status tree shrinker delay statistics

This commit adds some statistics in extent status tree shrinker. The
purpose of adding these is that we want to collect more details when we
encounter a stall caused by extent status tree shrinker. Here we count
the following statistics:
stats:
the number of all objects on all extent status trees
the number of reclaimable objects on lru list
cache hits/misses
the last sorted interval
the number of inodes on lru list
average:
scan time for shrinking some objects
the number of shrunk objects
maximum:
the inode that has max nr. of objects on lru list
the maximum scan time for shrinking some objects

The output looks like below:
$ cat /proc/fs/ext4/sda1/es_shrinker_info
stats:
28228 objects
6341 reclaimable objects
5281/631 cache hits/misses
586 ms last sorted interval
250 inodes on lru list
average:
153 us scan time
128 shrunk objects
maximum:
255 inode (255 objects, 198 reclaimable)
125723 us max scan time

If the lru list has never been sorted, the following line will not be
printed:
586 ms last sorted interval
If there is an empty lru list, the following lines also will not be
printed:
250 inodes on lru list
...
maximum:
255 inode (255 objects, 198 reclaimable)
0 us max scan time

Meanwhile in this commit a new trace point is defined to print some
details in __ext4_es_shrink().

Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>

authored by

Zheng Liu and committed by
Theodore Ts'o
eb68d0e2 e963bb1d

+224 -21
+2 -2
fs/ext4/ext4.h
··· 891 891 struct ext4_es_tree i_es_tree; 892 892 rwlock_t i_es_lock; 893 893 struct list_head i_es_lru; 894 + unsigned int i_es_all_nr; /* protected by i_es_lock */ 894 895 unsigned int i_es_lru_nr; /* protected by i_es_lock */ 895 896 unsigned long i_touch_when; /* jiffies of last accessing */ 896 897 ··· 1332 1331 /* Reclaim extents from extent status tree */ 1333 1332 struct shrinker s_es_shrinker; 1334 1333 struct list_head s_es_lru; 1335 - unsigned long s_es_last_sorted; 1336 - struct percpu_counter s_extent_cache_cnt; 1334 + struct ext4_es_stats s_es_stats; 1337 1335 struct mb_cache *s_mb_cache; 1338 1336 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1339 1337
+176 -10
fs/ext4/extents_status.c
··· 11 11 */ 12 12 #include <linux/rbtree.h> 13 13 #include <linux/list_sort.h> 14 + #include <linux/proc_fs.h> 15 + #include <linux/seq_file.h> 14 16 #include "ext4.h" 15 17 #include "extents_status.h" 16 18 ··· 315 313 */ 316 314 if (!ext4_es_is_delayed(es)) { 317 315 EXT4_I(inode)->i_es_lru_nr++; 318 - percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); 316 + percpu_counter_inc(&EXT4_SB(inode->i_sb)-> 317 + s_es_stats.es_stats_lru_cnt); 319 318 } 319 + 320 + EXT4_I(inode)->i_es_all_nr++; 321 + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); 320 322 321 323 return es; 322 324 } 323 325 324 326 static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) 325 327 { 328 + EXT4_I(inode)->i_es_all_nr--; 329 + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); 330 + 326 331 /* Decrease the lru counter when this es is not delayed */ 327 332 if (!ext4_es_is_delayed(es)) { 328 333 BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); 329 334 EXT4_I(inode)->i_es_lru_nr--; 330 - percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); 335 + percpu_counter_dec(&EXT4_SB(inode->i_sb)-> 336 + s_es_stats.es_stats_lru_cnt); 331 337 } 332 338 333 339 kmem_cache_free(ext4_es_cachep, es); ··· 739 729 struct extent_status *es) 740 730 { 741 731 struct ext4_es_tree *tree; 732 + struct ext4_es_stats *stats; 742 733 struct extent_status *es1 = NULL; 743 734 struct rb_node *node; 744 735 int found = 0; ··· 776 765 } 777 766 778 767 out: 768 + stats = &EXT4_SB(inode->i_sb)->s_es_stats; 779 769 if (found) { 780 770 BUG_ON(!es1); 781 771 es->es_lblk = es1->es_lblk; 782 772 es->es_len = es1->es_len; 783 773 es->es_pblk = es1->es_pblk; 774 + stats->es_stats_cache_hits++; 775 + } else { 776 + stats->es_stats_cache_misses++; 784 777 } 785 778 786 779 read_unlock(&EXT4_I(inode)->i_es_lock); ··· 946 931 struct ext4_inode_info *locked_ei) 947 932 { 948 933 struct ext4_inode_info *ei; 934 + struct ext4_es_stats *es_stats; 949 935 
struct list_head *cur, *tmp; 950 936 LIST_HEAD(skipped); 937 + ktime_t start_time; 938 + u64 scan_time; 951 939 int nr_shrunk = 0; 952 940 int retried = 0, skip_precached = 1, nr_skipped = 0; 953 941 942 + es_stats = &sbi->s_es_stats; 943 + start_time = ktime_get(); 954 944 spin_lock(&sbi->s_es_lru_lock); 955 945 956 946 retry: ··· 966 946 * If we have already reclaimed all extents from extent 967 947 * status tree, just stop the loop immediately. 968 948 */ 969 - if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) 949 + if (percpu_counter_read_positive( 950 + &es_stats->es_stats_lru_cnt) == 0) 970 951 break; 971 952 972 953 ei = list_entry(cur, struct ext4_inode_info, i_es_lru); ··· 977 956 * time. Normally we try hard to avoid shrinking 978 957 * precached inodes, but we will as a last resort. 979 958 */ 980 - if ((sbi->s_es_last_sorted < ei->i_touch_when) || 959 + if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || 981 960 (skip_precached && ext4_test_inode_state(&ei->vfs_inode, 982 961 EXT4_STATE_EXT_PRECACHED))) { 983 962 nr_skipped++; ··· 1011 990 if ((nr_shrunk == 0) && nr_skipped && !retried) { 1012 991 retried++; 1013 992 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); 1014 - sbi->s_es_last_sorted = jiffies; 993 + es_stats->es_stats_last_sorted = jiffies; 1015 994 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, 1016 995 i_es_lru); 1017 996 /* ··· 1029 1008 if (locked_ei && nr_shrunk == 0) 1030 1009 nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); 1031 1010 1011 + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1012 + if (likely(es_stats->es_stats_scan_time)) 1013 + es_stats->es_stats_scan_time = (scan_time + 1014 + es_stats->es_stats_scan_time*3) / 4; 1015 + else 1016 + es_stats->es_stats_scan_time = scan_time; 1017 + if (scan_time > es_stats->es_stats_max_scan_time) 1018 + es_stats->es_stats_max_scan_time = scan_time; 1019 + if (likely(es_stats->es_stats_shrunk)) 1020 + 
es_stats->es_stats_shrunk = (nr_shrunk + 1021 + es_stats->es_stats_shrunk*3) / 4; 1022 + else 1023 + es_stats->es_stats_shrunk = nr_shrunk; 1024 + 1025 + trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, 1026 + nr_skipped, retried); 1032 1027 return nr_shrunk; 1033 1028 } 1034 1029 ··· 1055 1018 struct ext4_sb_info *sbi; 1056 1019 1057 1020 sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); 1058 - nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); 1021 + nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); 1059 1022 trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); 1060 1023 return nr; 1061 1024 } ··· 1068 1031 int nr_to_scan = sc->nr_to_scan; 1069 1032 int ret, nr_shrunk; 1070 1033 1071 - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); 1034 + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); 1072 1035 trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); 1073 1036 1074 1037 if (!nr_to_scan) ··· 1080 1043 return nr_shrunk; 1081 1044 } 1082 1045 1083 - void ext4_es_register_shrinker(struct ext4_sb_info *sbi) 1046 + static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos) 1084 1047 { 1048 + return *pos ? NULL : SEQ_START_TOKEN; 1049 + } 1050 + 1051 + static void * 1052 + ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos) 1053 + { 1054 + return NULL; 1055 + } 1056 + 1057 + static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) 1058 + { 1059 + struct ext4_sb_info *sbi = seq->private; 1060 + struct ext4_es_stats *es_stats = &sbi->s_es_stats; 1061 + struct ext4_inode_info *ei, *max = NULL; 1062 + unsigned int inode_cnt = 0; 1063 + 1064 + if (v != SEQ_START_TOKEN) 1065 + return 0; 1066 + 1067 + /* here we just find an inode that has the max nr. 
of objects */ 1068 + spin_lock(&sbi->s_es_lru_lock); 1069 + list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { 1070 + inode_cnt++; 1071 + if (max && max->i_es_all_nr < ei->i_es_all_nr) 1072 + max = ei; 1073 + else if (!max) 1074 + max = ei; 1075 + } 1076 + spin_unlock(&sbi->s_es_lru_lock); 1077 + 1078 + seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", 1079 + percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), 1080 + percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); 1081 + seq_printf(seq, " %lu/%lu cache hits/misses\n", 1082 + es_stats->es_stats_cache_hits, 1083 + es_stats->es_stats_cache_misses); 1084 + if (es_stats->es_stats_last_sorted != 0) 1085 + seq_printf(seq, " %u ms last sorted interval\n", 1086 + jiffies_to_msecs(jiffies - 1087 + es_stats->es_stats_last_sorted)); 1088 + if (inode_cnt) 1089 + seq_printf(seq, " %d inodes on lru list\n", inode_cnt); 1090 + 1091 + seq_printf(seq, "average:\n %llu us scan time\n", 1092 + div_u64(es_stats->es_stats_scan_time, 1000)); 1093 + seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); 1094 + if (inode_cnt) 1095 + seq_printf(seq, 1096 + "maximum:\n %lu inode (%u objects, %u reclaimable)\n" 1097 + " %llu us max scan time\n", 1098 + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, 1099 + div_u64(es_stats->es_stats_max_scan_time, 1000)); 1100 + 1101 + return 0; 1102 + } 1103 + 1104 + static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v) 1105 + { 1106 + } 1107 + 1108 + static const struct seq_operations ext4_es_seq_shrinker_info_ops = { 1109 + .start = ext4_es_seq_shrinker_info_start, 1110 + .next = ext4_es_seq_shrinker_info_next, 1111 + .stop = ext4_es_seq_shrinker_info_stop, 1112 + .show = ext4_es_seq_shrinker_info_show, 1113 + }; 1114 + 1115 + static int 1116 + ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file) 1117 + { 1118 + int ret; 1119 + 1120 + ret = seq_open(file, &ext4_es_seq_shrinker_info_ops); 1121 + if (!ret) { 
1122 + struct seq_file *m = file->private_data; 1123 + m->private = PDE_DATA(inode); 1124 + } 1125 + 1126 + return ret; 1127 + } 1128 + 1129 + static int 1130 + ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file) 1131 + { 1132 + return seq_release(inode, file); 1133 + } 1134 + 1135 + static const struct file_operations ext4_es_seq_shrinker_info_fops = { 1136 + .owner = THIS_MODULE, 1137 + .open = ext4_es_seq_shrinker_info_open, 1138 + .read = seq_read, 1139 + .llseek = seq_lseek, 1140 + .release = ext4_es_seq_shrinker_info_release, 1141 + }; 1142 + 1143 + int ext4_es_register_shrinker(struct ext4_sb_info *sbi) 1144 + { 1145 + int err; 1146 + 1085 1147 INIT_LIST_HEAD(&sbi->s_es_lru); 1086 1148 spin_lock_init(&sbi->s_es_lru_lock); 1087 - sbi->s_es_last_sorted = 0; 1149 + sbi->s_es_stats.es_stats_last_sorted = 0; 1150 + sbi->s_es_stats.es_stats_shrunk = 0; 1151 + sbi->s_es_stats.es_stats_cache_hits = 0; 1152 + sbi->s_es_stats.es_stats_cache_misses = 0; 1153 + sbi->s_es_stats.es_stats_scan_time = 0; 1154 + sbi->s_es_stats.es_stats_max_scan_time = 0; 1155 + err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0); 1156 + if (err) 1157 + return err; 1158 + err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0); 1159 + if (err) 1160 + goto err1; 1161 + 1088 1162 sbi->s_es_shrinker.scan_objects = ext4_es_scan; 1089 1163 sbi->s_es_shrinker.count_objects = ext4_es_count; 1090 1164 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; 1091 - register_shrinker(&sbi->s_es_shrinker); 1165 + err = register_shrinker(&sbi->s_es_shrinker); 1166 + if (err) 1167 + goto err2; 1168 + 1169 + if (sbi->s_proc) 1170 + proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc, 1171 + &ext4_es_seq_shrinker_info_fops, sbi); 1172 + 1173 + return 0; 1174 + 1175 + err2: 1176 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); 1177 + err1: 1178 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1179 + return err; 1092 1180 } 1093 1181 1094 1182 void 
ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) 1095 1183 { 1184 + if (sbi->s_proc) 1185 + remove_proc_entry("es_shrinker_info", sbi->s_proc); 1186 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1187 + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); 1096 1188 unregister_shrinker(&sbi->s_es_shrinker); 1097 1189 } 1098 1190
+12 -1
fs/ext4/extents_status.h
··· 64 64 struct extent_status *cache_es; /* recently accessed extent */ 65 65 }; 66 66 67 + struct ext4_es_stats { 68 + unsigned long es_stats_last_sorted; 69 + unsigned long es_stats_shrunk; 70 + unsigned long es_stats_cache_hits; 71 + unsigned long es_stats_cache_misses; 72 + u64 es_stats_scan_time; 73 + u64 es_stats_max_scan_time; 74 + struct percpu_counter es_stats_all_cnt; 75 + struct percpu_counter es_stats_lru_cnt; 76 + }; 77 + 67 78 extern int __init ext4_init_es(void); 68 79 extern void ext4_exit_es(void); 69 80 extern void ext4_es_init_tree(struct ext4_es_tree *tree); ··· 149 138 (pb & ~ES_MASK)); 150 139 } 151 140 152 - extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); 141 + extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); 153 142 extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); 154 143 extern void ext4_es_lru_add(struct inode *inode); 155 144 extern void ext4_es_lru_del(struct inode *inode);
+3 -8
fs/ext4/super.c
··· 820 820 percpu_counter_destroy(&sbi->s_freeinodes_counter); 821 821 percpu_counter_destroy(&sbi->s_dirs_counter); 822 822 percpu_counter_destroy(&sbi->s_dirtyclusters_counter); 823 - percpu_counter_destroy(&sbi->s_extent_cache_cnt); 824 823 brelse(sbi->s_sbh); 825 824 #ifdef CONFIG_QUOTA 826 825 for (i = 0; i < MAXQUOTAS; i++) ··· 884 885 ext4_es_init_tree(&ei->i_es_tree); 885 886 rwlock_init(&ei->i_es_lock); 886 887 INIT_LIST_HEAD(&ei->i_es_lru); 888 + ei->i_es_all_nr = 0; 887 889 ei->i_es_lru_nr = 0; 888 890 ei->i_touch_when = 0; 889 891 ei->i_reserved_data_blocks = 0; ··· 3890 3890 sbi->s_err_report.data = (unsigned long) sb; 3891 3891 3892 3892 /* Register extent status tree shrinker */ 3893 - ext4_es_register_shrinker(sbi); 3894 - 3895 - if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { 3896 - ext4_msg(sb, KERN_ERR, "insufficient memory"); 3893 + if (ext4_es_register_shrinker(sbi)) 3897 3894 goto failed_mount3; 3898 - } 3899 3895 3900 3896 sbi->s_stripe = ext4_get_stripe_size(sbi); 3901 3897 sbi->s_extent_max_zeroout_kb = 32; ··· 4221 4225 jbd2_journal_destroy(sbi->s_journal); 4222 4226 sbi->s_journal = NULL; 4223 4227 } 4224 - failed_mount3: 4225 4228 ext4_es_unregister_shrinker(sbi); 4229 + failed_mount3: 4226 4230 del_timer_sync(&sbi->s_err_report); 4227 - percpu_counter_destroy(&sbi->s_extent_cache_cnt); 4228 4231 if (sbi->s_mmp_tsk) 4229 4232 kthread_stop(sbi->s_mmp_tsk); 4230 4233 failed_mount2:
+31
include/trace/events/ext4.h
··· 2450 2450 __entry->offset, __entry->len) 2451 2451 ); 2452 2452 2453 + TRACE_EVENT(ext4_es_shrink, 2454 + TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time, 2455 + int skip_precached, int nr_skipped, int retried), 2456 + 2457 + TP_ARGS(sb, nr_shrunk, scan_time, skip_precached, nr_skipped, retried), 2458 + 2459 + TP_STRUCT__entry( 2460 + __field( dev_t, dev ) 2461 + __field( int, nr_shrunk ) 2462 + __field( unsigned long long, scan_time ) 2463 + __field( int, skip_precached ) 2464 + __field( int, nr_skipped ) 2465 + __field( int, retried ) 2466 + ), 2467 + 2468 + TP_fast_assign( 2469 + __entry->dev = sb->s_dev; 2470 + __entry->nr_shrunk = nr_shrunk; 2471 + __entry->scan_time = div_u64(scan_time, 1000); 2472 + __entry->skip_precached = skip_precached; 2473 + __entry->nr_skipped = nr_skipped; 2474 + __entry->retried = retried; 2475 + ), 2476 + 2477 + TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu skip_precached %d " 2478 + "nr_skipped %d retried %d", 2479 + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, 2480 + __entry->scan_time, __entry->skip_precached, 2481 + __entry->nr_skipped, __entry->retried) 2482 + ); 2483 + 2453 2484 #endif /* _TRACE_EXT4_H */ 2454 2485 2455 2486 /* This part must be outside protection */