Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: reclaim extents from extent status tree

Although extent status is loaded on demand, we also need to reclaim
extents from the tree when we are under heavy memory pressure because
in some cases a fragmented extent tree causes the status tree to cost
too much memory.

Here we maintain an lru list in the super_block. When the extent status
of an inode is accessed and changed, this inode will be moved to the
tail of the list. The inode will be dropped from this list when it is
cleared. In the inode, a counter is added to count the number of
cached objects in the extent status tree. Here only written/unwritten/hole
extents are counted, because delayed extents are not reclaimed, since
fiemap, bigalloc and seek_data/hole need them. The counter will be
increased as a new extent is allocated, and it will be decreased as an
extent is freed.

In this commit we use the normal shrinker framework to reclaim memory
from the status tree. ext4_es_reclaim_extents_count() traverses the lru
list to count the number of reclaimable extents. ext4_es_shrink() tries
to reclaim written/unwritten/hole extents from the extent status tree.
The inode that has been shrunk is moved to the tail of the lru list.

Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Jan kara <jack@suse.cz>

authored by

Zheng Liu and committed by
Theodore Ts'o
74cd15cd bdedbb7b

+235
+7
fs/ext4/ext4.h
··· 888 888 /* extents status tree */ 889 889 struct ext4_es_tree i_es_tree; 890 890 rwlock_t i_es_lock; 891 + struct list_head i_es_lru; 892 + unsigned int i_es_lru_nr; /* protected by i_es_lock */ 891 893 892 894 /* ialloc */ 893 895 ext4_group_t i_last_alloc_group; ··· 1305 1303 1306 1304 /* Precomputed FS UUID checksum for seeding other checksums */ 1307 1305 __u32 s_csum_seed; 1306 + 1307 + /* Reclaim extents from extent status tree */ 1308 + struct shrinker s_es_shrinker; 1309 + struct list_head s_es_lru; 1310 + spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1308 1311 }; 1309 1312 1310 1313 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
+156
fs/ext4/extents_status.c
··· 145 145 static int __es_insert_extent(struct inode *inode, struct extent_status *newes); 146 146 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 147 147 ext4_lblk_t end); 148 + static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, 149 + int nr_to_scan); 150 + static int ext4_es_reclaim_extents_count(struct super_block *sb); 148 151 149 152 int __init ext4_init_es(void) 150 153 { ··· 283 280 284 281 read_unlock(&EXT4_I(inode)->i_es_lock); 285 282 283 + ext4_es_lru_add(inode); 286 284 trace_ext4_es_find_delayed_extent_exit(inode, es); 287 285 } 288 286 ··· 298 294 es->es_lblk = lblk; 299 295 es->es_len = len; 300 296 es->es_pblk = pblk; 297 + 298 + /* 299 + * We don't count delayed extent because we never try to reclaim them 300 + */ 301 + if (!ext4_es_is_delayed(es)) 302 + EXT4_I(inode)->i_es_lru_nr++; 303 + 301 304 return es; 302 305 } 303 306 304 307 static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) 305 308 { 309 + /* Decrease the lru counter when this es is not delayed */ 310 + if (!ext4_es_is_delayed(es)) { 311 + BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); 312 + EXT4_I(inode)->i_es_lru_nr--; 313 + } 314 + 306 315 kmem_cache_free(ext4_es_cachep, es); 307 316 } 308 317 ··· 473 456 error: 474 457 write_unlock(&EXT4_I(inode)->i_es_lock); 475 458 459 + ext4_es_lru_add(inode); 476 460 ext4_es_print_tree(inode); 477 461 478 462 return err; ··· 535 517 536 518 read_unlock(&EXT4_I(inode)->i_es_lock); 537 519 520 + ext4_es_lru_add(inode); 538 521 trace_ext4_es_lookup_extent_exit(inode, es, found); 539 522 return found; 540 523 } ··· 657 638 write_unlock(&EXT4_I(inode)->i_es_lock); 658 639 ext4_es_print_tree(inode); 659 640 return err; 641 + } 642 + 643 + static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) 644 + { 645 + struct ext4_sb_info *sbi = container_of(shrink, 646 + struct ext4_sb_info, s_es_shrinker); 647 + struct ext4_inode_info *ei; 648 + struct list_head *cur, *tmp, 
scanned; 649 + int nr_to_scan = sc->nr_to_scan; 650 + int ret, nr_shrunk = 0; 651 + 652 + trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan); 653 + 654 + if (!nr_to_scan) 655 + return ext4_es_reclaim_extents_count(sbi->s_sb); 656 + 657 + INIT_LIST_HEAD(&scanned); 658 + 659 + spin_lock(&sbi->s_es_lru_lock); 660 + list_for_each_safe(cur, tmp, &sbi->s_es_lru) { 661 + list_move_tail(cur, &scanned); 662 + 663 + ei = list_entry(cur, struct ext4_inode_info, i_es_lru); 664 + 665 + read_lock(&ei->i_es_lock); 666 + if (ei->i_es_lru_nr == 0) { 667 + read_unlock(&ei->i_es_lock); 668 + continue; 669 + } 670 + read_unlock(&ei->i_es_lock); 671 + 672 + write_lock(&ei->i_es_lock); 673 + ret = __es_try_to_reclaim_extents(ei, nr_to_scan); 674 + write_unlock(&ei->i_es_lock); 675 + 676 + nr_shrunk += ret; 677 + nr_to_scan -= ret; 678 + if (nr_to_scan == 0) 679 + break; 680 + } 681 + list_splice_tail(&scanned, &sbi->s_es_lru); 682 + spin_unlock(&sbi->s_es_lru_lock); 683 + trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk); 684 + 685 + return ext4_es_reclaim_extents_count(sbi->s_sb); 686 + } 687 + 688 + void ext4_es_register_shrinker(struct super_block *sb) 689 + { 690 + struct ext4_sb_info *sbi; 691 + 692 + sbi = EXT4_SB(sb); 693 + INIT_LIST_HEAD(&sbi->s_es_lru); 694 + spin_lock_init(&sbi->s_es_lru_lock); 695 + sbi->s_es_shrinker.shrink = ext4_es_shrink; 696 + sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; 697 + register_shrinker(&sbi->s_es_shrinker); 698 + } 699 + 700 + void ext4_es_unregister_shrinker(struct super_block *sb) 701 + { 702 + unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker); 703 + } 704 + 705 + void ext4_es_lru_add(struct inode *inode) 706 + { 707 + struct ext4_inode_info *ei = EXT4_I(inode); 708 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 709 + 710 + spin_lock(&sbi->s_es_lru_lock); 711 + if (list_empty(&ei->i_es_lru)) 712 + list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); 713 + else 714 + list_move_tail(&ei->i_es_lru, &sbi->s_es_lru); 715 + spin_unlock(&sbi->s_es_lru_lock); 
716 + } 717 + 718 + void ext4_es_lru_del(struct inode *inode) 719 + { 720 + struct ext4_inode_info *ei = EXT4_I(inode); 721 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 722 + 723 + spin_lock(&sbi->s_es_lru_lock); 724 + if (!list_empty(&ei->i_es_lru)) 725 + list_del_init(&ei->i_es_lru); 726 + spin_unlock(&sbi->s_es_lru_lock); 727 + } 728 + 729 + static int ext4_es_reclaim_extents_count(struct super_block *sb) 730 + { 731 + struct ext4_sb_info *sbi = EXT4_SB(sb); 732 + struct ext4_inode_info *ei; 733 + struct list_head *cur; 734 + int nr_cached = 0; 735 + 736 + spin_lock(&sbi->s_es_lru_lock); 737 + list_for_each(cur, &sbi->s_es_lru) { 738 + ei = list_entry(cur, struct ext4_inode_info, i_es_lru); 739 + read_lock(&ei->i_es_lock); 740 + nr_cached += ei->i_es_lru_nr; 741 + read_unlock(&ei->i_es_lock); 742 + } 743 + spin_unlock(&sbi->s_es_lru_lock); 744 + trace_ext4_es_reclaim_extents_count(sb, nr_cached); 745 + return nr_cached; 746 + } 747 + 748 + static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, 749 + int nr_to_scan) 750 + { 751 + struct inode *inode = &ei->vfs_inode; 752 + struct ext4_es_tree *tree = &ei->i_es_tree; 753 + struct rb_node *node; 754 + struct extent_status *es; 755 + int nr_shrunk = 0; 756 + 757 + if (ei->i_es_lru_nr == 0) 758 + return 0; 759 + 760 + node = rb_first(&tree->root); 761 + while (node != NULL) { 762 + es = rb_entry(node, struct extent_status, rb_node); 763 + node = rb_next(&es->rb_node); 764 + /* 765 + * We can't reclaim delayed extent from status tree because 766 + * fiemap, bigallic, and seek_data/hole need to use it. 767 + */ 768 + if (!ext4_es_is_delayed(es)) { 769 + rb_erase(&es->rb_node, &tree->root); 770 + ext4_es_free_extent(inode, es); 771 + nr_shrunk++; 772 + if (--nr_to_scan == 0) 773 + break; 774 + } 775 + } 776 + tree->cache_es = NULL; 777 + return nr_shrunk; 660 778 }
+5
fs/ext4/extents_status.h
··· 106 106 es->es_pblk = block; 107 107 } 108 108 109 + extern void ext4_es_register_shrinker(struct super_block *sb); 110 + extern void ext4_es_unregister_shrinker(struct super_block *sb); 111 + extern void ext4_es_lru_add(struct inode *inode); 112 + extern void ext4_es_lru_del(struct inode *inode); 113 + 109 114 #endif /* _EXT4_EXTENTS_STATUS_H */
+7
fs/ext4/super.c
··· 755 755 ext4_abort(sb, "Couldn't clean up the journal"); 756 756 } 757 757 758 + ext4_es_unregister_shrinker(sb); 758 759 del_timer(&sbi->s_err_report); 759 760 ext4_release_system_zone(sb); 760 761 ext4_mb_release(sb); ··· 841 840 spin_lock_init(&ei->i_prealloc_lock); 842 841 ext4_es_init_tree(&ei->i_es_tree); 843 842 rwlock_init(&ei->i_es_lock); 843 + INIT_LIST_HEAD(&ei->i_es_lru); 844 + ei->i_es_lru_nr = 0; 844 845 ei->i_reserved_data_blocks = 0; 845 846 ei->i_reserved_meta_blocks = 0; 846 847 ei->i_allocated_meta_blocks = 0; ··· 931 928 dquot_drop(inode); 932 929 ext4_discard_preallocations(inode); 933 930 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); 931 + ext4_es_lru_del(inode); 934 932 if (EXT4_I(inode)->jinode) { 935 933 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), 936 934 EXT4_I(inode)->jinode); ··· 3696 3692 sbi->s_stripe = ext4_get_stripe_size(sbi); 3697 3693 sbi->s_max_writeback_mb_bump = 128; 3698 3694 sbi->s_extent_max_zeroout_kb = 32; 3695 + 3696 + /* Register extent status tree shrinker */ 3697 + ext4_es_register_shrinker(sb); 3699 3698 3700 3699 /* 3701 3700 * set up enough so that it can read an inode
+60
include/trace/events/ext4.h
··· 2255 2255 __entry->found ? __entry->status : 0) 2256 2256 ); 2257 2257 2258 + TRACE_EVENT(ext4_es_reclaim_extents_count, 2259 + TP_PROTO(struct super_block *sb, int nr_cached), 2260 + 2261 + TP_ARGS(sb, nr_cached), 2262 + 2263 + TP_STRUCT__entry( 2264 + __field( dev_t, dev ) 2265 + __field( int, nr_cached ) 2266 + ), 2267 + 2268 + TP_fast_assign( 2269 + __entry->dev = sb->s_dev; 2270 + __entry->nr_cached = nr_cached; 2271 + ), 2272 + 2273 + TP_printk("dev %d,%d cached objects nr %d", 2274 + MAJOR(__entry->dev), MINOR(__entry->dev), 2275 + __entry->nr_cached) 2276 + ); 2277 + 2278 + TRACE_EVENT(ext4_es_shrink_enter, 2279 + TP_PROTO(struct super_block *sb, int nr_to_scan), 2280 + 2281 + TP_ARGS(sb, nr_to_scan), 2282 + 2283 + TP_STRUCT__entry( 2284 + __field( dev_t, dev ) 2285 + __field( int, nr_to_scan ) 2286 + ), 2287 + 2288 + TP_fast_assign( 2289 + __entry->dev = sb->s_dev; 2290 + __entry->nr_to_scan = nr_to_scan; 2291 + ), 2292 + 2293 + TP_printk("dev %d,%d nr to scan %d", 2294 + MAJOR(__entry->dev), MINOR(__entry->dev), 2295 + __entry->nr_to_scan) 2296 + ); 2297 + 2298 + TRACE_EVENT(ext4_es_shrink_exit, 2299 + TP_PROTO(struct super_block *sb, int shrunk_nr), 2300 + 2301 + TP_ARGS(sb, shrunk_nr), 2302 + 2303 + TP_STRUCT__entry( 2304 + __field( dev_t, dev ) 2305 + __field( int, shrunk_nr ) 2306 + ), 2307 + 2308 + TP_fast_assign( 2309 + __entry->dev = sb->s_dev; 2310 + __entry->shrunk_nr = shrunk_nr; 2311 + ), 2312 + 2313 + TP_printk("dev %d,%d nr to scan %d", 2314 + MAJOR(__entry->dev), MINOR(__entry->dev), 2315 + __entry->shrunk_nr) 2316 + ); 2317 + 2258 2318 #endif /* _TRACE_EXT4_H */ 2259 2319 2260 2320 /* This part must be outside protection */