Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext4: add support for extent pre-caching

Add a new fiemap flag which forces all of the extents in an inode
to be cached in the extent_status tree. This is critically important
when using AIO to a preallocated file, since if we need to read in
blocks from the extent tree, the io_submit(2) system call becomes
synchronous, and the AIO is no longer "A", which is bad.

In addition, for most files which have an external leaf tree block,
the cost of caching the information in the extent status tree will be
less than caching the entire 4k block in the buffer cache. So it is
generally a win to keep the extent information cached.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

+137 -29
+10 -7
fs/ext4/ext4.h
··· 561 561 #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 562 562 563 563 /* 564 - * The bit position of this flag must not overlap with any of the 565 - * EXT4_GET_BLOCKS_*. It is used by ext4_ext_find_extent(), 564 + * The bit position of these flags must not overlap with any of the 565 + * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(), 566 566 * read_extent_tree_block(), ext4_split_extent_at(), 567 - * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf() to 568 - * indicate that the we shouldn't be caching the extents when reading 569 - * from the extent tree while a truncate or punch hole operation 570 - * is in progress. 567 + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). 568 + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be 569 + * caching the extents when reading from the extent tree while a 570 + * truncate or punch hole operation is in progress. 571 571 */ 572 572 #define EXT4_EX_NOCACHE 0x0400 573 + #define EXT4_EX_FORCE_CACHE 0x0800 573 574 574 575 /* 575 576 * Flags used by ext4_free_blocks ··· 602 601 #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 603 602 #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) 604 603 #define EXT4_IOC_SWAP_BOOT _IO('f', 17) 604 + #define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) 605 605 606 606 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) 607 607 /* ··· 1388 1386 nolocking */ 1389 1387 EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ 1390 1388 EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ 1389 + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ 1391 1390 }; 1392 1391 1393 1392 #define EXT4_INODE_BIT_FNS(name, field, offset) \ ··· 2708 2705 extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); 2709 2706 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2710 2707 __u64 start, __u64 len); 2711 - 2708 + extern int ext4_ext_precache(struct inode *inode); 2712 2709 2713 2710 /* move_extent.c */ 2714 
2711 extern void ext4_double_down_write_data_sem(struct inode *first,
+72 -1
fs/ext4/extents.c
··· 482 482 if (err < 0) 483 483 goto errout; 484 484 } 485 - if (buffer_verified(bh)) 485 + if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) 486 486 return bh; 487 487 err = __ext4_ext_check(function, line, inode, 488 488 ext_block_hdr(bh), depth, pblk); ··· 525 525 #define read_extent_tree_block(inode, pblk, depth, flags) \ 526 526 __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ 527 527 (depth), (flags)) 528 + 529 + /* 530 + * This function is called to cache a file's extent information in the 531 + * extent status tree 532 + */ 533 + int ext4_ext_precache(struct inode *inode) 534 + { 535 + struct ext4_inode_info *ei = EXT4_I(inode); 536 + struct ext4_ext_path *path = NULL; 537 + struct buffer_head *bh; 538 + int i = 0, depth, ret = 0; 539 + 540 + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 541 + return 0; /* not an extent-mapped inode */ 542 + 543 + down_read(&ei->i_data_sem); 544 + depth = ext_depth(inode); 545 + 546 + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), 547 + GFP_NOFS); 548 + if (path == NULL) { 549 + up_read(&ei->i_data_sem); 550 + return -ENOMEM; 551 + } 552 + 553 + /* Don't cache anything if there are no external extent blocks */ 554 + if (depth == 0) 555 + goto out; 556 + path[0].p_hdr = ext_inode_hdr(inode); 557 + ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); 558 + if (ret) 559 + goto out; 560 + path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); 561 + while (i >= 0) { 562 + /* 563 + * If this is a leaf block or we've reached the end of 564 + * the index block, go up 565 + */ 566 + if ((i == depth) || 567 + path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { 568 + brelse(path[i].p_bh); 569 + path[i].p_bh = NULL; 570 + i--; 571 + continue; 572 + } 573 + bh = read_extent_tree_block(inode, 574 + ext4_idx_pblock(path[i].p_idx++), 575 + depth - i - 1, 576 + EXT4_EX_FORCE_CACHE); 577 + if (IS_ERR(bh)) { 578 + ret = PTR_ERR(bh); 579 + break; 580 + } 581 + i++; 582 + path[i].p_bh = bh; 583 + 
path[i].p_hdr = ext_block_hdr(bh); 584 + path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); 585 + } 586 + ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); 587 + out: 588 + up_read(&ei->i_data_sem); 589 + ext4_ext_drop_refs(path); 590 + kfree(path); 591 + return ret; 592 + } 528 593 529 594 #ifdef EXT_DEBUG 530 595 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) ··· 4828 4763 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); 4829 4764 4830 4765 if (has_inline) 4766 + return error; 4767 + } 4768 + 4769 + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { 4770 + error = ext4_ext_precache(inode); 4771 + if (error) 4831 4772 return error; 4832 4773 } 4833 4774
+51 -21
fs/ext4/extents_status.c
··· 710 710 write_lock(&EXT4_I(inode)->i_es_lock); 711 711 712 712 es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); 713 - if (es && ((es->es_lblk <= lblk) || (es->es_lblk <= end))) 714 - goto out; 715 - 716 - __es_insert_extent(inode, &newes); 717 - out: 713 + if (!es || es->es_lblk > end) 714 + __es_insert_extent(inode, &newes); 718 715 write_unlock(&EXT4_I(inode)->i_es_lock); 719 716 } 720 717 ··· 927 930 eia = list_entry(a, struct ext4_inode_info, i_es_lru); 928 931 eib = list_entry(b, struct ext4_inode_info, i_es_lru); 929 932 933 + if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && 934 + !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) 935 + return 1; 936 + if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && 937 + ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) 938 + return -1; 930 939 if (eia->i_touch_when == eib->i_touch_when) 931 940 return 0; 932 941 if (time_after(eia->i_touch_when, eib->i_touch_when)) ··· 946 943 { 947 944 struct ext4_inode_info *ei; 948 945 struct list_head *cur, *tmp; 949 - LIST_HEAD(skiped); 946 + LIST_HEAD(skipped); 950 947 int ret, nr_shrunk = 0; 948 + int retried = 0, skip_precached = 1, nr_skipped = 0; 951 949 952 950 spin_lock(&sbi->s_es_lru_lock); 953 951 954 - /* 955 - * If the inode that is at the head of LRU list is newer than 956 - * last_sorted time, that means that we need to sort this list. 
957 - */ 958 - ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); 959 - if (sbi->s_es_last_sorted < ei->i_touch_when) { 960 - list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); 961 - sbi->s_es_last_sorted = jiffies; 962 - } 963 - 952 + retry: 964 953 list_for_each_safe(cur, tmp, &sbi->s_es_lru) { 965 954 /* 966 955 * If we have already reclaimed all extents from extent ··· 963 968 964 969 ei = list_entry(cur, struct ext4_inode_info, i_es_lru); 965 970 966 - /* Skip the inode that is newer than the last_sorted time */ 967 - if (sbi->s_es_last_sorted < ei->i_touch_when) { 968 - list_move_tail(cur, &skiped); 971 + /* 972 + * Skip the inode that is newer than the last_sorted 973 + * time. Normally we try hard to avoid shrinking 974 + * precached inodes, but we will as a last resort. 975 + */ 976 + if ((sbi->s_es_last_sorted < ei->i_touch_when) || 977 + (skip_precached && ext4_test_inode_state(&ei->vfs_inode, 978 + EXT4_STATE_EXT_PRECACHED))) { 979 + nr_skipped++; 980 + list_move_tail(cur, &skipped); 969 981 continue; 970 982 } 971 983 ··· 992 990 } 993 991 994 992 /* Move the newer inodes into the tail of the LRU list. */ 995 - list_splice_tail(&skiped, &sbi->s_es_lru); 993 + list_splice_tail(&skipped, &sbi->s_es_lru); 994 + INIT_LIST_HEAD(&skipped); 995 + 996 + /* 997 + * If we skipped any inodes, and we weren't able to make any 998 + * forward progress, sort the list and try again. 999 + */ 1000 + if ((nr_shrunk == 0) && nr_skipped && !retried) { 1001 + retried++; 1002 + list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); 1003 + sbi->s_es_last_sorted = jiffies; 1004 + ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, 1005 + i_es_lru); 1006 + /* 1007 + * If there are no non-precached inodes left on the 1008 + * list, start releasing precached extents. 
1009 + */ 1010 + if (ext4_test_inode_state(&ei->vfs_inode, 1011 + EXT4_STATE_EXT_PRECACHED)) 1012 + skip_precached = 0; 1013 + goto retry; 1014 + } 1015 + 996 1016 spin_unlock(&sbi->s_es_lru_lock); 997 1017 998 1018 if (locked_ei && nr_shrunk == 0) 999 - nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); 1019 + nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); 1000 1020 1001 1021 return nr_shrunk; 1002 1022 } ··· 1093 1069 struct rb_node *node; 1094 1070 struct extent_status *es; 1095 1071 int nr_shrunk = 0; 1072 + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, 1073 + DEFAULT_RATELIMIT_BURST); 1096 1074 1097 1075 if (ei->i_es_lru_nr == 0) 1098 1076 return 0; 1077 + 1078 + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && 1079 + __ratelimit(&_rs)) 1080 + ext4_warning(inode->i_sb, "forced shrink of precached extents"); 1099 1081 1100 1082 node = rb_first(&tree->root); 1101 1083 while (node != NULL) {
+3
fs/ext4/ioctl.c
··· 624 624 625 625 return 0; 626 626 } 627 + case EXT4_IOC_PRECACHE_EXTENTS: 628 + return ext4_ext_precache(inode); 627 629 628 630 default: 629 631 return -ENOTTY; ··· 690 688 case EXT4_IOC_MOVE_EXT: 691 689 case FITRIM: 692 690 case EXT4_IOC_RESIZE_FS: 691 + case EXT4_IOC_PRECACHE_EXTENTS: 693 692 break; 694 693 default: 695 694 return -ENOIOCTLCMD;
+1
include/uapi/linux/fiemap.h
··· 40 40 41 41 #define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ 42 42 #define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ 43 + #define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */ 43 44 44 45 #define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) 45 46