[PATCH] ext3: reduce allocate-with-reservation lock latencies

Currently in the ext3 block reservation code, the global filesystem reservation
tree lock (rsv_lock) is held during the whole process of searching for a space
to make a new reservation window, including while scanning the block bitmap
to verify that the available window has a free block. Holding the lock during
the bitmap scan is unnecessary and could possibly cause scalability and
latency issues.

This patch tries to address this by dropping the lock before scanning the
bitmap. Before that, we need to reserve the open window in case someone
else is targeting the same window. The question was whether we should reserve
the whole free reservable space or just the window size we need. Reserving the
whole free reservable space would possibly force other threads which
intend to do block allocation nearby to move to another block group (causing
bad layout). In this patch, we just reserve the desired size before dropping
the lock and scanning the block bitmap. This patch fixed an ext3 reservation
latency issue seen on a cvs checkout test. The patch was tested with many fsx,
tiobench, dbench and kernel-untar tests.

Signed-Off-By: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by Mingming Cao and committed by Linus Torvalds 21fe3471 fb3cc432

+67 -72
+63 -72
fs/ext3/balloc.c
··· 749 * to find a free region that is of my size and has not 750 * been reserved. 751 * 752 - * on succeed, it returns the reservation window to be appended to. 753 - * failed, return NULL. 754 */ 755 - static struct ext3_reserve_window_node *find_next_reservable_window( 756 struct ext3_reserve_window_node *search_head, 757 - unsigned long size, int *start_block, 758 int last_block) 759 { 760 struct rb_node *next; 761 struct ext3_reserve_window_node *rsv, *prev; 762 int cur; 763 764 /* TODO: make the start of the reservation window byte-aligned */ 765 /* cur = *start_block & ~7;*/ 766 - cur = *start_block; 767 rsv = search_head; 768 if (!rsv) 769 - return NULL; 770 771 while (1) { 772 if (cur <= rsv->rsv_end) ··· 782 * space with expected-size (or more)... 783 */ 784 if (cur > last_block) 785 - return NULL; /* fail */ 786 787 prev = rsv; 788 next = rb_next(&rsv->rsv_node); 789 - rsv = list_entry(next, struct ext3_reserve_window_node, rsv_node); 790 791 /* 792 * Reached the last reservation, we can just append to the ··· 813 * return the reservation window that we could append to. 814 * succeed. 
815 */ 816 - *start_block = cur; 817 - return prev; 818 } 819 820 /** ··· 869 * @sb: the super block 870 * @group: the group we are trying to allocate in 871 * @bitmap_bh: the block group block bitmap 872 */ 873 static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, 874 int goal, struct super_block *sb, ··· 878 struct ext3_reserve_window_node *search_head; 879 int group_first_block, group_end_block, start_block; 880 int first_free_block; 881 - int reservable_space_start; 882 - struct ext3_reserve_window_node *prev_rsv; 883 struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; 884 unsigned long size; 885 886 group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + 887 group * EXT3_BLOCKS_PER_GROUP(sb); ··· 893 start_block = goal + group_first_block; 894 895 size = my_rsv->rsv_goal_size; 896 if (!rsv_is_empty(&my_rsv->rsv_window)) { 897 /* 898 * if the old reservation is cross group boundary ··· 927 my_rsv->rsv_goal_size= size; 928 } 929 } 930 /* 931 * shift the search start to the window near the goal block 932 */ ··· 942 * need to check the bitmap after we found a reservable window. 943 */ 944 retry: 945 - prev_rsv = find_next_reservable_window(search_head, size, 946 - &start_block, group_end_block); 947 - if (prev_rsv == NULL) 948 - goto failed; 949 - reservable_space_start = start_block; 950 /* 951 * On success, find_next_reservable_window() returns the 952 * reservation window where there is a reservable space after it. ··· 963 * block. Search start from the start block of the reservable space 964 * we just found. 965 */ 966 first_free_block = bitmap_search_next_usable_block( 967 - reservable_space_start - group_first_block, 968 bitmap_bh, group_end_block - group_first_block + 1); 969 970 if (first_free_block < 0) { ··· 973 * no free block left on the bitmap, no point 974 * to reserve the space. return failed. 
975 */ 976 - goto failed; 977 } 978 start_block = first_free_block + group_first_block; 979 /* 980 * check if the first free block is within the 981 - * free space we just found 982 */ 983 - if ((start_block >= reservable_space_start) && 984 - (start_block < reservable_space_start + size)) 985 - goto found_rsv_window; 986 /* 987 * if the first free bit we found is out of the reservable space 988 - * this means there is no free block on the reservable space 989 - * we should continue search for next reservable space, 990 * start from where the free block is, 991 * we also shift the list head to where we stopped last time 992 */ 993 - search_head = prev_rsv; 994 goto retry; 995 - 996 - found_rsv_window: 997 - /* 998 - * great! the reservable space contains some free blocks. 999 - * if the search returns that we should add the new 1000 - * window just next to where the old window, we don't 1001 - * need to remove the old window first then add it to the 1002 - * same place, just update the new start and new end. 
1003 - */ 1004 - if (my_rsv != prev_rsv) { 1005 - if (!rsv_is_empty(&my_rsv->rsv_window)) 1006 - rsv_window_remove(sb, my_rsv); 1007 - } 1008 - my_rsv->rsv_start = reservable_space_start; 1009 - my_rsv->rsv_end = my_rsv->rsv_start + size - 1; 1010 - my_rsv->rsv_alloc_hit = 0; 1011 - if (my_rsv != prev_rsv) { 1012 - ext3_rsv_window_add(sb, my_rsv); 1013 - } 1014 - return 0; /* succeed */ 1015 - failed: 1016 - /* 1017 - * failed to find a new reservation window in the current 1018 - * group, remove the current(stale) reservation window 1019 - * if there is any 1020 - */ 1021 - if (!rsv_is_empty(&my_rsv->rsv_window)) 1022 - rsv_window_remove(sb, my_rsv); 1023 - return -1; /* failed */ 1024 } 1025 1026 /* ··· 1025 int goal, struct ext3_reserve_window_node * my_rsv, 1026 int *errp) 1027 { 1028 - spinlock_t *rsv_lock; 1029 unsigned long group_first_block; 1030 int ret = 0; 1031 int fatal; ··· 1053 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL); 1054 goto out; 1055 } 1056 - rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; 1057 /* 1058 * goal is a group relative block number (if there is a goal) 1059 * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb) ··· 1078 * then we could go to allocate from the reservation window directly. 
1079 */ 1080 while (1) { 1081 - struct ext3_reserve_window rsv_copy; 1082 - 1083 - rsv_copy._rsv_start = my_rsv->rsv_start; 1084 - rsv_copy._rsv_end = my_rsv->rsv_end; 1085 - 1086 - if (rsv_is_empty(&rsv_copy) || (ret < 0) || 1087 - !goal_in_my_reservation(&rsv_copy, goal, group, sb)) { 1088 - spin_lock(rsv_lock); 1089 ret = alloc_new_reservation(my_rsv, goal, sb, 1090 group, bitmap_bh); 1091 - rsv_copy._rsv_start = my_rsv->rsv_start; 1092 - rsv_copy._rsv_end = my_rsv->rsv_end; 1093 - spin_unlock(rsv_lock); 1094 if (ret < 0) 1095 break; /* failed */ 1096 1097 - if (!goal_in_my_reservation(&rsv_copy, goal, group, sb)) 1098 goal = -1; 1099 } 1100 - if ((rsv_copy._rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) 1101 - || (rsv_copy._rsv_end < group_first_block)) 1102 BUG(); 1103 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, 1104 - &rsv_copy); 1105 if (ret >= 0) { 1106 my_rsv->rsv_alloc_hit++; 1107 break; /* succeed */
··· 749 * to find a free region that is of my size and has not 750 * been reserved. 751 * 752 */ 753 + static int find_next_reservable_window( 754 struct ext3_reserve_window_node *search_head, 755 + struct ext3_reserve_window_node *my_rsv, 756 + struct super_block * sb, int start_block, 757 int last_block) 758 { 759 struct rb_node *next; 760 struct ext3_reserve_window_node *rsv, *prev; 761 int cur; 762 + int size = my_rsv->rsv_goal_size; 763 764 /* TODO: make the start of the reservation window byte-aligned */ 765 /* cur = *start_block & ~7;*/ 766 + cur = start_block; 767 rsv = search_head; 768 if (!rsv) 769 + return -1; 770 771 while (1) { 772 if (cur <= rsv->rsv_end) ··· 782 * space with expected-size (or more)... 783 */ 784 if (cur > last_block) 785 + return -1; /* fail */ 786 787 prev = rsv; 788 next = rb_next(&rsv->rsv_node); 789 + rsv = list_entry(next,struct ext3_reserve_window_node,rsv_node); 790 791 /* 792 * Reached the last reservation, we can just append to the ··· 813 * return the reservation window that we could append to. 814 * succeed. 815 */ 816 + 817 + if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) 818 + rsv_window_remove(sb, my_rsv); 819 + 820 + /* 821 + * Let's book the whole avaliable window for now. We will check the 822 + * disk bitmap later and then, if there are free blocks then we adjust 823 + * the window size if it's larger than requested. 824 + * Otherwise, we will remove this node from the tree next time 825 + * call find_next_reservable_window. 
826 + */ 827 + my_rsv->rsv_start = cur; 828 + my_rsv->rsv_end = cur + size - 1; 829 + my_rsv->rsv_alloc_hit = 0; 830 + 831 + if (prev != my_rsv) 832 + ext3_rsv_window_add(sb, my_rsv); 833 + 834 + return 0; 835 } 836 837 /** ··· 852 * @sb: the super block 853 * @group: the group we are trying to allocate in 854 * @bitmap_bh: the block group block bitmap 855 + * 856 */ 857 static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, 858 int goal, struct super_block *sb, ··· 860 struct ext3_reserve_window_node *search_head; 861 int group_first_block, group_end_block, start_block; 862 int first_free_block; 863 struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; 864 unsigned long size; 865 + int ret; 866 + spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; 867 868 group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + 869 group * EXT3_BLOCKS_PER_GROUP(sb); ··· 875 start_block = goal + group_first_block; 876 877 size = my_rsv->rsv_goal_size; 878 + 879 if (!rsv_is_empty(&my_rsv->rsv_window)) { 880 /* 881 * if the old reservation is cross group boundary ··· 908 my_rsv->rsv_goal_size= size; 909 } 910 } 911 + 912 + spin_lock(rsv_lock); 913 /* 914 * shift the search start to the window near the goal block 915 */ ··· 921 * need to check the bitmap after we found a reservable window. 922 */ 923 retry: 924 + ret = find_next_reservable_window(search_head, my_rsv, sb, 925 + start_block, group_end_block); 926 + 927 + if (ret == -1) { 928 + if (!rsv_is_empty(&my_rsv->rsv_window)) 929 + rsv_window_remove(sb, my_rsv); 930 + spin_unlock(rsv_lock); 931 + return -1; 932 + } 933 + 934 /* 935 * On success, find_next_reservable_window() returns the 936 * reservation window where there is a reservable space after it. ··· 937 * block. Search start from the start block of the reservable space 938 * we just found. 
939 */ 940 + spin_unlock(rsv_lock); 941 first_free_block = bitmap_search_next_usable_block( 942 + my_rsv->rsv_start - group_first_block, 943 bitmap_bh, group_end_block - group_first_block + 1); 944 945 if (first_free_block < 0) { ··· 946 * no free block left on the bitmap, no point 947 * to reserve the space. return failed. 948 */ 949 + spin_lock(rsv_lock); 950 + if (!rsv_is_empty(&my_rsv->rsv_window)) 951 + rsv_window_remove(sb, my_rsv); 952 + spin_unlock(rsv_lock); 953 + return -1; /* failed */ 954 } 955 + 956 start_block = first_free_block + group_first_block; 957 /* 958 * check if the first free block is within the 959 + * free space we just reserved 960 */ 961 + if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end) 962 + return 0; /* success */ 963 /* 964 * if the first free bit we found is out of the reservable space 965 + * continue search for next reservable space, 966 * start from where the free block is, 967 * we also shift the list head to where we stopped last time 968 */ 969 + search_head = my_rsv; 970 + spin_lock(rsv_lock); 971 goto retry; 972 } 973 974 /* ··· 1023 int goal, struct ext3_reserve_window_node * my_rsv, 1024 int *errp) 1025 { 1026 unsigned long group_first_block; 1027 int ret = 0; 1028 int fatal; ··· 1052 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL); 1053 goto out; 1054 } 1055 /* 1056 * goal is a group relative block number (if there is a goal) 1057 * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb) ··· 1078 * then we could go to allocate from the reservation window directly. 
1079 */ 1080 while (1) { 1081 + if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || 1082 + !goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) { 1083 ret = alloc_new_reservation(my_rsv, goal, sb, 1084 group, bitmap_bh); 1085 if (ret < 0) 1086 break; /* failed */ 1087 1088 + if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) 1089 goal = -1; 1090 } 1091 + if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) 1092 + || (my_rsv->rsv_end < group_first_block)) 1093 BUG(); 1094 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, 1095 + &my_rsv->rsv_window); 1096 if (ret >= 0) { 1097 my_rsv->rsv_alloc_hit++; 1098 break; /* succeed */
+4
fs/ext3/file.c
··· 36 /* if we are the last writer on the inode, drop the block reservation */ 37 if ((filp->f_mode & FMODE_WRITE) && 38 (atomic_read(&inode->i_writecount) == 1)) 39 ext3_discard_reservation(inode); 40 if (is_dx(inode) && filp->private_data) 41 ext3_htree_free_dir_info(filp->private_data); 42
··· 36 /* if we are the last writer on the inode, drop the block reservation */ 37 if ((filp->f_mode & FMODE_WRITE) && 38 (atomic_read(&inode->i_writecount) == 1)) 39 + { 40 + down(&EXT3_I(inode)->truncate_sem); 41 ext3_discard_reservation(inode); 42 + up(&EXT3_I(inode)->truncate_sem); 43 + } 44 if (is_dx(inode) && filp->private_data) 45 ext3_htree_free_dir_info(filp->private_data); 46