[PATCH] ext3: reduce allocate-with-reservation lock latencies

Currently, in the ext3 block reservation code, the global filesystem
reservation tree lock (rsv_lock) is held during the whole process of
searching for space to make a new reservation window, including while
scanning the block bitmap to verify that the available window has a free
block. Holding the lock during the bitmap scan is unnecessary and could
cause scalability and latency issues.

This patch tries to address this by dropping the lock before scanning the
bitmap. Before that, we need to reserve the open window in case someone
else is targeting the same window. The question was whether we should
reserve the whole free reservable space or just the window size we need.
Reserving the whole free reservable space could force other threads that
intend to do block allocation nearby to move to another block group
(causing bad layout). In this patch, we reserve just the desired size
before dropping the lock and scanning the block bitmap. This patch fixes an
ext3 reservation latency issue seen on a cvs checkout test. The patch was
tested with many fsx, tiobench, dbench and kernel-untar tests.

Signed-Off-By: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

authored by Mingming Cao and committed by Linus Torvalds 21fe3471 fb3cc432

+67 -72
+63 -72
fs/ext3/balloc.c
··· 749 749 * to find a free region that is of my size and has not 750 750 * been reserved. 751 751 * 752 - * on succeed, it returns the reservation window to be appended to. 753 - * failed, return NULL. 754 752 */ 755 - static struct ext3_reserve_window_node *find_next_reservable_window( 753 + static int find_next_reservable_window( 756 754 struct ext3_reserve_window_node *search_head, 757 - unsigned long size, int *start_block, 755 + struct ext3_reserve_window_node *my_rsv, 756 + struct super_block * sb, int start_block, 758 757 int last_block) 759 758 { 760 759 struct rb_node *next; 761 760 struct ext3_reserve_window_node *rsv, *prev; 762 761 int cur; 762 + int size = my_rsv->rsv_goal_size; 763 763 764 764 /* TODO: make the start of the reservation window byte-aligned */ 765 765 /* cur = *start_block & ~7;*/ 766 - cur = *start_block; 766 + cur = start_block; 767 767 rsv = search_head; 768 768 if (!rsv) 769 - return NULL; 769 + return -1; 770 770 771 771 while (1) { 772 772 if (cur <= rsv->rsv_end) ··· 782 782 * space with expected-size (or more)... 783 783 */ 784 784 if (cur > last_block) 785 - return NULL; /* fail */ 785 + return -1; /* fail */ 786 786 787 787 prev = rsv; 788 788 next = rb_next(&rsv->rsv_node); 789 - rsv = list_entry(next, struct ext3_reserve_window_node, rsv_node); 789 + rsv = list_entry(next,struct ext3_reserve_window_node,rsv_node); 790 790 791 791 /* 792 792 * Reached the last reservation, we can just append to the ··· 813 813 * return the reservation window that we could append to. 814 814 * succeed. 815 815 */ 816 - *start_block = cur; 817 - return prev; 816 + 817 + if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) 818 + rsv_window_remove(sb, my_rsv); 819 + 820 + /* 821 + * Let's book the whole avaliable window for now. We will check the 822 + * disk bitmap later and then, if there are free blocks then we adjust 823 + * the window size if it's larger than requested. 
824 + * Otherwise, we will remove this node from the tree next time 825 + * call find_next_reservable_window. 826 + */ 827 + my_rsv->rsv_start = cur; 828 + my_rsv->rsv_end = cur + size - 1; 829 + my_rsv->rsv_alloc_hit = 0; 830 + 831 + if (prev != my_rsv) 832 + ext3_rsv_window_add(sb, my_rsv); 833 + 834 + return 0; 818 835 } 819 836 820 837 /** ··· 869 852 * @sb: the super block 870 853 * @group: the group we are trying to allocate in 871 854 * @bitmap_bh: the block group block bitmap 855 + * 872 856 */ 873 857 static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, 874 858 int goal, struct super_block *sb, ··· 878 860 struct ext3_reserve_window_node *search_head; 879 861 int group_first_block, group_end_block, start_block; 880 862 int first_free_block; 881 - int reservable_space_start; 882 - struct ext3_reserve_window_node *prev_rsv; 883 863 struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; 884 864 unsigned long size; 865 + int ret; 866 + spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; 885 867 886 868 group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + 887 869 group * EXT3_BLOCKS_PER_GROUP(sb); ··· 893 875 start_block = goal + group_first_block; 894 876 895 877 size = my_rsv->rsv_goal_size; 878 + 896 879 if (!rsv_is_empty(&my_rsv->rsv_window)) { 897 880 /* 898 881 * if the old reservation is cross group boundary ··· 927 908 my_rsv->rsv_goal_size= size; 928 909 } 929 910 } 911 + 912 + spin_lock(rsv_lock); 930 913 /* 931 914 * shift the search start to the window near the goal block 932 915 */ ··· 942 921 * need to check the bitmap after we found a reservable window. 
943 922 */ 944 923 retry: 945 - prev_rsv = find_next_reservable_window(search_head, size, 946 - &start_block, group_end_block); 947 - if (prev_rsv == NULL) 948 - goto failed; 949 - reservable_space_start = start_block; 924 + ret = find_next_reservable_window(search_head, my_rsv, sb, 925 + start_block, group_end_block); 926 + 927 + if (ret == -1) { 928 + if (!rsv_is_empty(&my_rsv->rsv_window)) 929 + rsv_window_remove(sb, my_rsv); 930 + spin_unlock(rsv_lock); 931 + return -1; 932 + } 933 + 950 934 /* 951 935 * On success, find_next_reservable_window() returns the 952 936 * reservation window where there is a reservable space after it. ··· 963 937 * block. Search start from the start block of the reservable space 964 938 * we just found. 965 939 */ 940 + spin_unlock(rsv_lock); 966 941 first_free_block = bitmap_search_next_usable_block( 967 - reservable_space_start - group_first_block, 942 + my_rsv->rsv_start - group_first_block, 968 943 bitmap_bh, group_end_block - group_first_block + 1); 969 944 970 945 if (first_free_block < 0) { ··· 973 946 * no free block left on the bitmap, no point 974 947 * to reserve the space. return failed. 
975 948 */ 976 - goto failed; 949 + spin_lock(rsv_lock); 950 + if (!rsv_is_empty(&my_rsv->rsv_window)) 951 + rsv_window_remove(sb, my_rsv); 952 + spin_unlock(rsv_lock); 953 + return -1; /* failed */ 977 954 } 955 + 978 956 start_block = first_free_block + group_first_block; 979 957 /* 980 958 * check if the first free block is within the 981 - * free space we just found 959 + * free space we just reserved 982 960 */ 983 - if ((start_block >= reservable_space_start) && 984 - (start_block < reservable_space_start + size)) 985 - goto found_rsv_window; 961 + if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end) 962 + return 0; /* success */ 986 963 /* 987 964 * if the first free bit we found is out of the reservable space 988 - * this means there is no free block on the reservable space 989 - * we should continue search for next reservable space, 965 + * continue search for next reservable space, 990 966 * start from where the free block is, 991 967 * we also shift the list head to where we stopped last time 992 968 */ 993 - search_head = prev_rsv; 969 + search_head = my_rsv; 970 + spin_lock(rsv_lock); 994 971 goto retry; 995 - 996 - found_rsv_window: 997 - /* 998 - * great! the reservable space contains some free blocks. 999 - * if the search returns that we should add the new 1000 - * window just next to where the old window, we don't 1001 - * need to remove the old window first then add it to the 1002 - * same place, just update the new start and new end. 
1003 - */ 1004 - if (my_rsv != prev_rsv) { 1005 - if (!rsv_is_empty(&my_rsv->rsv_window)) 1006 - rsv_window_remove(sb, my_rsv); 1007 - } 1008 - my_rsv->rsv_start = reservable_space_start; 1009 - my_rsv->rsv_end = my_rsv->rsv_start + size - 1; 1010 - my_rsv->rsv_alloc_hit = 0; 1011 - if (my_rsv != prev_rsv) { 1012 - ext3_rsv_window_add(sb, my_rsv); 1013 - } 1014 - return 0; /* succeed */ 1015 - failed: 1016 - /* 1017 - * failed to find a new reservation window in the current 1018 - * group, remove the current(stale) reservation window 1019 - * if there is any 1020 - */ 1021 - if (!rsv_is_empty(&my_rsv->rsv_window)) 1022 - rsv_window_remove(sb, my_rsv); 1023 - return -1; /* failed */ 1024 972 } 1025 973 1026 974 /* ··· 1025 1023 int goal, struct ext3_reserve_window_node * my_rsv, 1026 1024 int *errp) 1027 1025 { 1028 - spinlock_t *rsv_lock; 1029 1026 unsigned long group_first_block; 1030 1027 int ret = 0; 1031 1028 int fatal; ··· 1053 1052 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL); 1054 1053 goto out; 1055 1054 } 1056 - rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; 1057 1055 /* 1058 1056 * goal is a group relative block number (if there is a goal) 1059 1057 * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb) ··· 1078 1078 * then we could go to allocate from the reservation window directly. 
1079 1079 */ 1080 1080 while (1) { 1081 - struct ext3_reserve_window rsv_copy; 1082 - 1083 - rsv_copy._rsv_start = my_rsv->rsv_start; 1084 - rsv_copy._rsv_end = my_rsv->rsv_end; 1085 - 1086 - if (rsv_is_empty(&rsv_copy) || (ret < 0) || 1087 - !goal_in_my_reservation(&rsv_copy, goal, group, sb)) { 1088 - spin_lock(rsv_lock); 1081 + if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || 1082 + !goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) { 1089 1083 ret = alloc_new_reservation(my_rsv, goal, sb, 1090 1084 group, bitmap_bh); 1091 - rsv_copy._rsv_start = my_rsv->rsv_start; 1092 - rsv_copy._rsv_end = my_rsv->rsv_end; 1093 - spin_unlock(rsv_lock); 1094 1085 if (ret < 0) 1095 1086 break; /* failed */ 1096 1087 1097 - if (!goal_in_my_reservation(&rsv_copy, goal, group, sb)) 1088 + if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) 1098 1089 goal = -1; 1099 1090 } 1100 - if ((rsv_copy._rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) 1101 - || (rsv_copy._rsv_end < group_first_block)) 1091 + if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) 1092 + || (my_rsv->rsv_end < group_first_block)) 1102 1093 BUG(); 1103 1094 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, 1104 - &rsv_copy); 1095 + &my_rsv->rsv_window); 1105 1096 if (ret >= 0) { 1106 1097 my_rsv->rsv_alloc_hit++; 1107 1098 break; /* succeed */
+4
fs/ext3/file.c
··· 36 36 /* if we are the last writer on the inode, drop the block reservation */ 37 37 if ((filp->f_mode & FMODE_WRITE) && 38 38 (atomic_read(&inode->i_writecount) == 1)) 39 + { 40 + down(&EXT3_I(inode)->truncate_sem); 39 41 ext3_discard_reservation(inode); 42 + up(&EXT3_I(inode)->truncate_sem); 43 + } 40 44 if (is_dx(inode) && filp->private_data) 41 45 ext3_htree_free_dir_info(filp->private_data); 42 46