Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Btrfs: kill btrfs_clear_path_blocking

Btrfs's btree locking has two modes, spinning mode and blocking mode,
while searching btree, locking is always acquired in spinning mode and
then converted to blocking mode if necessary, and in some hot paths we may
switch the locking back to spinning mode by btrfs_clear_path_blocking().

When acquiring locks, both of reader and writer need to wait for blocking
readers and writers to complete before doing read_lock()/write_lock().

The problem is that btrfs_clear_path_blocking() first needs to switch the
nodes in the path to blocking mode (via btrfs_set_path_blocking) to keep
lockdep happy, before doing its actual job of clearing the blocking state.

When switching to blocking mode from spinning mode, it consists of

step 1) bumping up blocking readers counter and
step 2) read_unlock()/write_unlock(),

this has caused a serious ping-pong effect when there is a large number of
concurrent readers/writers, as waiters are woken up only to go back to
sleep immediately.

1) Killing this kind of ping-pong results in a big improvement in my 1600k
files creation script,

MNT=/mnt/btrfs
mkfs.btrfs -f /dev/sdf
mount /dev/sdf $MNT
time fsmark -D 10000 -S0 -n 100000 -s 0 -L 1 -l /tmp/fs_log.txt \
-d $MNT/0 -d $MNT/1 \
-d $MNT/2 -d $MNT/3 \
-d $MNT/4 -d $MNT/5 \
-d $MNT/6 -d $MNT/7 \
-d $MNT/8 -d $MNT/9 \
-d $MNT/10 -d $MNT/11 \
-d $MNT/12 -d $MNT/13 \
-d $MNT/14 -d $MNT/15

w/o patch:
real 2m27.307s
user 0m12.839s
sys 13m42.831s

w/ patch:
real 1m2.273s
user 0m15.802s
sys 8m16.495s

1.1) latency histogram from funclatency[1]

Overall, with the patch there are ~50% fewer write lock acquisitions, and
the maximum latency of a write lock acquisition (95th percentile) also
drops to ~100ms from >500ms.

--------------------------------------------
w/o patch:
--------------------------------------------
Function = btrfs_tree_lock
msecs : count distribution
0 -> 1 : 2385222 |****************************************|
2 -> 3 : 37147 | |
4 -> 7 : 20452 | |
8 -> 15 : 13131 | |
16 -> 31 : 3877 | |
32 -> 63 : 3900 | |
64 -> 127 : 2612 | |
128 -> 255 : 974 | |
256 -> 511 : 165 | |
512 -> 1023 : 13 | |

Function = btrfs_tree_read_lock
msecs : count distribution
0 -> 1 : 6743860 |****************************************|
2 -> 3 : 2146 | |
4 -> 7 : 190 | |
8 -> 15 : 38 | |
16 -> 31 : 4 | |

--------------------------------------------
w/ patch:
--------------------------------------------
Function = btrfs_tree_lock
msecs : count distribution
0 -> 1 : 1318454 |****************************************|
2 -> 3 : 6800 | |
4 -> 7 : 3664 | |
8 -> 15 : 2145 | |
16 -> 31 : 809 | |
32 -> 63 : 219 | |
64 -> 127 : 10 | |

Function = btrfs_tree_read_lock
msecs : count distribution
0 -> 1 : 6854317 |****************************************|
2 -> 3 : 2383 | |
4 -> 7 : 601 | |
8 -> 15 : 92 | |

2) dbench also proves the improvement,
dbench -t 120 -D /mnt/btrfs 16

w/o patch:
Throughput 158.363 MB/sec

w/ patch:
Throughput 449.52 MB/sec

3) xfstests didn't show any additional failures.

One thing to note is that callers may set path->leave_spinning to have
all nodes in the path stay in spinning mode, which means such callers are
prepared to release the path without sleeping; however, it causes no
problem if they end up holding blocking locks, since they are still free
not to sleep before releasing the path.

[1]: https://github.com/iovisor/bcc/blob/master/tools/funclatency.py

Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: David Sterba <dsterba@suse.com>

authored by

Liu Bo and committed by
David Sterba
52398340 9b142115

+4 -58
+4 -53
fs/btrfs/ctree.c
··· 52 52 } 53 53 } 54 54 55 - /* 56 - * reset all the locked nodes in the patch to spinning locks. 57 - * 58 - * held is used to keep lockdep happy, when lockdep is enabled 59 - * we set held to a blocking lock before we go around and 60 - * retake all the spinlocks in the path. You can safely use NULL 61 - * for held 62 - */ 63 - noinline void btrfs_clear_path_blocking(struct btrfs_path *p, 64 - struct extent_buffer *held, int held_rw) 65 - { 66 - int i; 67 - 68 - if (held) { 69 - btrfs_set_lock_blocking_rw(held, held_rw); 70 - if (held_rw == BTRFS_WRITE_LOCK) 71 - held_rw = BTRFS_WRITE_LOCK_BLOCKING; 72 - else if (held_rw == BTRFS_READ_LOCK) 73 - held_rw = BTRFS_READ_LOCK_BLOCKING; 74 - } 75 - btrfs_set_path_blocking(p); 76 - 77 - for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { 78 - if (p->nodes[i] && p->locks[i]) { 79 - btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]); 80 - if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING) 81 - p->locks[i] = BTRFS_WRITE_LOCK; 82 - else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING) 83 - p->locks[i] = BTRFS_READ_LOCK; 84 - } 85 - } 86 - 87 - if (held) 88 - btrfs_clear_lock_blocking_rw(held, held_rw); 89 - } 90 - 91 55 /* this also releases the path */ 92 56 void btrfs_free_path(struct btrfs_path *p) 93 57 { ··· 1270 1306 } 1271 1307 } 1272 1308 1273 - btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK); 1274 1309 btrfs_tree_read_unlock_blocking(eb); 1275 1310 free_extent_buffer(eb); 1276 1311 ··· 2445 2482 btrfs_set_path_blocking(p); 2446 2483 reada_for_balance(fs_info, p, level); 2447 2484 sret = split_node(trans, root, p, level); 2448 - btrfs_clear_path_blocking(p, NULL, 0); 2449 2485 2450 2486 BUG_ON(sret > 0); 2451 2487 if (sret) { ··· 2465 2503 btrfs_set_path_blocking(p); 2466 2504 reada_for_balance(fs_info, p, level); 2467 2505 sret = balance_level(trans, root, p, level); 2468 - btrfs_clear_path_blocking(p, NULL, 0); 2469 2506 2470 2507 if (sret) { 2471 2508 ret = sret; ··· 2749 2788 } 2750 2789 cow_done: 2751 2790 
p->nodes[level] = b; 2752 - btrfs_clear_path_blocking(p, NULL, 0); 2791 + /* 2792 + * Leave path with blocking locks to avoid massive 2793 + * lock context switch, this is made on purpose. 2794 + */ 2753 2795 2754 2796 /* 2755 2797 * we have a lock on b and as long as we aren't changing ··· 2834 2870 if (!err) { 2835 2871 btrfs_set_path_blocking(p); 2836 2872 btrfs_tree_lock(b); 2837 - btrfs_clear_path_blocking(p, b, 2838 - BTRFS_WRITE_LOCK); 2839 2873 } 2840 2874 p->locks[level] = BTRFS_WRITE_LOCK; 2841 2875 } else { ··· 2841 2879 if (!err) { 2842 2880 btrfs_set_path_blocking(p); 2843 2881 btrfs_tree_read_lock(b); 2844 - btrfs_clear_path_blocking(p, b, 2845 - BTRFS_READ_LOCK); 2846 2882 } 2847 2883 p->locks[level] = BTRFS_READ_LOCK; 2848 2884 } ··· 2859 2899 btrfs_set_path_blocking(p); 2860 2900 err = split_leaf(trans, root, key, 2861 2901 p, ins_len, ret == 0); 2862 - btrfs_clear_path_blocking(p, NULL, 0); 2863 2902 2864 2903 BUG_ON(err > 0); 2865 2904 if (err) { ··· 2929 2970 while (b) { 2930 2971 level = btrfs_header_level(b); 2931 2972 p->nodes[level] = b; 2932 - btrfs_clear_path_blocking(p, NULL, 0); 2933 2973 2934 2974 /* 2935 2975 * we have a lock on b and as long as we aren't changing ··· 2974 3016 if (!err) { 2975 3017 btrfs_set_path_blocking(p); 2976 3018 btrfs_tree_read_lock(b); 2977 - btrfs_clear_path_blocking(p, b, 2978 - BTRFS_READ_LOCK); 2979 3019 } 2980 3020 b = tree_mod_log_rewind(fs_info, p, b, time_seq); 2981 3021 if (!b) { ··· 5157 5201 path->locks[level - 1] = BTRFS_READ_LOCK; 5158 5202 path->nodes[level - 1] = cur; 5159 5203 unlock_up(path, level, 1, 0, NULL); 5160 - btrfs_clear_path_blocking(path, NULL, 0); 5161 5204 } 5162 5205 out: 5163 5206 path->keep_locks = keep_locks; ··· 5741 5786 if (!ret) { 5742 5787 btrfs_set_path_blocking(path); 5743 5788 btrfs_tree_read_lock(next); 5744 - btrfs_clear_path_blocking(path, next, 5745 - BTRFS_READ_LOCK); 5746 5789 } 5747 5790 next_rw_lock = BTRFS_READ_LOCK; 5748 5791 } ··· 5776 5823 if (!ret) { 5777 
5824 btrfs_set_path_blocking(path); 5778 5825 btrfs_tree_read_lock(next); 5779 - btrfs_clear_path_blocking(path, next, 5780 - BTRFS_READ_LOCK); 5781 5826 } 5782 5827 next_rw_lock = BTRFS_READ_LOCK; 5783 5828 }
-2
fs/btrfs/ctree.h
··· 2868 2868 struct btrfs_path *btrfs_alloc_path(void); 2869 2869 void btrfs_free_path(struct btrfs_path *p); 2870 2870 void btrfs_set_path_blocking(struct btrfs_path *p); 2871 - void btrfs_clear_path_blocking(struct btrfs_path *p, 2872 - struct extent_buffer *held, int held_rw); 2873 2871 void btrfs_unlock_up_safe(struct btrfs_path *p, int level); 2874 2872 2875 2873 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-3
fs/btrfs/delayed-inode.c
··· 765 765 i++; 766 766 } 767 767 768 - /* reset all the locked nodes in the patch to spinning locks. */ 769 - btrfs_clear_path_blocking(path, NULL, 0); 770 - 771 768 /* insert the keys of the items */ 772 769 setup_items_for_insert(root, path, keys, data_size, 773 770 total_data_size, total_size, nitems);