Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Btrfs: fix deadlock on tree root leaf when finding free extent

When we are writing out a free space cache, during the transaction commit
phase, we can end up in a deadlock which results in a stack trace like the
following:

schedule+0x28/0x80
btrfs_tree_read_lock+0x8e/0x120 [btrfs]
? finish_wait+0x80/0x80
btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
btrfs_search_slot+0xf6/0x9f0 [btrfs]
? evict_refill_and_join+0xd0/0xd0 [btrfs]
? inode_insert5+0x119/0x190
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_iget+0x113/0x690 [btrfs]
__lookup_free_space_inode+0xd8/0x150 [btrfs]
lookup_free_space_inode+0x5b/0xb0 [btrfs]
load_free_space_cache+0x7c/0x170 [btrfs]
? cache_block_group+0x72/0x3b0 [btrfs]
cache_block_group+0x1b3/0x3b0 [btrfs]
? finish_wait+0x80/0x80
find_free_extent+0x799/0x1010 [btrfs]
btrfs_reserve_extent+0x9b/0x180 [btrfs]
btrfs_alloc_tree_block+0x1b3/0x4f0 [btrfs]
__btrfs_cow_block+0x11d/0x500 [btrfs]
btrfs_cow_block+0xdc/0x180 [btrfs]
btrfs_search_slot+0x3bd/0x9f0 [btrfs]
btrfs_lookup_inode+0x3a/0xc0 [btrfs]
? kmem_cache_alloc+0x166/0x1d0
btrfs_update_inode_item+0x46/0x100 [btrfs]
cache_save_setup+0xe4/0x3a0 [btrfs]
btrfs_start_dirty_block_groups+0x1be/0x480 [btrfs]
btrfs_commit_transaction+0xcb/0x8b0 [btrfs]

At cache_save_setup() we need to update the inode item of a block group's
cache, which is located in the tree root (fs_info->tree_root), so the
update may result in COWing a leaf from that tree. If that happens we need
to find a free metadata extent and, while looking for one, if we find a
block group which was not cached yet, we attempt to load its cache by
calling cache_block_group(). However, this function will try to load the
inode of the free space cache, which requires finding the matching inode
item in the tree root. If that inode item is located in the same leaf as
the inode item of the space cache we are updating at cache_save_setup(),
we end up in a deadlock, since we try to obtain a read lock on the same
extent buffer that we previously write locked.

So fix this by using the tree root's commit root when searching for a
block group's free space cache inode item when we are attempting to load
a free space cache. This is safe since block groups, once loaded, stay in
memory forever, as do their caches, so after they are first loaded we
will never need to read their inode items again. New block groups, once
created, get their ->cached field set to BTRFS_CACHE_FINISHED, meaning we
will not need to read their inode item.

Reported-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CAPTELenq9x5KOWuQ+fa7h1r3nsJG8vyiTH8+ifjURc_duHh2Wg@mail.gmail.com/
Fixes: 9d66e233c704 ("Btrfs: load free space cache if it exists")
Tested-by: Andrew Nelson <andrew.s.nelson@gmail.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>

authored by Filipe Manana and committed by David Sterba
4222ea71 7e17916b

+46 -11
+3
fs/btrfs/ctree.h
··· 3163 3163 int btrfs_drop_inode(struct inode *inode); 3164 3164 int __init btrfs_init_cachep(void); 3165 3165 void __cold btrfs_destroy_cachep(void); 3166 + struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, 3167 + struct btrfs_root *root, int *new, 3168 + struct btrfs_path *path); 3166 3169 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 3167 3170 struct btrfs_root *root, int *was_new); 3168 3171 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
+21 -1
fs/btrfs/free-space-cache.c
··· 75 75 * sure NOFS is set to keep us from deadlocking. 76 76 */ 77 77 nofs_flag = memalloc_nofs_save(); 78 - inode = btrfs_iget(fs_info->sb, &location, root, NULL); 78 + inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path); 79 + btrfs_release_path(path); 79 80 memalloc_nofs_restore(nofs_flag); 80 81 if (IS_ERR(inode)) 81 82 return inode; ··· 839 838 path->search_commit_root = 1; 840 839 path->skip_locking = 1; 841 840 841 + /* 842 + * We must pass a path with search_commit_root set to btrfs_iget in 843 + * order to avoid a deadlock when allocating extents for the tree root. 844 + * 845 + * When we are COWing an extent buffer from the tree root, when looking 846 + * for a free extent, at extent-tree.c:find_free_extent(), we can find 847 + * block group without its free space cache loaded. When we find one 848 + * we must load its space cache which requires reading its free space 849 + * cache's inode item from the root tree. If this inode item is located 850 + * in the same leaf that we started COWing before, then we end up in 851 + * deadlock on the extent buffer (trying to read lock it when we 852 + * previously write locked it). 853 + * 854 + * It's safe to read the inode item using the commit root because 855 + * block groups, once loaded, stay in memory forever (until they are 856 + * removed) as well as their space caches once loaded. New block groups 857 + * once created get their ->cached field set to BTRFS_CACHE_FINISHED so 858 + * we will never try to read their inode item while the fs is mounted. 859 + */ 842 860 inode = lookup_free_space_inode(fs_info, block_group, path); 843 861 if (IS_ERR(inode)) { 844 862 btrfs_free_path(path);
+22 -10
fs/btrfs/inode.c
··· 3569 3569 /* 3570 3570 * read an inode from the btree into the in-memory inode 3571 3571 */ 3572 - static int btrfs_read_locked_inode(struct inode *inode) 3572 + static int btrfs_read_locked_inode(struct inode *inode, 3573 + struct btrfs_path *in_path) 3573 3574 { 3574 3575 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3575 - struct btrfs_path *path; 3576 + struct btrfs_path *path = in_path; 3576 3577 struct extent_buffer *leaf; 3577 3578 struct btrfs_inode_item *inode_item; 3578 3579 struct btrfs_root *root = BTRFS_I(inode)->root; ··· 3589 3588 if (!ret) 3590 3589 filled = true; 3591 3590 3592 - path = btrfs_alloc_path(); 3593 - if (!path) 3594 - return -ENOMEM; 3591 + if (!path) { 3592 + path = btrfs_alloc_path(); 3593 + if (!path) 3594 + return -ENOMEM; 3595 + } 3595 3596 3596 3597 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 3597 3598 3598 3599 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 3599 3600 if (ret) { 3600 - btrfs_free_path(path); 3601 + if (path != in_path) 3602 + btrfs_free_path(path); 3601 3603 return ret; 3602 3604 } 3603 3605 ··· 3725 3721 btrfs_ino(BTRFS_I(inode)), 3726 3722 root->root_key.objectid, ret); 3727 3723 } 3728 - btrfs_free_path(path); 3724 + if (path != in_path) 3725 + btrfs_free_path(path); 3729 3726 3730 3727 if (!maybe_acls) 3731 3728 cache_no_acl(inode); ··· 5648 5643 /* Get an inode object given its location and corresponding root. 
5649 5644 * Returns in *is_new if the inode was read from disk 5650 5645 */ 5651 - struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 5652 - struct btrfs_root *root, int *new) 5646 + struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, 5647 + struct btrfs_root *root, int *new, 5648 + struct btrfs_path *path) 5653 5649 { 5654 5650 struct inode *inode; 5655 5651 ··· 5661 5655 if (inode->i_state & I_NEW) { 5662 5656 int ret; 5663 5657 5664 - ret = btrfs_read_locked_inode(inode); 5658 + ret = btrfs_read_locked_inode(inode, path); 5665 5659 if (!ret) { 5666 5660 inode_tree_add(inode); 5667 5661 unlock_new_inode(inode); ··· 5681 5675 } 5682 5676 5683 5677 return inode; 5678 + } 5679 + 5680 + struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 5681 + struct btrfs_root *root, int *new) 5682 + { 5683 + return btrfs_iget_path(s, location, root, new, NULL); 5684 5684 } 5685 5685 5686 5686 static struct inode *new_simple_dir(struct super_block *s,