Merge tag 'for-6.3-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

- scan block devices in non-exclusive mode to avoid temporary mkfs
failures

- fix race between quota disable and quota assign ioctls

- fix deadlock when aborting transaction during relocation with scrub

- ignore fiemap path cache when there are multiple paths for a node

* tag 'for-6.3-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: ignore fiemap path cache when there are multiple paths for a node
btrfs: fix deadlock when aborting transaction during relocation with scrub
btrfs: scan device in non-exclusive mode
btrfs: fix race between quota disable and quota assign ioctls

Changed files: +107 -26

fs/btrfs/backref.c: +63 -22
···
        level = -1;
        ULIST_ITER_INIT(&uiter);
        while (1) {
-               bool is_shared;
-               bool cached;
+               const unsigned long prev_ref_count = ctx->refs.nnodes;
 
                walk_ctx.bytenr = bytenr;
                ret = find_parent_nodes(&walk_ctx, &shared);
···
                ret = 0;
 
                /*
-                * If our data extent was not directly shared (without multiple
-                * reference items), than it might have a single reference item
-                * with a count > 1 for the same offset, which means there are 2
-                * (or more) file extent items that point to the data extent -
-                * this happens when a file extent item needs to be split and
-                * then one item gets moved to another leaf due to a b+tree leaf
-                * split when inserting some item. In this case the file extent
-                * items may be located in different leaves and therefore some
-                * of the leaves may be referenced through shared subtrees while
-                * others are not. Since our extent buffer cache only works for
-                * a single path (by far the most common case and simpler to
-                * deal with), we can not use it if we have multiple leaves
-                * (which implies multiple paths).
+                * More than one extent buffer (bytenr) may have been added to
+                * the ctx->refs ulist, in which case we have to check multiple
+                * tree paths in case the first one is not shared, so we can not
+                * use the path cache which is made for a single path. Multiple
+                * extent buffers at the current level happen when:
+                *
+                * 1) level -1, the data extent: If our data extent was not
+                *    directly shared (without multiple reference items), then
+                *    it might have a single reference item with a count > 1 for
+                *    the same offset, which means there are 2 (or more) file
+                *    extent items that point to the data extent - this happens
+                *    when a file extent item needs to be split and then one
+                *    item gets moved to another leaf due to a b+tree leaf split
+                *    when inserting some item. In this case the file extent
+                *    items may be located in different leaves and therefore
+                *    some of the leaves may be referenced through shared
+                *    subtrees while others are not. Since our extent buffer
+                *    cache only works for a single path (by far the most common
+                *    case and simpler to deal with), we can not use it if we
+                *    have multiple leaves (which implies multiple paths).
+                *
+                * 2) level >= 0, a tree node/leaf: We can have a mix of direct
+                *    and indirect references on a b+tree node/leaf, so we have
+                *    to check multiple paths, and the extent buffer (the
+                *    current bytenr) may be shared or not. One example is
+                *    during relocation as we may get a shared tree block ref
+                *    (direct ref) and a non-shared tree block ref (indirect
+                *    ref) for the same node/leaf.
                 */
-               if (level == -1 && ctx->refs.nnodes > 1)
+               if ((ctx->refs.nnodes - prev_ref_count) > 1)
                        ctx->use_path_cache = false;
 
                if (level >= 0)
···
                if (!node)
                        break;
                bytenr = node->val;
-               level++;
-               cached = lookup_backref_shared_cache(ctx, root, bytenr, level,
-                                                    &is_shared);
-               if (cached) {
-                       ret = (is_shared ? 1 : 0);
-                       break;
+               if (ctx->use_path_cache) {
+                       bool is_shared;
+                       bool cached;
+
+                       level++;
+                       cached = lookup_backref_shared_cache(ctx, root, bytenr,
+                                                            level, &is_shared);
+                       if (cached) {
+                               ret = (is_shared ? 1 : 0);
+                               break;
+                       }
                }
                shared.share_count = 0;
                shared.have_delayed_delete_refs = false;
                cond_resched();
+       }
+
+       /*
+        * If the path cache is disabled, then it means at some tree level we
+        * got multiple parents due to a mix of direct and indirect backrefs or
+        * multiple leaves with file extent items pointing to the same data
+        * extent. We have to invalidate the cache and cache only the sharedness
+        * result for the levels where we got only one node/reference.
+        */
+       if (!ctx->use_path_cache) {
+               int i = 0;
+
+               level--;
+               if (ret >= 0 && level >= 0) {
+                       bytenr = ctx->path_cache_entries[level].bytenr;
+                       ctx->use_path_cache = true;
+                       store_backref_shared_cache(ctx, root, bytenr, level, ret);
+                       i = level + 1;
+               }
+
+               for ( ; i < BTRFS_MAX_LEVEL; i++)
+                       ctx->path_cache_entries[i].bytenr = 0;
        }
 
        /*
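The heart of this fix is the prev_ref_count delta: instead of checking for
multiple references only at the data extent level (-1), every iteration now
compares the size of the ulist before and after walking one level, and
disables the single-path cache as soon as any level yields more than one
parent. A minimal userspace sketch of that pattern, where ref_list,
walk_one_level() and the per-level parent counts are invented stand-ins for
the ctx->refs ulist and find_parent_nodes(), not btrfs code:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for the ulist of discovered parent nodes. */
struct ref_list {
        unsigned long nnodes;
};

/* Pretend walking one tree level discovers this many parent nodes. */
static void walk_one_level(struct ref_list *refs, unsigned long parents)
{
        refs->nnodes += parents;
}

int main(void)
{
        /* Parents found per level: level 2 has two paths (mixed refs). */
        const unsigned long parents_per_level[] = { 1, 1, 2, 1 };
        struct ref_list refs = { .nnodes = 0 };
        bool use_path_cache = true;

        for (int level = 0; level < 4; level++) {
                /* Snapshot the count before walking, as the btrfs fix does. */
                const unsigned long prev_ref_count = refs.nnodes;

                walk_one_level(&refs, parents_per_level[level]);

                /*
                 * More than one parent added at this level means multiple
                 * tree paths, so a cache built for one path is invalid.
                 */
                if ((refs.nnodes - prev_ref_count) > 1)
                        use_path_cache = false;

                printf("level %d: +%lu parent(s), cache %s\n", level,
                       refs.nnodes - prev_ref_count,
                       use_path_cache ? "usable" : "disabled");
        }
        return 0;
}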

fs/btrfs/ioctl.c: +2 -0
···
        }
 
        /* update qgroup status and info */
+       mutex_lock(&fs_info->qgroup_ioctl_lock);
        err = btrfs_run_qgroups(trans);
+       mutex_unlock(&fs_info->qgroup_ioctl_lock);
        if (err < 0)
                btrfs_handle_fs_error(fs_info, err,
                                      "failed to update qgroup status and info");
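The two added lines serialize the assign ioctl's btrfs_run_qgroups() call
against quota disable, which frees the quota root out from under it. A rough
userspace model of that race, with a pthread mutex standing in for
qgroup_ioctl_lock and invented quota_root/assign_ioctl/disable_ioctl names;
without the lock/unlock pair the read in assign_ioctl() could hit freed
memory:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for fs_info->qgroup_ioctl_lock and fs_info->quota_root. */
static pthread_mutex_t qgroup_ioctl_lock = PTHREAD_MUTEX_INITIALIZER;
static int *quota_root;

/* Model of the assign ioctl path: uses quota_root under the lock. */
static void *assign_ioctl(void *arg)
{
        pthread_mutex_lock(&qgroup_ioctl_lock);
        if (quota_root) /* without the lock, this could be freed mid-use */
                printf("run_qgroups: quota root value %d\n", *quota_root);
        pthread_mutex_unlock(&qgroup_ioctl_lock);
        return NULL;
}

/* Model of the disable ioctl path: frees quota_root under the lock. */
static void *disable_ioctl(void *arg)
{
        pthread_mutex_lock(&qgroup_ioctl_lock);
        free(quota_root);
        quota_root = NULL;
        pthread_mutex_unlock(&qgroup_ioctl_lock);
        return NULL;
}

int main(void)
{
        pthread_t a, d;

        quota_root = malloc(sizeof(*quota_root));
        *quota_root = 42;

        pthread_create(&a, NULL, assign_ioctl, NULL);
        pthread_create(&d, NULL, disable_ioctl, NULL);
        pthread_join(a, NULL);
        pthread_join(d, NULL);
        return 0;
}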

fs/btrfs/qgroup.c: +10 -1
···
 }
 
 /*
- * called from commit_transaction. Writes all changed qgroups to disk.
+ * Writes all changed qgroups to disk.
+ * Called by the transaction commit path and the qgroup assign ioctl.
  */
 int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret = 0;
+
+       /*
+        * In case we are called from the qgroup assign ioctl, assert that we
+        * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
+        * disable operation (ioctl) and access a freed quota root.
+        */
+       if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
+               lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
 
        if (!fs_info->quota_root)
                return ret;
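lockdep_assert_held() only fires on lockdep-enabled kernels, but the pattern
itself (assert the caller holds the lock, except on the one path that cannot
take it) is portable. A hedged userspace analogue built on an error-checking
pthread mutex; the from_commit flag mirrors the TRANS_STATE_COMMIT_DOING
check, and all names here are invented for the sketch:

#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t qgroup_ioctl_lock;

/*
 * Userspace stand-in for lockdep_assert_held(): an ERRORCHECK mutex
 * returns EDEADLK when the owning thread tries to relock it, so any
 * other result means the caller did not hold the lock (the stray
 * acquisition on failure does not matter, as assert() aborts).
 */
static void assert_held(pthread_mutex_t *m)
{
        assert(pthread_mutex_lock(m) == EDEADLK);
}

/* Model of btrfs_run_qgroups(): skip the assertion on the commit path. */
static void run_qgroups(bool from_commit)
{
        if (!from_commit)
                assert_held(&qgroup_ioctl_lock);
        puts("qgroups updated");
}

int main(void)
{
        pthread_mutexattr_t attr;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
        pthread_mutex_init(&qgroup_ioctl_lock, &attr);

        pthread_mutex_lock(&qgroup_ioctl_lock);
        run_qgroups(false);     /* ioctl path: lock held, assert passes */
        pthread_mutex_unlock(&qgroup_ioctl_lock);

        run_qgroups(true);      /* commit path: no assertion */
        return 0;
}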

fs/btrfs/transaction.c: +14 -1
···
 
        if (current->journal_info == trans)
                current->journal_info = NULL;
-       btrfs_scrub_cancel(fs_info);
+
+       /*
+        * If relocation is running, we can't cancel scrub because that will
+        * result in a deadlock. Before relocating a block group, relocation
+        * pauses scrub, then starts and commits a transaction before unpausing
+        * scrub. If the transaction commit is being done by the relocation
+        * task or triggered by another task and the relocation task is waiting
+        * for the commit, and we end up here due to an error in the commit
+        * path, then calling btrfs_scrub_cancel() will deadlock, as we are
+        * asking for scrub to stop while having it asked to be paused higher
+        * above in relocation code.
+        */
+       if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+               btrfs_scrub_cancel(fs_info);
 
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
 }
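The cycle described in the comment above can be modelled in a few dozen lines:
cancel waits for the scrub worker, but a paused worker waits to be continued
first, so cancelling while relocation holds scrub paused can never make
progress. A toy pthread sketch of just that shape (all names invented; the
guard in main() plays the role of the BTRFS_FS_RELOC_RUNNING test, and the
program terminates because the guard skips the deadlocking cancel):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool scrub_paused = true;        /* relocation already paused scrub */
static bool reloc_running = true;

/* A paused scrub cannot finish until it is continued. */
static void *scrub_worker(void *arg)
{
        pthread_mutex_lock(&lock);
        while (scrub_paused)
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        puts("scrub finished");
        return NULL;
}

static void scrub_continue(void)
{
        pthread_mutex_lock(&lock);
        scrub_paused = false;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
}

/* Like btrfs_scrub_cancel(), wait for the scrub to finish; joining a
 * paused worker would block forever. */
static void scrub_cancel(pthread_t scrub)
{
        pthread_join(scrub, NULL);
        puts("scrub cancelled");
}

int main(void)
{
        pthread_t scrub;

        pthread_create(&scrub, NULL, scrub_worker, NULL);

        /* Transaction abort path: the fixed code skips the cancel. */
        if (!reloc_running)
                scrub_cancel(scrub);    /* would deadlock here */
        else
                puts("relocation running, skipping scrub cancel");

        /* Relocation eventually unpauses scrub; now cancel is safe. */
        scrub_continue();
        scrub_cancel(scrub);
        return 0;
}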

fs/btrfs/volumes.c: +18 -2
···
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
-       flags |= FMODE_EXCL;
 
+       /*
+        * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may
+        * initiate the device scan which may race with the user's mount
+        * or mkfs command, resulting in failure.
+        * Since the device scan is solely for reading purposes, there is
+        * no need for FMODE_EXCL. Additionally, the devices are read again
+        * during the mount process. It is ok to get some inconsistent
+        * values temporarily, as the device paths of the fsid are the only
+        * required information for assembling the volume.
+        */
        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev))
                return ERR_CAST(bdev);
···
        btrfs_scrub_pause(fs_info);
        ret = btrfs_relocate_block_group(fs_info, chunk_offset);
        btrfs_scrub_continue(fs_info);
-       if (ret)
+       if (ret) {
+               /*
+                * If we had a transaction abort, stop all running scrubs.
+                * See transaction.c:cleanup_transaction() why we do it here.
+                */
+               if (BTRFS_FS_ERROR(fs_info))
+                       btrfs_scrub_cancel(fs_info);
                return ret;
+       }
 
        block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
        if (!block_group)
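In userspace terms, dropping FMODE_EXCL is the difference between opening a
block device with and without O_EXCL: on Linux, O_EXCL on a block device
requests an exclusive claim (roughly what FMODE_EXCL gives
blkdev_get_by_path() in the kernel) and open(2) fails with EBUSY while
another exclusive user such as mount or mkfs holds the device. A small probe
showing why a read-only scan is better off without it; the default device
path is only a placeholder:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void try_open(const char *path, int flags, const char *what)
{
        int fd = open(path, flags);

        if (fd < 0)
                printf("%s open of %s failed: %s\n", what, path,
                       strerror(errno));
        else
                printf("%s open of %s succeeded\n", what, path);
        if (fd >= 0)
                close(fd);
}

int main(int argc, char **argv)
{
        /* Pass a block device in use, e.g. a mounted partition. */
        const char *path = (argc > 1) ? argv[1] : "/dev/sda1";

        try_open(path, O_RDONLY, "non-exclusive");      /* scan-style */
        try_open(path, O_RDONLY | O_EXCL, "exclusive"); /* mkfs/mount-style */
        return 0;
}

Run against a mounted device, the non-exclusive open succeeds while the
exclusive one fails with EBUSY, which is exactly the temporary mkfs/mount
failure the scan-time FMODE_EXCL was causing.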