Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-6.11-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

- fix regression in extent map rework when handling insertion of
overlapping compressed extent

- fix unexpected file length when appending to a file using direct io
and buffer not faulted in

- in zoned mode, fix accounting of unusable space when flipping
read-only block group back to read-write

- fix page locking when COWing an inline range, assertion failure found
by syzbot

- fix calculation of space info in debugging print

- tree-checker, add validation of data reference item

- fix a few -Wmaybe-uninitialized build warnings

* tag 'for-6.11-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: initialize location to fix -Wmaybe-uninitialized in btrfs_lookup_dentry()
btrfs: fix corruption after buffer fault in during direct IO append write
btrfs: zoned: fix zone_unusable accounting on making block group read-write again
btrfs: do not subtract delalloc from avail bytes
btrfs: make cow_file_range_inline() honor locked_page on error
btrfs: fix corrupt read due to bad offset of a compressed extent map
btrfs: tree-checker: validate dref root and objectid

+225 -31
+8 -5
fs/btrfs/block-group.c
··· 1223 1223 block_group->space_info->total_bytes -= block_group->length; 1224 1224 block_group->space_info->bytes_readonly -= 1225 1225 (block_group->length - block_group->zone_unusable); 1226 - block_group->space_info->bytes_zone_unusable -= 1227 - block_group->zone_unusable; 1226 + btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info, 1227 + -block_group->zone_unusable); 1228 1228 block_group->space_info->disk_total -= block_group->length * factor; 1229 1229 1230 1230 spin_unlock(&block_group->space_info->lock); ··· 1396 1396 if (btrfs_is_zoned(cache->fs_info)) { 1397 1397 /* Migrate zone_unusable bytes to readonly */ 1398 1398 sinfo->bytes_readonly += cache->zone_unusable; 1399 - sinfo->bytes_zone_unusable -= cache->zone_unusable; 1399 + btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, 1400 + -cache->zone_unusable); 1400 1401 cache->zone_unusable = 0; 1401 1402 } 1402 1403 cache->ro++; ··· 3057 3056 if (btrfs_is_zoned(cache->fs_info)) { 3058 3057 /* Migrate zone_unusable bytes back */ 3059 3058 cache->zone_unusable = 3060 - (cache->alloc_offset - cache->used) + 3059 + (cache->alloc_offset - cache->used - cache->pinned - 3060 + cache->reserved) + 3061 3061 (cache->length - cache->zone_capacity); 3062 - sinfo->bytes_zone_unusable += cache->zone_unusable; 3062 + btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo, 3063 + cache->zone_unusable); 3063 3064 sinfo->bytes_readonly -= cache->zone_unusable; 3064 3065 } 3065 3066 num_bytes = cache->length - cache->reserved -
+1
fs/btrfs/ctree.h
··· 459 459 void *filldir_buf; 460 460 u64 last_index; 461 461 struct extent_state *llseek_cached_state; 462 + bool fsync_skip_inode_lock; 462 463 }; 463 464 464 465 static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
+28 -10
fs/btrfs/direct-io.c
··· 856 856 * So here we disable page faults in the iov_iter and then retry if we 857 857 * got -EFAULT, faulting in the pages before the retry. 858 858 */ 859 + again: 859 860 from->nofault = true; 860 861 dio = btrfs_dio_write(iocb, from, written); 861 862 from->nofault = false; 862 863 863 - /* 864 - * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync 865 - * iocb, and that needs to lock the inode. So unlock it before calling 866 - * iomap_dio_complete() to avoid a deadlock. 867 - */ 868 - btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 869 - 870 - if (IS_ERR_OR_NULL(dio)) 864 + if (IS_ERR_OR_NULL(dio)) { 871 865 ret = PTR_ERR_OR_ZERO(dio); 872 - else 866 + } else { 867 + struct btrfs_file_private stack_private = { 0 }; 868 + struct btrfs_file_private *private; 869 + const bool have_private = (file->private_data != NULL); 870 + 871 + if (!have_private) 872 + file->private_data = &stack_private; 873 + 874 + /* 875 + * If we have a synchronous write, we must make sure the fsync 876 + * triggered by the iomap_dio_complete() call below doesn't 877 + * deadlock on the inode lock - we are already holding it and we 878 + * can't call it after unlocking because we may need to complete 879 + * partial writes due to the input buffer (or parts of it) not 880 + * being already faulted in. 881 + */ 882 + private = file->private_data; 883 + private->fsync_skip_inode_lock = true; 873 884 ret = iomap_dio_complete(dio); 885 + private->fsync_skip_inode_lock = false; 886 + 887 + if (!have_private) 888 + file->private_data = NULL; 889 + } 874 890 875 891 /* No increment (+=) because iomap returns a cumulative value. */ 876 892 if (ret > 0) ··· 913 897 } else { 914 898 fault_in_iov_iter_readable(from, left); 915 899 prev_left = left; 916 - goto relock; 900 + goto again; 917 901 } 918 902 } 903 + 904 + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 919 905 920 906 /* 921 907 * If 'ret' is -ENOTBLK or we have not written all data, then it means
+2 -1
fs/btrfs/extent-tree.c
··· 2793 2793 readonly = true; 2794 2794 } else if (btrfs_is_zoned(fs_info)) { 2795 2795 /* Need reset before reusing in a zoned block group */ 2796 - space_info->bytes_zone_unusable += len; 2796 + btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info, 2797 + len); 2797 2798 readonly = true; 2798 2799 } 2799 2800 spin_unlock(&cache->lock);
+1 -1
fs/btrfs/extent_map.c
··· 664 664 start_diff = start - em->start; 665 665 em->start = start; 666 666 em->len = end - start; 667 - if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE && !extent_map_is_compressed(em)) 667 + if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) 668 668 em->offset += start_diff; 669 669 return add_extent_mapping(inode, em, 0); 670 670 }
+14 -3
fs/btrfs/file.c
··· 1603 1603 */ 1604 1604 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) 1605 1605 { 1606 + struct btrfs_file_private *private = file->private_data; 1606 1607 struct dentry *dentry = file_dentry(file); 1607 1608 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); 1608 1609 struct btrfs_root *root = inode->root; ··· 1613 1612 int ret = 0, err; 1614 1613 u64 len; 1615 1614 bool full_sync; 1615 + const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false); 1616 1616 1617 1617 trace_btrfs_sync_file(file, datasync); 1618 1618 ··· 1641 1639 if (ret) 1642 1640 goto out; 1643 1641 1644 - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); 1642 + if (skip_ilock) 1643 + down_write(&inode->i_mmap_lock); 1644 + else 1645 + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); 1645 1646 1646 1647 atomic_inc(&root->log_batch); 1647 1648 ··· 1668 1663 */ 1669 1664 ret = start_ordered_ops(inode, start, end); 1670 1665 if (ret) { 1671 - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 1666 + if (skip_ilock) 1667 + up_write(&inode->i_mmap_lock); 1668 + else 1669 + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 1672 1670 goto out; 1673 1671 } 1674 1672 ··· 1796 1788 * file again, but that will end up using the synchronization 1797 1789 * inside btrfs_sync_log to keep things safe. 1798 1790 */ 1799 - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 1791 + if (skip_ilock) 1792 + up_write(&inode->i_mmap_lock); 1793 + else 1794 + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 1800 1795 1801 1796 if (ret == BTRFS_NO_LOG_SYNC) { 1802 1797 ret = btrfs_end_transaction(trans);
+3 -1
fs/btrfs/free-space-cache.c
··· 2723 2723 * If the block group is read-only, we should account freed space into 2724 2724 * bytes_readonly. 2725 2725 */ 2726 - if (!block_group->ro) 2726 + if (!block_group->ro) { 2727 2727 block_group->zone_unusable += to_unusable; 2728 + WARN_ON(block_group->zone_unusable > block_group->length); 2729 + } 2728 2730 spin_unlock(&ctl->tree_lock); 2729 2731 if (!used) { 2730 2732 spin_lock(&block_group->lock);
+11 -7
fs/btrfs/inode.c
··· 714 714 return ret; 715 715 } 716 716 717 - static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 offset, 718 - u64 end, 717 + static noinline int cow_file_range_inline(struct btrfs_inode *inode, 718 + struct page *locked_page, 719 + u64 offset, u64 end, 719 720 size_t compressed_size, 720 721 int compress_type, 721 722 struct folio *compressed_folio, ··· 740 739 return ret; 741 740 } 742 741 743 - extent_clear_unlock_delalloc(inode, offset, end, NULL, &cached, 742 + if (ret == 0) 743 + locked_page = NULL; 744 + 745 + extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached, 744 746 clear_flags, 745 747 PAGE_UNLOCK | PAGE_START_WRITEBACK | 746 748 PAGE_END_WRITEBACK); ··· 1047 1043 * extent for the subpage case. 1048 1044 */ 1049 1045 if (total_in < actual_end) 1050 - ret = cow_file_range_inline(inode, start, end, 0, 1046 + ret = cow_file_range_inline(inode, NULL, start, end, 0, 1051 1047 BTRFS_COMPRESS_NONE, NULL, false); 1052 1048 else 1053 - ret = cow_file_range_inline(inode, start, end, total_compressed, 1049 + ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, 1054 1050 compress_type, folios[0], false); 1055 1051 if (ret <= 0) { 1056 1052 if (ret < 0) ··· 1363 1359 1364 1360 if (!no_inline) { 1365 1361 /* lets try to make an inline extent */ 1366 - ret = cow_file_range_inline(inode, start, end, 0, 1362 + ret = cow_file_range_inline(inode, locked_page, start, end, 0, 1367 1363 BTRFS_COMPRESS_NONE, NULL, false); 1368 1364 if (ret <= 0) { 1369 1365 /* ··· 5664 5660 struct inode *inode; 5665 5661 struct btrfs_root *root = BTRFS_I(dir)->root; 5666 5662 struct btrfs_root *sub_root = root; 5667 - struct btrfs_key location; 5663 + struct btrfs_key location = { 0 }; 5668 5664 u8 di_type = 0; 5669 5665 int ret = 0; 5670 5666
+2 -3
fs/btrfs/space-info.c
··· 316 316 found->bytes_used += block_group->used; 317 317 found->disk_used += block_group->used * factor; 318 318 found->bytes_readonly += block_group->bytes_super; 319 - found->bytes_zone_unusable += block_group->zone_unusable; 319 + btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable); 320 320 if (block_group->length > 0) 321 321 found->full = 0; 322 322 btrfs_try_granting_tickets(info, found); ··· 583 583 584 584 spin_lock(&cache->lock); 585 585 avail = cache->length - cache->used - cache->pinned - 586 - cache->reserved - cache->delalloc_bytes - 587 - cache->bytes_super - cache->zone_unusable; 586 + cache->reserved - cache->bytes_super - cache->zone_unusable; 588 587 btrfs_info(fs_info, 589 588 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", 590 589 cache->start, cache->length, cache->used, cache->pinned,
+1
fs/btrfs/space-info.h
··· 249 249 250 250 DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); 251 251 DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); 252 + DECLARE_SPACE_INFO_UPDATE(bytes_zone_unusable, "zone_unusable"); 252 253 253 254 int btrfs_init_space_info(struct btrfs_fs_info *fs_info); 254 255 void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+99
fs/btrfs/tests/extent-map-tests.c
··· 900 900 return ret; 901 901 } 902 902 903 + /* 904 + * Test a regression for compressed extent map adjustment when we attempt to 905 + * add an extent map that is partially overlapped by another existing extent 906 + * map. The resulting extent map offset was left unchanged despite having 907 + * incremented its start offset. 908 + */ 909 + static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) 910 + { 911 + struct extent_map_tree *em_tree = &inode->extent_tree; 912 + struct extent_map *em; 913 + int ret; 914 + int ret2; 915 + 916 + em = alloc_extent_map(); 917 + if (!em) { 918 + test_std_err(TEST_ALLOC_EXTENT_MAP); 919 + return -ENOMEM; 920 + } 921 + 922 + /* Compressed extent for the file range [120K, 128K). */ 923 + em->start = SZ_1K * 120; 924 + em->len = SZ_8K; 925 + em->disk_num_bytes = SZ_4K; 926 + em->ram_bytes = SZ_8K; 927 + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; 928 + write_lock(&em_tree->lock); 929 + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); 930 + write_unlock(&em_tree->lock); 931 + free_extent_map(em); 932 + if (ret < 0) { 933 + test_err("couldn't add extent map for range [120K, 128K)"); 934 + goto out; 935 + } 936 + 937 + em = alloc_extent_map(); 938 + if (!em) { 939 + test_std_err(TEST_ALLOC_EXTENT_MAP); 940 + ret = -ENOMEM; 941 + goto out; 942 + } 943 + 944 + /* 945 + * Compressed extent for the file range [108K, 144K), which overlaps 946 + * with the [120K, 128K) we previously inserted. 947 + */ 948 + em->start = SZ_1K * 108; 949 + em->len = SZ_1K * 36; 950 + em->disk_num_bytes = SZ_4K; 951 + em->ram_bytes = SZ_1K * 36; 952 + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; 953 + 954 + /* 955 + * Try to add the extent map but with a search range of [140K, 144K), 956 + * this should succeed and adjust the extent map to the range 957 + * [128K, 144K), with a length of 16K and an offset of 20K. 958 + * 959 + * This simulates a scenario where in the subvolume tree of an inode we 960 + * have a compressed file extent item for the range [108K, 144K) and we 961 + * have an overlapping compressed extent map for the range [120K, 128K), 962 + * which was created by an encoded write, but its ordered extent was not 963 + * yet completed, so the subvolume tree doesn't have yet the file extent 964 + * item for that range - we only have the extent map in the inode's 965 + * extent map tree. 966 + */ 967 + write_lock(&em_tree->lock); 968 + ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K); 969 + write_unlock(&em_tree->lock); 970 + free_extent_map(em); 971 + if (ret < 0) { 972 + test_err("couldn't add extent map for range [108K, 144K)"); 973 + goto out; 974 + } 975 + 976 + if (em->start != SZ_128K) { 977 + test_err("unexpected extent map start %llu (should be 128K)", em->start); 978 + ret = -EINVAL; 979 + goto out; 980 + } 981 + if (em->len != SZ_16K) { 982 + test_err("unexpected extent map length %llu (should be 16K)", em->len); 983 + ret = -EINVAL; 984 + goto out; 985 + } 986 + if (em->offset != SZ_1K * 20) { 987 + test_err("unexpected extent map offset %llu (should be 20K)", em->offset); 988 + ret = -EINVAL; 989 + goto out; 990 + } 991 + out: 992 + ret2 = free_extent_map_tree(inode); 993 + if (ret == 0) 994 + ret = ret2; 995 + 996 + return ret; 997 + } 998 + 903 999 struct rmap_test_vector { 904 1000 u64 raid_type; 905 1001 u64 physical_start; ··· 1172 1076 if (ret) 1173 1077 goto out; 1174 1078 ret = test_case_7(fs_info, BTRFS_I(inode)); 1079 + if (ret) 1080 + goto out; 1081 + ret = test_case_8(fs_info, BTRFS_I(inode)); 1175 1082 if (ret) 1176 1083 goto out; 1177 1084
+47
fs/btrfs/tree-checker.c
··· 1289 1289 va_end(args); 1290 1290 } 1291 1291 1292 + static bool is_valid_dref_root(u64 rootid) 1293 + { 1294 + /* 1295 + * The following tree root objectids are allowed to have a data backref: 1296 + * - subvolume trees 1297 + * - data reloc tree 1298 + * - tree root 1299 + * For v1 space cache 1300 + */ 1301 + return is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID || 1302 + rootid == BTRFS_ROOT_TREE_OBJECTID; 1303 + } 1304 + 1292 1305 static int check_extent_item(struct extent_buffer *leaf, 1293 1306 struct btrfs_key *key, int slot, 1294 1307 struct btrfs_key *prev_key) ··· 1454 1441 struct btrfs_extent_data_ref *dref; 1455 1442 struct btrfs_shared_data_ref *sref; 1456 1443 u64 seq; 1444 + u64 dref_root; 1445 + u64 dref_objectid; 1457 1446 u64 dref_offset; 1458 1447 u64 inline_offset; 1459 1448 u8 inline_type; ··· 1499 1484 */ 1500 1485 case BTRFS_EXTENT_DATA_REF_KEY: 1501 1486 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1487 + dref_root = btrfs_extent_data_ref_root(leaf, dref); 1488 + dref_objectid = btrfs_extent_data_ref_objectid(leaf, dref); 1502 1489 dref_offset = btrfs_extent_data_ref_offset(leaf, dref); 1503 1490 seq = hash_extent_data_ref( 1504 1491 btrfs_extent_data_ref_root(leaf, dref), 1505 1492 btrfs_extent_data_ref_objectid(leaf, dref), 1506 1493 btrfs_extent_data_ref_offset(leaf, dref)); 1494 + if (unlikely(!is_valid_dref_root(dref_root))) { 1495 + extent_err(leaf, slot, 1496 + "invalid data ref root value %llu", 1497 + dref_root); 1498 + return -EUCLEAN; 1499 + } 1500 + if (unlikely(dref_objectid < BTRFS_FIRST_FREE_OBJECTID || 1501 + dref_objectid > BTRFS_LAST_FREE_OBJECTID)) { 1502 + extent_err(leaf, slot, 1503 + "invalid data ref objectid value %llu", 1504 + dref_root); 1505 + return -EUCLEAN; 1506 + } 1507 1507 if (unlikely(!IS_ALIGNED(dref_offset, 1508 1508 fs_info->sectorsize))) { 1509 1509 extent_err(leaf, slot, ··· 1657 1627 return -EUCLEAN; 1658 1628 } 1659 1629 for (; ptr < end; ptr += sizeof(*dref)) { 1630 + u64 root; 1631 + u64 objectid; 1660 1632 u64 offset; 1661 1633 1662 1634 /* ··· 1666 1634 * overflow from the leaf due to hash collisions. 1667 1635 */ 1668 1636 dref = (struct btrfs_extent_data_ref *)ptr; 1637 + root = btrfs_extent_data_ref_root(leaf, dref); 1638 + objectid = btrfs_extent_data_ref_objectid(leaf, dref); 1669 1639 offset = btrfs_extent_data_ref_offset(leaf, dref); 1640 + if (unlikely(!is_valid_dref_root(root))) { 1641 + extent_err(leaf, slot, 1642 + "invalid extent data backref root value %llu", 1643 + root); 1644 + return -EUCLEAN; 1645 + } 1646 + if (unlikely(objectid < BTRFS_FIRST_FREE_OBJECTID || 1647 + objectid > BTRFS_LAST_FREE_OBJECTID)) { 1648 + extent_err(leaf, slot, 1649 + "invalid extent data backref objectid value %llu", 1650 + root); 1651 + return -EUCLEAN; 1652 + } 1670 1653 if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) { 1671 1654 extent_err(leaf, slot, 1672 1655 "invalid extent data backref offset, have %llu expect aligned to %u",
+8
include/trace/events/btrfs.h
··· 2383 2383 TP_ARGS(fs_info, sinfo, old, diff) 2384 2384 ); 2385 2385 2386 + DEFINE_EVENT(btrfs__space_info_update, update_bytes_zone_unusable, 2387 + 2388 + TP_PROTO(const struct btrfs_fs_info *fs_info, 2389 + const struct btrfs_space_info *sinfo, u64 old, s64 diff), 2390 + 2391 + TP_ARGS(fs_info, sinfo, old, diff) 2392 + ); 2393 + 2386 2394 DECLARE_EVENT_CLASS(btrfs_raid56_bio, 2387 2395 2388 2396 TP_PROTO(const struct btrfs_raid_bio *rbio,