Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Btrfs: fix fsync log replay for inodes with a mix of regular refs and extrefs

If we have an inode with a large number of hard links, some of which may
be extrefs, turn a regular ref into an extref, fsync the inode and then
replay the fsync log (after a crash/reboot), we can end up with an fsync
log that makes the replay code always fail with -EOVERFLOW when processing
the inode's references.

This is easy to reproduce with the test case I made for xfstests. Its steps
are the following:

_scratch_mkfs "-O extref" >> $seqres.full 2>&1
_init_flakey
_mount_flakey

# Create a test file with 3001 hard links. This number is large enough to
# make btrfs start using extrefs at some point even if the fs has the maximum
# possible leaf/node size (64Kb).
echo "hello world" > $SCRATCH_MNT/foo
for i in `seq 1 3000`; do
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_`printf "%04d" $i`
done

# Make sure all metadata and data are durably persisted.
sync

# Now remove one link, add a new one with a new name, add another new one with
# the same name as the one we just removed and fsync the inode.
rm -f $SCRATCH_MNT/foo_link_0001
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3001
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_0001
rm -f $SCRATCH_MNT/foo_link_0002
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3002
ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link_3003
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo

# Simulate a crash/power loss. This makes sure the next mount
# will see an fsync log and will replay that log.

_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey

_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey

# Check that the number of hard links is correct, we are able to remove all
# the hard links and read the file's data. This is just to verify we don't
# get stale file handle errors (due to dangling directory index entries that
# point to inodes that no longer exist).
echo "Link count: $(stat --format=%h $SCRATCH_MNT/foo)"
[ -f $SCRATCH_MNT/foo ] || echo "Link foo is missing"
for ((i = 1; i <= 3003; i++)); do
name=foo_link_`printf "%04d" $i`
if [ $i -eq 2 ]; then
[ -f $SCRATCH_MNT/$name ] && echo "Link $name found"
else
[ -f $SCRATCH_MNT/$name ] || echo "Link $name is missing"
fi
done
rm -f $SCRATCH_MNT/foo_link_*
cat $SCRATCH_MNT/foo
rm -f $SCRATCH_MNT/foo

status=0
exit

The fix is simply to correct the overflow condition when overwriting a
reference item because it was wrong, trying to increase the item in the
fs/subvol tree by an impossible amount. Also ensure that we don't insert
one normal ref and one ext ref for the same dentry - this happened because
processing a dir index entry from the parent in the log happened when
the normal ref item was full, which made the logic insert an extref and
later when the normal ref had enough room, it would be inserted again
when processing the ref item from the child inode in the log.

This issue has been present since the introduction of the extrefs feature
(2012).

A test case for xfstests follows soon. This test only passes if the previous
patch titled "Btrfs: fix fsync when extend references are added to an inode"
is applied too.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>

authored by

Filipe Manana and committed by
Chris Mason
df8d116f 2c2c452b

+43 -5
+7 -2
fs/btrfs/inode-item.c
··· 344 344 return -ENOMEM; 345 345 346 346 path->leave_spinning = 1; 347 + path->skip_release_on_error = 1; 347 348 ret = btrfs_insert_empty_item(trans, root, path, &key, 348 349 ins_len); 349 350 if (ret == -EEXIST) { ··· 363 362 ptr = (unsigned long)(ref + 1); 364 363 ret = 0; 365 364 } else if (ret < 0) { 366 - if (ret == -EOVERFLOW) 367 - ret = -EMLINK; 365 + if (ret == -EOVERFLOW) { 366 + if (find_name_in_backref(path, name, name_len, &ref)) 367 + ret = -EEXIST; 368 + else 369 + ret = -EMLINK; 370 + } 368 371 goto out; 369 372 } else { 370 373 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+36 -3
fs/btrfs/tree-log.c
··· 453 453 insert: 454 454 btrfs_release_path(path); 455 455 /* try to insert the key into the destination tree */ 456 + path->skip_release_on_error = 1; 456 457 ret = btrfs_insert_empty_item(trans, root, path, 457 458 key, item_size); 459 + path->skip_release_on_error = 0; 458 460 459 461 /* make sure any existing item is the correct size */ 460 - if (ret == -EEXIST) { 462 + if (ret == -EEXIST || ret == -EOVERFLOW) { 461 463 u32 found_size; 462 464 found_size = btrfs_item_size_nr(path->nodes[0], 463 465 path->slots[0]); ··· 846 844 static noinline int backref_in_log(struct btrfs_root *log, 847 845 struct btrfs_key *key, 848 846 u64 ref_objectid, 849 - char *name, int namelen) 847 + const char *name, int namelen) 850 848 { 851 849 struct btrfs_path *path; 852 850 struct btrfs_inode_ref *ref; ··· 1558 1556 } 1559 1557 1560 1558 /* 1559 + * Return true if an inode reference exists in the log for the given name, 1560 + * inode and parent inode. 1561 + */ 1562 + static bool name_in_log_ref(struct btrfs_root *log_root, 1563 + const char *name, const int name_len, 1564 + const u64 dirid, const u64 ino) 1565 + { 1566 + struct btrfs_key search_key; 1567 + 1568 + search_key.objectid = ino; 1569 + search_key.type = BTRFS_INODE_REF_KEY; 1570 + search_key.offset = dirid; 1571 + if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1572 + return true; 1573 + 1574 + search_key.type = BTRFS_INODE_EXTREF_KEY; 1575 + search_key.offset = btrfs_extref_hash(dirid, name, name_len); 1576 + if (backref_in_log(log_root, &search_key, dirid, name, name_len)) 1577 + return true; 1578 + 1579 + return false; 1580 + } 1581 + 1582 + /* 1561 1583 * take a single entry in a log directory item and replay it into 1562 1584 * the subvolume. 1563 1585 * ··· 1691 1665 return ret; 1692 1666 1693 1667 insert: 1668 + if (name_in_log_ref(root->log_root, name, name_len, 1669 + key->objectid, log_key.objectid)) { 1670 + /* The dentry will be added later. 
*/ 1671 + ret = 0; 1672 + update_size = false; 1673 + goto out; 1674 + } 1694 1675 btrfs_release_path(path); 1695 1676 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1696 1677 name, name_len, log_type, &log_key); 1697 - if (ret && ret != -ENOENT) 1678 + if (ret && ret != -ENOENT && ret != -EEXIST) 1698 1679 goto out; 1699 1680 update_size = false; 1700 1681 ret = 0;