Merge tag 'for-5.17-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
"A few more fixes for various problems that have user visible effects
or seem to be urgent:

- fix corruption when combining DIO and non-blocking io_uring over
multiple extents (seen on MariaDB)

- fix relocation crash due to premature return from commit

- fix quota deadlock between rescan and qgroup removal

- fix item data bounds checks in tree-checker (found on a fuzzed
image)

- fix fsync of prealloc extents after EOF

- add missing run of delayed items after unlink during log replay

- don't start relocation until snapshot drop is finished

- fix reversed condition for subpage writers locking

- fix warning on page error"

* tag 'for-5.17-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: fallback to blocking mode when doing async dio over multiple extents
btrfs: add missing run of delayed items after unlink during log replay
btrfs: qgroup: fix deadlock between rescan worker and remove qgroup
btrfs: fix relocation crash due to premature return from btrfs_commit_transaction()
btrfs: do not start relocation until in progress drops are done
btrfs: tree-checker: use u64 for item data end to avoid overflow
btrfs: do not WARN_ON() if we have PageError set
btrfs: fix lost prealloc extents beyond eof after full fsync
btrfs: subpage: fix a wrong check on subpage->writers

+10
fs/btrfs/ctree.h
···
 	/* Indicate that we want the transaction kthread to commit right now. */
 	BTRFS_FS_COMMIT_TRANS,
 
+	/* Indicate we have half completed snapshot deletions pending. */
+	BTRFS_FS_UNFINISHED_DROPS,
+
 #if BITS_PER_LONG == 32
 	/* Indicate if we have error/warn message printed on 32bit systems */
 	BTRFS_FS_32BIT_ERROR,
···
 	BTRFS_ROOT_QGROUP_FLUSHING,
 	/* We started the orphan cleanup for this root. */
 	BTRFS_ROOT_ORPHAN_CLEANUP,
+	/* This root has a drop operation that was started previously. */
+	BTRFS_ROOT_UNFINISHED_DROP,
 };
+
+static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+	clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+}
 
 /*
  * Record swapped tree blocks of a subvolume tree for delayed subtree trace
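
The new helper pairs with the wait_on_bit() call added in relocation.c further down: one side sleeps until the flag bit is cleared, the other clears it and wakes all waiters via clear_and_wake_up_bit(). A minimal sketch of that wait-bit handshake, using hypothetical demo_* names rather than the btrfs code:

  #include <linux/wait_bit.h>
  #include <linux/sched.h>

  static unsigned long demo_flags;
  #define DEMO_DROPS_PENDING 0

  /* Waiter side: sleep until the bit is cleared; non-zero means a signal. */
  static int demo_wait_for_drops(void)
  {
  	return wait_on_bit(&demo_flags, DEMO_DROPS_PENDING, TASK_INTERRUPTIBLE);
  }

  /* Finisher side: clear the bit and wake every task sleeping in wait_on_bit(). */
  static void demo_drops_done(void)
  {
  	clear_and_wake_up_bit(DEMO_DROPS_PENDING, &demo_flags);
  }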
+10
fs/btrfs/disk-io.c
···
 
 	set_bit(BTRFS_FS_OPEN, &fs_info->flags);
 
+	/* Kick the cleaner thread so it'll start deleting snapshots. */
+	if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
+		wake_up_process(fs_info->cleaner_kthread);
+
 clear_oneshot:
 	btrfs_clear_oneshot_options(fs_info);
 	return 0;
···
 	 * still try to wake up the cleaner.
 	 */
 	kthread_park(fs_info->cleaner_kthread);
+
+	/*
+	 * If we had UNFINISHED_DROPS we could still be processing them, so
+	 * clear that bit and wake up relocation so it can stop.
+	 */
+	btrfs_wake_unfinished_drop(fs_info);
 
 	/* wait for the qgroup rescan worker to stop */
 	btrfs_qgroup_wait_for_completion(fs_info, false);
+10
fs/btrfs/extent-tree.c
···
 	int ret;
 	int level;
 	bool root_dropped = false;
+	bool unfinished_drop = false;
 
 	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
 
···
 	 * already dropped.
 	 */
 	set_bit(BTRFS_ROOT_DELETING, &root->state);
+	unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+
 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
 		level = btrfs_header_level(root->node);
 		path->nodes[level] = btrfs_lock_root_node(root);
···
 	kfree(wc);
 	btrfs_free_path(path);
 out:
+	/*
+	 * We were an unfinished drop root, check to see if there are any
+	 * pending, and if not clear and wake up any waiters.
+	 */
+	if (!err && unfinished_drop)
+		btrfs_maybe_wake_unfinished_drop(fs_info);
+
 	/*
 	 * So if we need to stop dropping the snapshot for whatever reason we
 	 * need to make sure to add it back to the dead root list so that we
+13 -3
fs/btrfs/extent_io.c
···
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
 
+	/*
+	 * If we are using the commit root we could potentially clear a page
+	 * Uptodate while we're using the extent buffer that we've previously
+	 * looked up. We don't want to complain in this case, as the page was
+	 * valid before, we just didn't write it out. Instead we want to catch
+	 * the case where we didn't actually read the block properly, which
+	 * would have !PageUptodate && !PageError, as we clear PageError before
+	 * reading.
+	 */
 	if (fs_info->sectorsize < PAGE_SIZE) {
-		bool uptodate;
+		bool uptodate, error;
 
 		uptodate = btrfs_subpage_test_uptodate(fs_info, page,
 						       eb->start, eb->len);
-		WARN_ON(!uptodate);
+		error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
+		WARN_ON(!uptodate && !error);
 	} else {
-		WARN_ON(!PageUptodate(page));
+		WARN_ON(!PageUptodate(page) && !PageError(page));
 	}
 }
+28
fs/btrfs/inode.c
···
 	}
 
 	len = min(len, em->len - (start - em->start));
+
+	/*
+	 * If we have a NOWAIT request and the range contains multiple extents
+	 * (or a mix of extents and holes), then we return -EAGAIN to make the
+	 * caller fallback to a context where it can do a blocking (without
+	 * NOWAIT) request. This way we avoid doing partial IO and returning
+	 * success to the caller, which is not optimal for writes and for reads
+	 * it can result in unexpected behaviour for an application.
+	 *
+	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
+	 * iomap_dio_rw(), we can end up returning less data then what the caller
+	 * asked for, resulting in an unexpected, and incorrect, short read.
+	 * That is, the caller asked to read N bytes and we return less than that,
+	 * which is wrong unless we are crossing EOF. This happens if we get a
+	 * page fault error when trying to fault in pages for the buffer that is
+	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
+	 * have previously submitted bios for other extents in the range, in
+	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
+	 * those bios have completed by the time we get the page fault error,
+	 * which we return back to our caller - we should only return EIOCBQUEUED
+	 * after we have submitted bios for all the extents in the range.
+	 */
+	if ((flags & IOMAP_NOWAIT) && len < length) {
+		free_extent_map(em);
+		ret = -EAGAIN;
+		goto unlock_err;
+	}
+
 	if (write) {
 		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
 						    start, len);
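
The -EAGAIN return only helps if the caller retries the request from a context that is allowed to block. A rough caller-side sketch of that fallback, with a hypothetical demo_dio_write() helper (the real io_uring retry path is more involved):

  #include <linux/fs.h>
  #include <linux/uio.h>

  /* Issue a direct write; if the filesystem refuses to make progress without
   * blocking, clear IOCB_NOWAIT and retry so the whole range is covered. */
  static ssize_t demo_dio_write(struct kiocb *iocb, struct iov_iter *from,
  			      ssize_t (*do_write)(struct kiocb *, struct iov_iter *))
  {
  	ssize_t ret = do_write(iocb, from);

  	if (ret == -EAGAIN && (iocb->ki_flags & IOCB_NOWAIT)) {
  		/* Fall back to a blocking attempt covering the whole range. */
  		iocb->ki_flags &= ~IOCB_NOWAIT;
  		ret = do_write(iocb, from);
  	}
  	return ret;
  }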
+8 -1
fs/btrfs/qgroup.c
···
 		goto out;
 
 	/*
+	 * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
+	 * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
+	 * to lock that mutex while holding a transaction handle and the rescan
+	 * worker needs to commit a transaction.
+	 */
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+	/*
 	 * Request qgroup rescan worker to complete and wait for it. This wait
 	 * must be done before transaction start for quota disable since it may
 	 * deadlock with transaction by the qgroup rescan worker.
 	 */
 	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
 	btrfs_qgroup_wait_for_completion(fs_info, false);
-	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 
 	/*
 	 * 1 For the root item
+13
fs/btrfs/relocation.c
···
 	int rw = 0;
 	int err = 0;
 
+	/*
+	 * This only gets set if we had a half-deleted snapshot on mount. We
+	 * cannot allow relocation to start while we're still trying to clean up
+	 * these pending deletions.
+	 */
+	ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
+	if (ret)
+		return ret;
+
+	/* We may have been woken up by close_ctree, so bail if we're closing. */
+	if (btrfs_fs_closing(fs_info))
+		return -EINTR;
+
 	bg = btrfs_lookup_block_group(fs_info, group_start);
 	if (!bg)
 		return -ENOENT;
+15
fs/btrfs/root-tree.c
···
 
 		WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
 		if (btrfs_root_refs(&root->root_item) == 0) {
+			struct btrfs_key drop_key;
+
+			btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
+			/*
+			 * If we have a non-zero drop_progress then we know we
+			 * made it partly through deleting this snapshot, and
+			 * thus we need to make sure we block any balance from
+			 * happening until this snapshot is completely dropped.
+			 */
+			if (drop_key.objectid != 0 || drop_key.type != 0 ||
+			    drop_key.offset != 0) {
+				set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+				set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+			}
+
 			set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
 			btrfs_add_dead_root(root);
 		}
+1 -1
fs/btrfs/subpage.c
···
 	 * Since we own the page lock, no one else could touch subpage::writers
 	 * and we are safe to do several atomic operations without spinlock.
 	 */
-	if (atomic_read(&subpage->writers))
+	if (atomic_read(&subpage->writers) == 0)
 		/* No writers, locked by plain lock_page() */
 		return unlock_page(page);
 
+63 -2
fs/btrfs/transaction.c
···
 static noinline void wait_for_commit(struct btrfs_transaction *commit,
 				     const enum btrfs_trans_state min_state)
 {
-	wait_event(commit->commit_wait, commit->state >= min_state);
+	struct btrfs_fs_info *fs_info = commit->fs_info;
+	u64 transid = commit->transid;
+	bool put = false;
+
+	while (1) {
+		wait_event(commit->commit_wait, commit->state >= min_state);
+		if (put)
+			btrfs_put_transaction(commit);
+
+		if (min_state < TRANS_STATE_COMPLETED)
+			break;
+
+		/*
+		 * A transaction isn't really completed until all of the
+		 * previous transactions are completed, but with fsync we can
+		 * end up with SUPER_COMMITTED transactions before a COMPLETED
+		 * transaction. Wait for those.
+		 */
+
+		spin_lock(&fs_info->trans_lock);
+		commit = list_first_entry_or_null(&fs_info->trans_list,
+						  struct btrfs_transaction,
+						  list);
+		if (!commit || commit->transid > transid) {
+			spin_unlock(&fs_info->trans_lock);
+			break;
+		}
+		refcount_inc(&commit->use_count);
+		put = true;
+		spin_unlock(&fs_info->trans_lock);
+	}
 }
 
 int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
···
 }
 
 /*
+ * If we had a pending drop we need to see if there are any others left in our
+ * dead roots list, and if not clear our bit and wake any waiters.
+ */
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+	/*
+	 * We put the drop in progress roots at the front of the list, so if the
+	 * first entry doesn't have UNFINISHED_DROP set we can wake everybody
+	 * up.
+	 */
+	spin_lock(&fs_info->trans_lock);
+	if (!list_empty(&fs_info->dead_roots)) {
+		struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
+							   struct btrfs_root,
+							   root_list);
+		if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
+			spin_unlock(&fs_info->trans_lock);
+			return;
+		}
+	}
+	spin_unlock(&fs_info->trans_lock);
+
+	btrfs_wake_unfinished_drop(fs_info);
+}
+
+/*
  * dead roots are old snapshots that need to be deleted. This allocates
  * a dirty root struct and adds it into the list of dead roots that need to
  * be deleted
···
 	spin_lock(&fs_info->trans_lock);
 	if (list_empty(&root->root_list)) {
 		btrfs_grab_root(root);
-		list_add_tail(&root->root_list, &fs_info->dead_roots);
+
+		/* We want to process the partially complete drops first. */
+		if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
+			list_add(&root->root_list, &fs_info->dead_roots);
+		else
+			list_add_tail(&root->root_list, &fs_info->dead_roots);
 	}
 	spin_unlock(&fs_info->trans_lock);
 }
+1
fs/btrfs/transaction.h
···
 
 void btrfs_add_dead_root(struct btrfs_root *root);
 int btrfs_defrag_root(struct btrfs_root *root);
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
 int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
 void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
+9 -9
fs/btrfs/tree-checker.c
···
 	 */
 	for (slot = 0; slot < nritems; slot++) {
 		u32 item_end_expected;
+		u64 item_data_end;
 		int ret;
 
 		btrfs_item_key_to_cpu(leaf, &key, slot);
···
 			return -EUCLEAN;
 		}
 
+		item_data_end = (u64)btrfs_item_offset(leaf, slot) +
+				btrfs_item_size(leaf, slot);
 		/*
 		 * Make sure the offset and ends are right, remember that the
 		 * item data starts at the end of the leaf and grows towards the
···
 		else
 			item_end_expected = btrfs_item_offset(leaf,
 							      slot - 1);
-		if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) {
+		if (unlikely(item_data_end != item_end_expected)) {
 			generic_err(leaf, slot,
-				"unexpected item end, have %u expect %u",
-				btrfs_item_data_end(leaf, slot),
-				item_end_expected);
+				"unexpected item end, have %llu expect %u",
+				item_data_end, item_end_expected);
 			return -EUCLEAN;
 		}
···
 		 * just in case all the items are consistent to each other, but
 		 * all point outside of the leaf.
 		 */
-		if (unlikely(btrfs_item_data_end(leaf, slot) >
-			     BTRFS_LEAF_DATA_SIZE(fs_info))) {
+		if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) {
 			generic_err(leaf, slot,
-				"slot end outside of leaf, have %u expect range [0, %u]",
-				btrfs_item_data_end(leaf, slot),
-				BTRFS_LEAF_DATA_SIZE(fs_info));
+				"slot end outside of leaf, have %llu expect range [0, %u]",
+				item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info));
 			return -EUCLEAN;
 		}
 
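
The switch to u64 matters because btrfs_item_offset() and btrfs_item_size() both return u32, so on a fuzzed leaf their sum can wrap and slip past the bounds check. A small worked example with hypothetical values:

  #include <linux/types.h>

  /* With u32 arithmetic a large offset plus size wraps around and looks
   * in-bounds; the widened u64 sum is correctly rejected. */
  static void demo_item_end_overflow(void)
  {
  	u32 offset = 0xffff0000;
  	u32 size   = 0x00020000;

  	u32 wrapped = offset + size;		/* 0x00010000: passes the check */
  	u64 correct = (u64)offset + size;	/* 0x100010000: caught as out of bounds */

  	(void)wrapped;
  	(void)correct;
  }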
+49 -12
fs/btrfs/tree-log.c
···
 						 inode, name, namelen);
 			kfree(name);
 			iput(dir);
+			/*
+			 * Whenever we need to check if a name exists or not, we
+			 * check the subvolume tree. So after an unlink we must
+			 * run delayed items, so that future checks for a name
+			 * during log replay see that the name does not exists
+			 * anymore.
+			 */
+			if (!ret)
+				ret = btrfs_run_delayed_items(trans);
 			if (ret)
 				goto out;
 			goto again;
···
 				 */
 				if (!ret && inode->i_nlink == 0)
 					inc_nlink(inode);
+				/*
+				 * Whenever we need to check if a name exists or
+				 * not, we check the subvolume tree. So after an
+				 * unlink we must run delayed items, so that future
+				 * checks for a name during log replay see that the
+				 * name does not exists anymore.
+				 */
+				if (!ret)
+					ret = btrfs_run_delayed_items(trans);
 			}
 			if (ret < 0)
 				goto out;
···
 
 /*
  * Log all prealloc extents beyond the inode's i_size to make sure we do not
- * lose them after doing a fast fsync and replaying the log. We scan the
+ * lose them after doing a full/fast fsync and replaying the log. We scan the
  * subvolume's root instead of iterating the inode's extent map tree because
  * otherwise we can log incorrect extent items based on extent map conversion.
  * That can happen due to the fact that extent maps are merged when they
···
 				struct btrfs_log_ctx *ctx,
 				bool *need_log_inode_item)
 {
+	const u64 i_size = i_size_read(&inode->vfs_inode);
 	struct btrfs_root *root = inode->root;
 	int ins_start_slot = 0;
 	int ins_nr = 0;
···
 		if (min_key->type > max_key->type)
 			break;
 
-		if (min_key->type == BTRFS_INODE_ITEM_KEY)
+		if (min_key->type == BTRFS_INODE_ITEM_KEY) {
 			*need_log_inode_item = false;
-
-		if ((min_key->type == BTRFS_INODE_REF_KEY ||
-		     min_key->type == BTRFS_INODE_EXTREF_KEY) &&
-		    inode->generation == trans->transid &&
-		    !recursive_logging) {
+		} else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
+			   min_key->offset >= i_size) {
+			/*
+			 * Extents at and beyond eof are logged with
+			 * btrfs_log_prealloc_extents().
+			 * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
+			 * and no keys greater than that, so bail out.
+			 */
+			break;
+		} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
+			    min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+			   inode->generation == trans->transid &&
+			   !recursive_logging) {
 			u64 other_ino = 0;
 			u64 other_parent = 0;
···
 				btrfs_release_path(path);
 				goto next_key;
 			}
-		}
-
-		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
-		if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+		} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+			/* Skip xattrs, logged later with btrfs_log_all_xattrs() */
 			if (ins_nr == 0)
 				goto next_slot;
 			ret = copy_items(trans, inode, dst_path, path,
···
 			break;
 		}
 	}
-	if (ins_nr)
+	if (ins_nr) {
 		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
 				 ins_nr, inode_only, logged_isize);
+		if (ret)
+			return ret;
+	}
+
+	if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+		/*
+		 * Release the path because otherwise we might attempt to double
+		 * lock the same leaf with btrfs_log_prealloc_extents() below.
+		 */
+		btrfs_release_path(path);
+		ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+	}
 
 	return ret;
 }