Btrfs: Make btrfs_drop_snapshot work in larger and more efficient chunks

Every transaction in btrfs creates a new snapshot, and then schedules the
snapshot from the last transaction for deletion. Snapshot deletion
works by walking down the btree and dropping the reference counts
on each btree block during the walk.

If a given leaf or node has a reference count greater than one,
the reference count is decremented and the subtree pointed to by that
block is ignored.

If the reference count is one, walking continues down into that node
or leaf, and the references of everything it points to are decremented.
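
Roughly, the decision made at each block follows the pattern in this
minimal, self-contained sketch (the struct node and drop_ref() below are
hypothetical in-memory stand-ins, not btrfs types; the real walk uses a
struct btrfs_path over on-disk btree blocks):

#include <stdlib.h>

/* hypothetical stand-in for a btree block and its reference count */
struct node {
        unsigned long refs;     /* reference count on this block */
        int nr;                 /* number of children (0 for a leaf) */
        struct node **child;    /* pointers to child blocks */
};

/* drop one reference on 'n'; recurse only if we hold the last reference */
static void drop_ref(struct node *n)
{
        if (!n)
                return;
        if (n->refs > 1) {
                /* someone else still references this subtree, so just
                 * decrement and ignore everything below it */
                n->refs--;
                return;
        }
        /* refs == 1: this walk owns the block, so drop the references
         * on everything it points to and then free the block itself */
        for (int i = 0; i < n->nr; i++)
                drop_ref(n->child[i]);
        free(n->child);
        free(n);
}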

The old code would try to work in small pieces, walking down the tree
until it found the lowest leaf or node to free and then returning. This
was very friendly to the rest of the FS because it didn't have a huge
impact on other operations.

But it couldn't always keep up with the rate at which new commits added
snapshots for deletion, and it wasn't optimal for the extent
allocation tree because it didn't find leaves that were close together
on disk and process them at the same time.

This changes things to walk down to a level 1 node and then process it
in bulk. All the leaf pointers are sorted and the leaves are dropped
in order based on their byte number (bytenr) on disk.
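
The sorting step can be illustrated with a small userspace sketch. The
struct refsort and refsort_cmp() names below mirror the patch, but this
example uses qsort() and made-up byte numbers instead of the kernel's
sort() and real extent addresses:

#include <stdio.h>
#include <stdlib.h>

/* pair a leaf's disk byte number with its slot in the level 1 node */
struct refsort {
        unsigned long long bytenr;
        unsigned int slot;
};

static int refsort_cmp(const void *a, const void *b)
{
        const struct refsort *ra = a;
        const struct refsort *rb = b;

        if (ra->bytenr < rb->bytenr)
                return -1;
        if (ra->bytenr > rb->bytenr)
                return 1;
        return 0;
}

int main(void)
{
        /* pretend these are the leaf pointers found in one level 1 node */
        struct refsort sorted[] = {
                { .bytenr = 13369344, .slot = 0 },
                { .bytenr =  4198400, .slot = 1 },
                { .bytenr =  8392704, .slot = 2 },
        };
        int nr = sizeof(sorted) / sizeof(sorted[0]);
        int i;

        /* sort by byte number so the leaves are processed in disk order */
        qsort(sorted, nr, sizeof(sorted[0]), refsort_cmp);

        for (i = 0; i < nr; i++)
                printf("drop leaf at bytenr %llu (slot %u)\n",
                       sorted[i].bytenr, sorted[i].slot);
        return 0;
}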

The extent allocation tree and commit code are now fast enough for
this kind of bulk processing to work without slowing the rest of the FS
down. Overall it does less IO and is better able to keep up with
snapshot deletions under high load.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

+266 -46
+263 -45
fs/btrfs/extent-tree.c
···
 * struct refsort is used to match byte number to slot in the btree block.
 * we sort based on the byte number and then use the slot to actually
 * find the item.
 */
struct refsort {
        u64 bytenr;
···
{
        u64 leaf_owner;
        u64 leaf_generation;
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
        int i;
        int nritems;
        int ret;

        BUG_ON(!btrfs_is_leaf(leaf));
        nritems = btrfs_header_nritems(leaf);
        leaf_owner = btrfs_header_owner(leaf);
        leaf_generation = btrfs_header_generation(leaf);

        for (i = 0; i < nritems; i++) {
                u64 disk_bytenr;
                cond_resched();

                btrfs_item_key_to_cpu(leaf, &key, i);
                if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
                        continue;
                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
                if (btrfs_file_extent_type(leaf, fi) ==
                    BTRFS_FILE_EXTENT_INLINE)
                        continue;
-               /*
-                * FIXME make sure to insert a trans record that
-                * repeats the snapshot del on crash
-                */
                disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
                if (disk_bytenr == 0)
                        continue;

                ret = __btrfs_free_extent(trans, root, disk_bytenr,
                                btrfs_file_extent_disk_num_bytes(leaf, fi),
···
                wake_up(&root->fs_info->transaction_throttle);
                cond_resched();
        }
        return 0;
}

···
{
        int i;
        int ret;
-       struct btrfs_extent_info *info = ref->extents;

        for (i = 0; i < ref->nritems; i++) {
                ret = __btrfs_free_extent(trans, root, info->bytenr,
                                          info->num_bytes, ref->bytenr,
                                          ref->owner, ref->generation,
···
}

/*
 * helper function for drop_snapshot, this walks down the tree dropping ref
 * counts as it goes.
 */
···
        struct extent_buffer *next;
        struct extent_buffer *cur;
        struct extent_buffer *parent;
-       struct btrfs_leaf_ref *ref;
        u32 blocksize;
        int ret;
        u32 refs;
···
                if (path->slots[*level] >=
                    btrfs_header_nritems(cur))
                        break;
                if (*level == 0) {
                        ret = btrfs_drop_leaf_ref(trans, root, cur);
                        BUG_ON(ret);
                        break;
                }
                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
                blocksize = btrfs_level_size(root, *level - 1);

                ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
                BUG_ON(ret);
                if (refs != 1) {
                        parent = path->nodes[*level];
                        root_owner = btrfs_header_owner(parent);
···

                        continue;
                }
-               /*
-                * at this point, we have a single ref, and since the
-                * only place referencing this extent is a dead root
-                * the reference count should never go higher.
-                * So, we don't need to check it again
-                */
-               if (*level == 1) {
-                       ref = btrfs_lookup_leaf_ref(root, bytenr);
-                       if (ref && ref->generation != ptr_gen) {
-                               btrfs_free_leaf_ref(root, ref);
-                               ref = NULL;
-                       }
-                       if (ref) {
-                               ret = cache_drop_leaf_ref(trans, root, ref);
-                               BUG_ON(ret);
-                               btrfs_remove_leaf_ref(root, ref);
-                               btrfs_free_leaf_ref(root, ref);
-                               *level = 0;
-                               break;
-                       }
-               }
-               next = btrfs_find_tree_block(root, bytenr, blocksize);
-               if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
-                       free_extent_buffer(next);

-                       next = read_tree_block(root, bytenr, blocksize,
-                                              ptr_gen);
-                       cond_resched();
-#if 0
-                       /*
-                        * this is a debugging check and can go away
-                        * the ref should never go all the way down to 1
-                        * at this point
-                        */
-                       ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
-                                               &refs);
-                       BUG_ON(ret);
-                       WARN_ON(refs != 1);
-#endif
-               }
                WARN_ON(*level <= 0);
                if (path->nodes[*level-1])
                        free_extent_buffer(path->nodes[*level-1]);
···
                root_owner = btrfs_header_owner(parent);
                root_gen = btrfs_header_generation(parent);

                ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
                                          parent->start, root_owner, root_gen,
                                          *level, 1);
                free_extent_buffer(path->nodes[*level]);
                path->nodes[*level] = NULL;
                *level += 1;
                BUG_ON(ret);

···
                if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
                        struct extent_buffer *node;
                        struct btrfs_disk_key disk_key;
                        node = path->nodes[i];
                        path->slots[i]++;
                        *level = i;
···
                        return 0;
                } else {
                        struct extent_buffer *parent;
                        if (path->nodes[*level] == root->node)
                                parent = path->nodes[*level];
                        else
···
        ref->bytenr = buf->start;
        ref->owner = btrfs_header_owner(buf);
        ref->generation = btrfs_header_generation(buf);
        ret = btrfs_add_leaf_ref(root, ref, 0);
        WARN_ON(ret);
        btrfs_free_leaf_ref(root, ref);
···
 * struct refsort is used to match byte number to slot in the btree block.
 * we sort based on the byte number and then use the slot to actually
 * find the item.
+ *
+ * struct refsort is smaller than struct btrfs_item and smaller than
+ * struct btrfs_key_ptr. Since we're currently limited to the page size
+ * for a btree block, there's no way for a kmalloc of refsorts for a
+ * single node to be bigger than a page.
 */
struct refsort {
        u64 bytenr;
···
{
        u64 leaf_owner;
        u64 leaf_generation;
+       struct refsort *sorted;
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
        int i;
        int nritems;
        int ret;
+       int refi = 0;
+       int slot;

        BUG_ON(!btrfs_is_leaf(leaf));
        nritems = btrfs_header_nritems(leaf);
        leaf_owner = btrfs_header_owner(leaf);
        leaf_generation = btrfs_header_generation(leaf);

+       sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+       /* we do this loop twice.  The first time we build a list
+        * of the extents we have a reference on, then we sort the list
+        * by bytenr.  The second time around we actually do the
+        * extent freeing.
+        */
        for (i = 0; i < nritems; i++) {
                u64 disk_bytenr;
                cond_resched();

                btrfs_item_key_to_cpu(leaf, &key, i);
+
+               /* only extents have references, skip everything else */
                if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
                        continue;
+
                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+
+               /* inline extents live in the btree, they don't have refs */
                if (btrfs_file_extent_type(leaf, fi) ==
                    BTRFS_FILE_EXTENT_INLINE)
                        continue;
+
                disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
+               /* holes don't have refs */
                if (disk_bytenr == 0)
                        continue;
+
+               sorted[refi].bytenr = disk_bytenr;
+               sorted[refi].slot = i;
+               refi++;
+       }
+
+       if (refi == 0)
+               goto out;
+
+       sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+       for (i = 0; i < refi; i++) {
+               u64 disk_bytenr;
+
+               disk_bytenr = sorted[i].bytenr;
+               slot = sorted[i].slot;
+
+               cond_resched();
+
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+                       continue;
+
+               fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);

                ret = __btrfs_free_extent(trans, root, disk_bytenr,
                                btrfs_file_extent_disk_num_bytes(leaf, fi),
···
                wake_up(&root->fs_info->transaction_throttle);
                cond_resched();
        }
+out:
+       kfree(sorted);
        return 0;
}

···
{
        int i;
        int ret;
+       struct btrfs_extent_info *info;
+       struct refsort *sorted;

+       if (ref->nritems == 0)
+               return 0;
+
+       sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
        for (i = 0; i < ref->nritems; i++) {
+               sorted[i].bytenr = ref->extents[i].bytenr;
+               sorted[i].slot = i;
+       }
+       sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+
+       /*
+        * the items in the ref were sorted when the ref was inserted
+        * into the ref cache, so this is already in order
+        */
+       for (i = 0; i < ref->nritems; i++) {
+               info = ref->extents + sorted[i].slot;
                ret = __btrfs_free_extent(trans, root, info->bytenr,
                                          info->num_bytes, ref->bytenr,
                                          ref->owner, ref->generation,
···
}

/*
+ * this is used while deleting old snapshots, and it drops the refs
+ * on a whole subtree starting from a level 1 node.
+ *
+ * The idea is to sort all the leaf pointers, and then drop the
+ * ref on all the leaves in order.  Most of the time the leaves
+ * will have ref cache entries, so no leaf IOs will be required to
+ * find the extents they have references on.
+ *
+ * For each leaf, any references it has are also dropped in order.
+ *
+ * This ends up dropping the references in something close to optimal
+ * order for reading and modifying the extent allocation tree.
+ */
+static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       struct btrfs_path *path)
+{
+       u64 bytenr;
+       u64 root_owner;
+       u64 root_gen;
+       struct extent_buffer *eb = path->nodes[1];
+       struct extent_buffer *leaf;
+       struct btrfs_leaf_ref *ref;
+       struct refsort *sorted = NULL;
+       int nritems = btrfs_header_nritems(eb);
+       int ret;
+       int i;
+       int refi = 0;
+       int slot = path->slots[1];
+       u32 blocksize = btrfs_level_size(root, 0);
+       u32 refs;
+
+       if (nritems == 0)
+               goto out;
+
+       root_owner = btrfs_header_owner(eb);
+       root_gen = btrfs_header_generation(eb);
+       sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+
+       /*
+        * step one, sort all the leaf pointers so we don't scribble
+        * randomly into the extent allocation tree
+        */
+       for (i = slot; i < nritems; i++) {
+               sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
+               sorted[refi].slot = i;
+               refi++;
+       }
+
+       /*
+        * nritems won't be zero, but if we're picking up drop_snapshot
+        * after a crash, slot might be > 0, so double check things
+        * just in case.
+        */
+       if (refi == 0)
+               goto out;
+
+       sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+       /*
+        * the first loop frees everything the leaves point to
+        */
+       for (i = 0; i < refi; i++) {
+               u64 ptr_gen;
+
+               bytenr = sorted[i].bytenr;
+
+               /*
+                * check the reference count on this leaf.  If it is > 1
+                * we just decrement it below and don't update any
+                * of the refs the leaf points to.
+                */
+               ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+               BUG_ON(ret);
+               if (refs != 1)
+                       continue;
+
+               ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
+
+               /*
+                * the leaf only had one reference, which means the
+                * only thing pointing to this leaf is the snapshot
+                * we're deleting.  It isn't possible for the reference
+                * count to increase again later
+                *
+                * The reference cache is checked for the leaf,
+                * and if found we'll be able to drop any refs held by
+                * the leaf without needing to read it in.
+                */
+               ref = btrfs_lookup_leaf_ref(root, bytenr);
+               if (ref && ref->generation != ptr_gen) {
+                       btrfs_free_leaf_ref(root, ref);
+                       ref = NULL;
+               }
+               if (ref) {
+                       ret = cache_drop_leaf_ref(trans, root, ref);
+                       BUG_ON(ret);
+                       btrfs_remove_leaf_ref(root, ref);
+                       btrfs_free_leaf_ref(root, ref);
+               } else {
+                       /*
+                        * the leaf wasn't in the reference cache, so
+                        * we have to read it.
+                        */
+                       leaf = read_tree_block(root, bytenr, blocksize,
+                                              ptr_gen);
+                       ret = btrfs_drop_leaf_ref(trans, root, leaf);
+                       BUG_ON(ret);
+                       free_extent_buffer(leaf);
+               }
+               atomic_inc(&root->fs_info->throttle_gen);
+               wake_up(&root->fs_info->transaction_throttle);
+               cond_resched();
+       }
+
+       /*
+        * run through the loop again to free the refs on the leaves.
+        * This is faster than doing it in the loop above because
+        * the leaves are likely to be clustered together.  We end up
+        * working in nice chunks on the extent allocation tree.
+        */
+       for (i = 0; i < refi; i++) {
+               bytenr = sorted[i].bytenr;
+               ret = __btrfs_free_extent(trans, root, bytenr,
+                                       blocksize, eb->start,
+                                       root_owner, root_gen, 0, 1);
+               BUG_ON(ret);
+
+               atomic_inc(&root->fs_info->throttle_gen);
+               wake_up(&root->fs_info->transaction_throttle);
+               cond_resched();
+       }
+out:
+       kfree(sorted);
+
+       /*
+        * update the path to show we've processed the entire level 1
+        * node.  This will get saved into the root's drop_snapshot_progress
+        * field so these drops are not repeated again if this transaction
+        * commits.
+        */
+       path->slots[1] = nritems;
+       return 0;
+}
+
+/*
 * helper function for drop_snapshot, this walks down the tree dropping ref
 * counts as it goes.
 */
···
        struct extent_buffer *next;
        struct extent_buffer *cur;
        struct extent_buffer *parent;
        u32 blocksize;
        int ret;
        u32 refs;
···
                if (path->slots[*level] >=
                    btrfs_header_nritems(cur))
                        break;
+
+               /* the new code goes down to level 1 and does all the
+                * leaves pointed to by that node in bulk.  So, this check
+                * for level 0 will always be false.
+                *
+                * But, the disk format allows the drop_snapshot_progress
+                * field in the root to leave things in a state where
+                * a leaf will need cleaning up here.  If someone crashes
+                * with the old code and then boots with the new code,
+                * we might find a leaf here.
+                */
                if (*level == 0) {
                        ret = btrfs_drop_leaf_ref(trans, root, cur);
                        BUG_ON(ret);
                        break;
                }
+
+               /*
+                * once we get to level one, process the whole node
+                * at once, including everything below it.
+                */
+               if (*level == 1) {
+                       ret = drop_level_one_refs(trans, root, path);
+                       BUG_ON(ret);
+                       break;
+               }
+
                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
                blocksize = btrfs_level_size(root, *level - 1);

                ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
                BUG_ON(ret);
+
+               /*
+                * if there is more than one reference, we don't need
+                * to read that node to drop any references it has.  We
+                * just drop the ref we hold on that node and move on to the
+                * next slot in this level.
+                */
                if (refs != 1) {
                        parent = path->nodes[*level];
                        root_owner = btrfs_header_owner(parent);
···

                        continue;
                }

+               /*
+                * we need to keep freeing things in the next level down.
+                * read the block and loop around to process it
+                */
+               next = read_tree_block(root, bytenr, blocksize, ptr_gen);
                WARN_ON(*level <= 0);
                if (path->nodes[*level-1])
                        free_extent_buffer(path->nodes[*level-1]);
···
                root_owner = btrfs_header_owner(parent);
                root_gen = btrfs_header_generation(parent);

+               /*
+                * cleanup and free the reference on the last node
+                * we processed
+                */
                ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
                                          parent->start, root_owner, root_gen,
                                          *level, 1);
                free_extent_buffer(path->nodes[*level]);
                path->nodes[*level] = NULL;
+
                *level += 1;
                BUG_ON(ret);
···
                if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
                        struct extent_buffer *node;
                        struct btrfs_disk_key disk_key;
+
+                       /*
+                        * there is more work to do in this level.
+                        * Update the drop_progress marker to reflect
+                        * the work we've done so far, and then bump
+                        * the slot number
+                        */
                        node = path->nodes[i];
                        path->slots[i]++;
                        *level = i;
···
                        return 0;
                } else {
                        struct extent_buffer *parent;
+
+                       /*
+                        * this whole node is done, free our reference
+                        * on it and go up one level
+                        */
                        if (path->nodes[*level] == root->node)
                                parent = path->nodes[*level];
                        else
···
        ref->bytenr = buf->start;
        ref->owner = btrfs_header_owner(buf);
        ref->generation = btrfs_header_generation(buf);
+
        ret = btrfs_add_leaf_ref(root, ref, 0);
        WARN_ON(ret);
        btrfs_free_leaf_ref(root, ref);
+2
fs/btrfs/inode.c
···
        ref->generation = leaf_gen;
        ref->nritems = 0;

        ret = btrfs_add_leaf_ref(root, ref, 0);
        WARN_ON(ret);
        btrfs_free_leaf_ref(root, ref);
···
        ref->generation = leaf_gen;
        ref->nritems = 0;

+       btrfs_sort_leaf_ref(ref);
+
        ret = btrfs_add_leaf_ref(root, ref, 0);
        WARN_ON(ret);
        btrfs_free_leaf_ref(root, ref);
+1
fs/btrfs/ref-cache.c
···
 */

#include <linux/sched.h>
#include "ctree.h"
#include "ref-cache.h"
#include "transaction.h"
···
 */

#include <linux/sched.h>
+#include <linux/sort.h>
#include "ctree.h"
#include "ref-cache.h"
#include "transaction.h"
-1
fs/btrfs/ref-cache.h
···
int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
                           int shared);
int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-
#endif
···
int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
                           int shared);
int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
#endif