Btrfs: Make btrfs_drop_snapshot work in larger and more efficient chunks

Every transaction in btrfs creates a new snapshot, and then schedules the
snapshot from the last transaction for deletion. Snapshot deletion
works by walking down the btree and dropping the reference counts
on each btree block during the walk.

If a given leaf or node has a reference count greater than one,
the reference count is decremented and the subtree pointed to by that
node is ignored.

If the reference count is one, walking continues down into that node
or leaf, and the references of everything it points to are decremented.
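As a concrete illustration, the shape of that walk is sketched below. This is a
self-contained toy model, not btrfs code: the struct and helper names here are
made up for the example, and the real walk in fs/btrfs/extent-tree.c operates on
extent buffers and the extent allocation tree rather than in-memory pointers.

/*
 * Toy model of the reference-count walk described above (illustration only,
 * not btrfs code).
 */
struct block {
        int refs;               /* reference count on this btree block */
        int nchildren;          /* 0 for a leaf */
        struct block **child;
};

static void drop_block_ref(struct block *b)
{
        b->refs--;              /* stands in for dropping the extent ref */
}

static void drop_subtree(struct block *b)
{
        int i;

        if (b->refs > 1) {
                /* shared with a live root: drop our ref, skip the subtree */
                drop_block_ref(b);
                return;
        }

        /* only the dead snapshot points here: drop everything below first,
         * then the block itself
         */
        for (i = 0; i < b->nchildren; i++)
                drop_subtree(b->child[i]);
        drop_block_ref(b);
}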

The old code would try to work in small pieces, walking down the tree
until it found the lowest leaf or node to free and then returning. This
was very friendly to the rest of the FS because it didn't have a huge
impact on other operations.

But it couldn't always keep up with the rate at which new commits added
snapshots for deletion, and it was inefficient for the extent allocation
tree because it didn't find leaves that were close together on disk and
process them at the same time.

This changes things to walk down to a level 1 node and then process it
in bulk. All the leaf pointers are sorted and the leaves are dropped
in order based on their extent byte number.
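The sorting is done with a small helper struct, struct refsort, which records
the extent byte number (bytenr) and the slot it came from, and the array is
sorted with the kernel's sort() from linux/sort.h (see the extent-tree.c hunks
below). The comparator, refsort_cmp, is called in the patch but its body is not
in the hunks shown here, so the version below is a plausible sketch rather than
the committed code, and the exact type of the slot member is assumed.

#include <linux/types.h>
#include <linux/sort.h>

struct refsort {
        u64 bytenr;     /* extent byte number, the sort key */
        u32 slot;       /* slot the pointer came from (type assumed) */
};

/* three-way compare on bytenr, suitable for the kernel's sort() */
static int refsort_cmp(const void *a_void, const void *b_void)
{
        const struct refsort *a = a_void;
        const struct refsort *b = b_void;

        if (a->bytenr < b->bytenr)
                return -1;
        if (a->bytenr > b->bytenr)
                return 1;
        return 0;
}

/* used as in the patch: sort(sorted, refi, sizeof(struct refsort),
 * refsort_cmp, NULL);
 */

With the array in bytenr order, the later frees touch the extent allocation
tree in increasing disk order, which is what makes the bulk processing cheap.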

The extent allocation tree and commit code are now fast enough for
this kind of bulk processing to work without slowing the rest of the FS
down. Overall it does less IO and is better able to keep up with
snapshot deletions under high load.

Signed-off-by: Chris Mason <chris.mason@oracle.com>

+266 -46
+263 -45
fs/btrfs/extent-tree.c
···
  * struct refsort is used to match byte number to slot in the btree block.
  * we sort based on the byte number and then use the slot to actually
  * find the item.
+ *
+ * struct refsort is smaller than struct btrfs_item and smaller than
+ * struct btrfs_key_ptr.  Since we're currently limited to the page size
+ * for a btree block, there's no way for a kmalloc of refsorts for a
+ * single node to be bigger than a page.
  */
 struct refsort {
         u64 bytenr;
···
 {
         u64 leaf_owner;
         u64 leaf_generation;
+        struct refsort *sorted;
         struct btrfs_key key;
         struct btrfs_file_extent_item *fi;
         int i;
         int nritems;
         int ret;
+        int refi = 0;
+        int slot;
 
         BUG_ON(!btrfs_is_leaf(leaf));
         nritems = btrfs_header_nritems(leaf);
         leaf_owner = btrfs_header_owner(leaf);
         leaf_generation = btrfs_header_generation(leaf);
 
+        sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+        /* we do this loop twice.  The first time we build a list
+         * of the extents we have a reference on, then we sort the list
+         * by bytenr.  The second time around we actually do the
+         * extent freeing.
+         */
         for (i = 0; i < nritems; i++) {
                 u64 disk_bytenr;
                 cond_resched();
 
                 btrfs_item_key_to_cpu(leaf, &key, i);
+
+                /* only extents have references, skip everything else */
                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
                         continue;
+
                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+
+                /* inline extents live in the btree, they don't have refs */
                 if (btrfs_file_extent_type(leaf, fi) ==
                     BTRFS_FILE_EXTENT_INLINE)
                         continue;
-                /*
-                 * FIXME make sure to insert a trans record that
-                 * repeats the snapshot del on crash
-                 */
+
                 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
+                /* holes don't have refs */
                 if (disk_bytenr == 0)
                         continue;
+
+                sorted[refi].bytenr = disk_bytenr;
+                sorted[refi].slot = i;
+                refi++;
+        }
+
+        if (refi == 0)
+                goto out;
+
+        sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+        for (i = 0; i < refi; i++) {
+                u64 disk_bytenr;
+
+                disk_bytenr = sorted[i].bytenr;
+                slot = sorted[i].slot;
+
+                cond_resched();
+
+                btrfs_item_key_to_cpu(leaf, &key, slot);
+                if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+                        continue;
+
+                fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 
                 ret = __btrfs_free_extent(trans, root, disk_bytenr,
                                 btrfs_file_extent_disk_num_bytes(leaf, fi),
···
                 wake_up(&root->fs_info->transaction_throttle);
                 cond_resched();
         }
+out:
+        kfree(sorted);
         return 0;
 }
 
···
 {
         int i;
         int ret;
-        struct btrfs_extent_info *info = ref->extents;
+        struct btrfs_extent_info *info;
+        struct refsort *sorted;
 
+        if (ref->nritems == 0)
+                return 0;
+
+        sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
         for (i = 0; i < ref->nritems; i++) {
+                sorted[i].bytenr = ref->extents[i].bytenr;
+                sorted[i].slot = i;
+        }
+        sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+
+        /*
+         * the items in the ref were sorted when the ref was inserted
+         * into the ref cache, so this is already in order
+         */
+        for (i = 0; i < ref->nritems; i++) {
+                info = ref->extents + sorted[i].slot;
                 ret = __btrfs_free_extent(trans, root, info->bytenr,
                                           info->num_bytes, ref->bytenr,
                                           ref->owner, ref->generation,
···
 }
 
 /*
+ * this is used while deleting old snapshots, and it drops the refs
+ * on a whole subtree starting from a level 1 node.
+ *
+ * The idea is to sort all the leaf pointers, and then drop the
+ * ref on all the leaves in order.  Most of the time the leaves
+ * will have ref cache entries, so no leaf IOs will be required to
+ * find the extents they have references on.
+ *
+ * For each leaf, any references it has are also dropped in order
+ *
+ * This ends up dropping the references in something close to optimal
+ * order for reading and modifying the extent allocation tree.
+ */
+static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        struct btrfs_path *path)
+{
+        u64 bytenr;
+        u64 root_owner;
+        u64 root_gen;
+        struct extent_buffer *eb = path->nodes[1];
+        struct extent_buffer *leaf;
+        struct btrfs_leaf_ref *ref;
+        struct refsort *sorted = NULL;
+        int nritems = btrfs_header_nritems(eb);
+        int ret;
+        int i;
+        int refi = 0;
+        int slot = path->slots[1];
+        u32 blocksize = btrfs_level_size(root, 0);
+        u32 refs;
+
+        if (nritems == 0)
+                goto out;
+
+        root_owner = btrfs_header_owner(eb);
+        root_gen = btrfs_header_generation(eb);
+        sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+
+        /*
+         * step one, sort all the leaf pointers so we don't scribble
+         * randomly into the extent allocation tree
+         */
+        for (i = slot; i < nritems; i++) {
+                sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
+                sorted[refi].slot = i;
+                refi++;
+        }
+
+        /*
+         * nritems won't be zero, but if we're picking up drop_snapshot
+         * after a crash, slot might be > 0, so double check things
+         * just in case.
+         */
+        if (refi == 0)
+                goto out;
+
+        sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+        /*
+         * the first loop frees everything the leaves point to
+         */
+        for (i = 0; i < refi; i++) {
+                u64 ptr_gen;
+
+                bytenr = sorted[i].bytenr;
+
+                /*
+                 * check the reference count on this leaf.  If it is > 1
+                 * we just decrement it below and don't update any
+                 * of the refs the leaf points to.
+                 */
+                ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+                BUG_ON(ret);
+                if (refs != 1)
+                        continue;
+
+                ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
+
+                /*
+                 * the leaf only had one reference, which means the
+                 * only thing pointing to this leaf is the snapshot
+                 * we're deleting.  It isn't possible for the reference
+                 * count to increase again later
+                 *
+                 * The reference cache is checked for the leaf,
+                 * and if found we'll be able to drop any refs held by
+                 * the leaf without needing to read it in.
+                 */
+                ref = btrfs_lookup_leaf_ref(root, bytenr);
+                if (ref && ref->generation != ptr_gen) {
+                        btrfs_free_leaf_ref(root, ref);
+                        ref = NULL;
+                }
+                if (ref) {
+                        ret = cache_drop_leaf_ref(trans, root, ref);
+                        BUG_ON(ret);
+                        btrfs_remove_leaf_ref(root, ref);
+                        btrfs_free_leaf_ref(root, ref);
+                } else {
+                        /*
+                         * the leaf wasn't in the reference cache, so
+                         * we have to read it.
+                         */
+                        leaf = read_tree_block(root, bytenr, blocksize,
+                                               ptr_gen);
+                        ret = btrfs_drop_leaf_ref(trans, root, leaf);
+                        BUG_ON(ret);
+                        free_extent_buffer(leaf);
+                }
+                atomic_inc(&root->fs_info->throttle_gen);
+                wake_up(&root->fs_info->transaction_throttle);
+                cond_resched();
+        }
+
+        /*
+         * run through the loop again to free the refs on the leaves.
+         * This is faster than doing it in the loop above because
+         * the leaves are likely to be clustered together.  We end up
+         * working in nice chunks on the extent allocation tree.
+         */
+        for (i = 0; i < refi; i++) {
+                bytenr = sorted[i].bytenr;
+                ret = __btrfs_free_extent(trans, root, bytenr,
+                                          blocksize, eb->start,
+                                          root_owner, root_gen, 0, 1);
+                BUG_ON(ret);
+
+                atomic_inc(&root->fs_info->throttle_gen);
+                wake_up(&root->fs_info->transaction_throttle);
+                cond_resched();
+        }
+out:
+        kfree(sorted);
+
+        /*
+         * update the path to show we've processed the entire level 1
+         * node.  This will get saved into the root's drop_snapshot_progress
+         * field so these drops are not repeated again if this transaction
+         * commits.
+         */
+        path->slots[1] = nritems;
+        return 0;
+}
+
+/*
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
  */
···
         struct extent_buffer *next;
         struct extent_buffer *cur;
         struct extent_buffer *parent;
-        struct btrfs_leaf_ref *ref;
         u32 blocksize;
         int ret;
         u32 refs;
···
                 if (path->slots[*level] >=
                     btrfs_header_nritems(cur))
                         break;
+
+                /* the new code goes down to level 1 and does all the
+                 * leaves pointed to that node in bulk.  So, this check
+                 * for level 0 will always be false.
+                 *
+                 * But, the disk format allows the drop_snapshot_progress
+                 * field in the root to leave things in a state where
+                 * a leaf will need cleaning up here.  If someone crashes
+                 * with the old code and then boots with the new code,
+                 * we might find a leaf here.
+                 */
                 if (*level == 0) {
                         ret = btrfs_drop_leaf_ref(trans, root, cur);
                         BUG_ON(ret);
                         break;
                 }
+
+                /*
+                 * once we get to level one, process the whole node
+                 * at once, including everything below it.
+                 */
+                if (*level == 1) {
+                        ret = drop_level_one_refs(trans, root, path);
+                        BUG_ON(ret);
+                        break;
+                }
+
                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
                 blocksize = btrfs_level_size(root, *level - 1);
 
                 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
                 BUG_ON(ret);
+
+                /*
+                 * if there is more than one reference, we don't need
+                 * to read that node to drop any references it has.  We
+                 * just drop the ref we hold on that node and move on to the
+                 * next slot in this level.
+                 */
                 if (refs != 1) {
                         parent = path->nodes[*level];
                         root_owner = btrfs_header_owner(parent);
···
 
                         continue;
                 }
-                /*
-                 * at this point, we have a single ref, and since the
-                 * only place referencing this extent is a dead root
-                 * the reference count should never go higher.
-                 * So, we don't need to check it again
-                 */
-                if (*level == 1) {
-                        ref = btrfs_lookup_leaf_ref(root, bytenr);
-                        if (ref && ref->generation != ptr_gen) {
-                                btrfs_free_leaf_ref(root, ref);
-                                ref = NULL;
-                        }
-                        if (ref) {
-                                ret = cache_drop_leaf_ref(trans, root, ref);
-                                BUG_ON(ret);
-                                btrfs_remove_leaf_ref(root, ref);
-                                btrfs_free_leaf_ref(root, ref);
-                                *level = 0;
-                                break;
-                        }
-                }
-                next = btrfs_find_tree_block(root, bytenr, blocksize);
-                if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
-                        free_extent_buffer(next);
 
-                        next = read_tree_block(root, bytenr, blocksize,
-                                               ptr_gen);
-                        cond_resched();
-#if 0
-                        /*
-                         * this is a debugging check and can go away
-                         * the ref should never go all the way down to 1
-                         * at this point
-                         */
-                        ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
-                                                &refs);
-                        BUG_ON(ret);
-                        WARN_ON(refs != 1);
-#endif
-                }
+                /*
+                 * we need to keep freeing things in the next level down.
+                 * read the block and loop around to process it
+                 */
+                next = read_tree_block(root, bytenr, blocksize, ptr_gen);
                 WARN_ON(*level <= 0);
                 if (path->nodes[*level-1])
                         free_extent_buffer(path->nodes[*level-1]);
···
         root_owner = btrfs_header_owner(parent);
         root_gen = btrfs_header_generation(parent);
 
+        /*
+         * cleanup and free the reference on the last node
+         * we processed
+         */
         ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
                                   parent->start, root_owner, root_gen,
                                   *level, 1);
         free_extent_buffer(path->nodes[*level]);
         path->nodes[*level] = NULL;
+
         *level += 1;
         BUG_ON(ret);
···
                 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
                         struct extent_buffer *node;
                         struct btrfs_disk_key disk_key;
+
+                        /*
+                         * there is more work to do in this level.
+                         * Update the drop_progress marker to reflect
+                         * the work we've done so far, and then bump
+                         * the slot number
+                         */
                         node = path->nodes[i];
                         path->slots[i]++;
                         *level = i;
···
                         return 0;
                 } else {
                         struct extent_buffer *parent;
+
+                        /*
+                         * this whole node is done, free our reference
+                         * on it and go up one level
+                         */
                         if (path->nodes[*level] == root->node)
                                 parent = path->nodes[*level];
                         else
···
                 ref->bytenr = buf->start;
                 ref->owner = btrfs_header_owner(buf);
                 ref->generation = btrfs_header_generation(buf);
+
                 ret = btrfs_add_leaf_ref(root, ref, 0);
                 WARN_ON(ret);
                 btrfs_free_leaf_ref(root, ref);
+2
fs/btrfs/inode.c
···
                 ref->generation = leaf_gen;
                 ref->nritems = 0;
 
+                btrfs_sort_leaf_ref(ref);
+
                 ret = btrfs_add_leaf_ref(root, ref, 0);
                 WARN_ON(ret);
                 btrfs_free_leaf_ref(root, ref);
+1
fs/btrfs/ref-cache.c
···
  */
 
 #include <linux/sched.h>
+#include <linux/sort.h>
 #include "ctree.h"
 #include "ref-cache.h"
 #include "transaction.h"
-1
fs/btrfs/ref-cache.h
···
 int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
                            int shared);
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-
 #endif