Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

btrfs: convert the buffer_radix to an xarray

In order to fully utilize xarray tagging to improve writeback, we need to
convert the buffer_radix to a proper xarray. This conversion is
relatively straightforward as the radix code uses the xarray underneath.
Using the xarray directly results in quite a lot less code.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>

authored by

Josef Bacik and committed by
David Sterba
19d7f65f 656e9f51

+111 -167
+12 -2
fs/btrfs/disk-io.c
··· 2761 2761 return ret; 2762 2762 } 2763 2763 2764 + /* 2765 + * Lockdep gets confused between our buffer_tree which requires IRQ locking because 2766 + * we modify marks in the IRQ context, and our delayed inode xarray which doesn't 2767 + * have these requirements. Use a class key so lockdep doesn't get them mixed up. 2768 + */ 2769 + static struct lock_class_key buffer_xa_class; 2770 + 2764 2771 void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) 2765 2772 { 2766 2773 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 2767 - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); 2774 + 2775 + /* Use the same flags as mapping->i_pages. */ 2776 + xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); 2777 + lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class); 2778 + 2768 2779 INIT_LIST_HEAD(&fs_info->trans_list); 2769 2780 INIT_LIST_HEAD(&fs_info->dead_roots); 2770 2781 INIT_LIST_HEAD(&fs_info->delayed_iputs); ··· 2787 2776 spin_lock_init(&fs_info->delayed_iput_lock); 2788 2777 spin_lock_init(&fs_info->defrag_inodes_lock); 2789 2778 spin_lock_init(&fs_info->super_lock); 2790 - spin_lock_init(&fs_info->buffer_lock); 2791 2779 spin_lock_init(&fs_info->unused_bgs_lock); 2792 2780 spin_lock_init(&fs_info->treelog_bg_lock); 2793 2781 spin_lock_init(&fs_info->zone_active_bgs_lock);
+88 -128
fs/btrfs/extent_io.c
··· 1866 1866 * context. 1867 1867 */ 1868 1868 static struct extent_buffer *find_extent_buffer_nolock( 1869 - const struct btrfs_fs_info *fs_info, u64 start) 1869 + struct btrfs_fs_info *fs_info, u64 start) 1870 1870 { 1871 1871 struct extent_buffer *eb; 1872 + unsigned long index = (start >> fs_info->sectorsize_bits); 1872 1873 1873 1874 rcu_read_lock(); 1874 - eb = radix_tree_lookup(&fs_info->buffer_radix, 1875 - start >> fs_info->sectorsize_bits); 1876 - if (eb && atomic_inc_not_zero(&eb->refs)) { 1877 - rcu_read_unlock(); 1878 - return eb; 1879 - } 1875 + eb = xa_load(&fs_info->buffer_tree, index); 1876 + if (eb && !atomic_inc_not_zero(&eb->refs)) 1877 + eb = NULL; 1880 1878 rcu_read_unlock(); 1881 - return NULL; 1879 + return eb; 1882 1880 } 1883 1881 1884 1882 static void end_bbio_meta_write(struct btrfs_bio *bbio) ··· 2740 2742 2741 2743 if (!btrfs_meta_is_subpage(fs_info)) { 2742 2744 /* 2743 - * We do this since we'll remove the pages after we've 2744 - * removed the eb from the radix tree, so we could race 2745 - * and have this page now attached to the new eb. So 2746 - * only clear folio if it's still connected to 2747 - * this eb. 2745 + * We do this since we'll remove the pages after we've removed 2746 + * the eb from the xarray, so we could race and have this page 2747 + * now attached to the new eb. So only clear folio if it's 2748 + * still connected to this eb. 2748 2749 */ 2749 2750 if (folio_test_private(folio) && folio_get_private(folio) == eb) { 2750 2751 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); ··· 2908 2911 { 2909 2912 int refs; 2910 2913 /* 2911 - * The TREE_REF bit is first set when the extent_buffer is added 2912 - * to the radix tree. It is also reset, if unset, when a new reference 2913 - * is created by find_extent_buffer. 2914 + * The TREE_REF bit is first set when the extent_buffer is added to the 2915 + * xarray. It is also reset, if unset, when a new reference is created 2916 + * by find_extent_buffer. 
2914 2917 * 2915 2918 * It is only cleared in two cases: freeing the last non-tree 2916 2919 * reference to the extent_buffer when its STALE bit is set or ··· 2922 2925 * conditions between the calls to check_buffer_tree_ref in those 2923 2926 * codepaths and clearing TREE_REF in try_release_extent_buffer. 2924 2927 * 2925 - * The actual lifetime of the extent_buffer in the radix tree is 2926 - * adequately protected by the refcount, but the TREE_REF bit and 2927 - * its corresponding reference are not. To protect against this 2928 - * class of races, we call check_buffer_tree_ref from the codepaths 2929 - * which trigger io. Note that once io is initiated, TREE_REF can no 2930 - * longer be cleared, so that is the moment at which any such race is 2931 - * best fixed. 2928 + * The actual lifetime of the extent_buffer in the xarray is adequately 2929 + * protected by the refcount, but the TREE_REF bit and its corresponding 2930 + * reference are not. To protect against this class of races, we call 2931 + * check_buffer_tree_ref() from the code paths which trigger io. Note that 2932 + * once io is initiated, TREE_REF can no longer be cleared, so that is 2933 + * the moment at which any such race is best fixed. 2932 2934 */ 2933 2935 refs = atomic_read(&eb->refs); 2934 2936 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) ··· 2991 2995 return ERR_PTR(-ENOMEM); 2992 2996 eb->fs_info = fs_info; 2993 2997 again: 2994 - ret = radix_tree_preload(GFP_NOFS); 2995 - if (ret) { 2996 - exists = ERR_PTR(ret); 2998 + xa_lock_irq(&fs_info->buffer_tree); 2999 + exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits, 3000 + NULL, eb, GFP_NOFS); 3001 + if (xa_is_err(exists)) { 3002 + ret = xa_err(exists); 3003 + xa_unlock_irq(&fs_info->buffer_tree); 3004 + btrfs_release_extent_buffer(eb); 3005 + return ERR_PTR(ret); 3006 + } 3007 + if (exists) { 3008 + if (!atomic_inc_not_zero(&exists->refs)) { 3009 + /* The extent buffer is being freed, retry. 
*/ 3010 + xa_unlock_irq(&fs_info->buffer_tree); 3011 + goto again; 3012 + } 3013 + xa_unlock_irq(&fs_info->buffer_tree); 2997 3014 goto free_eb; 2998 3015 } 2999 - spin_lock(&fs_info->buffer_lock); 3000 - ret = radix_tree_insert(&fs_info->buffer_radix, 3001 - start >> fs_info->sectorsize_bits, eb); 3002 - spin_unlock(&fs_info->buffer_lock); 3003 - radix_tree_preload_end(); 3004 - if (ret == -EEXIST) { 3005 - exists = find_extent_buffer(fs_info, start); 3006 - if (exists) 3007 - goto free_eb; 3008 - else 3009 - goto again; 3010 - } 3016 + xa_unlock_irq(&fs_info->buffer_tree); 3011 3017 check_buffer_tree_ref(eb); 3012 3018 3013 3019 return eb; ··· 3030 3032 lockdep_assert_held(&folio->mapping->i_private_lock); 3031 3033 3032 3034 /* 3033 - * For subpage case, we completely rely on radix tree to ensure we 3034 - * don't try to insert two ebs for the same bytenr. So here we always 3035 - * return NULL and just continue. 3035 + * For subpage case, we completely rely on xarray to ensure we don't try 3036 + * to insert two ebs for the same bytenr. So here we always return NULL 3037 + * and just continue. 3036 3038 */ 3037 3039 if (btrfs_meta_is_subpage(fs_info)) 3038 3040 return NULL; ··· 3163 3165 /* 3164 3166 * To inform we have an extra eb under allocation, so that 3165 3167 * detach_extent_buffer_page() won't release the folio private when the 3166 - * eb hasn't been inserted into radix tree yet. 3168 + * eb hasn't been inserted into the xarray yet. 3167 3169 * 3168 3170 * The ref will be decreased when the eb releases the page, in 3169 3171 * detach_extent_buffer_page(). Thus needs no special handling in the ··· 3297 3299 3298 3300 /* 3299 3301 * We can't unlock the pages just yet since the extent buffer 3300 - * hasn't been properly inserted in the radix tree, this 3301 - * opens a race with btree_release_folio which can free a page 3302 - * while we are still filling in all pages for the buffer and 3303 - * we could crash. 
3302 + * hasn't been properly inserted into the xarray, this opens a 3303 + * race with btree_release_folio() which can free a page while we 3304 + * are still filling in all pages for the buffer and we could crash. 3304 3305 */ 3305 3306 } 3306 3307 if (uptodate) ··· 3308 3311 if (page_contig) 3309 3312 eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); 3310 3313 again: 3311 - ret = radix_tree_preload(GFP_NOFS); 3312 - if (ret) 3314 + xa_lock_irq(&fs_info->buffer_tree); 3315 + existing_eb = __xa_cmpxchg(&fs_info->buffer_tree, 3316 + start >> fs_info->sectorsize_bits, NULL, eb, 3317 + GFP_NOFS); 3318 + if (xa_is_err(existing_eb)) { 3319 + ret = xa_err(existing_eb); 3320 + xa_unlock_irq(&fs_info->buffer_tree); 3313 3321 goto out; 3314 - 3315 - spin_lock(&fs_info->buffer_lock); 3316 - ret = radix_tree_insert(&fs_info->buffer_radix, 3317 - start >> fs_info->sectorsize_bits, eb); 3318 - spin_unlock(&fs_info->buffer_lock); 3319 - radix_tree_preload_end(); 3320 - if (ret == -EEXIST) { 3321 - ret = 0; 3322 - existing_eb = find_extent_buffer(fs_info, start); 3323 - if (existing_eb) 3324 - goto out; 3325 - else 3326 - goto again; 3327 3322 } 3323 + if (existing_eb) { 3324 + if (!atomic_inc_not_zero(&existing_eb->refs)) { 3325 + xa_unlock_irq(&fs_info->buffer_tree); 3326 + goto again; 3327 + } 3328 + xa_unlock_irq(&fs_info->buffer_tree); 3329 + goto out; 3330 + } 3331 + xa_unlock_irq(&fs_info->buffer_tree); 3332 + 3328 3333 /* add one reference for the tree */ 3329 3334 check_buffer_tree_ref(eb); 3330 3335 ··· 3396 3397 3397 3398 spin_unlock(&eb->refs_lock); 3398 3399 3399 - spin_lock(&fs_info->buffer_lock); 3400 - radix_tree_delete_item(&fs_info->buffer_radix, 3401 - eb->start >> fs_info->sectorsize_bits, eb); 3402 - spin_unlock(&fs_info->buffer_lock); 3400 + /* 3401 + * We're erasing, theoretically there will be no allocations, so 3402 + * just use GFP_ATOMIC. 
3403 + * 3404 + * We use cmpxchg instead of erase because we do not know if 3405 + * this eb is actually in the tree or not, we could be cleaning 3406 + * up an eb that we allocated but never inserted into the tree. 3407 + * Thus use cmpxchg to remove it from the tree if it is there, 3408 + * or leave the other entry if this isn't in the tree. 3409 + * 3410 + * The documentation says that putting a NULL value is the same 3411 + * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't 3412 + * in this case. 3413 + */ 3414 + xa_cmpxchg_irq(&fs_info->buffer_tree, 3415 + eb->start >> fs_info->sectorsize_bits, eb, NULL, 3416 + GFP_ATOMIC); 3403 3417 3404 3418 btrfs_leak_debug_del_eb(eb); 3405 3419 /* Should be safe to release folios at this point. */ ··· 4243 4231 } 4244 4232 } 4245 4233 4246 - #define GANG_LOOKUP_SIZE 16 4247 - static struct extent_buffer *get_next_extent_buffer( 4248 - const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) 4249 - { 4250 - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; 4251 - struct extent_buffer *found = NULL; 4252 - u64 folio_start = folio_pos(folio); 4253 - u64 cur = folio_start; 4254 - 4255 - ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); 4256 - lockdep_assert_held(&fs_info->buffer_lock); 4257 - 4258 - while (cur < folio_start + PAGE_SIZE) { 4259 - int ret; 4260 - int i; 4261 - 4262 - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, 4263 - (void **)gang, cur >> fs_info->sectorsize_bits, 4264 - min_t(unsigned int, GANG_LOOKUP_SIZE, 4265 - PAGE_SIZE / fs_info->nodesize)); 4266 - if (ret == 0) 4267 - goto out; 4268 - for (i = 0; i < ret; i++) { 4269 - /* Already beyond page end */ 4270 - if (gang[i]->start >= folio_start + PAGE_SIZE) 4271 - goto out; 4272 - /* Found one */ 4273 - if (gang[i]->start >= bytenr) { 4274 - found = gang[i]; 4275 - goto out; 4276 - } 4277 - } 4278 - cur = gang[ret - 1]->start + gang[ret - 1]->len; 4279 - } 4280 - out: 4281 - return found; 4282 - } 4283 - 4284 4234 static int 
try_release_subpage_extent_buffer(struct folio *folio) 4285 4235 { 4286 4236 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); 4287 - u64 cur = folio_pos(folio); 4288 - const u64 end = cur + PAGE_SIZE; 4237 + struct extent_buffer *eb; 4238 + unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits); 4239 + unsigned long index = start; 4240 + unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1; 4289 4241 int ret; 4290 4242 4291 - while (cur < end) { 4292 - struct extent_buffer *eb = NULL; 4293 - 4294 - /* 4295 - * Unlike try_release_extent_buffer() which uses folio private 4296 - * to grab buffer, for subpage case we rely on radix tree, thus 4297 - * we need to ensure radix tree consistency. 4298 - * 4299 - * We also want an atomic snapshot of the radix tree, thus go 4300 - * with spinlock rather than RCU. 4301 - */ 4302 - spin_lock(&fs_info->buffer_lock); 4303 - eb = get_next_extent_buffer(fs_info, folio, cur); 4304 - if (!eb) { 4305 - /* No more eb in the page range after or at cur */ 4306 - spin_unlock(&fs_info->buffer_lock); 4307 - break; 4308 - } 4309 - cur = eb->start + eb->len; 4310 - 4243 + xa_lock_irq(&fs_info->buffer_tree); 4244 + xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) { 4311 4245 /* 4312 4246 * The same as try_release_extent_buffer(), to ensure the eb 4313 4247 * won't disappear out from under us. ··· 4261 4303 spin_lock(&eb->refs_lock); 4262 4304 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 4263 4305 spin_unlock(&eb->refs_lock); 4264 - spin_unlock(&fs_info->buffer_lock); 4265 - break; 4306 + continue; 4266 4307 } 4267 - spin_unlock(&fs_info->buffer_lock); 4308 + xa_unlock_irq(&fs_info->buffer_tree); 4268 4309 4269 4310 /* 4270 4311 * If tree ref isn't set then we know the ref on this eb is a ··· 4281 4324 * release_extent_buffer() will release the refs_lock. 
4282 4325 */ 4283 4326 release_extent_buffer(eb); 4327 + xa_lock_irq(&fs_info->buffer_tree); 4284 4328 } 4329 + xa_unlock_irq(&fs_info->buffer_tree); 4330 + 4285 4331 /* 4286 4332 * Finally to check if we have cleared folio private, as if we have 4287 4333 * released all ebs in the page, the folio private should be cleared now.
+1 -3
fs/btrfs/fs.h
··· 777 777 778 778 struct btrfs_delayed_root *delayed_root; 779 779 780 - /* Extent buffer radix tree */ 781 - spinlock_t buffer_lock; 782 780 /* Entries are eb->start / sectorsize */ 783 - struct radix_tree_root buffer_radix; 781 + struct xarray buffer_tree; 784 782 785 783 /* Next backup root to be overwritten */ 786 784 int backup_root_index;
+8 -20
fs/btrfs/tests/btrfs-tests.c
··· 157 157 158 158 void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) 159 159 { 160 - struct radix_tree_iter iter; 161 - void **slot; 162 160 struct btrfs_device *dev, *tmp; 161 + struct extent_buffer *eb; 162 + unsigned long index; 163 163 164 164 if (!fs_info) 165 165 return; ··· 169 169 170 170 test_mnt->mnt_sb->s_fs_info = NULL; 171 171 172 - spin_lock(&fs_info->buffer_lock); 173 - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { 174 - struct extent_buffer *eb; 175 - 176 - eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); 177 - if (!eb) 178 - continue; 179 - /* Shouldn't happen but that kind of thinking creates CVE's */ 180 - if (radix_tree_exception(eb)) { 181 - if (radix_tree_deref_retry(eb)) 182 - slot = radix_tree_iter_retry(&iter); 183 - continue; 184 - } 185 - slot = radix_tree_iter_resume(slot, &iter); 186 - spin_unlock(&fs_info->buffer_lock); 187 - free_extent_buffer_stale(eb); 188 - spin_lock(&fs_info->buffer_lock); 172 + xa_lock_irq(&fs_info->buffer_tree); 173 + xa_for_each(&fs_info->buffer_tree, index, eb) { 174 + xa_unlock_irq(&fs_info->buffer_tree); 175 + free_extent_buffer(eb); 176 + xa_lock_irq(&fs_info->buffer_tree); 189 177 } 190 - spin_unlock(&fs_info->buffer_lock); 178 + xa_unlock_irq(&fs_info->buffer_tree); 191 179 192 180 btrfs_mapping_tree_free(fs_info); 193 181 list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
+2 -14
fs/btrfs/zoned.c
··· 2171 2171 { 2172 2172 struct btrfs_fs_info *fs_info = block_group->fs_info; 2173 2173 const u64 end = block_group->start + block_group->length; 2174 - struct radix_tree_iter iter; 2175 2174 struct extent_buffer *eb; 2176 - void __rcu **slot; 2175 + unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits); 2177 2176 2178 2177 rcu_read_lock(); 2179 - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 2180 - block_group->start >> fs_info->sectorsize_bits) { 2181 - eb = radix_tree_deref_slot(slot); 2182 - if (!eb) 2183 - continue; 2184 - if (radix_tree_deref_retry(eb)) { 2185 - slot = radix_tree_iter_retry(&iter); 2186 - continue; 2187 - } 2188 - 2178 + xa_for_each_start(&fs_info->buffer_tree, index, eb, start) { 2189 2179 if (eb->start < block_group->start) 2190 2180 continue; 2191 2181 if (eb->start >= end) 2192 2182 break; 2193 - 2194 - slot = radix_tree_iter_resume(slot, &iter); 2195 2183 rcu_read_unlock(); 2196 2184 wait_on_extent_buffer_writeback(eb); 2197 2185 rcu_read_lock();