Merge tag 'for-5.19-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - zoned relocation fixes:
     - fix end of the critical section for extent writeback, which could
       lead to out-of-order writes
     - prevent writing to the previous data relocation block group if
       space gets low

 - reflink fixes:
     - fix race between reflinking and ordered extent completion
     - proper error handling when block reserve migration fails
     - add missing inode iversion/mtime/ctime updates on each iteration
       when replacing extents

 - fix deadlock when running fsync/fiemap/commit at the same time

 - fix false positive KCSAN data race report regarding pid tracking for
   read locks

- minor documentation update and link to new site

* tag 'for-5.19-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
Documentation: update btrfs list of features and link to readthedocs.io
btrfs: fix deadlock with fsync+fiemap+transaction commit
btrfs: don't set lock_owner when locking extent buffer for reading
btrfs: zoned: fix critical section of relocation inode writeback
btrfs: zoned: prevent allocation from previous data relocation BG
btrfs: do not BUG_ON() on failure to migrate space when replacing extents
btrfs: add missing inode updates on each iteration when replacing extents
btrfs: fix race between reflinking and ordered extent completion

Changed files (+160 -32)

Documentation/filesystems/btrfs.rst (+13 -3)

···
   * Subvolumes (separate internal filesystem roots)
   * Object level mirroring and striping
   * Checksums on data and metadata (multiple algorithms available)
 - * Compression
 + * Compression (multiple algorithms available)
 + * Reflink, deduplication
 + * Scrub (on-line checksum verification)
 + * Hierarchical quota groups (subvolume and snapshot support)
   * Integrated multiple device support, with several raid algorithms
   * Offline filesystem check
 - * Efficient incremental backup and FS mirroring
 + * Efficient incremental backup and FS mirroring (send/receive)
 + * Trim/discard
   * Online filesystem defragmentation
 + * Swapfile support
 + * Zoned mode
 + * Read/write metadata verification
 + * Online resize (shrink, grow)

 - For more information please refer to the wiki
 + For more information please refer to the documentation site or wiki
 +
 + https://btrfs.readthedocs.io

   https://btrfs.wiki.kernel.org

fs/btrfs/block-group.h (+1)

···
         unsigned int relocating_repair:1;
         unsigned int chunk_item_inserted:1;
         unsigned int zone_is_active:1;
+        unsigned int zoned_data_reloc_ongoing:1;

         int disk_cache_state;

fs/btrfs/ctree.h (+2)

···
          * existing extent into a file range.
          */
         bool is_new_extent;
+        /* Indicate if we should update the inode's mtime and ctime. */
+        bool update_times;
         /* Meaningful only if is_new_extent is true. */
         int qgroup_reserved;
         /*

fs/btrfs/extent-tree.c (+18 -2)

···
                block_group->start == fs_info->data_reloc_bg ||
                fs_info->data_reloc_bg == 0);

-        if (block_group->ro) {
+        if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
                 ret = 1;
                 goto out;
         }
···
 out:
         if (ret && ffe_ctl->for_treelog)
                 fs_info->treelog_bg = 0;
-        if (ret && ffe_ctl->for_data_reloc)
+        if (ret && ffe_ctl->for_data_reloc &&
+            fs_info->data_reloc_bg == block_group->start) {
+                /*
+                 * Do not allow further allocations from this block group.
+                 * Compared to increasing the ->ro, setting the
+                 * ->zoned_data_reloc_ongoing flag still allows nocow
+                 * writers to come in. See btrfs_inc_nocow_writers().
+                 *
+                 * We need to disable an allocation to avoid an allocation of
+                 * regular (non-relocation data) extent. With mix of relocation
+                 * extents and regular extents, we can dispatch WRITE commands
+                 * (for relocation extents) and ZONE APPEND commands (for
+                 * regular extents) at the same time to the same zone, which
+                 * easily break the write pointer.
+                 */
+                block_group->zoned_data_reloc_ongoing = 1;
                 fs_info->data_reloc_bg = 0;
+        }
         spin_unlock(&fs_info->relocation_bg_lock);
         spin_unlock(&fs_info->treelog_bg_lock);
         spin_unlock(&block_group->lock);

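For readers unfamiliar with the constraint described in the comment above, here is a minimal, self-contained sketch of the idea (illustrative only, not part of the changeset; all struct and function names are made up): a dedicated "relocation ongoing" bit refuses new extent allocations from the block group, while nocow overwrites of already allocated extents only check the read-only bit and can still proceed.

/* Toy model: blocking new allocations without marking the group read-only. */
#include <stdbool.h>
#include <stdio.h>

struct toy_block_group {
        bool ro;                        /* fully read-only */
        bool zoned_data_reloc_ongoing;  /* relocation extents still in flight */
};

/* New extent allocation: refused while relocation writes are pending. */
static bool toy_can_allocate(const struct toy_block_group *bg)
{
        return !bg->ro && !bg->zoned_data_reloc_ongoing;
}

/* Nocow overwrite of an existing extent: only blocked by the read-only bit. */
static bool toy_can_nocow_write(const struct toy_block_group *bg)
{
        return !bg->ro;
}

int main(void)
{
        struct toy_block_group bg = { .ro = false,
                                      .zoned_data_reloc_ongoing = true };

        /* Prints "allocate: 0, nocow write: 1" while relocation is ongoing. */
        printf("allocate: %d, nocow write: %d\n",
               toy_can_allocate(&bg), toy_can_nocow_write(&bg));
        return 0;
}
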
fs/btrfs/extent_io.c (+2 -1)

···
          */
         btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
         ret = extent_write_cache_pages(mapping, wbc, &epd);
-        btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
         ASSERT(ret <= 0);
         if (ret < 0) {
+                btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
                 end_write_bio(&epd, ret);
                 return ret;
         }
         flush_write_bio(&epd);
+        btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
         return ret;
 }

fs/btrfs/file.c (+77 -19)

···
          */
         btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);

-        if (ret != BTRFS_NO_LOG_SYNC) {
-                if (!ret) {
-                        ret = btrfs_sync_log(trans, root, &ctx);
-                        if (!ret) {
-                                ret = btrfs_end_transaction(trans);
-                                goto out;
-                        }
-                }
-                if (!full_sync) {
-                        ret = btrfs_wait_ordered_range(inode, start, len);
-                        if (ret) {
-                                btrfs_end_transaction(trans);
-                                goto out;
-                        }
-                }
-                ret = btrfs_commit_transaction(trans);
-        } else {
+        if (ret == BTRFS_NO_LOG_SYNC) {
                 ret = btrfs_end_transaction(trans);
+                goto out;
         }
+
+        /* We successfully logged the inode, attempt to sync the log. */
+        if (!ret) {
+                ret = btrfs_sync_log(trans, root, &ctx);
+                if (!ret) {
+                        ret = btrfs_end_transaction(trans);
+                        goto out;
+                }
+        }
+
+        /*
+         * At this point we need to commit the transaction because we had
+         * btrfs_need_log_full_commit() or some other error.
+         *
+         * If we didn't do a full sync we have to stop the trans handle, wait on
+         * the ordered extents, start it again and commit the transaction. If
+         * we attempt to wait on the ordered extents here we could deadlock with
+         * something like fallocate() that is holding the extent lock trying to
+         * start a transaction while some other thread is trying to commit the
+         * transaction while we (fsync) are currently holding the transaction
+         * open.
+         */
+        if (!full_sync) {
+                ret = btrfs_end_transaction(trans);
+                if (ret)
+                        goto out;
+                ret = btrfs_wait_ordered_range(inode, start, len);
+                if (ret)
+                        goto out;
+
+                /*
+                 * This is safe to use here because we're only interested in
+                 * making sure the transaction that had the ordered extents is
+                 * committed. We aren't waiting on anything past this point,
+                 * we're purely getting the transaction and committing it.
+                 */
+                trans = btrfs_attach_transaction_barrier(root);
+                if (IS_ERR(trans)) {
+                        ret = PTR_ERR(trans);
+
+                        /*
+                         * We committed the transaction and there's no currently
+                         * running transaction, this means everything we care
+                         * about made it to disk and we are done.
+                         */
+                        if (ret == -ENOENT)
+                                ret = 0;
+                        goto out;
+                }
+        }
+
+        ret = btrfs_commit_transaction(trans);
 out:
         ASSERT(list_empty(&ctx.list));
         err = file_check_and_advance_wb_err(file);
···

         ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
                                       min_size, false);
-        BUG_ON(ret);
+        if (WARN_ON(ret))
+                goto out_trans;
         trans->block_rsv = rsv;

         cur_offset = start;
···
                         extent_info->file_offset += replace_len;
                 }

+                /*
+                 * We are releasing our handle on the transaction, balance the
+                 * dirty pages of the btree inode and flush delayed items, and
+                 * then get a new transaction handle, which may now point to a
+                 * new transaction in case someone else may have committed the
+                 * transaction we used to replace/drop file extent items. So
+                 * bump the inode's iversion and update mtime and ctime except
+                 * if we are called from a dedupe context. This is because a
+                 * power failure/crash may happen after the transaction is
+                 * committed and before we finish replacing/dropping all the
+                 * file extent items we need.
+                 */
+                inode_inc_iversion(&inode->vfs_inode);
+
+                if (!extent_info || extent_info->update_times) {
+                        inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
+                        inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
+                }
+
                 ret = btrfs_update_inode(trans, root, inode);
                 if (ret)
                         break;
···

                 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
                                               rsv, min_size, false);
-                BUG_ON(ret);    /* shouldn't happen */
+                if (WARN_ON(ret))
+                        break;
                 trans->block_rsv = rsv;

                 cur_offset = drop_args.drop_end;

fs/btrfs/inode.c (+3)

···
                                          ordered_extent->file_offset,
                                          ordered_extent->file_offset +
                                          logical_len);
+                btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
+                                                  ordered_extent->disk_num_bytes);
         } else {
                 BUG_ON(root == fs_info->tree_root);
                 ret = insert_ordered_extent_file_extent(trans, ordered_extent);
···
         extent_info.file_offset = file_offset;
         extent_info.extent_buf = (char *)&stack_fi;
         extent_info.is_new_extent = true;
+        extent_info.update_times = true;
         extent_info.qgroup_reserved = qgroup_released;
         extent_info.insertions = 0;

fs/btrfs/locking.c (-3)

···
                 start_ns = ktime_get_ns();

         down_read_nested(&eb->lock, nest);
-        eb->lock_owner = current->pid;
         trace_btrfs_tree_read_lock(eb, start_ns);
 }

···
 int btrfs_try_tree_read_lock(struct extent_buffer *eb)
 {
         if (down_read_trylock(&eb->lock)) {
-                eb->lock_owner = current->pid;
                 trace_btrfs_try_tree_read_lock(eb);
                 return 1;
         }
···
 void btrfs_tree_read_unlock(struct extent_buffer *eb)
 {
         trace_btrfs_tree_read_unlock(eb);
-        eb->lock_owner = 0;
         up_read(&eb->lock);
 }

fs/btrfs/reflink.c (+12 -4)

···
         int ret;
         const u64 len = olen_aligned;
         u64 last_dest_end = destoff;
+        u64 prev_extent_end = off;

         ret = -ENOMEM;
         buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
···
         key.offset = off;

         while (1) {
-                u64 next_key_min_offset = key.offset + 1;
                 struct btrfs_file_extent_item *extent;
                 u64 extent_gen;
                 int type;
···
                  * The first search might have left us at an extent item that
                  * ends before our target range's start, can happen if we have
                  * holes and NO_HOLES feature enabled.
+                 *
+                 * Subsequent searches may leave us on a file range we have
+                 * processed before - this happens due to a race with ordered
+                 * extent completion for a file range that is outside our source
+                 * range, but that range was part of a file extent item that
+                 * also covered a leading part of our source range.
                  */
-                if (key.offset + datal <= off) {
+                if (key.offset + datal <= prev_extent_end) {
                         path->slots[0]++;
                         goto process_slot;
                 } else if (key.offset >= off + len) {
                         break;
                 }
-                next_key_min_offset = key.offset + datal;
+
+                prev_extent_end = key.offset + datal;
                 size = btrfs_item_size(leaf, slot);
                 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
                                    size);
···
                         clone_info.file_offset = new_key.offset;
                         clone_info.extent_buf = buf;
                         clone_info.is_new_extent = false;
+                        clone_info.update_times = !no_time_update;
                         ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
                                         drop_start, new_key.offset + datal - 1,
                                         &clone_info, &trans);
···
                         break;

                 btrfs_release_path(path);
-                key.offset = next_key_min_offset;
+                key.offset = prev_extent_end;

                 if (fatal_signal_pending(current)) {
                         ret = -EINTR;

fs/btrfs/zoned.c (+27)

···
         factor = div64_u64(used * 100, total);
         return factor >= fs_info->bg_reclaim_threshold;
 }
+
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+                                       u64 length)
+{
+        struct btrfs_block_group *block_group;
+
+        if (!btrfs_is_zoned(fs_info))
+                return;
+
+        block_group = btrfs_lookup_block_group(fs_info, logical);
+        /* It should be called on a previous data relocation block group. */
+        ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
+
+        spin_lock(&block_group->lock);
+        if (!block_group->zoned_data_reloc_ongoing)
+                goto out;
+
+        /* All relocation extents are written. */
+        if (block_group->start + block_group->alloc_offset == logical + length) {
+                /* Now, release this block group for further allocations. */
+                block_group->zoned_data_reloc_ongoing = 0;
+        }
+
+out:
+        spin_unlock(&block_group->lock);
+        btrfs_put_block_group(block_group);
+}

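The release condition above is worth spelling out: the flag is cleared only when the completed write ends exactly at the block group's allocation frontier (start + alloc_offset), i.e. every allocated relocation extent has been written out. Below is a small standalone sketch of that arithmetic (illustrative only, hypothetical names, no kernel APIs involved).

/* Toy model of the "all relocation extents written" check. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_bg {
        uint64_t start;        /* logical start of the block group */
        uint64_t alloc_offset; /* bytes allocated so far (the zone write frontier) */
        bool     reloc_ongoing;
};

static void toy_release_reloc_bg(struct toy_bg *bg, uint64_t logical, uint64_t length)
{
        if (!bg->reloc_ongoing)
                return;
        /* Release only when the finished write reaches the allocation frontier. */
        if (bg->start + bg->alloc_offset == logical + length)
                bg->reloc_ongoing = false;
}

int main(void)
{
        struct toy_bg bg = { .start = 1024, .alloc_offset = 512, .reloc_ongoing = true };

        toy_release_reloc_bg(&bg, 1024, 256);  /* ends mid-group: still ongoing (1) */
        printf("after partial write: %d\n", bg.reloc_ongoing);
        toy_release_reloc_bg(&bg, 1280, 256);  /* ends at 1536 == start + alloc_offset (0) */
        printf("after final write:   %d\n", bg.reloc_ongoing);
        return 0;
}
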
fs/btrfs/zoned.h (+5)

···
 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
 void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
 bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+                                       u64 length);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
                                      struct blk_zone *zone)
···
 {
         return false;
 }
+
+static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
+                                                     u64 logical, u64 length) { }
 #endif

 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)