Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

btrfs: make btrfs_repair_io_failure() handle bs > ps cases without large folios

Currently btrfs_repair_io_failure() only accepts a single @paddr
parameter, and for bs > ps cases it's required that @paddr is backed by
a large folio.

That assumption has quite a few limitations, preventing us from utilizing
true zero-copy direct-io and encoded read/writes.

To address the problem, enhance btrfs_repair_io_failure() by:

- Accept an array of paddrs, up to 64K / PAGE_SIZE entries
This kind of acts like a bio_vec, but with very limited entries, as the
function is only utilized to repair one fs data block, or a tree block.

Both have an upper size limit (BTRFS_MAX_BLOCKSIZE, i.e. 64K), so we
don't need the full bio_vec thing to handle it.

- Allocate a bio with multiple slots
Previously even for bs > ps cases, we only passed in a contiguous
physical address range, thus a single slot will be enough.

But not anymore, so we have to allocate a bio structure, rather than
using the on-stack one.

- Use on-stack memory to allocate @paddrs array
It's at most 16 pages (4K page size, 64K block size), so the array will
take up at most 128 bytes.
I think the on-stack cost is still acceptable.

- Add one extra check to make sure the repair bio is exactly one block

- Utilize btrfs_repair_io_failure() to submit a single bio for metadata
This should improve the read-repair performance for metadata, as now
we submit a node sized bio then wait, rather than submitting each block
of the metadata and waiting for each submitted block.

- Add one extra parameter indicating the step
This is due to the fact that metadata step can be as large as
nodesize, instead of sectorsize.
So we need a way to distinguish metadata and data repair.

- Reduce the width of @length parameter of btrfs_repair_io_failure()
Since we only call btrfs_repair_io_failure() on a single data or
metadata block, u64 is overkill.
Use u32 instead and add one extra ASSERT() to make sure the length
never exceeds BTRFS_MAX_BLOCKSIZE.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>

authored by

Qu Wenruo and committed by
David Sterba
2574e901 62bcbdca

+75 -27
+54 -14
fs/btrfs/bio.c
··· 172 172 struct btrfs_inode *inode = repair_bbio->inode; 173 173 struct btrfs_fs_info *fs_info = inode->root->fs_info; 174 174 struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); 175 + /* 176 + * We can not move forward the saved_iter, as it will be later 177 + * utilized by repair_bbio again. 178 + */ 179 + struct bvec_iter saved_iter = repair_bbio->saved_iter; 180 + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); 181 + const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT; 182 + const u32 nr_steps = repair_bbio->saved_iter.bi_size / step; 175 183 int mirror = repair_bbio->mirror_num; 184 + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; 185 + phys_addr_t paddr; 186 + unsigned int slot = 0; 187 + 188 + /* Repair bbio should be eaxctly one block sized. */ 189 + ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize); 176 190 177 191 if (repair_bbio->bio.bi_status || 178 192 !btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) { ··· 204 190 return; 205 191 } 206 192 193 + btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) { 194 + ASSERT(slot < nr_steps); 195 + paddrs[slot] = paddr; 196 + slot++; 197 + } 198 + 207 199 do { 208 200 mirror = prev_repair_mirror(fbio, mirror); 209 201 btrfs_repair_io_failure(fs_info, btrfs_ino(inode), 210 202 repair_bbio->file_offset, fs_info->sectorsize, 211 - repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, 212 - bvec_phys(bv), mirror); 203 + logical, paddrs, step, mirror); 213 204 } while (mirror != fbio->bbio->mirror_num); 214 205 215 206 done: ··· 885 866 * 886 867 * The I/O is issued synchronously to block the repair read completion from 887 868 * freeing the bio. 
869 + * 870 + * @ino: Offending inode number 871 + * @fileoff: File offset inside the inode 872 + * @length: Length of the repair write 873 + * @logical: Logical address of the range 874 + * @paddrs: Physical address array of the content 875 + * @step: Length of for each paddrs 876 + * @mirror_num: Mirror number to write to. Must not be zero 888 877 */ 889 - int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, 890 - u64 length, u64 logical, phys_addr_t paddr, int mirror_num) 878 + int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff, 879 + u32 length, u64 logical, const phys_addr_t paddrs[], 880 + unsigned int step, int mirror_num) 891 881 { 882 + const u32 nr_steps = DIV_ROUND_UP_POW2(length, step); 892 883 struct btrfs_io_stripe smap = { 0 }; 893 - struct bio_vec bvec; 894 - struct bio bio; 884 + struct bio *bio = NULL; 895 885 int ret = 0; 896 886 897 887 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); 898 888 BUG_ON(!mirror_num); 889 + 890 + /* Basic alignment checks. */ 891 + ASSERT(IS_ALIGNED(logical, fs_info->sectorsize)); 892 + ASSERT(IS_ALIGNED(length, fs_info->sectorsize)); 893 + ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize)); 894 + /* Either it's a single data or metadata block. 
*/ 895 + ASSERT(length <= BTRFS_MAX_BLOCKSIZE); 896 + ASSERT(step <= length); 897 + ASSERT(is_power_of_2(step)); 899 898 900 899 if (btrfs_repair_one_zone(fs_info, logical)) 901 900 return 0; ··· 934 897 goto out_counter_dec; 935 898 } 936 899 937 - bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); 938 - bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; 939 - __bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr)); 940 - ret = submit_bio_wait(&bio); 900 + bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); 901 + bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; 902 + for (int i = 0; i < nr_steps; i++) { 903 + ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i])); 904 + /* We should have allocated enough slots to contain all the different pages. */ 905 + ASSERT(ret == step); 906 + } 907 + ret = submit_bio_wait(bio); 908 + bio_put(bio); 941 909 if (ret) { 942 910 /* try to remap that extent elsewhere? */ 943 911 btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS); 944 - goto out_bio_uninit; 912 + goto out_counter_dec; 945 913 } 946 914 947 915 btrfs_info_rl(fs_info, 948 916 "read error corrected: ino %llu off %llu (dev %s sector %llu)", 949 - ino, start, btrfs_dev_name(smap.dev), 917 + ino, fileoff, btrfs_dev_name(smap.dev), 950 918 smap.physical >> SECTOR_SHIFT); 951 919 ret = 0; 952 920 953 - out_bio_uninit: 954 - bio_uninit(&bio); 955 921 out_counter_dec: 956 922 btrfs_bio_counter_dec(fs_info); 957 923 return ret;
+3 -2
fs/btrfs/bio.h
··· 117 117 118 118 void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); 119 119 void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); 120 - int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, 121 - u64 length, u64 logical, phys_addr_t paddr, int mirror_num); 120 + int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff, 121 + u32 length, u64 logical, const phys_addr_t paddrs[], 122 + unsigned int step, int mirror_num); 122 123 123 124 #endif
+18 -11
fs/btrfs/disk-io.c
··· 183 183 int mirror_num) 184 184 { 185 185 struct btrfs_fs_info *fs_info = eb->fs_info; 186 + const u32 step = min(fs_info->nodesize, PAGE_SIZE); 187 + const u32 nr_steps = eb->len / step; 188 + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; 186 189 int ret = 0; 187 190 188 191 if (sb_rdonly(fs_info->sb)) 189 192 return -EROFS; 190 193 191 - for (int i = 0; i < num_extent_folios(eb); i++) { 194 + for (int i = 0; i < num_extent_pages(eb); i++) { 192 195 struct folio *folio = eb->folios[i]; 193 - u64 start = max_t(u64, eb->start, folio_pos(folio)); 194 - u64 end = min_t(u64, eb->start + eb->len, 195 - folio_pos(folio) + eb->folio_size); 196 - u32 len = end - start; 197 - phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) + 198 - offset_in_folio(folio, start); 199 196 200 - ret = btrfs_repair_io_failure(fs_info, 0, start, len, start, 201 - paddr, mirror_num); 202 - if (ret) 203 - break; 197 + /* No large folio support yet. */ 198 + ASSERT(folio_order(folio) == 0); 199 + ASSERT(i < nr_steps); 200 + 201 + /* 202 + * For nodesize < page size, there is just one paddr, with some 203 + * offset inside the page. 204 + * 205 + * For nodesize >= page size, it's one or more paddrs, and eb->start 206 + * must be aligned to page boundary. 207 + */ 208 + paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start); 204 209 } 205 210 211 + ret = btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, eb->start, 212 + paddrs, step, mirror_num); 206 213 return ret; 207 214 } 208 215