btrfs: make read verification handle bs > ps cases without large folios

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

The current read verification also relies on large folios to support
bs > ps cases, which introduces quite a few limitations.

To enhance read-repair to support bs > ps without large folios:

- Make btrfs_data_csum_ok() accept an array of paddrs
So that it can pass paddrs[] directly into
btrfs_calculate_block_csum_pages().

- Make repair_one_sector() accept an array of paddrs
So that it can submit a repair bio backed by regular pages, not only
large folios.
This requires us to allocate more slots at bio allocation time, though.

Also, since the caller may have only partially advanced saved_iter
for bs > ps cases, we cannot directly trust the logical bytenr from
saved_iter (it can be unaligned), so the logical bytenr must be
manually rounded down to the block boundary.

- Make btrfs_check_read_bio() build an array of paddrs
The tricky part is that we can only call btrfs_data_csum_ok() after
all involved pages are assembled.

This means that by the time btrfs_check_read_bio() calls
btrfs_data_csum_ok(), our offset inside the bio is already at the end
of the fs block.
Thus we must re-calculate @bio_offset (offset - sectorsize) for
btrfs_data_csum_ok() and repair_one_sector().

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>

authored by

Qu Wenruo and committed by

David Sterba 4 months ago 052fd7a5 2574e901

+52 -30

3 changed files

expand all

btrfs

bio.c

btrfs_inode.h

inode.c

+41 -21

fs/btrfs/bio.c

··· 171 171 struct btrfs_failed_bio *fbio = repair_bbio->private; 172 172 struct btrfs_inode *inode = repair_bbio->inode; 173 173 struct btrfs_fs_info *fs_info = inode->root->fs_info; 174 - struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); 175 174 /* 176 175 * We can not move forward the saved_iter, as it will be later 177 176 * utilized by repair_bbio again. ··· 187 188 /* Repair bbio should be eaxctly one block sized. */ 188 189 ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize); 189 190 191 + btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) { 192 + ASSERT(slot < nr_steps); 193 + paddrs[slot] = paddr; 194 + slot++; 195 + } 196 + 190 197 if (repair_bbio->bio.bi_status || 191 - !btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) { 198 + !btrfs_data_csum_ok(repair_bbio, dev, 0, paddrs)) { 192 199 bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); 193 200 repair_bbio->bio.bi_iter = repair_bbio->saved_iter; 194 201 ··· 207 202 208 203 btrfs_submit_bbio(repair_bbio, mirror); 209 204 return; 210 - } 211 - 212 - btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) { 213 - ASSERT(slot < nr_steps); 214 - paddrs[slot] = paddr; 215 - slot++; 216 205 } 217 206 218 207 do { ··· 230 231 */ 231 232 static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, 232 233 u32 bio_offset, 233 - phys_addr_t paddr, 234 + phys_addr_t paddrs[], 234 235 struct btrfs_failed_bio *fbio) 235 236 { 236 237 struct btrfs_inode *inode = failed_bbio->inode; 237 238 struct btrfs_fs_info *fs_info = inode->root->fs_info; 238 - struct folio *folio = page_folio(phys_to_page(paddr)); 239 239 const u32 sectorsize = fs_info->sectorsize; 240 - const u32 foff = offset_in_folio(folio, paddr); 241 - const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); 240 + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); 241 + const u32 nr_steps = sectorsize / step; 242 + /* 243 + * For bs > ps cases, the saved_iter can 
be partially moved forward. 244 + * In that case we should round it down to the block boundary. 245 + */ 246 + const u64 logical = round_down(failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT, 247 + sectorsize); 242 248 struct btrfs_bio *repair_bbio; 243 249 struct bio *repair_bio; 244 250 int num_copies; 245 251 int mirror; 246 252 247 - ASSERT(foff + sectorsize <= folio_size(folio)); 248 253 btrfs_debug(fs_info, "repair read error: read error at %llu", 249 254 failed_bbio->file_offset + bio_offset); 250 255 ··· 268 265 269 266 atomic_inc(&fbio->repair_count); 270 267 271 - repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, 268 + repair_bio = bio_alloc_bioset(NULL, nr_steps, REQ_OP_READ, GFP_NOFS, 272 269 &btrfs_repair_bioset); 273 - repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; 274 - bio_add_folio_nofail(repair_bio, folio, sectorsize, foff); 270 + repair_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT; 271 + for (int i = 0; i < nr_steps; i++) { 272 + int ret; 273 + 274 + ASSERT(offset_in_page(paddrs[i]) + step <= PAGE_SIZE); 275 + 276 + ret = bio_add_page(repair_bio, phys_to_page(paddrs[i]), step, 277 + offset_in_page(paddrs[i])); 278 + ASSERT(ret == step); 279 + } 275 280 276 281 repair_bbio = btrfs_bio(repair_bio); 277 282 btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset, ··· 295 284 { 296 285 struct btrfs_inode *inode = bbio->inode; 297 286 struct btrfs_fs_info *fs_info = inode->root->fs_info; 298 - u32 sectorsize = fs_info->sectorsize; 287 + const u32 sectorsize = fs_info->sectorsize; 288 + const u32 step = min(sectorsize, PAGE_SIZE); 289 + const u32 nr_steps = sectorsize / step; 299 290 struct bvec_iter *iter = &bbio->saved_iter; 300 291 blk_status_t status = bbio->bio.bi_status; 301 292 struct btrfs_failed_bio *fbio = NULL; 293 + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; 302 294 phys_addr_t paddr; 303 295 u32 offset = 0; 304 296 ··· 320 306 /* Clear the I/O error. 
A failed repair will reset it. */ 321 307 bbio->bio.bi_status = BLK_STS_OK; 322 308 323 - btrfs_bio_for_each_block(paddr, &bbio->bio, iter, fs_info->sectorsize) { 324 - if (status || !btrfs_data_csum_ok(bbio, dev, offset, paddr)) 325 - fbio = repair_one_sector(bbio, offset, paddr, fbio); 326 - offset += sectorsize; 309 + btrfs_bio_for_each_block(paddr, &bbio->bio, iter, step) { 310 + paddrs[(offset / step) % nr_steps] = paddr; 311 + offset += step; 312 + 313 + if (IS_ALIGNED(offset, sectorsize)) { 314 + if (status || 315 + !btrfs_data_csum_ok(bbio, dev, offset - sectorsize, paddrs)) 316 + fbio = repair_one_sector(bbio, offset - sectorsize, 317 + paddrs, fbio); 318 + } 327 319 } 328 320 if (bbio->csum != bbio->csum_inline) 329 321 kvfree(bbio->csum);

+1 -1

fs/btrfs/btrfs_inode.h

··· 550 550 int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, 551 551 const u8 * const csum_expected); 552 552 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, 553 - u32 bio_offset, phys_addr_t paddr); 553 + u32 bio_offset, const phys_addr_t paddrs[]); 554 554 noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, 555 555 struct btrfs_file_extent *file_extent, 556 556 bool nowait);

+10 -8

fs/btrfs/inode.c

··· 3420 3420 } 3421 3421 3422 3422 /* 3423 - * Verify the checksum of a single data sector. 3423 + * Verify the checksum of a single data sector, which can be scattered at 3424 + * different noncontiguous pages. 3424 3425 * 3425 3426 * @bbio: btrfs_io_bio which contains the csum 3426 3427 * @dev: device the sector is on 3427 3428 * @bio_offset: offset to the beginning of the bio (in bytes) 3428 - * @bv: bio_vec to check 3429 + * @paddrs: physical addresses which back the fs block 3429 3430 * 3430 3431 * Check if the checksum on a data block is valid. When a checksum mismatch is 3431 3432 * detected, report the error and fill the corrupted range with zero. ··· 3434 3433 * Return %true if the sector is ok or had no checksum to start with, else %false. 3435 3434 */ 3436 3435 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, 3437 - u32 bio_offset, phys_addr_t paddr) 3436 + u32 bio_offset, const phys_addr_t paddrs[]) 3438 3437 { 3439 3438 struct btrfs_inode *inode = bbio->inode; 3440 3439 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3441 3440 const u32 blocksize = fs_info->sectorsize; 3442 - struct folio *folio; 3441 + const u32 step = min(blocksize, PAGE_SIZE); 3442 + const u32 nr_steps = blocksize / step; 3443 3443 u64 file_offset = bbio->file_offset + bio_offset; 3444 3444 u64 end = file_offset + blocksize - 1; 3445 3445 u8 *csum_expected; ··· 3460 3458 3461 3459 csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * 3462 3460 fs_info->csum_size; 3463 - if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected)) 3461 + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum); 3462 + if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0)) 3464 3463 goto zeroit; 3465 3464 return true; 3466 3465 ··· 3470 3467 bbio->mirror_num); 3471 3468 if (dev) 3472 3469 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); 3473 - folio = page_folio(phys_to_page(paddr)); 3474 - 
ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); 3475 - folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize); 3470 + for (int i = 0; i < nr_steps; i++) 3471 + memzero_page(phys_to_page(paddrs[i]), offset_in_page(paddrs[i]), step); 3476 3472 return false; 3477 3473 } 3478 3474