// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "file-item.h"
#include "btrfs_inode.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11

static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
{
	if (unlikely(!bioc)) {
		btrfs_crit(fs_info, "bioc=NULL");
		return;
	}
	btrfs_crit(fs_info,
"bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
		bioc->logical, bioc->full_stripe_logical, bioc->size,
		bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
		bioc->replace_stripe_src, bioc->num_stripes);
	for (int i = 0; i < bioc->num_stripes; i++) {
		btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu",
			   i, bioc->stripes[i].dev->devid,
			   bioc->stripes[i].physical);
	}
}

static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
			    const struct btrfs_raid_bio *rbio)
{
	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	dump_bioc(fs_info, rbio->bioc);
	btrfs_crit(fs_info,
"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u sector_nsteps=%u scrubp=%u dbitmap=0x%lx",
		rbio->flags, rbio->nr_sectors, rbio->nr_data,
		rbio->real_stripes, rbio->stripe_nsectors,
		rbio->sector_nsteps, rbio->scrubp, rbio->dbitmap);
}

#define ASSERT_RBIO(expr, rbio)						\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
			(rbio)->bioc->fs_info : NULL;			\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
	}								\
	ASSERT((expr));							\
})

#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
			(rbio)->bioc->fs_info : NULL;			\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr));	\
	}								\
	ASSERT((expr));							\
})

#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
			(rbio)->bioc->fs_info : NULL;			\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr));	\
	}								\
	ASSERT((expr));							\
})

#define ASSERT_RBIO_LOGICAL(expr, rbio, logical)			\
({									\
	if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) {	\
		const struct btrfs_fs_info *__fs_info = (rbio)->bioc ?	\
			(rbio)->bioc->fs_info : NULL;			\
									\
		btrfs_dump_rbio(__fs_info, (rbio));			\
		btrfs_crit(__fs_info, "logical=%llu", (logical));	\
	}								\
	ASSERT((expr));							\
})

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

/*
 * The PFN may still be valid, but our paddrs should always be block size
 * aligned, thus such -1 paddr is definitely not a valid one.
 */
#define INVALID_PADDR	(~(phys_addr_t)0)

static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
static void scrub_rbio_work_locked(struct work_struct *work);

static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
{
	bitmap_free(rbio->error_bitmap);
	bitmap_free(rbio->stripe_uptodate_bitmap);
	kfree(rbio->stripe_pages);
	kfree(rbio->bio_paddrs);
	kfree(rbio->stripe_paddrs);
	kfree(rbio->finish_pointers);
}

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	free_raid_bio_pointers(rbio);
	kfree(rbio);
}

static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
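	 *
	 * kvzalloc() provides exactly that behaviour: it first attempts a
	 * regular kmalloc() and transparently falls back to vmalloc() when
	 * the physically contiguous allocation fails.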
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (unsigned int i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}

static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr)
{
	const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);

	ASSERT(sector_nr < rbio->nr_sectors);
	for (int i = 0; i < rbio->sector_nsteps; i++) {
		unsigned int index = sector_nr * rbio->sector_nsteps + i;
		phys_addr_t dst = rbio->stripe_paddrs[index];
		phys_addr_t src = rbio->bio_paddrs[index];

		ASSERT(dst != INVALID_PADDR);
		ASSERT(src != INVALID_PADDR);

		memcpy_page(phys_to_page(dst), offset_in_page(dst),
			    phys_to_page(src), offset_in_page(src), step);
	}
}

/*
 * caching an rbio means to copy anything from the
 * bio_sectors array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (rbio->bio_paddrs[i * rbio->sector_nsteps] == INVALID_PADDR) {
			/*
			 * Even if the sector is not covered by bio, if it is
			 * a data sector it should still be uptodate as it is
			 * read from disk.
			 */
			if (i < rbio->nr_data * rbio->stripe_nsectors)
				ASSERT(test_bit(i, rbio->stripe_uptodate_bitmap));
			continue;
		}

		memcpy_from_bio_to_stripe(rbio, i);
		set_bit(i, rbio->stripe_uptodate_bitmap);
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->full_stripe_logical;

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

/* Get the sector number of the first sector covered by @page_nr. */
static u32 page_nr_to_sector_nr(struct btrfs_raid_bio *rbio, unsigned int page_nr)
{
	u32 sector_nr;

	ASSERT(page_nr < rbio->nr_pages);

	sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits;
	ASSERT(sector_nr < rbio->nr_sectors);
	return sector_nr;
}

/*
 * Get the number of sectors covered by @page_nr.
 *
 * For bs > ps cases, the result will always be 1.
 * For bs <= ps cases, the result will be ps / bs.
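 *
 * E.g. 4K blocks with 64K pages give 16 sectors per page, while 16K blocks
 * with 4K pages give 1 (a single block spans several pages).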
 */
static u32 page_nr_to_num_sectors(struct btrfs_raid_bio *rbio, unsigned int page_nr)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	u32 nr_sectors;

	ASSERT(page_nr < rbio->nr_pages);

	nr_sectors = round_up(PAGE_SIZE, fs_info->sectorsize) >> fs_info->sectorsize_bits;
	ASSERT(nr_sectors > 0);
	return nr_sectors;
}

static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
						      unsigned int page_nr)
{
	const u32 sector_nr = page_nr_to_sector_nr(rbio, page_nr);
	const u32 nr_bits = page_nr_to_num_sectors(rbio, page_nr);
	int i;

	ASSERT(page_nr < rbio->nr_pages);
	ASSERT(sector_nr + nr_bits < rbio->nr_sectors);

	for (i = sector_nr; i < sector_nr + nr_bits; i++) {
		if (!test_bit(i, rbio->stripe_uptodate_bitmap))
			return false;
	}
	return true;
}

/*
 * Update the stripe_paddrs[] array to use the correct page and offset.
 *
 * Should be called every time any page pointer in stripe_pages[] gets modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors * rbio->sector_nsteps;
	     i++, offset += step) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		if (!rbio->stripe_pages[page_index])
			continue;

		rbio->stripe_paddrs[i] = page_to_phys(rbio->stripe_pages[page_index]) +
					 offset_in_page(offset);
	}
}

static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{
	const u32 sector_nr = page_nr_to_sector_nr(src, page_nr);
	const u32 nr_bits = page_nr_to_num_sectors(src, page_nr);

	ASSERT(page_nr < src->nr_pages);
	ASSERT(sector_nr + nr_bits < src->nr_sectors);

	if (dest->stripe_pages[page_nr])
		__free_page(dest->stripe_pages[page_nr]);
	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
	src->stripe_pages[page_nr] = NULL;

	/* Also update the stripe_uptodate_bitmap bits. */
	bitmap_set(dest->stripe_uptodate_bitmap, sector_nr, nr_bits);
}

static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
{
	const int sector_nr = page_nr_to_sector_nr(rbio, page_nr);

	/*
	 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
	 * we won't have a page which is half data half parity.
	 *
	 * Thus if the first sector of the page belongs to data stripes, then
	 * the full page belongs to data stripes.
	 */
	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
}

/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_paddrs[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		struct page *p = src->stripe_pages[i];

		/*
		 * We don't need to steal P/Q pages as they will always be
		 * regenerated for RMW or full write anyway.
		 */
		if (!is_data_stripe_page(src, i))
			continue;

		/*
		 * If @src already has RBIO_CACHE_READY_BIT, it should have
		 * all data stripe pages present and uptodate.
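		 * (cache_rbio_pages() sets that bit only after every data
		 * sector has either been copied from the bio list or was
		 * already marked uptodate from a previous read.)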
		 */
		ASSERT(p);
		ASSERT(full_page_sectors_uptodate(src, i));
		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->bio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge_init(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	__remove_rbio_from_cache(rbio);
	spin_unlock(&table->cache_lock);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_first_entry(&table->stripe_cache,
					struct btrfs_raid_bio, stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock(&table->cache_lock);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_last_entry(&table->stripe_cache,
					struct btrfs_raid_bio,
					stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock(&table->cache_lock);
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
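 *
 * The @pages array is laid out as [src0 .. src(src_cnt - 1), dest]; see the
 * callers in generate_pq_vertical_step() and recover_vertical_step().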
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock(&rbio->bio_list_lock);
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock(&rbio->bio_list_lock);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * We need to read the full stripe from the drive, check and repair
	 * the parity, and write the new results back.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD)
		return 0;

	return 1;
}

/* Return the sector index for @stripe_nr and @sector_nr. */
static unsigned int rbio_sector_index(const struct btrfs_raid_bio *rbio,
				      unsigned int stripe_nr,
				      unsigned int sector_nr)
{
	unsigned int ret;

	ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
	ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);

	ret = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(ret < rbio->nr_sectors);
	return ret;
}

/*
 * Return the paddr array index for @stripe_nr, @sector_nr and @step_nr.
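 *
 * The resulting index is:
 *   (stripe_nr * stripe_nsectors + sector_nr) * sector_nsteps + step_nr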
 */
static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio,
				     unsigned int stripe_nr,
				     unsigned int sector_nr,
				     unsigned int step_nr)
{
	unsigned int ret;

	ASSERT_RBIO_SECTOR(step_nr < rbio->sector_nsteps, rbio, step_nr);

	ret = rbio_sector_index(rbio, stripe_nr, sector_nr) * rbio->sector_nsteps + step_nr;
	ASSERT(ret < rbio->nr_sectors * rbio->sector_nsteps);
	return ret;
}

static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio,
				     unsigned int stripe_nr, unsigned int sector_nr,
				     unsigned int step_nr)
{
	return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr)];
}

static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio,
				      unsigned int sector_nr, unsigned int step_nr)
{
	return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr, step_nr);
}

static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio,
				      unsigned int sector_nr, unsigned int step_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return INVALID_PADDR;
	return rbio_stripe_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr);
}

/* Return a paddr pointer into the rbio::stripe_paddrs[] for the specified sector. */
static phys_addr_t *rbio_stripe_paddrs(const struct btrfs_raid_bio *rbio,
				       unsigned int stripe_nr, unsigned int sector_nr)
{
	return &rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)];
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock(&h->lock);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}


		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock(&h->lock);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		free_raid_bio(freeit);
	return ret;
}

static void recover_rbio_work_locked(struct work_struct *work);

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock(&h->lock);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
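		 *
		 * The new lock owner is queued to the rmw workers instead of
		 * being run inline from this context.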
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock(&h->lock);

			if (next->operation == BTRFS_RBIO_READ_REBUILD) {
				start_async_work(next, recover_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_rbio_work_locked);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

static void rbio_endio_bio_list(struct bio *cur, blk_status_t status)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = status;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;

	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio
	 * for this bio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	free_raid_bio(rbio);

	rbio_endio_bio_list(cur, status);
	if (extra)
		rbio_endio_bio_list(extra, status);
}

/*
 * Get paddr pointer for the sector specified by its @stripe_nr and @sector_nr.
 *
 * @rbio:		The raid bio
 * @stripe_nr:		Stripe number, valid range [0, real_stripe)
 * @sector_nr:		Sector number inside the stripe,
 *			valid range [0, stripe_nsectors)
 * @bio_list_only:	Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_paddrs[] as fallback.
 *
 * Return NULL if bio_list_only is set but the specified sector has no
 * corresponding bio.
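 *
 * If @bio_list_only is not set and the bio list does not cover the sector,
 * a pointer into the rbio's own stripe_paddrs[] is returned instead.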
 */
static phys_addr_t *sector_paddrs_in_rbio(struct btrfs_raid_bio *rbio,
					  int stripe_nr, int sector_nr,
					  bool bio_list_only)
{
	phys_addr_t *ret = NULL;
	const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, 0);

	ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);

	scoped_guard(spinlock, &rbio->bio_list_lock) {
		if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
			/* Don't return sector without a valid page pointer */
			if (rbio->bio_paddrs[index] != INVALID_PADDR)
				ret = &rbio->bio_paddrs[index];
			return ret;
		}
	}
	return &rbio->stripe_paddrs[index];
}

/*
 * Similar to sector_paddrs_in_rbio(), but with extra consideration for
 * bs > ps cases, where we can have multiple steps for a fs block.
 */
static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio,
					int stripe_nr, int sector_nr, int step_nr,
					bool bio_list_only)
{
	phys_addr_t ret = INVALID_PADDR;
	const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr);

	ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);

	scoped_guard(spinlock, &rbio->bio_list_lock) {
		if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
			/* Don't return sector without a valid page pointer */
			if (rbio->bio_paddrs[index] != INVALID_PADDR)
				ret = rbio->bio_paddrs[index];
			return ret;
		}
	}
	return rbio->stripe_paddrs[index];
}

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that this does
 * not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors =
		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	const unsigned int step = min(fs_info->sectorsize, PAGE_SIZE);
	const unsigned int sector_nsteps = fs_info->sectorsize / step;
	struct btrfs_raid_bio *rbio;

	/*
	 * For bs <= ps cases, ps must be aligned to bs.
	 * For bs > ps cases, bs must be aligned to ps.
	 */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize) ||
	       IS_ALIGNED(fs_info->sectorsize, PAGE_SIZE));
	/*
	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
	 * (at most 16) should be no larger than BITS_PER_LONG.
	 */
	ASSERT(stripe_nsectors <= BITS_PER_LONG);

	/*
	 * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
	 * (limited by u8).
	 */
	ASSERT(real_stripes >= 2);
	ASSERT(real_stripes <= U8_MAX);

	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);
	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
				     GFP_NOFS);
	rbio->bio_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS);
	rbio->stripe_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS);
	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
	rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);

	if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs ||
	    !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) {
		free_raid_bio_pointers(rbio);
		kfree(rbio);
		return ERR_PTR(-ENOMEM);
	}
	for (int i = 0; i < num_sectors * sector_nsteps; i++) {
		rbio->stripe_paddrs[i] = INVALID_PADDR;
		rbio->bio_paddrs[i] = INVALID_PADDR;
	}

	bio_list_init(&rbio->bio_list);
	init_waitqueue_head(&rbio->io_wait);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	btrfs_get_bioc(bioc);
	rbio->bioc = bioc;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	rbio->sector_nsteps = sector_nsteps;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->stripes_pending, 0);

	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
	ASSERT(rbio->nr_data > 0);

	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages, false);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}

/*
 * Return the total number of errors found in the vertical stripe of @sector_nr.
 *
 * @faila and @failb will also be updated to the first and second stripe
 * number of the errors.
 */
static int get_rbio_vertical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
				    int *faila, int *failb)
{
	int stripe_nr;
	int found_errors = 0;

	if (faila || failb) {
		/*
		 * Both @faila and @failb should be valid pointers if any of
		 * them is specified.
		 */
		ASSERT(faila && failb);
		*faila = -1;
		*failb = -1;
	}

	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;

		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
			found_errors++;
			if (faila) {
				/*
				 * Update faila and failb.
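				 * The first failed stripe number goes to
				 * @faila, the second one to @failb.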
				 */
				if (*faila < 0)
					*faila = stripe_nr;
				else if (*failb < 0)
					*failb = stripe_nr;
			}
		}
	}
	return found_errors;
}

static int bio_add_paddrs(struct bio *bio, phys_addr_t *paddrs, unsigned int nr_steps,
			  unsigned int step)
{
	int added = 0;
	int ret;

	for (int i = 0; i < nr_steps; i++) {
		ret = bio_add_page(bio, phys_to_page(paddrs[i]), step,
				   offset_in_page(paddrs[i]));
		if (ret != step)
			goto revert;
		added += ret;
	}
	return added;
revert:
	/*
	 * We don't need to revert the bvec, as the bio will be submitted
	 * immediately; as long as the size is reduced, the extra bvec will
	 * not be accessed.
	 */
	bio->bi_iter.bi_size -= added;
	return 0;
}

/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error, and no byte will be added to @rbio.
 */
static int rbio_add_io_paddrs(struct btrfs_raid_bio *rbio, struct bio_list *bio_list,
			      phys_addr_t *paddrs, unsigned int stripe_nr,
			      unsigned int sector_nr, enum req_op op)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 step = min(sectorsize, PAGE_SIZE);
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripe.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
	ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
			   rbio, stripe_nr);
	ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
			   rbio, sector_nr);
	ASSERT(paddrs != NULL);

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + sector_nr * sectorsize;

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev) {
		int found_errors;

		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
			rbio->error_bitmap);

		/*
		 * Check if we have reached tolerance early.
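		 *
		 * If this vertical stripe already has more errors than
		 * bioc->max_errors can tolerate, fail right away instead of
		 * queuing more IO.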
		 */
		found_errors = get_rbio_vertical_errors(rbio, sector_nr,
							NULL, NULL);
		if (unlikely(found_errors > rbio->bioc->max_errors))
			return -EIO;
		return 0;
	}

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_paddrs(last, paddrs, rbio->sector_nsteps, step);
			if (ret == sectorsize)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(stripe->dev->bdev,
			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
			op, GFP_NOFS);
	bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
	bio->bi_private = rbio;

	ret = bio_add_paddrs(bio, paddrs, rbio->sector_nsteps, step);
	ASSERT(ret == sectorsize);
	bio_list_add(bio_list, bio);
	return 0;
}

static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u32 step_bits = min(fs_info->sectorsize_bits, PAGE_SHIFT);
	struct bvec_iter iter = bio->bi_iter;
	phys_addr_t paddr;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->full_stripe_logical;

	btrfs_bio_for_each_block(paddr, bio, &iter, step) {
		unsigned int index = (offset >> step_bits);

		rbio->bio_paddrs[index] = paddr;
		offset += step;
	}
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;

	spin_lock(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list)
		index_one_bio(rbio, bio);

	spin_unlock(&rbio->bio_list_lock);
}

static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
			       struct raid56_bio_trace_info *trace_info)
{
	const struct btrfs_io_context *bioc = rbio->bioc;
	int i;

	ASSERT(bioc);

	/* We rely on bio->bi_bdev to find the stripe number. */
	if (!bio->bi_bdev)
		goto not_found;

	for (i = 0; i < bioc->num_stripes; i++) {
		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
			continue;
		trace_info->stripe_nr = i;
		trace_info->devid = bioc->stripes[i].dev->devid;
		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
				     bioc->stripes[i].physical;
		return;
	}

not_found:
	trace_info->devid = -1;
	trace_info->offset = -1;
	trace_info->stripe_nr = -1;
}

static inline void bio_list_put(struct bio_list *bio_list)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bio_list)))
		bio_put(bio);
}

static void assert_rbio(struct btrfs_raid_bio *rbio)
{
	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	/*
	 * At least two stripes (2 disks RAID5), and since real_stripes is U8,
	 * we won't go beyond 256 disks anyway.
	 */
	ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
	ASSERT_RBIO(rbio->nr_data > 0, rbio);

	/*
	 * This is another check to make sure nr data stripes is smaller
	 * than total stripes.
	 */
	ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
}

static inline void *kmap_local_paddr(phys_addr_t paddr)
{
	/* The sector pointer must have a page mapped to it. */
	ASSERT(paddr != INVALID_PADDR);

	return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
}

static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr,
				      unsigned int step_nr)
{
	void **pointers = rbio->finish_pointers;
	const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
	int stripe;
	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;

	/* First collect one sector from each data stripe */
	for (stripe = 0; stripe < rbio->nr_data; stripe++)
		pointers[stripe] = kmap_local_paddr(
				sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0));

	/* Then add the parity stripe */
	pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr));

	if (has_qstripe) {
		/*
		 * RAID6, add the qstripe and call the library function
		 * to fill in our p/q
		 */
		pointers[stripe++] = kmap_local_paddr(
				rbio_qstripe_paddr(rbio, sector_nr, step_nr));

		assert_rbio(rbio);
		raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
	} else {
		/* raid5 */
		memcpy(pointers[rbio->nr_data], pointers[0], step);
		run_xor(pointers + 1, rbio->nr_data - 1, step);
	}
	for (stripe = stripe - 1; stripe >= 0; stripe--)
		kunmap_local(pointers[stripe]);
}

/* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
	const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6);

	for (int i = 0; i < rbio->sector_nsteps; i++)
		generate_pq_vertical_step(rbio, sectornr, i);

	set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr),
		rbio->stripe_uptodate_bitmap);
	if (has_qstripe)
		set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr),
			rbio->stripe_uptodate_bitmap);
}

static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
				   struct bio_list *bio_list)
{
	/* The total sector number inside the full stripe. */
	int total_sector_nr;
	int sectornr;
	int stripe;
	int ret;

	ASSERT(bio_list_size(bio_list) == 0);

	/* We should have at least one data sector. */
	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));

	/*
	 * Reset errors, as we may have errors inherited from a degraded
	 * write.
	 */
	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	/*
	 * Start assembly.  Make bios for everything from the higher layers (the
	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		phys_addr_t *paddrs;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
			if (paddrs == NULL)
				continue;
		} else {
			paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, stripe,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto error;
	}

	if (likely(!rbio->bioc->replace_nr_stripes))
		return 0;

	/*
	 * Make a copy for the replace target device.
	 *
	 * Thus the source stripe number (in replace_stripe_src) should be valid.
	 */
	ASSERT(rbio->bioc->replace_stripe_src >= 0);

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		phys_addr_t *paddrs;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/*
		 * For RAID56, there is only one device that can be replaced,
		 * and replace_stripe_src[0] indicates the stripe number we
		 * need to copy from.
		 */
		if (stripe != rbio->bioc->replace_stripe_src) {
			/*
			 * We can skip the whole stripe completely, note
			 * total_sector_nr will be increased by one anyway.
			 */
			ASSERT(sectornr == 0);
			total_sector_nr += rbio->stripe_nsectors - 1;
			continue;
		}

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
			if (paddrs == NULL)
				continue;
		} else {
			paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_paddrs(rbio, bio_list, paddrs,
					 rbio->real_stripes,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto error;
	}

	return 0;
error:
	bio_list_put(bio_list);
	return -EIO;
}

static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->full_stripe_logical;
	int total_nr_sector = offset >> fs_info->sectorsize_bits;

	ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);

	bitmap_set(rbio->error_bitmap, total_nr_sector,
		   bio->bi_iter.bi_size >> fs_info->sectorsize_bits);

	/*
	 * Special handling for raid56_alloc_missing_rbio() used by
	 * scrub/replace.  Unlike call path in raid56_parity_recover(), they
	 * pass an empty bio here.
	 * Thus we have to find out the missing device and mark the stripe
	 * error instead.
	 */
	if (bio->bi_iter.bi_size == 0) {
		bool found_missing = false;
		int stripe_nr;

		for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
			if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
				found_missing = true;
				bitmap_set(rbio->error_bitmap,
					   stripe_nr * rbio->stripe_nsectors,
					   rbio->stripe_nsectors);
			}
		}
		ASSERT(found_missing);
	}
}

/*
 * Return the sector number whose stripe_paddrs[] entry matches @paddr.
 *
 * Return -1 if not found.
 */
static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr)
{
	for (int i = 0; i < rbio->nr_sectors; i++) {
		if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == paddr)
			return i;
	}
	return -1;
}

/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 step = min(sectorsize, PAGE_SIZE);
	u32 offset = 0;
	phys_addr_t paddr;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	btrfs_bio_for_each_block_all(paddr, bio, step) {
		/* Hitting the first step of a sector. */
		if (IS_ALIGNED(offset, sectorsize)) {
			int sector_nr = find_stripe_sector_nr(rbio, paddr);

			ASSERT(sector_nr >= 0);
			if (sector_nr >= 0)
				set_bit(sector_nr, rbio->stripe_uptodate_bitmap);
		}
		offset += step;
	}
}

static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio));
	int i;

	for (i = 0; i < rbio->nr_sectors; i++) {
		if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
			break;
		if (rbio->bio_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
			break;
	}
	ASSERT(i < rbio->nr_sectors);
	return i;
}

static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	int total_sector_nr = get_bio_sector_nr(rbio, bio);
	u32 bio_size = 0;
	struct bio_vec *bvec;
	int i;

	bio_for_each_bvec_all(bvec, bio, i)
		bio_size += bvec->bv_len;

	/*
	 * Since we can have multiple bios touching the error_bitmap, we cannot
	 * call bitmap_set() without protection.
	 *
	 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
	 */
	for (i = total_sector_nr; i < total_sector_nr +
	     (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
		set_bit(i, rbio->error_bitmap);
}

/* Verify the data sectors at read time. */
static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
				    struct bio *bio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u32 nr_steps = rbio->sector_nsteps;
	int total_sector_nr = get_bio_sector_nr(rbio, bio);
	u32 offset = 0;
	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
	phys_addr_t paddr;

	/* No data csum for the whole stripe, no need to verify. */
	if (!rbio->csum_bitmap || !rbio->csum_buf)
		return;

	/* P/Q stripes, they have no data csum to verify against. */
	if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
		return;

	btrfs_bio_for_each_block_all(paddr, bio, step) {
		u8 csum_buf[BTRFS_CSUM_SIZE];
		u8 *expected_csum;

		paddrs[(offset / step) % nr_steps] = paddr;
		offset += step;

		/* Not yet covering the full fs block, continue to the next step. */
		if (!IS_ALIGNED(offset, fs_info->sectorsize))
			continue;

		/* No csum for this sector, skip to the next sector. */
		if (!test_bit(total_sector_nr, rbio->csum_bitmap))
			continue;

		expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size;
		btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf);
		if (unlikely(memcmp(csum_buf, expected_csum, fs_info->csum_size) != 0))
			set_bit(total_sector_nr, rbio->error_bitmap);
		total_sector_nr++;
	}
}

static void raid_wait_read_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status) {
		rbio_update_error_bitmap(rbio, bio);
	} else {
		set_bio_pages_uptodate(rbio, bio);
		verify_bio_data_sectors(rbio, bio);
	}

	bio_put(bio);
	if (atomic_dec_and_test(&rbio->stripes_pending))
		wake_up(&rbio->io_wait);
}

static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
				      struct bio_list *bio_list)
{
	struct bio *bio;

	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
	while ((bio = bio_list_pop(bio_list))) {
		bio->bi_end_io = raid_wait_read_end_io;

		if (trace_raid56_read_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_read(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}

	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
}

static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}

/*
 * We use plugging callbacks to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
};

/*
 * rbios on the plug list are sorted for easier merging.
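 *
 * The sort key is the logical sector of the first bio queued on each rbio.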
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}

static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	list_sort(NULL, &plug->rbio_list, plug_cmp);

	while (!list_empty(&plug->rbio_list)) {
		cur = list_first_entry(&plug->rbio_list,
				       struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		if (rbio_is_full(cur)) {
			/* We have a full stripe, queue it down. */
			start_async_work(cur, rmw_rbio_work);
			continue;
		}
		if (last) {
			if (rbio_can_merge(last, cur)) {
				merge_rbio(last, cur);
				free_raid_bio(cur);
				continue;
			}
			start_async_work(last, rmw_rbio_work);
		}
		last = cur;
	}
	if (last)
		start_async_work(last, rmw_rbio_work);
	kfree(plug);
}

/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{
	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
	const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
	const u32 orig_len = orig_bio->bi_iter.bi_size;
	const u32 sectorsize = fs_info->sectorsize;
	u64 cur_logical;

	ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
			    orig_logical + orig_len <= full_stripe_start +
			    rbio->nr_data * BTRFS_STRIPE_LEN,
			    rbio, orig_logical);

	bio_list_add(&rbio->bio_list, orig_bio);
	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;

	/* Update the dbitmap. */
	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
	     cur_logical += sectorsize) {
		int bit = ((u32)(cur_logical - full_stripe_start) >>
			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;

		set_bit(bit, &rbio->dbitmap);
	}
}

/*
 * our main entry point for writes from the rest of the FS.
1861 */ 1862void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) 1863{ 1864 struct btrfs_fs_info *fs_info = bioc->fs_info; 1865 struct btrfs_raid_bio *rbio; 1866 struct btrfs_plug_cb *plug = NULL; 1867 struct blk_plug_cb *cb; 1868 1869 rbio = alloc_rbio(fs_info, bioc); 1870 if (IS_ERR(rbio)) { 1871 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); 1872 bio_endio(bio); 1873 return; 1874 } 1875 rbio->operation = BTRFS_RBIO_WRITE; 1876 rbio_add_bio(rbio, bio); 1877 1878 /* 1879 * Don't plug on full rbios, just get them out the door 1880 * as quickly as we can 1881 */ 1882 if (!rbio_is_full(rbio)) { 1883 cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); 1884 if (cb) { 1885 plug = container_of(cb, struct btrfs_plug_cb, cb); 1886 if (!plug->info) { 1887 plug->info = fs_info; 1888 INIT_LIST_HEAD(&plug->rbio_list); 1889 } 1890 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1891 return; 1892 } 1893 } 1894 1895 /* 1896 * Either we don't have any existing plug, or we're doing a full stripe, 1897 * queue the rmw work now. 1898 */ 1899 start_async_work(rbio, rmw_rbio_work); 1900} 1901 1902static int verify_one_sector(struct btrfs_raid_bio *rbio, 1903 int stripe_nr, int sector_nr) 1904{ 1905 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1906 phys_addr_t *paddrs; 1907 u8 csum_buf[BTRFS_CSUM_SIZE]; 1908 u8 *csum_expected; 1909 1910 if (!rbio->csum_bitmap || !rbio->csum_buf) 1911 return 0; 1912 1913 /* No way to verify P/Q as they are not covered by data csum. */ 1914 if (stripe_nr >= rbio->nr_data) 1915 return 0; 1916 /* 1917 * If we're rebuilding a read, we have to use pages from the 1918 * bio list if possible. 1919 */ 1920 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 1921 paddrs = sector_paddrs_in_rbio(rbio, stripe_nr, sector_nr, 0); 1922 } else { 1923 paddrs = rbio_stripe_paddrs(rbio, stripe_nr, sector_nr); 1924 } 1925 1926 csum_expected = rbio->csum_buf + 1927 (stripe_nr * rbio->stripe_nsectors + sector_nr) * 1928 fs_info->csum_size; 1929 btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf); 1930 if (unlikely(memcmp(csum_buf, csum_expected, fs_info->csum_size) != 0)) 1931 return -EIO; 1932 return 0; 1933} 1934 1935static void recover_vertical_step(struct btrfs_raid_bio *rbio, 1936 unsigned int sector_nr, 1937 unsigned int step_nr, 1938 int faila, int failb, 1939 void **pointers, void **unmap_array) 1940{ 1941 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1942 const u32 step = min(fs_info->sectorsize, PAGE_SIZE); 1943 int stripe_nr; 1944 1945 ASSERT(step_nr < rbio->sector_nsteps); 1946 ASSERT(sector_nr < rbio->stripe_nsectors); 1947 1948 /* 1949 * Setup our array of pointers with sectors from each stripe 1950 * 1951 * NOTE: store a duplicate array of pointers to preserve the 1952 * pointer order. 1953 */ 1954 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { 1955 phys_addr_t paddr; 1956 1957 /* 1958 * If we're rebuilding a read, we have to use pages from the 1959 * bio list if possible. 
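		 * For any other operation the stripe cache pages are the
		 * authoritative copy, so read them via rbio_stripe_paddr().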
1960 */ 1961 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 1962 paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0); 1963 } else { 1964 paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr, step_nr); 1965 } 1966 pointers[stripe_nr] = kmap_local_paddr(paddr); 1967 unmap_array[stripe_nr] = pointers[stripe_nr]; 1968 } 1969 1970 /* All raid6 handling here */ 1971 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { 1972 /* Single failure, rebuild from parity raid5 style */ 1973 if (failb < 0) { 1974 if (faila == rbio->nr_data) 1975 /* 1976 * Just the P stripe has failed, without 1977 * a bad data or Q stripe. 1978 * We have nothing to do, just skip the 1979 * recovery for this stripe. 1980 */ 1981 goto cleanup; 1982 /* 1983 * a single failure in raid6 is rebuilt 1984 * in the pstripe code below 1985 */ 1986 goto pstripe; 1987 } 1988 1989 /* 1990 * If the q stripe is failed, do a pstripe reconstruction from 1991 * the xors. 1992 * If both the q stripe and the P stripe are failed, we're 1993 * here due to a crc mismatch and we can't give them the 1994 * data they want. 1995 */ 1996 if (failb == rbio->real_stripes - 1) { 1997 if (faila == rbio->real_stripes - 2) 1998 /* 1999 * Only P and Q are corrupted. 2000 * We only care about data stripes recovery, 2001 * can skip this vertical stripe. 2002 */ 2003 goto cleanup; 2004 /* 2005 * Otherwise we have one bad data stripe and 2006 * a good P stripe. raid5! 2007 */ 2008 goto pstripe; 2009 } 2010 2011 if (failb == rbio->real_stripes - 2) { 2012 raid6_datap_recov(rbio->real_stripes, step, 2013 faila, pointers); 2014 } else { 2015 raid6_2data_recov(rbio->real_stripes, step, 2016 faila, failb, pointers); 2017 } 2018 } else { 2019 void *p; 2020 2021 /* Rebuild from P stripe here (raid5 or raid6). */ 2022 ASSERT(failb == -1); 2023pstripe: 2024 /* Copy parity block into failed block to start with */ 2025 memcpy(pointers[faila], pointers[rbio->nr_data], step); 2026 2027 /* Rearrange the pointer array */ 2028 p = pointers[faila]; 2029 for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; 2030 stripe_nr++) 2031 pointers[stripe_nr] = pointers[stripe_nr + 1]; 2032 pointers[rbio->nr_data - 1] = p; 2033 2034 /* Xor in the rest */ 2035 run_xor(pointers, rbio->nr_data - 1, step); 2036 } 2037 2038cleanup: 2039 for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) 2040 kunmap_local(unmap_array[stripe_nr]); 2041} 2042 2043/* 2044 * Recover a vertical stripe specified by @sector_nr. 2045 * @*pointers are the pre-allocated pointers by the caller, so we don't 2046 * need to allocate/free the pointers again and again. 2047 */ 2048static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, 2049 void **pointers, void **unmap_array) 2050{ 2051 int found_errors; 2052 int faila; 2053 int failb; 2054 int ret = 0; 2055 2056 /* 2057 * Now we just use bitmap to mark the horizontal stripes in 2058 * which we have data when doing parity scrub. 2059 */ 2060 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 2061 !test_bit(sector_nr, &rbio->dbitmap)) 2062 return 0; 2063 2064 found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, 2065 &failb); 2066 /* 2067 * No errors in the vertical stripe, skip it. Can happen for recovery 2068 * which only part of a stripe failed csum check. 
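	 * Only the vertical stripes that actually contain bad sectors need
	 * to be rebuilt here; the rest are left untouched.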
2069 */ 2070 if (!found_errors) 2071 return 0; 2072 2073 if (unlikely(found_errors > rbio->bioc->max_errors)) 2074 return -EIO; 2075 2076 for (int i = 0; i < rbio->sector_nsteps; i++) 2077 recover_vertical_step(rbio, sector_nr, i, faila, failb, 2078 pointers, unmap_array); 2079 if (faila >= 0) { 2080 ret = verify_one_sector(rbio, faila, sector_nr); 2081 if (ret < 0) 2082 return ret; 2083 2084 set_bit(rbio_sector_index(rbio, faila, sector_nr), 2085 rbio->stripe_uptodate_bitmap); 2086 } 2087 if (failb >= 0) { 2088 ret = verify_one_sector(rbio, failb, sector_nr); 2089 if (ret < 0) 2090 return ret; 2091 2092 set_bit(rbio_sector_index(rbio, failb, sector_nr), 2093 rbio->stripe_uptodate_bitmap); 2094 } 2095 return ret; 2096} 2097 2098static int recover_sectors(struct btrfs_raid_bio *rbio) 2099{ 2100 void **pointers = NULL; 2101 void **unmap_array = NULL; 2102 int sectornr; 2103 int ret = 0; 2104 2105 /* 2106 * @pointers array stores the pointer for each sector. 2107 * 2108 * @unmap_array stores copy of pointers that does not get reordered 2109 * during reconstruction so that kunmap_local works. 2110 */ 2111 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 2112 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 2113 if (!pointers || !unmap_array) { 2114 ret = -ENOMEM; 2115 goto out; 2116 } 2117 2118 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 2119 spin_lock(&rbio->bio_list_lock); 2120 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 2121 spin_unlock(&rbio->bio_list_lock); 2122 } 2123 2124 index_rbio_pages(rbio); 2125 2126 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 2127 ret = recover_vertical(rbio, sectornr, pointers, unmap_array); 2128 if (ret < 0) 2129 break; 2130 } 2131 2132out: 2133 kfree(pointers); 2134 kfree(unmap_array); 2135 return ret; 2136} 2137 2138static void recover_rbio(struct btrfs_raid_bio *rbio) 2139{ 2140 struct bio_list bio_list = BIO_EMPTY_LIST; 2141 int total_sector_nr; 2142 int ret = 0; 2143 2144 /* 2145 * Either we're doing recover for a read failure or degraded write, 2146 * caller should have set error bitmap correctly. 2147 */ 2148 ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); 2149 2150 /* For recovery, we need to read all sectors including P/Q. */ 2151 ret = alloc_rbio_pages(rbio); 2152 if (ret < 0) 2153 goto out; 2154 2155 index_rbio_pages(rbio); 2156 2157 /* 2158 * Read everything that hasn't failed. However this time we will 2159 * not trust any cached sector. 2160 * As we may read out some stale data but higher layer is not reading 2161 * that stale part. 2162 * 2163 * So here we always re-read everything in recovery path. 2164 */ 2165 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2166 total_sector_nr++) { 2167 int stripe = total_sector_nr / rbio->stripe_nsectors; 2168 int sectornr = total_sector_nr % rbio->stripe_nsectors; 2169 phys_addr_t *paddrs; 2170 2171 /* 2172 * Skip the range which has error. It can be a range which is 2173 * marked error (for csum mismatch), or it can be a missing 2174 * device. 2175 */ 2176 if (!rbio->bioc->stripes[stripe].dev->bdev || 2177 test_bit(total_sector_nr, rbio->error_bitmap)) { 2178 /* 2179 * Also set the error bit for missing device, which 2180 * may not yet have its error bit set. 
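			 * The rebuild below is driven purely by the error
			 * bitmap, so a missing device must be recorded here
			 * just like a failed read or a csum mismatch.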
2181 */ 2182 set_bit(total_sector_nr, rbio->error_bitmap); 2183 continue; 2184 } 2185 2186 paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); 2187 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, 2188 sectornr, REQ_OP_READ); 2189 if (ret < 0) { 2190 bio_list_put(&bio_list); 2191 goto out; 2192 } 2193 } 2194 2195 submit_read_wait_bio_list(rbio, &bio_list); 2196 ret = recover_sectors(rbio); 2197out: 2198 rbio_orig_end_io(rbio, errno_to_blk_status(ret)); 2199} 2200 2201static void recover_rbio_work(struct work_struct *work) 2202{ 2203 struct btrfs_raid_bio *rbio; 2204 2205 rbio = container_of(work, struct btrfs_raid_bio, work); 2206 if (!lock_stripe_add(rbio)) 2207 recover_rbio(rbio); 2208} 2209 2210static void recover_rbio_work_locked(struct work_struct *work) 2211{ 2212 recover_rbio(container_of(work, struct btrfs_raid_bio, work)); 2213} 2214 2215static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) 2216{ 2217 bool found = false; 2218 int sector_nr; 2219 2220 /* 2221 * This is for RAID6 extra recovery tries, thus mirror number should 2222 * be large than 2. 2223 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using 2224 * RAID5 methods. 2225 */ 2226 ASSERT(mirror_num > 2); 2227 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { 2228 int found_errors; 2229 int faila; 2230 int failb; 2231 2232 found_errors = get_rbio_vertical_errors(rbio, sector_nr, 2233 &faila, &failb); 2234 /* This vertical stripe doesn't have errors. */ 2235 if (!found_errors) 2236 continue; 2237 2238 /* 2239 * If we found errors, there should be only one error marked 2240 * by previous set_rbio_range_error(). 2241 */ 2242 ASSERT(found_errors == 1); 2243 found = true; 2244 2245 /* Now select another stripe to mark as error. */ 2246 failb = rbio->real_stripes - (mirror_num - 1); 2247 if (failb <= faila) 2248 failb--; 2249 2250 /* Set the extra bit in error bitmap. */ 2251 if (failb >= 0) 2252 set_bit(failb * rbio->stripe_nsectors + sector_nr, 2253 rbio->error_bitmap); 2254 } 2255 2256 /* We should found at least one vertical stripe with error.*/ 2257 ASSERT(found); 2258} 2259 2260/* 2261 * the main entry point for reads from the higher layers. This 2262 * is really only called when the normal read path had a failure, 2263 * so we assume the bio they send down corresponds to a failed part 2264 * of the drive. 2265 */ 2266void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, 2267 int mirror_num) 2268{ 2269 struct btrfs_fs_info *fs_info = bioc->fs_info; 2270 struct btrfs_raid_bio *rbio; 2271 2272 rbio = alloc_rbio(fs_info, bioc); 2273 if (IS_ERR(rbio)) { 2274 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); 2275 bio_endio(bio); 2276 return; 2277 } 2278 2279 rbio->operation = BTRFS_RBIO_READ_REBUILD; 2280 rbio_add_bio(rbio, bio); 2281 2282 set_rbio_range_error(rbio, bio); 2283 2284 /* 2285 * Loop retry: 2286 * for 'mirror == 2', reconstruct from all other stripes. 2287 * for 'mirror_num > 2', select a stripe to fail on every retry. 
2288 */ 2289 if (mirror_num > 2) 2290 set_rbio_raid6_extra_error(rbio, mirror_num); 2291 2292 start_async_work(rbio, recover_rbio_work); 2293} 2294 2295static void fill_data_csums(struct btrfs_raid_bio *rbio) 2296{ 2297 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 2298 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, 2299 rbio->bioc->full_stripe_logical); 2300 const u64 start = rbio->bioc->full_stripe_logical; 2301 const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << 2302 fs_info->sectorsize_bits; 2303 int ret; 2304 2305 /* The rbio should not have its csum buffer initialized. */ 2306 ASSERT(!rbio->csum_buf && !rbio->csum_bitmap); 2307 2308 /* 2309 * Skip the csum search if: 2310 * 2311 * - The rbio doesn't belong to data block groups 2312 * Then we are doing IO for tree blocks, no need to search csums. 2313 * 2314 * - The rbio belongs to mixed block groups 2315 * This is to avoid deadlock, as we're already holding the full 2316 * stripe lock, if we trigger a metadata read, and it needs to do 2317 * raid56 recovery, we will deadlock. 2318 */ 2319 if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) || 2320 rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) 2321 return; 2322 2323 rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors * 2324 fs_info->csum_size, GFP_NOFS); 2325 rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors, 2326 GFP_NOFS); 2327 if (!rbio->csum_buf || !rbio->csum_bitmap) { 2328 ret = -ENOMEM; 2329 goto error; 2330 } 2331 2332 ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1, 2333 rbio->csum_buf, rbio->csum_bitmap); 2334 if (ret < 0) 2335 goto error; 2336 if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) 2337 goto no_csum; 2338 return; 2339 2340error: 2341 /* 2342 * We failed to allocate memory or grab the csum, but it's not fatal, 2343 * we can still continue. But better to warn users that RMW is no 2344 * longer safe for this particular sub-stripe write. 2345 */ 2346 btrfs_warn_rl(fs_info, 2347"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", 2348 rbio->bioc->full_stripe_logical, ret); 2349no_csum: 2350 kfree(rbio->csum_buf); 2351 bitmap_free(rbio->csum_bitmap); 2352 rbio->csum_buf = NULL; 2353 rbio->csum_bitmap = NULL; 2354} 2355 2356static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) 2357{ 2358 struct bio_list bio_list = BIO_EMPTY_LIST; 2359 int total_sector_nr; 2360 int ret = 0; 2361 2362 /* 2363 * Fill the data csums we need for data verification. We need to fill 2364 * the csum_bitmap/csum_buf first, as our endio function will try to 2365 * verify the data sectors. 2366 */ 2367 fill_data_csums(rbio); 2368 2369 /* 2370 * Build a list of bios to read all sectors (including data and P/Q). 2371 * 2372 * This behavior is to compensate the later csum verification and recovery. 2373 */ 2374 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2375 total_sector_nr++) { 2376 int stripe = total_sector_nr / rbio->stripe_nsectors; 2377 int sectornr = total_sector_nr % rbio->stripe_nsectors; 2378 phys_addr_t *paddrs; 2379 2380 paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); 2381 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, 2382 sectornr, REQ_OP_READ); 2383 if (ret) { 2384 bio_list_put(&bio_list); 2385 return ret; 2386 } 2387 } 2388 2389 /* 2390 * We may or may not have any corrupted sectors (including missing dev 2391 * and csum mismatch), just let recover_sectors() to handle them all. 
2392 */ 2393 submit_read_wait_bio_list(rbio, &bio_list); 2394 return recover_sectors(rbio); 2395} 2396 2397static void raid_wait_write_end_io(struct bio *bio) 2398{ 2399 struct btrfs_raid_bio *rbio = bio->bi_private; 2400 2401 if (bio->bi_status) 2402 rbio_update_error_bitmap(rbio, bio); 2403 bio_put(bio); 2404 if (atomic_dec_and_test(&rbio->stripes_pending)) 2405 wake_up(&rbio->io_wait); 2406} 2407 2408static void submit_write_bios(struct btrfs_raid_bio *rbio, 2409 struct bio_list *bio_list) 2410{ 2411 struct bio *bio; 2412 2413 atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); 2414 while ((bio = bio_list_pop(bio_list))) { 2415 bio->bi_end_io = raid_wait_write_end_io; 2416 2417 if (trace_raid56_write_enabled()) { 2418 struct raid56_bio_trace_info trace_info = { 0 }; 2419 2420 bio_get_trace_info(rbio, bio, &trace_info); 2421 trace_raid56_write(rbio, bio, &trace_info); 2422 } 2423 submit_bio(bio); 2424 } 2425} 2426 2427/* 2428 * To determine if we need to read any sector from the disk. 2429 * Should only be utilized in RMW path, to skip cached rbio. 2430 */ 2431static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) 2432{ 2433 int i; 2434 2435 for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { 2436 phys_addr_t paddr = rbio->stripe_paddrs[i * rbio->sector_nsteps]; 2437 2438 /* 2439 * We have a sector which doesn't have page nor uptodate, 2440 * thus this rbio can not be cached one, as cached one must 2441 * have all its data sectors present and uptodate. 2442 */ 2443 if (paddr == INVALID_PADDR || 2444 !test_bit(i, rbio->stripe_uptodate_bitmap)) 2445 return true; 2446 } 2447 return false; 2448} 2449 2450static void rmw_rbio(struct btrfs_raid_bio *rbio) 2451{ 2452 struct bio_list bio_list; 2453 int sectornr; 2454 int ret = 0; 2455 2456 /* 2457 * Allocate the pages for parity first, as P/Q pages will always be 2458 * needed for both full-stripe and sub-stripe writes. 2459 */ 2460 ret = alloc_rbio_parity_pages(rbio); 2461 if (ret < 0) 2462 goto out; 2463 2464 /* 2465 * Either full stripe write, or we have every data sector already 2466 * cached, can go to write path immediately. 2467 */ 2468 if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { 2469 /* 2470 * Now we're doing sub-stripe write, also need all data stripes 2471 * to do the full RMW. 2472 */ 2473 ret = alloc_rbio_data_pages(rbio); 2474 if (ret < 0) 2475 goto out; 2476 2477 index_rbio_pages(rbio); 2478 2479 ret = rmw_read_wait_recover(rbio); 2480 if (ret < 0) 2481 goto out; 2482 } 2483 2484 /* 2485 * At this stage we're not allowed to add any new bios to the 2486 * bio list any more, anyone else that wants to change this stripe 2487 * needs to do their own rmw. 2488 */ 2489 spin_lock(&rbio->bio_list_lock); 2490 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 2491 spin_unlock(&rbio->bio_list_lock); 2492 2493 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); 2494 2495 index_rbio_pages(rbio); 2496 2497 /* 2498 * We don't cache full rbios because we're assuming 2499 * the higher layers are unlikely to use this area of 2500 * the disk again soon. If they do use it again, 2501 * hopefully they will send another full bio. 
2502 */ 2503 if (!rbio_is_full(rbio)) 2504 cache_rbio_pages(rbio); 2505 else 2506 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2507 2508 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) 2509 generate_pq_vertical(rbio, sectornr); 2510 2511 bio_list_init(&bio_list); 2512 ret = rmw_assemble_write_bios(rbio, &bio_list); 2513 if (ret < 0) 2514 goto out; 2515 2516 /* We should have at least one bio assembled. */ 2517 ASSERT(bio_list_size(&bio_list)); 2518 submit_write_bios(rbio, &bio_list); 2519 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); 2520 2521 /* We may have more errors than our tolerance during the read. */ 2522 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 2523 int found_errors; 2524 2525 found_errors = get_rbio_vertical_errors(rbio, sectornr, NULL, NULL); 2526 if (unlikely(found_errors > rbio->bioc->max_errors)) { 2527 ret = -EIO; 2528 break; 2529 } 2530 } 2531out: 2532 rbio_orig_end_io(rbio, errno_to_blk_status(ret)); 2533} 2534 2535static void rmw_rbio_work(struct work_struct *work) 2536{ 2537 struct btrfs_raid_bio *rbio; 2538 2539 rbio = container_of(work, struct btrfs_raid_bio, work); 2540 if (lock_stripe_add(rbio) == 0) 2541 rmw_rbio(rbio); 2542} 2543 2544static void rmw_rbio_work_locked(struct work_struct *work) 2545{ 2546 rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); 2547} 2548 2549/* 2550 * The following code is used to scrub/replace the parity stripe 2551 * 2552 * Caller must have already increased bio_counter for getting @bioc. 2553 * 2554 * Note: We need make sure all the pages that add into the scrub/replace 2555 * raid bio are correct and not be changed during the scrub/replace. That 2556 * is those pages just hold metadata or file data with checksum. 2557 */ 2558 2559struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, 2560 struct btrfs_io_context *bioc, 2561 struct btrfs_device *scrub_dev, 2562 unsigned long *dbitmap, int stripe_nsectors) 2563{ 2564 struct btrfs_fs_info *fs_info = bioc->fs_info; 2565 struct btrfs_raid_bio *rbio; 2566 int i; 2567 2568 rbio = alloc_rbio(fs_info, bioc); 2569 if (IS_ERR(rbio)) 2570 return NULL; 2571 bio_list_add(&rbio->bio_list, bio); 2572 /* 2573 * This is a special bio which is used to hold the completion handler 2574 * and make the scrub rbio is similar to the other types 2575 */ 2576 ASSERT(!bio->bi_iter.bi_size); 2577 rbio->operation = BTRFS_RBIO_PARITY_SCRUB; 2578 2579 /* 2580 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted 2581 * to the end position, so this search can start from the first parity 2582 * stripe. 
2583 */ 2584 for (i = rbio->nr_data; i < rbio->real_stripes; i++) { 2585 if (bioc->stripes[i].dev == scrub_dev) { 2586 rbio->scrubp = i; 2587 break; 2588 } 2589 } 2590 ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i); 2591 2592 bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); 2593 return rbio; 2594} 2595 2596static int alloc_rbio_sector_pages(struct btrfs_raid_bio *rbio, 2597 int sector_nr) 2598{ 2599 const u32 step = min(PAGE_SIZE, rbio->bioc->fs_info->sectorsize); 2600 const u32 base = sector_nr * rbio->sector_nsteps; 2601 2602 for (int i = base; i < base + rbio->sector_nsteps; i++) { 2603 const unsigned int page_index = (i * step) >> PAGE_SHIFT; 2604 struct page *page; 2605 2606 if (rbio->stripe_pages[page_index]) 2607 continue; 2608 page = alloc_page(GFP_NOFS); 2609 if (!page) 2610 return -ENOMEM; 2611 rbio->stripe_pages[page_index] = page; 2612 } 2613 return 0; 2614} 2615 2616/* 2617 * We just scrub the parity that we have correct data on the same horizontal, 2618 * so we needn't allocate all pages for all the stripes. 2619 */ 2620static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) 2621{ 2622 int total_sector_nr; 2623 2624 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2625 total_sector_nr++) { 2626 int sectornr = total_sector_nr % rbio->stripe_nsectors; 2627 int ret; 2628 2629 if (!test_bit(sectornr, &rbio->dbitmap)) 2630 continue; 2631 ret = alloc_rbio_sector_pages(rbio, total_sector_nr); 2632 if (ret < 0) 2633 return ret; 2634 } 2635 index_stripe_sectors(rbio); 2636 return 0; 2637} 2638 2639/* Return true if the content of the step matches the caclulated one. */ 2640static bool verify_one_parity_step(struct btrfs_raid_bio *rbio, 2641 void *pointers[], unsigned int sector_nr, 2642 unsigned int step_nr) 2643{ 2644 const unsigned int nr_data = rbio->nr_data; 2645 const bool has_qstripe = (rbio->real_stripes - rbio->nr_data == 2); 2646 const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); 2647 void *parity; 2648 bool ret = false; 2649 2650 ASSERT(step_nr < rbio->sector_nsteps); 2651 2652 /* First collect one page from each data stripe. */ 2653 for (int stripe = 0; stripe < nr_data; stripe++) 2654 pointers[stripe] = kmap_local_paddr( 2655 sector_paddr_in_rbio(rbio, stripe, sector_nr, 2656 step_nr, 0)); 2657 2658 if (has_qstripe) { 2659 assert_rbio(rbio); 2660 /* RAID6, call the library function to fill in our P/Q. */ 2661 raid6_call.gen_syndrome(rbio->real_stripes, step, pointers); 2662 } else { 2663 /* RAID5. */ 2664 memcpy(pointers[nr_data], pointers[0], step); 2665 run_xor(pointers + 1, nr_data - 1, step); 2666 } 2667 2668 /* Check scrubbing parity and repair it. */ 2669 parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sector_nr, step_nr)); 2670 if (memcmp(parity, pointers[rbio->scrubp], step) != 0) 2671 memcpy(parity, pointers[rbio->scrubp], step); 2672 else 2673 ret = true; 2674 kunmap_local(parity); 2675 2676 for (int stripe = nr_data - 1; stripe >= 0; stripe--) 2677 kunmap_local(pointers[stripe]); 2678 return ret; 2679} 2680 2681/* 2682 * The @pointers array should have the P/Q parity already mapped. 
2683 */ 2684static void verify_one_parity_sector(struct btrfs_raid_bio *rbio, 2685 void *pointers[], unsigned int sector_nr) 2686{ 2687 bool found_error = false; 2688 2689 for (int step_nr = 0; step_nr < rbio->sector_nsteps; step_nr++) { 2690 bool match; 2691 2692 match = verify_one_parity_step(rbio, pointers, sector_nr, step_nr); 2693 if (!match) 2694 found_error = true; 2695 } 2696 if (!found_error) 2697 bitmap_clear(&rbio->dbitmap, sector_nr, 1); 2698} 2699 2700static int finish_parity_scrub(struct btrfs_raid_bio *rbio) 2701{ 2702 struct btrfs_io_context *bioc = rbio->bioc; 2703 void **pointers = rbio->finish_pointers; 2704 unsigned long *pbitmap = &rbio->finish_pbitmap; 2705 int nr_data = rbio->nr_data; 2706 int sectornr; 2707 bool has_qstripe; 2708 struct page *page; 2709 phys_addr_t p_paddr = INVALID_PADDR; 2710 phys_addr_t q_paddr = INVALID_PADDR; 2711 struct bio_list bio_list; 2712 int is_replace = 0; 2713 int ret; 2714 2715 bio_list_init(&bio_list); 2716 2717 if (rbio->real_stripes - rbio->nr_data == 1) 2718 has_qstripe = false; 2719 else if (rbio->real_stripes - rbio->nr_data == 2) 2720 has_qstripe = true; 2721 else 2722 BUG(); 2723 2724 /* 2725 * Replace is running and our P/Q stripe is being replaced, then we 2726 * need to duplicate the final write to replace target. 2727 */ 2728 if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) { 2729 is_replace = 1; 2730 bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); 2731 } 2732 2733 /* 2734 * Because the higher layers(scrubber) are unlikely to 2735 * use this area of the disk again soon, so don't cache 2736 * it. 2737 */ 2738 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2739 2740 page = alloc_page(GFP_NOFS); 2741 if (!page) 2742 return -ENOMEM; 2743 p_paddr = page_to_phys(page); 2744 page = NULL; 2745 pointers[nr_data] = kmap_local_paddr(p_paddr); 2746 2747 if (has_qstripe) { 2748 /* RAID6, allocate and map temp space for the Q stripe */ 2749 page = alloc_page(GFP_NOFS); 2750 if (!page) { 2751 __free_page(phys_to_page(p_paddr)); 2752 p_paddr = INVALID_PADDR; 2753 return -ENOMEM; 2754 } 2755 q_paddr = page_to_phys(page); 2756 page = NULL; 2757 pointers[rbio->real_stripes - 1] = kmap_local_paddr(q_paddr); 2758 } 2759 2760 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); 2761 2762 /* Map the parity stripe just once */ 2763 2764 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) 2765 verify_one_parity_sector(rbio, pointers, sectornr); 2766 2767 kunmap_local(pointers[nr_data]); 2768 __free_page(phys_to_page(p_paddr)); 2769 p_paddr = INVALID_PADDR; 2770 if (q_paddr != INVALID_PADDR) { 2771 __free_page(phys_to_page(q_paddr)); 2772 q_paddr = INVALID_PADDR; 2773 } 2774 2775 /* 2776 * time to start writing. Make bios for everything from the 2777 * higher layers (the bio_list in our rbio) and our p/q. Ignore 2778 * everything else. 2779 */ 2780 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { 2781 phys_addr_t *paddrs; 2782 2783 paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); 2784 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->scrubp, 2785 sectornr, REQ_OP_WRITE); 2786 if (ret) 2787 goto cleanup; 2788 } 2789 2790 if (!is_replace) 2791 goto submit_write; 2792 2793 /* 2794 * Replace is running and our parity stripe needs to be duplicated to 2795 * the target device. Check we have a valid source stripe number. 
2796 */ 2797 ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio); 2798 for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { 2799 phys_addr_t *paddrs; 2800 2801 paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); 2802 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->real_stripes, 2803 sectornr, REQ_OP_WRITE); 2804 if (ret) 2805 goto cleanup; 2806 } 2807 2808submit_write: 2809 submit_write_bios(rbio, &bio_list); 2810 return 0; 2811 2812cleanup: 2813 bio_list_put(&bio_list); 2814 return ret; 2815} 2816 2817static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) 2818{ 2819 if (stripe >= 0 && stripe < rbio->nr_data) 2820 return 1; 2821 return 0; 2822} 2823 2824static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) 2825{ 2826 void **pointers = NULL; 2827 void **unmap_array = NULL; 2828 int sector_nr; 2829 int ret = 0; 2830 2831 /* 2832 * @pointers array stores the pointer for each sector. 2833 * 2834 * @unmap_array stores copy of pointers that does not get reordered 2835 * during reconstruction so that kunmap_local works. 2836 */ 2837 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 2838 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 2839 if (!pointers || !unmap_array) { 2840 ret = -ENOMEM; 2841 goto out; 2842 } 2843 2844 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { 2845 int dfail = 0, failp = -1; 2846 int faila; 2847 int failb; 2848 int found_errors; 2849 2850 found_errors = get_rbio_vertical_errors(rbio, sector_nr, 2851 &faila, &failb); 2852 if (unlikely(found_errors > rbio->bioc->max_errors)) { 2853 ret = -EIO; 2854 goto out; 2855 } 2856 if (found_errors == 0) 2857 continue; 2858 2859 /* We should have at least one error here. */ 2860 ASSERT(faila >= 0 || failb >= 0); 2861 2862 if (is_data_stripe(rbio, faila)) 2863 dfail++; 2864 else if (is_parity_stripe(faila)) 2865 failp = faila; 2866 2867 if (is_data_stripe(rbio, failb)) 2868 dfail++; 2869 else if (is_parity_stripe(failb)) 2870 failp = failb; 2871 /* 2872 * Because we can not use a scrubbing parity to repair the 2873 * data, so the capability of the repair is declined. (In the 2874 * case of RAID5, we can not repair anything.) 2875 */ 2876 if (unlikely(dfail > rbio->bioc->max_errors - 1)) { 2877 ret = -EIO; 2878 goto out; 2879 } 2880 /* 2881 * If all data is good, only parity is correctly, just repair 2882 * the parity, no need to recover data stripes. 2883 */ 2884 if (dfail == 0) 2885 continue; 2886 2887 /* 2888 * Here means we got one corrupted data stripe and one 2889 * corrupted parity on RAID6, if the corrupted parity is 2890 * scrubbing parity, luckily, use the other one to repair the 2891 * data, or we can not repair the data stripe. 2892 */ 2893 if (unlikely(failp != rbio->scrubp)) { 2894 ret = -EIO; 2895 goto out; 2896 } 2897 2898 ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); 2899 if (ret < 0) 2900 goto out; 2901 } 2902out: 2903 kfree(pointers); 2904 kfree(unmap_array); 2905 return ret; 2906} 2907 2908static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) 2909{ 2910 struct bio_list bio_list = BIO_EMPTY_LIST; 2911 int total_sector_nr; 2912 int ret = 0; 2913 2914 /* Build a list of bios to read all the missing parts. 
*/ 2915 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2916 total_sector_nr++) { 2917 int sectornr = total_sector_nr % rbio->stripe_nsectors; 2918 int stripe = total_sector_nr / rbio->stripe_nsectors; 2919 phys_addr_t *paddrs; 2920 2921 /* No data in the vertical stripe, no need to read. */ 2922 if (!test_bit(sectornr, &rbio->dbitmap)) 2923 continue; 2924 2925 /* 2926 * We want to find all the sectors missing from the rbio and 2927 * read them from the disk. If sector_paddr_in_rbio() finds a sector 2928 * in the bio list we don't need to read it off the stripe. 2929 */ 2930 paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); 2931 if (paddrs == NULL) 2932 continue; 2933 2934 paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); 2935 /* 2936 * The bio cache may have handed us an uptodate sector. If so, 2937 * use it. 2938 */ 2939 if (test_bit(rbio_sector_index(rbio, stripe, sectornr), 2940 rbio->stripe_uptodate_bitmap)) 2941 continue; 2942 2943 ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, 2944 sectornr, REQ_OP_READ); 2945 if (ret) { 2946 bio_list_put(&bio_list); 2947 return ret; 2948 } 2949 } 2950 2951 submit_read_wait_bio_list(rbio, &bio_list); 2952 return 0; 2953} 2954 2955static void scrub_rbio(struct btrfs_raid_bio *rbio) 2956{ 2957 int sector_nr; 2958 int ret; 2959 2960 ret = alloc_rbio_essential_pages(rbio); 2961 if (ret) 2962 goto out; 2963 2964 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); 2965 2966 ret = scrub_assemble_read_bios(rbio); 2967 if (ret < 0) 2968 goto out; 2969 2970 /* We may have some failures, recover the failed sectors first. */ 2971 ret = recover_scrub_rbio(rbio); 2972 if (ret < 0) 2973 goto out; 2974 2975 /* 2976 * We have every sector properly prepared. Can finish the scrub 2977 * and writeback the good content. 2978 */ 2979 ret = finish_parity_scrub(rbio); 2980 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); 2981 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { 2982 int found_errors; 2983 2984 found_errors = get_rbio_vertical_errors(rbio, sector_nr, NULL, NULL); 2985 if (unlikely(found_errors > rbio->bioc->max_errors)) { 2986 ret = -EIO; 2987 break; 2988 } 2989 } 2990out: 2991 rbio_orig_end_io(rbio, errno_to_blk_status(ret)); 2992} 2993 2994static void scrub_rbio_work_locked(struct work_struct *work) 2995{ 2996 scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); 2997} 2998 2999void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) 3000{ 3001 if (!lock_stripe_add(rbio)) 3002 start_async_work(rbio, scrub_rbio_work_locked); 3003} 3004 3005/* 3006 * This is for scrub call sites where we already have correct data contents. 3007 * This allows us to avoid reading data stripes again. 3008 * 3009 * Unfortunately here we have to do folio copy, other than reusing the pages. 3010 * This is due to the fact rbio has its own page management for its cache. 3011 */ 3012void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, 3013 struct folio **data_folios, u64 data_logical) 3014{ 3015 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 3016 const u64 offset_in_full_stripe = data_logical - 3017 rbio->bioc->full_stripe_logical; 3018 unsigned int findex = 0; 3019 unsigned int foffset = 0; 3020 int ret; 3021 3022 /* 3023 * If we hit ENOMEM temporarily, but later at 3024 * raid56_parity_submit_scrub_rbio() time it succeeded, we just do 3025 * the extra read, not a big deal. 
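	 * (scrub_assemble_read_bios() will then simply read those sectors
	 * from disk.)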
3026 * 3027 * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time, 3028 * the bio would got proper error number set. 3029 */ 3030 ret = alloc_rbio_data_pages(rbio); 3031 if (ret < 0) 3032 return; 3033 3034 /* data_logical must be at stripe boundary and inside the full stripe. */ 3035 ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); 3036 ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); 3037 3038 for (unsigned int cur_off = offset_in_full_stripe; 3039 cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN; 3040 cur_off += PAGE_SIZE) { 3041 const unsigned int pindex = cur_off >> PAGE_SHIFT; 3042 void *kaddr; 3043 3044 kaddr = kmap_local_page(rbio->stripe_pages[pindex]); 3045 memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE); 3046 kunmap_local(kaddr); 3047 3048 foffset += PAGE_SIZE; 3049 ASSERT(foffset <= folio_size(data_folios[findex])); 3050 if (foffset == folio_size(data_folios[findex])) { 3051 findex++; 3052 foffset = 0; 3053 } 3054 } 3055 bitmap_set(rbio->stripe_uptodate_bitmap, 3056 offset_in_full_stripe >> fs_info->sectorsize_bits, 3057 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); 3058}