1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
4 */
5
6#include <linux/blkdev.h>
7#include <linux/ratelimit.h>
8#include <linux/sched/mm.h>
9#include <crypto/hash.h>
10#include "ctree.h"
11#include "discard.h"
12#include "volumes.h"
13#include "disk-io.h"
14#include "ordered-data.h"
15#include "transaction.h"
16#include "backref.h"
17#include "extent_io.h"
18#include "dev-replace.h"
19#include "check-integrity.h"
20#include "rcu-string.h"
21#include "raid56.h"
22#include "block-group.h"
23#include "zoned.h"
24
25/*
26 * This is only the first step towards a full-featured scrub. It reads all
27 * extents and super blocks and verifies the checksums. In case a bad checksum
28 * is found or the extent cannot be read, good data will be written back if
29 * any can be found.
30 *
31 * Future enhancements:
32 * - In case an unrepairable extent is encountered, track which files are
33 * affected and report them
34 * - track and record media errors, throw out bad devices
35 * - add a mode to also read unallocated space
36 */
37
38struct scrub_block;
39struct scrub_ctx;
40
41/*
42 * The following two values only influence the performance.
43 *
44 * The first one configures an upper limit for the number of (dynamically
45 * allocated) sectors that are added to a bio. The second one configures the
46 * number of parallel and outstanding I/O operations per device.
47 */
48#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */
49#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */
50
51/*
52 * The following value times 4KiB needs to be large enough to match the
53 * largest node/leaf/sector size that shall be supported.
54 */
55#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
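/*
 * A rough worked example: with BTRFS_MAX_METADATA_BLOCKSIZE currently at
 * 64KiB, this evaluates to 64KiB / 4KiB = 16 sectors at most per scrub_block.
 */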
56
57struct scrub_recover {
58 refcount_t refs;
59 struct btrfs_io_context *bioc;
60 u64 map_length;
61};
62
63struct scrub_sector {
64 struct scrub_block *sblock;
65 struct page *page;
66 struct btrfs_device *dev;
67 struct list_head list;
68 u64 flags; /* extent flags */
69 u64 generation;
70 u64 logical;
71 u64 physical;
72 u64 physical_for_dev_replace;
73 atomic_t refs;
74 u8 mirror_num;
75 unsigned int have_csum:1;
76 unsigned int io_error:1;
77 u8 csum[BTRFS_CSUM_SIZE];
78
79 struct scrub_recover *recover;
80};
81
82struct scrub_bio {
83 int index;
84 struct scrub_ctx *sctx;
85 struct btrfs_device *dev;
86 struct bio *bio;
87 blk_status_t status;
88 u64 logical;
89 u64 physical;
90 struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
91 int sector_count;
92 int next_free;
93 struct work_struct work;
94};
95
96struct scrub_block {
97 struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
98 int sector_count;
99 atomic_t outstanding_sectors;
100 refcount_t refs; /* free mem on transition to zero */
101 struct scrub_ctx *sctx;
102 struct scrub_parity *sparity;
103 struct {
104 unsigned int header_error:1;
105 unsigned int checksum_error:1;
106 unsigned int no_io_error_seen:1;
107 unsigned int generation_error:1; /* also sets header_error */
108
109 /* The following is for the data used to check parity */
110 /* It is for the data with checksum */
111 unsigned int data_corrected:1;
112 };
113 struct work_struct work;
114};
115
116/* Used for the chunks with parity stripe such as RAID5/6 */
117struct scrub_parity {
118 struct scrub_ctx *sctx;
119
120 struct btrfs_device *scrub_dev;
121
122 u64 logic_start;
123
124 u64 logic_end;
125
126 int nsectors;
127
128 u32 stripe_len;
129
130 refcount_t refs;
131
132 struct list_head sectors_list;
133
134 /* Work of parity check and repair */
135 struct work_struct work;
136
137 /* Mark the parity blocks which have data */
138 unsigned long dbitmap;
139
140 /*
141 * Mark the parity blocks which have data, but errors happened when
142 * reading or checking that data
143 */
144 unsigned long ebitmap;
145};
146
147struct scrub_ctx {
148 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
149 struct btrfs_fs_info *fs_info;
150 int first_free;
151 int curr;
152 atomic_t bios_in_flight;
153 atomic_t workers_pending;
154 spinlock_t list_lock;
155 wait_queue_head_t list_wait;
156 struct list_head csum_list;
157 atomic_t cancel_req;
158 int readonly;
159 int sectors_per_bio;
160
161 /* State of IO submission throttling affecting the associated device */
162 ktime_t throttle_deadline;
163 u64 throttle_sent;
164
165 int is_dev_replace;
166 u64 write_pointer;
167
168 struct scrub_bio *wr_curr_bio;
169 struct mutex wr_lock;
170 struct btrfs_device *wr_tgtdev;
171 bool flush_all_writes;
172
173 /*
174 * statistics
175 */
176 struct btrfs_scrub_progress stat;
177 spinlock_t stat_lock;
178
179 /*
180 * Use a ref counter to avoid use-after-free issues. Scrub workers
181 * decrement bios_in_flight and workers_pending and then do a wakeup
182 * on the list_wait wait queue. We must ensure the main scrub task
183 * doesn't free the scrub context before or while the workers are
184 * doing the wakeup() call.
185 */
186 refcount_t refs;
187};
188
189struct scrub_warning {
190 struct btrfs_path *path;
191 u64 extent_item_size;
192 const char *errstr;
193 u64 physical;
194 u64 logical;
195 struct btrfs_device *dev;
196};
197
198struct full_stripe_lock {
199 struct rb_node node;
200 u64 logical;
201 u64 refs;
202 struct mutex mutex;
203};
204
205static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
206 struct scrub_block *sblocks_for_recheck);
207static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
208 struct scrub_block *sblock,
209 int retry_failed_mirror);
210static void scrub_recheck_block_checksum(struct scrub_block *sblock);
211static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
212 struct scrub_block *sblock_good);
213static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
214 struct scrub_block *sblock_good,
215 int sector_num, int force_write);
216static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
217static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
218 int sector_num);
219static int scrub_checksum_data(struct scrub_block *sblock);
220static int scrub_checksum_tree_block(struct scrub_block *sblock);
221static int scrub_checksum_super(struct scrub_block *sblock);
222static void scrub_block_put(struct scrub_block *sblock);
223static void scrub_sector_get(struct scrub_sector *sector);
224static void scrub_sector_put(struct scrub_sector *sector);
225static void scrub_parity_get(struct scrub_parity *sparity);
226static void scrub_parity_put(struct scrub_parity *sparity);
227static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
228 u64 physical, struct btrfs_device *dev, u64 flags,
229 u64 gen, int mirror_num, u8 *csum,
230 u64 physical_for_dev_replace);
231static void scrub_bio_end_io(struct bio *bio);
232static void scrub_bio_end_io_worker(struct work_struct *work);
233static void scrub_block_complete(struct scrub_block *sblock);
234static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
235 u64 extent_logical, u32 extent_len,
236 u64 *extent_physical,
237 struct btrfs_device **extent_dev,
238 int *extent_mirror_num);
239static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
240 struct scrub_sector *sector);
241static void scrub_wr_submit(struct scrub_ctx *sctx);
242static void scrub_wr_bio_end_io(struct bio *bio);
243static void scrub_wr_bio_end_io_worker(struct work_struct *work);
244static void scrub_put_ctx(struct scrub_ctx *sctx);
245
246static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
247{
248 return sector->recover &&
249 (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
250}
251
252static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
253{
254 refcount_inc(&sctx->refs);
255 atomic_inc(&sctx->bios_in_flight);
256}
257
258static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
259{
260 atomic_dec(&sctx->bios_in_flight);
261 wake_up(&sctx->list_wait);
262 scrub_put_ctx(sctx);
263}
264
265static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
266{
267 while (atomic_read(&fs_info->scrub_pause_req)) {
268 mutex_unlock(&fs_info->scrub_lock);
269 wait_event(fs_info->scrub_pause_wait,
270 atomic_read(&fs_info->scrub_pause_req) == 0);
271 mutex_lock(&fs_info->scrub_lock);
272 }
273}
274
275static void scrub_pause_on(struct btrfs_fs_info *fs_info)
276{
277 atomic_inc(&fs_info->scrubs_paused);
278 wake_up(&fs_info->scrub_pause_wait);
279}
280
281static void scrub_pause_off(struct btrfs_fs_info *fs_info)
282{
283 mutex_lock(&fs_info->scrub_lock);
284 __scrub_blocked_if_needed(fs_info);
285 atomic_dec(&fs_info->scrubs_paused);
286 mutex_unlock(&fs_info->scrub_lock);
287
288 wake_up(&fs_info->scrub_pause_wait);
289}
290
291static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
292{
293 scrub_pause_on(fs_info);
294 scrub_pause_off(fs_info);
295}
296
297/*
298 * Insert new full stripe lock into full stripe locks tree
299 *
300 * Return pointer to existing or newly inserted full_stripe_lock structure if
301 * everything works well.
302 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
303 *
304 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
305 * function
306 */
307static struct full_stripe_lock *insert_full_stripe_lock(
308 struct btrfs_full_stripe_locks_tree *locks_root,
309 u64 fstripe_logical)
310{
311 struct rb_node **p;
312 struct rb_node *parent = NULL;
313 struct full_stripe_lock *entry;
314 struct full_stripe_lock *ret;
315
316 lockdep_assert_held(&locks_root->lock);
317
318 p = &locks_root->root.rb_node;
319 while (*p) {
320 parent = *p;
321 entry = rb_entry(parent, struct full_stripe_lock, node);
322 if (fstripe_logical < entry->logical) {
323 p = &(*p)->rb_left;
324 } else if (fstripe_logical > entry->logical) {
325 p = &(*p)->rb_right;
326 } else {
327 entry->refs++;
328 return entry;
329 }
330 }
331
332 /*
333 * Insert new lock.
334 */
335 ret = kmalloc(sizeof(*ret), GFP_KERNEL);
336 if (!ret)
337 return ERR_PTR(-ENOMEM);
338 ret->logical = fstripe_logical;
339 ret->refs = 1;
340 mutex_init(&ret->mutex);
341
342 rb_link_node(&ret->node, parent, p);
343 rb_insert_color(&ret->node, &locks_root->root);
344 return ret;
345}
346
347/*
348 * Search for a full stripe lock of a block group
349 *
350 * Return pointer to existing full stripe lock if found
351 * Return NULL if not found
352 */
353static struct full_stripe_lock *search_full_stripe_lock(
354 struct btrfs_full_stripe_locks_tree *locks_root,
355 u64 fstripe_logical)
356{
357 struct rb_node *node;
358 struct full_stripe_lock *entry;
359
360 lockdep_assert_held(&locks_root->lock);
361
362 node = locks_root->root.rb_node;
363 while (node) {
364 entry = rb_entry(node, struct full_stripe_lock, node);
365 if (fstripe_logical < entry->logical)
366 node = node->rb_left;
367 else if (fstripe_logical > entry->logical)
368 node = node->rb_right;
369 else
370 return entry;
371 }
372 return NULL;
373}
374
375/*
376 * Helper to get full stripe logical from a normal bytenr.
377 *
378 * Caller must ensure @cache is a RAID56 block group.
379 */
380static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
381{
382 u64 ret;
383
384 /*
385 * Due to chunk item size limit, full stripe length should not be
386 * larger than U32_MAX. Just a sanity check here.
387 */
388 WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
389
390 /*
391 * round_down() can only handle power of 2, while RAID56 full
392 * stripe length can be 64KiB * n, so we need to manually round down.
393 */
394 ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
395 cache->full_stripe_len + cache->start;
396 return ret;
397}
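/*
 * A worked example with hypothetical numbers: for a block group starting at
 * 1GiB with a full stripe length of 128KiB (e.g. two 64KiB data stripes on a
 * 3-device RAID5), a bytenr of 1GiB + 300KiB yields
 * div64_u64(300KiB, 128KiB) = 2, so the returned full stripe start is
 * 1GiB + 2 * 128KiB = 1GiB + 256KiB.
 */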
398
399/*
400 * Lock a full stripe to avoid concurrency of recovery and read
401 *
402 * It's only used for profiles with parities (RAID5/6), for other profiles it
403 * does nothing.
404 *
405 * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
406 * The caller must then call unlock_full_stripe() in the same context.
407 *
408 * Return <0 on error.
409 */
410static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
411 bool *locked_ret)
412{
413 struct btrfs_block_group *bg_cache;
414 struct btrfs_full_stripe_locks_tree *locks_root;
415 struct full_stripe_lock *existing;
416 u64 fstripe_start;
417 int ret = 0;
418
419 *locked_ret = false;
420 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
421 if (!bg_cache) {
422 ASSERT(0);
423 return -ENOENT;
424 }
425
426 /* Profiles not based on parity don't need full stripe lock */
427 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
428 goto out;
429 locks_root = &bg_cache->full_stripe_locks_root;
430
431 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
432
433 /* Now insert the full stripe lock */
434 mutex_lock(&locks_root->lock);
435 existing = insert_full_stripe_lock(locks_root, fstripe_start);
436 mutex_unlock(&locks_root->lock);
437 if (IS_ERR(existing)) {
438 ret = PTR_ERR(existing);
439 goto out;
440 }
441 mutex_lock(&existing->mutex);
442 *locked_ret = true;
443out:
444 btrfs_put_block_group(bg_cache);
445 return ret;
446}
447
448/*
449 * Unlock a full stripe.
450 *
451 * NOTE: Caller must ensure it's the same context calling corresponding
452 * lock_full_stripe().
453 *
454 * Return 0 if we unlocked the full stripe without problem.
455 * Return <0 on error
456 */
457static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
458 bool locked)
459{
460 struct btrfs_block_group *bg_cache;
461 struct btrfs_full_stripe_locks_tree *locks_root;
462 struct full_stripe_lock *fstripe_lock;
463 u64 fstripe_start;
464 bool freeit = false;
465 int ret = 0;
466
467 /* If we didn't acquire full stripe lock, no need to continue */
468 if (!locked)
469 return 0;
470
471 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
472 if (!bg_cache) {
473 ASSERT(0);
474 return -ENOENT;
475 }
476 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
477 goto out;
478
479 locks_root = &bg_cache->full_stripe_locks_root;
480 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
481
482 mutex_lock(&locks_root->lock);
483 fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
484 /* Unpaired unlock_full_stripe() detected */
485 if (!fstripe_lock) {
486 WARN_ON(1);
487 ret = -ENOENT;
488 mutex_unlock(&locks_root->lock);
489 goto out;
490 }
491
492 if (fstripe_lock->refs == 0) {
493 WARN_ON(1);
494 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
495 fstripe_lock->logical);
496 } else {
497 fstripe_lock->refs--;
498 }
499
500 if (fstripe_lock->refs == 0) {
501 rb_erase(&fstripe_lock->node, &locks_root->root);
502 freeit = true;
503 }
504 mutex_unlock(&locks_root->lock);
505
506 mutex_unlock(&fstripe_lock->mutex);
507 if (freeit)
508 kfree(fstripe_lock);
509out:
510 btrfs_put_block_group(bg_cache);
511 return ret;
512}
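/*
 * A minimal sketch of how the pair above is meant to be used by a caller.
 * The function name below is made up for illustration only and the error
 * handling is trimmed; scrub_handle_errored_block() further down follows
 * this pattern.
 */
#if 0
static int scrub_repair_one_full_stripe(struct btrfs_fs_info *fs_info,
					u64 logical)
{
	bool locked = false;
	int ret;

	ret = lock_full_stripe(fs_info, logical, &locked);
	if (ret < 0)
		return ret;

	/* ... recheck and repair the sectors covered by this full stripe ... */

	return unlock_full_stripe(fs_info, logical, locked);
}
#endif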
513
514static void scrub_free_csums(struct scrub_ctx *sctx)
515{
516 while (!list_empty(&sctx->csum_list)) {
517 struct btrfs_ordered_sum *sum;
518 sum = list_first_entry(&sctx->csum_list,
519 struct btrfs_ordered_sum, list);
520 list_del(&sum->list);
521 kfree(sum);
522 }
523}
524
525static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
526{
527 int i;
528
529 if (!sctx)
530 return;
531
532 /* this can happen when scrub is cancelled */
533 if (sctx->curr != -1) {
534 struct scrub_bio *sbio = sctx->bios[sctx->curr];
535
536 for (i = 0; i < sbio->sector_count; i++) {
537 WARN_ON(!sbio->sectors[i]->page);
538 scrub_block_put(sbio->sectors[i]->sblock);
539 }
540 bio_put(sbio->bio);
541 }
542
543 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
544 struct scrub_bio *sbio = sctx->bios[i];
545
546 if (!sbio)
547 break;
548 kfree(sbio);
549 }
550
551 kfree(sctx->wr_curr_bio);
552 scrub_free_csums(sctx);
553 kfree(sctx);
554}
555
556static void scrub_put_ctx(struct scrub_ctx *sctx)
557{
558 if (refcount_dec_and_test(&sctx->refs))
559 scrub_free_ctx(sctx);
560}
561
562static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
563 struct btrfs_fs_info *fs_info, int is_dev_replace)
564{
565 struct scrub_ctx *sctx;
566 int i;
567
568 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
569 if (!sctx)
570 goto nomem;
571 refcount_set(&sctx->refs, 1);
572 sctx->is_dev_replace = is_dev_replace;
573 sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
574 sctx->curr = -1;
575 sctx->fs_info = fs_info;
576 INIT_LIST_HEAD(&sctx->csum_list);
577 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
578 struct scrub_bio *sbio;
579
580 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
581 if (!sbio)
582 goto nomem;
583 sctx->bios[i] = sbio;
584
585 sbio->index = i;
586 sbio->sctx = sctx;
587 sbio->sector_count = 0;
588 INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
589
590 if (i != SCRUB_BIOS_PER_SCTX - 1)
591 sctx->bios[i]->next_free = i + 1;
592 else
593 sctx->bios[i]->next_free = -1;
594 }
595 sctx->first_free = 0;
596 atomic_set(&sctx->bios_in_flight, 0);
597 atomic_set(&sctx->workers_pending, 0);
598 atomic_set(&sctx->cancel_req, 0);
599
600 spin_lock_init(&sctx->list_lock);
601 spin_lock_init(&sctx->stat_lock);
602 init_waitqueue_head(&sctx->list_wait);
603 sctx->throttle_deadline = 0;
604
605 WARN_ON(sctx->wr_curr_bio != NULL);
606 mutex_init(&sctx->wr_lock);
607 sctx->wr_curr_bio = NULL;
608 if (is_dev_replace) {
609 WARN_ON(!fs_info->dev_replace.tgtdev);
610 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
611 sctx->flush_all_writes = false;
612 }
613
614 return sctx;
615
616nomem:
617 scrub_free_ctx(sctx);
618 return ERR_PTR(-ENOMEM);
619}
620
621static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
622 void *warn_ctx)
623{
624 u32 nlink;
625 int ret;
626 int i;
627 unsigned nofs_flag;
628 struct extent_buffer *eb;
629 struct btrfs_inode_item *inode_item;
630 struct scrub_warning *swarn = warn_ctx;
631 struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
632 struct inode_fs_paths *ipath = NULL;
633 struct btrfs_root *local_root;
634 struct btrfs_key key;
635
636 local_root = btrfs_get_fs_root(fs_info, root, true);
637 if (IS_ERR(local_root)) {
638 ret = PTR_ERR(local_root);
639 goto err;
640 }
641
642 /*
643 * this makes the path point to (inum INODE_ITEM ioff)
644 */
645 key.objectid = inum;
646 key.type = BTRFS_INODE_ITEM_KEY;
647 key.offset = 0;
648
649 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
650 if (ret) {
651 btrfs_put_root(local_root);
652 btrfs_release_path(swarn->path);
653 goto err;
654 }
655
656 eb = swarn->path->nodes[0];
657 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
658 struct btrfs_inode_item);
659 nlink = btrfs_inode_nlink(eb, inode_item);
660 btrfs_release_path(swarn->path);
661
662 /*
663 * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
664 * uses GFP_NOFS in this context, so we keep it consistent but it does
665 * not seem to be strictly necessary.
666 */
667 nofs_flag = memalloc_nofs_save();
668 ipath = init_ipath(4096, local_root, swarn->path);
669 memalloc_nofs_restore(nofs_flag);
670 if (IS_ERR(ipath)) {
671 btrfs_put_root(local_root);
672 ret = PTR_ERR(ipath);
673 ipath = NULL;
674 goto err;
675 }
676 ret = paths_from_inode(inum, ipath);
677
678 if (ret < 0)
679 goto err;
680
681 /*
682 * we deliberately ignore the fact that ipath might have been too small
683 * to hold all of the paths here
684 */
685 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
686 btrfs_warn_in_rcu(fs_info,
687"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
688 swarn->errstr, swarn->logical,
689 rcu_str_deref(swarn->dev->name),
690 swarn->physical,
691 root, inum, offset,
692 fs_info->sectorsize, nlink,
693 (char *)(unsigned long)ipath->fspath->val[i]);
694
695 btrfs_put_root(local_root);
696 free_ipath(ipath);
697 return 0;
698
699err:
700 btrfs_warn_in_rcu(fs_info,
701 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
702 swarn->errstr, swarn->logical,
703 rcu_str_deref(swarn->dev->name),
704 swarn->physical,
705 root, inum, offset, ret);
706
707 free_ipath(ipath);
708 return 0;
709}
710
711static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
712{
713 struct btrfs_device *dev;
714 struct btrfs_fs_info *fs_info;
715 struct btrfs_path *path;
716 struct btrfs_key found_key;
717 struct extent_buffer *eb;
718 struct btrfs_extent_item *ei;
719 struct scrub_warning swarn;
720 unsigned long ptr = 0;
721 u64 extent_item_pos;
722 u64 flags = 0;
723 u64 ref_root;
724 u32 item_size;
725 u8 ref_level = 0;
726 int ret;
727
728 WARN_ON(sblock->sector_count < 1);
729 dev = sblock->sectors[0]->dev;
730 fs_info = sblock->sctx->fs_info;
731
732 path = btrfs_alloc_path();
733 if (!path)
734 return;
735
736 swarn.physical = sblock->sectors[0]->physical;
737 swarn.logical = sblock->sectors[0]->logical;
738 swarn.errstr = errstr;
739 swarn.dev = NULL;
740
741 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
742 &flags);
743 if (ret < 0)
744 goto out;
745
746 extent_item_pos = swarn.logical - found_key.objectid;
747 swarn.extent_item_size = found_key.offset;
748
749 eb = path->nodes[0];
750 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
751 item_size = btrfs_item_size(eb, path->slots[0]);
752
753 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
754 do {
755 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
756 item_size, &ref_root,
757 &ref_level);
758 btrfs_warn_in_rcu(fs_info,
759"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
760 errstr, swarn.logical,
761 rcu_str_deref(dev->name),
762 swarn.physical,
763 ref_level ? "node" : "leaf",
764 ret < 0 ? -1 : ref_level,
765 ret < 0 ? -1 : ref_root);
766 } while (ret != 1);
767 btrfs_release_path(path);
768 } else {
769 btrfs_release_path(path);
770 swarn.path = path;
771 swarn.dev = dev;
772 iterate_extent_inodes(fs_info, found_key.objectid,
773 extent_item_pos, 1,
774 scrub_print_warning_inode, &swarn, false);
775 }
776
777out:
778 btrfs_free_path(path);
779}
780
781static inline void scrub_get_recover(struct scrub_recover *recover)
782{
783 refcount_inc(&recover->refs);
784}
785
786static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
787 struct scrub_recover *recover)
788{
789 if (refcount_dec_and_test(&recover->refs)) {
790 btrfs_bio_counter_dec(fs_info);
791 btrfs_put_bioc(recover->bioc);
792 kfree(recover);
793 }
794}
795
796/*
797 * scrub_handle_errored_block gets called when either verification of the
798 * sectors failed or the bio failed to read, e.g. with EIO. In the latter
799 * case, this function handles all sectors in the bio, even though only one
800 * may be bad.
801 * The goal of this function is to repair the errored block by using the
802 * contents of one of the mirrors.
803 */
804static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
805{
806 struct scrub_ctx *sctx = sblock_to_check->sctx;
807 struct btrfs_device *dev;
808 struct btrfs_fs_info *fs_info;
809 u64 logical;
810 unsigned int failed_mirror_index;
811 unsigned int is_metadata;
812 unsigned int have_csum;
813 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
814 struct scrub_block *sblock_bad;
815 int ret;
816 int mirror_index;
817 int sector_num;
818 int success;
819 bool full_stripe_locked;
820 unsigned int nofs_flag;
821 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
822 DEFAULT_RATELIMIT_BURST);
823
824 BUG_ON(sblock_to_check->sector_count < 1);
825 fs_info = sctx->fs_info;
826 if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
827 /*
828 * if we find an error in a super block, we just report it.
829 * They will get written with the next transaction commit
830 * anyway
831 */
832 spin_lock(&sctx->stat_lock);
833 ++sctx->stat.super_errors;
834 spin_unlock(&sctx->stat_lock);
835 return 0;
836 }
837 logical = sblock_to_check->sectors[0]->logical;
838 BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
839 failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
840 is_metadata = !(sblock_to_check->sectors[0]->flags &
841 BTRFS_EXTENT_FLAG_DATA);
842 have_csum = sblock_to_check->sectors[0]->have_csum;
843 dev = sblock_to_check->sectors[0]->dev;
844
845 if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
846 return 0;
847
848 /*
849 * We must use GFP_NOFS because the scrub task might be waiting for a
850 * worker task executing this function and in turn a transaction commit
851 * might be waiting for the scrub task to pause (which needs to wait for all
852 * the worker tasks to complete before pausing).
853 * We do allocations in the workers through insert_full_stripe_lock()
854 * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
855 * this function.
856 */
857 nofs_flag = memalloc_nofs_save();
858 /*
859 * For RAID5/6, a race can happen between scrub threads of different
860 * devices. On data corruption, the parity and data threads will both
861 * try to recover the data.
862 * The race can lead to doubly added csum errors, or even unrecoverable
863 * errors.
864 */
865 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
866 if (ret < 0) {
867 memalloc_nofs_restore(nofs_flag);
868 spin_lock(&sctx->stat_lock);
869 if (ret == -ENOMEM)
870 sctx->stat.malloc_errors++;
871 sctx->stat.read_errors++;
872 sctx->stat.uncorrectable_errors++;
873 spin_unlock(&sctx->stat_lock);
874 return ret;
875 }
876
877 /*
877 * Read all mirrors one after the other. This includes re-reading
878 * the extent or metadata block that failed (that was
880 * the cause that this fixup code is called) another time,
881 * sector by sector this time in order to know which sectors
882 * caused I/O errors and which ones are good (for all mirrors).
883 * It is the goal to handle the situation when more than one
884 * mirror contains I/O errors, but the errors do not
885 * overlap, i.e. the data can be repaired by selecting the
886 * sectors from those mirrors without I/O error on the
887 * particular sectors. One example (with blocks >= 2 * sectorsize)
888 * would be that mirror #1 has an I/O error on the first sector,
889 * the second sector is good, and mirror #2 has an I/O error on
890 * the second sector, but the first sector is good.
891 * Then the first sector of the first mirror can be repaired by
892 * taking the first sector of the second mirror, and the
893 * second sector of the second mirror can be repaired by
894 * copying the contents of the 2nd sector of the 1st mirror.
895 * One more note: if the sectors of one mirror contain I/O
896 * errors, the checksum cannot be verified. In order to get
897 * the best data for repairing, the first attempt is to find
898 * a mirror without I/O errors and with a validated checksum.
899 * Only if this is not possible, the sectors are picked from
900 * mirrors with I/O errors without considering the checksum.
901 * If the latter is the case, at the end, the checksum of the
902 * repaired area is verified in order to correctly maintain
903 * the statistics.
904 */
905
906 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
907 sizeof(*sblocks_for_recheck), GFP_KERNEL);
908 if (!sblocks_for_recheck) {
909 spin_lock(&sctx->stat_lock);
910 sctx->stat.malloc_errors++;
911 sctx->stat.read_errors++;
912 sctx->stat.uncorrectable_errors++;
913 spin_unlock(&sctx->stat_lock);
914 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
915 goto out;
916 }
917
918 /* Setup the context, map the logical blocks and alloc the sectors */
919 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
920 if (ret) {
921 spin_lock(&sctx->stat_lock);
922 sctx->stat.read_errors++;
923 sctx->stat.uncorrectable_errors++;
924 spin_unlock(&sctx->stat_lock);
925 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
926 goto out;
927 }
928 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
929 sblock_bad = sblocks_for_recheck + failed_mirror_index;
930
931 /* build and submit the bios for the failed mirror, check checksums */
932 scrub_recheck_block(fs_info, sblock_bad, 1);
933
934 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
935 sblock_bad->no_io_error_seen) {
936 /*
937 * The error disappeared after reading sector by sector, or
938 * the area was part of a huge bio and other parts of the
939 * bio caused I/O errors, or the block layer merged several
940 * read requests into one and the error is caused by a
941 * different bio (usually one of the two latter cases is
942 * the cause)
943 */
944 spin_lock(&sctx->stat_lock);
945 sctx->stat.unverified_errors++;
946 sblock_to_check->data_corrected = 1;
947 spin_unlock(&sctx->stat_lock);
948
949 if (sctx->is_dev_replace)
950 scrub_write_block_to_dev_replace(sblock_bad);
951 goto out;
952 }
953
954 if (!sblock_bad->no_io_error_seen) {
955 spin_lock(&sctx->stat_lock);
956 sctx->stat.read_errors++;
957 spin_unlock(&sctx->stat_lock);
958 if (__ratelimit(&rs))
959 scrub_print_warning("i/o error", sblock_to_check);
960 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
961 } else if (sblock_bad->checksum_error) {
962 spin_lock(&sctx->stat_lock);
963 sctx->stat.csum_errors++;
964 spin_unlock(&sctx->stat_lock);
965 if (__ratelimit(&rs))
966 scrub_print_warning("checksum error", sblock_to_check);
967 btrfs_dev_stat_inc_and_print(dev,
968 BTRFS_DEV_STAT_CORRUPTION_ERRS);
969 } else if (sblock_bad->header_error) {
970 spin_lock(&sctx->stat_lock);
971 sctx->stat.verify_errors++;
972 spin_unlock(&sctx->stat_lock);
973 if (__ratelimit(&rs))
974 scrub_print_warning("checksum/header error",
975 sblock_to_check);
976 if (sblock_bad->generation_error)
977 btrfs_dev_stat_inc_and_print(dev,
978 BTRFS_DEV_STAT_GENERATION_ERRS);
979 else
980 btrfs_dev_stat_inc_and_print(dev,
981 BTRFS_DEV_STAT_CORRUPTION_ERRS);
982 }
983
984 if (sctx->readonly) {
985 ASSERT(!sctx->is_dev_replace);
986 goto out;
987 }
988
989 /*
990 * now build and submit the bios for the other mirrors, check
991 * checksums.
992 * First try to pick the mirror which is completely without I/O
993 * errors and also does not have a checksum error.
994 * If one is found, and if a checksum is present, the full block
995 * that is known to contain an error is rewritten. Afterwards
996 * the block is known to be corrected.
997 * If a mirror is found which is completely correct, and no
998 * checksum is present, only those sectors are rewritten that had
999 * an I/O error in the block to be repaired, since it cannot be
1000 * determined, which copy of the other sectors is better (and it
1001 * could happen otherwise that a correct sector would be
1002 * overwritten by a bad one).
1003 */
1004 for (mirror_index = 0; ;mirror_index++) {
1005 struct scrub_block *sblock_other;
1006
1007 if (mirror_index == failed_mirror_index)
1008 continue;
1009
1010 /* raid56's mirror count can be more than BTRFS_MAX_MIRRORS */
1011 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1012 if (mirror_index >= BTRFS_MAX_MIRRORS)
1013 break;
1014 if (!sblocks_for_recheck[mirror_index].sector_count)
1015 break;
1016
1017 sblock_other = sblocks_for_recheck + mirror_index;
1018 } else {
1019 struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1020 int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1021
1022 if (mirror_index >= max_allowed)
1023 break;
1024 if (!sblocks_for_recheck[1].sector_count)
1025 break;
1026
1027 ASSERT(failed_mirror_index == 0);
1028 sblock_other = sblocks_for_recheck + 1;
1029 sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
1030 }
1031
1032 /* build and submit the bios, check checksums */
1033 scrub_recheck_block(fs_info, sblock_other, 0);
1034
1035 if (!sblock_other->header_error &&
1036 !sblock_other->checksum_error &&
1037 sblock_other->no_io_error_seen) {
1038 if (sctx->is_dev_replace) {
1039 scrub_write_block_to_dev_replace(sblock_other);
1040 goto corrected_error;
1041 } else {
1042 ret = scrub_repair_block_from_good_copy(
1043 sblock_bad, sblock_other);
1044 if (!ret)
1045 goto corrected_error;
1046 }
1047 }
1048 }
1049
1050 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1051 goto did_not_correct_error;
1052
1053 /*
1054 * In case of I/O errors in the area that is supposed to be
1055 * repaired, continue by picking good copies of those sectors.
1056 * Select the good sectors from mirrors to rewrite bad sectors from
1057 * the area to fix. Afterwards verify the checksum of the block
1058 * that is supposed to be repaired. This verification step is
1059 * only done for the purpose of statistic counting and for the
1060 * final scrub report on whether errors remain.
1061 * A perfect algorithm could make use of the checksum and try
1062 * all possible combinations of sectors from the different mirrors
1063 * until the checksum verification succeeds. For example, when
1064 * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1065 * of mirror #2 is readable but the final checksum test fails,
1066 * then the 2nd sector of mirror #3 could be tried, to see whether
1067 * the final checksum then succeeds. But this would be a rare
1068 * exception and is therefore not implemented. At least we avoid
1069 * overwriting the good copy.
1070 * A more useful improvement would be to pick the sectors
1071 * without I/O error based on sector sizes (512 bytes on legacy
1072 * disks) instead of on sectorsize. Then maybe 512 byte of one
1073 * mirror could be repaired by taking 512 byte of a different
1074 * mirror, even if other 512 byte sectors in the same sectorsize
1075 * area are unreadable.
1076 */
1077 success = 1;
1078 for (sector_num = 0; sector_num < sblock_bad->sector_count;
1079 sector_num++) {
1080 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1081 struct scrub_block *sblock_other = NULL;
1082
1083 /* Skip no-io-error sectors in scrub */
1084 if (!sector_bad->io_error && !sctx->is_dev_replace)
1085 continue;
1086
1087 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1088 /*
1089 * In case of dev replace, if raid56 rebuild process
1090 * didn't work out correct data, then copy the content
1091 * in sblock_bad to make sure target device is identical
1092 * to source device, instead of writing garbage data in
1093 * sblock_for_recheck array to target device.
1094 */
1095 sblock_other = NULL;
1096 } else if (sector_bad->io_error) {
1097 /* Try to find no-io-error sector in mirrors */
1098 for (mirror_index = 0;
1099 mirror_index < BTRFS_MAX_MIRRORS &&
1100 sblocks_for_recheck[mirror_index].sector_count > 0;
1101 mirror_index++) {
1102 if (!sblocks_for_recheck[mirror_index].
1103 sectors[sector_num]->io_error) {
1104 sblock_other = sblocks_for_recheck +
1105 mirror_index;
1106 break;
1107 }
1108 }
1109 if (!sblock_other)
1110 success = 0;
1111 }
1112
1113 if (sctx->is_dev_replace) {
1114 /*
1115 * Did not find a mirror to fetch the sector from.
1116 * scrub_write_sector_to_dev_replace() handles this
1117 * case (sector->io_error), by filling the block with
1118 * zeros before submitting the write request
1119 */
1120 if (!sblock_other)
1121 sblock_other = sblock_bad;
1122
1123 if (scrub_write_sector_to_dev_replace(sblock_other,
1124 sector_num) != 0) {
1125 atomic64_inc(
1126 &fs_info->dev_replace.num_write_errors);
1127 success = 0;
1128 }
1129 } else if (sblock_other) {
1130 ret = scrub_repair_sector_from_good_copy(sblock_bad,
1131 sblock_other,
1132 sector_num, 0);
1133 if (ret == 0)
1134 sector_bad->io_error = 0;
1135 else
1136 success = 0;
1137 }
1138 }
1139
1140 if (success && !sctx->is_dev_replace) {
1141 if (is_metadata || have_csum) {
1142 /*
1143 * need to verify the checksum now that all
1144 * sectors on disk are repaired (the write
1145 * request for data to be repaired is on its way).
1146 * Just be lazy and use scrub_recheck_block()
1147 * which re-reads the data before the checksum
1148 * is verified, but most likely the data comes out
1149 * of the page cache.
1150 */
1151 scrub_recheck_block(fs_info, sblock_bad, 1);
1152 if (!sblock_bad->header_error &&
1153 !sblock_bad->checksum_error &&
1154 sblock_bad->no_io_error_seen)
1155 goto corrected_error;
1156 else
1157 goto did_not_correct_error;
1158 } else {
1159corrected_error:
1160 spin_lock(&sctx->stat_lock);
1161 sctx->stat.corrected_errors++;
1162 sblock_to_check->data_corrected = 1;
1163 spin_unlock(&sctx->stat_lock);
1164 btrfs_err_rl_in_rcu(fs_info,
1165 "fixed up error at logical %llu on dev %s",
1166 logical, rcu_str_deref(dev->name));
1167 }
1168 } else {
1169did_not_correct_error:
1170 spin_lock(&sctx->stat_lock);
1171 sctx->stat.uncorrectable_errors++;
1172 spin_unlock(&sctx->stat_lock);
1173 btrfs_err_rl_in_rcu(fs_info,
1174 "unable to fixup (regular) error at logical %llu on dev %s",
1175 logical, rcu_str_deref(dev->name));
1176 }
1177
1178out:
1179 if (sblocks_for_recheck) {
1180 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1181 mirror_index++) {
1182 struct scrub_block *sblock = sblocks_for_recheck +
1183 mirror_index;
1184 struct scrub_recover *recover;
1185 int i;
1186
1187 for (i = 0; i < sblock->sector_count; i++) {
1188 sblock->sectors[i]->sblock = NULL;
1189 recover = sblock->sectors[i]->recover;
1190 if (recover) {
1191 scrub_put_recover(fs_info, recover);
1192 sblock->sectors[i]->recover = NULL;
1193 }
1194 scrub_sector_put(sblock->sectors[i]);
1195 }
1196 }
1197 kfree(sblocks_for_recheck);
1198 }
1199
1200 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1201 memalloc_nofs_restore(nofs_flag);
1202 if (ret < 0)
1203 return ret;
1204 return 0;
1205}
1206
1207static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1208{
1209 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1210 return 2;
1211 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1212 return 3;
1213 else
1214 return (int)bioc->num_stripes;
1215}
1216
1217static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1218 u64 *raid_map,
1219 int nstripes, int mirror,
1220 int *stripe_index,
1221 u64 *stripe_offset)
1222{
1223 int i;
1224
1225 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1226 /* RAID5/6 */
1227 for (i = 0; i < nstripes; i++) {
1228 if (raid_map[i] == RAID6_Q_STRIPE ||
1229 raid_map[i] == RAID5_P_STRIPE)
1230 continue;
1231
1232 if (logical >= raid_map[i] &&
1233 logical < raid_map[i] + BTRFS_STRIPE_LEN)
1234 break;
1235 }
1236
1237 *stripe_index = i;
1238 *stripe_offset = logical - raid_map[i];
1239 } else {
1240 /* The other RAID type */
1241 *stripe_index = mirror;
1242 *stripe_offset = 0;
1243 }
1244}
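/*
 * A worked example with hypothetical values (assuming the 64KiB
 * BTRFS_STRIPE_LEN): for a 3-device RAID5 full stripe with
 * raid_map = { X, X + 64KiB, RAID5_P_STRIPE } and logical = X + 80KiB,
 * the loop stops at i = 1 (the second data stripe), so *stripe_index = 1
 * and *stripe_offset = 16KiB.
 */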
1245
1246static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1247 struct scrub_block *sblocks_for_recheck)
1248{
1249 struct scrub_ctx *sctx = original_sblock->sctx;
1250 struct btrfs_fs_info *fs_info = sctx->fs_info;
1251 u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1252 u64 logical = original_sblock->sectors[0]->logical;
1253 u64 generation = original_sblock->sectors[0]->generation;
1254 u64 flags = original_sblock->sectors[0]->flags;
1255 u64 have_csum = original_sblock->sectors[0]->have_csum;
1256 struct scrub_recover *recover;
1257 struct btrfs_io_context *bioc;
1258 u64 sublen;
1259 u64 mapped_length;
1260 u64 stripe_offset;
1261 int stripe_index;
1262 int sector_index = 0;
1263 int mirror_index;
1264 int nmirrors;
1265 int ret;
1266
1267 /*
1268 * Note: the two members refs and outstanding_sectors are not used (and
1269 * not set) in the blocks that are used for the recheck procedure.
1270 */
1271
1272 while (length > 0) {
1273 sublen = min_t(u64, length, fs_info->sectorsize);
1274 mapped_length = sublen;
1275 bioc = NULL;
1276
1277 /*
1278 * With a length of sectorsize, each returned stripe represents
1279 * one mirror
1280 */
1281 btrfs_bio_counter_inc_blocked(fs_info);
1282 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1283 logical, &mapped_length, &bioc);
1284 if (ret || !bioc || mapped_length < sublen) {
1285 btrfs_put_bioc(bioc);
1286 btrfs_bio_counter_dec(fs_info);
1287 return -EIO;
1288 }
1289
1290 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1291 if (!recover) {
1292 btrfs_put_bioc(bioc);
1293 btrfs_bio_counter_dec(fs_info);
1294 return -ENOMEM;
1295 }
1296
1297 refcount_set(&recover->refs, 1);
1298 recover->bioc = bioc;
1299 recover->map_length = mapped_length;
1300
1301 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1302
1303 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1304
1305 for (mirror_index = 0; mirror_index < nmirrors;
1306 mirror_index++) {
1307 struct scrub_block *sblock;
1308 struct scrub_sector *sector;
1309
1310 sblock = sblocks_for_recheck + mirror_index;
1311 sblock->sctx = sctx;
1312
1313 sector = kzalloc(sizeof(*sector), GFP_NOFS);
1314 if (!sector) {
1315leave_nomem:
1316 spin_lock(&sctx->stat_lock);
1317 sctx->stat.malloc_errors++;
1318 spin_unlock(&sctx->stat_lock);
1319 scrub_put_recover(fs_info, recover);
1320 return -ENOMEM;
1321 }
1322 scrub_sector_get(sector);
1323 sblock->sectors[sector_index] = sector;
1324 sector->sblock = sblock;
1325 sector->flags = flags;
1326 sector->generation = generation;
1327 sector->logical = logical;
1328 sector->have_csum = have_csum;
1329 if (have_csum)
1330 memcpy(sector->csum,
1331 original_sblock->sectors[0]->csum,
1332 sctx->fs_info->csum_size);
1333
1334 scrub_stripe_index_and_offset(logical,
1335 bioc->map_type,
1336 bioc->raid_map,
1337 bioc->num_stripes -
1338 bioc->num_tgtdevs,
1339 mirror_index,
1340 &stripe_index,
1341 &stripe_offset);
1342 sector->physical = bioc->stripes[stripe_index].physical +
1343 stripe_offset;
1344 sector->dev = bioc->stripes[stripe_index].dev;
1345
1346 BUG_ON(sector_index >= original_sblock->sector_count);
1347 sector->physical_for_dev_replace =
1348 original_sblock->sectors[sector_index]->
1349 physical_for_dev_replace;
1350 /* For missing devices, dev->bdev is NULL */
1351 sector->mirror_num = mirror_index + 1;
1352 sblock->sector_count++;
1353 sector->page = alloc_page(GFP_NOFS);
1354 if (!sector->page)
1355 goto leave_nomem;
1356
1357 scrub_get_recover(recover);
1358 sector->recover = recover;
1359 }
1360 scrub_put_recover(fs_info, recover);
1361 length -= sublen;
1362 logical += sublen;
1363 sector_index++;
1364 }
1365
1366 return 0;
1367}
1368
1369static void scrub_bio_wait_endio(struct bio *bio)
1370{
1371 complete(bio->bi_private);
1372}
1373
1374static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1375 struct bio *bio,
1376 struct scrub_sector *sector)
1377{
1378 DECLARE_COMPLETION_ONSTACK(done);
1379
1380 bio->bi_iter.bi_sector = sector->logical >> 9;
1381 bio->bi_private = &done;
1382 bio->bi_end_io = scrub_bio_wait_endio;
1383 raid56_parity_recover(bio, sector->recover->bioc,
1384 sector->sblock->sectors[0]->mirror_num, false);
1385
1386 wait_for_completion_io(&done);
1387 return blk_status_to_errno(bio->bi_status);
1388}
1389
1390static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1391 struct scrub_block *sblock)
1392{
1393 struct scrub_sector *first_sector = sblock->sectors[0];
1394 struct bio *bio;
1395 int i;
1396
1397 /* All sectors in sblock belong to the same stripe on the same device. */
1398 ASSERT(first_sector->dev);
1399 if (!first_sector->dev->bdev)
1400 goto out;
1401
1402 bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1403
1404 for (i = 0; i < sblock->sector_count; i++) {
1405 struct scrub_sector *sector = sblock->sectors[i];
1406
1407 WARN_ON(!sector->page);
1408 bio_add_page(bio, sector->page, PAGE_SIZE, 0);
1409 }
1410
1411 if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1412 bio_put(bio);
1413 goto out;
1414 }
1415
1416 bio_put(bio);
1417
1418 scrub_recheck_block_checksum(sblock);
1419
1420 return;
1421out:
1422 for (i = 0; i < sblock->sector_count; i++)
1423 sblock->sectors[i]->io_error = 1;
1424
1425 sblock->no_io_error_seen = 0;
1426}
1427
1428/*
1429 * This function will check the on disk data for checksum errors, header errors
1430 * and read I/O errors. If any I/O errors happen, the exact sectors which are
1431 * errored are marked as being bad. The goal is to enable scrub to take those
1432 * sectors that are not errored from all the mirrors so that the sectors that
1433 * are errored in the just handled mirror can be repaired.
1434 */
1435static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1436 struct scrub_block *sblock,
1437 int retry_failed_mirror)
1438{
1439 int i;
1440
1441 sblock->no_io_error_seen = 1;
1442
1443 /* short cut for raid56 */
1444 if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1445 return scrub_recheck_block_on_raid56(fs_info, sblock);
1446
1447 for (i = 0; i < sblock->sector_count; i++) {
1448 struct scrub_sector *sector = sblock->sectors[i];
1449 struct bio bio;
1450 struct bio_vec bvec;
1451
1452 if (sector->dev->bdev == NULL) {
1453 sector->io_error = 1;
1454 sblock->no_io_error_seen = 0;
1455 continue;
1456 }
1457
1458 WARN_ON(!sector->page);
1459 bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
1460 bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
1461 bio.bi_iter.bi_sector = sector->physical >> 9;
1462
1463 btrfsic_check_bio(&bio);
1464 if (submit_bio_wait(&bio)) {
1465 sector->io_error = 1;
1466 sblock->no_io_error_seen = 0;
1467 }
1468
1469 bio_uninit(&bio);
1470 }
1471
1472 if (sblock->no_io_error_seen)
1473 scrub_recheck_block_checksum(sblock);
1474}
1475
1476static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1477{
1478 struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
1479 int ret;
1480
1481 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1482 return !ret;
1483}
1484
1485static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1486{
1487 sblock->header_error = 0;
1488 sblock->checksum_error = 0;
1489 sblock->generation_error = 0;
1490
1491 if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1492 scrub_checksum_data(sblock);
1493 else
1494 scrub_checksum_tree_block(sblock);
1495}
1496
1497static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1498 struct scrub_block *sblock_good)
1499{
1500 int i;
1501 int ret = 0;
1502
1503 for (i = 0; i < sblock_bad->sector_count; i++) {
1504 int ret_sub;
1505
1506 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
1507 sblock_good, i, 1);
1508 if (ret_sub)
1509 ret = ret_sub;
1510 }
1511
1512 return ret;
1513}
1514
1515static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1516 struct scrub_block *sblock_good,
1517 int sector_num, int force_write)
1518{
1519 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1520 struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1521 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1522 const u32 sectorsize = fs_info->sectorsize;
1523
1524 BUG_ON(sector_bad->page == NULL);
1525 BUG_ON(sector_good->page == NULL);
1526 if (force_write || sblock_bad->header_error ||
1527 sblock_bad->checksum_error || sector_bad->io_error) {
1528 struct bio bio;
1529 struct bio_vec bvec;
1530 int ret;
1531
1532 if (!sector_bad->dev->bdev) {
1533 btrfs_warn_rl(fs_info,
1534 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1535 return -EIO;
1536 }
1537
1538 bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1539 bio.bi_iter.bi_sector = sector_bad->physical >> 9;
1540 __bio_add_page(&bio, sector_good->page, sectorsize, 0);
1541
1542 btrfsic_check_bio(&bio);
1543 ret = submit_bio_wait(&bio);
1544 bio_uninit(&bio);
1545
1546 if (ret) {
1547 btrfs_dev_stat_inc_and_print(sector_bad->dev,
1548 BTRFS_DEV_STAT_WRITE_ERRS);
1549 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1550 return -EIO;
1551 }
1552 }
1553
1554 return 0;
1555}
1556
1557static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1558{
1559 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1560 int i;
1561
1562 /*
1563 * This block is used for the check of the parity on the source device,
1564 * so the data needn't be written into the destination device.
1565 */
1566 if (sblock->sparity)
1567 return;
1568
1569 for (i = 0; i < sblock->sector_count; i++) {
1570 int ret;
1571
1572 ret = scrub_write_sector_to_dev_replace(sblock, i);
1573 if (ret)
1574 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1575 }
1576}
1577
1578static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1579{
1580 struct scrub_sector *sector = sblock->sectors[sector_num];
1581
1582 BUG_ON(sector->page == NULL);
1583 if (sector->io_error)
1584 clear_page(page_address(sector->page));
1585
1586 return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
1587}
1588
1589static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1590{
1591 int ret = 0;
1592 u64 length;
1593
1594 if (!btrfs_is_zoned(sctx->fs_info))
1595 return 0;
1596
1597 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1598 return 0;
1599
1600 if (sctx->write_pointer < physical) {
1601 length = physical - sctx->write_pointer;
1602
1603 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1604 sctx->write_pointer, length);
1605 if (!ret)
1606 sctx->write_pointer = physical;
1607 }
1608 return ret;
1609}
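/*
 * A hypothetical example: with the target zone's write pointer cached at
 * physical offset 1MiB and the next copied sector destined for 1MiB + 192KiB,
 * the 192KiB gap is zeroed out first so the zone's sequential-write
 * constraint is respected, and sctx->write_pointer advances to 1MiB + 192KiB.
 */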
1610
1611static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1612 struct scrub_sector *sector)
1613{
1614 struct scrub_bio *sbio;
1615 int ret;
1616 const u32 sectorsize = sctx->fs_info->sectorsize;
1617
1618 mutex_lock(&sctx->wr_lock);
1619again:
1620 if (!sctx->wr_curr_bio) {
1621 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1622 GFP_KERNEL);
1623 if (!sctx->wr_curr_bio) {
1624 mutex_unlock(&sctx->wr_lock);
1625 return -ENOMEM;
1626 }
1627 sctx->wr_curr_bio->sctx = sctx;
1628 sctx->wr_curr_bio->sector_count = 0;
1629 }
1630 sbio = sctx->wr_curr_bio;
1631 if (sbio->sector_count == 0) {
1632 ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
1633 if (ret) {
1634 mutex_unlock(&sctx->wr_lock);
1635 return ret;
1636 }
1637
1638 sbio->physical = sector->physical_for_dev_replace;
1639 sbio->logical = sector->logical;
1640 sbio->dev = sctx->wr_tgtdev;
1641 if (!sbio->bio) {
1642 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1643 REQ_OP_WRITE, GFP_NOFS);
1644 }
1645 sbio->bio->bi_private = sbio;
1646 sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1647 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1648 sbio->status = 0;
1649 } else if (sbio->physical + sbio->sector_count * sectorsize !=
1650 sector->physical_for_dev_replace ||
1651 sbio->logical + sbio->sector_count * sectorsize !=
1652 sector->logical) {
1653 scrub_wr_submit(sctx);
1654 goto again;
1655 }
1656
1657 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
1658 if (ret != sectorsize) {
1659 if (sbio->sector_count < 1) {
1660 bio_put(sbio->bio);
1661 sbio->bio = NULL;
1662 mutex_unlock(&sctx->wr_lock);
1663 return -EIO;
1664 }
1665 scrub_wr_submit(sctx);
1666 goto again;
1667 }
1668
1669 sbio->sectors[sbio->sector_count] = sector;
1670 scrub_sector_get(sector);
1671 sbio->sector_count++;
1672 if (sbio->sector_count == sctx->sectors_per_bio)
1673 scrub_wr_submit(sctx);
1674 mutex_unlock(&sctx->wr_lock);
1675
1676 return 0;
1677}
1678
1679static void scrub_wr_submit(struct scrub_ctx *sctx)
1680{
1681 struct scrub_bio *sbio;
1682
1683 if (!sctx->wr_curr_bio)
1684 return;
1685
1686 sbio = sctx->wr_curr_bio;
1687 sctx->wr_curr_bio = NULL;
1688 scrub_pending_bio_inc(sctx);
1689 /* Process all writes in a single worker thread. Then the block layer
1690 * orders the requests before sending them to the driver, which
1691 * doubled the write performance on spinning disks when measured
1692 * with Linux 3.5. */
1693 btrfsic_check_bio(sbio->bio);
1694 submit_bio(sbio->bio);
1695
1696 if (btrfs_is_zoned(sctx->fs_info))
1697 sctx->write_pointer = sbio->physical + sbio->sector_count *
1698 sctx->fs_info->sectorsize;
1699}
1700
1701static void scrub_wr_bio_end_io(struct bio *bio)
1702{
1703 struct scrub_bio *sbio = bio->bi_private;
1704 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1705
1706 sbio->status = bio->bi_status;
1707 sbio->bio = bio;
1708
1709 INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1710 queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1711}
1712
1713static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1714{
1715 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1716 struct scrub_ctx *sctx = sbio->sctx;
1717 int i;
1718
1719 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1720 if (sbio->status) {
1721 struct btrfs_dev_replace *dev_replace =
1722 &sbio->sctx->fs_info->dev_replace;
1723
1724 for (i = 0; i < sbio->sector_count; i++) {
1725 struct scrub_sector *sector = sbio->sectors[i];
1726
1727 sector->io_error = 1;
1728 atomic64_inc(&dev_replace->num_write_errors);
1729 }
1730 }
1731
1732 for (i = 0; i < sbio->sector_count; i++)
1733 scrub_sector_put(sbio->sectors[i]);
1734
1735 bio_put(sbio->bio);
1736 kfree(sbio);
1737 scrub_pending_bio_dec(sctx);
1738}
1739
1740static int scrub_checksum(struct scrub_block *sblock)
1741{
1742 u64 flags;
1743 int ret;
1744
1745 /*
1746 * No need to initialize these stats currently,
1747 * because this function only uses the return value
1748 * instead of these stats values.
1749 *
1750 * Todo:
1751 * always use stats
1752 */
1753 sblock->header_error = 0;
1754 sblock->generation_error = 0;
1755 sblock->checksum_error = 0;
1756
1757 WARN_ON(sblock->sector_count < 1);
1758 flags = sblock->sectors[0]->flags;
1759 ret = 0;
1760 if (flags & BTRFS_EXTENT_FLAG_DATA)
1761 ret = scrub_checksum_data(sblock);
1762 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1763 ret = scrub_checksum_tree_block(sblock);
1764 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1765 (void)scrub_checksum_super(sblock);
1766 else
1767 WARN_ON(1);
1768 if (ret)
1769 scrub_handle_errored_block(sblock);
1770
1771 return ret;
1772}
1773
1774static int scrub_checksum_data(struct scrub_block *sblock)
1775{
1776 struct scrub_ctx *sctx = sblock->sctx;
1777 struct btrfs_fs_info *fs_info = sctx->fs_info;
1778 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1779 u8 csum[BTRFS_CSUM_SIZE];
1780 struct scrub_sector *sector;
1781 char *kaddr;
1782
1783 BUG_ON(sblock->sector_count < 1);
1784 sector = sblock->sectors[0];
1785 if (!sector->have_csum)
1786 return 0;
1787
1788 kaddr = page_address(sector->page);
1789
1790 shash->tfm = fs_info->csum_shash;
1791 crypto_shash_init(shash);
1792
1793 /*
1794 * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector's
1795 * page only contains one sector of data.
1796 */
1797 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1798
1799 if (memcmp(csum, sector->csum, fs_info->csum_size))
1800 sblock->checksum_error = 1;
1801 return sblock->checksum_error;
1802}
1803
1804static int scrub_checksum_tree_block(struct scrub_block *sblock)
1805{
1806 struct scrub_ctx *sctx = sblock->sctx;
1807 struct btrfs_header *h;
1808 struct btrfs_fs_info *fs_info = sctx->fs_info;
1809 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1810 u8 calculated_csum[BTRFS_CSUM_SIZE];
1811 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1812 /*
1813 * This is done in sectorsize steps even for metadata as there's a
1814 * constraint for nodesize to be aligned to sectorsize. This will need
1815 * to change so we don't misuse data and metadata units like that.
1816 */
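	/*
	 * For example, a 16KiB nodesize with a 4KiB sectorsize gives four
	 * sectors per tree block: the first one contributes sectorsize -
	 * BTRFS_CSUM_SIZE bytes to the checksum (the on-disk csum area itself
	 * is skipped) and the remaining three contribute full sectors.
	 */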
1817 const u32 sectorsize = sctx->fs_info->sectorsize;
1818 const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1819 int i;
1820 struct scrub_sector *sector;
1821 char *kaddr;
1822
1823 BUG_ON(sblock->sector_count < 1);
1824
1825 /* Each member in sectors is just one sector */
1826 ASSERT(sblock->sector_count == num_sectors);
1827
1828 sector = sblock->sectors[0];
1829 kaddr = page_address(sector->page);
1830 h = (struct btrfs_header *)kaddr;
1831 memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1832
1833 /*
1834 * we don't use the getter functions here, as we
1835 * a) don't have an extent buffer and
1836 * b) the page is already kmapped
1837 */
1838 if (sector->logical != btrfs_stack_header_bytenr(h))
1839 sblock->header_error = 1;
1840
1841 if (sector->generation != btrfs_stack_header_generation(h)) {
1842 sblock->header_error = 1;
1843 sblock->generation_error = 1;
1844 }
1845
1846 if (!scrub_check_fsid(h->fsid, sector))
1847 sblock->header_error = 1;
1848
1849 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1850 BTRFS_UUID_SIZE))
1851 sblock->header_error = 1;
1852
1853 shash->tfm = fs_info->csum_shash;
1854 crypto_shash_init(shash);
1855 crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1856 sectorsize - BTRFS_CSUM_SIZE);
1857
1858 for (i = 1; i < num_sectors; i++) {
1859 kaddr = page_address(sblock->sectors[i]->page);
1860 crypto_shash_update(shash, kaddr, sectorsize);
1861 }
1862
1863 crypto_shash_final(shash, calculated_csum);
1864 if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1865 sblock->checksum_error = 1;
1866
1867 return sblock->header_error || sblock->checksum_error;
1868}
1869
1870static int scrub_checksum_super(struct scrub_block *sblock)
1871{
1872 struct btrfs_super_block *s;
1873 struct scrub_ctx *sctx = sblock->sctx;
1874 struct btrfs_fs_info *fs_info = sctx->fs_info;
1875 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1876 u8 calculated_csum[BTRFS_CSUM_SIZE];
1877 struct scrub_sector *sector;
1878 char *kaddr;
1879 int fail_gen = 0;
1880 int fail_cor = 0;
1881
1882 BUG_ON(sblock->sector_count < 1);
1883 sector = sblock->sectors[0];
1884 kaddr = page_address(sector->page);
1885 s = (struct btrfs_super_block *)kaddr;
1886
1887 if (sector->logical != btrfs_super_bytenr(s))
1888 ++fail_cor;
1889
1890 if (sector->generation != btrfs_super_generation(s))
1891 ++fail_gen;
1892
1893 if (!scrub_check_fsid(s->fsid, sector))
1894 ++fail_cor;
1895
1896 shash->tfm = fs_info->csum_shash;
1897 crypto_shash_init(shash);
1898 crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1899 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1900
1901 if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1902 ++fail_cor;
1903
1904 if (fail_cor + fail_gen) {
1905 /*
1906	 * If we find an error in a super block, we just report it.
1907	 * Super blocks get rewritten with the next transaction commit
1908	 * anyway.
1909 */
1910 spin_lock(&sctx->stat_lock);
1911 ++sctx->stat.super_errors;
1912 spin_unlock(&sctx->stat_lock);
1913 if (fail_cor)
1914 btrfs_dev_stat_inc_and_print(sector->dev,
1915 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1916 else
1917 btrfs_dev_stat_inc_and_print(sector->dev,
1918 BTRFS_DEV_STAT_GENERATION_ERRS);
1919 }
1920
1921 return fail_cor + fail_gen;
1922}
1923
1924static void scrub_block_get(struct scrub_block *sblock)
1925{
1926 refcount_inc(&sblock->refs);
1927}
1928
1929static void scrub_block_put(struct scrub_block *sblock)
1930{
1931 if (refcount_dec_and_test(&sblock->refs)) {
1932 int i;
1933
1934 if (sblock->sparity)
1935 scrub_parity_put(sblock->sparity);
1936
1937 for (i = 0; i < sblock->sector_count; i++)
1938 scrub_sector_put(sblock->sectors[i]);
1939 kfree(sblock);
1940 }
1941}
1942
1943static void scrub_sector_get(struct scrub_sector *sector)
1944{
1945	atomic_inc(&sector->refs);
1946}
1947
1948static void scrub_sector_put(struct scrub_sector *sector)
1949{
1950	if (atomic_dec_and_test(&sector->refs)) {
1951 if (sector->page)
1952 __free_page(sector->page);
1953 kfree(sector);
1954 }
1955}
1956
1957/*
1958 * Throttling of IO submission, bandwidth-limit based; the time slice is 1
1959 * second. The limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1960 */
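/*
 * Worked example (assuming scrub_speed_max = 100MiB/s): div below becomes
 * min(64, max(1, 100MiB / 16MiB)) = 6, so the deadline is set time_slice / div
 * (~166ms) ahead and roughly bwlimit / div (~16.6MiB) of bio bytes may be
 * submitted per epoch before the thread sleeps until the deadline.
 */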
1961static void scrub_throttle(struct scrub_ctx *sctx)
1962{
1963 const int time_slice = 1000;
1964 struct scrub_bio *sbio;
1965 struct btrfs_device *device;
1966 s64 delta;
1967 ktime_t now;
1968 u32 div;
1969 u64 bwlimit;
1970
1971 sbio = sctx->bios[sctx->curr];
1972 device = sbio->dev;
1973 bwlimit = READ_ONCE(device->scrub_speed_max);
1974 if (bwlimit == 0)
1975 return;
1976
1977 /*
1978	 * The slice is divided into intervals as the IO is submitted; the number
1979	 * of intervals is derived from bwlimit and capped at a maximum of 64.
1980 */
1981 div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1982 div = min_t(u32, 64, div);
1983
1984 /* Start new epoch, set deadline */
1985 now = ktime_get();
1986 if (sctx->throttle_deadline == 0) {
1987 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1988 sctx->throttle_sent = 0;
1989 }
1990
1991 /* Still in the time to send? */
1992 if (ktime_before(now, sctx->throttle_deadline)) {
1993 /* If current bio is within the limit, send it */
1994 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
1995 if (sctx->throttle_sent <= div_u64(bwlimit, div))
1996 return;
1997
1998 /* We're over the limit, sleep until the rest of the slice */
1999 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2000 } else {
2001 /* New request after deadline, start new epoch */
2002 delta = 0;
2003 }
2004
2005 if (delta) {
2006 long timeout;
2007
2008 timeout = div_u64(delta * HZ, 1000);
2009 schedule_timeout_interruptible(timeout);
2010 }
2011
2012 /* Next call will start the deadline period */
2013 sctx->throttle_deadline = 0;
2014}
2015
2016static void scrub_submit(struct scrub_ctx *sctx)
2017{
2018 struct scrub_bio *sbio;
2019
2020 if (sctx->curr == -1)
2021 return;
2022
2023 scrub_throttle(sctx);
2024
2025 sbio = sctx->bios[sctx->curr];
2026 sctx->curr = -1;
2027 scrub_pending_bio_inc(sctx);
2028 btrfsic_check_bio(sbio->bio);
2029 submit_bio(sbio->bio);
2030}
2031
2032static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2033 struct scrub_sector *sector)
2034{
2035 struct scrub_block *sblock = sector->sblock;
2036 struct scrub_bio *sbio;
2037 const u32 sectorsize = sctx->fs_info->sectorsize;
2038 int ret;
2039
2040again:
2041 /*
2042 * grab a fresh bio or wait for one to become available
2043 */
2044 while (sctx->curr == -1) {
2045 spin_lock(&sctx->list_lock);
2046 sctx->curr = sctx->first_free;
2047 if (sctx->curr != -1) {
2048 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2049 sctx->bios[sctx->curr]->next_free = -1;
2050 sctx->bios[sctx->curr]->sector_count = 0;
2051 spin_unlock(&sctx->list_lock);
2052 } else {
2053 spin_unlock(&sctx->list_lock);
2054 wait_event(sctx->list_wait, sctx->first_free != -1);
2055 }
2056 }
2057 sbio = sctx->bios[sctx->curr];
2058 if (sbio->sector_count == 0) {
2059 sbio->physical = sector->physical;
2060 sbio->logical = sector->logical;
2061 sbio->dev = sector->dev;
2062 if (!sbio->bio) {
2063 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2064 REQ_OP_READ, GFP_NOFS);
2065 }
2066 sbio->bio->bi_private = sbio;
2067 sbio->bio->bi_end_io = scrub_bio_end_io;
2068 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2069 sbio->status = 0;
2070 } else if (sbio->physical + sbio->sector_count * sectorsize !=
2071 sector->physical ||
2072 sbio->logical + sbio->sector_count * sectorsize !=
2073 sector->logical ||
2074 sbio->dev != sector->dev) {
2075 scrub_submit(sctx);
2076 goto again;
2077 }
2078
2079 sbio->sectors[sbio->sector_count] = sector;
2080 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
2081 if (ret != sectorsize) {
2082 if (sbio->sector_count < 1) {
2083 bio_put(sbio->bio);
2084 sbio->bio = NULL;
2085 return -EIO;
2086 }
2087 scrub_submit(sctx);
2088 goto again;
2089 }
2090
2091 scrub_block_get(sblock); /* one for the page added to the bio */
2092 atomic_inc(&sblock->outstanding_sectors);
2093 sbio->sector_count++;
2094 if (sbio->sector_count == sctx->sectors_per_bio)
2095 scrub_submit(sctx);
2096
2097 return 0;
2098}
2099
2100static void scrub_missing_raid56_end_io(struct bio *bio)
2101{
2102 struct scrub_block *sblock = bio->bi_private;
2103 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2104
2105 if (bio->bi_status)
2106 sblock->no_io_error_seen = 0;
2107
2108 bio_put(bio);
2109
2110 queue_work(fs_info->scrub_workers, &sblock->work);
2111}
2112
2113static void scrub_missing_raid56_worker(struct work_struct *work)
2114{
2115 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2116 struct scrub_ctx *sctx = sblock->sctx;
2117 struct btrfs_fs_info *fs_info = sctx->fs_info;
2118 u64 logical;
2119 struct btrfs_device *dev;
2120
2121 logical = sblock->sectors[0]->logical;
2122 dev = sblock->sectors[0]->dev;
2123
2124 if (sblock->no_io_error_seen)
2125 scrub_recheck_block_checksum(sblock);
2126
2127 if (!sblock->no_io_error_seen) {
2128 spin_lock(&sctx->stat_lock);
2129 sctx->stat.read_errors++;
2130 spin_unlock(&sctx->stat_lock);
2131 btrfs_err_rl_in_rcu(fs_info,
2132 "IO error rebuilding logical %llu for dev %s",
2133 logical, rcu_str_deref(dev->name));
2134 } else if (sblock->header_error || sblock->checksum_error) {
2135 spin_lock(&sctx->stat_lock);
2136 sctx->stat.uncorrectable_errors++;
2137 spin_unlock(&sctx->stat_lock);
2138 btrfs_err_rl_in_rcu(fs_info,
2139 "failed to rebuild valid logical %llu for dev %s",
2140 logical, rcu_str_deref(dev->name));
2141 } else {
2142 scrub_write_block_to_dev_replace(sblock);
2143 }
2144
2145 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2146 mutex_lock(&sctx->wr_lock);
2147 scrub_wr_submit(sctx);
2148 mutex_unlock(&sctx->wr_lock);
2149 }
2150
2151 scrub_block_put(sblock);
2152 scrub_pending_bio_dec(sctx);
2153}
2154
2155static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2156{
2157 struct scrub_ctx *sctx = sblock->sctx;
2158 struct btrfs_fs_info *fs_info = sctx->fs_info;
2159 u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2160 u64 logical = sblock->sectors[0]->logical;
2161 struct btrfs_io_context *bioc = NULL;
2162 struct bio *bio;
2163 struct btrfs_raid_bio *rbio;
2164 int ret;
2165 int i;
2166
2167 btrfs_bio_counter_inc_blocked(fs_info);
2168 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2169 &length, &bioc);
2170 if (ret || !bioc || !bioc->raid_map)
2171 goto bioc_out;
2172
2173 if (WARN_ON(!sctx->is_dev_replace ||
2174 !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2175 /*
2176 * We shouldn't be scrubbing a missing device. Even for dev
2177 * replace, we should only get here for RAID 5/6. We either
2178 * managed to mount something with no mirrors remaining or
2179 * there's a bug in scrub_find_good_copy()/btrfs_map_block().
2180 */
2181 goto bioc_out;
2182 }
2183
2184 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2185 bio->bi_iter.bi_sector = logical >> 9;
2186 bio->bi_private = sblock;
2187 bio->bi_end_io = scrub_missing_raid56_end_io;
2188
2189 rbio = raid56_alloc_missing_rbio(bio, bioc);
2190 if (!rbio)
2191 goto rbio_out;
2192
2193 for (i = 0; i < sblock->sector_count; i++) {
2194 struct scrub_sector *sector = sblock->sectors[i];
2195
2196 /*
2197 * For now, our scrub is still one page per sector, so pgoff
2198 * is always 0.
2199 */
2200 raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
2201 }
2202
2203 INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2204 scrub_block_get(sblock);
2205 scrub_pending_bio_inc(sctx);
2206 raid56_submit_missing_rbio(rbio);
2207 return;
2208
2209rbio_out:
2210 bio_put(bio);
2211bioc_out:
2212 btrfs_bio_counter_dec(fs_info);
2213 btrfs_put_bioc(bioc);
2214 spin_lock(&sctx->stat_lock);
2215 sctx->stat.malloc_errors++;
2216 spin_unlock(&sctx->stat_lock);
2217}
2218
2219static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2220 u64 physical, struct btrfs_device *dev, u64 flags,
2221 u64 gen, int mirror_num, u8 *csum,
2222 u64 physical_for_dev_replace)
2223{
2224 struct scrub_block *sblock;
2225 const u32 sectorsize = sctx->fs_info->sectorsize;
2226 int index;
2227
2228 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2229 if (!sblock) {
2230 spin_lock(&sctx->stat_lock);
2231 sctx->stat.malloc_errors++;
2232 spin_unlock(&sctx->stat_lock);
2233 return -ENOMEM;
2234 }
2235
2236 /* one ref inside this function, plus one for each page added to
2237 * a bio later on */
2238 refcount_set(&sblock->refs, 1);
2239 sblock->sctx = sctx;
2240 sblock->no_io_error_seen = 1;
2241
2242 for (index = 0; len > 0; index++) {
2243 struct scrub_sector *sector;
2244 /*
2245 * Here we will allocate one page for one sector to scrub.
2246 * This is fine if PAGE_SIZE == sectorsize, but will cost
2247		 * more memory in the PAGE_SIZE > sectorsize case.
2248 */
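		/*
		 * E.g. on an assumed 64KiB PAGE_SIZE system with a 4KiB
		 * sectorsize, every 4KiB sector still gets its own 64KiB
		 * page, a 16x memory overhead compared to 4KiB pages.
		 */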
2249 u32 l = min(sectorsize, len);
2250
2251 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2252 if (!sector) {
2253leave_nomem:
2254 spin_lock(&sctx->stat_lock);
2255 sctx->stat.malloc_errors++;
2256 spin_unlock(&sctx->stat_lock);
2257 scrub_block_put(sblock);
2258 return -ENOMEM;
2259 }
2260 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2261 scrub_sector_get(sector);
2262 sblock->sectors[index] = sector;
2263 sector->sblock = sblock;
2264 sector->dev = dev;
2265 sector->flags = flags;
2266 sector->generation = gen;
2267 sector->logical = logical;
2268 sector->physical = physical;
2269 sector->physical_for_dev_replace = physical_for_dev_replace;
2270 sector->mirror_num = mirror_num;
2271 if (csum) {
2272 sector->have_csum = 1;
2273 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2274 } else {
2275 sector->have_csum = 0;
2276 }
2277 sblock->sector_count++;
2278 sector->page = alloc_page(GFP_KERNEL);
2279 if (!sector->page)
2280 goto leave_nomem;
2281 len -= l;
2282 logical += l;
2283 physical += l;
2284 physical_for_dev_replace += l;
2285 }
2286
2287 WARN_ON(sblock->sector_count == 0);
2288 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2289 /*
2290 * This case should only be hit for RAID 5/6 device replace. See
2291 * the comment in scrub_missing_raid56_pages() for details.
2292 */
2293 scrub_missing_raid56_pages(sblock);
2294 } else {
2295 for (index = 0; index < sblock->sector_count; index++) {
2296 struct scrub_sector *sector = sblock->sectors[index];
2297 int ret;
2298
2299 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2300 if (ret) {
2301 scrub_block_put(sblock);
2302 return ret;
2303 }
2304 }
2305
2306 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2307 scrub_submit(sctx);
2308 }
2309
2310 /* last one frees, either here or in bio completion for last page */
2311 scrub_block_put(sblock);
2312 return 0;
2313}
2314
2315static void scrub_bio_end_io(struct bio *bio)
2316{
2317 struct scrub_bio *sbio = bio->bi_private;
2318 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2319
2320 sbio->status = bio->bi_status;
2321 sbio->bio = bio;
2322
2323 queue_work(fs_info->scrub_workers, &sbio->work);
2324}
2325
2326static void scrub_bio_end_io_worker(struct work_struct *work)
2327{
2328 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2329 struct scrub_ctx *sctx = sbio->sctx;
2330 int i;
2331
2332 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2333 if (sbio->status) {
2334 for (i = 0; i < sbio->sector_count; i++) {
2335 struct scrub_sector *sector = sbio->sectors[i];
2336
2337 sector->io_error = 1;
2338 sector->sblock->no_io_error_seen = 0;
2339 }
2340 }
2341
2342 /* Now complete the scrub_block items that have all pages completed */
2343 for (i = 0; i < sbio->sector_count; i++) {
2344 struct scrub_sector *sector = sbio->sectors[i];
2345 struct scrub_block *sblock = sector->sblock;
2346
2347 if (atomic_dec_and_test(&sblock->outstanding_sectors))
2348 scrub_block_complete(sblock);
2349 scrub_block_put(sblock);
2350 }
2351
2352 bio_put(sbio->bio);
2353 sbio->bio = NULL;
2354 spin_lock(&sctx->list_lock);
2355 sbio->next_free = sctx->first_free;
2356 sctx->first_free = sbio->index;
2357 spin_unlock(&sctx->list_lock);
2358
2359 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2360 mutex_lock(&sctx->wr_lock);
2361 scrub_wr_submit(sctx);
2362 mutex_unlock(&sctx->wr_lock);
2363 }
2364
2365 scrub_pending_bio_dec(sctx);
2366}
2367
2368static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2369 unsigned long *bitmap,
2370 u64 start, u32 len)
2371{
2372 u64 offset;
2373 u32 nsectors;
2374 u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2375
2376 if (len >= sparity->stripe_len) {
2377 bitmap_set(bitmap, 0, sparity->nsectors);
2378 return;
2379 }
2380
2381 start -= sparity->logic_start;
2382 start = div64_u64_rem(start, sparity->stripe_len, &offset);
2383 offset = offset >> sectorsize_bits;
2384 nsectors = len >> sectorsize_bits;
2385
2386 if (offset + nsectors <= sparity->nsectors) {
2387 bitmap_set(bitmap, offset, nsectors);
2388 return;
2389 }
2390
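	/*
	 * The range wraps past the end of the full stripe.  E.g. with an
	 * assumed sparity->nsectors = 16, offset = 12 and a range of 8
	 * sectors, bits 12-15 and bits 0-3 get set below.
	 */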
2391 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2392 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2393}
2394
2395static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2396 u64 start, u32 len)
2397{
2398 __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
2399}
2400
2401static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2402 u64 start, u32 len)
2403{
2404 __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
2405}
2406
2407static void scrub_block_complete(struct scrub_block *sblock)
2408{
2409 int corrupted = 0;
2410
2411 if (!sblock->no_io_error_seen) {
2412 corrupted = 1;
2413 scrub_handle_errored_block(sblock);
2414 } else {
2415 /*
2416		 * In the dev-replace case, a block with a checksum error is
2417		 * written to the target via the repair mechanism; otherwise
2418		 * it is written here.
2419 */
2420 corrupted = scrub_checksum(sblock);
2421 if (!corrupted && sblock->sctx->is_dev_replace)
2422 scrub_write_block_to_dev_replace(sblock);
2423 }
2424
2425 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2426 u64 start = sblock->sectors[0]->logical;
2427 u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
2428 sblock->sctx->fs_info->sectorsize;
2429
2430 ASSERT(end - start <= U32_MAX);
2431 scrub_parity_mark_sectors_error(sblock->sparity,
2432 start, end - start);
2433 }
2434}
2435
2436static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2437{
2438 sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2439 list_del(&sum->list);
2440 kfree(sum);
2441}
2442
2443/*
2444 * Find the desired csum for range [logical, logical + sectorsize), and store
2445 * the csum into @csum.
2446 *
2447 * The search source is sctx->csum_list, which is a pre-populated list
2448 * storing bytenr ordered csum ranges. We're responsible for cleaning up any
2449 * range that is before @logical.
2450 *
2451 * Return 0 if there is no csum for the range.
2452 * Return 1 if there is a csum for the range, and it is copied to @csum.
2453 */
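/*
 * Worked example (assumed numbers): with a 4KiB sectorsize and a csum range
 * starting at bytenr 1MiB covering 64KiB, a lookup at logical 1MiB + 8KiB
 * copies the csum at index (8KiB >> 12) = 2 out of num_sectors = 16.
 */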
2454static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2455{
2456 bool found = false;
2457
2458 while (!list_empty(&sctx->csum_list)) {
2459 struct btrfs_ordered_sum *sum = NULL;
2460 unsigned long index;
2461 unsigned long num_sectors;
2462
2463 sum = list_first_entry(&sctx->csum_list,
2464 struct btrfs_ordered_sum, list);
2465 /* The current csum range is beyond our range, no csum found */
2466 if (sum->bytenr > logical)
2467 break;
2468
2469 /*
2470		 * The current sum is before our bytenr. Since scrub is always
2471		 * done in bytenr order, the csum will never be used anymore;
2472		 * clean it up so that later calls won't bother with the range,
2473		 * and continue searching the next range.
2474 */
2475 if (sum->bytenr + sum->len <= logical) {
2476 drop_csum_range(sctx, sum);
2477 continue;
2478 }
2479
2480 /* Now the csum range covers our bytenr, copy the csum */
2481 found = true;
2482 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2483 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2484
2485 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2486 sctx->fs_info->csum_size);
2487
2488 /* Cleanup the range if we're at the end of the csum range */
2489 if (index == num_sectors - 1)
2490 drop_csum_range(sctx, sum);
2491 break;
2492 }
2493 if (!found)
2494 return 0;
2495 return 1;
2496}
2497
2498/* scrub extent tries to collect up to 64 kB for each bio */
2499static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2500 u64 logical, u32 len,
2501 u64 physical, struct btrfs_device *dev, u64 flags,
2502 u64 gen, int mirror_num)
2503{
2504 struct btrfs_device *src_dev = dev;
2505 u64 src_physical = physical;
2506 int src_mirror = mirror_num;
2507 int ret;
2508 u8 csum[BTRFS_CSUM_SIZE];
2509 u32 blocksize;
2510
2511 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2512 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2513 blocksize = map->stripe_len;
2514 else
2515 blocksize = sctx->fs_info->sectorsize;
2516 spin_lock(&sctx->stat_lock);
2517 sctx->stat.data_extents_scrubbed++;
2518 sctx->stat.data_bytes_scrubbed += len;
2519 spin_unlock(&sctx->stat_lock);
2520 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2521 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2522 blocksize = map->stripe_len;
2523 else
2524 blocksize = sctx->fs_info->nodesize;
2525 spin_lock(&sctx->stat_lock);
2526 sctx->stat.tree_extents_scrubbed++;
2527 sctx->stat.tree_bytes_scrubbed += len;
2528 spin_unlock(&sctx->stat_lock);
2529 } else {
2530 blocksize = sctx->fs_info->sectorsize;
2531 WARN_ON(1);
2532 }
2533
2534 /*
2535	 * For the dev-replace case, @dev can be a missing device.
2536	 * Regular scrub avoids running on a missing device at all,
2537	 * as that would trigger tons of read errors.
2538	 *
2539	 * Reading from a missing device would cause the read error counts to
2540	 * increase unnecessarily.
2541	 * So here we change the read source to a good mirror.
2542 */
2543 if (sctx->is_dev_replace && !dev->bdev)
2544 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2545 &src_dev, &src_mirror);
2546 while (len) {
2547 u32 l = min(len, blocksize);
2548 int have_csum = 0;
2549
2550 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2551 /* push csums to sbio */
2552 have_csum = scrub_find_csum(sctx, logical, csum);
2553 if (have_csum == 0)
2554 ++sctx->stat.no_csum;
2555 }
2556 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2557 flags, gen, src_mirror,
2558 have_csum ? csum : NULL, physical);
2559 if (ret)
2560 return ret;
2561 len -= l;
2562 logical += l;
2563 physical += l;
2564 src_physical += l;
2565 }
2566 return 0;
2567}
2568
2569static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2570 u64 logical, u32 len,
2571 u64 physical, struct btrfs_device *dev,
2572 u64 flags, u64 gen, int mirror_num, u8 *csum)
2573{
2574 struct scrub_ctx *sctx = sparity->sctx;
2575 struct scrub_block *sblock;
2576 const u32 sectorsize = sctx->fs_info->sectorsize;
2577 int index;
2578
2579 ASSERT(IS_ALIGNED(len, sectorsize));
2580
2581 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2582 if (!sblock) {
2583 spin_lock(&sctx->stat_lock);
2584 sctx->stat.malloc_errors++;
2585 spin_unlock(&sctx->stat_lock);
2586 return -ENOMEM;
2587 }
2588
2589 /* one ref inside this function, plus one for each page added to
2590 * a bio later on */
2591 refcount_set(&sblock->refs, 1);
2592 sblock->sctx = sctx;
2593 sblock->no_io_error_seen = 1;
2594 sblock->sparity = sparity;
2595 scrub_parity_get(sparity);
2596
2597 for (index = 0; len > 0; index++) {
2598 struct scrub_sector *sector;
2599
2600 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2601 if (!sector) {
2602leave_nomem:
2603 spin_lock(&sctx->stat_lock);
2604 sctx->stat.malloc_errors++;
2605 spin_unlock(&sctx->stat_lock);
2606 scrub_block_put(sblock);
2607 return -ENOMEM;
2608 }
2609 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2610 /* For scrub block */
2611 scrub_sector_get(sector);
2612 sblock->sectors[index] = sector;
2613 /* For scrub parity */
2614 scrub_sector_get(sector);
2615		list_add_tail(&sector->list, &sparity->sectors_list);
2616 sector->sblock = sblock;
2617 sector->dev = dev;
2618 sector->flags = flags;
2619 sector->generation = gen;
2620 sector->logical = logical;
2621 sector->physical = physical;
2622 sector->mirror_num = mirror_num;
2623 if (csum) {
2624 sector->have_csum = 1;
2625 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2626 } else {
2627 sector->have_csum = 0;
2628 }
2629 sblock->sector_count++;
2630 sector->page = alloc_page(GFP_KERNEL);
2631 if (!sector->page)
2632 goto leave_nomem;
2633
2634
2635 /* Iterate over the stripe range in sectorsize steps */
2636 len -= sectorsize;
2637 logical += sectorsize;
2638 physical += sectorsize;
2639 }
2640
2641 WARN_ON(sblock->sector_count == 0);
2642 for (index = 0; index < sblock->sector_count; index++) {
2643 struct scrub_sector *sector = sblock->sectors[index];
2644 int ret;
2645
2646 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2647 if (ret) {
2648 scrub_block_put(sblock);
2649 return ret;
2650 }
2651 }
2652
2653 /* Last one frees, either here or in bio completion for last sector */
2654 scrub_block_put(sblock);
2655 return 0;
2656}
2657
2658static int scrub_extent_for_parity(struct scrub_parity *sparity,
2659 u64 logical, u32 len,
2660 u64 physical, struct btrfs_device *dev,
2661 u64 flags, u64 gen, int mirror_num)
2662{
2663 struct scrub_ctx *sctx = sparity->sctx;
2664 int ret;
2665 u8 csum[BTRFS_CSUM_SIZE];
2666 u32 blocksize;
2667
2668 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2669 scrub_parity_mark_sectors_error(sparity, logical, len);
2670 return 0;
2671 }
2672
2673 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2674 blocksize = sparity->stripe_len;
2675 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2676 blocksize = sparity->stripe_len;
2677 } else {
2678 blocksize = sctx->fs_info->sectorsize;
2679 WARN_ON(1);
2680 }
2681
2682 while (len) {
2683 u32 l = min(len, blocksize);
2684 int have_csum = 0;
2685
2686 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2687 /* push csums to sbio */
2688 have_csum = scrub_find_csum(sctx, logical, csum);
2689 if (have_csum == 0)
2690 goto skip;
2691 }
2692 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2693 flags, gen, mirror_num,
2694 have_csum ? csum : NULL);
2695 if (ret)
2696 return ret;
2697skip:
2698 len -= l;
2699 logical += l;
2700 physical += l;
2701 }
2702 return 0;
2703}
2704
2705/*
2706 * Given a physical address, this will calculate its
2707 * logical offset. If this is a parity stripe, it will return
2708 * the left-most data stripe's logical offset.
2709 *
2710 * Return 0 if it is a data stripe, 1 means parity stripe.
2711 */
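/*
 * Worked example (assuming RAID5 with num_stripes = 3, i.e. 2 data stripes,
 * and a 64KiB stripe_len): for device num = 0 at 64KiB into its dev extent,
 * last_offset = 64KiB * 2 = 128KiB.  Full stripe 1 rotates parity onto
 * device 0, so no data stripe matches and the function returns 1 with
 * *offset = 128KiB, the left-most data stripe of that full stripe.
 */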
2712static int get_raid56_logic_offset(u64 physical, int num,
2713 struct map_lookup *map, u64 *offset,
2714 u64 *stripe_start)
2715{
2716 int i;
2717 int j = 0;
2718 u64 stripe_nr;
2719 u64 last_offset;
2720 u32 stripe_index;
2721 u32 rot;
2722 const int data_stripes = nr_data_stripes(map);
2723
2724 last_offset = (physical - map->stripes[num].physical) * data_stripes;
2725 if (stripe_start)
2726 *stripe_start = last_offset;
2727
2728 *offset = last_offset;
2729 for (i = 0; i < data_stripes; i++) {
2730 *offset = last_offset + i * map->stripe_len;
2731
2732 stripe_nr = div64_u64(*offset, map->stripe_len);
2733 stripe_nr = div_u64(stripe_nr, data_stripes);
2734
2735 /* Work out the disk rotation on this stripe-set */
2736 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2737		/* Calculate which stripe this data is located on */
2738 rot += i;
2739 stripe_index = rot % map->num_stripes;
2740 if (stripe_index == num)
2741 return 0;
2742 if (stripe_index < num)
2743 j++;
2744 }
2745 *offset = last_offset + j * map->stripe_len;
2746 return 1;
2747}
2748
2749static void scrub_free_parity(struct scrub_parity *sparity)
2750{
2751 struct scrub_ctx *sctx = sparity->sctx;
2752 struct scrub_sector *curr, *next;
2753 int nbits;
2754
2755 nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
2756 if (nbits) {
2757 spin_lock(&sctx->stat_lock);
2758 sctx->stat.read_errors += nbits;
2759 sctx->stat.uncorrectable_errors += nbits;
2760 spin_unlock(&sctx->stat_lock);
2761 }
2762
2763 list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2764 list_del_init(&curr->list);
2765 scrub_sector_put(curr);
2766 }
2767
2768 kfree(sparity);
2769}
2770
2771static void scrub_parity_bio_endio_worker(struct work_struct *work)
2772{
2773 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2774 work);
2775 struct scrub_ctx *sctx = sparity->sctx;
2776
2777 scrub_free_parity(sparity);
2778 scrub_pending_bio_dec(sctx);
2779}
2780
2781static void scrub_parity_bio_endio(struct bio *bio)
2782{
2783 struct scrub_parity *sparity = bio->bi_private;
2784 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2785
2786 if (bio->bi_status)
2787 bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
2788 &sparity->dbitmap, sparity->nsectors);
2789
2790 bio_put(bio);
2791
2792 INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2793 queue_work(fs_info->scrub_parity_workers, &sparity->work);
2794}
2795
2796static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2797{
2798 struct scrub_ctx *sctx = sparity->sctx;
2799 struct btrfs_fs_info *fs_info = sctx->fs_info;
2800 struct bio *bio;
2801 struct btrfs_raid_bio *rbio;
2802 struct btrfs_io_context *bioc = NULL;
2803 u64 length;
2804 int ret;
2805
2806 if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
2807 &sparity->ebitmap, sparity->nsectors))
2808 goto out;
2809
2810 length = sparity->logic_end - sparity->logic_start;
2811
2812 btrfs_bio_counter_inc_blocked(fs_info);
2813 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2814 &length, &bioc);
2815 if (ret || !bioc || !bioc->raid_map)
2816 goto bioc_out;
2817
2818 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2819 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2820 bio->bi_private = sparity;
2821 bio->bi_end_io = scrub_parity_bio_endio;
2822
2823 rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
2824 sparity->scrub_dev,
2825 &sparity->dbitmap,
2826 sparity->nsectors);
2827 if (!rbio)
2828 goto rbio_out;
2829
2830 scrub_pending_bio_inc(sctx);
2831 raid56_parity_submit_scrub_rbio(rbio);
2832 return;
2833
2834rbio_out:
2835 bio_put(bio);
2836bioc_out:
2837 btrfs_bio_counter_dec(fs_info);
2838 btrfs_put_bioc(bioc);
2839 bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
2840 sparity->nsectors);
2841 spin_lock(&sctx->stat_lock);
2842 sctx->stat.malloc_errors++;
2843 spin_unlock(&sctx->stat_lock);
2844out:
2845 scrub_free_parity(sparity);
2846}
2847
2848static void scrub_parity_get(struct scrub_parity *sparity)
2849{
2850 refcount_inc(&sparity->refs);
2851}
2852
2853static void scrub_parity_put(struct scrub_parity *sparity)
2854{
2855 if (!refcount_dec_and_test(&sparity->refs))
2856 return;
2857
2858 scrub_parity_check_and_repair(sparity);
2859}
2860
2861/*
2862 * Return 0 if the extent item range covers any byte of the range.
2863 * Return <0 if the extent item is before @search_start.
2864 * Return >0 if the extent item is after @search_start + @search_len.
2865 */
2866static int compare_extent_item_range(struct btrfs_path *path,
2867 u64 search_start, u64 search_len)
2868{
2869 struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
2870 u64 len;
2871 struct btrfs_key key;
2872
2873 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2874 ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
2875 key.type == BTRFS_METADATA_ITEM_KEY);
2876 if (key.type == BTRFS_METADATA_ITEM_KEY)
2877 len = fs_info->nodesize;
2878 else
2879 len = key.offset;
2880
2881 if (key.objectid + len <= search_start)
2882 return -1;
2883 if (key.objectid >= search_start + search_len)
2884 return 1;
2885 return 0;
2886}
2887
2888/*
2889 * Locate one extent item which covers any byte in range
2890 * [@search_start, @search_start + @search_length)
2891 *
2892 * If the path is not initialized, we will initialize the search by doing
2893 * a btrfs_search_slot().
2894 * If the path is already initialized, we will use the path as the initial
2895 * slot, to avoid duplicated btrfs_search_slot() calls.
2896 *
2897 * NOTE: If an extent item starts before @search_start, we will still
2898 * return the extent item. This is for data extents crossing stripe boundaries.
2899 *
2900 * Return 0 if we found such extent item, and @path will point to the extent item.
2901 * Return >0 if no such extent item can be found, and @path will be released.
2902 * Return <0 if we hit a fatal error, and @path will be released.
2903 */
2904static int find_first_extent_item(struct btrfs_root *extent_root,
2905 struct btrfs_path *path,
2906 u64 search_start, u64 search_len)
2907{
2908 struct btrfs_fs_info *fs_info = extent_root->fs_info;
2909 struct btrfs_key key;
2910 int ret;
2911
2912 /* Continue using the existing path */
2913 if (path->nodes[0])
2914 goto search_forward;
2915
2916 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2917 key.type = BTRFS_METADATA_ITEM_KEY;
2918 else
2919 key.type = BTRFS_EXTENT_ITEM_KEY;
2920 key.objectid = search_start;
2921 key.offset = (u64)-1;
2922
2923 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2924 if (ret < 0)
2925 return ret;
2926
2927 ASSERT(ret > 0);
2928 /*
2929 * Here we intentionally pass 0 as @min_objectid, as there could be
2930 * an extent item starting before @search_start.
2931 */
2932 ret = btrfs_previous_extent_item(extent_root, path, 0);
2933 if (ret < 0)
2934 return ret;
2935 /*
2936 * No matter whether we have found an extent item, the next loop will
2937 * properly do every check on the key.
2938 */
2939search_forward:
2940 while (true) {
2941 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2942 if (key.objectid >= search_start + search_len)
2943 break;
2944 if (key.type != BTRFS_METADATA_ITEM_KEY &&
2945 key.type != BTRFS_EXTENT_ITEM_KEY)
2946 goto next;
2947
2948 ret = compare_extent_item_range(path, search_start, search_len);
2949 if (ret == 0)
2950 return ret;
2951 if (ret > 0)
2952 break;
2953next:
2954 path->slots[0]++;
2955 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2956 ret = btrfs_next_leaf(extent_root, path);
2957 if (ret) {
2958 /* Either no more item or fatal error */
2959 btrfs_release_path(path);
2960 return ret;
2961 }
2962 }
2963 }
2964 btrfs_release_path(path);
2965 return 1;
2966}
2967
2968static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
2969 u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
2970{
2971 struct btrfs_key key;
2972 struct btrfs_extent_item *ei;
2973
2974 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2975 ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
2976 key.type == BTRFS_EXTENT_ITEM_KEY);
2977 *extent_start_ret = key.objectid;
2978 if (key.type == BTRFS_METADATA_ITEM_KEY)
2979 *size_ret = path->nodes[0]->fs_info->nodesize;
2980 else
2981 *size_ret = key.offset;
2982 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
2983 *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
2984 *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
2985}
2986
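/*
 * E.g. (assumed numbers) with a boundary range of [64KiB, 128KiB): an extent
 * [32KiB, 96KiB) crosses the start, an extent [96KiB, 160KiB) crosses the
 * end, and an extent fully inside [64KiB, 128KiB) crosses neither.
 */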
2987static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
2988				      u64 boundary_start, u64 boundary_len)
2989{
2990 return (extent_start < boundary_start &&
2991 extent_start + extent_len > boundary_start) ||
2992	       (extent_start < boundary_start + boundary_len &&
2993		extent_start + extent_len > boundary_start + boundary_len);
2994}
2995
2996static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
2997 struct scrub_parity *sparity,
2998 struct map_lookup *map,
2999 struct btrfs_device *sdev,
3000 struct btrfs_path *path,
3001 u64 logical)
3002{
3003 struct btrfs_fs_info *fs_info = sctx->fs_info;
3004 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3005 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3006 u64 cur_logical = logical;
3007 int ret;
3008
3009 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3010
3011 /* Path must not be populated */
3012 ASSERT(!path->nodes[0]);
3013
3014 while (cur_logical < logical + map->stripe_len) {
3015 struct btrfs_io_context *bioc = NULL;
3016 struct btrfs_device *extent_dev;
3017 u64 extent_start;
3018 u64 extent_size;
3019 u64 mapped_length;
3020 u64 extent_flags;
3021 u64 extent_gen;
3022 u64 extent_physical;
3023 u64 extent_mirror_num;
3024
3025 ret = find_first_extent_item(extent_root, path, cur_logical,
3026 logical + map->stripe_len - cur_logical);
3027 /* No more extent item in this data stripe */
3028 if (ret > 0) {
3029 ret = 0;
3030 break;
3031 }
3032 if (ret < 0)
3033 break;
3034 get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3035 &extent_gen);
3036
3037 /* Metadata should not cross stripe boundaries */
3038 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3039 does_range_cross_boundary(extent_start, extent_size,
3040 logical, map->stripe_len)) {
3041 btrfs_err(fs_info,
3042 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3043 extent_start, logical);
3044 spin_lock(&sctx->stat_lock);
3045 sctx->stat.uncorrectable_errors++;
3046 spin_unlock(&sctx->stat_lock);
3047 cur_logical += extent_size;
3048 continue;
3049 }
3050
3051 /* Skip hole range which doesn't have any extent */
3052 cur_logical = max(extent_start, cur_logical);
3053
3054 /* Truncate the range inside this data stripe */
3055 extent_size = min(extent_start + extent_size,
3056 logical + map->stripe_len) - cur_logical;
3057 extent_start = cur_logical;
3058 ASSERT(extent_size <= U32_MAX);
3059
3060 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3061
3062 mapped_length = extent_size;
3063 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3064 &mapped_length, &bioc, 0);
3065 if (!ret && (!bioc || mapped_length < extent_size))
3066 ret = -EIO;
3067 if (ret) {
3068 btrfs_put_bioc(bioc);
3069 scrub_parity_mark_sectors_error(sparity, extent_start,
3070 extent_size);
3071 break;
3072 }
3073 extent_physical = bioc->stripes[0].physical;
3074 extent_mirror_num = bioc->mirror_num;
3075 extent_dev = bioc->stripes[0].dev;
3076 btrfs_put_bioc(bioc);
3077
3078 ret = btrfs_lookup_csums_range(csum_root, extent_start,
3079 extent_start + extent_size - 1,
3080 &sctx->csum_list, 1);
3081 if (ret) {
3082 scrub_parity_mark_sectors_error(sparity, extent_start,
3083 extent_size);
3084 break;
3085 }
3086
3087 ret = scrub_extent_for_parity(sparity, extent_start,
3088 extent_size, extent_physical,
3089 extent_dev, extent_flags,
3090 extent_gen, extent_mirror_num);
3091 scrub_free_csums(sctx);
3092
3093 if (ret) {
3094 scrub_parity_mark_sectors_error(sparity, extent_start,
3095 extent_size);
3096 break;
3097 }
3098
3099 cond_resched();
3100 cur_logical += extent_size;
3101 }
3102 btrfs_release_path(path);
3103 return ret;
3104}
3105
3106static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3107 struct map_lookup *map,
3108 struct btrfs_device *sdev,
3109 u64 logic_start,
3110 u64 logic_end)
3111{
3112 struct btrfs_fs_info *fs_info = sctx->fs_info;
3113 struct btrfs_path *path;
3114 u64 cur_logical;
3115 int ret;
3116 struct scrub_parity *sparity;
3117 int nsectors;
3118
3119 path = btrfs_alloc_path();
3120 if (!path) {
3121 spin_lock(&sctx->stat_lock);
3122 sctx->stat.malloc_errors++;
3123 spin_unlock(&sctx->stat_lock);
3124 return -ENOMEM;
3125 }
3126 path->search_commit_root = 1;
3127 path->skip_locking = 1;
3128
3129 ASSERT(map->stripe_len <= U32_MAX);
3130 nsectors = map->stripe_len >> fs_info->sectorsize_bits;
3131 ASSERT(nsectors <= BITS_PER_LONG);
3132 sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
3133 if (!sparity) {
3134 spin_lock(&sctx->stat_lock);
3135 sctx->stat.malloc_errors++;
3136 spin_unlock(&sctx->stat_lock);
3137 btrfs_free_path(path);
3138 return -ENOMEM;
3139 }
3140
3141 ASSERT(map->stripe_len <= U32_MAX);
3142 sparity->stripe_len = map->stripe_len;
3143 sparity->nsectors = nsectors;
3144 sparity->sctx = sctx;
3145 sparity->scrub_dev = sdev;
3146 sparity->logic_start = logic_start;
3147 sparity->logic_end = logic_end;
3148 refcount_set(&sparity->refs, 1);
3149 INIT_LIST_HEAD(&sparity->sectors_list);
3150
3151 ret = 0;
3152 for (cur_logical = logic_start; cur_logical < logic_end;
3153 cur_logical += map->stripe_len) {
3154 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3155 sdev, path, cur_logical);
3156 if (ret < 0)
3157 break;
3158 }
3159
3160 scrub_parity_put(sparity);
3161 scrub_submit(sctx);
3162 mutex_lock(&sctx->wr_lock);
3163 scrub_wr_submit(sctx);
3164 mutex_unlock(&sctx->wr_lock);
3165
3166 btrfs_free_path(path);
3167 return ret < 0 ? ret : 0;
3168}
3169
3170static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3171{
3172 if (!btrfs_is_zoned(sctx->fs_info))
3173 return;
3174
3175 sctx->flush_all_writes = true;
3176 scrub_submit(sctx);
3177 mutex_lock(&sctx->wr_lock);
3178 scrub_wr_submit(sctx);
3179 mutex_unlock(&sctx->wr_lock);
3180
3181 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3182}
3183
3184static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3185 u64 physical, u64 physical_end)
3186{
3187 struct btrfs_fs_info *fs_info = sctx->fs_info;
3188 int ret = 0;
3189
3190 if (!btrfs_is_zoned(fs_info))
3191 return 0;
3192
3193 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3194
3195 mutex_lock(&sctx->wr_lock);
3196 if (sctx->write_pointer < physical_end) {
3197 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3198 physical,
3199 sctx->write_pointer);
3200 if (ret)
3201 btrfs_err(fs_info,
3202 "zoned: failed to recover write pointer");
3203 }
3204 mutex_unlock(&sctx->wr_lock);
3205 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3206
3207 return ret;
3208}
3209
3210/*
3211 * Scrub one range which can only have a simple mirror based profile
3212 * (including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
3213 * RAID0/RAID10).
3214 *
3215 * Since we may need to handle a subset of a block group, we need the
3216 * @logical_start and @logical_length parameters.
3217 */
3218static int scrub_simple_mirror(struct scrub_ctx *sctx,
3219 struct btrfs_root *extent_root,
3220 struct btrfs_root *csum_root,
3221 struct btrfs_block_group *bg,
3222 struct map_lookup *map,
3223 u64 logical_start, u64 logical_length,
3224 struct btrfs_device *device,
3225 u64 physical, int mirror_num)
3226{
3227 struct btrfs_fs_info *fs_info = sctx->fs_info;
3228 const u64 logical_end = logical_start + logical_length;
3229	/* An artificial limit, inherited from old scrub behavior */
3230 const u32 max_length = SZ_64K;
3231 struct btrfs_path path = { 0 };
3232 u64 cur_logical = logical_start;
3233 int ret;
3234
3235 /* The range must be inside the bg */
3236 ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3237
3238 path.search_commit_root = 1;
3239 path.skip_locking = 1;
3240	/* Go through each extent item inside the logical range */
3241 while (cur_logical < logical_end) {
3242 u64 extent_start;
3243 u64 extent_len;
3244 u64 extent_flags;
3245 u64 extent_gen;
3246 u64 scrub_len;
3247
3248 /* Canceled? */
3249 if (atomic_read(&fs_info->scrub_cancel_req) ||
3250 atomic_read(&sctx->cancel_req)) {
3251 ret = -ECANCELED;
3252 break;
3253 }
3254 /* Paused? */
3255 if (atomic_read(&fs_info->scrub_pause_req)) {
3256 /* Push queued extents */
3257 sctx->flush_all_writes = true;
3258 scrub_submit(sctx);
3259 mutex_lock(&sctx->wr_lock);
3260 scrub_wr_submit(sctx);
3261 mutex_unlock(&sctx->wr_lock);
3262 wait_event(sctx->list_wait,
3263 atomic_read(&sctx->bios_in_flight) == 0);
3264 sctx->flush_all_writes = false;
3265 scrub_blocked_if_needed(fs_info);
3266 }
3267 /* Block group removed? */
3268 spin_lock(&bg->lock);
3269 if (bg->removed) {
3270 spin_unlock(&bg->lock);
3271 ret = 0;
3272 break;
3273 }
3274 spin_unlock(&bg->lock);
3275
3276 ret = find_first_extent_item(extent_root, &path, cur_logical,
3277 logical_end - cur_logical);
3278 if (ret > 0) {
3279 /* No more extent, just update the accounting */
3280 sctx->stat.last_physical = physical + logical_length;
3281 ret = 0;
3282 break;
3283 }
3284 if (ret < 0)
3285 break;
3286 get_extent_info(&path, &extent_start, &extent_len,
3287 &extent_flags, &extent_gen);
3288 /* Skip hole range which doesn't have any extent */
3289 cur_logical = max(extent_start, cur_logical);
3290
3291 /*
3292 * Scrub len has three limits:
3293 * - Extent size limit
3294 * - Scrub range limit
3295		 *   This is especially important for RAID0/RAID10, which reuse
3296		 *   this function
3297 * - Max scrub size limit
3298 */
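		/*
		 * E.g. (assumed numbers) a 1MiB extent starting at cur_logical
		 * is capped to max_length here, so it gets scrubbed in 64KiB
		 * pieces over successive loop iterations.
		 */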
3299 scrub_len = min(min(extent_start + extent_len,
3300 logical_end), cur_logical + max_length) -
3301 cur_logical;
3302
3303 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3304 ret = btrfs_lookup_csums_range(csum_root, cur_logical,
3305 cur_logical + scrub_len - 1,
3306 &sctx->csum_list, 1);
3307 if (ret)
3308 break;
3309 }
3310 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3311 does_range_cross_boundary(extent_start, extent_len,
3312 logical_start, logical_length)) {
3313 btrfs_err(fs_info,
3314"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3315 extent_start, logical_start, logical_end);
3316 spin_lock(&sctx->stat_lock);
3317 sctx->stat.uncorrectable_errors++;
3318 spin_unlock(&sctx->stat_lock);
3319 cur_logical += scrub_len;
3320 continue;
3321 }
3322 ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3323 cur_logical - logical_start + physical,
3324 device, extent_flags, extent_gen,
3325 mirror_num);
3326 scrub_free_csums(sctx);
3327 if (ret)
3328 break;
3329 if (sctx->is_dev_replace)
3330 sync_replace_for_zoned(sctx);
3331 cur_logical += scrub_len;
3332		/* Don't hold the CPU for too long */
3333 cond_resched();
3334 }
3335 btrfs_release_path(&path);
3336 return ret;
3337}
3338
3339/* Calculate the full stripe length for simple stripe based profiles */
3340static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3341{
3342 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3343 BTRFS_BLOCK_GROUP_RAID10));
3344
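	/*
	 * E.g. (assumed layouts) RAID0 with 3 stripes gives 3 * stripe_len,
	 * while RAID10 with 4 stripes and sub_stripes = 2 gives 2 * stripe_len.
	 */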
3345 return map->num_stripes / map->sub_stripes * map->stripe_len;
3346}
3347
3348/* Get the logical bytenr for the stripe */
3349static u64 simple_stripe_get_logical(struct map_lookup *map,
3350 struct btrfs_block_group *bg,
3351 int stripe_index)
3352{
3353 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3354 BTRFS_BLOCK_GROUP_RAID10));
3355 ASSERT(stripe_index < map->num_stripes);
3356
3357 /*
3358 * (stripe_index / sub_stripes) gives how many data stripes we need to
3359 * skip.
3360 */
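	/*
	 * E.g. for an assumed RAID10 layout with sub_stripes = 2, stripe
	 * indexes 2 and 3 are the two mirrors of the second device stripe,
	 * so both map to bg->start + 1 * stripe_len.
	 */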
3361 return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
3362}
3363
3364/* Get the mirror number for the stripe */
3365static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3366{
3367 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3368 BTRFS_BLOCK_GROUP_RAID10));
3369 ASSERT(stripe_index < map->num_stripes);
3370
3371	/* For RAID0 mirror_num is fixed to 1, for RAID10 it alternates 1,2,1,2... */
3372 return stripe_index % map->sub_stripes + 1;
3373}
3374
3375static int scrub_simple_stripe(struct scrub_ctx *sctx,
3376 struct btrfs_root *extent_root,
3377 struct btrfs_root *csum_root,
3378 struct btrfs_block_group *bg,
3379 struct map_lookup *map,
3380 struct btrfs_device *device,
3381 int stripe_index)
3382{
3383 const u64 logical_increment = simple_stripe_full_stripe_len(map);
3384 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3385 const u64 orig_physical = map->stripes[stripe_index].physical;
3386 const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3387 u64 cur_logical = orig_logical;
3388 u64 cur_physical = orig_physical;
3389 int ret = 0;
3390
3391 while (cur_logical < bg->start + bg->length) {
3392 /*
3393 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
3394 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
3395 * this stripe.
3396 */
3397 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3398 cur_logical, map->stripe_len, device,
3399 cur_physical, mirror_num);
3400 if (ret)
3401 return ret;
3402 /* Skip to next stripe which belongs to the target device */
3403 cur_logical += logical_increment;
3404 /* For physical offset, we just go to next stripe */
3405 cur_physical += map->stripe_len;
3406 }
3407 return ret;
3408}
3409
3410static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3411 struct btrfs_block_group *bg,
3412 struct extent_map *em,
3413 struct btrfs_device *scrub_dev,
3414 int stripe_index)
3415{
3416 struct btrfs_path *path;
3417 struct btrfs_fs_info *fs_info = sctx->fs_info;
3418 struct btrfs_root *root;
3419 struct btrfs_root *csum_root;
3420 struct blk_plug plug;
3421 struct map_lookup *map = em->map_lookup;
3422 const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3423 const u64 chunk_logical = bg->start;
3424 int ret;
3425 u64 physical = map->stripes[stripe_index].physical;
3426 const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
3427 const u64 physical_end = physical + dev_stripe_len;
3428 u64 logical;
3429 u64 logic_end;
3430 /* The logical increment after finishing one stripe */
3431 u64 increment;
3432 /* Offset inside the chunk */
3433 u64 offset;
3434 u64 stripe_logical;
3435 u64 stripe_end;
3436 int stop_loop = 0;
3437
3438 path = btrfs_alloc_path();
3439 if (!path)
3440 return -ENOMEM;
3441
3442 /*
3443	 * Work on the commit root. The related disk blocks are static as
3444	 * long as COW is applied. This means it is safe to rewrite
3445	 * them to repair disk errors without any race conditions
3446 */
3447 path->search_commit_root = 1;
3448 path->skip_locking = 1;
3449 path->reada = READA_FORWARD;
3450
3451 wait_event(sctx->list_wait,
3452 atomic_read(&sctx->bios_in_flight) == 0);
3453 scrub_blocked_if_needed(fs_info);
3454
3455 root = btrfs_extent_root(fs_info, bg->start);
3456 csum_root = btrfs_csum_root(fs_info, bg->start);
3457
3458 /*
3459	 * Collect all data csums for the stripe to avoid seeking during
3460	 * the scrub. This might currently (crc32) end up being about 1MB
3461 */
3462 blk_start_plug(&plug);
3463
3464 if (sctx->is_dev_replace &&
3465 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3466 mutex_lock(&sctx->wr_lock);
3467 sctx->write_pointer = physical;
3468 mutex_unlock(&sctx->wr_lock);
3469 sctx->flush_all_writes = true;
3470 }
3471
3472 /*
3473 * There used to be a big double loop to handle all profiles using the
3474	 * same routine, which grew larger and more gross over time.
3475	 *
3476	 * So here we handle each profile differently, so that simpler profiles
3477	 * have a simpler scrubbing function.
3478 */
3479 if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3480 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3481 /*
3482		 * The above check rules out all complex profiles; the remaining
3483		 * profiles are SINGLE|DUP|RAID1|RAID1C*, which use simple
3484		 * mirrored duplication without striping.
3485		 *
3486		 * Only @physical and @mirror_num need to be calculated using
3487		 * @stripe_index.
3488 */
3489 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3490 bg->start, bg->length, scrub_dev,
3491 map->stripes[stripe_index].physical,
3492 stripe_index + 1);
3493 offset = 0;
3494 goto out;
3495 }
3496 if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3497 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3498 scrub_dev, stripe_index);
3499 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3500 goto out;
3501 }
3502
3503 /* Only RAID56 goes through the old code */
3504 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3505 ret = 0;
3506
3507 /* Calculate the logical end of the stripe */
3508 get_raid56_logic_offset(physical_end, stripe_index,
3509 map, &logic_end, NULL);
3510 logic_end += chunk_logical;
3511
3512 /* Initialize @offset in case we need to go to out: label */
3513 get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
3514 increment = map->stripe_len * nr_data_stripes(map);
3515
3516 /*
3517	 * Due to the rotation, for RAID56 it's better to iterate the stripes
3518	 * using their physical offsets.
3519 */
3520 while (physical < physical_end) {
3521 ret = get_raid56_logic_offset(physical, stripe_index, map,
3522 &logical, &stripe_logical);
3523 logical += chunk_logical;
3524 if (ret) {
3525			/* it is a parity stripe */
3526 stripe_logical += chunk_logical;
3527 stripe_end = stripe_logical + increment;
3528 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3529 stripe_logical,
3530 stripe_end);
3531 if (ret)
3532 goto out;
3533 goto next;
3534 }
3535
3536 /*
3537		 * Now we're at a data stripe, scrub each extent in the range.
3538 *
3539 * At this stage, if we ignore the repair part, inside each data
3540 * stripe it is no different than SINGLE profile.
3541 * We can reuse scrub_simple_mirror() here, as the repair part
3542 * is still based on @mirror_num.
3543 */
3544 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3545 logical, map->stripe_len,
3546 scrub_dev, physical, 1);
3547 if (ret < 0)
3548 goto out;
3549next:
3550 logical += increment;
3551 physical += map->stripe_len;
3552 spin_lock(&sctx->stat_lock);
3553 if (stop_loop)
3554 sctx->stat.last_physical =
3555 map->stripes[stripe_index].physical + dev_stripe_len;
3556 else
3557 sctx->stat.last_physical = physical;
3558 spin_unlock(&sctx->stat_lock);
3559 if (stop_loop)
3560 break;
3561 }
3562out:
3563 /* push queued extents */
3564 scrub_submit(sctx);
3565 mutex_lock(&sctx->wr_lock);
3566 scrub_wr_submit(sctx);
3567 mutex_unlock(&sctx->wr_lock);
3568
3569 blk_finish_plug(&plug);
3570 btrfs_free_path(path);
3571
3572 if (sctx->is_dev_replace && ret >= 0) {
3573 int ret2;
3574
3575 ret2 = sync_write_pointer_for_zoned(sctx,
3576 chunk_logical + offset,
3577 map->stripes[stripe_index].physical,
3578 physical_end);
3579 if (ret2)
3580 ret = ret2;
3581 }
3582
3583 return ret < 0 ? ret : 0;
3584}
3585
3586static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3587 struct btrfs_block_group *bg,
3588 struct btrfs_device *scrub_dev,
3589 u64 dev_offset,
3590 u64 dev_extent_len)
3591{
3592 struct btrfs_fs_info *fs_info = sctx->fs_info;
3593 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3594 struct map_lookup *map;
3595 struct extent_map *em;
3596 int i;
3597 int ret = 0;
3598
3599 read_lock(&map_tree->lock);
3600 em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3601 read_unlock(&map_tree->lock);
3602
3603 if (!em) {
3604 /*
3605 * Might have been an unused block group deleted by the cleaner
3606 * kthread or relocation.
3607 */
3608 spin_lock(&bg->lock);
3609 if (!bg->removed)
3610 ret = -EINVAL;
3611 spin_unlock(&bg->lock);
3612
3613 return ret;
3614 }
3615 if (em->start != bg->start)
3616 goto out;
3617 if (em->len < dev_extent_len)
3618 goto out;
3619
3620 map = em->map_lookup;
3621 for (i = 0; i < map->num_stripes; ++i) {
3622 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3623 map->stripes[i].physical == dev_offset) {
3624 ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
3625 if (ret)
3626 goto out;
3627 }
3628 }
3629out:
3630 free_extent_map(em);
3631
3632 return ret;
3633}
3634
3635static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3636 struct btrfs_block_group *cache)
3637{
3638 struct btrfs_fs_info *fs_info = cache->fs_info;
3639 struct btrfs_trans_handle *trans;
3640
3641 if (!btrfs_is_zoned(fs_info))
3642 return 0;
3643
3644 btrfs_wait_block_group_reservations(cache);
3645 btrfs_wait_nocow_writers(cache);
3646 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3647
3648 trans = btrfs_join_transaction(root);
3649 if (IS_ERR(trans))
3650 return PTR_ERR(trans);
3651 return btrfs_commit_transaction(trans);
3652}
3653
3654static noinline_for_stack
3655int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3656 struct btrfs_device *scrub_dev, u64 start, u64 end)
3657{
3658 struct btrfs_dev_extent *dev_extent = NULL;
3659 struct btrfs_path *path;
3660 struct btrfs_fs_info *fs_info = sctx->fs_info;
3661 struct btrfs_root *root = fs_info->dev_root;
3662 u64 chunk_offset;
3663 int ret = 0;
3664 int ro_set;
3665 int slot;
3666 struct extent_buffer *l;
3667 struct btrfs_key key;
3668 struct btrfs_key found_key;
3669 struct btrfs_block_group *cache;
3670 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3671
3672 path = btrfs_alloc_path();
3673 if (!path)
3674 return -ENOMEM;
3675
3676 path->reada = READA_FORWARD;
3677 path->search_commit_root = 1;
3678 path->skip_locking = 1;
3679
3680 key.objectid = scrub_dev->devid;
3681 key.offset = 0ull;
3682 key.type = BTRFS_DEV_EXTENT_KEY;
3683
3684 while (1) {
3685 u64 dev_extent_len;
3686
3687 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3688 if (ret < 0)
3689 break;
3690 if (ret > 0) {
3691 if (path->slots[0] >=
3692 btrfs_header_nritems(path->nodes[0])) {
3693 ret = btrfs_next_leaf(root, path);
3694 if (ret < 0)
3695 break;
3696 if (ret > 0) {
3697 ret = 0;
3698 break;
3699 }
3700 } else {
3701 ret = 0;
3702 }
3703 }
3704
3705 l = path->nodes[0];
3706 slot = path->slots[0];
3707
3708 btrfs_item_key_to_cpu(l, &found_key, slot);
3709
3710 if (found_key.objectid != scrub_dev->devid)
3711 break;
3712
3713 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3714 break;
3715
3716 if (found_key.offset >= end)
3717 break;
3718
3719 if (found_key.offset < key.offset)
3720 break;
3721
3722 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3723 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3724
3725 if (found_key.offset + dev_extent_len <= start)
3726 goto skip;
3727
3728 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3729
3730 /*
3731 * get a reference on the corresponding block group to prevent
3732 * the chunk from going away while we scrub it
3733 */
3734 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3735
3736 /* some chunks are removed but not committed to disk yet,
3737 * continue scrubbing */
3738 if (!cache)
3739 goto skip;
3740
3741 ASSERT(cache->start <= chunk_offset);
3742 /*
3743 * We are using the commit root to search for device extents, so
3744 * that means we could have found a device extent item from a
3745 * block group that was deleted in the current transaction. The
3746 * logical start offset of the deleted block group, stored at
3747 * @chunk_offset, might be part of the logical address range of
3748 * a new block group (which uses different physical extents).
3749 * In this case btrfs_lookup_block_group() has returned the new
3750 * block group, and its start address is less than @chunk_offset.
3751 *
3752 * We skip such new block groups, because it's pointless to
3753 * process them, as we won't find their extents because we search
3754 * for them using the commit root of the extent tree. For a device
3755 * replace it's also fine to skip it, we won't miss copying them
3756 * to the target device because we have the write duplication
3757 * setup through the regular write path (by btrfs_map_block()),
3758 * and we have committed a transaction when we started the device
3759 * replace, right after setting up the device replace state.
3760 */
3761 if (cache->start < chunk_offset) {
3762 btrfs_put_block_group(cache);
3763 goto skip;
3764 }
3765
3766 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3767 spin_lock(&cache->lock);
3768 if (!cache->to_copy) {
3769 spin_unlock(&cache->lock);
3770 btrfs_put_block_group(cache);
3771 goto skip;
3772 }
3773 spin_unlock(&cache->lock);
3774 }
3775
3776 /*
3777 * Make sure that while we are scrubbing the corresponding block
3778 * group doesn't get its logical address and its device extents
3779 * reused for another block group, which can possibly be of a
3780 * different type and different profile. We do this to prevent
3781 * false error detections and crashes due to bogus attempts to
3782 * repair extents.
3783 */
3784 spin_lock(&cache->lock);
3785 if (cache->removed) {
3786 spin_unlock(&cache->lock);
3787 btrfs_put_block_group(cache);
3788 goto skip;
3789 }
3790 btrfs_freeze_block_group(cache);
3791 spin_unlock(&cache->lock);
3792
3793 /*
3794		 * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3795 * to avoid deadlock caused by:
3796 * btrfs_inc_block_group_ro()
3797 * -> btrfs_wait_for_commit()
3798 * -> btrfs_commit_transaction()
3799 * -> btrfs_scrub_pause()
3800 */
3801 scrub_pause_on(fs_info);
3802
3803 /*
3804 * Don't do chunk preallocation for scrub.
3805 *
3806 * This is especially important for SYSTEM bgs, or we can hit
3807 * -EFBIG from btrfs_finish_chunk_alloc() like:
3808 * 1. The only SYSTEM bg is marked RO.
3809 * Since SYSTEM bg is small, that's pretty common.
3810		 * 2. A new SYSTEM bg will be allocated,
3811		 *    because the regular path (which preallocates chunks) allocates a new chunk.
3812		 * 3. The new SYSTEM bg is empty and will get cleaned up.
3813		 *    Before the cleanup really happens, it's marked RO again.
3814		 * 4. The empty SYSTEM bg gets scrubbed.
3815		 *    We go back to 2.
3816 *
3817		 * This can easily boost the number of SYSTEM chunks if the cleaner
3818		 * thread can't be triggered fast enough, and use up all the space
3819		 * of btrfs_super_block::sys_chunk_array.
3820 *
3821 * While for dev replace, we need to try our best to mark block
3822 * group RO, to prevent race between:
3823 * - Write duplication
3824 * Contains latest data
3825 * - Scrub copy
3826 * Contains data from commit tree
3827 *
3828		 * If the target block group is not marked RO, nocow writes can
3829		 * be overwritten by the scrub copy, causing data corruption.
3830 * So for dev-replace, it's not allowed to continue if a block
3831 * group is not RO.
3832 */
3833 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3834 if (!ret && sctx->is_dev_replace) {
3835 ret = finish_extent_writes_for_zoned(root, cache);
3836 if (ret) {
3837 btrfs_dec_block_group_ro(cache);
3838 scrub_pause_off(fs_info);
3839 btrfs_put_block_group(cache);
3840 break;
3841 }
3842 }
3843
3844 if (ret == 0) {
3845 ro_set = 1;
3846 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3847 /*
3848			 * btrfs_inc_block_group_ro() returns -ENOSPC when it
3849			 * fails to create a new chunk for metadata.
3850			 * This is not a problem for scrub, because
3851			 * metadata is always COWed, and our scrub has paused
3852			 * transaction commits.
3853 */
3854 ro_set = 0;
3855 } else if (ret == -ETXTBSY) {
3856 btrfs_warn(fs_info,
3857 "skipping scrub of block group %llu due to active swapfile",
3858 cache->start);
3859 scrub_pause_off(fs_info);
3860 ret = 0;
3861 goto skip_unfreeze;
3862 } else {
3863 btrfs_warn(fs_info,
3864 "failed setting block group ro: %d", ret);
3865 btrfs_unfreeze_block_group(cache);
3866 btrfs_put_block_group(cache);
3867 scrub_pause_off(fs_info);
3868 break;
3869 }
3870
3871 /*
3872		 * Now the target block group is marked RO, wait for nocow writes to
3873		 * finish before dev-replace.
3874		 * COW is fine, as COW never overwrites extents in the commit tree.
3875 */
3876 if (sctx->is_dev_replace) {
3877 btrfs_wait_nocow_writers(cache);
3878 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3879 cache->length);
3880 }
3881
3882 scrub_pause_off(fs_info);
3883 down_write(&dev_replace->rwsem);
3884 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3885 dev_replace->cursor_left = found_key.offset;
3886 dev_replace->item_needs_writeback = 1;
3887 up_write(&dev_replace->rwsem);
3888
3889 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3890 dev_extent_len);
3891
3892 /*
3893		 * Flush and submit all pending read and write bios, then
3894		 * wait for them.
3895 * Note that in the dev replace case, a read request causes
3896 * write requests that are submitted in the read completion
3897 * worker. Therefore in the current situation, it is required
3898 * that all write requests are flushed, so that all read and
3899 * write requests are really completed when bios_in_flight
3900 * changes to 0.
3901 */
3902 sctx->flush_all_writes = true;
3903 scrub_submit(sctx);
3904 mutex_lock(&sctx->wr_lock);
3905 scrub_wr_submit(sctx);
3906 mutex_unlock(&sctx->wr_lock);
3907
3908 wait_event(sctx->list_wait,
3909 atomic_read(&sctx->bios_in_flight) == 0);
3910
3911 scrub_pause_on(fs_info);
3912
3913 /*
3914		 * This must be done before we decrease @scrub_paused, to
3915		 * make sure we don't block transaction commit while we
3916		 * are waiting for the pending workers to finish.
3917 */
3918 wait_event(sctx->list_wait,
3919 atomic_read(&sctx->workers_pending) == 0);
3920 sctx->flush_all_writes = false;
3921
3922 scrub_pause_off(fs_info);
3923
3924 if (sctx->is_dev_replace &&
3925 !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3926 cache, found_key.offset))
3927 ro_set = 0;
3928
3929 down_write(&dev_replace->rwsem);
3930 dev_replace->cursor_left = dev_replace->cursor_right;
3931 dev_replace->item_needs_writeback = 1;
3932 up_write(&dev_replace->rwsem);
3933
3934 if (ro_set)
3935 btrfs_dec_block_group_ro(cache);
3936
3937 /*
3938 * We might have prevented the cleaner kthread from deleting
3939 * this block group if it was already unused because we raced
3940 * and set it to RO mode first. So add it back to the unused
3941 * list, otherwise it might not ever be deleted unless a manual
3942 * balance is triggered or it becomes used and unused again.
3943 */
3944 spin_lock(&cache->lock);
3945 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3946 cache->used == 0) {
3947 spin_unlock(&cache->lock);
3948 if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3949 btrfs_discard_queue_work(&fs_info->discard_ctl,
3950 cache);
3951 else
3952 btrfs_mark_bg_unused(cache);
3953 } else {
3954 spin_unlock(&cache->lock);
3955 }
3956skip_unfreeze:
3957 btrfs_unfreeze_block_group(cache);
3958 btrfs_put_block_group(cache);
3959 if (ret)
3960 break;
3961 if (sctx->is_dev_replace &&
3962 atomic64_read(&dev_replace->num_write_errors) > 0) {
3963 ret = -EIO;
3964 break;
3965 }
3966 if (sctx->stat.malloc_errors > 0) {
3967 ret = -ENOMEM;
3968 break;
3969 }
3970skip:
3971 key.offset = found_key.offset + dev_extent_len;
3972 btrfs_release_path(path);
3973 }
3974
3975 btrfs_free_path(path);
3976
3977 return ret;
3978}
3979
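/*
 * Scrub all copies of the super block on @scrub_dev that fall within the
 * committed device size.  The expected generation is the last committed
 * transaction, except for seed devices, which carry their own generation.
 */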
3980static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3981 struct btrfs_device *scrub_dev)
3982{
3983 int i;
3984 u64 bytenr;
3985 u64 gen;
3986 int ret;
3987 struct btrfs_fs_info *fs_info = sctx->fs_info;
3988
3989 if (BTRFS_FS_ERROR(fs_info))
3990 return -EROFS;
3991
3992	/* Seed devices of a new filesystem have their own generation. */
3993 if (scrub_dev->fs_devices != fs_info->fs_devices)
3994 gen = scrub_dev->generation;
3995 else
3996 gen = fs_info->last_trans_committed;
3997
3998 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3999 bytenr = btrfs_sb_offset(i);
4000 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4001 scrub_dev->commit_total_bytes)
4002 break;
4003 if (!btrfs_check_super_location(scrub_dev, bytenr))
4004 continue;
4005
4006 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4007 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4008 NULL, bytenr);
4009 if (ret)
4010 return ret;
4011 }
4012 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4013
4014 return 0;
4015}
4016
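/*
 * Drop one reference on the scrub workqueues (pairs with
 * scrub_workers_get()).  The last reference detaches the workqueue
 * pointers under scrub_lock and destroys the workqueues after the lock is
 * released, so destroy_workqueue() is never called with scrub_lock held.
 */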
4017static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4018{
4019 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4020 &fs_info->scrub_lock)) {
4021 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4022 struct workqueue_struct *scrub_wr_comp =
4023 fs_info->scrub_wr_completion_workers;
4024 struct workqueue_struct *scrub_parity =
4025 fs_info->scrub_parity_workers;
4026
4027 fs_info->scrub_workers = NULL;
4028 fs_info->scrub_wr_completion_workers = NULL;
4029 fs_info->scrub_parity_workers = NULL;
4030 mutex_unlock(&fs_info->scrub_lock);
4031
4032 if (scrub_workers)
4033 destroy_workqueue(scrub_workers);
4034 if (scrub_wr_comp)
4035 destroy_workqueue(scrub_wr_comp);
4036 if (scrub_parity)
4037 destroy_workqueue(scrub_parity);
4038 }
4039}
4040
4041/*
4042 * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
4043 */
4044static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4045 int is_dev_replace)
4046{
4047 struct workqueue_struct *scrub_workers = NULL;
4048 struct workqueue_struct *scrub_wr_comp = NULL;
4049 struct workqueue_struct *scrub_parity = NULL;
4050 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4051 int max_active = fs_info->thread_pool_size;
4052 int ret = -ENOMEM;
4053
4054 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4055 return 0;
4056
4057 scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4058 is_dev_replace ? 1 : max_active);
4059 if (!scrub_workers)
4060 goto fail_scrub_workers;
4061
4062 scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4063 if (!scrub_wr_comp)
4064 goto fail_scrub_wr_completion_workers;
4065
4066 scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4067 if (!scrub_parity)
4068 goto fail_scrub_parity_workers;
4069
4070 mutex_lock(&fs_info->scrub_lock);
4071 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4072 ASSERT(fs_info->scrub_workers == NULL &&
4073 fs_info->scrub_wr_completion_workers == NULL &&
4074 fs_info->scrub_parity_workers == NULL);
4075 fs_info->scrub_workers = scrub_workers;
4076 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4077 fs_info->scrub_parity_workers = scrub_parity;
4078 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4079 mutex_unlock(&fs_info->scrub_lock);
4080 return 0;
4081 }
4082 /* Other thread raced in and created the workers for us */
4083 refcount_inc(&fs_info->scrub_workers_refcnt);
4084 mutex_unlock(&fs_info->scrub_lock);
4085
4086 ret = 0;
4087 destroy_workqueue(scrub_parity);
4088fail_scrub_parity_workers:
4089 destroy_workqueue(scrub_wr_comp);
4090fail_scrub_wr_completion_workers:
4091 destroy_workqueue(scrub_workers);
4092fail_scrub_workers:
4093 return ret;
4094}
4095
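/*
 * Entry point for scrub and device replace: scrub the byte range
 * [start, end) of the device @devid.  On return the current statistics are
 * copied to @progress.  Returns -EAGAIN if the filesystem is closing,
 * -EINVAL for unsupported node/sector sizes, -ENODEV if the device cannot
 * be found, -EROFS for a read-write scrub of a non-writable device,
 * -EINPROGRESS if a scrub or device replace is already running on the
 * device, or the scrub result otherwise.
 *
 * A rough sketch of a plain scrub invocation (the real caller is the scrub
 * ioctl handler; the btrfs_ioctl_scrub_args field names below come from
 * the uapi header and only serve as an illustration):
 *
 *	ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
 *			      &sa->progress,
 *			      !!(sa->flags & BTRFS_SCRUB_READONLY), 0);
 */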
4096int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4097 u64 end, struct btrfs_scrub_progress *progress,
4098 int readonly, int is_dev_replace)
4099{
4100 struct btrfs_dev_lookup_args args = { .devid = devid };
4101 struct scrub_ctx *sctx;
4102 int ret;
4103 struct btrfs_device *dev;
4104 unsigned int nofs_flag;
4105
4106 if (btrfs_fs_closing(fs_info))
4107 return -EAGAIN;
4108
4109 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4110 /*
4111		 * In this case scrub is unable to calculate the checksum
4112		 * the way it is implemented. Do not handle this
4113		 * situation at all because it will never happen.
4114 */
4115 btrfs_err(fs_info,
4116 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4117 fs_info->nodesize,
4118 BTRFS_STRIPE_LEN);
4119 return -EINVAL;
4120 }
4121
4122 if (fs_info->nodesize >
4123 SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
4124 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
4125 /*
4126		 * Would exhaust the array bounds of the sectors member in
4127		 * struct scrub_block.
4128 */
4129 btrfs_err(fs_info,
4130"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
4131 fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
4132 fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
4133 return -EINVAL;
4134 }
4135
4136 /* Allocate outside of device_list_mutex */
4137 sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4138 if (IS_ERR(sctx))
4139 return PTR_ERR(sctx);
4140
4141 ret = scrub_workers_get(fs_info, is_dev_replace);
4142 if (ret)
4143 goto out_free_ctx;
4144
4145 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4146 dev = btrfs_find_device(fs_info->fs_devices, &args);
4147 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4148 !is_dev_replace)) {
4149 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4150 ret = -ENODEV;
4151 goto out;
4152 }
4153
4154 if (!is_dev_replace && !readonly &&
4155 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4156 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4157 btrfs_err_in_rcu(fs_info,
4158 "scrub on devid %llu: filesystem on %s is not writable",
4159 devid, rcu_str_deref(dev->name));
4160 ret = -EROFS;
4161 goto out;
4162 }
4163
4164 mutex_lock(&fs_info->scrub_lock);
4165 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4166 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4167 mutex_unlock(&fs_info->scrub_lock);
4168 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4169 ret = -EIO;
4170 goto out;
4171 }
4172
4173 down_read(&fs_info->dev_replace.rwsem);
4174 if (dev->scrub_ctx ||
4175 (!is_dev_replace &&
4176 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4177 up_read(&fs_info->dev_replace.rwsem);
4178 mutex_unlock(&fs_info->scrub_lock);
4179 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4180 ret = -EINPROGRESS;
4181 goto out;
4182 }
4183 up_read(&fs_info->dev_replace.rwsem);
4184
4185 sctx->readonly = readonly;
4186 dev->scrub_ctx = sctx;
4187 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4188
4189 /*
4190	 * By checking @scrub_pause_req here, we can avoid a
4191	 * race between transaction commit and scrubbing.
4192 */
4193 __scrub_blocked_if_needed(fs_info);
4194 atomic_inc(&fs_info->scrubs_running);
4195 mutex_unlock(&fs_info->scrub_lock);
4196
4197 /*
4198 * In order to avoid deadlock with reclaim when there is a transaction
4199 * trying to pause scrub, make sure we use GFP_NOFS for all the
4200	 * allocations done by scrub_sectors() and scrub_sectors_for_parity()
4201 * invoked by our callees. The pausing request is done when the
4202 * transaction commit starts, and it blocks the transaction until scrub
4203	 * is paused (done at specific points in scrub_stripe() or right above,
4204	 * before incrementing fs_info->scrubs_running).
4205 */
4206 nofs_flag = memalloc_nofs_save();
4207 if (!is_dev_replace) {
4208 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4209 /*
4210		 * By holding the device list mutex, we stay serialized against
4211		 * the super block writes kicked off by log tree sync.
4212 */
4213 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4214 ret = scrub_supers(sctx, dev);
4215 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4216 }
4217
4218 if (!ret)
4219 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4220 memalloc_nofs_restore(nofs_flag);
4221
4222 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4223 atomic_dec(&fs_info->scrubs_running);
4224 wake_up(&fs_info->scrub_pause_wait);
4225
4226 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4227
4228 if (progress)
4229 memcpy(progress, &sctx->stat, sizeof(*progress));
4230
4231 if (!is_dev_replace)
4232 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4233 ret ? "not finished" : "finished", devid, ret);
4234
4235 mutex_lock(&fs_info->scrub_lock);
4236 dev->scrub_ctx = NULL;
4237 mutex_unlock(&fs_info->scrub_lock);
4238
4239 scrub_workers_put(fs_info);
4240 scrub_put_ctx(sctx);
4241
4242 return ret;
4243out:
4244 scrub_workers_put(fs_info);
4245out_free_ctx:
4246 scrub_free_ctx(sctx);
4247
4248 return ret;
4249}
4250
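/*
 * Ask all running scrubs to pause and wait until every one of them has
 * actually reached its pause point.  Pairs with btrfs_scrub_continue().
 * The typical caller is the transaction commit path, roughly:
 *
 *	btrfs_scrub_pause(fs_info);
 *	(critical section that must not race with scrub)
 *	btrfs_scrub_continue(fs_info);
 */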
4251void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4252{
4253 mutex_lock(&fs_info->scrub_lock);
4254 atomic_inc(&fs_info->scrub_pause_req);
4255 while (atomic_read(&fs_info->scrubs_paused) !=
4256 atomic_read(&fs_info->scrubs_running)) {
4257 mutex_unlock(&fs_info->scrub_lock);
4258 wait_event(fs_info->scrub_pause_wait,
4259 atomic_read(&fs_info->scrubs_paused) ==
4260 atomic_read(&fs_info->scrubs_running));
4261 mutex_lock(&fs_info->scrub_lock);
4262 }
4263 mutex_unlock(&fs_info->scrub_lock);
4264}
4265
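/* Resume scrubs previously paused by btrfs_scrub_pause(). */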
4266void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4267{
4268 atomic_dec(&fs_info->scrub_pause_req);
4269 wake_up(&fs_info->scrub_pause_wait);
4270}
4271
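/*
 * Cancel all running scrubs on the filesystem and wait for them to finish.
 * Returns -ENOTCONN if no scrub was running.
 */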
4272int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4273{
4274 mutex_lock(&fs_info->scrub_lock);
4275 if (!atomic_read(&fs_info->scrubs_running)) {
4276 mutex_unlock(&fs_info->scrub_lock);
4277 return -ENOTCONN;
4278 }
4279
4280 atomic_inc(&fs_info->scrub_cancel_req);
4281 while (atomic_read(&fs_info->scrubs_running)) {
4282 mutex_unlock(&fs_info->scrub_lock);
4283 wait_event(fs_info->scrub_pause_wait,
4284 atomic_read(&fs_info->scrubs_running) == 0);
4285 mutex_lock(&fs_info->scrub_lock);
4286 }
4287 atomic_dec(&fs_info->scrub_cancel_req);
4288 mutex_unlock(&fs_info->scrub_lock);
4289
4290 return 0;
4291}
4292
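/*
 * Cancel the scrub running on @dev, if any, and wait until its scrub
 * context has been detached.  Returns -ENOTCONN if no scrub was running
 * on the device.
 */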
4293int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4294{
4295 struct btrfs_fs_info *fs_info = dev->fs_info;
4296 struct scrub_ctx *sctx;
4297
4298 mutex_lock(&fs_info->scrub_lock);
4299 sctx = dev->scrub_ctx;
4300 if (!sctx) {
4301 mutex_unlock(&fs_info->scrub_lock);
4302 return -ENOTCONN;
4303 }
4304 atomic_inc(&sctx->cancel_req);
4305 while (dev->scrub_ctx) {
4306 mutex_unlock(&fs_info->scrub_lock);
4307 wait_event(fs_info->scrub_pause_wait,
4308 dev->scrub_ctx == NULL);
4309 mutex_lock(&fs_info->scrub_lock);
4310 }
4311 mutex_unlock(&fs_info->scrub_lock);
4312
4313 return 0;
4314}
4315
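/*
 * Copy the statistics of the scrub currently running on @devid into
 * @progress.  Returns -ENODEV if the device does not exist and -ENOTCONN
 * if no scrub is running on it.
 */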
4316int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4317 struct btrfs_scrub_progress *progress)
4318{
4319 struct btrfs_dev_lookup_args args = { .devid = devid };
4320 struct btrfs_device *dev;
4321 struct scrub_ctx *sctx = NULL;
4322
4323 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4324 dev = btrfs_find_device(fs_info->fs_devices, &args);
4325 if (dev)
4326 sctx = dev->scrub_ctx;
4327 if (sctx)
4328 memcpy(progress, &sctx->stat, sizeof(*progress));
4329 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4330
4331 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4332}
4333
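/*
 * Map @extent_logical and report the physical offset, device and mirror
 * number of the first stripe of a usable copy.  If the mapping fails or
 * the stripe's device has no bdev, the output parameters are left
 * untouched (callers presumably rely on their initial values).
 */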
4334static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4335 u64 extent_logical, u32 extent_len,
4336 u64 *extent_physical,
4337 struct btrfs_device **extent_dev,
4338 int *extent_mirror_num)
4339{
4340 u64 mapped_length;
4341 struct btrfs_io_context *bioc = NULL;
4342 int ret;
4343
4344 mapped_length = extent_len;
4345 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4346 &mapped_length, &bioc, 0);
4347 if (ret || !bioc || mapped_length < extent_len ||
4348 !bioc->stripes[0].dev->bdev) {
4349 btrfs_put_bioc(bioc);
4350 return;
4351 }
4352
4353 *extent_physical = bioc->stripes[0].physical;
4354 *extent_mirror_num = bioc->mirror_num;
4355 *extent_dev = bioc->stripes[0].dev;
4356 btrfs_put_bioc(bioc);
4357}