// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "discard.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies their checksums. In case a bad
 * checksum is found or the extent cannot be read, good data will be written
 * back if any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_block;
struct scrub_ctx;

/*
 * The following two values only influence the performance.
 *
 * The first one configures an upper limit for the number of (dynamically
 * allocated) pages that are added to a bio. The second one configures the
 * number of parallel and outstanding I/O operations.
 */
#define SCRUB_PAGES_PER_BIO	32	/* 128KiB per bio for x86 */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MiB per device in flight for x86 */

/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that is to be supported.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)

struct scrub_recover {
	refcount_t		refs;
	struct btrfs_io_context	*bioc;
	u64			map_length;
};

struct scrub_page {
	struct scrub_block	*sblock;
	struct page		*page;
	struct btrfs_device	*dev;
	struct list_head	list;
	u64			flags;  /* extent flags */
	u64			generation;
	u64			logical;
	u64			physical;
	u64			physical_for_dev_replace;
	atomic_t		refs;
	u8			mirror_num;
	unsigned int		have_csum:1;
	unsigned int		io_error:1;
	u8			csum[BTRFS_CSUM_SIZE];

	struct scrub_recover	*recover;
};

struct scrub_bio {
	int			index;
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	struct bio		*bio;
	blk_status_t		status;
	u64			logical;
	u64			physical;
	struct scrub_page	*pagev[SCRUB_PAGES_PER_BIO];
	int			page_count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_block {
	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
	int			page_count;
	atomic_t		outstanding_pages;
	refcount_t		refs; /* free mem on transition to zero */
	struct scrub_ctx	*sctx;
	struct scrub_parity	*sparity;
	struct {
		unsigned int	header_error:1;
		unsigned int	checksum_error:1;
		unsigned int	no_io_error_seen:1;
		unsigned int	generation_error:1; /* also sets header_error */

		/*
		 * The following is for the data used to check parity.
		 * It is for the data with checksum.
		 */
		unsigned int	data_corrected:1;
	};
	struct btrfs_work	work;
};

/* Used for the chunks with parity stripe such as RAID5/6 */
struct scrub_parity {
	struct scrub_ctx	*sctx;

	struct btrfs_device	*scrub_dev;

	u64			logic_start;

	u64			logic_end;

	int			nsectors;

	u32			stripe_len;

	refcount_t		refs;

	struct list_head	spages;

	/* Work of parity check and repair */
	struct btrfs_work	work;

	/* Mark the parity blocks which have data */
	unsigned long		*dbitmap;

	/*
	 * Mark the parity blocks which have data, but for which errors
	 * happened when reading or checking that data
	 */
	unsigned long		*ebitmap;

	unsigned long		bitmap[];
};

struct scrub_ctx {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
	struct btrfs_fs_info	*fs_info;
	int			first_free;
	int			curr;
	atomic_t		bios_in_flight;
	atomic_t		workers_pending;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	int			pages_per_bio;

	/* State of IO submission throttling affecting the associated device */
	ktime_t			throttle_deadline;
	u64			throttle_sent;

	int			is_dev_replace;
	u64			write_pointer;

	struct scrub_bio	*wr_curr_bio;
	struct mutex		wr_lock;
	struct btrfs_device	*wr_tgtdev;
	bool			flush_all_writes;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t		refs;
};

struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	u64			physical;
	u64			logical;
	struct btrfs_device	*dev;
};

struct full_stripe_lock {
	struct rb_node		node;
	u64			logical;
	u64			refs;
	struct mutex		mutex;
};

static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				     struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock,
				int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum,
		       u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
			       u64 extent_logical, u32 extent_len,
			       u64 *extent_physical,
			       struct btrfs_device **extent_dev,
			       int *extent_mirror_num);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static void scrub_put_ctx(struct scrub_ctx *sctx);

static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
{
	return spage->recover &&
	       (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}

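/*
 * The following two helpers account for bios submitted on behalf of a scrub
 * context. They are used in pairs: scrub_pending_bio_inc() takes a reference
 * on the context, and the matching scrub_pending_bio_dec() in the completion
 * path drops it again after waking up any waiters on list_wait.
 */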
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
	refcount_inc(&sctx->refs);
	atomic_inc(&sctx->bios_in_flight);
}

static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
	atomic_dec(&sctx->bios_in_flight);
	wake_up(&sctx->list_wait);
	scrub_put_ctx(sctx);
}

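/*
 * Wait until all pause requests are gone. Drops and re-takes scrub_lock
 * around the wait so that the task requesting the pause can make progress.
 * The caller must hold fs_info->scrub_lock.
 */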
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

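/*
 * scrub_pause_on() and scrub_pause_off() are used in pairs: the first one
 * announces that this scrub is paused, the second one waits until any pause
 * request has been withdrawn before marking the scrub as running again.
 */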
static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}

/*
 * Insert new full stripe lock into full stripe locks tree
 *
 * Return pointer to existing or newly inserted full_stripe_lock structure if
 * everything works well.
 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory.
 *
 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
 * function.
 */
static struct full_stripe_lock *insert_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct full_stripe_lock *entry;
	struct full_stripe_lock *ret;

	lockdep_assert_held(&locks_root->lock);

	p = &locks_root->root.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical) {
			p = &(*p)->rb_left;
		} else if (fstripe_logical > entry->logical) {
			p = &(*p)->rb_right;
		} else {
			entry->refs++;
			return entry;
		}
	}

	/* Insert new lock */
	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
	if (!ret)
		return ERR_PTR(-ENOMEM);
	ret->logical = fstripe_logical;
	ret->refs = 1;
	mutex_init(&ret->mutex);

	rb_link_node(&ret->node, parent, p);
	rb_insert_color(&ret->node, &locks_root->root);
	return ret;
}

/*
 * Search for a full stripe lock of a block group
 *
 * Return pointer to existing full stripe lock if found
 * Return NULL if not found
 */
static struct full_stripe_lock *search_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node *node;
	struct full_stripe_lock *entry;

	lockdep_assert_held(&locks_root->lock);

	node = locks_root->root.rb_node;
	while (node) {
		entry = rb_entry(node, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical)
			node = node->rb_left;
		else if (fstripe_logical > entry->logical)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * Helper to get full stripe logical from a normal bytenr.
 *
 * Caller must ensure @cache is a RAID56 block group.
 */
static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
{
	u64 ret;

	/*
	 * Due to chunk item size limit, full stripe length should not be
	 * larger than U32_MAX. Just a sanity check here.
	 */
	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);

	/*
	 * round_down() can only handle power of 2, while RAID56 full
	 * stripe length can be 64KiB * n, so we need to manually round down.
	 */
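	/*
	 * A worked example with hypothetical numbers: for a RAID5 chunk with
	 * three data stripes, full_stripe_len is 3 * 64KiB = 192KiB, which is
	 * not a power of 2. With cache->start = 1G and bytenr = 1G + 500KiB,
	 * the offset into the chunk is 500KiB and 500KiB / 192KiB = 2 (integer
	 * division), so the containing full stripe starts at
	 * 1G + 2 * 192KiB = 1G + 384KiB.
	 */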
	ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
			cache->full_stripe_len + cache->start;
	return ret;
}

/*
 * Lock a full stripe to avoid concurrency of recovery and read
 *
 * It's only used for profiles with parities (RAID5/6), for other profiles it
 * does nothing.
 *
 * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
 * The caller must call unlock_full_stripe() from the same context.
 *
 * Return <0 if an error is encountered.
 */
static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			    bool *locked_ret)
{
	struct btrfs_block_group *bg_cache;
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *existing;
	u64 fstripe_start;
	int ret = 0;

	*locked_ret = false;
	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}

	/* Profiles not based on parity don't need full stripe lock */
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;
	locks_root = &bg_cache->full_stripe_locks_root;

	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	/* Now insert the full stripe lock */
	mutex_lock(&locks_root->lock);
	existing = insert_full_stripe_lock(locks_root, fstripe_start);
	mutex_unlock(&locks_root->lock);
	if (IS_ERR(existing)) {
		ret = PTR_ERR(existing);
		goto out;
	}
	mutex_lock(&existing->mutex);
	*locked_ret = true;
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}

/*
 * Unlock a full stripe.
 *
 * NOTE: The caller must ensure it is the same context that called the
 * corresponding lock_full_stripe().
 *
 * Return 0 if we unlocked the full stripe without problems.
 * Return <0 on error.
 */
static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			      bool locked)
{
	struct btrfs_block_group *bg_cache;
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *fstripe_lock;
	u64 fstripe_start;
	bool freeit = false;
	int ret = 0;

	/* If we didn't acquire full stripe lock, no need to continue */
	if (!locked)
		return 0;

	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;

	locks_root = &bg_cache->full_stripe_locks_root;
	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	mutex_lock(&locks_root->lock);
	fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
	/* Unpaired unlock_full_stripe() detected */
	if (!fstripe_lock) {
		WARN_ON(1);
		ret = -ENOENT;
		mutex_unlock(&locks_root->lock);
		goto out;
	}

	if (fstripe_lock->refs == 0) {
		WARN_ON(1);
		btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
			fstripe_lock->logical);
	} else {
		fstripe_lock->refs--;
	}

	if (fstripe_lock->refs == 0) {
		rb_erase(&fstripe_lock->node, &locks_root->root);
		freeit = true;
	}
	mutex_unlock(&locks_root->lock);

	mutex_unlock(&fstripe_lock->mutex);
	if (freeit)
		kfree(fstripe_lock);
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}

static void scrub_free_csums(struct scrub_ctx *sctx)
{
	while (!list_empty(&sctx->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sctx->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	/* this can happen when scrub is cancelled */
	if (sctx->curr != -1) {
		struct scrub_bio *sbio = sctx->bios[sctx->curr];

		for (i = 0; i < sbio->page_count; i++) {
			WARN_ON(!sbio->pagev[i]->page);
			scrub_block_put(sbio->pagev[i]->sblock);
		}
		bio_put(sbio->bio);
	}

	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio = sctx->bios[i];

		if (!sbio)
			break;
		kfree(sbio);
	}

	kfree(sctx->wr_curr_bio);
	scrub_free_csums(sctx);
	kfree(sctx);
}

static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (refcount_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}

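/*
 * Allocate and initialize a scrub context: the fixed array of scrub_bios is
 * pre-allocated here and chained into a free list via next_free, and the
 * write path (wr_*) members are only wired up to a target device when the
 * context is created for a dev-replace operation.
 */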
static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
		struct btrfs_fs_info *fs_info, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int i;

	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
	if (!sctx)
		goto nomem;
	refcount_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->pages_per_bio = SCRUB_PAGES_PER_BIO;
	sctx->curr = -1;
	sctx->fs_info = fs_info;
	INIT_LIST_HEAD(&sctx->csum_list);
	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
		if (!sbio)
			goto nomem;
		sctx->bios[i] = sbio;

		sbio->index = i;
		sbio->sctx = sctx;
		sbio->page_count = 0;
		btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
				NULL);

		if (i != SCRUB_BIOS_PER_SCTX - 1)
			sctx->bios[i]->next_free = i + 1;
		else
			sctx->bios[i]->next_free = -1;
	}
	sctx->first_free = 0;
	atomic_set(&sctx->bios_in_flight, 0);
	atomic_set(&sctx->workers_pending, 0);
	atomic_set(&sctx->cancel_req, 0);

	spin_lock_init(&sctx->list_lock);
	spin_lock_init(&sctx->stat_lock);
	init_waitqueue_head(&sctx->list_wait);
	sctx->throttle_deadline = 0;

	WARN_ON(sctx->wr_curr_bio != NULL);
	mutex_init(&sctx->wr_lock);
	sctx->wr_curr_bio = NULL;
	if (is_dev_replace) {
		WARN_ON(!fs_info->dev_replace.tgtdev);
		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
		sctx->flush_all_writes = false;
	}

	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}

static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
				     void *warn_ctx)
{
	u32 nlink;
	int ret;
	int i;
	unsigned nofs_flag;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key key;

	local_root = btrfs_get_fs_root(fs_info, root, true);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * This makes the path point to (inum INODE_ITEM ioff)
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_put_root(local_root);
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
					struct btrfs_inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	/*
	 * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
	 */
	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, swarn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		btrfs_put_root(local_root);
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * We deliberately ignore the fact that ipath might have been too small
	 * to hold all of the paths here.
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
				  swarn->errstr, swarn->logical,
				  rcu_str_deref(swarn->dev->name),
				  swarn->physical,
				  root, inum, offset,
				  fs_info->sectorsize, nlink,
				  (char *)(unsigned long)ipath->fspath->val[i]);

	btrfs_put_root(local_root);
	free_ipath(ipath);
	return 0;

err:
	btrfs_warn_in_rcu(fs_info,
			  "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
			  swarn->errstr, swarn->logical,
			  rcu_str_deref(swarn->dev->name),
			  swarn->physical,
			  root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}

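/*
 * Resolve the block that triggered @errstr back to something a user can act
 * on: for metadata, walk the backrefs and print the owning tree and level;
 * for data, iterate all inodes referencing the extent and print their paths
 * via scrub_print_warning_inode().
 */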
static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	unsigned long ptr = 0;
	u64 extent_item_pos;
	u64 flags = 0;
	u64 ref_root;
	u32 item_size;
	u8 ref_level = 0;
	int ret;

	WARN_ON(sblock->page_count < 1);
	dev = sblock->pagev[0]->dev;
	fs_info = sblock->sctx->fs_info;

	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.physical = sblock->pagev[0]->physical;
	swarn.logical = sblock->pagev[0]->logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		do {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
				errstr, swarn.logical,
				rcu_str_deref(dev->name),
				swarn.physical,
				ref_level ? "node" : "leaf",
				ret < 0 ? -1 : ref_level,
				ret < 0 ? -1 : ref_root);
		} while (ret != 1);
		btrfs_release_path(path);
	} else {
		btrfs_release_path(path);
		swarn.path = path;
		swarn.dev = dev;
		iterate_extent_inodes(fs_info, found_key.objectid,
				      extent_item_pos, 1,
				      scrub_print_warning_inode, &swarn, false);
	}

out:
	btrfs_free_path(path);
}

static inline void scrub_get_recover(struct scrub_recover *recover)
{
	refcount_inc(&recover->refs);
}

static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
				     struct scrub_recover *recover)
{
	if (refcount_dec_and_test(&recover->refs)) {
		btrfs_bio_counter_dec(fs_info);
		btrfs_put_bioc(recover->bioc);
		kfree(recover);
	}
}

/*
 * scrub_handle_errored_block gets called when either verification of the
 * pages failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all pages in the bio, even though only one
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the mirrors.
 */
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
	struct scrub_ctx *sctx = sblock_to_check->sctx;
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	u64 logical;
	unsigned int failed_mirror_index;
	unsigned int is_metadata;
	unsigned int have_csum;
	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
	struct scrub_block *sblock_bad;
	int ret;
	int mirror_index;
	int page_num;
	int success;
	bool full_stripe_locked;
	unsigned int nofs_flag;
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	BUG_ON(sblock_to_check->page_count < 1);
	fs_info = sctx->fs_info;
	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
		/*
		 * If we find an error in a super block, we just report it.
		 * It will get written with the next transaction commit
		 * anyway.
		 */
		spin_lock(&sctx->stat_lock);
		++sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);
		return 0;
	}
	logical = sblock_to_check->pagev[0]->logical;
	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
	is_metadata = !(sblock_to_check->pagev[0]->flags &
			BTRFS_EXTENT_FLAG_DATA);
	have_csum = sblock_to_check->pagev[0]->have_csum;
	dev = sblock_to_check->pagev[0]->dev;

	if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
		return 0;

	/*
	 * We must use GFP_NOFS because the scrub task might be waiting for a
	 * worker task executing this function and in turn a transaction commit
	 * might be waiting the scrub task to pause (which needs to wait for all
	 * the worker tasks to complete before pausing).
	 * We do allocations in the workers through insert_full_stripe_lock()
	 * and scrub_add_page_to_wr_bio(), which happens down the call chain of
	 * this function.
	 */
	nofs_flag = memalloc_nofs_save();
	/*
	 * For RAID5/6, a race can happen with the scrub thread of a different
	 * device. On data corruption, the parity and the data threads will
	 * both try to recover the data.
	 * The race can lead to a doubly counted csum error, or even to an
	 * unrecoverable error.
	 */
	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
	if (ret < 0) {
		memalloc_nofs_restore(nofs_flag);
		spin_lock(&sctx->stat_lock);
		if (ret == -ENOMEM)
			sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		return ret;
	}

	/*
	 * Read all mirrors one after the other. This includes re-reading the
	 * extent or metadata block that failed (that was the cause that this
	 * fixup code is called) another time, sector by sector this time in
	 * order to know which sectors caused I/O errors and which ones are
	 * good (for all mirrors).
	 * It is the goal to handle the situation when more than one mirror
	 * contains I/O errors, but the errors do not overlap, i.e. the data
	 * can be repaired by selecting the sectors from those mirrors without
	 * I/O error on the particular sectors. One example (with blocks >=
	 * 2 * sectorsize) would be that mirror #1 has an I/O error on the
	 * first sector, the second sector is good, and mirror #2 has an I/O
	 * error on the second sector, but the first sector is good.
	 * Then the first sector of the first mirror can be repaired by taking
	 * the first sector of the second mirror, and the second sector of the
	 * second mirror can be repaired by copying the contents of the 2nd
	 * sector of the 1st mirror.
	 * One more note: if the sectors of one mirror contain I/O errors, the
	 * checksum cannot be verified. In order to get the best data for
	 * repairing, the first attempt is to find a mirror without I/O errors
	 * and with a validated checksum. Only if this is not possible, the
	 * sectors are picked from mirrors with I/O errors without considering
	 * the checksum.
	 * If the latter is the case, at the end, the checksum of the repaired
	 * area is verified in order to correctly maintain the statistics.
	 */

	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
				      sizeof(*sblocks_for_recheck), GFP_KERNEL);
	if (!sblocks_for_recheck) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}

	/* Setup the context, map the logical blocks and alloc the pages */
	ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
	if (ret) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}
	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
	sblock_bad = sblocks_for_recheck + failed_mirror_index;

	/* Build and submit the bios for the failed mirror, check checksums */
	scrub_recheck_block(fs_info, sblock_bad, 1);

	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
	    sblock_bad->no_io_error_seen) {
		/*
		 * The error disappeared after reading page by page, or
		 * the area was part of a huge bio and other parts of the
		 * bio caused I/O errors, or the block layer merged several
		 * read requests into one and the error is caused by a
		 * different bio (usually one of the two latter cases is
		 * the cause).
		 */
		spin_lock(&sctx->stat_lock);
		sctx->stat.unverified_errors++;
		sblock_to_check->data_corrected = 1;
		spin_unlock(&sctx->stat_lock);

		if (sctx->is_dev_replace)
			scrub_write_block_to_dev_replace(sblock_bad);
		goto out;
	}

	if (!sblock_bad->no_io_error_seen) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&rs))
			scrub_print_warning("i/o error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	} else if (sblock_bad->checksum_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.csum_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&rs))
			scrub_print_warning("checksum error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev,
					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
	} else if (sblock_bad->header_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.verify_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&rs))
			scrub_print_warning("checksum/header error",
					    sblock_to_check);
		if (sblock_bad->generation_error)
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_GENERATION_ERRS);
		else
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_CORRUPTION_ERRS);
	}

	if (sctx->readonly) {
		ASSERT(!sctx->is_dev_replace);
		goto out;
	}

	/*
	 * Now build and submit the bios for the other mirrors, check
	 * checksums.
	 * First try to pick the mirror which is completely without I/O
	 * errors and also does not have a checksum error.
	 * If one is found, and if a checksum is present, the full block
	 * that is known to contain an error is rewritten. Afterwards
	 * the block is known to be corrected.
	 * If a mirror is found which is completely correct, and no
	 * checksum is present, only those pages are rewritten that had
	 * an I/O error in the block to be repaired, since it cannot be
	 * determined, which copy of the other pages is better (and it
	 * could happen otherwise that a correct page would be
	 * overwritten by a bad one).
	 */
	for (mirror_index = 0; ; mirror_index++) {
		struct scrub_block *sblock_other;

		if (mirror_index == failed_mirror_index)
			continue;

		/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
		if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
			if (mirror_index >= BTRFS_MAX_MIRRORS)
				break;
			if (!sblocks_for_recheck[mirror_index].page_count)
				break;

			sblock_other = sblocks_for_recheck + mirror_index;
		} else {
			struct scrub_recover *r = sblock_bad->pagev[0]->recover;
			int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;

			if (mirror_index >= max_allowed)
				break;
			if (!sblocks_for_recheck[1].page_count)
				break;

			ASSERT(failed_mirror_index == 0);
			sblock_other = sblocks_for_recheck + 1;
			sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
		}

		/* Build and submit the bios, check checksums */
		scrub_recheck_block(fs_info, sblock_other, 0);

		if (!sblock_other->header_error &&
		    !sblock_other->checksum_error &&
		    sblock_other->no_io_error_seen) {
			if (sctx->is_dev_replace) {
				scrub_write_block_to_dev_replace(sblock_other);
				goto corrected_error;
			} else {
				ret = scrub_repair_block_from_good_copy(
						sblock_bad, sblock_other);
				if (!ret)
					goto corrected_error;
			}
		}
	}

	if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
		goto did_not_correct_error;

	/*
	 * In case of I/O errors in the area that is supposed to be
	 * repaired, continue by picking good copies of those sectors.
	 * Select the good sectors from mirrors to rewrite bad sectors from
	 * the area to fix. Afterwards verify the checksum of the block
	 * that is supposed to be repaired. This verification step is
	 * only done for the purpose of statistic counting and for the
	 * final scrub report, whether errors remain.
	 * A perfect algorithm could make use of the checksum and try
	 * all possible combinations of sectors from the different mirrors
	 * until the checksum verification succeeds. For example, when
	 * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
	 * of mirror #2 is readable but the final checksum test fails,
	 * then the 2nd sector of mirror #3 could be tried, to see whether
	 * the final checksum then succeeds. But this would be a rare
	 * exception and is therefore not implemented. At least it is
	 * avoided that the good copy is overwritten.
	 * A more useful improvement would be to pick the sectors
	 * without I/O error based on sector sizes (512 bytes on legacy
	 * disks) instead of on sectorsize. Then maybe 512 byte of one
	 * mirror could be repaired by taking 512 byte of a different
	 * mirror, even if other 512 byte sectors in the same sectorsize
	 * area are unreadable.
	 */
	success = 1;
	for (page_num = 0; page_num < sblock_bad->page_count;
	     page_num++) {
		struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
		struct scrub_block *sblock_other = NULL;

		/* Skip pages without I/O errors when not doing dev-replace */
		if (!spage_bad->io_error && !sctx->is_dev_replace)
			continue;

		if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
			/*
			 * In case of dev replace, if the raid56 rebuild
			 * process did not produce correct data, then copy
			 * the content in sblock_bad to make sure the target
			 * device is identical to the source device, instead
			 * of writing garbage data from the
			 * sblocks_for_recheck array to the target device.
			 */
			sblock_other = NULL;
		} else if (spage_bad->io_error) {
			/* Try to find a page without I/O errors in the mirrors */
			for (mirror_index = 0;
			     mirror_index < BTRFS_MAX_MIRRORS &&
			     sblocks_for_recheck[mirror_index].page_count > 0;
			     mirror_index++) {
				if (!sblocks_for_recheck[mirror_index].
				    pagev[page_num]->io_error) {
					sblock_other = sblocks_for_recheck +
						       mirror_index;
					break;
				}
			}
			if (!sblock_other)
				success = 0;
		}

		if (sctx->is_dev_replace) {
			/*
			 * Did not find a mirror to fetch the page from.
			 * scrub_write_page_to_dev_replace() handles this
			 * case (page->io_error), by filling the block with
			 * zeros before submitting the write request.
			 */
			if (!sblock_other)
				sblock_other = sblock_bad;

			if (scrub_write_page_to_dev_replace(sblock_other,
							    page_num) != 0) {
				atomic64_inc(
					&fs_info->dev_replace.num_write_errors);
				success = 0;
			}
		} else if (sblock_other) {
			ret = scrub_repair_page_from_good_copy(sblock_bad,
							       sblock_other,
							       page_num, 0);
			if (!ret)
				spage_bad->io_error = 0;
			else
				success = 0;
		}
	}

	if (success && !sctx->is_dev_replace) {
		if (is_metadata || have_csum) {
			/*
			 * Need to verify the checksum now that all
			 * sectors on disk are repaired (the write
			 * request for data to be repaired is on its way).
			 * Just be lazy and use scrub_recheck_block()
			 * which re-reads the data before the checksum
			 * is verified, but most likely the data comes out
			 * of the page cache.
			 */
			scrub_recheck_block(fs_info, sblock_bad, 1);
			if (!sblock_bad->header_error &&
			    !sblock_bad->checksum_error &&
			    sblock_bad->no_io_error_seen)
				goto corrected_error;
			else
				goto did_not_correct_error;
		} else {
corrected_error:
			spin_lock(&sctx->stat_lock);
			sctx->stat.corrected_errors++;
			sblock_to_check->data_corrected = 1;
			spin_unlock(&sctx->stat_lock);
			btrfs_err_rl_in_rcu(fs_info,
				"fixed up error at logical %llu on dev %s",
				logical, rcu_str_deref(dev->name));
		}
	} else {
did_not_correct_error:
		spin_lock(&sctx->stat_lock);
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_err_rl_in_rcu(fs_info,
			"unable to fixup (regular) error at logical %llu on dev %s",
			logical, rcu_str_deref(dev->name));
	}

out:
	if (sblocks_for_recheck) {
		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
		     mirror_index++) {
			struct scrub_block *sblock = sblocks_for_recheck +
						     mirror_index;
			struct scrub_recover *recover;
			int page_index;

			for (page_index = 0; page_index < sblock->page_count;
			     page_index++) {
				sblock->pagev[page_index]->sblock = NULL;
				recover = sblock->pagev[page_index]->recover;
				if (recover) {
					scrub_put_recover(fs_info, recover);
					sblock->pagev[page_index]->recover =
									NULL;
				}
				scrub_page_put(sblock->pagev[page_index]);
			}
		}
		kfree(sblocks_for_recheck);
	}

	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
	memalloc_nofs_restore(nofs_flag);
	if (ret < 0)
		return ret;
	return 0;
}

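/*
 * A RAID5 stripe can be read two ways (the data itself, or rebuilt from the
 * remaining data plus parity) and a RAID6 stripe three ways (data, plus two
 * different reconstruction combinations), so those profiles report 2 and 3
 * "mirrors" respectively. All other profiles expose one mirror per stripe.
 */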
static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
{
	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
		return 2;
	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
		return 3;
	else
		return (int)bioc->num_stripes;
}

static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
						 u64 *raid_map,
						 u64 mapped_length,
						 int nstripes, int mirror,
						 int *stripe_index,
						 u64 *stripe_offset)
{
	int i;

	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* RAID5/6 */
		for (i = 0; i < nstripes; i++) {
			if (raid_map[i] == RAID6_Q_STRIPE ||
			    raid_map[i] == RAID5_P_STRIPE)
				continue;

			if (logical >= raid_map[i] &&
			    logical < raid_map[i] + mapped_length)
				break;
		}

		*stripe_index = i;
		*stripe_offset = logical - raid_map[i];
	} else {
		/* The other RAID type */
		*stripe_index = mirror;
		*stripe_offset = 0;
	}
}

static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				     struct scrub_block *sblocks_for_recheck)
{
	struct scrub_ctx *sctx = original_sblock->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u64 length = original_sblock->page_count * fs_info->sectorsize;
	u64 logical = original_sblock->pagev[0]->logical;
	u64 generation = original_sblock->pagev[0]->generation;
	u64 flags = original_sblock->pagev[0]->flags;
	u64 have_csum = original_sblock->pagev[0]->have_csum;
	struct scrub_recover *recover;
	struct btrfs_io_context *bioc;
	u64 sublen;
	u64 mapped_length;
	u64 stripe_offset;
	int stripe_index;
	int page_index = 0;
	int mirror_index;
	int nmirrors;
	int ret;

	/*
	 * Note: the two members refs and outstanding_pages are not used (and
	 * not set) in the blocks that are used for the recheck procedure.
	 */

	while (length > 0) {
		sublen = min_t(u64, length, fs_info->sectorsize);
		mapped_length = sublen;
		bioc = NULL;

		/*
		 * With a length of sectorsize, each returned stripe represents
		 * one mirror
		 */
		btrfs_bio_counter_inc_blocked(fs_info);
		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				       logical, &mapped_length, &bioc);
		if (ret || !bioc || mapped_length < sublen) {
			btrfs_put_bioc(bioc);
			btrfs_bio_counter_dec(fs_info);
			return -EIO;
		}

		recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
		if (!recover) {
			btrfs_put_bioc(bioc);
			btrfs_bio_counter_dec(fs_info);
			return -ENOMEM;
		}

		refcount_set(&recover->refs, 1);
		recover->bioc = bioc;
		recover->map_length = mapped_length;

		ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK);

		nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);

		for (mirror_index = 0; mirror_index < nmirrors;
		     mirror_index++) {
			struct scrub_block *sblock;
			struct scrub_page *spage;

			sblock = sblocks_for_recheck + mirror_index;
			sblock->sctx = sctx;

			spage = kzalloc(sizeof(*spage), GFP_NOFS);
			if (!spage) {
leave_nomem:
				spin_lock(&sctx->stat_lock);
				sctx->stat.malloc_errors++;
				spin_unlock(&sctx->stat_lock);
				scrub_put_recover(fs_info, recover);
				return -ENOMEM;
			}
			scrub_page_get(spage);
			sblock->pagev[page_index] = spage;
			spage->sblock = sblock;
			spage->flags = flags;
			spage->generation = generation;
			spage->logical = logical;
			spage->have_csum = have_csum;
			if (have_csum)
				memcpy(spage->csum,
				       original_sblock->pagev[0]->csum,
				       sctx->fs_info->csum_size);

			scrub_stripe_index_and_offset(logical,
						      bioc->map_type,
						      bioc->raid_map,
						      mapped_length,
						      bioc->num_stripes -
						      bioc->num_tgtdevs,
						      mirror_index,
						      &stripe_index,
						      &stripe_offset);
			spage->physical = bioc->stripes[stripe_index].physical +
					 stripe_offset;
			spage->dev = bioc->stripes[stripe_index].dev;

			BUG_ON(page_index >= original_sblock->page_count);
			spage->physical_for_dev_replace =
				original_sblock->pagev[page_index]->
				physical_for_dev_replace;
			/* For missing devices, dev->bdev is NULL */
			spage->mirror_num = mirror_index + 1;
			sblock->page_count++;
			spage->page = alloc_page(GFP_NOFS);
			if (!spage->page)
				goto leave_nomem;

			scrub_get_recover(recover);
			spage->recover = recover;
		}
		scrub_put_recover(fs_info, recover);
		length -= sublen;
		logical += sublen;
		page_index++;
	}

	return 0;
}

static void scrub_bio_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}

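/*
 * Submit a read for one scrub page through the RAID5/6 recovery path and
 * wait for it to complete. The mirror number of the block's first page
 * tells raid56_parity_recover() which reconstruction to attempt.
 */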
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
					struct bio *bio,
					struct scrub_page *spage)
{
	DECLARE_COMPLETION_ONSTACK(done);
	int ret;
	int mirror_num;

	bio->bi_iter.bi_sector = spage->logical >> 9;
	bio->bi_private = &done;
	bio->bi_end_io = scrub_bio_wait_endio;

	mirror_num = spage->sblock->pagev[0]->mirror_num;
	ret = raid56_parity_recover(bio, spage->recover->bioc,
				    spage->recover->map_length,
				    mirror_num, 0);
	if (ret)
		return ret;

	wait_for_completion_io(&done);
	return blk_status_to_errno(bio->bi_status);
}

static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
					  struct scrub_block *sblock)
{
	struct scrub_page *first_page = sblock->pagev[0];
	struct bio *bio;
	int page_num;

	/* All pages in sblock belong to the same stripe on the same device. */
	ASSERT(first_page->dev);
	if (!first_page->dev->bdev)
		goto out;

	bio = btrfs_bio_alloc(BIO_MAX_VECS);
	bio_set_dev(bio, first_page->dev->bdev);

	for (page_num = 0; page_num < sblock->page_count; page_num++) {
		struct scrub_page *spage = sblock->pagev[page_num];

		WARN_ON(!spage->page);
		bio_add_page(bio, spage->page, PAGE_SIZE, 0);
	}

	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
		bio_put(bio);
		goto out;
	}

	bio_put(bio);

	scrub_recheck_block_checksum(sblock);

	return;
out:
	for (page_num = 0; page_num < sblock->page_count; page_num++)
		sblock->pagev[page_num]->io_error = 1;

	sblock->no_io_error_seen = 0;
}

/*
 * This function will check the on disk data for checksum errors, header
 * errors and read I/O errors. If any I/O errors happen, the exact pages
 * which are errored are marked as being bad. The goal is to enable scrub
 * to take those pages that are not errored from all the mirrors so that
 * the pages that are errored in the just handled mirror can be repaired.
 */
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock,
				int retry_failed_mirror)
{
	int page_num;

	sblock->no_io_error_seen = 1;

	/* Short cut for RAID5/6 */
	if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
		return scrub_recheck_block_on_raid56(fs_info, sblock);

	for (page_num = 0; page_num < sblock->page_count; page_num++) {
		struct bio *bio;
		struct scrub_page *spage = sblock->pagev[page_num];

		if (spage->dev->bdev == NULL) {
			spage->io_error = 1;
			sblock->no_io_error_seen = 0;
			continue;
		}

		WARN_ON(!spage->page);
		bio = btrfs_bio_alloc(1);
		bio_set_dev(bio, spage->dev->bdev);

		bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
		bio->bi_iter.bi_sector = spage->physical >> 9;
		bio->bi_opf = REQ_OP_READ;

		if (btrfsic_submit_bio_wait(bio)) {
			spage->io_error = 1;
			sblock->no_io_error_seen = 0;
		}

		bio_put(bio);
	}

	if (sblock->no_io_error_seen)
		scrub_recheck_block_checksum(sblock);
}

static inline int scrub_check_fsid(u8 fsid[], struct scrub_page *spage)
{
	struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
	int ret;

	ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	return !ret;
}

static void scrub_recheck_block_checksum(struct scrub_block *sblock)
{
	sblock->header_error = 0;
	sblock->checksum_error = 0;
	sblock->generation_error = 0;

	if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
		scrub_checksum_data(sblock);
	else
		scrub_checksum_tree_block(sblock);
}

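/*
 * Overwrite the bad copy with the good one, page by page. With force_write
 * set, every page is rewritten; otherwise only pages of blocks with header,
 * checksum or I/O errors are. Returns the last non-zero per-page result, so
 * a single failed page makes the whole repair count as failed.
 */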
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good)
{
	int page_num;
	int ret = 0;

	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
		int ret_sub;

		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
							   sblock_good,
							   page_num, 1);
		if (ret_sub)
			ret = ret_sub;
	}

	return ret;
}

static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write)
{
	struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
	struct scrub_page *spage_good = sblock_good->pagev[page_num];
	struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
	const u32 sectorsize = fs_info->sectorsize;

	BUG_ON(spage_bad->page == NULL);
	BUG_ON(spage_good->page == NULL);
	if (force_write || sblock_bad->header_error ||
	    sblock_bad->checksum_error || spage_bad->io_error) {
		struct bio *bio;
		int ret;

		if (!spage_bad->dev->bdev) {
			btrfs_warn_rl(fs_info,
				"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
			return -EIO;
		}

		bio = btrfs_bio_alloc(1);
		bio_set_dev(bio, spage_bad->dev->bdev);
		bio->bi_iter.bi_sector = spage_bad->physical >> 9;
		bio->bi_opf = REQ_OP_WRITE;

		ret = bio_add_page(bio, spage_good->page, sectorsize, 0);
		if (ret != sectorsize) {
			bio_put(bio);
			return -EIO;
		}

		if (btrfsic_submit_bio_wait(bio)) {
			btrfs_dev_stat_inc_and_print(spage_bad->dev,
				BTRFS_DEV_STAT_WRITE_ERRS);
			atomic64_inc(&fs_info->dev_replace.num_write_errors);
			bio_put(bio);
			return -EIO;
		}
		bio_put(bio);
	}

	return 0;
}

static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
	int page_num;

	/*
	 * This block is used for the check of the parity on the source device,
	 * so the data needn't be written into the destination device.
	 */
	if (sblock->sparity)
		return;

	for (page_num = 0; page_num < sblock->page_count; page_num++) {
		int ret;

		ret = scrub_write_page_to_dev_replace(sblock, page_num);
		if (ret)
			atomic64_inc(&fs_info->dev_replace.num_write_errors);
	}
}

static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num)
{
	struct scrub_page *spage = sblock->pagev[page_num];

	BUG_ON(spage->page == NULL);
	if (spage->io_error)
		clear_page(page_address(spage->page));

	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
}

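/*
 * On a zoned target device, writes must be issued sequentially at the zone's
 * write pointer. If the next physical position to copy lies beyond the
 * current write pointer, fill the gap with zeroes first so that the copy can
 * land at the intended physical offset.
 */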
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
	int ret = 0;
	u64 length;

	if (!btrfs_is_zoned(sctx->fs_info))
		return 0;

	if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
		return 0;

	if (sctx->write_pointer < physical) {
		length = physical - sctx->write_pointer;

		ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
						sctx->write_pointer, length);
		if (!ret)
			sctx->write_pointer = physical;
	}
	return ret;
}

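/*
 * Queue one page for writing to the dev-replace target. Pages are collected
 * into the current write bio as long as they are physically and logically
 * contiguous; on a discontinuity, or when the bio is full, it is submitted
 * via scrub_wr_submit() and a fresh bio is started.
 */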
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage)
{
	struct scrub_bio *sbio;
	int ret;
	const u32 sectorsize = sctx->fs_info->sectorsize;

	mutex_lock(&sctx->wr_lock);
again:
	if (!sctx->wr_curr_bio) {
		sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
					    GFP_KERNEL);
		if (!sctx->wr_curr_bio) {
			mutex_unlock(&sctx->wr_lock);
			return -ENOMEM;
		}
		sctx->wr_curr_bio->sctx = sctx;
		sctx->wr_curr_bio->page_count = 0;
	}
	sbio = sctx->wr_curr_bio;
	if (sbio->page_count == 0) {
		struct bio *bio;

		ret = fill_writer_pointer_gap(sctx,
					      spage->physical_for_dev_replace);
		if (ret) {
			mutex_unlock(&sctx->wr_lock);
			return ret;
		}

		sbio->physical = spage->physical_for_dev_replace;
		sbio->logical = spage->logical;
		sbio->dev = sctx->wr_tgtdev;
		bio = sbio->bio;
		if (!bio) {
			bio = btrfs_bio_alloc(sctx->pages_per_bio);
			sbio->bio = bio;
		}

		bio->bi_private = sbio;
		bio->bi_end_io = scrub_wr_bio_end_io;
		bio_set_dev(bio, sbio->dev->bdev);
		bio->bi_iter.bi_sector = sbio->physical >> 9;
		bio->bi_opf = REQ_OP_WRITE;
		sbio->status = 0;
	} else if (sbio->physical + sbio->page_count * sectorsize !=
		   spage->physical_for_dev_replace ||
		   sbio->logical + sbio->page_count * sectorsize !=
		   spage->logical) {
		scrub_wr_submit(sctx);
		goto again;
	}

	ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
	if (ret != sectorsize) {
		if (sbio->page_count < 1) {
			bio_put(sbio->bio);
			sbio->bio = NULL;
			mutex_unlock(&sctx->wr_lock);
			return -EIO;
		}
		scrub_wr_submit(sctx);
		goto again;
	}

	sbio->pagev[sbio->page_count] = spage;
	scrub_page_get(spage);
	sbio->page_count++;
	if (sbio->page_count == sctx->pages_per_bio)
		scrub_wr_submit(sctx);
	mutex_unlock(&sctx->wr_lock);

	return 0;
}

static void scrub_wr_submit(struct scrub_ctx *sctx)
{
	struct scrub_bio *sbio;

	if (!sctx->wr_curr_bio)
		return;

	sbio = sctx->wr_curr_bio;
	sctx->wr_curr_bio = NULL;
	WARN_ON(!sbio->bio->bi_bdev);
	scrub_pending_bio_inc(sctx);
	/*
	 * Process all writes in a single worker thread. Then the block layer
	 * orders the requests before sending them to the driver which
	 * doubled the write performance on spinning disks when measured
	 * with Linux 3.5.
	 */
	btrfsic_submit_bio(sbio->bio);

	if (btrfs_is_zoned(sctx->fs_info))
		sctx->write_pointer = sbio->physical + sbio->page_count *
			sctx->fs_info->sectorsize;
}

static void scrub_wr_bio_end_io(struct bio *bio)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;

	sbio->status = bio->bi_status;
	sbio->bio = bio;

	btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
	btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}

static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_ctx *sctx = sbio->sctx;
	int i;

	ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
	if (sbio->status) {
		struct btrfs_dev_replace *dev_replace =
			&sbio->sctx->fs_info->dev_replace;

		for (i = 0; i < sbio->page_count; i++) {
			struct scrub_page *spage = sbio->pagev[i];

			spage->io_error = 1;
			atomic64_inc(&dev_replace->num_write_errors);
		}
	}

	for (i = 0; i < sbio->page_count; i++)
		scrub_page_put(sbio->pagev[i]);

	bio_put(sbio->bio);
	kfree(sbio);
	scrub_pending_bio_dec(sctx);
}

static int scrub_checksum(struct scrub_block *sblock)
{
	u64 flags;
	int ret;

	/*
	 * No need to initialize these stats currently, because this function
	 * only uses the return value instead of these stats.
	 *
	 * Todo:
	 * always use stats
	 */
	sblock->header_error = 0;
	sblock->generation_error = 0;
	sblock->checksum_error = 0;

	WARN_ON(sblock->page_count < 1);
	flags = sblock->pagev[0]->flags;
	ret = 0;
	if (flags & BTRFS_EXTENT_FLAG_DATA)
		ret = scrub_checksum_data(sblock);
	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		ret = scrub_checksum_tree_block(sblock);
	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
		(void)scrub_checksum_super(sblock);
	else
		WARN_ON(1);
	if (ret)
		scrub_handle_errored_block(sblock);

	return ret;
}

static int scrub_checksum_data(struct scrub_block *sblock)
{
	struct scrub_ctx *sctx = sblock->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	u8 csum[BTRFS_CSUM_SIZE];
	struct scrub_page *spage;
	char *kaddr;

	BUG_ON(sblock->page_count < 1);
	spage = sblock->pagev[0];
	if (!spage->have_csum)
		return 0;

	kaddr = page_address(spage->page);

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);

	/*
	 * In scrub_pages() and scrub_pages_for_parity() we ensure each spage
	 * only contains one sector of data.
	 */
	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);

	if (memcmp(csum, spage->csum, fs_info->csum_size))
		sblock->checksum_error = 1;
	return sblock->checksum_error;
}

1836static int scrub_checksum_tree_block(struct scrub_block *sblock)
1837{
1838 struct scrub_ctx *sctx = sblock->sctx;
1839 struct btrfs_header *h;
1840 struct btrfs_fs_info *fs_info = sctx->fs_info;
1841 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1842 u8 calculated_csum[BTRFS_CSUM_SIZE];
1843 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1844 /*
1845 * This is done in sectorsize steps even for metadata as there's a
1846 * constraint for nodesize to be aligned to sectorsize. This will need
1847 * to change so we don't misuse data and metadata units like that.
1848 */
1849 const u32 sectorsize = sctx->fs_info->sectorsize;
1850 const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1851 int i;
1852 struct scrub_page *spage;
1853 char *kaddr;
1854
1855 BUG_ON(sblock->page_count < 1);
1856
1857 /* Each member in pagev is just one block, not a full page */
1858 ASSERT(sblock->page_count == num_sectors);
1859
1860 spage = sblock->pagev[0];
1861 kaddr = page_address(spage->page);
1862 h = (struct btrfs_header *)kaddr;
1863 memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1864
1865 /*
1866 * we don't use the getter functions here, as we
1867 * a) don't have an extent buffer and
1868 * b) the page is already kmapped
1869 */
1870 if (spage->logical != btrfs_stack_header_bytenr(h))
1871 sblock->header_error = 1;
1872
1873 if (spage->generation != btrfs_stack_header_generation(h)) {
1874 sblock->header_error = 1;
1875 sblock->generation_error = 1;
1876 }
1877
1878 if (!scrub_check_fsid(h->fsid, spage))
1879 sblock->header_error = 1;
1880
1881 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1882 BTRFS_UUID_SIZE))
1883 sblock->header_error = 1;
1884
1885 shash->tfm = fs_info->csum_shash;
1886 crypto_shash_init(shash);
1887 crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1888 sectorsize - BTRFS_CSUM_SIZE);
1889
1890 for (i = 1; i < num_sectors; i++) {
1891 kaddr = page_address(sblock->pagev[i]->page);
1892 crypto_shash_update(shash, kaddr, sectorsize);
1893 }
1894
1895 crypto_shash_final(shash, calculated_csum);
1896 if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1897 sblock->checksum_error = 1;
1898
1899 return sblock->header_error || sblock->checksum_error;
1900}
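
/*
 * For illustration, assuming a 16KiB nodesize and 4KiB sectorsize:
 * num_sectors is 4, and the digest covers the first sector minus the leading
 * BTRFS_CSUM_SIZE bytes holding the on-disk checksum itself, followed by the
 * remaining three full sectors.
 */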
1901
1902static int scrub_checksum_super(struct scrub_block *sblock)
1903{
1904 struct btrfs_super_block *s;
1905 struct scrub_ctx *sctx = sblock->sctx;
1906 struct btrfs_fs_info *fs_info = sctx->fs_info;
1907 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1908 u8 calculated_csum[BTRFS_CSUM_SIZE];
1909 struct scrub_page *spage;
1910 char *kaddr;
1911 int fail_gen = 0;
1912 int fail_cor = 0;
1913
1914 BUG_ON(sblock->page_count < 1);
1915 spage = sblock->pagev[0];
1916 kaddr = page_address(spage->page);
1917 s = (struct btrfs_super_block *)kaddr;
1918
1919 if (spage->logical != btrfs_super_bytenr(s))
1920 ++fail_cor;
1921
1922 if (spage->generation != btrfs_super_generation(s))
1923 ++fail_gen;
1924
1925 if (!scrub_check_fsid(s->fsid, spage))
1926 ++fail_cor;
1927
1928 shash->tfm = fs_info->csum_shash;
1929 crypto_shash_init(shash);
1930 crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1931 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1932
1933 if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1934 ++fail_cor;
1935
1936 if (fail_cor + fail_gen) {
1937 /*
1938 * if we find an error in a super block, we just report it.
1939 * They will get written with the next transaction commit
1940 * anyway
1941 */
1942 spin_lock(&sctx->stat_lock);
1943 ++sctx->stat.super_errors;
1944 spin_unlock(&sctx->stat_lock);
1945 if (fail_cor)
1946 btrfs_dev_stat_inc_and_print(spage->dev,
1947 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1948 else
1949 btrfs_dev_stat_inc_and_print(spage->dev,
1950 BTRFS_DEV_STAT_GENERATION_ERRS);
1951 }
1952
1953 return fail_cor + fail_gen;
1954}
1955
1956static void scrub_block_get(struct scrub_block *sblock)
1957{
1958 refcount_inc(&sblock->refs);
1959}
1960
1961static void scrub_block_put(struct scrub_block *sblock)
1962{
1963 if (refcount_dec_and_test(&sblock->refs)) {
1964 int i;
1965
1966 if (sblock->sparity)
1967 scrub_parity_put(sblock->sparity);
1968
1969 for (i = 0; i < sblock->page_count; i++)
1970 scrub_page_put(sblock->pagev[i]);
1971 kfree(sblock);
1972 }
1973}
1974
1975static void scrub_page_get(struct scrub_page *spage)
1976{
1977 atomic_inc(&spage->refs);
1978}
1979
1980static void scrub_page_put(struct scrub_page *spage)
1981{
1982 if (atomic_dec_and_test(&spage->refs)) {
1983 if (spage->page)
1984 __free_page(spage->page);
1985 kfree(spage);
1986 }
1987}
1988
1989/*
1990 * Throttling of IO submission, bandwidth-limit based; the timeslice is one
1991 * second. The limit is set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1992 */
1993static void scrub_throttle(struct scrub_ctx *sctx)
1994{
1995 const int time_slice = 1000;
1996 struct scrub_bio *sbio;
1997 struct btrfs_device *device;
1998 s64 delta;
1999 ktime_t now;
2000 u32 div;
2001 u64 bwlimit;
2002
2003 sbio = sctx->bios[sctx->curr];
2004 device = sbio->dev;
2005 bwlimit = READ_ONCE(device->scrub_speed_max);
2006 if (bwlimit == 0)
2007 return;
2008
2009 /*
2010	 * The slice is divided into intervals as the IO is submitted; the interval
2011	 * count is derived from bwlimit and capped at a maximum of 64 intervals.
2012 */
2013 div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
2014 div = min_t(u32, 64, div);
2015
2016 /* Start new epoch, set deadline */
2017 now = ktime_get();
2018 if (sctx->throttle_deadline == 0) {
2019 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
2020 sctx->throttle_sent = 0;
2021 }
2022
2023 /* Still in the time to send? */
2024 if (ktime_before(now, sctx->throttle_deadline)) {
2025 /* If current bio is within the limit, send it */
2026 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
2027 if (sctx->throttle_sent <= div_u64(bwlimit, div))
2028 return;
2029
2030 /* We're over the limit, sleep until the rest of the slice */
2031 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2032 } else {
2033 /* New request after deadline, start new epoch */
2034 delta = 0;
2035 }
2036
2037 if (delta) {
2038 long timeout;
2039
2040 timeout = div_u64(delta * HZ, 1000);
2041 schedule_timeout_interruptible(timeout);
2042 }
2043
2044 /* Next call will start the deadline period */
2045 sctx->throttle_deadline = 0;
2046}
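
/*
 * A worked example of the throttling math above, assuming scrub_speed_max is
 * set to 100MiB/s: div = min(64, 100MiB / 16MiB) = 6, so each deadline epoch
 * covers 1000ms / 6 = 166ms and permits about bwlimit / div = 16.7MiB of
 * submitted bio bytes before the task sleeps out the rest of the epoch.
 */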
2047
2048static void scrub_submit(struct scrub_ctx *sctx)
2049{
2050 struct scrub_bio *sbio;
2051
2052 if (sctx->curr == -1)
2053 return;
2054
2055 scrub_throttle(sctx);
2056
2057 sbio = sctx->bios[sctx->curr];
2058 sctx->curr = -1;
2059 scrub_pending_bio_inc(sctx);
2060 btrfsic_submit_bio(sbio->bio);
2061}
2062
2063static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2064 struct scrub_page *spage)
2065{
2066 struct scrub_block *sblock = spage->sblock;
2067 struct scrub_bio *sbio;
2068 const u32 sectorsize = sctx->fs_info->sectorsize;
2069 int ret;
2070
2071again:
2072 /*
2073 * grab a fresh bio or wait for one to become available
2074 */
2075 while (sctx->curr == -1) {
2076 spin_lock(&sctx->list_lock);
2077 sctx->curr = sctx->first_free;
2078 if (sctx->curr != -1) {
2079 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2080 sctx->bios[sctx->curr]->next_free = -1;
2081 sctx->bios[sctx->curr]->page_count = 0;
2082 spin_unlock(&sctx->list_lock);
2083 } else {
2084 spin_unlock(&sctx->list_lock);
2085 wait_event(sctx->list_wait, sctx->first_free != -1);
2086 }
2087 }
2088 sbio = sctx->bios[sctx->curr];
2089 if (sbio->page_count == 0) {
2090 struct bio *bio;
2091
2092 sbio->physical = spage->physical;
2093 sbio->logical = spage->logical;
2094 sbio->dev = spage->dev;
2095 bio = sbio->bio;
2096 if (!bio) {
2097 bio = btrfs_bio_alloc(sctx->pages_per_bio);
2098 sbio->bio = bio;
2099 }
2100
2101 bio->bi_private = sbio;
2102 bio->bi_end_io = scrub_bio_end_io;
2103 bio_set_dev(bio, sbio->dev->bdev);
2104 bio->bi_iter.bi_sector = sbio->physical >> 9;
2105 bio->bi_opf = REQ_OP_READ;
2106 sbio->status = 0;
2107 } else if (sbio->physical + sbio->page_count * sectorsize !=
2108 spage->physical ||
2109 sbio->logical + sbio->page_count * sectorsize !=
2110 spage->logical ||
2111 sbio->dev != spage->dev) {
2112 scrub_submit(sctx);
2113 goto again;
2114 }
2115
2116 sbio->pagev[sbio->page_count] = spage;
2117 ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
2118 if (ret != sectorsize) {
2119 if (sbio->page_count < 1) {
2120 bio_put(sbio->bio);
2121 sbio->bio = NULL;
2122 return -EIO;
2123 }
2124 scrub_submit(sctx);
2125 goto again;
2126 }
2127
2128 scrub_block_get(sblock); /* one for the page added to the bio */
2129 atomic_inc(&sblock->outstanding_pages);
2130 sbio->page_count++;
2131 if (sbio->page_count == sctx->pages_per_bio)
2132 scrub_submit(sctx);
2133
2134 return 0;
2135}
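
/*
 * A worked example of the merge check above, assuming a 4KiB sectorsize: if
 * sbio already holds 3 sectors starting at physical 1MiB / logical 10MiB on
 * device A, the next spage is appended only if it sits at physical
 * 1MiB + 12KiB and logical 10MiB + 12KiB on device A. Any discontinuity, or
 * a bio that has reached pages_per_bio sectors, triggers scrub_submit() and
 * the sector is retried on a fresh bio.
 */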
2136
2137static void scrub_missing_raid56_end_io(struct bio *bio)
2138{
2139 struct scrub_block *sblock = bio->bi_private;
2140 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2141
2142 if (bio->bi_status)
2143 sblock->no_io_error_seen = 0;
2144
2145 bio_put(bio);
2146
2147 btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2148}
2149
2150static void scrub_missing_raid56_worker(struct btrfs_work *work)
2151{
2152 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2153 struct scrub_ctx *sctx = sblock->sctx;
2154 struct btrfs_fs_info *fs_info = sctx->fs_info;
2155 u64 logical;
2156 struct btrfs_device *dev;
2157
2158 logical = sblock->pagev[0]->logical;
2159 dev = sblock->pagev[0]->dev;
2160
2161 if (sblock->no_io_error_seen)
2162 scrub_recheck_block_checksum(sblock);
2163
2164 if (!sblock->no_io_error_seen) {
2165 spin_lock(&sctx->stat_lock);
2166 sctx->stat.read_errors++;
2167 spin_unlock(&sctx->stat_lock);
2168 btrfs_err_rl_in_rcu(fs_info,
2169 "IO error rebuilding logical %llu for dev %s",
2170 logical, rcu_str_deref(dev->name));
2171 } else if (sblock->header_error || sblock->checksum_error) {
2172 spin_lock(&sctx->stat_lock);
2173 sctx->stat.uncorrectable_errors++;
2174 spin_unlock(&sctx->stat_lock);
2175 btrfs_err_rl_in_rcu(fs_info,
2176 "failed to rebuild valid logical %llu for dev %s",
2177 logical, rcu_str_deref(dev->name));
2178 } else {
2179 scrub_write_block_to_dev_replace(sblock);
2180 }
2181
2182 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2183 mutex_lock(&sctx->wr_lock);
2184 scrub_wr_submit(sctx);
2185 mutex_unlock(&sctx->wr_lock);
2186 }
2187
2188 scrub_block_put(sblock);
2189 scrub_pending_bio_dec(sctx);
2190}
2191
2192static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2193{
2194 struct scrub_ctx *sctx = sblock->sctx;
2195 struct btrfs_fs_info *fs_info = sctx->fs_info;
2196 u64 length = sblock->page_count * PAGE_SIZE;
2197 u64 logical = sblock->pagev[0]->logical;
2198 struct btrfs_io_context *bioc = NULL;
2199 struct bio *bio;
2200 struct btrfs_raid_bio *rbio;
2201 int ret;
2202 int i;
2203
2204 btrfs_bio_counter_inc_blocked(fs_info);
2205 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2206 &length, &bioc);
2207 if (ret || !bioc || !bioc->raid_map)
2208 goto bioc_out;
2209
2210 if (WARN_ON(!sctx->is_dev_replace ||
2211 !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2212 /*
2213 * We shouldn't be scrubbing a missing device. Even for dev
2214 * replace, we should only get here for RAID 5/6. We either
2215 * managed to mount something with no mirrors remaining or
2216 * there's a bug in scrub_remap_extent()/btrfs_map_block().
2217 */
2218 goto bioc_out;
2219 }
2220
2221 bio = btrfs_bio_alloc(BIO_MAX_VECS);
2222 bio->bi_iter.bi_sector = logical >> 9;
2223 bio->bi_private = sblock;
2224 bio->bi_end_io = scrub_missing_raid56_end_io;
2225
2226 rbio = raid56_alloc_missing_rbio(bio, bioc, length);
2227 if (!rbio)
2228 goto rbio_out;
2229
2230 for (i = 0; i < sblock->page_count; i++) {
2231 struct scrub_page *spage = sblock->pagev[i];
2232
2233 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2234 }
2235
2236 btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
2237 scrub_block_get(sblock);
2238 scrub_pending_bio_inc(sctx);
2239 raid56_submit_missing_rbio(rbio);
2240 return;
2241
2242rbio_out:
2243 bio_put(bio);
2244bioc_out:
2245 btrfs_bio_counter_dec(fs_info);
2246 btrfs_put_bioc(bioc);
2247 spin_lock(&sctx->stat_lock);
2248 sctx->stat.malloc_errors++;
2249 spin_unlock(&sctx->stat_lock);
2250}
2251
2252static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
2253 u64 physical, struct btrfs_device *dev, u64 flags,
2254 u64 gen, int mirror_num, u8 *csum,
2255 u64 physical_for_dev_replace)
2256{
2257 struct scrub_block *sblock;
2258 const u32 sectorsize = sctx->fs_info->sectorsize;
2259 int index;
2260
2261 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2262 if (!sblock) {
2263 spin_lock(&sctx->stat_lock);
2264 sctx->stat.malloc_errors++;
2265 spin_unlock(&sctx->stat_lock);
2266 return -ENOMEM;
2267 }
2268
2269 /* one ref inside this function, plus one for each page added to
2270 * a bio later on */
2271 refcount_set(&sblock->refs, 1);
2272 sblock->sctx = sctx;
2273 sblock->no_io_error_seen = 1;
2274
2275 for (index = 0; len > 0; index++) {
2276 struct scrub_page *spage;
2277 /*
2278 * Here we will allocate one page for one sector to scrub.
2279 * This is fine if PAGE_SIZE == sectorsize, but will cost
2280		 * more memory in the PAGE_SIZE > sectorsize case.
2281 */
2282 u32 l = min(sectorsize, len);
2283
2284 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2285 if (!spage) {
2286leave_nomem:
2287 spin_lock(&sctx->stat_lock);
2288 sctx->stat.malloc_errors++;
2289 spin_unlock(&sctx->stat_lock);
2290 scrub_block_put(sblock);
2291 return -ENOMEM;
2292 }
2293 ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
2294 scrub_page_get(spage);
2295 sblock->pagev[index] = spage;
2296 spage->sblock = sblock;
2297 spage->dev = dev;
2298 spage->flags = flags;
2299 spage->generation = gen;
2300 spage->logical = logical;
2301 spage->physical = physical;
2302 spage->physical_for_dev_replace = physical_for_dev_replace;
2303 spage->mirror_num = mirror_num;
2304 if (csum) {
2305 spage->have_csum = 1;
2306 memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2307 } else {
2308 spage->have_csum = 0;
2309 }
2310 sblock->page_count++;
2311 spage->page = alloc_page(GFP_KERNEL);
2312 if (!spage->page)
2313 goto leave_nomem;
2314 len -= l;
2315 logical += l;
2316 physical += l;
2317 physical_for_dev_replace += l;
2318 }
2319
2320 WARN_ON(sblock->page_count == 0);
2321 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2322 /*
2323 * This case should only be hit for RAID 5/6 device replace. See
2324 * the comment in scrub_missing_raid56_pages() for details.
2325 */
2326 scrub_missing_raid56_pages(sblock);
2327 } else {
2328 for (index = 0; index < sblock->page_count; index++) {
2329 struct scrub_page *spage = sblock->pagev[index];
2330 int ret;
2331
2332 ret = scrub_add_page_to_rd_bio(sctx, spage);
2333 if (ret) {
2334 scrub_block_put(sblock);
2335 return ret;
2336 }
2337 }
2338
2339 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2340 scrub_submit(sctx);
2341 }
2342
2343 /* last one frees, either here or in bio completion for last page */
2344 scrub_block_put(sblock);
2345 return 0;
2346}
2347
2348static void scrub_bio_end_io(struct bio *bio)
2349{
2350 struct scrub_bio *sbio = bio->bi_private;
2351 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2352
2353 sbio->status = bio->bi_status;
2354 sbio->bio = bio;
2355
2356 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2357}
2358
2359static void scrub_bio_end_io_worker(struct btrfs_work *work)
2360{
2361 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2362 struct scrub_ctx *sctx = sbio->sctx;
2363 int i;
2364
2365 ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
2366 if (sbio->status) {
2367 for (i = 0; i < sbio->page_count; i++) {
2368 struct scrub_page *spage = sbio->pagev[i];
2369
2370 spage->io_error = 1;
2371 spage->sblock->no_io_error_seen = 0;
2372 }
2373 }
2374
2375 /* now complete the scrub_block items that have all pages completed */
2376 for (i = 0; i < sbio->page_count; i++) {
2377 struct scrub_page *spage = sbio->pagev[i];
2378 struct scrub_block *sblock = spage->sblock;
2379
2380 if (atomic_dec_and_test(&sblock->outstanding_pages))
2381 scrub_block_complete(sblock);
2382 scrub_block_put(sblock);
2383 }
2384
2385 bio_put(sbio->bio);
2386 sbio->bio = NULL;
2387 spin_lock(&sctx->list_lock);
2388 sbio->next_free = sctx->first_free;
2389 sctx->first_free = sbio->index;
2390 spin_unlock(&sctx->list_lock);
2391
2392 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2393 mutex_lock(&sctx->wr_lock);
2394 scrub_wr_submit(sctx);
2395 mutex_unlock(&sctx->wr_lock);
2396 }
2397
2398 scrub_pending_bio_dec(sctx);
2399}
2400
2401static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2402 unsigned long *bitmap,
2403 u64 start, u32 len)
2404{
2405 u64 offset;
2406 u32 nsectors;
2407 u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2408
2409 if (len >= sparity->stripe_len) {
2410 bitmap_set(bitmap, 0, sparity->nsectors);
2411 return;
2412 }
2413
2414 start -= sparity->logic_start;
2415 start = div64_u64_rem(start, sparity->stripe_len, &offset);
2416 offset = offset >> sectorsize_bits;
2417 nsectors = len >> sectorsize_bits;
2418
2419 if (offset + nsectors <= sparity->nsectors) {
2420 bitmap_set(bitmap, offset, nsectors);
2421 return;
2422 }
2423
2424 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2425 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2426}
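
/*
 * A worked example of the wrap-around above, assuming a 64KiB stripe_len and
 * 4KiB sectorsize (nsectors = 16): a range starting 56KiB into the stripe
 * (offset = 14) with len = 16KiB (nsectors = 4) sets bits 14 and 15, then
 * wraps and sets bits 0 and 1, as the tail of the range lands at the head of
 * the following stripe.
 */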
2427
2428static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2429 u64 start, u32 len)
2430{
2431 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2432}
2433
2434static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2435 u64 start, u32 len)
2436{
2437 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2438}
2439
2440static void scrub_block_complete(struct scrub_block *sblock)
2441{
2442 int corrupted = 0;
2443
2444 if (!sblock->no_io_error_seen) {
2445 corrupted = 1;
2446 scrub_handle_errored_block(sblock);
2447 } else {
2448 /*
2449		 * If the block has a checksum error, it is written via the
2450		 * repair mechanism in the dev-replace case; otherwise it is
2451		 * written here, also in the dev-replace case.
2452 */
2453 corrupted = scrub_checksum(sblock);
2454 if (!corrupted && sblock->sctx->is_dev_replace)
2455 scrub_write_block_to_dev_replace(sblock);
2456 }
2457
2458 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2459 u64 start = sblock->pagev[0]->logical;
2460 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2461 sblock->sctx->fs_info->sectorsize;
2462
2463 ASSERT(end - start <= U32_MAX);
2464 scrub_parity_mark_sectors_error(sblock->sparity,
2465 start, end - start);
2466 }
2467}
2468
2469static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2470{
2471 sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2472 list_del(&sum->list);
2473 kfree(sum);
2474}
2475
2476/*
2477 * Find the desired csum for range [logical, logical + sectorsize), and store
2478 * the csum into @csum.
2479 *
2480 * The search source is sctx->csum_list, which is a pre-populated list
2481 * storing bytenr ordered csum ranges. We're responsible to cleanup any range
2482 * that is before @logical.
2483 *
2484 * Return 0 if there is no csum for the range.
2485 * Return 1 if there is csum for the range and copied to @csum.
2486 */
2487static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2488{
2489 bool found = false;
2490
2491 while (!list_empty(&sctx->csum_list)) {
2492 struct btrfs_ordered_sum *sum = NULL;
2493 unsigned long index;
2494 unsigned long num_sectors;
2495
2496 sum = list_first_entry(&sctx->csum_list,
2497 struct btrfs_ordered_sum, list);
2498 /* The current csum range is beyond our range, no csum found */
2499 if (sum->bytenr > logical)
2500 break;
2501
2502 /*
2503		 * The current sum is before our bytenr. Since scrub is always
2504		 * done in bytenr order, the csum will never be used again;
2505		 * clean it up so that later calls won't bother with the range,
2506		 * and continue searching from the next range.
2507 */
2508 if (sum->bytenr + sum->len <= logical) {
2509 drop_csum_range(sctx, sum);
2510 continue;
2511 }
2512
2513 /* Now the csum range covers our bytenr, copy the csum */
2514 found = true;
2515 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2516 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2517
2518 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2519 sctx->fs_info->csum_size);
2520
2521 /* Cleanup the range if we're at the end of the csum range */
2522 if (index == num_sectors - 1)
2523 drop_csum_range(sctx, sum);
2524 break;
2525 }
2526 if (!found)
2527 return 0;
2528 return 1;
2529}
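
/*
 * A worked example for the lookup above, assuming a 4KiB sectorsize: with a
 * csum range of bytenr = 1MiB and len = 64KiB at the head of csum_list, a
 * query for logical = 1MiB + 8KiB computes index = 2 of num_sectors = 16 and
 * copies the csum at sums + 2 * csum_size; the range itself is only dropped
 * once its last sector (index 15) has been consumed.
 */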
2530
2531 /* Scrub one extent, queuing up to pages_per_bio sectors into each read bio */
2532static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2533 u64 logical, u32 len,
2534 u64 physical, struct btrfs_device *dev, u64 flags,
2535 u64 gen, int mirror_num, u64 physical_for_dev_replace)
2536{
2537 int ret;
2538 u8 csum[BTRFS_CSUM_SIZE];
2539 u32 blocksize;
2540
2541 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2542 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2543 blocksize = map->stripe_len;
2544 else
2545 blocksize = sctx->fs_info->sectorsize;
2546 spin_lock(&sctx->stat_lock);
2547 sctx->stat.data_extents_scrubbed++;
2548 sctx->stat.data_bytes_scrubbed += len;
2549 spin_unlock(&sctx->stat_lock);
2550 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2551 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2552 blocksize = map->stripe_len;
2553 else
2554 blocksize = sctx->fs_info->nodesize;
2555 spin_lock(&sctx->stat_lock);
2556 sctx->stat.tree_extents_scrubbed++;
2557 sctx->stat.tree_bytes_scrubbed += len;
2558 spin_unlock(&sctx->stat_lock);
2559 } else {
2560 blocksize = sctx->fs_info->sectorsize;
2561 WARN_ON(1);
2562 }
2563
2564 while (len) {
2565 u32 l = min(len, blocksize);
2566 int have_csum = 0;
2567
2568 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2569 /* push csums to sbio */
2570 have_csum = scrub_find_csum(sctx, logical, csum);
2571 if (have_csum == 0)
2572 ++sctx->stat.no_csum;
2573 }
2574 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2575 mirror_num, have_csum ? csum : NULL,
2576 physical_for_dev_replace);
2577 if (ret)
2578 return ret;
2579 len -= l;
2580 logical += l;
2581 physical += l;
2582 physical_for_dev_replace += l;
2583 }
2584 return 0;
2585}
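
/*
 * A worked example of the split loop above: a 100KiB data extent on a RAID5
 * chunk with a 64KiB stripe_len is passed to scrub_pages() as one 64KiB
 * block plus one 36KiB block, while on a non-RAID56 profile with a 4KiB
 * sectorsize the same extent is split into 25 sector-sized blocks, each with
 * its own csum lookup.
 */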
2586
2587static int scrub_pages_for_parity(struct scrub_parity *sparity,
2588 u64 logical, u32 len,
2589 u64 physical, struct btrfs_device *dev,
2590 u64 flags, u64 gen, int mirror_num, u8 *csum)
2591{
2592 struct scrub_ctx *sctx = sparity->sctx;
2593 struct scrub_block *sblock;
2594 const u32 sectorsize = sctx->fs_info->sectorsize;
2595 int index;
2596
2597 ASSERT(IS_ALIGNED(len, sectorsize));
2598
2599 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2600 if (!sblock) {
2601 spin_lock(&sctx->stat_lock);
2602 sctx->stat.malloc_errors++;
2603 spin_unlock(&sctx->stat_lock);
2604 return -ENOMEM;
2605 }
2606
2607 /* one ref inside this function, plus one for each page added to
2608 * a bio later on */
2609 refcount_set(&sblock->refs, 1);
2610 sblock->sctx = sctx;
2611 sblock->no_io_error_seen = 1;
2612 sblock->sparity = sparity;
2613 scrub_parity_get(sparity);
2614
2615 for (index = 0; len > 0; index++) {
2616 struct scrub_page *spage;
2617
2618 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2619 if (!spage) {
2620leave_nomem:
2621 spin_lock(&sctx->stat_lock);
2622 sctx->stat.malloc_errors++;
2623 spin_unlock(&sctx->stat_lock);
2624 scrub_block_put(sblock);
2625 return -ENOMEM;
2626 }
2627 ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
2628 /* For scrub block */
2629 scrub_page_get(spage);
2630 sblock->pagev[index] = spage;
2631 /* For scrub parity */
2632 scrub_page_get(spage);
2633 list_add_tail(&spage->list, &sparity->spages);
2634 spage->sblock = sblock;
2635 spage->dev = dev;
2636 spage->flags = flags;
2637 spage->generation = gen;
2638 spage->logical = logical;
2639 spage->physical = physical;
2640 spage->mirror_num = mirror_num;
2641 if (csum) {
2642 spage->have_csum = 1;
2643 memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2644 } else {
2645 spage->have_csum = 0;
2646 }
2647 sblock->page_count++;
2648 spage->page = alloc_page(GFP_KERNEL);
2649 if (!spage->page)
2650 goto leave_nomem;
2651
2653 /* Iterate over the stripe range in sectorsize steps */
2654 len -= sectorsize;
2655 logical += sectorsize;
2656 physical += sectorsize;
2657 }
2658
2659 WARN_ON(sblock->page_count == 0);
2660 for (index = 0; index < sblock->page_count; index++) {
2661 struct scrub_page *spage = sblock->pagev[index];
2662 int ret;
2663
2664 ret = scrub_add_page_to_rd_bio(sctx, spage);
2665 if (ret) {
2666 scrub_block_put(sblock);
2667 return ret;
2668 }
2669 }
2670
2671 /* last one frees, either here or in bio completion for last page */
2672 scrub_block_put(sblock);
2673 return 0;
2674}
2675
2676static int scrub_extent_for_parity(struct scrub_parity *sparity,
2677 u64 logical, u32 len,
2678 u64 physical, struct btrfs_device *dev,
2679 u64 flags, u64 gen, int mirror_num)
2680{
2681 struct scrub_ctx *sctx = sparity->sctx;
2682 int ret;
2683 u8 csum[BTRFS_CSUM_SIZE];
2684 u32 blocksize;
2685
2686 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2687 scrub_parity_mark_sectors_error(sparity, logical, len);
2688 return 0;
2689 }
2690
2691 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2692 blocksize = sparity->stripe_len;
2693 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2694 blocksize = sparity->stripe_len;
2695 } else {
2696 blocksize = sctx->fs_info->sectorsize;
2697 WARN_ON(1);
2698 }
2699
2700 while (len) {
2701 u32 l = min(len, blocksize);
2702 int have_csum = 0;
2703
2704 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2705 /* push csums to sbio */
2706 have_csum = scrub_find_csum(sctx, logical, csum);
2707 if (have_csum == 0)
2708 goto skip;
2709 }
2710 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2711 flags, gen, mirror_num,
2712 have_csum ? csum : NULL);
2713 if (ret)
2714 return ret;
2715skip:
2716 len -= l;
2717 logical += l;
2718 physical += l;
2719 }
2720 return 0;
2721}
2722
2723/*
2724 * Given a physical address, this will calculate its
2725 * logical offset. If this is a parity stripe, it will return
2726 * the leftmost data stripe's logical offset.
2727 *
2728 * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2729 */
2730static int get_raid56_logic_offset(u64 physical, int num,
2731 struct map_lookup *map, u64 *offset,
2732 u64 *stripe_start)
2733{
2734 int i;
2735 int j = 0;
2736 u64 stripe_nr;
2737 u64 last_offset;
2738 u32 stripe_index;
2739 u32 rot;
2740 const int data_stripes = nr_data_stripes(map);
2741
2742 last_offset = (physical - map->stripes[num].physical) * data_stripes;
2743 if (stripe_start)
2744 *stripe_start = last_offset;
2745
2746 *offset = last_offset;
2747 for (i = 0; i < data_stripes; i++) {
2748 *offset = last_offset + i * map->stripe_len;
2749
2750 stripe_nr = div64_u64(*offset, map->stripe_len);
2751 stripe_nr = div_u64(stripe_nr, data_stripes);
2752
2753 /* Work out the disk rotation on this stripe-set */
2754 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2755		/* calculate which stripe this data is located on */
2756 rot += i;
2757 stripe_index = rot % map->num_stripes;
2758 if (stripe_index == num)
2759 return 0;
2760 if (stripe_index < num)
2761 j++;
2762 }
2763 *offset = last_offset + j * map->stripe_len;
2764 return 1;
2765}
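
/*
 * A worked trace of the loop above for a 3-device RAID5 chunk
 * (data_stripes = 2, stripe_len = 64KiB), querying physical offset 128KiB
 * into stripe num = 0: last_offset = 256KiB. For i = 0 the candidate offset
 * is 256KiB, stripe_nr = 2, rot = 2, so stripe_index = 2 (not this device);
 * for i = 1 the candidate is 320KiB, rot + i = 3, stripe_index = 0 == num,
 * so 0 is returned with *offset = 320KiB. Had no i matched, this physical
 * stripe would hold parity and 1 would be returned.
 */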
2766
2767static void scrub_free_parity(struct scrub_parity *sparity)
2768{
2769 struct scrub_ctx *sctx = sparity->sctx;
2770 struct scrub_page *curr, *next;
2771 int nbits;
2772
2773 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2774 if (nbits) {
2775 spin_lock(&sctx->stat_lock);
2776 sctx->stat.read_errors += nbits;
2777 sctx->stat.uncorrectable_errors += nbits;
2778 spin_unlock(&sctx->stat_lock);
2779 }
2780
2781 list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2782 list_del_init(&curr->list);
2783 scrub_page_put(curr);
2784 }
2785
2786 kfree(sparity);
2787}
2788
2789static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2790{
2791 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2792 work);
2793 struct scrub_ctx *sctx = sparity->sctx;
2794
2795 scrub_free_parity(sparity);
2796 scrub_pending_bio_dec(sctx);
2797}
2798
2799static void scrub_parity_bio_endio(struct bio *bio)
2800{
2801 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2802 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2803
2804 if (bio->bi_status)
2805 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2806 sparity->nsectors);
2807
2808 bio_put(bio);
2809
2810 btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
2811 NULL);
2812 btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
2813}
2814
2815static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2816{
2817 struct scrub_ctx *sctx = sparity->sctx;
2818 struct btrfs_fs_info *fs_info = sctx->fs_info;
2819 struct bio *bio;
2820 struct btrfs_raid_bio *rbio;
2821 struct btrfs_io_context *bioc = NULL;
2822 u64 length;
2823 int ret;
2824
2825 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2826 sparity->nsectors))
2827 goto out;
2828
2829 length = sparity->logic_end - sparity->logic_start;
2830
2831 btrfs_bio_counter_inc_blocked(fs_info);
2832 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2833 &length, &bioc);
2834 if (ret || !bioc || !bioc->raid_map)
2835 goto bioc_out;
2836
2837 bio = btrfs_bio_alloc(BIO_MAX_VECS);
2838 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2839 bio->bi_private = sparity;
2840 bio->bi_end_io = scrub_parity_bio_endio;
2841
2842 rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
2843 sparity->scrub_dev,
2844 sparity->dbitmap,
2845 sparity->nsectors);
2846 if (!rbio)
2847 goto rbio_out;
2848
2849 scrub_pending_bio_inc(sctx);
2850 raid56_parity_submit_scrub_rbio(rbio);
2851 return;
2852
2853rbio_out:
2854 bio_put(bio);
2855bioc_out:
2856 btrfs_bio_counter_dec(fs_info);
2857 btrfs_put_bioc(bioc);
2858 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2859 sparity->nsectors);
2860 spin_lock(&sctx->stat_lock);
2861 sctx->stat.malloc_errors++;
2862 spin_unlock(&sctx->stat_lock);
2863out:
2864 scrub_free_parity(sparity);
2865}
2866
2867static inline int scrub_calc_parity_bitmap_len(int nsectors)
2868{
2869 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
2870}
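
/*
 * For illustration, with a 64KiB stripe_len and 4KiB sectorsize, nsectors is
 * 16, so on a 64-bit machine each bitmap rounds up to one unsigned long
 * (8 bytes); scrub_raid56_parity() below allocates two of these back to
 * back, for dbitmap and ebitmap.
 */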
2871
2872static void scrub_parity_get(struct scrub_parity *sparity)
2873{
2874 refcount_inc(&sparity->refs);
2875}
2876
2877static void scrub_parity_put(struct scrub_parity *sparity)
2878{
2879 if (!refcount_dec_and_test(&sparity->refs))
2880 return;
2881
2882 scrub_parity_check_and_repair(sparity);
2883}
2884
2885static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2886 struct map_lookup *map,
2887 struct btrfs_device *sdev,
2888 u64 logic_start,
2889 u64 logic_end)
2890{
2891 struct btrfs_fs_info *fs_info = sctx->fs_info;
2892 struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start);
2893 struct btrfs_root *csum_root;
2894 struct btrfs_extent_item *extent;
2895 struct btrfs_io_context *bioc = NULL;
2896 struct btrfs_path *path;
2897 u64 flags;
2898 int ret;
2899 int slot;
2900 struct extent_buffer *l;
2901 struct btrfs_key key;
2902 u64 generation;
2903 u64 extent_logical;
2904 u64 extent_physical;
2905 /* Check the comment in scrub_stripe() for why u32 is enough here */
2906 u32 extent_len;
2907 u64 mapped_length;
2908 struct btrfs_device *extent_dev;
2909 struct scrub_parity *sparity;
2910 int nsectors;
2911 int bitmap_len;
2912 int extent_mirror_num;
2913 int stop_loop = 0;
2914
2915 path = btrfs_alloc_path();
2916 if (!path) {
2917 spin_lock(&sctx->stat_lock);
2918 sctx->stat.malloc_errors++;
2919 spin_unlock(&sctx->stat_lock);
2920 return -ENOMEM;
2921 }
2922 path->search_commit_root = 1;
2923 path->skip_locking = 1;
2924
2925 ASSERT(map->stripe_len <= U32_MAX);
2926 nsectors = map->stripe_len >> fs_info->sectorsize_bits;
2927 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2928 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2929 GFP_NOFS);
2930 if (!sparity) {
2931 spin_lock(&sctx->stat_lock);
2932 sctx->stat.malloc_errors++;
2933 spin_unlock(&sctx->stat_lock);
2934 btrfs_free_path(path);
2935 return -ENOMEM;
2936 }
2937
2938 ASSERT(map->stripe_len <= U32_MAX);
2939 sparity->stripe_len = map->stripe_len;
2940 sparity->nsectors = nsectors;
2941 sparity->sctx = sctx;
2942 sparity->scrub_dev = sdev;
2943 sparity->logic_start = logic_start;
2944 sparity->logic_end = logic_end;
2945 refcount_set(&sparity->refs, 1);
2946 INIT_LIST_HEAD(&sparity->spages);
2947 sparity->dbitmap = sparity->bitmap;
2948 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2949
2950 ret = 0;
2951 while (logic_start < logic_end) {
2952 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2953 key.type = BTRFS_METADATA_ITEM_KEY;
2954 else
2955 key.type = BTRFS_EXTENT_ITEM_KEY;
2956 key.objectid = logic_start;
2957 key.offset = (u64)-1;
2958
2959 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2960 if (ret < 0)
2961 goto out;
2962
2963 if (ret > 0) {
2964 ret = btrfs_previous_extent_item(root, path, 0);
2965 if (ret < 0)
2966 goto out;
2967 if (ret > 0) {
2968 btrfs_release_path(path);
2969 ret = btrfs_search_slot(NULL, root, &key,
2970 path, 0, 0);
2971 if (ret < 0)
2972 goto out;
2973 }
2974 }
2975
2976 stop_loop = 0;
2977 while (1) {
2978 u64 bytes;
2979
2980 l = path->nodes[0];
2981 slot = path->slots[0];
2982 if (slot >= btrfs_header_nritems(l)) {
2983 ret = btrfs_next_leaf(root, path);
2984 if (ret == 0)
2985 continue;
2986 if (ret < 0)
2987 goto out;
2988
2989 stop_loop = 1;
2990 break;
2991 }
2992 btrfs_item_key_to_cpu(l, &key, slot);
2993
2994 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2995 key.type != BTRFS_METADATA_ITEM_KEY)
2996 goto next;
2997
2998 if (key.type == BTRFS_METADATA_ITEM_KEY)
2999 bytes = fs_info->nodesize;
3000 else
3001 bytes = key.offset;
3002
3003 if (key.objectid + bytes <= logic_start)
3004 goto next;
3005
3006 if (key.objectid >= logic_end) {
3007 stop_loop = 1;
3008 break;
3009 }
3010
3011 while (key.objectid >= logic_start + map->stripe_len)
3012 logic_start += map->stripe_len;
3013
3014 extent = btrfs_item_ptr(l, slot,
3015 struct btrfs_extent_item);
3016 flags = btrfs_extent_flags(l, extent);
3017 generation = btrfs_extent_generation(l, extent);
3018
3019 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3020 (key.objectid < logic_start ||
3021 key.objectid + bytes >
3022 logic_start + map->stripe_len)) {
3023 btrfs_err(fs_info,
3024 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3025 key.objectid, logic_start);
3026 spin_lock(&sctx->stat_lock);
3027 sctx->stat.uncorrectable_errors++;
3028 spin_unlock(&sctx->stat_lock);
3029 goto next;
3030 }
3031again:
3032 extent_logical = key.objectid;
3033 ASSERT(bytes <= U32_MAX);
3034 extent_len = bytes;
3035
3036 if (extent_logical < logic_start) {
3037 extent_len -= logic_start - extent_logical;
3038 extent_logical = logic_start;
3039 }
3040
3041 if (extent_logical + extent_len >
3042 logic_start + map->stripe_len)
3043 extent_len = logic_start + map->stripe_len -
3044 extent_logical;
3045
3046 scrub_parity_mark_sectors_data(sparity, extent_logical,
3047 extent_len);
3048
3049 mapped_length = extent_len;
3050 bioc = NULL;
3051 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3052 extent_logical, &mapped_length, &bioc,
3053 0);
3054 if (!ret) {
3055 if (!bioc || mapped_length < extent_len)
3056 ret = -EIO;
3057 }
3058 if (ret) {
3059 btrfs_put_bioc(bioc);
3060 goto out;
3061 }
3062 extent_physical = bioc->stripes[0].physical;
3063 extent_mirror_num = bioc->mirror_num;
3064 extent_dev = bioc->stripes[0].dev;
3065 btrfs_put_bioc(bioc);
3066
3067 csum_root = btrfs_csum_root(fs_info, extent_logical);
3068 ret = btrfs_lookup_csums_range(csum_root,
3069 extent_logical,
3070 extent_logical + extent_len - 1,
3071 &sctx->csum_list, 1);
3072 if (ret)
3073 goto out;
3074
3075 ret = scrub_extent_for_parity(sparity, extent_logical,
3076 extent_len,
3077 extent_physical,
3078 extent_dev, flags,
3079 generation,
3080 extent_mirror_num);
3081
3082 scrub_free_csums(sctx);
3083
3084 if (ret)
3085 goto out;
3086
3087 if (extent_logical + extent_len <
3088 key.objectid + bytes) {
3089 logic_start += map->stripe_len;
3090
3091 if (logic_start >= logic_end) {
3092 stop_loop = 1;
3093 break;
3094 }
3095
3096 if (logic_start < key.objectid + bytes) {
3097 cond_resched();
3098 goto again;
3099 }
3100 }
3101next:
3102 path->slots[0]++;
3103 }
3104
3105 btrfs_release_path(path);
3106
3107 if (stop_loop)
3108 break;
3109
3110 logic_start += map->stripe_len;
3111 }
3112out:
3113 if (ret < 0) {
3114 ASSERT(logic_end - logic_start <= U32_MAX);
3115 scrub_parity_mark_sectors_error(sparity, logic_start,
3116 logic_end - logic_start);
3117 }
3118 scrub_parity_put(sparity);
3119 scrub_submit(sctx);
3120 mutex_lock(&sctx->wr_lock);
3121 scrub_wr_submit(sctx);
3122 mutex_unlock(&sctx->wr_lock);
3123
3124 btrfs_free_path(path);
3125 return ret < 0 ? ret : 0;
3126}
3127
3128static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3129{
3130 if (!btrfs_is_zoned(sctx->fs_info))
3131 return;
3132
3133 sctx->flush_all_writes = true;
3134 scrub_submit(sctx);
3135 mutex_lock(&sctx->wr_lock);
3136 scrub_wr_submit(sctx);
3137 mutex_unlock(&sctx->wr_lock);
3138
3139 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3140}
3141
3142static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3143 u64 physical, u64 physical_end)
3144{
3145 struct btrfs_fs_info *fs_info = sctx->fs_info;
3146 int ret = 0;
3147
3148 if (!btrfs_is_zoned(fs_info))
3149 return 0;
3150
3151 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3152
3153 mutex_lock(&sctx->wr_lock);
3154 if (sctx->write_pointer < physical_end) {
3155 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3156 physical,
3157 sctx->write_pointer);
3158 if (ret)
3159 btrfs_err(fs_info,
3160 "zoned: failed to recover write pointer");
3161 }
3162 mutex_unlock(&sctx->wr_lock);
3163 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3164
3165 return ret;
3166}
3167
3168static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3169 struct btrfs_block_group *bg,
3170 struct map_lookup *map,
3171 struct btrfs_device *scrub_dev,
3172 int stripe_index, u64 dev_extent_len)
3173{
3174 struct btrfs_path *path;
3175 struct btrfs_fs_info *fs_info = sctx->fs_info;
3176 struct btrfs_root *root;
3177 struct btrfs_root *csum_root;
3178 struct btrfs_extent_item *extent;
3179 struct blk_plug plug;
3180 const u64 chunk_logical = bg->start;
3181 u64 flags;
3182 int ret;
3183 int slot;
3184 u64 nstripes;
3185 struct extent_buffer *l;
3186 u64 physical;
3187 u64 logical;
3188 u64 logic_end;
3189 u64 physical_end;
3190 u64 generation;
3191 int mirror_num;
3192 struct btrfs_key key;
3193 u64 increment;
3194 u64 offset;
3195 u64 extent_logical;
3196 u64 extent_physical;
3197 /*
3198 * Unlike chunk length, extent length should never go beyond
3199 * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
3200 */
3201 u32 extent_len;
3202 u64 stripe_logical;
3203 u64 stripe_end;
3204 struct btrfs_device *extent_dev;
3205 int extent_mirror_num;
3206 int stop_loop = 0;
3207
3208 physical = map->stripes[stripe_index].physical;
3209 offset = 0;
3210 nstripes = div64_u64(dev_extent_len, map->stripe_len);
3211 mirror_num = 1;
3212 increment = map->stripe_len;
3213 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3214 offset = map->stripe_len * stripe_index;
3215 increment = map->stripe_len * map->num_stripes;
3216 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3217 int factor = map->num_stripes / map->sub_stripes;
3218 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3219 increment = map->stripe_len * factor;
3220 mirror_num = stripe_index % map->sub_stripes + 1;
3221 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
3222 mirror_num = stripe_index % map->num_stripes + 1;
3223 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3224 mirror_num = stripe_index % map->num_stripes + 1;
3225 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3226 get_raid56_logic_offset(physical, stripe_index, map, &offset,
3227 NULL);
3228 increment = map->stripe_len * nr_data_stripes(map);
3229 }
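
	/*
	 * A worked example of the geometry setup above: for a RAID10 chunk
	 * with num_stripes = 4 and sub_stripes = 2, scrubbing stripe_index = 3
	 * gives factor = 2, so this device holds every second stripe starting
	 * at offset stripe_len * 1, the per-iteration logical step (increment)
	 * is stripe_len * 2, and mirror_num = 2 marks it as the second copy of
	 * its sub-stripe pair.
	 */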
3230
3231 path = btrfs_alloc_path();
3232 if (!path)
3233 return -ENOMEM;
3234
3235 /*
3236	 * Work on the commit root. The related disk blocks are static as
3237	 * long as COW is applied. This means it is safe to rewrite
3238	 * them to repair disk errors without any race conditions.
3239 */
3240 path->search_commit_root = 1;
3241 path->skip_locking = 1;
3242 path->reada = READA_FORWARD;
3243
3244 logical = chunk_logical + offset;
3245 physical_end = physical + nstripes * map->stripe_len;
3246 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3247 get_raid56_logic_offset(physical_end, stripe_index,
3248 map, &logic_end, NULL);
3249 logic_end += chunk_logical;
3250 } else {
3251 logic_end = logical + increment * nstripes;
3252 }
3253 wait_event(sctx->list_wait,
3254 atomic_read(&sctx->bios_in_flight) == 0);
3255 scrub_blocked_if_needed(fs_info);
3256
3257 root = btrfs_extent_root(fs_info, logical);
3258 csum_root = btrfs_csum_root(fs_info, logical);
3259
3260 /*
3261	 * Collect all data csums for the stripe to avoid seeking during
3262	 * the scrub. This might currently (crc32) end up being about 1MB.
3263 */
3264 blk_start_plug(&plug);
3265
3266 if (sctx->is_dev_replace &&
3267 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3268 mutex_lock(&sctx->wr_lock);
3269 sctx->write_pointer = physical;
3270 mutex_unlock(&sctx->wr_lock);
3271 sctx->flush_all_writes = true;
3272 }
3273
3274 /*
3275 * now find all extents for each stripe and scrub them
3276 */
3277 ret = 0;
3278 while (physical < physical_end) {
3279 /*
3280 * canceled?
3281 */
3282 if (atomic_read(&fs_info->scrub_cancel_req) ||
3283 atomic_read(&sctx->cancel_req)) {
3284 ret = -ECANCELED;
3285 goto out;
3286 }
3287 /*
3288 * check to see if we have to pause
3289 */
3290 if (atomic_read(&fs_info->scrub_pause_req)) {
3291 /* push queued extents */
3292 sctx->flush_all_writes = true;
3293 scrub_submit(sctx);
3294 mutex_lock(&sctx->wr_lock);
3295 scrub_wr_submit(sctx);
3296 mutex_unlock(&sctx->wr_lock);
3297 wait_event(sctx->list_wait,
3298 atomic_read(&sctx->bios_in_flight) == 0);
3299 sctx->flush_all_writes = false;
3300 scrub_blocked_if_needed(fs_info);
3301 }
3302
3303 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3304 ret = get_raid56_logic_offset(physical, stripe_index,
3305 map, &logical,
3306 &stripe_logical);
3307 logical += chunk_logical;
3308 if (ret) {
3309				/* it is a parity stripe */
3310 stripe_logical += chunk_logical;
3311 stripe_end = stripe_logical + increment;
3312 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3313 stripe_logical,
3314 stripe_end);
3315 if (ret)
3316 goto out;
3317 goto skip;
3318 }
3319 }
3320
3321 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3322 key.type = BTRFS_METADATA_ITEM_KEY;
3323 else
3324 key.type = BTRFS_EXTENT_ITEM_KEY;
3325 key.objectid = logical;
3326 key.offset = (u64)-1;
3327
3328 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3329 if (ret < 0)
3330 goto out;
3331
3332 if (ret > 0) {
3333 ret = btrfs_previous_extent_item(root, path, 0);
3334 if (ret < 0)
3335 goto out;
3336 if (ret > 0) {
3337 /* there's no smaller item, so stick with the
3338 * larger one */
3339 btrfs_release_path(path);
3340 ret = btrfs_search_slot(NULL, root, &key,
3341 path, 0, 0);
3342 if (ret < 0)
3343 goto out;
3344 }
3345 }
3346
3347 stop_loop = 0;
3348 while (1) {
3349 u64 bytes;
3350
3351 l = path->nodes[0];
3352 slot = path->slots[0];
3353 if (slot >= btrfs_header_nritems(l)) {
3354 ret = btrfs_next_leaf(root, path);
3355 if (ret == 0)
3356 continue;
3357 if (ret < 0)
3358 goto out;
3359
3360 stop_loop = 1;
3361 break;
3362 }
3363 btrfs_item_key_to_cpu(l, &key, slot);
3364
3365 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3366 key.type != BTRFS_METADATA_ITEM_KEY)
3367 goto next;
3368
3369 if (key.type == BTRFS_METADATA_ITEM_KEY)
3370 bytes = fs_info->nodesize;
3371 else
3372 bytes = key.offset;
3373
3374 if (key.objectid + bytes <= logical)
3375 goto next;
3376
3377 if (key.objectid >= logical + map->stripe_len) {
3378 /* out of this device extent */
3379 if (key.objectid >= logic_end)
3380 stop_loop = 1;
3381 break;
3382 }
3383
3384 /*
3385 * If our block group was removed in the meanwhile, just
3386 * stop scrubbing since there is no point in continuing.
3387 * Continuing would prevent reusing its device extents
3388 * for new block groups for a long time.
3389 */
3390 spin_lock(&bg->lock);
3391 if (bg->removed) {
3392 spin_unlock(&bg->lock);
3393 ret = 0;
3394 goto out;
3395 }
3396 spin_unlock(&bg->lock);
3397
3398 extent = btrfs_item_ptr(l, slot,
3399 struct btrfs_extent_item);
3400 flags = btrfs_extent_flags(l, extent);
3401 generation = btrfs_extent_generation(l, extent);
3402
3403 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3404 (key.objectid < logical ||
3405 key.objectid + bytes >
3406 logical + map->stripe_len)) {
3407 btrfs_err(fs_info,
3408 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3409 key.objectid, logical);
3410 spin_lock(&sctx->stat_lock);
3411 sctx->stat.uncorrectable_errors++;
3412 spin_unlock(&sctx->stat_lock);
3413 goto next;
3414 }
3415
3416again:
3417 extent_logical = key.objectid;
3418 ASSERT(bytes <= U32_MAX);
3419 extent_len = bytes;
3420
3421 /*
3422 * trim extent to this stripe
3423 */
3424 if (extent_logical < logical) {
3425 extent_len -= logical - extent_logical;
3426 extent_logical = logical;
3427 }
3428 if (extent_logical + extent_len >
3429 logical + map->stripe_len) {
3430 extent_len = logical + map->stripe_len -
3431 extent_logical;
3432 }
3433
3434 extent_physical = extent_logical - logical + physical;
3435 extent_dev = scrub_dev;
3436 extent_mirror_num = mirror_num;
3437 if (sctx->is_dev_replace)
3438 scrub_remap_extent(fs_info, extent_logical,
3439 extent_len, &extent_physical,
3440 &extent_dev,
3441 &extent_mirror_num);
3442
3443 if (flags & BTRFS_EXTENT_FLAG_DATA) {
3444 ret = btrfs_lookup_csums_range(csum_root,
3445 extent_logical,
3446 extent_logical + extent_len - 1,
3447 &sctx->csum_list, 1);
3448 if (ret)
3449 goto out;
3450 }
3451
3452 ret = scrub_extent(sctx, map, extent_logical, extent_len,
3453 extent_physical, extent_dev, flags,
3454 generation, extent_mirror_num,
3455 extent_logical - logical + physical);
3456
3457 scrub_free_csums(sctx);
3458
3459 if (ret)
3460 goto out;
3461
3462 if (sctx->is_dev_replace)
3463 sync_replace_for_zoned(sctx);
3464
3465 if (extent_logical + extent_len <
3466 key.objectid + bytes) {
3467 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3468 /*
3469 * loop until we find next data stripe
3470 * or we have finished all stripes.
3471 */
3472loop:
3473 physical += map->stripe_len;
3474 ret = get_raid56_logic_offset(physical,
3475 stripe_index, map,
3476 &logical, &stripe_logical);
3477 logical += chunk_logical;
3478
3479 if (ret && physical < physical_end) {
3480 stripe_logical += chunk_logical;
3481 stripe_end = stripe_logical +
3482 increment;
3483 ret = scrub_raid56_parity(sctx,
3484 map, scrub_dev,
3485 stripe_logical,
3486 stripe_end);
3487 if (ret)
3488 goto out;
3489 goto loop;
3490 }
3491 } else {
3492 physical += map->stripe_len;
3493 logical += increment;
3494 }
3495 if (logical < key.objectid + bytes) {
3496 cond_resched();
3497 goto again;
3498 }
3499
3500 if (physical >= physical_end) {
3501 stop_loop = 1;
3502 break;
3503 }
3504 }
3505next:
3506 path->slots[0]++;
3507 }
3508 btrfs_release_path(path);
3509skip:
3510 logical += increment;
3511 physical += map->stripe_len;
3512 spin_lock(&sctx->stat_lock);
3513 if (stop_loop)
3514 sctx->stat.last_physical = map->stripes[stripe_index].physical +
3515 dev_extent_len;
3516 else
3517 sctx->stat.last_physical = physical;
3518 spin_unlock(&sctx->stat_lock);
3519 if (stop_loop)
3520 break;
3521 }
3522out:
3523 /* push queued extents */
3524 scrub_submit(sctx);
3525 mutex_lock(&sctx->wr_lock);
3526 scrub_wr_submit(sctx);
3527 mutex_unlock(&sctx->wr_lock);
3528
3529 blk_finish_plug(&plug);
3530 btrfs_free_path(path);
3531
3532 if (sctx->is_dev_replace && ret >= 0) {
3533 int ret2;
3534
3535 ret2 = sync_write_pointer_for_zoned(sctx,
3536 chunk_logical + offset,
3537 map->stripes[stripe_index].physical,
3538 physical_end);
3539 if (ret2)
3540 ret = ret2;
3541 }
3542
3543 return ret < 0 ? ret : 0;
3544}
3545
3546static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3547 struct btrfs_block_group *bg,
3548 struct btrfs_device *scrub_dev,
3549 u64 dev_offset,
3550 u64 dev_extent_len)
3551{
3552 struct btrfs_fs_info *fs_info = sctx->fs_info;
3553 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3554 struct map_lookup *map;
3555 struct extent_map *em;
3556 int i;
3557 int ret = 0;
3558
3559 read_lock(&map_tree->lock);
3560 em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3561 read_unlock(&map_tree->lock);
3562
3563 if (!em) {
3564 /*
3565 * Might have been an unused block group deleted by the cleaner
3566 * kthread or relocation.
3567 */
3568 spin_lock(&bg->lock);
3569 if (!bg->removed)
3570 ret = -EINVAL;
3571 spin_unlock(&bg->lock);
3572
3573 return ret;
3574 }
3575 if (em->start != bg->start)
3576 goto out;
3577 if (em->len < dev_extent_len)
3578 goto out;
3579
3580 map = em->map_lookup;
3581 for (i = 0; i < map->num_stripes; ++i) {
3582 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3583 map->stripes[i].physical == dev_offset) {
3584 ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
3585 dev_extent_len);
3586 if (ret)
3587 goto out;
3588 }
3589 }
3590out:
3591 free_extent_map(em);
3592
3593 return ret;
3594}
3595
3596static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3597 struct btrfs_block_group *cache)
3598{
3599 struct btrfs_fs_info *fs_info = cache->fs_info;
3600 struct btrfs_trans_handle *trans;
3601
3602 if (!btrfs_is_zoned(fs_info))
3603 return 0;
3604
3605 btrfs_wait_block_group_reservations(cache);
3606 btrfs_wait_nocow_writers(cache);
3607 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3608
3609 trans = btrfs_join_transaction(root);
3610 if (IS_ERR(trans))
3611 return PTR_ERR(trans);
3612 return btrfs_commit_transaction(trans);
3613}
3614
3615static noinline_for_stack
3616int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3617 struct btrfs_device *scrub_dev, u64 start, u64 end)
3618{
3619 struct btrfs_dev_extent *dev_extent = NULL;
3620 struct btrfs_path *path;
3621 struct btrfs_fs_info *fs_info = sctx->fs_info;
3622 struct btrfs_root *root = fs_info->dev_root;
3623 u64 chunk_offset;
3624 int ret = 0;
3625 int ro_set;
3626 int slot;
3627 struct extent_buffer *l;
3628 struct btrfs_key key;
3629 struct btrfs_key found_key;
3630 struct btrfs_block_group *cache;
3631 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3632
3633 path = btrfs_alloc_path();
3634 if (!path)
3635 return -ENOMEM;
3636
3637 path->reada = READA_FORWARD;
3638 path->search_commit_root = 1;
3639 path->skip_locking = 1;
3640
3641 key.objectid = scrub_dev->devid;
3642 key.offset = 0ull;
3643 key.type = BTRFS_DEV_EXTENT_KEY;
3644
3645 while (1) {
3646 u64 dev_extent_len;
3647
3648 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3649 if (ret < 0)
3650 break;
3651 if (ret > 0) {
3652 if (path->slots[0] >=
3653 btrfs_header_nritems(path->nodes[0])) {
3654 ret = btrfs_next_leaf(root, path);
3655 if (ret < 0)
3656 break;
3657 if (ret > 0) {
3658 ret = 0;
3659 break;
3660 }
3661 } else {
3662 ret = 0;
3663 }
3664 }
3665
3666 l = path->nodes[0];
3667 slot = path->slots[0];
3668
3669 btrfs_item_key_to_cpu(l, &found_key, slot);
3670
3671 if (found_key.objectid != scrub_dev->devid)
3672 break;
3673
3674 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3675 break;
3676
3677 if (found_key.offset >= end)
3678 break;
3679
3680 if (found_key.offset < key.offset)
3681 break;
3682
3683 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3684 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3685
3686 if (found_key.offset + dev_extent_len <= start)
3687 goto skip;
3688
3689 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3690
3691 /*
3692 * get a reference on the corresponding block group to prevent
3693 * the chunk from going away while we scrub it
3694 */
3695 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3696
3697 /* some chunks are removed but not committed to disk yet,
3698 * continue scrubbing */
3699 if (!cache)
3700 goto skip;
3701
3702 ASSERT(cache->start <= chunk_offset);
3703 /*
3704 * We are using the commit root to search for device extents, so
3705 * that means we could have found a device extent item from a
3706 * block group that was deleted in the current transaction. The
3707 * logical start offset of the deleted block group, stored at
3708 * @chunk_offset, might be part of the logical address range of
3709 * a new block group (which uses different physical extents).
3710 * In this case btrfs_lookup_block_group() has returned the new
3711 * block group, and its start address is less than @chunk_offset.
3712 *
3713 * We skip such new block groups, because it's pointless to
3714 * process them, as we won't find their extents because we search
3715 * for them using the commit root of the extent tree. For a device
3716 * replace it's also fine to skip it, we won't miss copying them
3717 * to the target device because we have the write duplication
3718 * setup through the regular write path (by btrfs_map_block()),
3719 * and we have committed a transaction when we started the device
3720 * replace, right after setting up the device replace state.
3721 */
3722 if (cache->start < chunk_offset) {
3723 btrfs_put_block_group(cache);
3724 goto skip;
3725 }
3726
3727 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3728 spin_lock(&cache->lock);
3729 if (!cache->to_copy) {
3730 spin_unlock(&cache->lock);
3731 btrfs_put_block_group(cache);
3732 goto skip;
3733 }
3734 spin_unlock(&cache->lock);
3735 }
3736
3737 /*
3738 * Make sure that while we are scrubbing the corresponding block
3739 * group doesn't get its logical address and its device extents
3740 * reused for another block group, which can possibly be of a
3741 * different type and different profile. We do this to prevent
3742 * false error detections and crashes due to bogus attempts to
3743 * repair extents.
3744 */
3745 spin_lock(&cache->lock);
3746 if (cache->removed) {
3747 spin_unlock(&cache->lock);
3748 btrfs_put_block_group(cache);
3749 goto skip;
3750 }
3751 btrfs_freeze_block_group(cache);
3752 spin_unlock(&cache->lock);
3753
3754 /*
3755		 * We need to call btrfs_inc_block_group_ro() with scrubs_paused,
3756 * to avoid deadlock caused by:
3757 * btrfs_inc_block_group_ro()
3758 * -> btrfs_wait_for_commit()
3759 * -> btrfs_commit_transaction()
3760 * -> btrfs_scrub_pause()
3761 */
3762 scrub_pause_on(fs_info);
3763
3764 /*
3765 * Don't do chunk preallocation for scrub.
3766 *
3767 * This is especially important for SYSTEM bgs, or we can hit
3768 * -EFBIG from btrfs_finish_chunk_alloc() like:
3769 * 1. The only SYSTEM bg is marked RO.
3770 * Since SYSTEM bg is small, that's pretty common.
3771 * 2. New SYSTEM bg will be allocated
3772		 *    Because the regular (non-scrub) path will allocate a new chunk.
3773 * 3. New SYSTEM bg is empty and will get cleaned up
3774 * Before cleanup really happens, it's marked RO again.
3775 * 4. Empty SYSTEM bg get scrubbed
3776 * We go back to 2.
3777 *
3778		 * This can easily boost the number of SYSTEM chunks if the
3779		 * cleaner thread can't be triggered fast enough, and use up all
3780		 * the space of btrfs_super_block::sys_chunk_array.
3781 *
3782 * While for dev replace, we need to try our best to mark block
3783 * group RO, to prevent race between:
3784 * - Write duplication
3785 * Contains latest data
3786 * - Scrub copy
3787 * Contains data from commit tree
3788 *
3789 * If target block group is not marked RO, nocow writes can
3790 * be overwritten by scrub copy, causing data corruption.
3791 * So for dev-replace, it's not allowed to continue if a block
3792 * group is not RO.
3793 */
3794 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3795 if (!ret && sctx->is_dev_replace) {
3796 ret = finish_extent_writes_for_zoned(root, cache);
3797 if (ret) {
3798 btrfs_dec_block_group_ro(cache);
3799 scrub_pause_off(fs_info);
3800 btrfs_put_block_group(cache);
3801 break;
3802 }
3803 }
3804
3805 if (ret == 0) {
3806 ro_set = 1;
3807 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3808 /*
3809			 * btrfs_inc_block_group_ro() returns -ENOSPC when it
3810			 * fails to create a new chunk for metadata.
3811			 * It is not a problem for scrub, because
3812			 * metadata is always COWed, and our scrub pauses
3813			 * transaction commits.
3814 */
3815 ro_set = 0;
3816 } else if (ret == -ETXTBSY) {
3817 btrfs_warn(fs_info,
3818 "skipping scrub of block group %llu due to active swapfile",
3819 cache->start);
3820 scrub_pause_off(fs_info);
3821 ret = 0;
3822 goto skip_unfreeze;
3823 } else {
3824 btrfs_warn(fs_info,
3825 "failed setting block group ro: %d", ret);
3826 btrfs_unfreeze_block_group(cache);
3827 btrfs_put_block_group(cache);
3828 scrub_pause_off(fs_info);
3829 break;
3830 }
3831
3832 /*
3833 * Now the target block is marked RO, wait for nocow writes to
3834 * finish before dev-replace.
3835 * COW is fine, as COW never overwrites extents in commit tree.
3836 */
3837 if (sctx->is_dev_replace) {
3838 btrfs_wait_nocow_writers(cache);
3839 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3840 cache->length);
3841 }
3842
3843 scrub_pause_off(fs_info);
3844 down_write(&dev_replace->rwsem);
3845 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3846 dev_replace->cursor_left = found_key.offset;
3847 dev_replace->item_needs_writeback = 1;
3848 up_write(&dev_replace->rwsem);
3849
3850 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3851 dev_extent_len);
3852
		/*
		 * Flush and submit all pending read and write bios, then wait
		 * for them.
		 * Note that in the dev-replace case, a read request causes
		 * write requests that are submitted in the read completion
		 * worker.  Therefore all write requests must be flushed, so
		 * that all read and write requests have really completed by
		 * the time bios_in_flight drops to 0.
		 */
		sctx->flush_all_writes = true;
		scrub_submit(sctx);
		mutex_lock(&sctx->wr_lock);
		scrub_wr_submit(sctx);
		mutex_unlock(&sctx->wr_lock);

		wait_event(sctx->list_wait,
			   atomic_read(&sctx->bios_in_flight) == 0);

		scrub_pause_on(fs_info);

		/*
		 * This must be done before we decrease @scrub_paused, to make
		 * sure we don't block transaction commit while we are waiting
		 * for the pending workers to finish.
		 */
		wait_event(sctx->list_wait,
			   atomic_read(&sctx->workers_pending) == 0);
		sctx->flush_all_writes = false;

		scrub_pause_off(fs_info);

		if (sctx->is_dev_replace &&
		    !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
						      cache, found_key.offset))
			ro_set = 0;

		down_write(&dev_replace->rwsem);
		dev_replace->cursor_left = dev_replace->cursor_right;
		dev_replace->item_needs_writeback = 1;
		up_write(&dev_replace->rwsem);

		if (ro_set)
			btrfs_dec_block_group_ro(cache);

		/*
		 * We might have prevented the cleaner kthread from deleting
		 * this block group if it was already unused because we raced
		 * and set it to RO mode first. So add it back to the unused
		 * list, otherwise it might not ever be deleted unless a manual
		 * balance is triggered or it becomes used and unused again.
		 */
		spin_lock(&cache->lock);
		if (!cache->removed && !cache->ro && cache->reserved == 0 &&
		    cache->used == 0) {
			spin_unlock(&cache->lock);
			if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
				btrfs_discard_queue_work(&fs_info->discard_ctl,
							 cache);
			else
				btrfs_mark_bg_unused(cache);
		} else {
			spin_unlock(&cache->lock);
		}
skip_unfreeze:
		btrfs_unfreeze_block_group(cache);
		btrfs_put_block_group(cache);
		if (ret)
			break;
		if (sctx->is_dev_replace &&
		    atomic64_read(&dev_replace->num_write_errors) > 0) {
			ret = -EIO;
			break;
		}
		if (sctx->stat.malloc_errors > 0) {
			ret = -ENOMEM;
			break;
		}
skip:
		key.offset = found_key.offset + dev_extent_len;
		btrfs_release_path(path);
	}

	btrfs_free_path(path);

	return ret;
}

static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
					   struct btrfs_device *scrub_dev)
{
	int i;
	u64 bytenr;
	u64 gen;
	int ret;
	struct btrfs_fs_info *fs_info = sctx->fs_info;

	if (BTRFS_FS_ERROR(fs_info))
		return -EROFS;

	/* Seed devices of a new filesystem have their own generation. */
	if (scrub_dev->fs_devices != fs_info->fs_devices)
		gen = scrub_dev->generation;
	else
		gen = fs_info->last_trans_committed;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >
		    scrub_dev->commit_total_bytes)
			break;
		if (!btrfs_check_super_location(scrub_dev, bytenr))
			continue;

		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
				  NULL, bytenr);
		if (ret)
			return ret;
	}
	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);

	return 0;
}

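/*
 * For reference, a minimal sketch of where the super block copies visited by
 * scrub_supers() live on disk.  This restates the logic of btrfs_sb_offset()
 * (defined in disk-io.c) under the assumption that mirror copies sit at
 * 16KiB << (BTRFS_SUPER_MIRROR_SHIFT * mirror), i.e. 64KiB, 64MiB and
 * 256GiB.  It is illustrative only and not used by this file.
 */
static inline u64 scrub_example_sb_offset(int mirror)
{
	/* Copies 1 and 2 are shifted out to 64MiB and 256GiB. */
	if (mirror)
		return (u64)SZ_16K << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
	/* The primary copy always sits at 64KiB. */
	return BTRFS_SUPER_INFO_OFFSET;
}
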
static void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
					&fs_info->scrub_lock)) {
		struct btrfs_workqueue *scrub_workers = NULL;
		struct btrfs_workqueue *scrub_wr_comp = NULL;
		struct btrfs_workqueue *scrub_parity = NULL;

		scrub_workers = fs_info->scrub_workers;
		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
		scrub_parity = fs_info->scrub_parity_workers;

		fs_info->scrub_workers = NULL;
		fs_info->scrub_wr_completion_workers = NULL;
		fs_info->scrub_parity_workers = NULL;
		mutex_unlock(&fs_info->scrub_lock);

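		/*
		 * This was the last reference; destroy the workqueues only
		 * after dropping scrub_lock, as destruction flushes the
		 * remaining work items and may block.
		 */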
		btrfs_destroy_workqueue(scrub_workers);
		btrfs_destroy_workqueue(scrub_wr_comp);
		btrfs_destroy_workqueue(scrub_parity);
	}
}

4001/*
4002 * get a reference count on fs_info->scrub_workers. start worker if necessary
4003 */
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
						int is_dev_replace)
{
	struct btrfs_workqueue *scrub_workers = NULL;
	struct btrfs_workqueue *scrub_wr_comp = NULL;
	struct btrfs_workqueue *scrub_parity = NULL;
	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
	int max_active = fs_info->thread_pool_size;
	int ret = -ENOMEM;

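	/* Fast path: the workers already exist, just take another reference. */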
	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
		return 0;

	scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
					      is_dev_replace ? 1 : max_active, 4);
	if (!scrub_workers)
		goto fail_scrub_workers;

	scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
					      max_active, 2);
	if (!scrub_wr_comp)
		goto fail_scrub_wr_completion_workers;

	scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
					     max_active, 2);
	if (!scrub_parity)
		goto fail_scrub_parity_workers;

	mutex_lock(&fs_info->scrub_lock);
	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
		ASSERT(fs_info->scrub_workers == NULL &&
		       fs_info->scrub_wr_completion_workers == NULL &&
		       fs_info->scrub_parity_workers == NULL);
		fs_info->scrub_workers = scrub_workers;
		fs_info->scrub_wr_completion_workers = scrub_wr_comp;
		fs_info->scrub_parity_workers = scrub_parity;
		refcount_set(&fs_info->scrub_workers_refcnt, 1);
		mutex_unlock(&fs_info->scrub_lock);
		return 0;
	}
	/* Another thread raced in and created the workers for us. */
	refcount_inc(&fs_info->scrub_workers_refcnt);
	mutex_unlock(&fs_info->scrub_lock);

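	/*
	 * Our freshly allocated workqueues lost the race and were never
	 * installed; reuse the error unwind below to free them.
	 */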
	ret = 0;
	btrfs_destroy_workqueue(scrub_parity);
fail_scrub_parity_workers:
	btrfs_destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
	btrfs_destroy_workqueue(scrub_workers);
fail_scrub_workers:
	return ret;
}

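/*
 * The allocate-then-install pattern used by scrub_workers_get() in miniature
 * (a hypothetical sketch, not kernel API; struct scrub_example_shared and
 * scrub_example_get() exist only for illustration, and <linux/slab.h> is
 * assumed to be pulled in by the headers above): allocate the shared object
 * optimistically outside the lock, then either install it and take the first
 * reference, or drop it if another thread raced in first.
 */
struct scrub_example_shared {
	refcount_t refs;
	struct mutex lock;
	void *object;
};

static inline int scrub_example_get(struct scrub_example_shared *s)
{
	void *obj;

	/* Fast path: the object exists, just take a reference. */
	if (refcount_inc_not_zero(&s->refs))
		return 0;

	/* Allocate outside the lock to keep the critical section short. */
	obj = kzalloc(SZ_4K, GFP_KERNEL);
	if (!obj)
		return -ENOMEM;

	mutex_lock(&s->lock);
	if (refcount_read(&s->refs) == 0) {
		/* We won the race: install and take the first reference. */
		s->object = obj;
		refcount_set(&s->refs, 1);
		mutex_unlock(&s->lock);
		return 0;
	}
	/* Somebody else installed theirs meanwhile; ours is unused. */
	refcount_inc(&s->refs);
	mutex_unlock(&s->lock);
	kfree(obj);
	return 0;
}
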
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
		    u64 end, struct btrfs_scrub_progress *progress,
		    int readonly, int is_dev_replace)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct scrub_ctx *sctx;
	int ret;
	struct btrfs_device *dev;
	unsigned int nofs_flag;

	if (btrfs_fs_closing(fs_info))
		return -EAGAIN;

	if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
		/*
		 * In this case scrub is unable to calculate the checksum,
		 * given the way scrub is implemented.  Do not handle this
		 * situation at all, because it won't ever happen.
		 */
		btrfs_err(fs_info,
			  "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
			  fs_info->nodesize,
			  BTRFS_STRIPE_LEN);
		return -EINVAL;
	}

	if (fs_info->nodesize >
	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
	    fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
		/*
		 * This would exhaust the array bounds of the pagev member in
		 * struct scrub_block.
		 */
		btrfs_err(fs_info,
			  "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
			  fs_info->nodesize,
			  SCRUB_MAX_PAGES_PER_BLOCK,
			  fs_info->sectorsize,
			  SCRUB_MAX_PAGES_PER_BLOCK);
		return -EINVAL;
	}

	/* Allocate outside of device_list_mutex */
	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
	if (IS_ERR(sctx))
		return PTR_ERR(sctx);

	ret = scrub_workers_get(fs_info, is_dev_replace);
	if (ret)
		goto out_free_ctx;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
		     !is_dev_replace)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -ENODEV;
		goto out;
	}

	if (!is_dev_replace && !readonly &&
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		btrfs_err_in_rcu(fs_info,
			"scrub on devid %llu: filesystem on %s is not writable",
				 devid, rcu_str_deref(dev->name));
		ret = -EROFS;
		goto out;
	}

	mutex_lock(&fs_info->scrub_lock);
	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EIO;
		goto out;
	}

	down_read(&fs_info->dev_replace.rwsem);
	if (dev->scrub_ctx ||
	    (!is_dev_replace &&
	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
		up_read(&fs_info->dev_replace.rwsem);
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		ret = -EINPROGRESS;
		goto out;
	}
	up_read(&fs_info->dev_replace.rwsem);

	sctx->readonly = readonly;
	dev->scrub_ctx = sctx;
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * By checking @scrub_pause_req here, we can avoid a race between
	 * transaction commit and scrubbing.
	 */
	__scrub_blocked_if_needed(fs_info);
	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);

	/*
	 * In order to avoid a deadlock with reclaim when there is a
	 * transaction trying to pause scrub, make sure we use GFP_NOFS for
	 * all the allocations done at scrub_pages() and
	 * scrub_pages_for_parity() invoked by our callees.  The pausing
	 * request is done when the transaction commit starts, and it blocks
	 * the transaction until scrub is paused (done at specific points in
	 * scrub_stripe() or right above, before incrementing
	 * fs_info->scrubs_running).
	 */
	nofs_flag = memalloc_nofs_save();
	if (!is_dev_replace) {
		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
		/*
		 * By holding the device list mutex, we avoid racing with the
		 * super block writes kicked off by a log tree sync.
		 */
		mutex_lock(&fs_info->fs_devices->device_list_mutex);
		ret = scrub_supers(sctx, dev);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	}

	if (!ret)
		ret = scrub_enumerate_chunks(sctx, dev, start, end);
	memalloc_nofs_restore(nofs_flag);

	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);

	if (progress)
		memcpy(progress, &sctx->stat, sizeof(*progress));

	if (!is_dev_replace)
		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
			   ret ? "not finished" : "finished", devid, ret);

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_ctx = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_workers_put(fs_info);
	scrub_put_ctx(sctx);

	return ret;
out:
	scrub_workers_put(fs_info);
out_free_ctx:
	scrub_free_ctx(sctx);

	return ret;
}

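/*
 * A hypothetical caller sketch (the real entry points are the scrub and
 * dev-replace ioctls; scrub_example_whole_device() is illustrative only):
 * run a read-write scrub over an entire device and report how many checksum
 * errors were seen.
 */
static inline int scrub_example_whole_device(struct btrfs_fs_info *fs_info,
					     u64 devid)
{
	struct btrfs_scrub_progress progress = { 0 };
	int ret;

	/* start = 0, end = U64_MAX covers every dev extent on the device. */
	ret = btrfs_scrub_dev(fs_info, devid, 0, U64_MAX, &progress,
			      0 /* readonly */, 0 /* is_dev_replace */);
	if (!ret)
		btrfs_info(fs_info, "scrub example: %llu csum errors",
			   progress.csum_errors);
	return ret;
}
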
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);
}

void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
}

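/*
 * How the pause gate above is meant to be used (illustrative sketch; the
 * real caller is the transaction commit path, and scrub_example_quiesce()
 * is hypothetical): park every running scrub, do the work that must not
 * race with scrub, then release them.
 */
static inline void scrub_example_quiesce(struct btrfs_fs_info *fs_info)
{
	/* Blocks until scrubs_paused == scrubs_running. */
	btrfs_scrub_pause(fs_info);

	/* ... work that must not race with scrub, e.g. a commit ... */

	/* Drop the pause request and wake the parked scrub threads. */
	btrfs_scrub_continue(fs_info);
}
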
int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
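	/*
	 * Drop scrub_lock while waiting: the scrub threads we are cancelling
	 * take it themselves on their way out.
	 */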
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct scrub_ctx *sctx;

	mutex_lock(&fs_info->scrub_lock);
	sctx = dev->scrub_ctx;
	if (!sctx) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sctx->cancel_req);
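	/*
	 * Drop scrub_lock while waiting: btrfs_scrub_dev() takes it to clear
	 * dev->scrub_ctx when the scrub actually stops.
	 */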
	while (dev->scrub_ctx) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_ctx == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct btrfs_device *dev;
	struct scrub_ctx *sctx = NULL;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (dev)
		sctx = dev->scrub_ctx;
	if (sctx)
		memcpy(progress, &sctx->stat, sizeof(*progress));
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}

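/*
 * Map @extent_logical to the physical offset, device and mirror number of
 * the first stripe returned by btrfs_map_block(), so the caller can read
 * the extent directly from that device.
 */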
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
			       u64 extent_logical, u32 extent_len,
			       u64 *extent_physical,
			       struct btrfs_device **extent_dev,
			       int *extent_mirror_num)
{
	u64 mapped_length;
	struct btrfs_io_context *bioc = NULL;
	int ret;

	mapped_length = extent_len;
	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
			      &mapped_length, &bioc, 0);
	if (ret || !bioc || mapped_length < extent_len ||
	    !bioc->stripes[0].dev->bdev) {
		btrfs_put_bioc(bioc);
		return;
	}

	*extent_physical = bioc->stripes[0].physical;
	*extent_mirror_num = bioc->mirror_num;
	*extent_dev = bioc->stripes[0].dev;
	btrfs_put_bioc(bioc);
}